Skip to content

Commit

Permalink
Division title importer/exporter (#1765)
Browse files Browse the repository at this point in the history
- Add python venv manager/requirements.txt
- Add a title_priority column to divisions
- Update existing data to be 'MANUAL' when it has a yes_text.
- Adds division_io.py
  - Export parquet dumps of divisions and votes
  - Ingest manual and automated updates of titles
from remote sources
- Update morningupdate with new ins and outs
 - IN - division titles from Parliament where they exist
 - OUT - a dump of the divisions table (for twfy-votes).
  • Loading branch information
ajparsons authored Mar 21, 2024
1 parent d1fe013 commit 9bc5f66
Show file tree
Hide file tree
Showing 8 changed files with 355 additions and 1 deletion.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ www/docs/docs-cache
www/docs/style/*.css
www/docs/style/stylesheets
.vagrant
.venv
build/
puphpet/files/dot/ssh
!puphpet/files/dot/ssh/insecure_private_key
Expand Down
6 changes: 6 additions & 0 deletions bin/deploy.bash
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,12 @@ php composer.phar install --no-dev --optimize-autoloader
bundle install --deployment --binstubs "vendor/bundle-bin"
export PATH="$DIR/vendor/bundle-bin:$PATH"

# setup venv and python packages
python3 -m venv .venv
source .venv/bin/activate
pip install --upgrade pip
pip install -r requirements.txt

# Now use compass to compile the SCSS:
(cd "$DIR/www/docs/style" && bundle exec compass compile)

Expand Down
4 changes: 4 additions & 0 deletions conf/packages
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,7 @@ php7-xapian
python3-lxml
python3-mysqldb
snarf
python3-dev
libmariadb-dev
python3-venv
pkg-config
8 changes: 8 additions & 0 deletions db/0023-add-division-title-priority.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
ALTER TABLE `divisions`
ADD COLUMN title_priority ENUM('ORIGINAL_HEADER', 'PARLIAMENT_DESCRIBED', 'MANUAL') default 'ORIGINAL_HEADER';

UPDATE `divisions`
SET title_priority = CASE
WHEN yes_text IS NOT NULL AND yes_text <> '' THEN 'MANUAL'
ELSE title_priority
END;
1 change: 1 addition & 0 deletions db/schema.sql
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@ CREATE TABLE `divisions` (
`both_total` int(3) default 0,
`majority_vote` enum('aye', 'no', '') default '',
`lastupdate` timestamp NOT NULL default CURRENT_TIMESTAMP on update CURRENT_TIMESTAMP,
`title_priority` int(3) default 1,
UNIQUE KEY `division_id` (`division_id`),
KEY `gid` (`gid`)
);
Expand Down
4 changes: 4 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
rich-click==1.7.4
pandas==2.2.1
pyarrow==15.0.2
mysqlclient==2.2.4
322 changes: 322 additions & 0 deletions scripts/division_io.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,322 @@
#!/usr/bin/env python3
# encoding: utf-8
"""
division_io.py - Import and export division data from the TWFY database.
See python scripts/division_io.py --help for usage.
"""

import re
from enum import Enum
from pathlib import Path
from typing import cast
from warnings import filterwarnings

import MySQLdb
import pandas as pd
import rich_click as click
from rich import print
from rich.prompt import Prompt

# suppress warnings about using mysqldb in pandas
filterwarnings(
"ignore",
category=UserWarning,
message=".*pandas only supports SQLAlchemy connectable.*",
)


class TitlePriority(str, Enum):
ORIGINAL_HEADER = "ORIGINAL_HEADER"
PARLIAMENT_DESCRIBED = "PARLIAMENT_DESCRIBED"
MANUAL = "MANUAL"

@classmethod
def get_priority(cls, priority: str) -> int:

lookup = {
cls.ORIGINAL_HEADER: 1,
cls.PARLIAMENT_DESCRIBED: 5,
cls.MANUAL: 10,
}
return lookup[priority]


@click.group()
def cli():
pass


def fast_config(config_path: Path) -> dict[str, str]:
"""
There's a more comprehensive config parser in commonlib/pylib/mysociety/config.py
But this is all we need for this function.
"""
pattern = r"define\s*\(\s*['\"](.*?)['\"]\s*,\s*['\"]?(.*?)['\"]?\s*\);"

with config_path.open("r") as f:
content = f.read()

return {key: value for key, value in re.findall(pattern, content)}


def get_twfy_db_connection() -> MySQLdb.Connection:
config_path = Path("conf", "general")
config = fast_config(config_path)

db_connection = cast(
MySQLdb.Connection,
MySQLdb.connect(
host=config["OPTION_TWFY_DB_HOST"],
db=config["OPTION_TWFY_DB_NAME"],
user=config["OPTION_TWFY_DB_USER"],
passwd=config["OPTION_TWFY_DB_PASS"],
charset="utf8",
),
)
return db_connection


def df_to_db(df: pd.DataFrame, *, new_priority: TitlePriority, verbose: bool = False):
"""
Take a dataframe with:
- a divison_id column or a twfy_votes_url column.
- division_title column
- optionally, yes_text and no_text.
And load these descriptions into the divisions table.
"""

if "twfy_votes_url" in df.columns:
df["division_id"] = df["twfy_votes_url"].apply(twfy_votes_url_to_pw_id)

if "division_id" not in df.columns:
raise ValueError(
"Dataframe to uplaod needs division_id or twfy_votes_url column"
)

if "division_title" not in df.columns:
raise ValueError("Dataframe needs a division_title column")

optional_col_count = 0

if "yes_text" in df.columns:
optional_col_count += 1

if "no_text" in df.columns:
optional_col_count += 1

if optional_col_count == 1:
raise ValueError("Dataframe either needs both yes_text and no_text or neither.")

db_connection = get_twfy_db_connection()

# get all divisions with a title_priority below or equal to current priority
existing_df = pd.read_sql(
f"SELECT division_id, title_priority FROM divisions",
db_connection,
)
existing_df["int_title_priority"] = existing_df["title_priority"].apply(
TitlePriority.get_priority
)
existing_df = existing_df[
existing_df["int_title_priority"] <= TitlePriority.get_priority(new_priority)
]

# limit dataframe to just those that can be updated

count_before = len(df)
df = df[df["division_id"].isin(existing_df["division_id"])]
count_removed = count_before - len(df)
if verbose:
print(
f"[blue]Updating titles for {len(df)} divisions - ignoring {count_removed} due to priority or absence."
)
if optional_col_count == 0:
# update the division_title column in the database
update_command = "UPDATE divisions SET division_title = %s, title_priority = %s WHERE division_id = %s"

with db_connection.cursor() as cursor:
update_data = [
(row["division_title"], new_priority, row["division_id"])
for _, row in df.iterrows()
]
cursor.executemany(update_command, update_data)
db_connection.commit()
if verbose:
print(f"[green]{len(df)} rows updated.")
elif optional_col_count == 2:
# update the division_title, yes_text, and no_text columns in the database
update_command = "UPDATE divisions SET division_title = %s, title_priority = %s, yes_text = %s, no_text = %s WHERE division_id = %s"

with db_connection.cursor() as cursor:
update_data = [
(
row["division_title"],
new_priority,
row["yes_text"],
row["no_text"],
row["division_id"],
)
for _, row in df.iterrows()
]
cursor.executemany(update_command, update_data)
db_connection.commit()
if verbose:
print(f"[green]{len(df)} rows updated.")

db_connection.close()


def url_to_db(url: str, *, new_priority: TitlePriority):
"""
Pipe external URL into the update process.
"""

if url.endswith(".parquet"):
df = pd.read_parquet(url)
elif url.endswith(".csv"):
df = pd.read_csv(url)
elif url.endswith(".json"):
df = pd.read_json(url)
else:
raise ValueError("File not an allowed type (csv, json, or parquet)")

df_to_db(df, new_priority=new_priority)


def run_schema_update(verbose: bool = False):
"""
Check if the title_priority priority column exists in the divisions table
and if not, run db/0023-add-division-title-priority.sql
"""

db_connection = get_twfy_db_connection()
df = pd.read_sql("SELECT * FROM divisions limit 1", db_connection)
if "title_priority" in df.columns:
if verbose:
print("[blue]Schema already updated[/blue]")
return

update_command = Path("db", "0023-add-division-title-priority.sql").read_text()

# Execute the SQL update command
with db_connection.cursor() as cursor:
cursor.execute(update_command)
db_connection.commit()
if verbose:
print("[green]Schema updated successfully[/green]")
db_connection.close()


def twfy_votes_url_to_pw_id(url: str) -> str:
"""
Take a twfy_votes style URL (/decisions/division/commons/2024-01-10/37)
and put it in the pw id format
"""
parts = url.split("/")
division_no = parts[-1]
date = parts[-2]
chamber = parts[-3]
return f"pw-{date}-{division_no}-{chamber}"


@cli.command()
@click.option("--verbose", is_flag=True, help="Show verbose output")
def export_division_data(verbose: bool = False):
"""
Export division data to publically accessible parquet files
"""
dest_path = Path("data", "pwdata", "votes")
dest_path.mkdir(parents=True, exist_ok=True)

db_connection = get_twfy_db_connection()
# get divisions
df = pd.read_sql("SELECT * FROM divisions", db_connection)
df.to_parquet(dest_path / "divisions.parquet", index=False)
if verbose:
print(f"[green]Divisions written to {dest_path / 'divisions.parquet'}[/green]")
# get votes
df = pd.read_sql("SELECT * FROM persondivisionvotes", db_connection)
df.to_parquet(dest_path / "votes.parquet", index=False)
if verbose:
print(f"[green]Votes written to {dest_path / 'votes.parquet'}[/green]")

db_connection.close()


@cli.command()
@click.option("--verbose", is_flag=True, help="Show verbose output")
def update_schema(verbose: bool = False):
"""
Run schema update to support division titles if necessary
"""
run_schema_update(verbose=verbose)


@cli.command()
def update_division_title():
"""
Manually update a single division.
"""
item = {
"division_id": Prompt.ask("Division id (pw style)"),
"division_title": Prompt.ask("Division title"),
"yes_text": Prompt.ask(
"Division yes text (optional)", default="", show_default=False
),
"no_text": Prompt.ask(
"Division no text (optional)", default="", show_default=False
),
}

if item["yes_text"] == "":
del item["yes_text"]
if item["no_text"] == "":
del item["no_text"]
df = pd.DataFrame([item])

df_to_db(df, new_priority=TitlePriority.MANUAL)


@cli.command()
@click.option(
"--url",
prompt="Link to URL of manual update data",
help="A csv, json or parquet file to update division information from.",
)
@click.option("--verbose", is_flag=True, help="Show verbose output")
def update_from_url(url: str, verbose: bool = False):
"""
Fetch a bulk update file from a remote URL - assumes these are manual updates
and so take priority over other approaches.
"""
url_to_db(url, new_priority=TitlePriority.MANUAL, verbose=verbose)


@cli.command()
@click.option("--verbose", is_flag=True, help="Show verbose output")
def update_from_commons_votes(verbose: bool = False):
"""
Import Commons votes description names into local database.
Mid priority - higher than header based approaches, lower than manual approaches.
"""

df = pd.read_parquet(
"https://pages.mysociety.org/voting-data/data/commons_votes/latest/divisions.parquet"
)

df = df[["division_key", "title"]].rename(
columns={"division_key": "division_id", "title": "division_title"}
)

df_to_db(df, new_priority=TitlePriority.PARLIAMENT_DESCRIBED, verbose=verbose)


def main():
cli()


if __name__ == "__main__":
main()
10 changes: 9 additions & 1 deletion scripts/morningupdate
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,15 @@ system "./xml2db.pl $cronquiet --recent --scotwrans --quiet";
system "./xml2db.pl $cronquiet --recent --scotqs --quiet";
system "./xml2db.pl $cronquiet --recent --standing --quiet";
system "./xml2db.pl $cronquiet --recent --lmqs --quiet";
system "python3 ./future-fetch.py";
system "../.venv/bin/python future-fetch.py";

# Update division names from Parliament
print "Updating division names\n" if $verbose;
system "../.venv/bin/python division_io.py update-from-commons-votes" . ($verbose ? " --verbose" : "");

# Export divisions to parquet
print "Exporting divisions to parquet\n" if $verbose;
system "../.venv/bin/python division_io.py export-division-data" . ($verbose ? " --verbose" : "");

$cronquiet = substr($cronquiet, 2) if $cronquiet;

Expand Down

0 comments on commit 9bc5f66

Please sign in to comment.