
London assembly scraper #167

Open
wants to merge 12 commits into `master`
22 changes: 22 additions & 0 deletions .devcontainer/devcontainer.json
@@ -0,0 +1,22 @@
{
    "name": "parlparse",
    "build": {
        "dockerfile": "../Dockerfile"
    },
    "workspaceFolder": "/workspaces/parlparse",
    "customizations": {
        "vscode": {
            "extensions": [
                "ms-vscode.test-adapter-converter",
                "ms-azuretools.vscode-docker",
                "bmewburn.vscode-intelephense-client",
                "bungcip.better-toml",
                "ms-python.python",
                "ms-python.vscode-pylance",
                "charliermarsh.ruff",
                "mhutchie.git-graph"
            ]
        }
    }
}
25 changes: 25 additions & 0 deletions .github/workflows/mirror.yml
@@ -0,0 +1,25 @@
name: Push mirror to git.mysociety.org

on:
  push:
  workflow_dispatch:

jobs:
  sync:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          fetch-depth: '0'

      - name: Push branch to git.mysociety.org
        id: push_to_mirror
        uses: mysociety/[email protected]
        with:
          git_ssh_key: ${{ secrets.PUBLICCVS_GIT_KEY }}
          ssh_known_hosts: ${{ secrets.GIT_KNOWN_HOSTS }}
          tag: ${{ github.ref_name }}
          remote: 'ssh://[email protected]/data/git/public/parlparse.git'
44 changes: 44 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,44 @@
{
    "[python]": {
        "editor.formatOnSave": true,
        "editor.codeActionsOnSave": {
            "source.fixAll.ruff": true,
            "source.organizeImports.ruff": true
        }
    },
    "python.linting.pylintEnabled": false,
    "python.defaultInterpreterPath": "/usr/bin/python",
    "python.terminal.activateEnvironment": false,
    "python.formatting.provider": "black",
    "python.analysis.typeCheckingMode": "basic",
    "editor.formatOnSave": true,
    "files.exclude": {
        "**/.git": true,
        "**/.svn": true,
        "**/.hg": true,
        "**/CVS": true,
        "**/.DS_Store": true,
        "**/*.pyc": {
            "when": "$(basename).py"
        },
        "**/__pycache__": true
    },
    "files.associations": {
        "**/*.html": "html",
        "**/templates/**/*.html": "django-html",
        "**/templates/**/*": "django-txt",
        "**/requirements{/**,*}.{txt,in}": "pip-requirements"
    },
    "[markdown]": {
        "editor.quickSuggestions": {
            "comments": "on",
            "strings": "on",
            "other": "on"
        }
    },
    "python.testing.pytestArgs": [
        "tests/"
    ],
    "python.testing.unittestEnabled": false,
    "python.testing.pytestEnabled": true
}
8 changes: 8 additions & 0 deletions Dockerfile
@@ -0,0 +1,8 @@
FROM mysocietyorg/debian:buster
RUN apt-get update && \
apt-get install python3-distutils python3-pip libxml2-dev libxslt-dev python-dev libffi-dev -y && \
update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \
update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 && \
pip install --upgrade pip
COPY requirements.txt requirements.dev.txt /tmp/
RUN pip install -r /tmp/requirements.txt -r /tmp/requirements.dev.txt
1 change: 1 addition & 0 deletions london-mayors-questions/.gitignore
@@ -1 +1,2 @@
cache.sqlite
json_cache
16 changes: 9 additions & 7 deletions london-mayors-questions/README.md
@@ -1,15 +1,17 @@
# London Mayor's Questions

## How it works

You can add `--help` to any command for a full list of options.

### Scraping/Parsing Meetings
The scraper stores working files in a git-ignored `json_cache` directory.

- `fetch-unknown-questions` - accepts custom start and end dates; the `--last-week` flag goes back 7 days. Updates `json_cache/ids.json` with the current known set of question ids.
- `fetch-unstored` - fetches any questions we haven't previously downloaded.
- `refresh-unanswered` - downloads again any questions without answers (or with holding answers only).
- `build-xml` - accepts an `--outdir` argument and converts the stored questions into XML files for import. Each question is filed by the date of its answer, so a question only moves into the XML once it has been answered.

So the final command to call looks something like this:

```bash
./questions.py fetch-unknown-questions --last-week fetch-unstored refresh-unanswered build-xml --outdir temp/
```

### Scraping/Parsing Questions
20 changes: 17 additions & 3 deletions london-mayors-questions/config.json
@@ -3,7 +3,21 @@
    "xml_file_prefix": "lmqs",
    "assembly_domain": "https://www.london.gov.uk",
    "public_whip_question_id_prefix": "uk.org.publicwhip/london-mayors-questions/",
    "current_mayor_name": "Sadiq Khan",
    "office_map": {
        "Mayor of London": "Mayor of London",
        "The Mayor": "Mayor of London",
        "London Fire Commissioner": "London Fire Commissioner",
        "Commissioner of London Fire Brigade": "London Fire Commissioner",
        "Deputy Mayor, Fire and Resilience": "Deputy Mayor for Fire and Resilience",
        "Deputy Mayor, Children and Families": "Deputy Mayor for Children and Young People",
        "Deputy Mayor, Planning, Regeneration and Skills": "Deputy Mayor for Planning, Regeneration and Skills",
        "Deputy Mayor, Culture and the Creative Industries": "Deputy Mayor for Culture and Creative Industries",
        "Deputy Mayor, Communities and Social Justice": "Deputy Mayor for Communities and Social Justice",
        "Deputy Mayor, Business": "Deputy Mayor for Business",
        "Deputy Mayor, Environment and Energy": "Deputy Mayor for Environment and Energy",
        "Deputy Mayor, Policing and Crime": "Deputy Mayor for Policing and Crime",
        "Deputy Mayor, Housing and Residential Development": "Deputy Mayor for Housing and Residential Development"
    },
    "name_regex_to_strip": [
        " AM$",
        " [CMO]BE$",
@@ -12,7 +26,7 @@
        " \\(Deputy Chair\\)$",
        " \\(Deputy Chairman\\)$",
        " \\(Deputy Chairman in the Chair\\)$",
        " \\(Mayor of London\\)$",
        " \\(Deputy Chair in the Chair\\)$",
        "^Dr ",
        " MP$"
    ],
@@ -23,4 +37,4 @@
        "Steve O\u2019Connell": "Steve O'Connell",
        "Sian Berry": "Siân Berry"
    }
}
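The `office_map`, `name_regex_to_strip`, and `name_corrections` entries above drive a normalisation step: raw labels from the scraped pages are mapped to canonical office names, and honorifics are stripped from member names before correction. A minimal sketch of how such a pass might work, using a small subset of the config values; the function names here are illustrative, not the scraper's actual API:

```python
import re

# Subset of the config.json mappings shown above.
OFFICE_MAP = {
    "The Mayor": "Mayor of London",
    "Deputy Mayor, Business": "Deputy Mayor for Business",
}
NAME_REGEX_TO_STRIP = [" AM$", " [CMO]BE$", "^Dr ", " MP$"]
NAME_CORRECTIONS = {"Sian Berry": "Siân Berry"}


def normalise_office(raw: str) -> str:
    # Fall back to the raw label when no mapping exists.
    return OFFICE_MAP.get(raw, raw)


def normalise_name(raw: str) -> str:
    # Strip each honorific pattern in turn, then apply spelling corrections.
    name = raw
    for pattern in NAME_REGEX_TO_STRIP:
        name = re.sub(pattern, "", name)
    return NAME_CORRECTIONS.get(name, name)


print(normalise_office("The Mayor"))       # Mayor of London
print(normalise_name("Dr Sian Berry AM"))  # Siân Berry
```

Unmapped offices pass through unchanged, which keeps the scraper tolerant of labels the config hasn't seen yet.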
117 changes: 117 additions & 0 deletions london-mayors-questions/london_mayors_questions/__main__.py
@@ -0,0 +1,117 @@
import datetime
from pathlib import Path
from typing import Optional

import rich_click as click

from .models import QuestionCollection

CLI_DATETIME_FORMAT = click.DateTime(formats=("%Y-%m-%d",))


@click.group(chain=True)
def cli():
    pass


@cli.command()
@click.option(
    "-s",
    "--start",
    type=CLI_DATETIME_FORMAT,
    help="The first date of the range to be scraped. (defaults to config's start date)",
)
@click.option(
    "-e",
    "--end",
    type=CLI_DATETIME_FORMAT,
    help="The last date of the range to be scraped. (defaults to today)",
)
@click.option("--last-week", is_flag=True, help="Scrape the last week")
@click.pass_context
def fetch_unknown_questions(
    context: click.Context,
    start: Optional[datetime.datetime] = None,
    end: Optional[datetime.datetime] = None,
    last_week: bool = False,
):
    """
    Update our list of known ids
    """
    qc = QuestionCollection()
    if last_week:
        start = datetime.datetime.now() - datetime.timedelta(days=7)
        end = datetime.datetime.now()
    qc.get_ids_for_date_range(start, end)
    qc.fetch_unstored_questions()


@cli.command()
@click.pass_context
def refresh_unanswered(context: click.Context):
    """
    Fetch all questions that have not been answered
    """
    QuestionCollection().get_unanswered_questions()


@cli.command()
@click.pass_context
def fetch_unstored(context: click.Context):
    """
    Fetch all questions that are known about but not downloaded.
    """
    QuestionCollection().fetch_unstored_questions()


@cli.command()
@click.option(
    "-o",
    "--outdir",
    type=click.Path(exists=False, file_okay=False, dir_okay=True),
    help="The directory to save the XML files to.",
    default=".",
)
@click.option(
    "-s",
    "--start",
    type=CLI_DATETIME_FORMAT,
    help="The first date of the range to be scraped.",
)
@click.option(
    "-e",
    "--end",
    type=CLI_DATETIME_FORMAT,
    help="The last date of the range to be scraped.",
)
@click.pass_context
def build_xml(
    context: click.Context,
    outdir: str,
    start: Optional[datetime.datetime] = None,
    end: Optional[datetime.datetime] = None,
):
    """
    Build the XML file
    """
    path = Path(outdir)

    if not path.exists():
        # create the directory and any missing parents
        path.mkdir(parents=True)
    if not path.is_dir():
        raise click.ClickException("outdir must be a directory")
    QuestionCollection().export_answers_to_xml(path, start, end)


@cli.command()
@click.pass_context
def unanswered_count(context: click.Context):
    """
    Get the number of unanswered questions
    """
    QuestionCollection().get_unanswered_count()


if __name__ == "__main__":
    cli()
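The `@click.group(chain=True)` decorator is what lets the README run several subcommands in a single invocation. A minimal self-contained sketch of the same pattern using plain `click` (the module above uses `rich_click`, which wraps the same API); the command names here are illustrative only:

```python
import click
from click.testing import CliRunner


@click.group(chain=True)
def demo():
    pass


@demo.command()
def step_one():
    # click derives the CLI name "step-one" from the function name
    click.echo("step one ran")


@demo.command()
def step_two():
    click.echo("step two ran")


# With chain=True, one invocation runs both subcommands in order,
# just like the chained command line shown in the README.
result = CliRunner().invoke(demo, ["step-one", "step-two"])
print(result.output)
```

Each subcommand runs in sequence against the same group, which is why options like `--outdir` are declared per-command rather than on the group.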
33 changes: 33 additions & 0 deletions london-mayors-questions/london_mayors_questions/config.py
@@ -0,0 +1,33 @@
"""
Lightweight configuration file loader.
"""

import json
import sys
from functools import lru_cache
from pathlib import Path
from typing import Dict, List

# if you are using Python 3.8 or later, you can use the built-in TypedDict
if sys.version_info >= (3, 8):
    from typing import TypedDict
else:
    from typing_extensions import TypedDict


class ConfigDict(TypedDict):
    assembly_domain: str
    default_start_date: str
    public_whip_question_id_prefix: str
    office_map: Dict[str, str]
    name_regex_to_strip: List[str]
    name_corrections: Dict[str, str]
    xml_file_prefix: str


@lru_cache(maxsize=None)
def get_config() -> ConfigDict:
    # Load and parse the configuration file
    config_path = Path(__file__).parent.parent / "config.json"
    with config_path.open() as config_json_file:
        return json.load(config_json_file)
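Because `get_config` is wrapped in `lru_cache`, the JSON file is read and parsed once; every later call returns the same cached dict. A self-contained sketch of the same pattern, with a hypothetical stand-in config written to a temp directory rather than the real `config.json`:

```python
import json
import tempfile
from functools import lru_cache
from pathlib import Path

# Hypothetical stand-in for config.json, written to a temp directory.
config_path = Path(tempfile.mkdtemp()) / "config.json"
config_path.write_text(json.dumps({"xml_file_prefix": "lmqs"}))


@lru_cache(maxsize=None)
def get_config() -> dict:
    # The file is read and parsed only on the first call ...
    with config_path.open() as f:
        return json.load(f)


# ... every later call returns the same cached dict object.
first = get_config()
second = get_config()
print(first is second)  # True
print(first["xml_file_prefix"])  # lmqs
```

One consequence of the caching is that edits to `config.json` made while the process is running are not picked up; for a batch scraper that is the desired behaviour.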