diff --git a/.gitignore b/.gitignore
index 761ae535..78f6c206 100644
--- a/.gitignore
+++ b/.gitignore
@@ -114,3 +114,8 @@ ENV/
# http_cache
http_cache.sqlite
+
+# JupyterNotebooks
+.ipynb_checkpoints/
+.virtual_documents/
+.jupyter
diff --git a/notebooks/CineworldCinemaListings.ipynb b/notebooks/CineworldCinemaListings.ipynb
new file mode 100644
index 00000000..2b6132f2
--- /dev/null
+++ b/notebooks/CineworldCinemaListings.ipynb
@@ -0,0 +1,357 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "1282777f-6bb2-4e3b-8756-a2c41ef36cf5",
+ "metadata": {},
+ "source": [
+ "# Cineworld Listings\n",
+ "\n",
+ "I like weird movies coming to IMAX, but I don't like watching newspapers etc. to see Belfast announce them.\n",
+ "\n",
+ "## Basic mode\n",
+ "\n",
+ "Yup, turns out Cineworld just checks based on User-Agent, and overall the API appears to be even more basic than [what this one looked like](https://github.com/oracal/cineworld).\n",
+ "\n",
+ "Easy observations:\n",
+ "\n",
+ "* Films are uniquely identified by `id`, which is also persisted in the rest of the API calls (see the `link` attribute)\n",
+ "* `117` is Belfast's site code\n",
+ "* No clue what the 10108 is."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 92,
+ "id": "e703ee34-9023-4cda-bd47-fb0a1e37d6a7",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'id': 'ho00011281',\n",
+ " 'name': 'Ghilli (Re-release) (Tamil)',\n",
+ " 'length': 164,\n",
+ " 'posterLink': 'https://regalcdn.azureedge.net/CW/GhilliRereleaseTamil/HO00011281/TV_SmallPosterImage/20240415-120139903.jpg',\n",
+ " 'videoLink': 'https://youtu.be/4aGEjyn-bPQ?si=CgXZSe1WH5Cc4292',\n",
+ " 'link': 'https://www.cineworld.co.uk/films/ghilli-re-release-tamil/ho00011281',\n",
+ " 'weight': 10,\n",
+ " 'releaseYear': '2024',\n",
+ " 'attributeIds': ['12a',\n",
+ " '2d',\n",
+ " 'action',\n",
+ " 'drama',\n",
+ " 'reserved-selected',\n",
+ " 'subbed']}"
+ ]
+ },
+ "execution_count": 92,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import requests\n",
+ "from datetime import date\n",
+ "\n",
+ "headers = {\n",
+ " \"User-Agent\":\"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:124.0) Gecko/20100101 Firefox/124.0\",\n",
+ "}\n",
+ "\n",
+ "requests.get(f\"https://www.cineworld.co.uk/uk/data-api-service/v1/quickbook/10108/film-events/in-cinema/117/at-date/{date.today().isoformat()}\", \n",
+ " headers=headers).json()['body']['films'][0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 93,
+ "id": "3ba62447-de46-4d81-bf82-3d68724ec9b8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'Ghilli (Re-release) (Tamil)'"
+ ]
+ },
+ "execution_count": 93,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from datetime import date\n",
+ "def get_cinema_listing_for(screening_date = None, site_code=117):\n",
+ " if screening_date is None:\n",
+ " screening_date = str(date.today())\n",
+ " elif isinstance(screening_date, date):\n",
+ " screening_date = str(screening_date)\n",
+ " else:\n",
+ " raise ValueError(\"Can only use date-type with screening_date\")\n",
+ " response = requests.get(f\"https://www.cineworld.co.uk/uk/data-api-service/v1/quickbook/10108/film-events/in-cinema/{site_code}/at-date/{screening_date}\", headers=headers)\n",
+ " response.raise_for_status()\n",
+ " return response.json()['body']['films']\n",
+ " \n",
+ "listings = get_cinema_listing_for()\n",
+ "listings[0]['name']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 96,
+ "id": "36cb9b38-bd63-47ba-96e6-8bebd5530071",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "dict_keys(['id', 'name', 'length', 'posterLink', 'videoLink', 'link', 'weight', 'releaseYear', 'attributeIds', 'date'])"
+ ]
+ },
+ "execution_count": 96,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "listings[0].keys()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 94,
+ "id": "44eabb06-0609-431f-86a1-820f00380c54",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "d082902f542a42b2bd60dbaa8c14088f",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ " 0%| | 0/28 [00:00, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "from tqdm.auto import tqdm\n",
+ "\n",
+ "listings = []\n",
+ "\n",
+ "for d in tqdm(pd.date_range('today', freq='D', periods=28)):\n",
+ " _d = d.date()\n",
+ " for listing in get_cinema_listing_for(_d):\n",
+ " listing['date'] = _d\n",
+ " listings.append(listing) "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 84,
+ "id": "639a199b-eed6-42a8-9783-a98d8256e1e4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = pd.DataFrame(listings)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "638408b7-0a70-4e10-b523-facbc69f38ce",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 85,
+ "id": "d8b47fa9-80a6-426a-b3fc-2552be84a35f",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " name | \n",
+ " date | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 2 | \n",
+ " Civil War | \n",
+ " 2024-04-15 | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " Civil War | \n",
+ " 2024-04-16 | \n",
+ "
\n",
+ " \n",
+ " 27 | \n",
+ " Civil War | \n",
+ " 2024-04-17 | \n",
+ "
\n",
+ " \n",
+ " 42 | \n",
+ " Civil War | \n",
+ " 2024-04-18 | \n",
+ "
\n",
+ " \n",
+ " 54 | \n",
+ " (IMAX) Hunger Games: Ballad Of Songbirds & Snakes | \n",
+ " 2024-04-20 | \n",
+ "
\n",
+ " \n",
+ " 55 | \n",
+ " (IMAX) Oppenheimer | \n",
+ " 2024-04-20 | \n",
+ "
\n",
+ " \n",
+ " 56 | \n",
+ " (IMAX) Spider-Man: Across The Spider-Verse | \n",
+ " 2024-04-20 | \n",
+ "
\n",
+ " \n",
+ " 57 | \n",
+ " (IMAX) The Super Mario Bros. Movie | \n",
+ " 2024-04-20 | \n",
+ "
\n",
+ " \n",
+ " 67 | \n",
+ " SPY x FAMILY CODE: White (Subtitled) | \n",
+ " 2024-04-27 | \n",
+ "
\n",
+ " \n",
+ " 70 | \n",
+ " SPY x FAMILY CODE: White (Subtitled) | \n",
+ " 2024-04-28 | \n",
+ "
\n",
+ " \n",
+ " 73 | \n",
+ " The Fall Guy | \n",
+ " 2024-05-02 | \n",
+ "
\n",
+ " \n",
+ " 75 | \n",
+ " The Fall Guy | \n",
+ " 2024-05-03 | \n",
+ "
\n",
+ " \n",
+ " 77 | \n",
+ " The Fall Guy | \n",
+ " 2024-05-04 | \n",
+ "
\n",
+ " \n",
+ " 80 | \n",
+ " The Fall Guy | \n",
+ " 2024-05-05 | \n",
+ "
\n",
+ " \n",
+ " 81 | \n",
+ " The Fall Guy | \n",
+ " 2024-05-06 | \n",
+ "
\n",
+ " \n",
+ " 82 | \n",
+ " Kingdom Of The Planet Of The Apes | \n",
+ " 2024-05-09 | \n",
+ "
\n",
+ " \n",
+ " 84 | \n",
+ " Kingdom Of The Planet Of The Apes | \n",
+ " 2024-05-10 | \n",
+ "
\n",
+ " \n",
+ " 85 | \n",
+ " Kingdom Of The Planet Of The Apes | \n",
+ " 2024-05-11 | \n",
+ "
\n",
+ " \n",
+ " 86 | \n",
+ " Kingdom Of The Planet Of The Apes | \n",
+ " 2024-05-12 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " name date\n",
+ "2 Civil War 2024-04-15\n",
+ "15 Civil War 2024-04-16\n",
+ "27 Civil War 2024-04-17\n",
+ "42 Civil War 2024-04-18\n",
+ "54 (IMAX) Hunger Games: Ballad Of Songbirds & Snakes 2024-04-20\n",
+ "55 (IMAX) Oppenheimer 2024-04-20\n",
+ "56 (IMAX) Spider-Man: Across The Spider-Verse 2024-04-20\n",
+ "57 (IMAX) The Super Mario Bros. Movie 2024-04-20\n",
+ "67 SPY x FAMILY CODE: White (Subtitled) 2024-04-27\n",
+ "70 SPY x FAMILY CODE: White (Subtitled) 2024-04-28\n",
+ "73 The Fall Guy 2024-05-02\n",
+ "75 The Fall Guy 2024-05-03\n",
+ "77 The Fall Guy 2024-05-04\n",
+ "80 The Fall Guy 2024-05-05\n",
+ "81 The Fall Guy 2024-05-06\n",
+ "82 Kingdom Of The Planet Of The Apes 2024-05-09\n",
+ "84 Kingdom Of The Planet Of The Apes 2024-05-10\n",
+ "85 Kingdom Of The Planet Of The Apes 2024-05-11\n",
+ "86 Kingdom Of The Planet Of The Apes 2024-05-12"
+ ]
+ },
+ "execution_count": 85,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df[df['attributeIds'].apply(lambda l: 'imax' in l)][['name','date']]"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/poetry.lock b/poetry.lock
index dd0284ed..127a0162 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -3371,6 +3371,25 @@ files = [
[package.extras]
watchmedo = ["PyYAML (>=3.10)"]
+[[package]]
+name = "waybackpy"
+version = "3.0.6"
+description = "Python package that interfaces with the Internet Archive's Wayback Machine APIs. Archive pages and retrieve archived pages easily."
+optional = false
+python-versions = ">=3.6"
+files = [
+ {file = "waybackpy-3.0.6-py3-none-any.whl", hash = "sha256:c568b0db9056fbe42a1a7e56b4f1d1919bd3f76bd62da58d9ee2e577297be284"},
+ {file = "waybackpy-3.0.6.tar.gz", hash = "sha256:497a371756aba7644eb7ada0ebd4edb15cb8c53bc134cc973bf023a12caff83f"},
+]
+
+[package.dependencies]
+click = "*"
+requests = "*"
+urllib3 = "*"
+
+[package.extras]
+dev = ["black", "codecov", "flake8", "mypy", "pytest", "pytest-cov", "setuptools (>=46.4.0)", "types-requests"]
+
[[package]]
name = "wcwidth"
version = "0.2.13"
@@ -3444,4 +3463,4 @@ docs = ["Sphinx", "autoapi", "nbsphinx", "sphinx-autoapi", "sphinx-autodoc-typeh
[metadata]
lock-version = "2.0"
python-versions = ">=3.8.0,<3.12"
-content-hash = "07f2ace43e0593fa956a60823a9de181325222d4195b897b320fed252de2491a"
+content-hash = "255b1c5841ae87d2a68e51ab50ff7bb69f48643e3ff57a7d787f12a943fc7c54"
diff --git a/pyproject.toml b/pyproject.toml
index dbaf8a16..757a7f96 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -49,6 +49,7 @@ lxml = "^5.0.0"
# urllib3 2.0 is not compatible with poetry-export-plugin
# https://github.com/python-poetry/poetry-plugin-export/issues/239
urllib3 = ">=1.26,<2"
+waybackpy = "^3.0.6"
[tool.poetry.scripts]
bolster = "bolster.cli:main"
diff --git a/src/bolster/data_sources/cineworld.py b/src/bolster/data_sources/cineworld.py
new file mode 100644
index 00000000..de0ee9c7
--- /dev/null
+++ b/src/bolster/data_sources/cineworld.py
@@ -0,0 +1,60 @@
+"""
+This module provides functions to retrieve cinema listings from the Cineworld API.
+
+The main function in this module is `get_cinema_listings`, which takes a site code and a screening date as input and returns a list of dictionaries, one per film listed at that site on that date.
+
+Site Code 117 maps to Belfast, you're on your own for the rest.
+
+Example usage:
+ cinema_listings = get_cinema_listings(117)
+ list(cinema_listings[0].keys())
+ # Output: ['id', 'name', 'length', 'posterLink', 'videoLink', 'link', 'weight', 'releaseYear', 'attributeIds', 'date', 'site_code']
+
+"""
+from datetime import date
+from typing import Optional
+
+from ..utils.web import session
+
+
+def get_cinema_listings(
+    site_code: int = 117, screening_date: Optional[date] = None
+) -> list:
+    """
+    Get cinema listings from the Cineworld API.
+
+    Args:
+        site_code (int): The site code of the cinema. Defaults to 117; Belfast
+        screening_date (date | str | None): The date for which to retrieve the
+            listings, either a date object or an ISO 'YYYY-MM-DD' string.
+            Defaults to today's date, resolved each time the function is called.
+
+    Returns:
+        list: One dictionary per film as returned by the API, each extended
+            with the ``date`` and ``site_code`` that were requested.
+
+    Raises:
+        ValueError: If screening_date is neither a date object nor an ISO-format date string.
+        requests.exceptions.RequestException: If there was an error making the API request.
+
+    >>> cinema_listings = get_cinema_listings(117)
+    >>> list(cinema_listings[0].keys())
+    ['id', 'name', 'length', 'posterLink', 'videoLink', 'link', 'weight', 'releaseYear', 'attributeIds', 'date', 'site_code']
+
+    """
+    # Resolve "today" per call: a `date.today()` default in the signature is
+    # evaluated once at import time and goes stale in a long-running process.
+    if screening_date is None:
+        screening_date = date.today()
+
+    if not isinstance(screening_date, date):
+        try:
+            screening_date = date.fromisoformat(screening_date)
+        except (TypeError, ValueError) as e:
+            # fromisoformat raises TypeError for non-string input and
+            # ValueError for a malformed string; report both the same way.
+            raise ValueError(
+                "screening_date must be a date object or a string in the format 'YYYY-MM-DD'"
+            ) from e
+
+    url = f"https://www.cineworld.co.uk/uk/data-api-service/v1/quickbook/10108/film-events/in-cinema/{site_code}/at-date/{screening_date}"
+
+    # Request errors propagate to the caller unchanged. The previous
+    # `except requests.exceptions.RequestException: raise` was a no-op that
+    # referenced `requests`, which this module does not import.
+    response = session.get(url)
+    response.raise_for_status()
+    listings = response.json()["body"]["films"]
+    for film in listings:
+        film["date"] = screening_date
+        film["site_code"] = site_code
+    return listings
diff --git a/src/bolster/utils/web.py b/src/bolster/utils/web.py
index c0d75d84..769a8a85 100644
--- a/src/bolster/utils/web.py
+++ b/src/bolster/utils/web.py
@@ -1,14 +1,46 @@
import io
+import logging
import zipfile
from io import BytesIO
import pandas as pd
import requests
+from waybackpy import exceptions
+from waybackpy import WaybackMachineCDXServerAPI
from . import version_no
ua = f"@Bolster/{version_no} (+http://bolster.online/)"
+# Shared requests session whose default headers carry the Bolster User-Agent string defined above.
+session = requests.Session()
+session.headers.update({"User-Agent": ua})
+
+
+def get_last_valid(url):
+    """
+    Return the Wayback Machine archive URL of a snapshot of ``url``.
+
+    NOTE(review): the snapshot is selected with ``.oldest()``, which does not
+    obviously match "last valid" in the function name; confirm whether the
+    most recent capture was intended.
+    """
+    snapshot = WaybackMachineCDXServerAPI(url).oldest()
+    return snapshot.archive_url
+
+
+def resilient_get(url, **kwargs):
+    """
+    GET ``url``, falling back to a Wayback Machine snapshot on an HTTP error.
+
+    The URL is requested directly first. If that raises ``requests.HTTPError``,
+    the archive URL from ``get_last_valid`` is requested instead with the same
+    ``kwargs``, and a warning is logged once that request succeeds. When no
+    archived record exists, the original ``HTTPError`` is re-raised, chained
+    from the ``NoCDXRecordFound`` error.
+    """
+    try:
+        direct = requests.get(url, **kwargs)
+        direct.raise_for_status()
+        return direct
+    except requests.HTTPError as outer_err:
+        try:
+            last_valid = get_last_valid(url)
+        except exceptions.NoCDXRecordFound as inner_err:
+            raise outer_err from inner_err
+        # Still inside the handler, so a failure of the archived request stays
+        # implicitly chained to the original HTTP error.
+        archived = requests.get(last_valid, **kwargs)
+        archived.raise_for_status()
+        logging.warning(
+            f"Failed to get {url} directly, successfully used waybackmachine to get {last_valid}"
+        )
+        return archived
+
def get_excel_dataframe(file_url, requests_kwargs=None, read_kwargs=None):
if requests_kwargs is None:
@@ -28,7 +60,7 @@ def download_extract_zip(url):
Download a ZIP file and extract its contents in memory
yields (filename, file-like object) pairs
"""
- with requests.get(url, stream=True) as response:
+ with session.get(url, stream=True) as response:
response.raise_for_status()
with zipfile.ZipFile(io.BytesIO(response.content)) as thezip:
for zipinfo in thezip.infolist():
diff --git a/tests/test_eoni.py b/tests/test_eoni.py
index 2fb58a15..50131361 100644
--- a/tests/test_eoni.py
+++ b/tests/test_eoni.py
@@ -54,10 +54,16 @@ def test_2022_constituency_parsing(self):
data = get_results(2022)
self.assertSetEqual(set(data.keys()), constituencies_post_2003)
+ # When not skipped, asserts that get_results(2017) is keyed by exactly the entries of constituencies_post_2003.
+ @unittest.skip(
+ reason="Not currently possible as 2017 results were nuked by EONI https://twitter.com/Bolster/status/1783446858859241775"
+ )
def test_2017_constituency_parsing(self):
data = get_results(2017)
self.assertSetEqual(set(data.keys()), constituencies_post_2003)
+ # When not skipped, asserts that get_results(2016) is keyed by exactly the entries of constituencies_post_2003.
+ @unittest.skip(
+ reason="Not currently possible as 2016 results were nuked by EONI https://twitter.com/Bolster/status/1783446858859241775"
+ )
def test_2016_constituency_parsing(self):
data = get_results(2016)
self.assertSetEqual(set(data.keys()), constituencies_post_2003)