From a1d764d3ef17aa7aada8c7c155675c16ae63faab Mon Sep 17 00:00:00 2001
From: Andrew Bolster <andrew.bolster@gmail.com>
Date: Thu, 25 Apr 2024 12:56:21 +0100
Subject: [PATCH] Cineworld (#1151)

* Cineworld Listings
---
 .gitignore                              |   5 +
 notebooks/CineworldCinemaListings.ipynb | 357 ++++++++++++++++++++++++
 poetry.lock                             |  21 +-
 pyproject.toml                          |   1 +
 src/bolster/data_sources/cineworld.py   |  60 ++++
 src/bolster/utils/web.py                |  34 ++-
 tests/test_eoni.py                      |   6 +
 7 files changed, 482 insertions(+), 2 deletions(-)
 create mode 100644 notebooks/CineworldCinemaListings.ipynb
 create mode 100644 src/bolster/data_sources/cineworld.py

diff --git a/.gitignore b/.gitignore
index 761ae535..78f6c206 100644
--- a/.gitignore
+++ b/.gitignore
@@ -114,3 +114,8 @@ ENV/
 
 # http_cache
 http_cache.sqlite
+
+# JupyterNotebooks
+.ipynb_checkpoints/
+.virtual_documents/
+.juypter
diff --git a/notebooks/CineworldCinemaListings.ipynb b/notebooks/CineworldCinemaListings.ipynb
new file mode 100644
index 00000000..2b6132f2
--- /dev/null
+++ b/notebooks/CineworldCinemaListings.ipynb
@@ -0,0 +1,357 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "1282777f-6bb2-4e3b-8756-a2c41ef36cf5",
+   "metadata": {},
+   "source": [
+    "# Cineworld Listings\n",
+    "\n",
+    "I like Weird movies coming to Imax, I don't like watching newspapers/etc to see Belfast announcing them. \n",
+    "\n",
+    "## Basic mode\n",
+    "\n",
+    "Yup, turns out cineworld just checks based on user-agent. And overall appears to be even more basic that [what this one looked like](https://github.com/oracal/cineworld)\n",
+    "\n",
+    "Easy observations:\n",
+    "\n",
+    "* Films are uniquely identified by `id`, which is also persisted in teh rest of the api calls (see `link` attribute)\n",
+    "* `117` is Belfasts Site code\n",
+    "* No clue what the 10108 is."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 92,
+   "id": "e703ee34-9023-4cda-bd47-fb0a1e37d6a7",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'id': 'ho00011281',\n",
+       " 'name': 'Ghilli (Re-release) (Tamil)',\n",
+       " 'length': 164,\n",
+       " 'posterLink': 'https://regalcdn.azureedge.net/CW/GhilliRereleaseTamil/HO00011281/TV_SmallPosterImage/20240415-120139903.jpg',\n",
+       " 'videoLink': 'https://youtu.be/4aGEjyn-bPQ?si=CgXZSe1WH5Cc4292',\n",
+       " 'link': 'https://www.cineworld.co.uk/films/ghilli-re-release-tamil/ho00011281',\n",
+       " 'weight': 10,\n",
+       " 'releaseYear': '2024',\n",
+       " 'attributeIds': ['12a',\n",
+       "  '2d',\n",
+       "  'action',\n",
+       "  'drama',\n",
+       "  'reserved-selected',\n",
+       "  'subbed']}"
+      ]
+     },
+     "execution_count": 92,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import requests\n",
+    "from datetime import date\n",
+    "\n",
+    "headers = {\n",
+    "    \"User-Agent\":\"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:124.0) Gecko/20100101 Firefox/124.0\",\n",
+    "}\n",
+    "\n",
+    "requests.get(f\"https://www.cineworld.co.uk/uk/data-api-service/v1/quickbook/10108/film-events/in-cinema/117/at-date/{date.today().isoformat()}\", \n",
+    "             headers=headers).json()['body']['films'][0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 93,
+   "id": "3ba62447-de46-4d81-bf82-3d68724ec9b8",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'Ghilli (Re-release) (Tamil)'"
+      ]
+     },
+     "execution_count": 93,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from datetime import date\n",
+    "def get_cinema_listing_for(screening_date = None, site_code=117):\n",
+    "    if screening_date is None:\n",
+    "        screening_date = str(date.today())\n",
+    "    elif isinstance(screening_date, date):\n",
+    "        screening_date = str(screening_date)\n",
+    "    else:\n",
+    "        raise ValueError(\"Can only use date-type with screening_date\")\n",
+    "    response = requests.get(f\"https://www.cineworld.co.uk/uk/data-api-service/v1/quickbook/10108/film-events/in-cinema/{site_code}/at-date/{screening_date}\", headers=headers)\n",
+    "    response.raise_for_status()\n",
+    "    return response.json()['body']['films']\n",
+    "    \n",
+    "listings = get_cinema_listing_for()\n",
+    "listings[0]['name']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 96,
+   "id": "36cb9b38-bd63-47ba-96e6-8bebd5530071",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "dict_keys(['id', 'name', 'length', 'posterLink', 'videoLink', 'link', 'weight', 'releaseYear', 'attributeIds', 'date'])"
+      ]
+     },
+     "execution_count": 96,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "listings[0].keys()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 94,
+   "id": "44eabb06-0609-431f-86a1-820f00380c54",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d082902f542a42b2bd60dbaa8c14088f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/28 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "from tqdm.auto import tqdm\n",
+    "\n",
+    "listings = []\n",
+    "\n",
+    "for d in tqdm(pd.date_range('today', freq='D', periods=28)):\n",
+    "    _d = d.date()\n",
+    "    for listing in get_cinema_listing_for(_d):\n",
+    "        listing['date'] = _d\n",
+    "        listings.append(listing)        "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 84,
+   "id": "639a199b-eed6-42a8-9783-a98d8256e1e4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.DataFrame(listings)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "638408b7-0a70-4e10-b523-facbc69f38ce",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 85,
+   "id": "d8b47fa9-80a6-426a-b3fc-2552be84a35f",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>name</th>\n",
+       "      <th>date</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Civil War</td>\n",
+       "      <td>2024-04-15</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>15</th>\n",
+       "      <td>Civil War</td>\n",
+       "      <td>2024-04-16</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>27</th>\n",
+       "      <td>Civil War</td>\n",
+       "      <td>2024-04-17</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>42</th>\n",
+       "      <td>Civil War</td>\n",
+       "      <td>2024-04-18</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>54</th>\n",
+       "      <td>(IMAX) Hunger Games: Ballad Of Songbirds &amp; Snakes</td>\n",
+       "      <td>2024-04-20</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>55</th>\n",
+       "      <td>(IMAX) Oppenheimer</td>\n",
+       "      <td>2024-04-20</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>56</th>\n",
+       "      <td>(IMAX) Spider-Man: Across The Spider-Verse</td>\n",
+       "      <td>2024-04-20</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>57</th>\n",
+       "      <td>(IMAX) The Super Mario Bros. Movie</td>\n",
+       "      <td>2024-04-20</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>67</th>\n",
+       "      <td>SPY x FAMILY CODE: White (Subtitled)</td>\n",
+       "      <td>2024-04-27</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>70</th>\n",
+       "      <td>SPY x FAMILY CODE: White (Subtitled)</td>\n",
+       "      <td>2024-04-28</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>73</th>\n",
+       "      <td>The Fall Guy</td>\n",
+       "      <td>2024-05-02</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>75</th>\n",
+       "      <td>The Fall Guy</td>\n",
+       "      <td>2024-05-03</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>77</th>\n",
+       "      <td>The Fall Guy</td>\n",
+       "      <td>2024-05-04</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>80</th>\n",
+       "      <td>The Fall Guy</td>\n",
+       "      <td>2024-05-05</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>81</th>\n",
+       "      <td>The Fall Guy</td>\n",
+       "      <td>2024-05-06</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>82</th>\n",
+       "      <td>Kingdom Of The Planet Of The Apes</td>\n",
+       "      <td>2024-05-09</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>84</th>\n",
+       "      <td>Kingdom Of The Planet Of The Apes</td>\n",
+       "      <td>2024-05-10</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>85</th>\n",
+       "      <td>Kingdom Of The Planet Of The Apes</td>\n",
+       "      <td>2024-05-11</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>86</th>\n",
+       "      <td>Kingdom Of The Planet Of The Apes</td>\n",
+       "      <td>2024-05-12</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                 name        date\n",
+       "2                                           Civil War  2024-04-15\n",
+       "15                                          Civil War  2024-04-16\n",
+       "27                                          Civil War  2024-04-17\n",
+       "42                                          Civil War  2024-04-18\n",
+       "54  (IMAX) Hunger Games: Ballad Of Songbirds & Snakes  2024-04-20\n",
+       "55                                 (IMAX) Oppenheimer  2024-04-20\n",
+       "56         (IMAX) Spider-Man: Across The Spider-Verse  2024-04-20\n",
+       "57                 (IMAX) The Super Mario Bros. Movie  2024-04-20\n",
+       "67               SPY x FAMILY CODE: White (Subtitled)  2024-04-27\n",
+       "70               SPY x FAMILY CODE: White (Subtitled)  2024-04-28\n",
+       "73                                       The Fall Guy  2024-05-02\n",
+       "75                                       The Fall Guy  2024-05-03\n",
+       "77                                       The Fall Guy  2024-05-04\n",
+       "80                                       The Fall Guy  2024-05-05\n",
+       "81                                       The Fall Guy  2024-05-06\n",
+       "82                  Kingdom Of The Planet Of The Apes  2024-05-09\n",
+       "84                  Kingdom Of The Planet Of The Apes  2024-05-10\n",
+       "85                  Kingdom Of The Planet Of The Apes  2024-05-11\n",
+       "86                  Kingdom Of The Planet Of The Apes  2024-05-12"
+      ]
+     },
+     "execution_count": 85,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df[df['attributeIds'].apply(lambda l: 'imax' in l)][['name','date']]"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/poetry.lock b/poetry.lock
index dd0284ed..127a0162 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -3371,6 +3371,25 @@ files = [
 [package.extras]
 watchmedo = ["PyYAML (>=3.10)"]
 
+[[package]]
+name = "waybackpy"
+version = "3.0.6"
+description = "Python package that interfaces with the Internet Archive's Wayback Machine APIs. Archive pages and retrieve archived pages easily."
+optional = false
+python-versions = ">=3.6"
+files = [
+    {file = "waybackpy-3.0.6-py3-none-any.whl", hash = "sha256:c568b0db9056fbe42a1a7e56b4f1d1919bd3f76bd62da58d9ee2e577297be284"},
+    {file = "waybackpy-3.0.6.tar.gz", hash = "sha256:497a371756aba7644eb7ada0ebd4edb15cb8c53bc134cc973bf023a12caff83f"},
+]
+
+[package.dependencies]
+click = "*"
+requests = "*"
+urllib3 = "*"
+
+[package.extras]
+dev = ["black", "codecov", "flake8", "mypy", "pytest", "pytest-cov", "setuptools (>=46.4.0)", "types-requests"]
+
 [[package]]
 name = "wcwidth"
 version = "0.2.13"
@@ -3444,4 +3463,4 @@ docs = ["Sphinx", "autoapi", "nbsphinx", "sphinx-autoapi", "sphinx-autodoc-typeh
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.8.0,<3.12"
-content-hash = "07f2ace43e0593fa956a60823a9de181325222d4195b897b320fed252de2491a"
+content-hash = "255b1c5841ae87d2a68e51ab50ff7bb69f48643e3ff57a7d787f12a943fc7c54"
diff --git a/pyproject.toml b/pyproject.toml
index dbaf8a16..757a7f96 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -49,6 +49,7 @@ lxml = "^5.0.0"
 # urllib3 2.0 is not compatible with poetry-export-plugin
 # https://github.com/python-poetry/poetry-plugin-export/issues/239
 urllib3 = ">=1.26,<2"
+waybackpy = "^3.0.6"
 
 [tool.poetry.scripts]
 bolster = "bolster.cli:main"
diff --git a/src/bolster/data_sources/cineworld.py b/src/bolster/data_sources/cineworld.py
new file mode 100644
index 00000000..de0ee9c7
--- /dev/null
+++ b/src/bolster/data_sources/cineworld.py
@@ -0,0 +1,60 @@
+"""
+This module provides functions to retrieve cinema listings from the Cineworld API.
+
+The main function in this module is `get_cinema_listings`, which takes a site code and a screening date as input and returns a dictionary containing the cinema listings for that date.
+
+Site Code 117 maps to Belfast, you're on your own for the rest.
+
+Example usage:
+    cinema_listings = get_cinema_listings(117)
+    list(cinema_listings[0].keys())
+    # Output: ['id', 'name', 'length', 'posterLink', 'videoLink', 'link', 'weight', 'releaseYear', 'attributeIds', 'date', 'site_code']
+
+"""
+from datetime import date
+
+from ..utils.web import session
+
+
+def get_cinema_listings(site_code: int = 117, screening_date: date = date.today()):
+    """
+    Get cinema listings from the Cineworld API.
+
+    Args:
+        site_code (int): The site code of the cinema. Defaults to 117; Belfast
+        screening_date (date): The date for which to retrieve the listings. Defaults to today's date.
+
+    Returns:
+        dict: A dictionary containing the cinema listings.
+
+    Raises:
+        requests.exceptions.RequestException: If there was an error making the API request.
+
+    >>> cinema_listings = get_cinema_listings(117)
+    >>> list(cinema_listings[0].keys())
+    ['id', 'name', 'length', 'posterLink', 'videoLink', 'link', 'weight', 'releaseYear', 'attributeIds', 'date', 'site_code']
+
+    """
+    if screening_date is None:
+        screening_date = date.today()
+
+    if not isinstance(screening_date, date):
+        try:
+            screening_date = date.fromisoformat(screening_date)
+        except ValueError as e:
+            raise ValueError(
+                "screening_date must be a date object or a string in the format 'YYYY-MM-DD'"
+            ) from e
+
+    url = f"https://www.cineworld.co.uk/uk/data-api-service/v1/quickbook/10108/film-events/in-cinema/{site_code}/at-date/{screening_date}"
+
+    try:
+        response = session.get(url)
+        response.raise_for_status()
+        listings = response.json()["body"]["films"]
+        for list in listings:
+            list["date"] = screening_date
+            list["site_code"] = site_code
+        return listings
+    except requests.exceptions.RequestException as e:
+        raise e
diff --git a/src/bolster/utils/web.py b/src/bolster/utils/web.py
index c0d75d84..769a8a85 100644
--- a/src/bolster/utils/web.py
+++ b/src/bolster/utils/web.py
@@ -1,14 +1,46 @@
 import io
+import logging
 import zipfile
 from io import BytesIO
 
 import pandas as pd
 import requests
+from waybackpy import exceptions
+from waybackpy import WaybackMachineCDXServerAPI
 
 from . import version_no
 
 ua = f"@Bolster/{version_no} (+http://bolster.online/)"
 
+session = requests.Session()
+session.headers.update({"User-Agent": ua})
+
+
+def get_last_valid(url):
+    return WaybackMachineCDXServerAPI(url).oldest().archive_url
+
+
+def resilient_get(url, **kwargs):
+    """
+    Attempt a get, but if it fails, try using the wayback machine to get the last valid version and get that.
+    If all else fails, raise a HTTPError from the inner "NoCDXRecordFound" exception
+    """
+
+    try:
+        res = requests.get(url, **kwargs)
+        res.raise_for_status()
+    except requests.HTTPError as outer_err:
+        try:
+            last_valid = get_last_valid(url)
+        except exceptions.NoCDXRecordFound as inner_err:
+            raise outer_err from inner_err
+        res = requests.get(last_valid, **kwargs)
+        res.raise_for_status()
+        logging.warning(
+            f"Failed to get {url} directly, successfully used waybackmachine to get {last_valid}"
+        )
+    return res
+
 
 def get_excel_dataframe(file_url, requests_kwargs=None, read_kwargs=None):
     if requests_kwargs is None:
@@ -28,7 +60,7 @@ def download_extract_zip(url):
     Download a ZIP file and extract its contents in memory
     yields (filename, file-like object) pairs
     """
-    with requests.get(url, stream=True) as response:
+    with session.get(url, stream=True) as response:
         response.raise_for_status()
         with zipfile.ZipFile(io.BytesIO(response.content)) as thezip:
             for zipinfo in thezip.infolist():
diff --git a/tests/test_eoni.py b/tests/test_eoni.py
index 2fb58a15..50131361 100644
--- a/tests/test_eoni.py
+++ b/tests/test_eoni.py
@@ -54,10 +54,16 @@ def test_2022_constituency_parsing(self):
         data = get_results(2022)
         self.assertSetEqual(set(data.keys()), constituencies_post_2003)
 
+    @unittest.skip(
+        reason="Not currently possible as 2017 results were nuked by EONI https://twitter.com/Bolster/status/1783446858859241775"
+    )
     def test_2017_constituency_parsing(self):
         data = get_results(2017)
         self.assertSetEqual(set(data.keys()), constituencies_post_2003)
 
+    @unittest.skip(
+        reason="Not currently possible as 2016 results were nuked by EONI https://twitter.com/Bolster/status/1783446858859241775"
+    )
     def test_2016_constituency_parsing(self):
         data = get_results(2016)
         self.assertSetEqual(set(data.keys()), constituencies_post_2003)