From a1d764d3ef17aa7aada8c7c155675c16ae63faab Mon Sep 17 00:00:00 2001 From: Andrew Bolster Date: Thu, 25 Apr 2024 12:56:21 +0100 Subject: [PATCH] Cineworld (#1151) * Cineworld Listings --- .gitignore | 5 + notebooks/CineworldCinemaListings.ipynb | 357 ++++++++++++++++++++++++ poetry.lock | 21 +- pyproject.toml | 1 + src/bolster/data_sources/cineworld.py | 60 ++++ src/bolster/utils/web.py | 34 ++- tests/test_eoni.py | 6 + 7 files changed, 482 insertions(+), 2 deletions(-) create mode 100644 notebooks/CineworldCinemaListings.ipynb create mode 100644 src/bolster/data_sources/cineworld.py diff --git a/.gitignore b/.gitignore index 761ae535..78f6c206 100644 --- a/.gitignore +++ b/.gitignore @@ -114,3 +114,8 @@ ENV/ # http_cache http_cache.sqlite + +# JupyterNotebooks +.ipynb_checkpoints/ +.virtual_documents/ +.juypter diff --git a/notebooks/CineworldCinemaListings.ipynb b/notebooks/CineworldCinemaListings.ipynb new file mode 100644 index 00000000..2b6132f2 --- /dev/null +++ b/notebooks/CineworldCinemaListings.ipynb @@ -0,0 +1,357 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1282777f-6bb2-4e3b-8756-a2c41ef36cf5", + "metadata": {}, + "source": [ + "# Cineworld Listings\n", + "\n", + "I like Weird movies coming to Imax, I don't like watching newspapers/etc to see Belfast announcing them. \n", + "\n", + "## Basic mode\n", + "\n", + "Yup, turns out cineworld just checks based on user-agent. And overall appears to be even more basic than [what this one looked like](https://github.com/oracal/cineworld)\n", + "\n", + "Easy observations:\n", + "\n", + "* Films are uniquely identified by `id`, which is also persisted in the rest of the api calls (see `link` attribute)\n", + "* `117` is Belfast's site code\n", + "* No clue what the 10108 is." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 92, + "id": "e703ee34-9023-4cda-bd47-fb0a1e37d6a7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'id': 'ho00011281',\n", + " 'name': 'Ghilli (Re-release) (Tamil)',\n", + " 'length': 164,\n", + " 'posterLink': 'https://regalcdn.azureedge.net/CW/GhilliRereleaseTamil/HO00011281/TV_SmallPosterImage/20240415-120139903.jpg',\n", + " 'videoLink': 'https://youtu.be/4aGEjyn-bPQ?si=CgXZSe1WH5Cc4292',\n", + " 'link': 'https://www.cineworld.co.uk/films/ghilli-re-release-tamil/ho00011281',\n", + " 'weight': 10,\n", + " 'releaseYear': '2024',\n", + " 'attributeIds': ['12a',\n", + " '2d',\n", + " 'action',\n", + " 'drama',\n", + " 'reserved-selected',\n", + " 'subbed']}" + ] + }, + "execution_count": 92, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import requests\n", + "from datetime import date\n", + "\n", + "headers = {\n", + " \"User-Agent\":\"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:124.0) Gecko/20100101 Firefox/124.0\",\n", + "}\n", + "\n", + "requests.get(f\"https://www.cineworld.co.uk/uk/data-api-service/v1/quickbook/10108/film-events/in-cinema/117/at-date/{date.today().isoformat()}\", \n", + " headers=headers).json()['body']['films'][0]" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "id": "3ba62447-de46-4d81-bf82-3d68724ec9b8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Ghilli (Re-release) (Tamil)'" + ] + }, + "execution_count": 93, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from datetime import date\n", + "def get_cinema_listing_for(screening_date = None, site_code=117):\n", + " if screening_date is None:\n", + " screening_date = str(date.today())\n", + " elif isinstance(screening_date, date):\n", + " screening_date = str(screening_date)\n", + " else:\n", + " raise ValueError(\"Can only use date-type with screening_date\")\n", + " response = 
requests.get(f\"https://www.cineworld.co.uk/uk/data-api-service/v1/quickbook/10108/film-events/in-cinema/{site_code}/at-date/{screening_date}\", headers=headers)\n", + " response.raise_for_status()\n", + " return response.json()['body']['films']\n", + " \n", + "listings = get_cinema_listing_for()\n", + "listings[0]['name']" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "id": "36cb9b38-bd63-47ba-96e6-8bebd5530071", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['id', 'name', 'length', 'posterLink', 'videoLink', 'link', 'weight', 'releaseYear', 'attributeIds', 'date'])" + ] + }, + "execution_count": 96, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "listings[0].keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "id": "44eabb06-0609-431f-86a1-820f00380c54", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d082902f542a42b2bd60dbaa8c14088f", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/28 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namedate
2Civil War2024-04-15
15Civil War2024-04-16
27Civil War2024-04-17
42Civil War2024-04-18
54(IMAX) Hunger Games: Ballad Of Songbirds & Snakes2024-04-20
55(IMAX) Oppenheimer2024-04-20
56(IMAX) Spider-Man: Across The Spider-Verse2024-04-20
57(IMAX) The Super Mario Bros. Movie2024-04-20
67SPY x FAMILY CODE: White (Subtitled)2024-04-27
70SPY x FAMILY CODE: White (Subtitled)2024-04-28
73The Fall Guy2024-05-02
75The Fall Guy2024-05-03
77The Fall Guy2024-05-04
80The Fall Guy2024-05-05
81The Fall Guy2024-05-06
82Kingdom Of The Planet Of The Apes2024-05-09
84Kingdom Of The Planet Of The Apes2024-05-10
85Kingdom Of The Planet Of The Apes2024-05-11
86Kingdom Of The Planet Of The Apes2024-05-12
\n", + "" + ], + "text/plain": [ + " name date\n", + "2 Civil War 2024-04-15\n", + "15 Civil War 2024-04-16\n", + "27 Civil War 2024-04-17\n", + "42 Civil War 2024-04-18\n", + "54 (IMAX) Hunger Games: Ballad Of Songbirds & Snakes 2024-04-20\n", + "55 (IMAX) Oppenheimer 2024-04-20\n", + "56 (IMAX) Spider-Man: Across The Spider-Verse 2024-04-20\n", + "57 (IMAX) The Super Mario Bros. Movie 2024-04-20\n", + "67 SPY x FAMILY CODE: White (Subtitled) 2024-04-27\n", + "70 SPY x FAMILY CODE: White (Subtitled) 2024-04-28\n", + "73 The Fall Guy 2024-05-02\n", + "75 The Fall Guy 2024-05-03\n", + "77 The Fall Guy 2024-05-04\n", + "80 The Fall Guy 2024-05-05\n", + "81 The Fall Guy 2024-05-06\n", + "82 Kingdom Of The Planet Of The Apes 2024-05-09\n", + "84 Kingdom Of The Planet Of The Apes 2024-05-10\n", + "85 Kingdom Of The Planet Of The Apes 2024-05-11\n", + "86 Kingdom Of The Planet Of The Apes 2024-05-12" + ] + }, + "execution_count": 85, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['attributeIds'].apply(lambda l: 'imax' in l)][['name','date']]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/poetry.lock b/poetry.lock index dd0284ed..127a0162 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3371,6 +3371,25 @@ files = [ [package.extras] watchmedo = ["PyYAML (>=3.10)"] +[[package]] +name = "waybackpy" +version = "3.0.6" +description = "Python package that interfaces with the Internet Archive's Wayback Machine APIs. Archive pages and retrieve archived pages easily." 
+optional = false +python-versions = ">=3.6" +files = [ + {file = "waybackpy-3.0.6-py3-none-any.whl", hash = "sha256:c568b0db9056fbe42a1a7e56b4f1d1919bd3f76bd62da58d9ee2e577297be284"}, + {file = "waybackpy-3.0.6.tar.gz", hash = "sha256:497a371756aba7644eb7ada0ebd4edb15cb8c53bc134cc973bf023a12caff83f"}, +] + +[package.dependencies] +click = "*" +requests = "*" +urllib3 = "*" + +[package.extras] +dev = ["black", "codecov", "flake8", "mypy", "pytest", "pytest-cov", "setuptools (>=46.4.0)", "types-requests"] + [[package]] name = "wcwidth" version = "0.2.13" @@ -3444,4 +3463,4 @@ docs = ["Sphinx", "autoapi", "nbsphinx", "sphinx-autoapi", "sphinx-autodoc-typeh [metadata] lock-version = "2.0" python-versions = ">=3.8.0,<3.12" -content-hash = "07f2ace43e0593fa956a60823a9de181325222d4195b897b320fed252de2491a" +content-hash = "255b1c5841ae87d2a68e51ab50ff7bb69f48643e3ff57a7d787f12a943fc7c54" diff --git a/pyproject.toml b/pyproject.toml index dbaf8a16..757a7f96 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,6 +49,7 @@ lxml = "^5.0.0" # urllib3 2.0 is not compatible with poetry-export-plugin # https://github.com/python-poetry/poetry-plugin-export/issues/239 urllib3 = ">=1.26,<2" +waybackpy = "^3.0.6" [tool.poetry.scripts] bolster = "bolster.cli:main" diff --git a/src/bolster/data_sources/cineworld.py b/src/bolster/data_sources/cineworld.py new file mode 100644 index 00000000..de0ee9c7 --- /dev/null +++ b/src/bolster/data_sources/cineworld.py @@ -0,0 +1,60 @@ +""" +This module provides functions to retrieve cinema listings from the Cineworld API. + +The main function in this module is `get_cinema_listings`, which takes a site code and a screening date as input and returns a dictionary containing the cinema listings for that date. + +Site Code 117 maps to Belfast, you're on your own for the rest. 
+ +Example usage: + cinema_listings = get_cinema_listings(117) + list(cinema_listings[0].keys()) + # Output: ['id', 'name', 'length', 'posterLink', 'videoLink', 'link', 'weight', 'releaseYear', 'attributeIds', 'date', 'site_code'] + +""" +from datetime import date + +from ..utils.web import session + + +def get_cinema_listings(site_code: int = 117, screening_date: date = date.today()): + """ + Get cinema listings from the Cineworld API. + + Args: + site_code (int): The site code of the cinema. Defaults to 117; Belfast + screening_date (date): The date for which to retrieve the listings. Defaults to today's date. + + Returns: + dict: A dictionary containing the cinema listings. + + Raises: + requests.exceptions.RequestException: If there was an error making the API request. + + >>> cinema_listings = get_cinema_listings(117) + >>> list(cinema_listings[0].keys()) + ['id', 'name', 'length', 'posterLink', 'videoLink', 'link', 'weight', 'releaseYear', 'attributeIds', 'date', 'site_code'] + + """ + if screening_date is None: + screening_date = date.today() + + if not isinstance(screening_date, date): + try: + screening_date = date.fromisoformat(screening_date) + except ValueError as e: + raise ValueError( + "screening_date must be a date object or a string in the format 'YYYY-MM-DD'" + ) from e + + url = f"https://www.cineworld.co.uk/uk/data-api-service/v1/quickbook/10108/film-events/in-cinema/{site_code}/at-date/{screening_date}" + + try: + response = session.get(url) + response.raise_for_status() + listings = response.json()["body"]["films"] + for list in listings: + list["date"] = screening_date + list["site_code"] = site_code + return listings + except requests.exceptions.RequestException as e: + raise e diff --git a/src/bolster/utils/web.py b/src/bolster/utils/web.py index c0d75d84..769a8a85 100644 --- a/src/bolster/utils/web.py +++ b/src/bolster/utils/web.py @@ -1,14 +1,46 @@ import io +import logging import zipfile from io import BytesIO import pandas as pd import 
requests +from waybackpy import exceptions +from waybackpy import WaybackMachineCDXServerAPI from . import version_no ua = f"@Bolster/{version_no} (+http://bolster.online/)" +session = requests.Session() +session.headers.update({"User-Agent": ua}) + + +def get_last_valid(url): + return WaybackMachineCDXServerAPI(url).oldest().archive_url + + +def resilient_get(url, **kwargs): + """ + Attempt a get, but if it fails, try using the wayback machine to get the last valid version and get that. + If all else fails, raise a HTTPError from the inner "NoCDXRecordFound" exception + """ + + try: + res = requests.get(url, **kwargs) + res.raise_for_status() + except requests.HTTPError as outer_err: + try: + last_valid = get_last_valid(url) + except exceptions.NoCDXRecordFound as inner_err: + raise outer_err from inner_err + res = requests.get(last_valid, **kwargs) + res.raise_for_status() + logging.warning( + f"Failed to get {url} directly, successfully used waybackmachine to get {last_valid}" + ) + return res + def get_excel_dataframe(file_url, requests_kwargs=None, read_kwargs=None): if requests_kwargs is None: @@ -28,7 +60,7 @@ def download_extract_zip(url): Download a ZIP file and extract its contents in memory yields (filename, file-like object) pairs """ - with requests.get(url, stream=True) as response: + with session.get(url, stream=True) as response: response.raise_for_status() with zipfile.ZipFile(io.BytesIO(response.content)) as thezip: for zipinfo in thezip.infolist(): diff --git a/tests/test_eoni.py b/tests/test_eoni.py index 2fb58a15..50131361 100644 --- a/tests/test_eoni.py +++ b/tests/test_eoni.py @@ -54,10 +54,16 @@ def test_2022_constituency_parsing(self): data = get_results(2022) self.assertSetEqual(set(data.keys()), constituencies_post_2003) + @unittest.skip( + reason="Not currently possible as 2017 results were nuked by EONI https://twitter.com/Bolster/status/1783446858859241775" + ) def test_2017_constituency_parsing(self): data = get_results(2017) 
self.assertSetEqual(set(data.keys()), constituencies_post_2003) + @unittest.skip( + reason="Not currently possible as 2016 results were nuked by EONI https://twitter.com/Bolster/status/1783446858859241775" + ) def test_2016_constituency_parsing(self): data = get_results(2016) self.assertSetEqual(set(data.keys()), constituencies_post_2003)