From e4d96c125b572c45336c862ce3c0c8f392453a09 Mon Sep 17 00:00:00 2001 From: Justin Zhang Date: Thu, 7 Mar 2024 00:12:25 -0500 Subject: [PATCH] Penn Events Script Updates (#250) * Add self to penn events script * Add webdriver and use firefox for debugging * fix some date parsing + refactor * rate limit * Lint * uwsgi fix --------- Co-authored-by: vcai122 Co-authored-by: ashleyzhang01 --- backend/Pipfile | 3 +- backend/Pipfile.lock | 52 ++++++++--- .../commands/get_penn_today_events.py | 87 +++++++++++-------- 3 files changed, 92 insertions(+), 50 deletions(-) diff --git a/backend/Pipfile b/backend/Pipfile index 6f3e22ac..37d46dd3 100644 --- a/backend/Pipfile +++ b/backend/Pipfile @@ -28,7 +28,7 @@ django = "==5.0.2" django-cors-headers = "*" pyyaml = "*" uritemplate = "*" -uwsgi = {version = "*", markers = "sys_platform== 'linux'"} +uwsgi = "*" django-filter = "*" django-labs-accounts = "==0.9.5" django-debug-toolbar = "*" @@ -45,6 +45,7 @@ django-redis = "*" redis = "*" python-dateutil = "*" selenium = "*" +webdriver-manager = "*" [requires] python_version = "3.11" diff --git a/backend/Pipfile.lock b/backend/Pipfile.lock index db745546..255d8290 100644 --- a/backend/Pipfile.lock +++ b/backend/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "17bd51c63ebeaf563ad6088a61a78499729fd6b82e7e083af0effb9b6303b5da" + "sha256": "00fcc1f1633885e1bfb66d00ecfde1bcd4398813869828461c35ed3b4706487b" }, "pipfile-spec": 6, "requires": { @@ -74,20 +74,20 @@ }, "boto3": { "hashes": [ - "sha256:8b3f5cc7fbedcbb22271c328039df8a6ab343001e746e0cdb24774c426cadcf8", - "sha256:f201b6a416f809283d554c652211eecec9fe3a52ed4063dab3f3e7aea7571d9c" + "sha256:c26c31ceeeb2bc5d2bb96ba0fdc9a04d7b10e6e0b081c55b9cea9069a0be04dd", + "sha256:f8046e3e2d1186a49b49f7464c4811c265c86001f404dd1a96c4365c773a4245" ], "index": "pypi", "markers": "python_version >= '3.8'", - "version": "==1.34.54" + "version": "==1.34.57" }, "botocore": { "hashes": [ - "sha256:4061ff4be3efcf53547ebadf2c94d419dfc8be7beec24e9fa1819599ffd936fa", - "sha256:bf215d93e9d5544c593962780d194e74c6ee40b883d0b885e62ef35fc0ec01e5" + "sha256:9a5aa2034de9f0c367b4b61a92af0fa827f5c21affa19e0a284838a142e71083", + "sha256:c8dafe0ad378a88bcf4153e6972870b03fb5aab406b694202307500709940baf" ], "markers": "python_version >= '3.8'", - "version": "==1.34.54" + "version": "==1.34.57" }, "celery": { "hashes": [ @@ -493,10 +493,11 @@ }, "jwcrypto": { "hashes": [ - "sha256:0815fbab613db99bad85691da5f136f8860423396667728a264bcfa6e1db36b0" + "sha256:150d2b0ebbdb8f40b77f543fb44ffd2baeff48788be71f67f03566692fd55789", + "sha256:771a87762a0c081ae6166958a954f80848820b2ab066937dc8b8379d65b1b039" ], "markers": "python_version >= '3.8'", - "version": "==1.5.4" + "version": "==1.5.6" }, "kombu": { "hashes": [ @@ -565,6 +566,14 @@ "markers": "python_version >= '3.7'", "version": "==1.3.0.post0" }, + "packaging": { + "hashes": [ + "sha256:048fb0e9405036518eaaf48a55953c750c11e1a1b68e0dd1a9d62ed0c092cfc5", + "sha256:8c491190033a9af7e1d931d0b5dacc2ef47509b34dd0de67ed209b5203fc88c7" + ], + "markers": "python_version >= '3.7'", + "version": "==23.2" + }, "pandas": { "hashes": [ "sha256:04f6ec3baec203c13e3f8b139fb0f9f86cd8c0b94603ae3ae8ce9a422e9f5bee", @@ -739,9 +748,17 @@ "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427" ], "index": "pypi", - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'", "version": "==2.9.0.post0" }, + "python-dotenv": { + "hashes": [ + "sha256:e324ee90a023d808f1959c46bcbc04446a10ced277783dc6ee09987c37ec10ca", + "sha256:f7b63ef50f1b690dddf550d03497b66d609393b40b564ed0d674909a68ebf16a" + ], + "markers": "python_version >= '3.8'", + "version": "==1.0.1" + }, "pytz": { "hashes": [ "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812", @@ -862,7 +879,7 @@ "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926", "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'", "version": "==1.16.0" }, "sniffio": { @@ -949,7 +966,7 @@ "hashes": [ "sha256:77b6dd5cd633f4ae87ee393f7701f617736815499407376e78f3d16467523afe" ], - "markers": "sys_platform == 'linux'", + "index": "pypi", "version": "==2.0.24" }, "vine": { @@ -967,6 +984,15 @@ ], "version": "==0.2.13" }, + "webdriver-manager": { + "hashes": [ + "sha256:25ec177c6a2ce9c02fb8046f1b2732701a9418d6a977967bb065d840a3175d87", + "sha256:d7970052295bb9cda2c1a24cf0b872dd2c41ababcc78f7b6b8dc37a41e979a7e" + ], + "index": "pypi", + "markers": "python_version >= '3.7'", + "version": "==4.0.1" + }, "webencodings": { "hashes": [ "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78", @@ -1440,7 +1466,7 @@ "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b", "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f" ], - "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2'", "version": "==0.10.2" }, "typed-ast": { diff --git a/backend/penndata/management/commands/get_penn_today_events.py b/backend/penndata/management/commands/get_penn_today_events.py index fe774ad4..b144b452 100644 --- a/backend/penndata/management/commands/get_penn_today_events.py +++ b/backend/penndata/management/commands/get_penn_today_events.py @@ -6,8 +6,11 @@ from django.utils import timezone from selenium import webdriver from selenium.webdriver.common.by import By +from selenium.webdriver.firefox.options import Options +from selenium.webdriver.firefox.service import Service as FirefoxService from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.ui import WebDriverWait +from webdriver_manager.firefox import GeckoDriverManager from penndata.models import Event @@ -26,20 +29,12 @@ def handle(self, *args, **kwargs): # past_events.delete() # Scrapes Penn Today - try: - driver = webdriver.Chrome() - - driver.get(PENN_TODAY_WEBSITE) - events_list = WebDriverWait(driver, 10).until( - EC.presence_of_element_located((By.ID, "events-list")) + if not ( + soup := self.connect_and_parse_html( + PENN_TODAY_WEBSITE, EC.presence_of_element_located((By.ID, "events-list")) ) - - html_content = events_list.get_attribute("innerHTML") - driver.quit() - except ConnectionError: - return None - - soup = BeautifulSoup(html_content, "html.parser") + ): + return event_articles = soup.find_all("article", class_="tease") @@ -73,12 +68,16 @@ def handle(self, *args, **kwargs): if start_date.month < current_month: # If scraped month is before current month, increment year start_date = start_date.replace(year=current_year + 1) - if start_time_str == ALL_DAY: + print(start_date_str) + if ALL_DAY in start_time_str.lower(): start_time = datetime.time(0, 0) else: start_time = datetime.datetime.strptime(start_time_str, "%I:%M%p").time() start_date = datetime.datetime.combine(start_date, start_time) + if start_date > now + datetime.timedelta(days=31): + continue + event_url = urljoin(PENN_TODAY_WEBSITE, article.find("a", class_="tease__link")["href"]) end_time = self.get_end_time(event_url) @@ -95,19 +94,20 @@ def handle(self, *args, **kwargs): end_of_day = datetime.time(23, 59, 59) if end_date_elem: # end date but no end time end_date_str = end_date_elem.text.strip().split(" ")[-1] - end_date = datetime.combine( + end_date = datetime.datetime.combine( datetime.datetime.strptime(end_date_str, "%m/%d/%Y"), end_of_day ) + else: # no end date or end time - end_date = datetime.combine(start_date, end_of_day) + end_date = datetime.datetime.combine(start_date, end_of_day) Event.objects.update_or_create( name=name, defaults={ - "event_type": "", + "event_type": "Penn Today", "image_url": "", - "start": start_date, - "end": end_date, + "start": timezone.make_aware(start_date), + "end": timezone.make_aware(end_date), "location": location, "website": event_url, "description": description, @@ -115,27 +115,42 @@ def handle(self, *args, **kwargs): }, ) - self.stdout.write("Uploaded Events!") + self.stdout.write("Uploaded Penn Today Events!") + + def connect_and_parse_html(self, event_url, condition): + try: + options = Options() + options.add_argument("--headless") + driver = webdriver.Firefox( + service=FirefoxService(GeckoDriverManager().install()), options=options + ) + + driver.get(event_url) + print("WAITING FOR ELEMENT") + element = WebDriverWait(driver, 10).until(condition) + print("ELEMENT FOUND") + + html_content = element.get_attribute("innerHTML") + driver.quit() + return BeautifulSoup(html_content, "html.parser") + except ConnectionError: + print("Connection Error to webdriver") + return None - def get_end_time(event_url): - driver = webdriver.Chrome() - driver.get(event_url) - event_element = WebDriverWait(driver, 10).until( - EC.presence_of_element_located((By.CLASS_NAME, "event__topper-content")) + def get_end_time(self, event_url): + end_time_soup = self.connect_and_parse_html( + event_url, EC.presence_of_element_located((By.CLASS_NAME, "event__topper-content")) ) - end_time_soup = BeautifulSoup(event_element.get_attribute("innerHTML"), "html.parser") end_time_range_str = ( end_time_soup.find("p", class_="event__meta event__time").text.strip().replace(".", "") ) - print(end_time_range_str) - if not end_time_range_str or ALL_DAY in end_time_range_str.lower(): - driver.quit() + + if ( + not end_time_range_str + or ALL_DAY in end_time_range_str.lower() + or len(times := end_time_range_str.split(" - ")) <= 1 + ): return None # No end time if the event is all day - times = end_time_range_str.split(" - ") - if len(times) <= 1: - driver.quit() - return None - end_time_str = times[1] - driver.quit() - return end_time_str + + return times[1]