Skip to content

Commit

Permalink
Penn Events Script Updates (#250)
Browse files Browse the repository at this point in the history
* Add self to penn events script

* Add webdriver and use firefox for debugging

* Fix some date parsing + refactor

* Rate limit

* Lint

* uWSGI fix

---------

Co-authored-by: vcai122 <[email protected]>
Co-authored-by: ashleyzhang01 <[email protected]>
  • Loading branch information
3 people authored Mar 7, 2024
1 parent 693f353 commit e4d96c1
Show file tree
Hide file tree
Showing 3 changed files with 92 additions and 50 deletions.
3 changes: 2 additions & 1 deletion backend/Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ django = "==5.0.2"
django-cors-headers = "*"
pyyaml = "*"
uritemplate = "*"
uwsgi = {version = "*", markers = "sys_platform== 'linux'"}
uwsgi = "*"
django-filter = "*"
django-labs-accounts = "==0.9.5"
django-debug-toolbar = "*"
Expand All @@ -45,6 +45,7 @@ django-redis = "*"
redis = "*"
python-dateutil = "*"
selenium = "*"
webdriver-manager = "*"

[requires]
python_version = "3.11"
52 changes: 39 additions & 13 deletions backend/Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

87 changes: 51 additions & 36 deletions backend/penndata/management/commands/get_penn_today_events.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,11 @@
from django.utils import timezone
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service as FirefoxService
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.firefox import GeckoDriverManager

from penndata.models import Event

Expand All @@ -26,20 +29,12 @@ def handle(self, *args, **kwargs):
# past_events.delete()

# Scrapes Penn Today
try:
driver = webdriver.Chrome()

driver.get(PENN_TODAY_WEBSITE)
events_list = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.ID, "events-list"))
if not (
soup := self.connect_and_parse_html(
PENN_TODAY_WEBSITE, EC.presence_of_element_located((By.ID, "events-list"))
)

html_content = events_list.get_attribute("innerHTML")
driver.quit()
except ConnectionError:
return None

soup = BeautifulSoup(html_content, "html.parser")
):
return

event_articles = soup.find_all("article", class_="tease")

Expand Down Expand Up @@ -73,12 +68,16 @@ def handle(self, *args, **kwargs):
if start_date.month < current_month:
# If scraped month is before current month, increment year
start_date = start_date.replace(year=current_year + 1)
if start_time_str == ALL_DAY:
print(start_date_str)
if ALL_DAY in start_time_str.lower():
start_time = datetime.time(0, 0)
else:
start_time = datetime.datetime.strptime(start_time_str, "%I:%M%p").time()
start_date = datetime.datetime.combine(start_date, start_time)

if start_date > now + datetime.timedelta(days=31):
continue

event_url = urljoin(PENN_TODAY_WEBSITE, article.find("a", class_="tease__link")["href"])

end_time = self.get_end_time(event_url)
Expand All @@ -95,47 +94,63 @@ def handle(self, *args, **kwargs):
end_of_day = datetime.time(23, 59, 59)
if end_date_elem: # end date but no end time
end_date_str = end_date_elem.text.strip().split(" ")[-1]
end_date = datetime.combine(
end_date = datetime.datetime.combine(
datetime.datetime.strptime(end_date_str, "%m/%d/%Y"), end_of_day
)

else: # no end date or end time
end_date = datetime.combine(start_date, end_of_day)
end_date = datetime.datetime.combine(start_date, end_of_day)

Event.objects.update_or_create(
name=name,
defaults={
"event_type": "",
"event_type": "Penn Today",
"image_url": "",
"start": start_date,
"end": end_date,
"start": timezone.make_aware(start_date),
"end": timezone.make_aware(end_date),
"location": location,
"website": event_url,
"description": description,
"email": "",
},
)

self.stdout.write("Uploaded Events!")
self.stdout.write("Uploaded Penn Today Events!")

def connect_and_parse_html(self, event_url, condition):
    """Fetch *event_url* with headless Firefox, wait for *condition*, and
    return the matched element's inner HTML parsed with BeautifulSoup.

    :param event_url: URL to load in the browser.
    :param condition: a Selenium expected-condition (e.g.
        ``EC.presence_of_element_located(...)``) passed to ``WebDriverWait``.
    :return: ``BeautifulSoup`` of the element's innerHTML, or ``None`` on
        connection failure.
    """
    driver = None
    try:
        options = Options()
        options.add_argument("--headless")
        # GeckoDriverManager downloads/caches a geckodriver binary so no
        # system-wide driver install is needed.
        driver = webdriver.Firefox(
            service=FirefoxService(GeckoDriverManager().install()), options=options
        )

        driver.get(event_url)
        element = WebDriverWait(driver, 10).until(condition)

        html_content = element.get_attribute("innerHTML")
        return BeautifulSoup(html_content, "html.parser")
    except ConnectionError:
        self.stdout.write("Connection Error to webdriver")
        return None
    finally:
        # Always quit, even when WebDriverWait times out (TimeoutException
        # is deliberately left to propagate) — otherwise the headless
        # Firefox process is leaked on every failed scrape.
        if driver is not None:
            driver.quit()

def get_end_time(event_url):
driver = webdriver.Chrome()
driver.get(event_url)
event_element = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.CLASS_NAME, "event__topper-content"))
def get_end_time(self, event_url):
end_time_soup = self.connect_and_parse_html(
event_url, EC.presence_of_element_located((By.CLASS_NAME, "event__topper-content"))
)
end_time_soup = BeautifulSoup(event_element.get_attribute("innerHTML"), "html.parser")

end_time_range_str = (
end_time_soup.find("p", class_="event__meta event__time").text.strip().replace(".", "")
)
print(end_time_range_str)
if not end_time_range_str or ALL_DAY in end_time_range_str.lower():
driver.quit()

if (
not end_time_range_str
or ALL_DAY in end_time_range_str.lower()
or len(times := end_time_range_str.split(" - ")) <= 1
):
return None # No end time if the event is all day
times = end_time_range_str.split(" - ")
if len(times) <= 1:
driver.quit()
return None
end_time_str = times[1]
driver.quit()
return end_time_str

return times[1]

0 comments on commit e4d96c1

Please sign in to comment.