Skip to content

Commit

Permalink
fix: scraper and period location (#289)
Browse files Browse the repository at this point in the history
* feat: adding new case for scraper, for instance multiple periods and multiple locations

* feat: adding case of multiple locations and periods divided with a slash
  • Loading branch information
JasonNotJson authored May 17, 2023
1 parent 31ca68e commit ddc81ea
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 20 deletions.
9 changes: 6 additions & 3 deletions src/lambda/syllabus-scraper/crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ def execute(self):
:return: list of courses
"""
pages = self.get_max_page()
course_pages = run_concurrently(self.scrape_catalog, range(pages), self.worker)
course_pages = run_concurrently(
self.scrape_catalog, range(pages), self.worker)
course_ids = (course_id for page in course_pages for course_id in page)
results = run_concurrently(self.scrape_course, course_ids, self.worker)
return results
Expand Down Expand Up @@ -86,8 +87,10 @@ def scrape_course(self, course_id):
"p": 'string', # subtitle
}
"""
req_en = requests.Request(url=build_url(lang='en', course_id=course_id), headers=header)
req_jp = requests.Request(url=build_url(lang='jp', course_id=course_id), headers=header)
req_en = requests.Request(url=build_url(
lang='en', course_id=course_id), headers=header)
req_jp = requests.Request(url=build_url(
lang='jp', course_id=course_id), headers=header)
parsed_en = html.fromstring(requests.urlopen(req_en).read())
parsed_jp = html.fromstring(requests.urlopen(req_jp).read())
info_en = parsed_en.xpath(query["info_table"])[0]
Expand Down
49 changes: 32 additions & 17 deletions src/lambda/syllabus-scraper/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,11 +144,23 @@ def merge_period_location(periods, locations):
p["l"] = locations[0]
return periods
# TODO find other cases
# Case 2: More no. of periods than no. of locations
zipped = list(itertools.zip_longest(periods, locations))
for (p, loc) in zipped:
p["l"] = loc
occurrences.append(p)
# Case 2: multiple periods and multiple locations string divided with a '/'
elif len(periods) == len(locations):
for i in range(len(periods)):
locs = locations[i].split('/')
temp_period = periods[i]
for loc in locs:
temp_period_copy = temp_period.copy()
temp_period_copy["l"] = loc.strip()
occurrences.append(temp_period_copy)
return occurrences

# Case 3: More no. of periods than no. of locations
else:
zipped = list(itertools.zip_longest(periods, locations))
for (p, loc) in zipped:
p["l"] = loc
occurrences.append(p)
return occurrences


Expand Down Expand Up @@ -191,15 +203,15 @@ def parse_location(loc):
rooms = []
locations = loc.split('/')
for l in locations:
match = re.search(r'0(\d):(.*)', l)
count, classroom = int(match.group(1)) - 1, match.group(2)
classroom = rename_location(classroom)
# Sub-case: two location records for same period
if count >= len(rooms):
rooms.append(classroom)
else:
rooms.__setitem__(count, rooms[count] + "/" + classroom)
return rooms
matches = re.findall(r'0(\d):(.*)', l)
for match in matches:
count, classroom = int(match[0]) - 1, match[1]
classroom = rename_location(classroom)
if count >= len(rooms):
rooms.append([classroom])
else:
rooms[count].append(classroom)
return [room for sublist in rooms for room in sublist]


def parse_lang(lang):
Expand Down Expand Up @@ -243,7 +255,8 @@ def parse_period(schedule):
return [{"d": -1, "p": -1}]
if occ == "othersOn demand":
return [{"d": -1, "p": 0}]
occ_matches = re.finditer(r'(Mon|Tues|Wed|Thur|Fri|Sat|Sun)\.(\d-\d|\d|On demand)', occ)
occ_matches = re.finditer(
r'(Mon|Tues|Wed|Thur|Fri|Sat|Sun)\.(\d-\d|\d|On demand)', occ)
occurrences = []
for match in occ_matches:
day, period = match.group(1), match.group(2)
Expand Down Expand Up @@ -300,8 +313,10 @@ def upload_to_s3(syllabus, school):
'RequestCharged': 'requester'
}
"""
s3 = boto3.resource('s3', region_name="ap-northeast-1", verify=False, config=Config(signature_version='s3v4'))
syllabus_object = s3.Object(os.getenv('BUCKET_NAME'), os.getenv('OBJECT_PATH') + school + '.json')
s3 = boto3.resource('s3', region_name="ap-northeast-1",
verify=False, config=Config(signature_version='s3v4'))
syllabus_object = s3.Object(
os.getenv('BUCKET_NAME'), os.getenv('OBJECT_PATH') + school + '.json')
body = bytes(json.dumps(list(syllabus)).encode('UTF-8'))
resp = syllabus_object.put(
ACL='private',
Expand Down

0 comments on commit ddc81ea

Please sign in to comment.