From 6d3389d4b01a089e8749974cf4cb55dfdd79e453 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1s=20Palma?= Date: Sat, 24 Aug 2024 15:32:22 +0100 Subject: [PATCH] feat: scrape teacher sigarra url --- src/config.ini | 1 + .../database/dbs/create_db_sqlite3.sql | 13 ++++- src/scrapper/info.py | 2 +- src/scrapper/items.py | 7 ++- src/scrapper/pipelines.py | 49 ++++++++++++------- src/scrapper/settings.py | 1 + src/scrapper/spiders/slot_spider.py | 32 ++++++++++-- src/scripts/dump.py | 1 + src/scripts/dump/schema/create_db_sqlite3.sql | 10 +++- src/scripts/dump/schema/schema_mysql.sql | 10 +++- 10 files changed, 97 insertions(+), 29 deletions(-) diff --git a/src/config.ini b/src/config.ini index f9ddf53..38df9fa 100644 --- a/src/config.ini +++ b/src/config.ini @@ -27,6 +27,7 @@ num_course_units=6500 num_course_metadata=10000 num_classes=12000 num_slots=32000 +num_professor_link=3500 num_slot_professor=55000 num_slot_class=32000 num_professors=3500 diff --git a/src/scrapper/database/dbs/create_db_sqlite3.sql b/src/scrapper/database/dbs/create_db_sqlite3.sql index fd70f35..049a659 100644 --- a/src/scrapper/database/dbs/create_db_sqlite3.sql +++ b/src/scrapper/database/dbs/create_db_sqlite3.sql @@ -112,16 +112,24 @@ CREATE TABLE `slot_class` ( PRIMARY KEY (`slot_id`, `class_id`) ); +CREATE TABLE `professor_link` ( + `id` INTEGER PRIMARY KEY, + `link` varchar(256) +); + -- -------------------------------------------------------- + -- --- Table structure for table `class_professor` +-- Table structure for table `schedule_professor` -- CREATE TABLE `slot_professor` ( `slot_id` INTEGER NOT NULL, `professor_id` INTEGER NOT NULL, + `professor_link_id` INTEGER NOT NULL, FOREIGN KEY (`slot_id`) REFERENCES `slot` (`id`) ON DELETE CASCADE ON UPDATE CASCADE, FOREIGN KEY (`professor_id`) REFERENCES `professor` (`id`) ON DELETE CASCADE ON UPDATE CASCADE, + FOREIGN KEY (`professor_link_id`) REFERENCES `professor_link` (`id`) ON DELETE CASCADE ON UPDATE CASCADE, PRIMARY KEY (`slot_id`, `professor_id`) ); @@ -133,7 +141,8 @@ CREATE TABLE `slot_professor` ( CREATE TABLE `professor` ( `id` INTEGER PRIMARY KEY, `professor_acronym` varchar(16), - `professor_name` varchar(100) + `professor_name` varchar(100), + `professor_url` varchar(128) ); -- -------------------------------------------------------- diff --git a/src/scrapper/info.py b/src/scrapper/info.py index 883f7e2..c9a5437 100644 --- a/src/scrapper/info.py +++ b/src/scrapper/info.py @@ -4,4 +4,4 @@ print("Saving scrapper info...") db = Database() db.insert('info', {'date': datetime.now()}) -db.connection.close() \ No newline at end of file +db.connection.close() diff --git a/src/scrapper/items.py b/src/scrapper/items.py index 3e777d8..391d675 100644 --- a/src/scrapper/items.py +++ b/src/scrapper/items.py @@ -64,18 +64,21 @@ class Slot(scrapy.Item): professor_id = scrapy.Field() last_updated = scrapy.Field() - class SlotClass(scrapy.Item): slot_id = scrapy.Field() class_id = scrapy.Field() +class ProfessorLink(scrapy.Item): + id = scrapy.Field() + link = scrapy.Field() class SlotProfessor(scrapy.Item): slot_id = scrapy.Field() professor_id = scrapy.Field() - + professor_link_id = scrapy.Field() class Professor(scrapy.Item): id = scrapy.Field() professor_acronym = scrapy.Field() professor_name = scrapy.Field() + professor_url = scrapy.Field() diff --git a/src/scrapper/pipelines.py b/src/scrapper/pipelines.py index 5645ac1..7eaceeb 100644 --- a/src/scrapper/pipelines.py +++ b/src/scrapper/pipelines.py @@ -10,7 +10,7 @@ from tqdm import tqdm -class MySQLPipeline(): +class DBPipeline(): def __init__(self): self.open_config() self.db = Database() @@ -79,9 +79,9 @@ def process_pbar(self): # ------------------------------------------------------------------------- -class FacultyPipeline(MySQLPipeline): +class FacultyPipeline(DBPipeline): def __init__(self): - MySQLPipeline.__init__(self) + DBPipeline.__init__(self) self.expected_num = int(self.config['statistics']['num_faculties']) self.table_name = 'faculty' @@ -91,9 +91,9 @@ def process_item(self, item, spider): return item -class CoursePipeline(MySQLPipeline): +class CoursePipeline(DBPipeline): def __init__(self): - MySQLPipeline.__init__(self) + DBPipeline.__init__(self) self.expected_num = int(self.config['statistics']['num_courses']) self.table_name = 'course' @@ -103,9 +103,9 @@ def process_item(self, item, spider): return item -class CourseUnitPipeline(MySQLPipeline): +class CourseUnitPipeline(DBPipeline): def __init__(self): - MySQLPipeline.__init__(self) + DBPipeline.__init__(self) self.expected_num = int(self.config['statistics']['num_course_units']) self.table_name = 'course_unit' @@ -115,9 +115,9 @@ def process_item(self, item, spider): return item -class CourseMetadataPipeline(MySQLPipeline): +class CourseMetadataPipeline(DBPipeline): def __init__(self): - MySQLPipeline.__init__(self) + DBPipeline.__init__(self) self.expected_num = int( self.config['statistics']['num_course_metadata']) self.table_name = 'course_metadata' @@ -128,9 +128,9 @@ def process_item(self, item, spider): return item -class ClassPipeline(MySQLPipeline): +class ClassPipeline(DBPipeline): def __init__(self): - MySQLPipeline.__init__(self) + DBPipeline.__init__(self) self.expected_num = int(self.config['statistics']['num_classes']) self.table_name = 'class' @@ -140,9 +140,9 @@ def process_item(self, item, spider): return item -class SlotPipeline(MySQLPipeline): +class SlotPipeline(DBPipeline): def __init__(self): - MySQLPipeline.__init__(self) + DBPipeline.__init__(self) self.expected_num = int(self.config['statistics']['num_slots']) self.table_name = 'slot' @@ -152,9 +152,9 @@ def process_item(self, item, spider): return item -class SlotClassPipeline(MySQLPipeline): +class SlotClassPipeline(DBPipeline): def __init__(self): - MySQLPipeline.__init__(self) + DBPipeline.__init__(self) self.expected_num = int(self.config['statistics']['num_slot_class']) self.table_name = 'slot_class' @@ -164,9 +164,9 @@ def process_item(self, item, spider): return item -class SlotProfessorPipeline(MySQLPipeline): +class SlotProfessorPipeline(DBPipeline): def __init__(self): - MySQLPipeline.__init__(self) + DBPipeline.__init__(self) self.expected_num = int( self.config['statistics']['num_slot_professor']) self.table_name = 'slot_professor' @@ -176,10 +176,21 @@ def process_item(self, item, spider): super().process_item(item, spider) return item +class ProfessorLinkPipeline(DBPipeline): + def __init__(self): + DBPipeline.__init__(self) + self.expected_num = int( + self.config['statistics']['num_professor_link']) + self.table_name = 'professor_link' + + def process_item(self, item, spider): + if isinstance(item, items.ProfessorLink): + super().process_item(item, spider) + return item -class ProfessorsPipeline(MySQLPipeline): +class ProfessorsPipeline(DBPipeline): def __init__(self): - MySQLPipeline.__init__(self) + DBPipeline.__init__(self) self.expected_num = int(self.config['statistics']['num_professors']) self.table_name = 'professor' diff --git a/src/scrapper/settings.py b/src/scrapper/settings.py index e360dde..7a10dd5 100644 --- a/src/scrapper/settings.py +++ b/src/scrapper/settings.py @@ -83,6 +83,7 @@ 'scrapper.pipelines.ClassPipeline': 0, 'scrapper.pipelines.SlotPipeline': 0, 'scrapper.pipelines.SlotProfessorPipeline': 0, + 'scrapper.pipelines.ProfessorLinkPipeline': 0, 'scrapper.pipelines.ProfessorsPipeline': 0, 'scrapper.pipelines.SlotClassPipeline': 0 } diff --git a/src/scrapper/spiders/slot_spider.py b/src/scrapper/spiders/slot_spider.py index 46ca5da..25714c5 100644 --- a/src/scrapper/spiders/slot_spider.py +++ b/src/scrapper/spiders/slot_spider.py @@ -12,7 +12,7 @@ from scrapper.settings import CONFIG, PASSWORD, USERNAME from ..database.Database import Database -from ..items import Slot, Class, SlotProfessor, Professor, SlotClass +from ..items import Slot, Class, SlotProfessor, Professor, SlotClass, ProfessorLink def get_class_id(course_unit_id, class_name): @@ -102,6 +102,23 @@ def check_login_response(self, response): response.status), flush=True) self.log('Login Failed. HTTP Error {}'.format(response.status)) + def professor_link_exists(self, id: int) -> bool: + exists = False + db = Database() + sql = """ + SELECT id + FROM professor_link + WHERE id = {} + """.format(id) + + db.cursor.execute(sql) + if db.cursor.fetchone() != None: + exists = True + + db.connection.close() + + return exists + def classUnitRequests(self): db = Database() sql = """ @@ -180,7 +197,8 @@ def extractSchedule(self, response): yield Professor( id=sigarra_id, professor_acronym=teacher["acronym"], - professor_name=name + professor_name=name, + professor_url=teacher["sigarra_url"] ) for current_class in schedule["classes"]: @@ -205,10 +223,18 @@ def extractSchedule(self, response): for teacher in schedule["persons"]: (sigarra_id, name) = self.get_professor_info( teacher) + + professor_link_id = schedule["id"] + if not self.professor_link_exists(professor_link_id): + yield ProfessorLink( + id=schedule["id"], + link=teacher["sigarra_url"] + ) yield SlotProfessor( slot_id=schedule["id"], - professor_id=sigarra_id + professor_id=sigarra_id, + professor_link_id=schedule["id"] ) for current_class in schedule["classes"]: diff --git a/src/scripts/dump.py b/src/scripts/dump.py index 7f0001e..4627e47 100644 --- a/src/scripts/dump.py +++ b/src/scripts/dump.py @@ -38,6 +38,7 @@ def dump(self): self.dump_table("class", con, f) self.dump_table("slot", con, f) self.dump_table("slot_class", con, f) + self.dump_table("professor_link", con, f) self.dump_table("slot_professor", con, f) f.close() diff --git a/src/scripts/dump/schema/create_db_sqlite3.sql b/src/scripts/dump/schema/create_db_sqlite3.sql index 3fb45b5..5845cea 100644 --- a/src/scripts/dump/schema/create_db_sqlite3.sql +++ b/src/scripts/dump/schema/create_db_sqlite3.sql @@ -119,6 +119,11 @@ CREATE TABLE `slot_class` ( PRIMARY KEY (`slot_id`, `class_id`) ) ENGINE=InnoDB CHARSET = utf8 COLLATE = utf8_general_ci; +CREATE TABLE `professor_link` ( + `id` INTEGER PRIMARY KEY, + `link` varchar(256) +) ENGINE=InnoDB CHARSET = utf8 COLLATE = utf8_general_ci; + -- -------------------------------------------------------- -- @@ -128,8 +133,10 @@ CREATE TABLE `slot_class` ( CREATE TABLE `slot_professor` ( `slot_id` INTEGER NOT NULL, `professor_id` INTEGER NOT NULL, + `professor_link_id` INTEGER NOT NULL, FOREIGN KEY (`slot_id`) REFERENCES `slot` (`id`) ON DELETE CASCADE ON UPDATE CASCADE, FOREIGN KEY (`professor_id`) REFERENCES `professor` (`id`) ON DELETE CASCADE ON UPDATE CASCADE, + FOREIGN KEY (`professor_link_id`) REFERENCES `professor_link` (`id`) ON DELETE CASCADE ON UPDATE CASCADE, PRIMARY KEY (`slot_id`, `professor_id`) ) ENGINE=InnoDB CHARSET = utf8 COLLATE = utf8_general_ci; @@ -142,7 +149,8 @@ CREATE TABLE `slot_professor` ( CREATE TABLE `professor` ( `id` INTEGER PRIMARY KEY, `professor_acronym` varchar(16), - `professor_name` varchar(100) + `professor_name` varchar(100), + `professor_url` varchar(128) ) ENGINE=InnoDB CHARSET = utf8 COLLATE = utf8_general_ci; -- -------------------------------------------------------- diff --git a/src/scripts/dump/schema/schema_mysql.sql b/src/scripts/dump/schema/schema_mysql.sql index 836b5a8..e606705 100644 --- a/src/scripts/dump/schema/schema_mysql.sql +++ b/src/scripts/dump/schema/schema_mysql.sql @@ -132,7 +132,13 @@ CREATE TABLE `slot_class` ( CREATE TABLE `professor` ( `id` INTEGER PRIMARY KEY, `professor_acronym` varchar(16), - `professor_name` varchar(100) + `professor_name` varchar(100), + `professor_url` varchar(128) +) ENGINE=InnoDB CHARSET = utf8 COLLATE = utf8_general_ci; + +CREATE TABLE `professor_link` ( + `id` INTEGER PRIMARY KEY, + `link` varchar(256) ) ENGINE=InnoDB CHARSET = utf8 COLLATE = utf8_general_ci; -- -------------------------------------------------------- @@ -144,8 +150,10 @@ CREATE TABLE `professor` ( CREATE TABLE `slot_professor` ( `slot_id` INTEGER NOT NULL, `professor_id` INTEGER NOT NULL, + `professor_link_id` INTEGER NOT NULL, FOREIGN KEY (`slot_id`) REFERENCES `slot` (`id`) ON DELETE CASCADE ON UPDATE CASCADE, FOREIGN KEY (`professor_id`) REFERENCES `professor` (`id`) ON DELETE CASCADE ON UPDATE CASCADE, + FOREIGN KEY (`professor_link_id`) REFERENCES `professor_link` (`id`) ON DELETE CASCADE ON UPDATE CASCADE, PRIMARY KEY (`slot_id`, `professor_id`) ) ENGINE=InnoDB CHARSET = utf8 COLLATE = utf8_general_ci;