Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Scrape teacher sigarra url #120

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/config.ini
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ num_course_units=6500
num_course_metadata=10000
num_classes=12000
num_slots=32000
num_professor_link=3500
num_slot_professor=55000
num_slot_class=32000
num_professors=3500
Expand Down
13 changes: 11 additions & 2 deletions src/scrapper/database/dbs/create_db_sqlite3.sql
Original file line number Diff line number Diff line change
Expand Up @@ -112,16 +112,24 @@ CREATE TABLE `slot_class` (
PRIMARY KEY (`slot_id`, `class_id`)
);

--
-- Table structure for table `professor_link`
--
-- One URL (`link`) per row, keyed by `id`.
-- Rows are referenced by `slot_professor`.`professor_link_id`.
--
CREATE TABLE `professor_link` (
`id` INTEGER PRIMARY KEY,
`link` varchar(256)
);

-- --------------------------------------------------------

--
-- Table structure for table `class_professor`
-- Table structure for table `slot_professor`
--

-- Associates a slot with a professor: at most one row per
-- (`slot_id`, `professor_id`) pair, enforced by the composite primary key.
-- `professor_link_id` points at the `professor_link` row for this pairing.
-- Every foreign key cascades on both DELETE and UPDATE of the parent row.
CREATE TABLE `slot_professor` (
`slot_id` INTEGER NOT NULL,
`professor_id` INTEGER NOT NULL,
`professor_link_id` INTEGER NOT NULL,
FOREIGN KEY (`slot_id`) REFERENCES `slot` (`id`) ON DELETE CASCADE ON UPDATE CASCADE,
FOREIGN KEY (`professor_id`) REFERENCES `professor` (`id`) ON DELETE CASCADE ON UPDATE CASCADE,
FOREIGN KEY (`professor_link_id`) REFERENCES `professor_link` (`id`) ON DELETE CASCADE ON UPDATE CASCADE,
PRIMARY KEY (`slot_id`, `professor_id`)
);

Expand All @@ -133,7 +141,8 @@ CREATE TABLE `slot_professor` (
CREATE TABLE `professor` (
`id` INTEGER PRIMARY KEY,
`professor_acronym` varchar(16),
`professor_name` varchar(100)
`professor_name` varchar(100),
`professor_url` varchar(128)
);

-- --------------------------------------------------------
Expand Down
2 changes: 1 addition & 1 deletion src/scrapper/info.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@
print("Saving scrapper info...")
db = Database()
db.insert('info', {'date': datetime.now()})
db.connection.close()
db.connection.close()
7 changes: 5 additions & 2 deletions src/scrapper/items.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,18 +64,21 @@ class Slot(scrapy.Item):
professor_id = scrapy.Field()
last_updated = scrapy.Field()


class SlotClass(scrapy.Item):
    """Scrapy item pairing a slot with a class.

    The field names match the columns of the `slot_class` table.
    """

    slot_id = scrapy.Field()
    class_id = scrapy.Field()

class ProfessorLink(scrapy.Item):
    """Scrapy item holding one URL, matching the `professor_link` table."""

    # Primary key of the `professor_link` row.
    id = scrapy.Field()
    # URL string; the table column is declared as varchar(256).
    link = scrapy.Field()

class SlotProfessor(scrapy.Item):
    """Scrapy item associating a slot with a professor.

    The field names match the columns of the `slot_professor` table.
    """

    slot_id = scrapy.Field()
    professor_id = scrapy.Field()

    # References `professor_link`.`id` in the database schema.
    professor_link_id = scrapy.Field()

class Professor(scrapy.Item):
    """Scrapy item describing a professor, matching the `professor` table."""

    id = scrapy.Field()
    professor_acronym = scrapy.Field()
    professor_name = scrapy.Field()
    # URL string; the table column is declared as varchar(128).
    professor_url = scrapy.Field()
49 changes: 30 additions & 19 deletions src/scrapper/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from tqdm import tqdm


class MySQLPipeline():
class DBPipeline():
def __init__(self):
self.open_config()
self.db = Database()
Expand Down Expand Up @@ -79,9 +79,9 @@ def process_pbar(self):
# -------------------------------------------------------------------------


class FacultyPipeline(MySQLPipeline):
class FacultyPipeline(DBPipeline):
def __init__(self):
MySQLPipeline.__init__(self)
DBPipeline.__init__(self)
self.expected_num = int(self.config['statistics']['num_faculties'])
self.table_name = 'faculty'

Expand All @@ -91,9 +91,9 @@ def process_item(self, item, spider):
return item


class CoursePipeline(MySQLPipeline):
class CoursePipeline(DBPipeline):
def __init__(self):
MySQLPipeline.__init__(self)
DBPipeline.__init__(self)
self.expected_num = int(self.config['statistics']['num_courses'])
self.table_name = 'course'

Expand All @@ -103,9 +103,9 @@ def process_item(self, item, spider):
return item


class CourseUnitPipeline(MySQLPipeline):
class CourseUnitPipeline(DBPipeline):
def __init__(self):
MySQLPipeline.__init__(self)
DBPipeline.__init__(self)
self.expected_num = int(self.config['statistics']['num_course_units'])
self.table_name = 'course_unit'

Expand All @@ -115,9 +115,9 @@ def process_item(self, item, spider):
return item


class CourseMetadataPipeline(MySQLPipeline):
class CourseMetadataPipeline(DBPipeline):
def __init__(self):
MySQLPipeline.__init__(self)
DBPipeline.__init__(self)
self.expected_num = int(
self.config['statistics']['num_course_metadata'])
self.table_name = 'course_metadata'
Expand All @@ -128,9 +128,9 @@ def process_item(self, item, spider):
return item


class ClassPipeline(MySQLPipeline):
class ClassPipeline(DBPipeline):
def __init__(self):
MySQLPipeline.__init__(self)
DBPipeline.__init__(self)
self.expected_num = int(self.config['statistics']['num_classes'])
self.table_name = 'class'

Expand All @@ -140,9 +140,9 @@ def process_item(self, item, spider):
return item


class SlotPipeline(MySQLPipeline):
class SlotPipeline(DBPipeline):
def __init__(self):
MySQLPipeline.__init__(self)
DBPipeline.__init__(self)
self.expected_num = int(self.config['statistics']['num_slots'])
self.table_name = 'slot'

Expand All @@ -152,9 +152,9 @@ def process_item(self, item, spider):
return item


class SlotClassPipeline(MySQLPipeline):
class SlotClassPipeline(DBPipeline):
def __init__(self):
MySQLPipeline.__init__(self)
DBPipeline.__init__(self)
self.expected_num = int(self.config['statistics']['num_slot_class'])
self.table_name = 'slot_class'

Expand All @@ -164,9 +164,9 @@ def process_item(self, item, spider):
return item


class SlotProfessorPipeline(MySQLPipeline):
class SlotProfessorPipeline(DBPipeline):
def __init__(self):
MySQLPipeline.__init__(self)
DBPipeline.__init__(self)
self.expected_num = int(
self.config['statistics']['num_slot_professor'])
self.table_name = 'slot_professor'
Expand All @@ -176,10 +176,21 @@ def process_item(self, item, spider):
super().process_item(item, spider)
return item

class ProfessorLinkPipeline(DBPipeline):
    """Pipeline for `ProfessorLink` items, targeting the `professor_link` table."""

    def __init__(self):
        # The base initializer must run first: the config is read right after.
        super().__init__()
        statistics = self.config['statistics']
        self.table_name = 'professor_link'
        self.expected_num = int(statistics['num_professor_link'])

    def process_item(self, item, spider):
        """Forward `ProfessorLink` items to the base pipeline.

        Items of any other type are returned untouched so that the remaining
        pipelines can handle them.
        """
        if not isinstance(item, items.ProfessorLink):
            return item
        super().process_item(item, spider)
        return item

class ProfessorsPipeline(MySQLPipeline):
class ProfessorsPipeline(DBPipeline):
def __init__(self):
MySQLPipeline.__init__(self)
DBPipeline.__init__(self)
self.expected_num = int(self.config['statistics']['num_professors'])
self.table_name = 'professor'

Expand Down
1 change: 1 addition & 0 deletions src/scrapper/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@
'scrapper.pipelines.ClassPipeline': 0,
'scrapper.pipelines.SlotPipeline': 0,
'scrapper.pipelines.SlotProfessorPipeline': 0,
'scrapper.pipelines.ProfessorLinkPipeline': 0,
'scrapper.pipelines.ProfessorsPipeline': 0,
'scrapper.pipelines.SlotClassPipeline': 0
}
Expand Down
32 changes: 29 additions & 3 deletions src/scrapper/spiders/slot_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from scrapper.settings import CONFIG, PASSWORD, USERNAME

from ..database.Database import Database
from ..items import Slot, Class, SlotProfessor, Professor, SlotClass
from ..items import Slot, Class, SlotProfessor, Professor, SlotClass, ProfessorLink


def get_class_id(course_unit_id, class_name):
Expand Down Expand Up @@ -102,6 +102,23 @@ def check_login_response(self, response):
response.status), flush=True)
self.log('Login Failed. HTTP Error {}'.format(response.status))

def professor_link_exists(self, id: int) -> bool:
    """Return whether a `professor_link` row with the given id exists.

    A fresh `Database` connection is opened for the lookup and is always
    closed before returning, including when the query raises.

    Args:
        id: Value matched against the `id` column of `professor_link`.

    Returns:
        True if the query yields at least one row, False otherwise.

    Raises:
        TypeError, ValueError: If `id` cannot be converted to an integer.
    """
    # The statement is assembled with str.format, so the value is coerced to
    # int first: nothing but a numeric literal can end up inside the SQL text.
    sql = """
        SELECT id
        FROM professor_link
        WHERE id = {}
    """.format(int(id))

    db = Database()
    try:
        db.cursor.execute(sql)
        return db.cursor.fetchone() is not None
    finally:
        # Release the connection even if execute()/fetchone() failed.
        db.connection.close()

def classUnitRequests(self):
db = Database()
sql = """
Expand Down Expand Up @@ -180,7 +197,8 @@ def extractSchedule(self, response):
yield Professor(
id=sigarra_id,
professor_acronym=teacher["acronym"],
professor_name=name
professor_name=name,
professor_url=teacher["sigarra_url"]
)

for current_class in schedule["classes"]:
Expand All @@ -205,10 +223,18 @@ def extractSchedule(self, response):
for teacher in schedule["persons"]:
(sigarra_id, name) = self.get_professor_info(
teacher)

professor_link_id = schedule["id"]
if not self.professor_link_exists(professor_link_id):
yield ProfessorLink(
id=schedule["id"],
link=teacher["sigarra_url"]
)

yield SlotProfessor(
slot_id=schedule["id"],
professor_id=sigarra_id
professor_id=sigarra_id,
professor_link_id=schedule["id"]
)

for current_class in schedule["classes"]:
Expand Down
1 change: 1 addition & 0 deletions src/scripts/dump.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ def dump(self):
self.dump_table("class", con, f)
self.dump_table("slot", con, f)
self.dump_table("slot_class", con, f)
self.dump_table("professor_link", con, f)
self.dump_table("slot_professor", con, f)
f.close()

Expand Down
10 changes: 9 additions & 1 deletion src/scripts/dump/schema/create_db_sqlite3.sql
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,11 @@ CREATE TABLE `slot_class` (
PRIMARY KEY (`slot_id`, `class_id`)
) ENGINE=InnoDB CHARSET = utf8 COLLATE = utf8_general_ci;

--
-- Table structure for table `professor_link`
--
-- One URL (`link`) per row, keyed by `id`.
-- Rows are referenced by `slot_professor`.`professor_link_id`.
--
CREATE TABLE `professor_link` (
`id` INTEGER PRIMARY KEY,
`link` varchar(256)
) ENGINE=InnoDB CHARSET = utf8 COLLATE = utf8_general_ci;

-- --------------------------------------------------------

--
Expand All @@ -128,8 +133,10 @@ CREATE TABLE `slot_class` (
-- Associates a slot with a professor: at most one row per
-- (`slot_id`, `professor_id`) pair, enforced by the composite primary key.
-- `professor_link_id` points at the `professor_link` row for this pairing.
-- Every foreign key cascades on both DELETE and UPDATE of the parent row.
CREATE TABLE `slot_professor` (
`slot_id` INTEGER NOT NULL,
`professor_id` INTEGER NOT NULL,
`professor_link_id` INTEGER NOT NULL,
FOREIGN KEY (`slot_id`) REFERENCES `slot` (`id`) ON DELETE CASCADE ON UPDATE CASCADE,
FOREIGN KEY (`professor_id`) REFERENCES `professor` (`id`) ON DELETE CASCADE ON UPDATE CASCADE,
FOREIGN KEY (`professor_link_id`) REFERENCES `professor_link` (`id`) ON DELETE CASCADE ON UPDATE CASCADE,
PRIMARY KEY (`slot_id`, `professor_id`)
) ENGINE=InnoDB CHARSET = utf8 COLLATE = utf8_general_ci;

Expand All @@ -142,7 +149,8 @@ CREATE TABLE `slot_professor` (
CREATE TABLE `professor` (
`id` INTEGER PRIMARY KEY,
`professor_acronym` varchar(16),
`professor_name` varchar(100)
`professor_name` varchar(100),
`professor_url` varchar(128)
) ENGINE=InnoDB CHARSET = utf8 COLLATE = utf8_general_ci;

-- --------------------------------------------------------
Expand Down
10 changes: 9 additions & 1 deletion src/scripts/dump/schema/schema_mysql.sql
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,13 @@ CREATE TABLE `slot_class` (
CREATE TABLE `professor` (
`id` INTEGER PRIMARY KEY,
`professor_acronym` varchar(16),
`professor_name` varchar(100)
`professor_name` varchar(100),
`professor_url` varchar(128)
) ENGINE=InnoDB CHARSET = utf8 COLLATE = utf8_general_ci;

--
-- Table structure for table `professor_link`
--
-- One URL (`link`) per row, keyed by `id`.
-- Rows are referenced by `slot_professor`.`professor_link_id`.
--
CREATE TABLE `professor_link` (
`id` INTEGER PRIMARY KEY,
`link` varchar(256)
) ENGINE=InnoDB CHARSET = utf8 COLLATE = utf8_general_ci;

-- --------------------------------------------------------
Expand All @@ -144,8 +150,10 @@ CREATE TABLE `professor` (
-- Associates a slot with a professor: at most one row per
-- (`slot_id`, `professor_id`) pair, enforced by the composite primary key.
-- `professor_link_id` points at the `professor_link` row for this pairing.
-- Every foreign key cascades on both DELETE and UPDATE of the parent row.
CREATE TABLE `slot_professor` (
`slot_id` INTEGER NOT NULL,
`professor_id` INTEGER NOT NULL,
`professor_link_id` INTEGER NOT NULL,
FOREIGN KEY (`slot_id`) REFERENCES `slot` (`id`) ON DELETE CASCADE ON UPDATE CASCADE,
FOREIGN KEY (`professor_id`) REFERENCES `professor` (`id`) ON DELETE CASCADE ON UPDATE CASCADE,
FOREIGN KEY (`professor_link_id`) REFERENCES `professor_link` (`id`) ON DELETE CASCADE ON UPDATE CASCADE,
PRIMARY KEY (`slot_id`, `professor_id`)
) ENGINE=InnoDB CHARSET = utf8 COLLATE = utf8_general_ci;

Expand Down