-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathjobs_parallel_scraping.py
92 lines (73 loc) · 2.79 KB
/
jobs_parallel_scraping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import mysql.connector
import requests
from bs4 import BeautifulSoup
import time
from multiprocessing import Pool
from functools import partial
# Function to create the 'jobs' table in MySQL
def create_jobs_table_parallel():
conn = mysql.connector.connect(
host='localhost',
user='root',
password='root',
database='jobs'
)
cursor = conn.cursor()
# Define the SQL query to create the 'jobs' table
create_table_query = """
CREATE TABLE IF NOT EXISTS parallel_jobs (
ID INT AUTO_INCREMENT PRIMARY KEY,
Company_Name VARCHAR(255),
Required_Skills LONGTEXT,
More_Info LONGTEXT
)
"""
cursor.execute(create_table_query)
conn.commit()
conn.close()
# Function to save job listings to MySQL
def save_to_mysql_parallel(job_listings):
conn = mysql.connector.connect(
host='localhost',
user='root',
password='root',
database='jobs'
)
cursor = conn.cursor()
for job in job_listings:
company_name = job['Company Name']
required_skills = job['Required Skills']
more_info = job['More Info']
# Define the SQL insert query to save the data in the 'jobs' table
insert_query = "INSERT INTO parallel_jobs (company_name, required_skills, more_info) VALUES (%s, %s, %s)"
values = (company_name, required_skills, more_info)
cursor.execute(insert_query, values)
# Commit the changes and close the database connection
conn.commit()
conn.close()
# Function to find jobs
def find_jobs(url, skills_to_filter):
html_text = requests.get(url).text
soup = BeautifulSoup(html_text, 'lxml')
jobs = soup.find_all('li', class_='clearfix job-bx wht-shd-bx')
job_listings = []
for job in jobs:
job_date = job.find('span', class_='sim-posted').span.text
if 'few' in job_date:
company_name = job.find('h3', class_='joblist-comp-name').text.strip()
skills = job.find('span', class_='srp-skills').text.strip()
more_info = job.header.h2.a['href']
skills_matched = [skill for skill in skills_to_filter if skill.lower() in skills.lower()]
if not skills_matched:
job_listings.append({
'Company Name': company_name,
'Required Skills': skills,
'More Info': more_info
})
return job_listings
# Function to scrape a page
def scrape_page_parallel(page_number, skills_to_filter):
base_url = 'https://www.timesjobs.com/candidate/job-search.html?searchType=personalizedSearch&from=submit&txtKeywords=python&txtLocation='
page_url = f'{base_url}&sequence={page_number}&startPage=1'
print(f'Scraping page {page_number}')
return find_jobs(page_url, skills_to_filter)