-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
119 lines (105 loc) · 4.26 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
from bs4 import BeautifulSoup
from selenium import webdriver
from lxml import etree as et
import argparse
import csv
# Shared Chrome WebDriver session. It is created as soon as this module is
# loaded; get_dom() uses it to fetch pages and the __main__ block quits it.
driver = webdriver.Chrome()
def get_dom(url):
    """Load *url* in the shared Selenium driver and return it as an lxml tree.

    The driver's page source is parsed with BeautifulSoup's ``html.parser``,
    and the re-serialised markup is then passed to ``lxml.etree.HTML`` so
    that callers can run XPath queries against the result.
    """
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    return et.HTML(str(soup))
# Function to extract the job link
def get_job_link(job):
    """Return the ``href`` of the first ``<a>`` directly inside an ``<h2>`` under *job*.

    *job* is any object exposing an lxml-style ``xpath`` method. If the query
    matches nothing, or the lookup fails for any other reason, the
    placeholder string ``'Not available'`` is returned instead.
    """
    try:
        return job.xpath('./descendant::h2/a/@href')[0]
    except Exception:
        # Best-effort scraping: a missing field must not abort the whole run.
        return 'Not available'
# Function to extract the job title
def get_job_title(job):
    """Return the text of the first ``h2/a/span`` node found under *job*.

    *job* is any object exposing an lxml-style ``xpath`` method. If the query
    matches nothing, or the lookup fails for any other reason, the
    placeholder string ``'Not available'`` is returned instead.
    """
    try:
        return job.xpath('./descendant::h2/a/span/text()')[0]
    except Exception:
        # Best-effort scraping: a missing field must not abort the whole run.
        return 'Not available'
# Function to extract the company name
def get_company_name(job):
    """Return the text of the first ``span[data-testid="company-name"]`` under *job*.

    *job* is any object exposing an lxml-style ``xpath`` method. If the query
    matches nothing, or the lookup fails for any other reason, the
    placeholder string ``'Not available'`` is returned instead.
    """
    try:
        return job.xpath('./descendant::span[@data-testid="company-name"]/text()')[0]
    except Exception:
        # Best-effort scraping: a missing field must not abort the whole run.
        return 'Not available'
# Function to extract the company location
def get_company_location(job):
    """Return the text of the first ``div[data-testid="text-location"]`` under *job*.

    *job* is any object exposing an lxml-style ``xpath`` method. If the query
    matches nothing, or the lookup fails for any other reason, the
    placeholder string ``'Not available'`` is returned instead.
    """
    try:
        return job.xpath('./descendant::div[@data-testid="text-location"]/text()')[0]
    except Exception:
        # Best-effort scraping: a missing field must not abort the whole run.
        return 'Not available'
# Function to extract salary information
def get_salary(job):
    """Return the salary text for a job card, or ``'Not available'``.

    Two locations are tried in order: the ``estimated-salary`` span first,
    then the ``metadata salary-snippet-container`` div. The first non-empty
    match wins and its first text node is returned.

    *job* is any object exposing an lxml-style ``xpath`` method. A lookup
    that raises is treated exactly like one that matches nothing, so the
    next location is tried. (Previously a failure of the first lookup stored
    the placeholder string and then indexed it, returning ``'N'``.)
    """
    salary_queries = (
        './descendant::span[@class="estimated-salary"]/span/text()',
        './descendant::div[@class="metadata salary-snippet-container"]/div/text()',
    )
    for query in salary_queries:
        try:
            matches = job.xpath(query)
        except Exception:
            # Best-effort scraping: fall through to the next location.
            continue
        if matches:
            return matches[0]
    return 'Not available'
# Function to extract the job description
def get_job_desc(job):
    """Return the job-card bullet points joined with commas.

    The text of every ``<li>`` inside the inline-styled ``<ul>`` under *job*
    is collected and joined with ``","``. If the lookup fails or finds no
    items, the placeholder string ``'Not available'`` is returned.
    """
    try:
        bullet_points = job.xpath('./descendant::ul[@style="list-style-type:circle;margin-top: 0px;margin-bottom: 0px;padding-left:20px;"]/li/text()')
    except Exception:
        # Best-effort scraping: a missing field must not abort the whole run.
        return 'Not available'
    return ",".join(bullet_points) if bullet_points else 'Not available'
if __name__ == "__main__":
    # Command-line entry point: build an Indeed search URL from the arguments,
    # scrape every job card on that results page and write one CSV row per card.
    parser = argparse.ArgumentParser(
        prog='pyIndeedScraper',
        description='Scrape Indeed Jobs with Python using Selenium and Beautiful Soup.')
    parser.add_argument('-jobTitle', required=True, help='Title for Job, Use + instead of Spaces.')
    parser.add_argument('-location', required=True, help='Location for Job, Use + instead of Spaces.')
    parser.add_argument('-page', default=1, help='Page to Scrape from')
    parser.add_argument('-jobtype', required=True, choices=['fulltime', 'parttime', 'internship', 'contract', 'temporary'], help='Type of Job: Remote, Full Time, Part Time')

    # The module-level driver is already running when this block starts, so
    # everything - including argument parsing, which exits on bad input or
    # --help - sits inside try/finally to guarantee the browser is shut down.
    try:
        args = parser.parse_args()
        base_url = "https://www.indeed.com"
        # NOTE(review): -page is passed straight through as the `start` query
        # parameter; confirm whether the site expects a page number or a
        # result offset there.
        url = f"{base_url}/jobs?q={args.jobTitle}&l={args.location}&radius=35&start={args.page}"
        jobType = args.jobtype
        # argparse restricts -jobtype to the choices above, and the filter
        # fragment differs only by that value, so one template covers them all.
        url += f'&sc=0kf%3Ajt%28{jobType}%29%3B'

        page_dom = get_dom(url)
        jobs = page_dom.xpath('//div[@class="job_seen_beacon"]')

        csv_name = f'indeed{args.jobTitle.replace("+", "-")}-{args.location.replace("+", "-")}-{args.page}.csv'
        # newline='' is required by the csv module (otherwise blank rows appear
        # on Windows); an explicit UTF-8 encoding keeps non-ASCII job text from
        # failing under a narrower platform default.
        with open(csv_name, 'w', newline='', encoding='utf-8') as csvFile:
            csvWriter = csv.writer(csvFile)
            csvWriter.writerow(['Job Title', 'Company Name', 'Company Location', 'Salary', 'Job Type', 'Job Description', 'Job Link'])
            for job in jobs:
                job_link = get_job_link(job)
                # Only prefix real links; the placeholder must stay readable
                # rather than becoming "https://www.indeed.comNot available".
                if job_link != 'Not available':
                    job_link = base_url + job_link
                record = [
                    get_job_title(job),
                    get_company_name(job),
                    get_company_location(job),
                    get_salary(job),
                    jobType,
                    get_job_desc(job),
                    job_link,
                ]
                csvWriter.writerow(record)
        # No explicit close(): the with-block has already closed the file.
    finally:
        driver.quit()
    print("Successfully ended Script")