-
Notifications
You must be signed in to change notification settings - Fork 2
/
netflixMovieScraper.py
197 lines (179 loc) · 8.01 KB
/
netflixMovieScraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
"""
What are we scraping?
https://www.netflix.com/in/browse/genre/34399
- Name of the movie : <h1> tag class = title-title
- Maturity Number : <span> tag class = maturity-number
- Duration : <span> tag class = duration
- Synopsis : <div> class = title-info-synopsis
- Main Genre : <a> class = title-info-metadata-item item-genre
- Tag : <span> class = nm-collections-row-name #From Home Page
- Image : <img> class = nm-collections-title-img #From Home Page
- Related Genres - <a> class = more-details-item item-genres
- Movie Link - <a> class = nm-collections-title nm-collections-link
- Cast - <span> class = title-data-info-item-list
google - class="yuRUbf" -> first wikipedia link
wikipedia - class="suggestion-description" - if description has the word 'film' then go ahead
- class="suggestion-dropdown" - has 0 children nodes then ignore
"""
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import requests
from bs4 import BeautifulSoup
import json
import time
import re
import Spoiler_Detection
def get_genre(website_link):
    """Return the related-genre labels found on a Netflix title page.

    The page at ``website_link`` is downloaded with ``requests`` and parsed
    with BeautifulSoup; the text of every ``<a>`` matched by the class
    string ``more-details-item item-genres`` is collected, in document order.
    An empty list is returned when no such anchor is present.
    """
    page = BeautifulSoup(requests.get(website_link).text, "html.parser")
    genre_anchors = page.find_all("a", {"class": "more-details-item item-genres"})
    return [anchor.text for anchor in genre_anchors]
def _extract_plot(link):
    """Return the cleaned "Plot" text of the Wikipedia article at ``link``.

    Returns ``None`` when ``link`` is missing or is not an English-Wikipedia
    URL, when the page has no ``<span id="Plot">`` or ``<span id="Cast">``
    heading, or when the paragraphs run out before the Cast boundary is met.
    """
    if not link or not link.startswith("https://en.wikipedia.org"):
        return None

    response = requests.get(link)
    soup = BeautifulSoup(response.content, 'html.parser')

    plot_heading = soup.find('span', {'id': 'Plot'})
    cast_heading = soup.find('span', {'id': 'Cast'})
    if plot_heading is None or cast_heading is None:
        return None

    # The first paragraph after the Cast heading marks where the plot ends;
    # it does not change while walking, so look it up once.
    stop = cast_heading.find_next('p')

    parts = []
    paragraph = plot_heading.find_next('p')
    while paragraph != stop:
        if paragraph is None:
            # Ran off the end of the page without reaching the boundary.
            return None
        parts.append(paragraph.get_text())
        paragraph = paragraph.find_next('p')

    # Drop parenthesised / bracketed spans such as "(2019)" or "[1]".
    return re.sub(r"[\(\[].*?[\)\]]", "", "".join(parts))


def open_wiki(name, year):
    """Look up a film's plot summary on English Wikipedia via a Google search.

    A second browser tab (window name ``secondtab``) is opened on the
    module-level ``driver``, Google is searched for
    ``"<name> <year> film wikipedia english"``, and the ``href`` of the first
    descendant of the first ``yuRUbf`` result element is followed.

    Args:
        name: Film title used in the search query.
        year: Release year as a string (it is concatenated into the query).

    Returns:
        The text between the article's Plot and Cast headings with
        bracketed/parenthesised spans removed, or ``None`` when the result is
        not an English-Wikipedia link or the plot cannot be extracted.

    Raises:
        Selenium exceptions raised while driving the Google search page
        propagate to the caller, as before. The search tab is closed and
        focus is returned to the original window in every case.
    """
    original_window = driver.current_window_handle
    driver.execute_script("window.open('about:blank','secondtab');")
    driver.switch_to.window("secondtab")
    try:
        driver.get('https://www.google.com/')
        search = driver.find_element(By.CLASS_NAME, "gLFyf")
        search.send_keys(name + " " + year + " film wikipedia english")
        search.send_keys(Keys.RETURN)
        wiki_page = driver.find_element(By.CLASS_NAME, "yuRUbf")
        wiki_link = wiki_page.find_element(By.XPATH, ".//*")
        try:
            return _extract_plot(wiki_link.get_attribute("href"))
        except Exception:
            # Best effort: any scraping failure means "no plot available".
            # KeyboardInterrupt / SystemExit are deliberately not caught.
            return None
    finally:
        # Always close the search tab and hand focus back, so a failed
        # lookup cannot leave the driver pointing at the wrong window.
        driver.close()
        driver.switch_to.window(original_window)
def save_json(filename, json_list):
    """Write already-serialised JSON strings to ``./<filename>.json``.

    The strings in ``json_list`` are written back to back, in order, with no
    separator added between them. The file is created or truncated.
    """
    target = f"./{filename}.json"
    with open(target, "w") as outfile:
        outfile.writelines(json_list)
# ---------------------------------------------------------------------------
# Script body: walk every title link on the Netflix India movies genre page,
# scrape each title's metadata, look up its plot via open_wiki(), generate
# spoilers from the plot, and write the results as JSON (one file per home
# page section tag, plus a combined "movies" file).
# ---------------------------------------------------------------------------
options = webdriver.ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-logging'])
options.headless = False
options.add_argument("--window-size=1920,1200")
DRIVER_PATH = r'D:\School\College\Books-20210104T150531Z-001\Books\Sem 6\Mini Project\Mini-Project\ScrappingLogic\chromedriver.exe'
driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)
driver.get('https://www.netflix.com/in/browse/genre/34399')
all_links = driver.find_elements(By.TAG_NAME, "a")

TITLE_URL_PREFIX = "https://www.netflix.com/in/title/"
# Sections whose titles are not scraped.
SKIPPED_TAGS = ["Action & Adventure", "Kids & Family Movies", "Tamil-Language Movies", "Horror Movies"]
# Individual titles that are not scraped.
SKIPPED_NAMES = ["The Matchmaker", "Muoi: The Curse Returns", "The Light We Carry: Michelle Obama and Oprah Winfrey"]
# Matches parenthesised / bracketed spans in a title, e.g. "(Hindi Version)".
BRACKETED_TEXT = re.compile(r"[\(\[].*?[\)\]]")

count = 1            # 1-based position of the next title within the section
section = 2          # index of the current <section> in the page XPath
tag = None           # heading text of the current section
movie_dictionary = {}
json_list = []       # serialised movies of the current section
all_movies = []      # serialised movies of every section
seen_ids = set()     # Netflix ids already stored, used to skip duplicates
index_count = 1

for link in all_links:
    try:
        url = link.get_attribute("href")
        # Anchors without an href yield None; neither those nor non-title
        # links are of interest.
        if not url or not url.startswith(TITLE_URL_PREFIX):
            continue

        if count > 40:  # home page has 14 sections each with 40 movies
            section = section + 1
            count = 1
            save_json(tag, json_list)
            json_list = []

        tag = driver.find_element(By.XPATH, f"/html/body/div[1]/div/div[2]/main/section[{section}]/h2").text
        print(url)
        xpath = f"/html/body/div[1]/div/div[2]/main/section[{section}]/div/ul/li[{count}]/a/img"
        image_url = driver.find_element(By.XPATH, xpath).get_attribute("src")
        # Slice the prefix off; str.lstrip would strip a *character set*.
        movie_id = url[len(TITLE_URL_PREFIX):]
        count = count + 1
        movie_dictionary["id"] = int(movie_id)
        movie_dictionary["tag"] = tag

        if tag in SKIPPED_TAGS:
            continue
        # Skip titles already stored. Comparing serialised records cannot
        # detect this because every record carries a unique "index".
        if movie_dictionary["id"] in seen_ids:
            continue

        movie_dictionary["image"] = image_url
        movie_dictionary["source"] = url
        resp = requests.get(url)
        soup = BeautifulSoup(resp.text, 'html.parser')

        try:
            name = soup.find("h1", {"class": "title-title"}).text
            result = BRACKETED_TEXT.sub("", name).rstrip(" ")
            if result in SKIPPED_NAMES:
                continue
            movie_dictionary["index"] = index_count
            index_count = index_count + 1
            movie_dictionary["name"] = result
            movie_dictionary["genre"] = soup.find("a", {"class": "title-info-metadata-item item-genre"}).text
            movie_dictionary["maturity"] = soup.find("span", {"class": "maturity-number"}).text
            movie_dictionary["duration"] = soup.find("span", {"class": "duration"}).text
            movie_dictionary["synopsis"] = soup.find("div", {"class": "title-info-synopsis"}).text
            movie_dictionary["year"] = soup.find("span", {"class": "title-info-metadata-item item-year"}).text
        except AttributeError:
            # soup.find() returned None for one of the elements. Skip this
            # title instead of aborting the run before anything is saved.
            print(f"Skipping {url}: expected element missing from title page")
            continue

        try:
            movie_dictionary["related-genre"] = get_genre(url)
            plot = open_wiki(movie_dictionary["name"], movie_dictionary["year"])
        except Exception:
            # Best effort per title; Ctrl+C / SystemExit still stop the run.
            continue
        if plot is None:
            continue

        spoilers = Spoiler_Detection.generate_spoilers(plot)
        movie_dictionary["spoilers"] = spoilers
        json_object = json.dumps(movie_dictionary, indent=4)
        seen_ids.add(movie_dictionary["id"])
        all_movies.append(json_object)
        json_list.append(json_object)
    finally:
        # The shared record's genre fields are cleared after every link,
        # whichever path the iteration took.
        movie_dictionary["genre"] = ""
        movie_dictionary["related-genre"] = ""

# The in-loop flush only fires when the *next* section starts, so the final
# section has to be written here.
if tag is not None:
    save_json(tag, json_list)
save_json("movies", all_movies)