-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy path9gagFinal.py
113 lines (96 loc) · 3.63 KB
/
9gagFinal.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# Author : [email protected]
# New script after site changed : Working as of 8th July 2013
import os
import urllib2
number=1
nextLink="http://9gag.com/"
pageNumber=1
def FindLinks(): #required for getting page source of base page that is then passed to FindLinks2
os.system("echo Start|cat>>log")
base_link="http://9gag.com/"
base_contents=urllib2.urlopen(base_link).read()
FindLinks2(base_contents)
def FindLinks2(base_contents): #find links like http://9gag.com/gag/6482858 that can be passed to the Extract Image Function
global number
initial=0
breakval=0
for i in range (0,10): #keep on finding links until the next link is not equal to first one
start_link=base_contents.find('data-entry-url="',initial)+16
end_link=base_contents.find('"',start_link)
link=base_contents[start_link:end_link]
#print link
initial=end_link
if (i==0):
breakval=initial
if (initial==breakval and i!=0):
print "break!"
break
temp=str(number)+". "+link
print temp
os.system("echo %s|cat>>log" %temp)
ExtractImage(link)
number=number+1
def FindTitle(temp): #Find title of image to be downloaded
start_title_of_image=temp.find('title>9GAG - ')+13
end_title_of_image=temp.find('</title', start_title_of_image)
title_of_image=temp[start_title_of_image:end_title_of_image]
#print title_of_image
return title_of_image
def Rename(actual_link,toName): #Give the downloaded file a more meaningful name
fromName=actual_link[actual_link.find("photo/")+6:]
toName=toName+".jpg"
#however if there are space in between it will lead to problem hence replace all spaces in toName to '_'
toName=toName.replace(' ','_')
os.system("mv %s %s" %(fromName,toName))
def ExtractImage(inputUrl): #Will extract image from a page like http://9gag.com/gag/6482858
Page_contents=urllib2.urlopen(inputUrl).read()
image_link_start=Page_contents.find('href="')
image_link_end=Page_contents.find('" />',image_link_start)
temp=Page_contents[image_link_start:image_link_end]
#print temp
start_actual_link=temp.find("//")+2
end_actual_link=temp.find('.jpg',start_actual_link)+4
actual_link=temp[start_actual_link:end_actual_link]
#print actual_link
title_of_image=FindTitle(Page_contents)
print title_of_image
os.system("echo %s|cat>>log" %title_of_image)
print
print
os.system("echo|cat>>log")
os.system("echo|cat>>log")
os.system("wget %s >/dev/null 2>&1" %actual_link)
Rename(actual_link,title_of_image)
def FindLinks3(base_link): #find page source of next hot page and return it
global nextLink
base_contents=urllib2.urlopen(base_link).read()
start_link=base_contents.find('load-more-post" href="')+22
end_link=base_contents.find('"',start_link)
link="http://9gag.com/"+base_contents[start_link:end_link]
if(nextLink==link):
start_link=base_contents.find('load-more-post" href="')+22
end_link=base_contents.find('"',start_link)
link="http://9gag.com/"+base_contents[start_link:end_link]
nextLink=link
print link
pageSource=urllib2.urlopen(link).read()
return pageSource
os.system("clear")
n=input("Enter no. of pages : ")
nextPageSource=urllib2.urlopen("http://9gag.com/").read()
for i in range (0,n):
if(i==0):
os.system("clear")
os.system("echo Page Number : %d |cat>>log" %pageNumber)
FindLinks()
else:
os.system("clear")
os.system("echo Page Number : %d |cat>>log" %pageNumber)
nextPageSource=FindLinks3(nextLink)
FindLinks2(nextPageSource)
pageNumber=pageNumber+1
number=number-1
os.system("echo End|cat>>log")
os.system("echo Total Images downloaded : %d | cat>>log" %number)
os.system("echo ------------------------------------------|cat>>log")
os.system("echo|cat")