# hindunames.py
import json
from os import name
from urllib.request import urlopen
import threading
import xmltodict
# Scrape the paginated name listing and dump every entry to ./nameData.json.

# Listing URL; the 1-based page number is appended to it.
baseUrl = "https://hindunames.net/hindu-baby-names?page="
# Despite its name this is a list: one {"name", "sex", "meaning"} dict per entry.
name_dict = []
# Pages 1..pages (inclusive) are fetched.
pages = 115

# Markers used to cut the block of name cards out of each page's HTML.
_LISTING_START = '<div class="bg-white overflow-hidden shadow-xl sm:rounded-lg'
_LISTING_END = '<div class="p-5 content-center">'
_HEADER_CLOSE = "</header>"


def _extract_listing_xml(html, page_no):
    """Return the name-listing fragment of *html*, cleaned up for XML parsing.

    The fragment runs from the first occurrence of ``_LISTING_START`` up to
    (not including) the last occurrence of ``_LISTING_END``.  The embedded
    ``<header>`` element is removed, the gender labels are shortened to
    ``F``/``M`` and every ``&`` is replaced by ``and``.

    Raises:
        ValueError: if the markers are missing or out of order, so a layout
            change is reported explicitly instead of as an XML parse error
            on a garbage slice.
    """
    start_i = html.find(_LISTING_START)
    end_i = html.rfind(_LISTING_END)
    if start_i == -1 or end_i == -1 or end_i <= start_i:
        raise ValueError(
            "name listing markers not found on page " + str(page_no))
    listing = html[start_i:end_i]

    # Drop the <header>...</header> element, but only when both tags are
    # actually present and in order.
    header_start = listing.find("<header")
    header_close = listing.find(_HEADER_CLOSE)
    if header_start != -1 and header_close > header_start:
        header = listing[header_start:header_close + len(_HEADER_CLOSE)]
        listing = listing.replace(header, '')

    listing = listing.replace('Girl name', 'F')
    listing = listing.replace('Boy name', 'M')
    # A bare '&' (first seen on page 7) is not well-formed XML.
    # NOTE(review): this also turns entities such as '&amp;' into 'andamp;'.
    listing = listing.replace('&', 'and')
    return listing


def _parse_entries(listing_xml):
    """Parse a cleaned listing fragment into a list of name records.

    Each record is ``{"name": ..., "sex": ..., "meaning": ...}``.  Children
    of the root element that carry a ``class`` attribute are skipped; they
    are the ad elements, not name cards.
    """
    data_dict = xmltodict.parse(listing_xml)
    entries = []
    for card in data_dict['div']['div']:
        if '@class' in card:
            continue  # ad element, not one of the name cards
        # NOTE(review): the nesting below mirrors the site's card markup at
        # the time of writing; confirm against a live page if parsing fails.
        cells = card['div']['div']['div'][0]['div']
        title = cells[0]['h2']
        entries.append({
            'name': title['a']['#text'],
            'sex': title['span']['#text'],
            'meaning': cells[1]['h2'],
        })
    return entries


for x in range(1, pages + 1):
    print("page fetching started : " + str(x))
    # The context manager closes the HTTP response even if decoding fails.
    with urlopen(baseUrl + str(x)) as page:
        html = page.read().decode("utf-8")
    name_dict.extend(_parse_entries(_extract_listing_xml(html, x)))
    print("page completed : " + str(x))

print("total names : " + str(len(name_dict)))
# json.dump emits the same text as json.dumps; the with-block closes the file.
with open("./nameData.json", "w", encoding="utf-8") as json_file:
    json.dump(name_dict, json_file)
# x = 1
# # while x < 115:
# while x < pages:
# with open("./"+str(x)+".xml") as xml_file:
# data_dict = xmltodict.parse(xml_file.read())
# xml_file.close()
# # generate the object using json.dumps()
# # corresponding to json data
# json_data = json.dumps(data_dict)
# # Write the json data to output
# # json file
# with open("./"+str(x)+".json") as json_file:
# json_file.write(json_data)
# json_file.close()