-
Notifications
You must be signed in to change notification settings - Fork 1
/
parse-mcc.py
executable file
·112 lines (93 loc) · 3.84 KB
/
parse-mcc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import argcomplete
import argparse
import csv
import urllib2
from bs4 import BeautifulSoup
# Wikipedia article that the script downloads and scrapes.
wiki_url = "https://en.wikipedia.org/wiki/Mobile_country_code"

# Column titles written as the first CSV row when --csv-header is given.
# The order matches the rows assembled by parse_table().
csv_header_data = [
    "MCC",
    "MNC",
    "Country",
    "Country Code",
    "Brand",
    "Operator",
    "Status",
    "Bands",
    "Note",
]
def handle_h2(mode):
    """Advance the section state each time an ``<h2>`` heading is encountered.

    Args:
        mode: The current section state; the script starts with ``""``.

    Returns:
        A ``(mode, country, country_code)`` tuple holding the next state and
        the country name / country code to use for rows of that section.
        Any state other than ``""`` or ``"testnetworks"`` leads to
        ``"internationaloperators"``.
    """
    transitions = {
        "": ("testnetworks", "Test Networks", ""),
        "testnetworks": ("nationaloperators", "", ""),
    }
    fallback = ("internationaloperators", "International Operators", "")
    return transitions.get(mode, fallback)
def parse_h3_tag(country_tag):
    """Split a country headline's ``id`` into a country name and a country code.

    The id is expected to look like ``Country_name_-_CODE``. Underscores are
    turned into spaces and the escape sequences ``.27 .28 .29 .2C .2F`` are
    replaced by ``' ( ) , /`` respectively.

    Args:
        country_tag: Object supporting ``country_tag["id"]`` (the headline
            element of an ``<h3>`` section).

    Returns:
        A ``(country, country_code)`` tuple. Both parts are stripped and
        UTF-8 encoded; ``country_code`` is ``""`` when the id contains no
        `` - `` separator.
    """
    heading = country_tag["id"].replace("_", " ")
    escapes = (
        (".27", "'"),
        (".28", "("),
        (".29", ")"),
        (".2C", ","),
        (".2F", "/"),
    )
    for escaped, plain in escapes:
        heading = heading.replace(escaped, plain)

    # Split on the *spaced* separator rather than on the first bare hyphen:
    # hyphenated names ("Guinea-Bissau - GW") must not be cut in the middle
    # of the name, and hyphenated codes ("Abkhazia - GE-AB") must stay whole.
    name, separator, code = heading.partition(" - ")

    country = name.strip().encode("UTF-8")
    if separator:
        country_code = code.strip().encode("UTF-8")
    else:
        country_code = ""
    return country, country_code
def parse_table(element):
    """Collect the CSV rows contained in one operator table.

    Rows without ``<td>`` cells are skipped, as are rows whose MCC or MNC
    cell is empty. The current ``country`` and ``country_code`` are not
    parameters: they are read from the module-level variables of the same
    names at the moment a row is appended.

    Args:
        element: Table element providing ``find_all("tr")``.

    Returns:
        A list of rows, each
        ``[mcc, mnc, country, country_code, brand, operator, status, bands, note]``
        with the cell texts UTF-8 encoded.

    Note:
        A row with at least one cell is indexed at position 1, and a row with
        MCC and MNC present is indexed up to position 6; shorter rows raise
        ``IndexError``.
    """
    def cell_text(cell):
        # Encoded text of a single cell.
        return parse_td(cell).encode("UTF-8")

    collected = []
    for table_row in element.find_all("tr"):
        tds = table_row.find_all("td")
        if not tds:
            continue

        mcc, mnc = cell_text(tds[0]), cell_text(tds[1])
        if not (mcc and mnc):
            continue

        # The remaining columns are only read once MCC and MNC are known.
        brand, operator, status, bands, note = [cell_text(tds[index]) for index in range(2, 7)]
        collected.append([mcc, mnc, country, country_code, brand, operator, status, bands, note])
    return collected
def parse_td(td):
    """Return the text of a table cell, never ``None``.

    All ``<sup>`` elements are removed from the cell first (presumably
    citation markers), which modifies ``td`` in place. If the cell contains a
    link, the text of the first link is returned; otherwise the cell's own
    string is returned.

    Args:
        td: Cell element providing ``find_all``, ``find`` and ``string``.

    Returns:
        The text of the first ``<a>`` in the cell, else ``td.string``, else
        ``""`` when the cell has no single string.
    """
    # Plain loop: extract() is called purely for its side effect.
    for sup in td.find_all("sup"):
        sup.extract()

    anchor = td.find("a")
    if anchor is not None:
        if anchor.string is not None:
            return anchor.string
        # ``.string`` can be None (e.g. a link wrapping several child nodes).
        # Returning None here would make the caller's ``.encode`` call fail,
        # so fall back to the link's concatenated text.
        return anchor.get_text()

    if td.string is not None:
        return td.string
    return ""
if __name__ == "__main__":
    # Script entry point: download the Wikipedia article, walk its content
    # elements in order and write one CSV row (";"-delimited) per table entry.
    parser = argparse.ArgumentParser(description="Fetch MCC and MNC from wikipedia to a csv file.")
    parser.add_argument('-o', action="store", dest="csv_file", help="Output file in csv format.",
                        type=argparse.FileType("w"), default="/tmp/mcc-mnc.csv")
    parser.add_argument('--csv-header', action="store_true", dest="write_header", help="Write header to csv file.",
                        default=False)
    argcomplete.autocomplete(parser)
    args = parser.parse_args()
    # argparse has already opened the output file at this point, so everything
    # below runs inside try/finally to make sure it is closed again.
    csv_file = args.csv_file
    write_header = args.write_header

    # NOTE: country and country_code must remain module-level names because
    # parse_table() reads them as globals when it builds a row.
    mode = ""
    country = ""
    country_code = ""
    counter = 0

    try:
        print("Fetching wikipedia page...")
        response = urllib2.urlopen(wiki_url)
        try:
            html = response.read()
        finally:
            # Release the HTTP connection as soon as the body has been read.
            response.close()

        print("Processing...")
        html_soup = BeautifulSoup(html, "lxml")
        content_container = html_soup.find(id="mw-content-text")
        content = None
        if content_container is not None:
            content = content_container.find("div", {"class": "mw-parser-output"})
        if content is None:
            # Fail with a clear message instead of an error on None further down.
            raise RuntimeError("Could not locate the article content in the fetched page.")

        try:
            writer = csv.writer(csv_file, delimiter=";")
            if write_header:
                writer.writerow(csv_header_data)

            for element in content:
                if element.name == "h2":
                    # Every <h2> switches to the next section of the article.
                    mode, country, country_code = handle_h2(mode)
                elif element.name == "h3" and mode == "nationaloperators":
                    # Within the national operators section, <h3> headlines
                    # supply the country for the tables that follow.
                    country_tag = element.find("span", class_="mw-headline")
                    if country_tag is not None:
                        country, country_code = parse_h3_tag(country_tag)
                elif element.name == "table":
                    csv_data = parse_table(element)
                    counter += len(csv_data)
                    for csv_data_row in csv_data:
                        writer.writerow(csv_data_row)
        finally:
            # Printed even if writing aborts, reporting how many rows were collected.
            print("Finished. %s entries written." % counter)
    finally:
        csv_file.close()