-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmetaext.py
82 lines (70 loc) · 2.65 KB
/
metaext.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
## Meta Extraction metaext.py ##
import urllib2
from lxml import etree
import re
from xml.etree.ElementTree import tostring
# Stage 1 (metaext.py): fetch the home page of every domain listed in
# domains.csv and dump its <meta> tags to out.txt.  For each domain the output
# holds one "http://<domain>/" line followed by one serialized tag per write.

# Read the whole domain list first so the input file is closed before any
# network work starts.
with open('domains.csv') as url_f:
    urls = url_f.readlines()
print(urls)
# Remove line endings (including a stray "\r" from CRLF files) and skip blank
# rows, which would otherwise become a request for "http:///".
urls = [entry.strip() for entry in urls if entry.strip()]
print(urls)

# The with-block guarantees out.txt is flushed and closed when the crawl ends,
# so a later stage that reopens it sees the complete output.
with open('out.txt', 'w') as out_f:
    for domain in urls:
        page_url = "http://" + domain + "/"
        out_f.write(page_url + '\n')
        print(page_url)
        req = urllib2.Request(page_url)
        # Send a browser-like User-Agent instead of the library default.
        req.add_header('User-Agent', 'Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11')
        try:
            site = urllib2.urlopen(req).read()
            tree = etree.HTML(site)
            meta = tree.xpath("//meta")
        except Exception as err:
            # Best effort: one unreachable or unparsable site must not stop
            # the crawl, but report it instead of hiding the failure.
            print("skipped %s: %s" % (page_url, err))
            continue
        for tag in meta:
            w = etree.tostring(tag).decode("utf-8")
            # str.strip returns a new string; the result has to be kept for
            # the surrounding newlines to actually be removed.
            w = w.strip('\n')
            out_f.write(w)
            out_f.write("\n")
## Organize data rmnl.py ##
# Stage 2: copy out.txt to outf.txt, keeping a newline only when the character
# right after it is 'h' (presumably the start of the next "http://" line --
# confirm).  Every other newline is dropped TOGETHER WITH the character that
# follows it, because that character is consumed while peeking.
# NOTE(review): the swallowed character looks accidental, but the output
# format is kept byte-for-byte since the later URL-extraction stage reads it.
with open("out.txt") as f, open("outf.txt", 'w') as out_f:
    # Read once and walk the text in memory instead of issuing one read(1)
    # call per character.
    chars = iter(f.read())
    for c in chars:
        if c != '\n':
            print(c)
            out_f.write(c)
        elif next(chars, '') == 'h':
            # Newline followed by 'h': keep both characters.
            print(c + 'h')
            out_f.write(c + 'h')
        # Otherwise the newline and the peeked character are both discarded.
## Regular Expression Search regsearch.py ##
# Stage 3: keep only the lines that mention at least one marketing-related
# keyword (case-insensitive).  Matching lines go to names.txt; rejected lines
# are printed for inspection.

# Compiled once, outside the loop; the pattern text is unchanged.
_SEARCH_RE = re.compile(
    r"Website development|Inbound marketing|inbound marketting|Social Media|Marketing company|SEO|Search Engine Optimisation|Google adwords|online advertising services|Facebook|marketing|Social media|marketing",
    re.IGNORECASE,
)

# NOTE(review): the input is "outfbkup.txt", not the "outf.txt" written by the
# newline-collapsing stage -- presumably a manually made backup copy; confirm.
with open("outfbkup.txt") as open_f, open("names.txt", 'w') as out_f:
    for line in open_f:
        line = line.strip('\n')
        if _SEARCH_RE.search(line):
            out_f.write(line + '\n')
        else:
            print(line)
## Extract Keywords and URL final.py ##
# Stage 4: for every line of names.txt write "<url> : <kw1> <kw2> ... " to
# finalop.txt, where <url> is the URL at the start of the line and the
# keywords are all case-insensitive keyword hits found on that line.

# Raw string so that "\(" reaches the regex engine as written instead of
# being an invalid string escape.
_URL_RE = re.compile(
    r'^http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)
_KEYWORD_RE = re.compile(
    r"Website development|Inbound marketing|inbound marketting|Social Media|Marketing company|SEO|Search Engine Optimisation|Google adwords|online advertising services|Facebook|marketing|Social media|marketing",
    re.IGNORECASE,
)

with open("names.txt") as open_f, open("finalop.txt", 'w') as out_f:
    for line in open_f:
        line = line.strip('\n')
        found = _URL_RE.match(line)
        if found is None:
            # Without this guard a line that does not begin with a URL would
            # raise IndexError and abort the run with a partial output file.
            print("no URL at start of line, skipped: " + line)
            continue
        url = found.group(0)
        # The match can end in a literal "meta" glued onto the URL (presumably
        # the name of the first tag on the line -- confirm).  Remove exactly
        # that suffix; rstrip("meta") would strip any trailing run of the
        # characters m/e/t/a instead.
        if url.endswith("meta"):
            url = url[:-len("meta")]
        keys = _KEYWORD_RE.findall(line)
        out_f.write(url + " : ")
        for key in keys:
            out_f.write(key + " ")
        out_f.write("\n")