-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathcrawl_cnnvd.py
156 lines (134 loc) · 6.38 KB
/
crawl_cnnvd.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Project: crawl_cnnvd
import re
import json
from pyspider.libs.base_handler import *
from pyquery.pyquery import PyQuery
from config import Config
import cve_detail
from normalized import fuzzyfinder
class Handler(BaseHandler):
    """Crawl the CNNVD vulnerability database and parse the required fields.

    The crawl starts from the per-vendor listing pages in ``url_list``,
    follows the "next page" link of every listing, and parses each
    vulnerability detail page into a flat dict (see ``detail_page``).
    """

    vendor_dict = Config.vendor_dict
    crawl_config = {
        "headers": {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36"
        }
    }
    # One listing URL per vendor keyword.
    url_list = [
        'http://www.cnnvd.org.cn/vulnerability/index/vulcode2/Siemens/vulcode/Siemens/cnnvdid//fbsjs//fbsje/',
        'http://www.cnnvd.org.cn/vulnerability/index/vulcode2/Schneider/vulcode/Schneider/cnnvdid//fbsjs//fbsje/',
        'http://www.cnnvd.org.cn/vulnerability/index/vulcode2/Rockwell/vulcode/Rockwell/cnnvdid//fbsjs//fbsje/',
        'http://www.cnnvd.org.cn/vulnerability/index/vulcode2/Yokogawa/vulcode/Yokogawa/cnnvdid//fbsjs//fbsje/',
        'http://www.cnnvd.org.cn/vulnerability/index/vulcode2/Mitsubishi/vulcode/Mitsubishi/cnnvdid//fbsjs//fbsje/',
        'http://www.cnnvd.org.cn/vulnerability/index/vulcode2/Omron/vulcode/Omron/cnnvdid//fbsjs//fbsje/',
        'http://www.cnnvd.org.cn/vulnerability/index/vulcode2/Advantech/vulcode/Advantech/cnnvdid//fbsjs//fbsje/',
        'http://www.cnnvd.org.cn/vulnerability/index/vulcode2/Emerson/vulcode/Emerson/cnnvdid//fbsjs//fbsje/',
        'http://www.cnnvd.org.cn/vulnerability/index/vulcode2/General%20Electric/vulcode/General%20Electric/cnnvdid//fbsjs//fbsje/'
    ]

    # Detail pages end in CNNVD-<digits>-<digits>. Compiled once, as a raw
    # string with the dots escaped so they only match literal dots.
    _detail_url_re = re.compile(
        r"http://www\.cnnvd\.org\.cn/vulnerability/show/cv_cnnvdid/CNNVD-\d+-\d+"
    )

    @every(minutes=24 * 60)
    def on_start(self):
        """Start the crawl from ``url_list``; results go to ``index_page``."""
        self.crawl(self.url_list, retries=10, callback=self.index_page)

    @config(age=24 * 60 * 60)
    def index_page(self, response):
        """Queue every vulnerability detail URL and the next listing page.

        Detail links are handed to ``detail_page``; the link whose text is
        the "next page" label is handed back to ``index_page``.
        """
        for each in response.doc('a[href^="http"]').items():
            href = each.attr.href
            if self._detail_url_re.match(href):
                # Parenthesised single-argument form prints the same text on
                # Python 2 and is also valid on Python 3.
                print(href)
                self.crawl(href, priority=9, retries=10, callback=self.detail_page)

        # Follow pagination: use the first pager link labelled "next page".
        for link in response.doc(".dispage >a").items():
            if link.text() == u"下一页":
                next_url = link.attr.href
                # The last page has no usable next link; do not crawl None.
                if next_url:
                    self.crawl(next_url, retries=10, callback=self.index_page)
                break

    @staticmethod
    def _row_text(rows, index):
        """Return the text of the second child of row ``index``, or ''."""
        return rows.eq(index).children().eq(1).text() or u''

    def detail_page(self, response):
        """Parse one vulnerability detail page.

        Rows 0-8 of ``.details tr`` are read, in order, as: name, CNNVD id,
        release time, update time, danger level, vulnerability type, attack
        path, CVE id and finder.

        Returns:
            dict: the parsed record. ``vul_enname``, ``cvss_score``,
            ``vul_exploit`` and ``vul_status`` are always empty strings and
            ``affect_vendor`` is always ``'other'``.
        """
        leak_info = response.doc(".details tr")
        vul_name = self._row_text(leak_info, 0)
        cnnvd_id = self._row_text(leak_info, 1)
        release_time = self._row_text(leak_info, 2)
        update_time = self._row_text(leak_info, 3)
        danger_level = self._row_text(leak_info, 4)
        # Fall back to the "other type" label when the page gives no type.
        vul_type = self._row_text(leak_info, 5) or u'其他类型'
        attack_path = self._row_text(leak_info, 6)
        cve_id = self._row_text(leak_info, 7)
        finder = self._row_text(leak_info, 8) or 'unknown'

        # Vulnerability description: every text node of the second
        # ".cont_details" section, stripped and concatenated.
        sections = response.doc(".cont_details")
        vul_des = ''
        if len(sections) > 1:
            vul_des = ''.join(
                text.strip()
                for text in sections[1].xpath(".//text()")
                if text
            )
        # Solution text comes from the third ".cont_details" section.
        vul_solution = (sections.eq(2).text() or u'').strip()

        # Reference links: keep only anchors whose visible text is a URL.
        ref_link = ';'.join(
            anchor.outerHtml()
            for anchor in response.doc("#top3 p a").items()
            if (anchor.text() or u'').startswith(u'http')
        )

        # Affected products: the title attribute of each span. Spans without
        # a title are skipped so that join() never sees None.
        product_titles = [
            span.attr.title
            for span in response.doc(".rht_cont span").items()
        ]
        affect_product = ';'.join(title for title in product_titles if title)
        # The site uses this placeholder text for "no data yet".
        if affect_product == u'暂无数据':
            affect_product = ''

        # TODO: vendor detection (cve_detail.crawl_cve_detail on the CVE id,
        # falling back to fuzzyfinder over vendor_dict) is disabled, so every
        # record is currently tagged 'other'.
        affect_vendor = 'other'

        return {
            "vul_enname": '',
            "vul_chname": vul_name,
            "cnnvd_id": cnnvd_id,
            "cve_id": cve_id,
            "vul_type": vul_type,
            "danger_level": danger_level,
            "cvss_score": '',
            "attack_path": attack_path,
            "vul_des": vul_des,
            "affect_vendor": affect_vendor,
            "affect_product": affect_product,
            "vul_exploit": '',
            "vul_solution": vul_solution,
            "ref_link": ref_link,
            "vul_status": '',
            "finder": finder,
            "release_time": release_time,
            "update_time": update_time
        }