-
Notifications
You must be signed in to change notification settings - Fork 0
/
YRBSS_Rip.py
56 lines (44 loc) · 1.81 KB
/
YRBSS_Rip.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
__author__ = 'arosado'
import io
import re
import lxml
import pycurl
class YRBSSRipper:
    """Download pages through a pycurl handle into an in-memory buffer.

    Typical use is ``selectBRFSSUrl(url)`` followed by
    ``returnCurrentBrfssUrlHTML()``.

    Attributes:
        currentBuffer: ``io.BytesIO`` that receives the response body.
        curlObject: ``pycurl.Curl`` handle reused for every transfer.
        currentCookies: Cookie list read from the handle by
            ``selectBRFSSUrl``.
        pastCookies: Empty list; nothing in this class fills it yet.
        currentHTML: Reference to the ``lxml`` package.
        currentRE: Reference to the ``re`` module.
        YRBSSUrl: URL placeholder, currently an empty string.
    """

    YRBSSUrl = ''

    # Commented-out BRFSS endpoints, kept for reference:
    # displayUrl = 'http://apps.nccd.cdc.gov/brfss/display.asp?'
    # selQuesUrl = 'http://apps.nccd.cdc.gov/BRFSS-SMART/SelQuestion.asp?'
    # quesPageUrl = 'http://apps.nccd.cdc.gov/brfss/page.asp?'
    # listMMSAQuestUrl = 'http://apps.nccd.cdc.gov/BRFSS-SMART/ListMMSAQuest.asp?'
    # yearsUrl = 'http://apps.nccd.cdc.gov/brfss/years.asp?'
    # SelMMSAPrevDataUrl = 'http://apps.nccd.cdc.gov/BRFSS-SMART/SelMMSAPrevData.asp?'
    # #qkeyDetermineUrl = 'http://apps.nccd.cdc.gov/brfss/display.asp?cat=AC&yr=2012&state=US&qkey='

    # Commented-out draft that walks qkey values 0..9999:
    # def iterqkey(self):
    #     i = 0
    #     while(i < 10000):
    #         iterUrl = self.qkeyDetermineUrl + str(i)
    #         self.selectBRFSSUrl(iterUrl)
    #         self.returnCurrentBrfssUrlHTML()

    def __init__(self):
        """Initialize buffers and parsing objects for this instance.

        These used to be class attributes, which meant one buffer, one curl
        handle and one pair of cookie lists were created at import time and
        shared by every instance. Creating them here gives each ripper its
        own state.
        """
        self.currentBuffer = io.BytesIO()
        self.curlObject = pycurl.Curl()
        self.currentCookies = []
        self.pastCookies = []
        self.currentHTML = lxml
        self.currentRE = re

    def returnCurrentBrfssUrlHTML(self) -> memoryview:
        """Run the configured transfer and return the buffered response.

        Returns:
            A read-only ``memoryview`` over a snapshot of everything written
            to ``currentBuffer`` so far.

        Raises:
            pycurl.error: Propagated from ``perform()`` if the transfer fails.
        """
        self.curlObject.perform()
        # getbuffer() hands out a live export of the BytesIO; while a caller
        # keeps that view alive the buffer cannot be resized, so the next
        # transfer's write would raise BufferError. A view over a getvalue()
        # copy keeps the return type without locking the buffer.
        return memoryview(self.currentBuffer.getvalue())

    def grabCurrentBRFSSInformation(self):
        """Placeholder; not implemented yet."""
        pass

    def selectBRFSSUrl(self, brfssUrl: str) -> None:
        """Configure the curl handle to fetch ``brfssUrl`` into a fresh buffer.

        Args:
            brfssUrl: URL to request on the next ``perform()``.
        """
        # Start from an empty buffer so the response for this URL is not
        # appended to the body of a previously fetched page.
        self.currentBuffer = io.BytesIO()

        curl = self.curlObject
        curl.setopt(pycurl.URL, brfssUrl)
        # A header name with nothing after the colon tells libcurl to drop
        # its default Accept header.
        curl.setopt(pycurl.HTTPHEADER, ["Accept:"])
        curl.setopt(pycurl.WRITEFUNCTION, self.currentBuffer.write)
        curl.setopt(pycurl.FOLLOWLOCATION, 1)
        curl.setopt(pycurl.MAXREDIRS, 10)
        # Setting COOKIEFILE switches on libcurl's cookie engine and names
        # the file cookies are loaded from.
        curl.setopt(pycurl.COOKIEFILE, 'cookie.txt')
        # Read here, before the transfer for this URL has been performed, so
        # this reflects whatever cookies the handle already knows about.
        self.currentCookies = curl.getinfo(pycurl.INFO_COOKIELIST)