
Commit

initial commit
jnu committed Oct 27, 2015
0 parents commit 97ff8ba
Showing 12 changed files with 4,455 additions and 0 deletions.
5 changes: 5 additions & 0 deletions .gitignore
@@ -0,0 +1,5 @@
*.pyc
*.psd
*.bak
*.txt
.DS_Store
3 changes: 3 additions & 0 deletions README.md
@@ -0,0 +1,3 @@
Exploring the issue of capital punishment in Texas

http://sauce.joenoodles.com/deathrow/
501 changes: 501 additions & 0 deletions bios.csv

Large diffs are not rendered by default.

217 changes: 217 additions & 0 deletions bios_getter.py
@@ -0,0 +1,217 @@
# -*- coding: utf-8 -*-
'''
$ python bios_getter.py
Get and parse death penalty bio pages
from the Texas Dept. of Criminal Justice website.
Writes data to stdout as CSV, so pipe it to the desired location.
Joe Nudell, 2013
'''
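# A minimal usage sketch (raw_bios.csv and deathrow.csv are simply the
# filenames used in bios_parser.py's docstring; progress messages go to
# stderr):
#
#   $ python bios_getter.py > raw_bios.csv
#   $ python bios_parser.py raw_bios.csv > deathrow.csv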

from bs4 import BeautifulSoup as bs
from nameparser import HumanName
from sys import argv, stderr, stdout, exit
from urlparse import urljoin
import os
import re
import urllib
import urllib2
import csv


source_url = "http://www.tdcj.state.tx.us/death_row/dr_executed_offenders.html"

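# Columns of the output CSV, written once as the header row and then, in this
# order, for each executed offender scraped from the index and bio pages.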
headers = [
'Date of execution',
'Execution #',
'Age (when executed)',
'First Name',
'Middle Name',
'Last Name',
'Name Suffix',
'TDCJ Number',
'Date of Birth',
'Date Received',
'Age (when received)',
'Education Level (highest grade completed)',
'Date of Offense',
'Age (at time of offense)',
'County',
'Race',
'Gender',
'Hair Color',
'Height',
'Weight',
'Eye Color',
'Native County',
'Native State',
'Prior Occupation',
'Prior Prison Record',
'Summary of Incident',
'Co-Defendants',
'Race and Gender of Victim',
'Headshot (base-64)',
'statement',
'Link to Offender Information',
'Link to Last Statement'
]




if __name__=='__main__':
# Get the main index page
print >>stderr, "Scraping Texas Dept. of Criminal Justice - Death Row Info"
print >>stderr, "\n\n"


print >>stderr, "Downloading main index ... ",
uh_main = urllib2.urlopen(source_url)
soup_index = bs(uh_main.read())
index_table = soup_index.find('table')
print >>stderr, "Done.\n"
print >>stderr, "Beginning scraping.\n"


# Find all rows on the page, skip first because it's headers
rows = index_table.find_all('tr')[1:]
max_ = len(rows)

out_file = csv.writer(stdout)

out_file.writerow(headers)

for i, row in enumerate(rows):
# Iterate over table rows, store info, grab links, download links
print >>stderr, "Reading row", i+1, "out of", max_, "..."

entry = []

cells = row.find_all('td')

# First - date of execution. In cell #7
entry.append(cells[7].text.encode('utf8'))

# Second - Execution #. Cell #0.
entry.append(cells[0].text.encode('utf8'))

# Third - Age when executed. In Cell #6.
entry.append(cells[6].text.encode('utf8'))

# Get links
bio_link = urljoin(source_url, cells[1].find('a')['href'])
statement_link = urljoin(source_url, cells[2].find('a')['href'])

last_name = cells[3].text.encode('utf8')
first_name = cells[4].text.encode('utf8')
_name = first_name + " " + last_name

# --- Download BIO link ---
print >>stderr, " Downloading bio for", _name, " ...",

yay_have_data = False

if bio_link.endswith('.html'):
# Bio available in HTML! Parse it.

uh = urllib2.urlopen(bio_link)
soup_bio = bs(uh.read())

print >>stderr, "Done."
print >>stderr, " Parsing bio ...",

main_table = soup_bio.find('table',
attrs={'class':'tabledata_deathrow_table'})

if main_table:
# Data is available!
yay_have_data = True

line_tmp = [tr.find('td',
attrs={'class':'tabledata_align_left_deathrow'})\
.text.encode('utf8')
for tr in main_table.find_all('tr')]
name = HumanName(line_tmp[0])
line = [name.first, name.middle, name.last, name.suffix] \
+ line_tmp[1:]

try:
image_link = urljoin(bio_link,
main_table.find('img')['src'])
except Exception as e:
print >>stderr, "[Error: image not available.", str(e),"]"
image_link = None

supp_info = []
for p in soup_bio.find_all('p')[1:]:
# These supplemental paragraphs (prior occupation, prior prison record,
# incident summary, co-defendants, victim race/gender) are loosely
# structured HTML; grab the text node after the first <br> in each block.
try:
supp_info.append(p.find('span')\
.find_next('br').next_sibling.encode('utf8'))
except Exception as e:
print >>stderr, "[Error:", str(e), ", p=", str(p), "]",
supp_info.append("")

while len(supp_info)<5:
# Make sure supplemental info is five cells in length
supp_info.append("Not Available")

line += supp_info

if image_link:
# Download the headshot and base64-encode it (percent-escaped) so the
# image fits in the single 'Headshot (base-64)' CSV column.
print >>stderr, "fetching headshot ...",
uhimg = urllib2.urlopen(image_link)
img = urllib.quote(uhimg.read().encode('base64'))
else:
img = "Not Available"

line.append(img)

entry += line
print >>stderr, "Done."

if not yay_have_data:
# Frown, no data
# Get what's known from index page
print >>stderr, "Can't get bio. Falling back to index info ...",

entry += [first_name,"",last_name,"",cells[5].text.encode('utf8')]
entry += ["Not Available"]*6
entry += [c.text.encode('utf8') for c in [cells[9], cells[8]]]
entry += ["Not Available"]*13

print >>stderr, "Done."


# -- Download STATEMENT link ---
print >>stderr, " Downloading statement ...",
uh = urllib2.urlopen(statement_link)
soup_statement = bs(uh.read())
print >>stderr, "Done."

print >>stderr, " Parsing statement ...",

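# The last statement is assumed to be the first <p> following the heading
# paragraph whose text contains "statement" (case-insensitive); otherwise it
# is recorded as "Unknown".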
_title = soup_statement.find('p',text=re.compile(".*statement.*",re.I))
statement = "Unknown"
if _title:
s = _title.find_next('p')
if s:
statement = s.text.encode('utf8')

entry.append(statement)

print >>stderr, "Done."

print >>stderr, " Printing info ...",
entry = [c.strip() for c in entry]

entry += [bio_link, statement_link]

out_file.writerow(entry)
print >>stderr, "All done!"



105 changes: 105 additions & 0 deletions bios_parser.py
@@ -0,0 +1,105 @@
'''
$ python bios_parser.py raw_bios.csv > deathrow.csv
Condense raw bios into a more web-friendly CSV file.
In particular, make it smaller and exclude any fields that are not
actually used in the visualization.
JN 2013
'''

from bios_getter import headers
from collections import defaultdict
from sys import argv, stderr, stdout, exit
import csv
import sentiment


clf = sentiment.train()

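# Score a last statement with the classifier trained above; returns the 'pos'
# value from sentiment.classify (the positive-sentiment score), or "N/A" when
# the statement is empty or the offender declined to make one.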
def classify_positive(n):
if 'declined to make' in n or len(n.strip())<1:
return "N/A"
return sentiment.classify(n, clf)['pos']


def normalize_name(n):
return n.title().strip()


headers_map = {
'Execution #' : ['Execution #'],
'Date of execution' : ['Execution Date'],
'Age (when executed)' : ['Age when Executed'],
'First Name' : ['First Name'],
'Last Name' : ['Last Name'],
'TDCJ Number' : ['Prisoner'],
'Age (when received)' : ['Age of Incarceration'],
'Education Level (highest grade completed)' : ['Education'],
'Race' : ['Race', normalize_name],
'Gender' : ['Gender', normalize_name],
'Prior Occupation' : ['Occupation'],
'statement' : [['Statement'], ['Sentiment', classify_positive]],
'Link to Offender Information' : ['Source of Bio'],
'Link to Last Statement' : ['Source of Statement']
}
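# Each raw column name above maps to one or more [new column name, optional
# normalizer] pairs; a raw column can fan out into several output columns.
# For example, 'statement' is written verbatim as 'Statement' and again, run
# through classify_positive, as 'Sentiment'.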


if __name__=='__main__':
# Main loop

print >>stderr, "Starting ..."

if len(argv)!=2:
# Check arguments, make sure raw file is given
print >>stderr, "Wrong number of arguments."
print >>stderr, __doc__
exit(1)

with open(argv[1], 'r') as fh:
print >>stderr, "Opening files ..."
reader = csv.reader(fh)
writer = csv.writer(stdout)

good_cells = defaultdict(int)
norm = defaultdict(list)

print >>stderr, "Filtering raw data ...",
for entry in reader:
new_entry = []
if not good_cells:
# Read header
for i, cell in enumerate(entry):
if cell in headers_map:
if type(headers_map[cell][0]) is not list:
headers_map[cell] = [headers_map[cell]]

for newcol in headers_map[cell]:
good_cells[i]+=1
new_entry.append(newcol[0])

if len(newcol)==2:
# Remember the normalization function to apply to this column
norm[i].append(newcol[1])
else:
norm[i].append(None)
else:
# Read entry, whitelisting cells in good_cells
for i, cell in enumerate(entry):
#print >>stderr, good_cells[i]
#print >>stderr, norm[i]
for x in range(good_cells[i]):
nc = cell

if norm[i][x] is not None:
# Normalize value
nc = norm[i][x](nc)

new_entry.append(nc)

# Write CSV line to stdout
writer.writerow(new_entry)
print >>stderr, "Done."

print >>stderr, "Success!"
