commit 97ff8ba (0 parents)
Showing 12 changed files with 4,455 additions and 0 deletions.
@@ -0,0 +1,5 @@
*.pyc
*.psd
*.bak
*.txt
.DS_Store
@@ -0,0 +1,3 @@
Exploring the issue of capital punishment in Texas

http://sauce.joenoodles.com/deathrow/
Large diffs are not rendered by default.
bios_getter.py
@@ -0,0 +1,217 @@
# -*- coding: utf-8 -*-
'''
$ python bios_getter.py
Get and parse death penalty bio pages
from Texas Dept of Criminal Justice website.
Writes data to stdout as CSV, so pipe it into desired location.
Joe Nudell, 2013
'''

from bs4 import BeautifulSoup as bs
from nameparser import HumanName
from sys import argv, stderr, stdout, exit
from urlparse import urljoin
import os
import re
import urllib
import urllib2
import csv


source_url = "http://www.tdcj.state.tx.us/death_row/dr_executed_offenders.html"

headers = [
    'Date of execution',
    'Execution #',
    'Age (when executed)',
    'First Name',
    'Middle Name',
    'Last Name',
    'Name Suffix',
    'TDCJ Number',
    'Date of Birth',
    'Date Received',
    'Age (when received)',
    'Education Level (highest grade completed)',
    'Date of Offense',
    'Age (at time of offense)',
    'County',
    'Race',
    'Gender',
    'Hair Color',
    'Height',
    'Weight',
    'Eye Color',
    'Native County',
    'Native State',
    'Prior Occupation',
    'Prior Prison Record',
    'Summary of Incident',
    'Co-Defendants',
    'Race and Gender of Victim',
    'Headshot (base-64)',
    'statement',
    'Link to Offender Information',
    'Link to Last Statement'
]


if __name__ == '__main__':
    # Get the main index page
    print >>stderr, "Scraping Texas Dept. of Criminal Justice - Death Row Info"
    print >>stderr, "\n\n"

    print >>stderr, "Downloading main index ... ",
    uh_main = urllib2.urlopen(source_url)
    soup_index = bs(uh_main.read())
    index_table = soup_index.find('table')
    print >>stderr, "Done.\n"
    print >>stderr, "Beginning scraping.\n"

    # Find all rows on the page, skip first because it's headers
    rows = index_table.find_all('tr')[1:]
    max_ = len(rows)

    out_file = csv.writer(stdout)
    out_file.writerow(headers)

    for i, row in enumerate(rows):
        # Iterate over table rows, store info, grab links, download links
        print >>stderr, "Reading row", i+1, "out of", max_, "..."

        entry = []
        cells = row.find_all('td')

        # First - date of execution. In cell #7.
        entry.append(cells[7].text.encode('utf8'))

        # Second - Execution #. Cell #0.
        entry.append(cells[0].text.encode('utf8'))

        # Third - Age when executed. In cell #6.
        entry.append(cells[6].text.encode('utf8'))

        # Get links
        bio_link = urljoin(source_url, cells[1].find('a')['href'])
        statement_link = urljoin(source_url, cells[2].find('a')['href'])

        last_name = cells[3].text.encode('utf8')
        first_name = cells[4].text.encode('utf8')
        _name = first_name + " " + last_name

        # --- Download BIO link ---
        print >>stderr, "  Downloading bio for", _name, " ...",

        yay_have_data = False

        if bio_link.endswith('.html'):
            # Bio available in HTML! Parse it.
            uh = urllib2.urlopen(bio_link)
            soup_bio = bs(uh.read())

            print >>stderr, "Done."
            print >>stderr, "  Parsing bio ...",

            main_table = soup_bio.find('table',
                                        attrs={'class': 'tabledata_deathrow_table'})

            if main_table:
                # Data is available!
                yay_have_data = True

                line_tmp = [tr.find('td',
                                    attrs={'class': 'tabledata_align_left_deathrow'})\
                              .text.encode('utf8')
                            for tr in main_table.find_all('tr')]
                name = HumanName(line_tmp[0])
                line = [name.first, name.middle, name.last, name.suffix] \
                       + line_tmp[1:]

                try:
                    image_link = urljoin(bio_link,
                                         main_table.find('img')['src'])
                except Exception as e:
                    print >>stderr, "[Error: image not available.", str(e), "]"
                    image_link = None

                supp_info = []
                for p in soup_bio.find_all('p')[1:]:
                    # This one is a problematic / ill-formed field.
                    try:
                        supp_info.append(p.find('span')\
                                          .find_next('br').next_sibling.encode('utf8'))
                    except Exception as e:
                        print >>stderr, "[Error:", str(e), ", p=", str(p), "]",
                        supp_info.append("")

                while len(supp_info) < 5:
                    # Make sure supplemental info is five cells in length
                    supp_info.append("Not Available")

                line += supp_info

                if image_link:
                    # download image
                    print >>stderr, "fetching headshot ...",
                    uhimg = urllib2.urlopen(image_link)
                    img = urllib.quote(uhimg.read().encode('base64'))
                else:
                    img = "Not Available"

                line.append(img)

                entry += line
                print >>stderr, "Done."

        if not yay_have_data:
            # Frown, no data
            # Get what's known from index page
            print >>stderr, "Can't get bio. Falling back to index info ...",

            entry += [first_name, "", last_name, "", cells[5].text.encode('utf8')]
            entry += ["Not Available"]*6
            entry += [c.text.encode('utf8') for c in [cells[9], cells[8]]]
            entry += ["Not Available"]*13

            print >>stderr, "Done."

        # --- Download STATEMENT link ---
        print >>stderr, "  Downloading statement ...",
        uh = urllib2.urlopen(statement_link)
        soup_statement = bs(uh.read())
        print >>stderr, "Done."

        print >>stderr, "  Parsing statement ...",

        _title = soup_statement.find('p', text=re.compile(".*statement.*", re.I))
        statement = "Unknown"
        if _title:
            s = _title.find_next('p')
            if s:
                statement = s.text.encode('utf8')

        entry.append(statement)

        print >>stderr, "Done."

        print >>stderr, "  Printing info ...",
        entry = [c.strip() for c in entry]

        entry += [bio_link, statement_link]

        out_file.writerow(entry)

    print >>stderr, "All done!"
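The docstring at the top of bios_getter.py says the scraper writes its CSV to stdout so it can be piped wherever it is needed, and the parser below expects that output as raw_bios.csv. A minimal sketch of reading that intermediate file back in; the file name comes from the parser's docstring, and the DictReader usage is illustrative rather than part of this commit:

# Sketch: consume the scraper's output, e.g. after
#   python bios_getter.py > raw_bios.csv
import csv
from bios_getter import headers  # column names in the order the scraper writes them

with open('raw_bios.csv', 'rb') as fh:      # 'rb': binary mode for the Python 2 csv module
    reader = csv.DictReader(fh)             # first row of the file is the header row
    assert reader.fieldnames == headers     # sanity check against the list defined above
    for row in reader:
        # Each row is keyed by the strings in `headers`
        print row['Execution #'], row['Last Name'], row['Date of execution']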
bios_parser.py
@@ -0,0 +1,105 @@
'''
$ python bios_parser.py raw_bios.csv > deathrow.csv
Condense raw bios into a more web-friendly CSV file.
In particular make it smaller and exclude any fields that are not
actually used in the visualization.
JN 2013
'''

from bios_getter import headers
from collections import defaultdict
from sys import argv, stderr, stdout, exit
import csv
import sentiment


clf = sentiment.train()

def classify_positive(n):
    if 'declined to make' in n or len(n.strip()) < 1:
        return "N/A"
    return sentiment.classify(n, clf)['pos']


def normalize_name(n):
    return n.title().strip()


headers_map = {
    'Execution #': ['Execution #'],
    'Date of execution': ['Execution Date'],
    'Age (when executed)': ['Age when Executed'],
    'First Name': ['First Name'],
    'Last Name': ['Last Name'],
    'TDCJ Number': ['Prisoner'],
    'Age (when received)': ['Age of Incarceration'],
    'Education Level (highest grade completed)': ['Education'],
    'Race': ['Race', normalize_name],
    'Gender': ['Gender', normalize_name],
    'Prior Occupation': ['Occupation'],
    'statement': [['Statement'], ['Sentiment', classify_positive]],
    'Link to Offender Information': ['Source of Bio'],
    'Link to Last Statement': ['Source of Statement']
}
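
# How the mapping above is consumed by the loop below:
#   'Race': ['Race', normalize_name]
#       -> keep the column, title it "Race", pass each value through normalize_name
#   'statement': [['Statement'], ['Sentiment', classify_positive]]
#       -> one input column fans out to two output columns; the second holds
#          the classify_positive() sentiment score
# One-element entries are plain renames with no normalization function.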

if __name__ == '__main__':
    # Main loop

    print >>stderr, "Starting ..."

    if len(argv) != 2:
        # Check arguments, make sure raw file is given
        print >>stderr, "Wrong number of arguments."
        print >>stderr, __doc__
        exit(1)

    with open(argv[1], 'r') as fh:
        print >>stderr, "Opening files ..."
        reader = csv.reader(fh)
        writer = csv.writer(stdout)

        good_cells = defaultdict(int)
        norm = defaultdict(list)

        print >>stderr, "Filtering raw data ...",
        for entry in reader:
            new_entry = []
            if not good_cells:
                # Read header
                for i, cell in enumerate(entry):
                    if cell in headers_map:
                        if type(headers_map[cell][0]) is not list:
                            headers_map[cell] = [headers_map[cell]]

                        for newcol in headers_map[cell]:
                            good_cells[i] += 1
                            new_entry.append(newcol[0])

                            if len(newcol) == 2:
                                # Remember normalization function to apply
                                # to this column later
                                norm[i].append(newcol[1])
                            else:
                                norm[i].append(None)
            else:
                # Read entry, whitelisting cells in good_cells
                for i, cell in enumerate(entry):
                    #print >>stderr, good_cells[i]
                    #print >>stderr, norm[i]
                    for x in range(good_cells[i]):
                        nc = cell

                        if norm[i][x] is not None:
                            # Normalize value
                            nc = norm[i][x](nc)

                        new_entry.append(nc)

            # Write CSV line to stdout
            writer.writerow(new_entry)
        print >>stderr, "Done."

    print >>stderr, "Success!"