forked from openaustralia/openaustralia-parser
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse-shadow-ministers.rb
executable file
·82 lines (73 loc) · 2.89 KB
/
parse-shadow-ministers.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/env ruby
# data/shadow-ministers.csv used to be maintained by hand.
# That turned into "just that little bit too much of a headache to maintain", so this script
# tries to automate it by scraping pages on the parliamentary website.
$:.unshift "#{File.dirname(__FILE__)}/lib"
require 'mechanize_proxy'
require 'hpricot'
require 'name'
# Normalises whitespace in a string: newlines, tabs and carriage returns are
# turned into spaces, runs of spaces are collapsed to a single space, and
# leading/trailing whitespace is removed.
#
# str - the String to clean up.
#
# Returns a new String; the argument is not modified.
def simplify_whitespace(str)
  flattened = str.tr("\n\t\r", " ")
  flattened.squeeze(" ").strip
end
# Fetches a shadow ministry page and extracts the people and positions listed on it.
#
# url - address of the page to fetch (retrieved via MechanizeProxy#get).
#
# Returns an Array with one Hash per person entry found, each with the keys:
#   :name       - result of Name.title_first_last applied to the last line of the entry
#   :start_date - Date parsed from the most recent "<date> -" heading seen before the
#                 entry, or nil if no such heading has been seen yet
#   :end_date   - always nil
#   :positions  - Array of position strings (every line of the entry except the last)
#
# Processing stops at the "Previous Shadow Ministries" heading.
def extract_positions_from_page(url)
  agent = MechanizeProxy.new
  page = agent.get(url)
  # Main content section - the page uses tables for layout, so the content is located
  # purely by position: the third <td> found within the second <table>.
  content = page.search('table')[1].search('td')[2]

  # Loop through all the elements and do our best to interpret each bit
  shadows = []
  start_date = nil
  content.children.each do |c|
    case c.inner_text.strip
    when "", "Current Parliamentary Information", /The \d+(st|nd|rd|th) Parliament/, "Shadow Ministry",
         "Outer Shadow Ministry", "Shadow Parliamentary Secretaries"
      # Blank nodes and section headings carry no position data - do nothing
    when /(.*) -/
      # Text ending in " -" is treated as the start date for the entries that follow
      start_date = Date.parse(Regexp.last_match(1))
    when "Previous Shadow Ministries"
      # The rest are just links off to the other pages. Let's ignore these for the time being
      break
    else
      # Get rid of bits of html that get in the way (replace each tag with its contents)
      ['strong', 'a', 'b'].each do |search|
        c.search(search).each do |s|
          s.swap s.children.join
        end
      end
      # At this point forward we're assuming this is one or more positions followed by a name
      lines = simplify_whitespace(c.inner_html).split('<br />').map { |t| t.strip }
      # If there are not enough lines just ignore
      if lines.size >= 2
        # Record the person once, with all of their positions. Callers iterate over
        # :positions themselves, so adding one entry per position would duplicate rows.
        shadows << {
          :name => Name.title_first_last(lines[-1]),
          :start_date => start_date,
          :end_date => nil,
          :positions => lines[0..-2]
        }
      end
    end
  end
  shadows
end
# Scrape the hard-coded shadow ministry page and print one CSV row per
# person/position pair to standard output:
#   full name,start date,end date,"position"
# Dates are written as day/month/year without zero padding; a missing end date
# leaves that column empty.
shadows = extract_positions_from_page("http://www.aph.gov.au/library/parl/42/Shadow/")

as_day_month_year = lambda { |date| "#{date.day}/#{date.month}/#{date.year}" }

shadows.each do |entry|
  parsed = entry[:name]
  from = entry[:start_date]
  to = entry[:end_date]

  person =
    if parsed.post_title == "MP"
      # Members: keep only the name parts, dropping any titles
      Name.new(:first => parsed.first, :middle => parsed.middle, :last => parsed.last)
    elsif parsed.title =~ /^Senator/
      # Senators are handled slightly differently: use plain "Senator" for
      # everyone, even if they're "Senator the Hon."
      Name.new(:title => "Senator", :first => parsed.first, :middle => parsed.middle, :last => parsed.last)
    else
      raise "Unknown type of person"
    end

  from_text = as_day_month_year.call(from)
  to_text = to ? as_day_month_year.call(to) : nil

  entry[:positions].each do |position|
    puts [person.full_name, from_text, to_text, "\"#{position}\""].join(",")
  end
end