
Refactor feed for stability #4

Merged
3 changes: 3 additions & 0 deletions atdisplanningalertsfeed.gemspec
@@ -19,6 +19,9 @@ Gem::Specification.new do |spec|

spec.add_development_dependency "bundler", "~> 1.5"
spec.add_development_dependency "rake"
spec.add_development_dependency "rspec"
spec.add_development_dependency "vcr"
spec.add_development_dependency "webmock"

spec.add_runtime_dependency 'scraperwiki-morph'
spec.add_runtime_dependency 'atdis'
Expand Down
96 changes: 78 additions & 18 deletions lib/atdisplanningalertsfeed.rb
@@ -6,30 +6,84 @@
 module ATDISPlanningAlertsFeed
   def self.save(url, options = {})
     feed = ATDIS::Feed.new(url)
+    logger = options[:logger]
+    logger ||= Logger.new(STDOUT)

     options[:lodgement_date_start] = (options[:lodgement_date_start] || Date.today - 30)
     options[:lodgement_date_end] = (options[:lodgement_date_end] || Date.today)
-    page = feed.applications(lodgement_date_start: options[:lodgement_date_start], lodgement_date_end: options[:lodgement_date_end])

-    # Save the first page
-    pages_processed = []
-    pages_processed << page.pagination.current if save_page(page)
-
-    while page = page.next_page
-      # Some ATDIS feeds incorrectly provide pagination
-      # and permit looping; so halt processing if we've already processed this page
-      unless pages_processed.index(page.pagination.current).nil?
-        puts "Page #{page.pagination.current} already processed; halting"
-        break
-      end
-
-      pages_processed << page.pagination.current if save_page(page)
+    # Grab all of the pages
+    pages = self.fetch_all_pages(feed, options, logger)
+
+    records = []
+    pages.each do |page|
+      additional_records = collect_records(page, logger)
+      # If there are no more records to fetch, halt processing
+      # regardless of pagination
+      break unless additional_records.any?
+      records += additional_records
     end
+
+    self.persist_records(records, logger)
   end

-  def self.save_page(page)
-    puts "Saving page #{page.pagination.current} of #{page.pagination.pages}"
-    page.response.each do |item|
+  private
+
+  def self.fetch_all_pages(feed, options, logger)
+    begin
+      page = feed.applications({
+        lodgement_date_start: options[:lodgement_date_start],
+        lodgement_date_end: options[:lodgement_date_end]
+      })
+    rescue RestClient::InternalServerError => e
+      # If the feed is known to be flakey, ignore the error
+      # on first fetch and assume the next run will pick this up
+      #
+      # Planningalerts itself will also notice if the median applications drops to 0
+      # over time
+      logger.error(e.message)
+      logger.debug(e.backtrace.join("\n"))
+      return [] if options[:flakey]
+      raise e
+    end
+
+    unless page.pagination && page.pagination.respond_to?(:current)
+      logger.warn("No/invalid pagination, assuming no records/aborting")
+      return []
+    end
+
+    pages = [page]
+    pages_processed = [page.pagination.current]
+    begin
+      while page = page.next_page
+        unless page.pagination && page.pagination.respond_to?(:current)
+          logger.warn("No/invalid pagination, assuming no records/aborting")
+          break
+        end
+
+        # Some ATDIS feeds incorrectly provide pagination
+        # and permit looping; so halt processing if we've already processed this page
+        unless pages_processed.index(page.pagination.current).nil?
+          logger.info("Page #{page.pagination.current} already processed; halting")
+          break
+        end
+        pages << page
+        pages_processed << page.pagination.current
+        logger.debug("Fetching #{page.next_url}")
+      end
+    rescue RestClient::InternalServerError => e
+      # Raise the exception unless this is known to be flakey
+      # allowing some processing of records to take place
+      logger.error(e.message)
+      logger.debug(e.backtrace.join("\n"))
+      raise e unless options[:flakey]
+    end
+
+    pages
+  end
+
+  def self.collect_records(page, logger)
+    page.response.collect do |item|
       application = item.application

       # TODO: Only using the first address because PA doesn't support multiple addresses right now
@@ -49,12 +49,18 @@ def self.save_page(page)
         on_notice_from: (application.info.notification_start_date.to_date if application.info.notification_start_date),
         on_notice_to: (application.info.notification_end_date.to_date if application.info.notification_end_date)
       }
+    end
+  end

+  def self.persist_records(records, logger)
+    records.each do |record|
       if (ScraperWikiMorph.select("* from data where `council_reference`='#{record[:council_reference]}'").empty? rescue true)
         ScraperWikiMorph.save_sqlite([:council_reference], record)
       else
-        puts "Skipping already saved record " + record[:council_reference]
+        logger.info "Skipping already saved record " + record[:council_reference]
       end
     end
+
+    records
   end
 end
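For context, a minimal sketch of how a scraper might drive the refactored entry point. The feed URL and option values here are placeholders, not part of this change; only `save`, its options (`:lodgement_date_start`, `:lodgement_date_end`, `:flakey`, `:logger`), and the returned records array come from this diff:

```ruby
# Hypothetical caller, not part of this PR; URL and values are illustrative.
require 'date'
require 'logger'
require 'atdisplanningalertsfeed'

records = ATDISPlanningAlertsFeed.save(
  "http://example.council.nsw.gov.au/atdis/1.0/", # placeholder ATDIS 1.0 endpoint
  lodgement_date_start: Date.today - 30,          # the default when omitted
  lodgement_date_end: Date.today,                 # the default when omitted
  flakey: true,                                   # tolerate HTTP 500s from a known-bad feed
  logger: Logger.new(STDOUT)                      # the default when omitted
)
puts "Persisted #{records.length} applications"
```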
85 changes: 85 additions & 0 deletions spec/atdis_planning_alerts_feed_spec.rb
@@ -0,0 +1,85 @@
# TODO Shift to spec helper

$: << "#{File.dirname(__FILE__)}/.."
require 'atdisplanningalertsfeed'

Bundler.require :development, :test

require 'vcr'

VCR.configure do |c|
  c.cassette_library_dir = 'spec/cassettes'
  c.allow_http_connections_when_no_cassette = true
  c.hook_into :webmock
  c.default_cassette_options = { record: :new_episodes }
  c.configure_rspec_metadata!
end

describe ATDISPlanningAlertsFeed, :vcr do
  before :each do
    @options = {
      lodgement_date_start: Date.parse("2016-02-21"),
      lodgement_date_end: Date.parse("2016-03-22")
    }
  end

  context 'valid feed' do
    it 'should not error on empty feed' do
      records = ATDISPlanningAlertsFeed.save("http://mycouncil2.solorient.com.au/Horizon/@@horizondap_ashfield@@/atdis/1.0/", @options)

      expect(records.length).to eq 0
    end
  end

  context 'dodgy pagination' do
    it 'should not error' do
      records = ATDISPlanningAlertsFeed.save("https://myhorizon.maitland.nsw.gov.au/Horizon/@@horizondap@@/atdis/1.0/", @options)

      expect(records.length).to eq 120
    end
  end

  context 'really dodgy pagination' do
    it 'should not error' do
      records = ATDISPlanningAlertsFeed.save("https://da.kiama.nsw.gov.au/atdis/1.0/", @options)

      expect(records.length).to eq 43
    end
  end

  context 'with a flakey service' do
    # TODO This spec should always force a RestClient::InternalServerError: 500 Internal Server Error
    it 'should not error' do
      @options.merge!({
        flakey: true
      })
      # TODO This doesn't work as expected (raises "stack level too deep"), but the VCR cassette should work
      # allow_any_instance_of(ATDIS::Feed).to receive(:applications).and_raise(RestClient::InternalServerError.new("500 Internal Server Error"))

      url = "http://myhorizon.cootamundra.nsw.gov.au/Horizon/@@horizondap@@/atdis/1.0/"
      records = ATDISPlanningAlertsFeed.save(url, @options)

      # TODO Expectation: an HTTP 500 on the first page is handled gracefully
      expect(records.length).to eq 0
    end

    it 'should not error halfway through processing' do
      @options.merge!({
        flakey: true
      })

      # TODO This doesn't work as expected
      # But I have faked the response in the cassette
      # allow_any_instance_of(ATDIS::Models::Page).to receive(:next_page).and_raise(RestClient::InternalServerError.new("500 Internal Server Error"))

      # Yass isn't actually flakey, but Cootamundra is *too* flakey
      # This scenario replicates one page of many having an unhandled exception (seen in Horizon DAP feeds)
      url = "http://mycouncil.yass.nsw.gov.au/Horizon/@@horizondap@@/atdis/1.0/"
      records = ATDISPlanningAlertsFeed.save(url, @options)

      # TODO Expectation: an HTTP 500 on the second page still allows records from earlier pages to be processed
      expect(records.length).to eq 20
    end
  end
end
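The "shift to spec helper" TODO at the top of this file could later land as something like the sketch below. The file name and split are assumptions; the contents are just the setup block above, moved verbatim:

```ruby
# spec/spec_helper.rb — hypothetical extraction for the TODO above
$: << "#{File.dirname(__FILE__)}/.."
require 'atdisplanningalertsfeed'

Bundler.require :development, :test

require 'vcr'

VCR.configure do |c|
  c.cassette_library_dir = 'spec/cassettes'
  c.allow_http_connections_when_no_cassette = true
  c.hook_into :webmock
  c.default_cassette_options = { record: :new_episodes }
  c.configure_rspec_metadata!
end
```

Each spec file would then open with `require 'spec_helper'` instead of repeating the VCR configuration.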