Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Wrong app instance state returned by process stats endpoint during graceful shutdown #3834

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ Name | Type | Description
---- | ---- | -----------
**type** | _string_ | Process type; a unique identifier for processes belonging to an app
**index** | _integer_ | The zero-based index of running instances
**state** | _string_ | The state of the instance; valid values are `RUNNING`, `CRASHED`, `STARTING`, `DOWN`
**state** | _string_ | The state of the instance; valid values are `RUNNING`, `CRASHED`, `STARTING`, `STOPPING`, `DOWN`
**routable** | _boolean_ | Whether or not the instance is routable (determined by the readiness check of the app). If app readiness checks and routability are unsupported by Diego, this will return as `null`.
**usage** | _object_ | Object containing actual usage data for the instance; the value is `{}` when usage data is unavailable
**usage.time** | _[timestamp](#timestamps)_ | The time when the usage was requested
Expand Down
1 change: 1 addition & 0 deletions lib/cloud_controller/diego/constants.rb
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ module Diego

LRP_STARTING = 'STARTING'.freeze
LRP_RUNNING = 'RUNNING'.freeze
LRP_STOPPING = 'STOPPING'.freeze
LRP_CRASHED = 'CRASHED'.freeze
LRP_DOWN = 'DOWN'.freeze
LRP_UNKNOWN = 'UNKNOWN'.freeze
Expand Down
98 changes: 63 additions & 35 deletions lib/cloud_controller/diego/reporters/instances_stats_reporter.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,62 +12,90 @@ def initialize(bbs_instances_client, logstats_client)
end

def stats_for_app(process)
result = {}
current_time = Time.now.to_f
formatted_current_time = Time.now.to_datetime.rfc3339

logger.debug('stats_for_app.fetching_container_metrics', process_guid: process.guid)
desired_lrp = bbs_instances_client.desired_lrp_instance(process)

log_cache_errors, stats, quota_stats, isolation_segment = get_stats(desired_lrp, process)

actual_lrp_info(process, stats, quota_stats, log_cache_errors, isolation_segment)
rescue CloudController::Errors::NoRunningInstances
handle_no_running_instances(process)
rescue StandardError => e
logger.error('stats_for_app.error', error: e.to_s)
raise e if e.is_a?(CloudController::Errors::ApiError) && e.name == 'ServiceUnavailable'

exception = CloudController::Errors::InstancesUnavailable.new(e.message)
exception.set_backtrace(e.backtrace)
raise exception
end

private

attr_reader :bbs_instances_client

def get_stats(desired_lrp, process)
log_cache_data, log_cache_errors = envelopes(desired_lrp, process)
stats = formatted_process_stats(log_cache_data, formatted_current_time)
stats = formatted_process_stats(log_cache_data, Time.now.to_datetime.rfc3339)
quota_stats = formatted_quota_stats(log_cache_data)
isolation_segment = desired_lrp.PlacementTags.first
[log_cache_errors, stats, quota_stats, isolation_segment]
end

# rubocop:disable Metrics/ParameterLists
def actual_lrp_info(process, stats=nil, quota_stats=nil, log_cache_errors=nil, isolation_segment=nil, state=nil)
# rubocop:enable Metrics/ParameterLists
result = {}
bbs_instances_client.lrp_instances(process).each do |actual_lrp|
index = actual_lrp.actual_lrp_key.index
next unless index < process.instances

info = {
state: LrpStateTranslator.translate_lrp_state(actual_lrp),
isolation_segment: desired_lrp.PlacementTags.first,
stats: {
name: process.name,
uris: process.uris,
host: actual_lrp.actual_lrp_net_info.address,
port: get_default_port(actual_lrp.actual_lrp_net_info),
net_info: actual_lrp.actual_lrp_net_info.to_h,
uptime: nanoseconds_to_seconds((current_time * 1e9) - actual_lrp.since),
fds_quota: process.file_descriptors
}.merge(metrics_data_for_instance(stats, quota_stats, log_cache_errors, formatted_current_time, index))
}
info[:details] = actual_lrp.placement_error if actual_lrp.placement_error.present?
next unless actual_lrp.actual_lrp_key.index < process.instances

info[:routable] = (actual_lrp.routable if actual_lrp.optional_routable)
state ||= LrpStateTranslator.translate_lrp_state(actual_lrp)

info = build_info(state, actual_lrp, process, stats, quota_stats, log_cache_errors)
info[:isolation_segment] = isolation_segment unless isolation_segment.nil?
result[actual_lrp.actual_lrp_key.index] = info
end

fill_unreported_instances_with_down_instances(result, process, flat: false)

warnings = [log_cache_errors].compact
[result, warnings]
end

def build_info(state, actual_lrp, process, stats, quota_stats, log_cache_errors)
info = {
state: state,
stats: {
name: process.name,
uris: process.uris,
host: actual_lrp.actual_lrp_net_info.address,
port: get_default_port(actual_lrp.actual_lrp_net_info),
net_info: actual_lrp.actual_lrp_net_info.to_h,
uptime: nanoseconds_to_seconds((Time.now.to_f * 1e9) - actual_lrp.since),
fds_quota: process.file_descriptors
}.merge(metrics_data_for_instance(stats, quota_stats, log_cache_errors, Time.now.to_datetime.rfc3339, actual_lrp.actual_lrp_key.index))
}
info[:details] = actual_lrp.placement_error if actual_lrp.placement_error.present?

info[:routable] = (actual_lrp.routable if actual_lrp.optional_routable)
info
end

def handle_no_running_instances(process)
# case when no actual_lrp exists
if bbs_instances_client.lrp_instances(process).empty?
[fill_unreported_instances_with_down_instances({}, process, flat: false), []]
else
# case when no desired_lrp exists but an actual_lrp
logger.debug('Actual LRP found, setting state to STOPPING', process_guid: process.guid)
actual_lrp_info(process, nil, nil, nil, nil, VCAP::CloudController::Diego::LRP_STOPPING)
end
rescue CloudController::Errors::NoRunningInstances => e
logger.info('stats_for_app.error', error: e.to_s)
[fill_unreported_instances_with_down_instances({}, process, flat: false), []]
rescue StandardError => e
logger.error('stats_for_app.error', error: e.to_s)
raise e if e.is_a?(CloudController::Errors::ApiError) && e.name == 'ServiceUnavailable'

exception = CloudController::Errors::InstancesUnavailable.new(e.message)
exception.set_backtrace(e.backtrace)
raise exception
end

private

attr_reader :bbs_instances_client

def metrics_data_for_instance(stats, quota_stats, log_cache_errors, formatted_current_time, index)
if log_cache_errors.blank?
if !stats.nil? && log_cache_errors.blank?
{
mem_quota: quota_stats[index]&.memory_bytes_quota,
disk_quota: quota_stats[index]&.disk_bytes_quota,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,81 @@ def make_actual_lrp(instance_guid:, index:, state:, error:, since:)
expect(instances_reporter.stats_for_app(process)).to eq([expected_stats_response, []])
end

context 'when a NoRunningInstances error is thrown for desired_lrp and it exists an actual_lrp' do
let(:error) { CloudController::Errors::NoRunningInstances.new('No running instances ruh roh') }
let(:expected_stopping_response) do
{
0 => {
state: 'STOPPING',
routable: is_routable,
stats: {
name: process.name,
uris: process.uris,
host: 'lrp-host',
port: 2222,
net_info: lrp_1_net_info.to_h,
uptime: two_days_in_seconds,
mem_quota: nil,
disk_quota: nil,
log_rate_limit: nil,
fds_quota: process.file_descriptors,
usage: {}
},
details: 'some-details'
}
}
end
let(:bbs_actual_lrps_response) { [actual_lrp_1] }
let(:lrp_1_net_info) do
::Diego::Bbs::Models::ActualLRPNetInfo.new(
address: 'lrp-host',
ports: [
::Diego::Bbs::Models::PortMapping.new(container_port: DEFAULT_APP_PORT, host_port: 2222),
::Diego::Bbs::Models::PortMapping.new(container_port: 1111)
]
)
end
let(:actual_lrp_1) do
make_actual_lrp(
instance_guid: 'instance-a', index: 0, state: ::Diego::ActualLRPState::RUNNING, error: 'some-details', since: two_days_ago_since_epoch_ns
).tap do |actual_lrp|
actual_lrp.actual_lrp_net_info = lrp_1_net_info
end
end

before do
allow(bbs_instances_client).to receive_messages(lrp_instances: bbs_actual_lrps_response)
allow(bbs_instances_client).to receive(:desired_lrp_instance).with(process).and_raise(error)
end

it 'shows all instances as "STOPPING" state' do
expect(instances_reporter.stats_for_app(process)).to eq([expected_stopping_response, []])
end
end

context 'when a NoRunningInstances error is thrown for desired_lrp and it does not exist an actual_lrp' do
let(:error) { CloudController::Errors::NoRunningInstances.new('No running instances ruh roh') }
let(:expected_stats_response) do
{
0 => {
state: 'DOWN',
stats: {
uptime: 0
}
}
}
end

before do
allow(bbs_instances_client).to receive(:lrp_instances).with(process).and_raise(error)
allow(bbs_instances_client).to receive(:desired_lrp_instance).with(process).and_raise(error)
end

it 'shows all instances as "DOWN" state' do
expect(instances_reporter.stats_for_app(process)).to eq([expected_stats_response, []])
end
end

context 'when process is not routable' do
let(:is_routable) { false }

Expand Down Expand Up @@ -281,7 +356,7 @@ def make_actual_lrp(instance_guid:, index:, state:, error:, since:)
end
end

context 'when a NoRunningInstances error is thrown' do
context 'when a NoRunningInstances error is thrown for actual_lrp and it exists a desired_lrp' do
let(:error) { CloudController::Errors::NoRunningInstances.new('No running instances ruh roh') }
let(:expected_stats_response) do
{
Expand All @@ -293,9 +368,16 @@ def make_actual_lrp(instance_guid:, index:, state:, error:, since:)
}
}
end
let(:bbs_desired_lrp_response) do
::Diego::Bbs::Models::DesiredLRP.new(
PlacementTags: placement_tags,
metric_tags: metrics_tags
)
end

before do
allow(bbs_instances_client).to receive(:lrp_instances).with(process).and_raise(error)
allow(bbs_instances_client).to receive_messages(desired_lrp_instance: bbs_desired_lrp_response)
end

it 'shows all instances as "DOWN"' do
Expand Down