add jobs cluster tag

refs AE-119

Change-Id: I1b2f903a60793459feca2c0ef4d9eed9df12b508
Reviewed-on: https://gerrit.instructure.com/c/canvas-lms/+/309463
Tested-by: Service Cloud Jenkins <svc.cloudjenkins@instructure.com>
Reviewed-by: Isaac Moore <isaac.moore@instructure.com>
QA-Review: Aaron Ogata <aogata@instructure.com>
Product-Review: Aaron Ogata <aogata@instructure.com>
This commit is contained in:
Aaron Ogata 2023-01-24 10:39:15 -08:00
parent feb8286b0e
commit ca7fa755d4
5 changed files with 75 additions and 19 deletions

View File

@ -173,14 +173,13 @@ class InfoController < ApplicationController
} }
end end
def readiness(is_deep_check: false) def readiness
# This action provides a clear signal for assessing system components that are "owned" # This action provides a clear signal for assessing system components that are "owned"
# by Canvas and are ultimately responsible for being alive and able to serve consumer traffic # by Canvas and are ultimately responsible for being alive and able to serve consumer traffic
components = HealthChecks.process_readiness_checks(is_deep_check) components = HealthChecks.process_readiness_checks(false)
failed = components.reject { |_k, v| v[:status] }.map(&:first) render_readiness_json(components, false)
render_readiness_json(components, failed.any? ? 503 : 200, is_deep_check)
end end
def deep def deep
@ -198,7 +197,10 @@ class InfoController < ApplicationController
private private
def render_readiness_json(components, status_code, is_deep_check) def render_readiness_json(components, is_deep_check)
failed = components.reject { |_k, v| v[:status] }.map(&:first)
status_code = failed.any? ? 503 : 200
readiness_json = { status: status_code, components: components_to_hash(components) } readiness_json = { status: status_code, components: components_to_hash(components) }
return readiness_json if is_deep_check return readiness_json if is_deep_check
@ -206,14 +208,24 @@ class InfoController < ApplicationController
end end
def render_deep_json(critical, secondary, status_code) def render_deep_json(critical, secondary, status_code)
readiness_response = readiness(is_deep_check: true) components = HealthChecks.process_readiness_checks(true)
readiness_response = render_readiness_json(components, true)
status = readiness_response[:status] == 503 ? readiness_response[:status] : status_code status = readiness_response[:status] == 503 ? readiness_response[:status] : status_code
response = {
readiness: components,
critical: critical,
secondary: secondary,
}
HealthChecks.send_to_statsd(response, { cluster: Shard.current.database_server_id })
render json: { render json: {
status: status, status: status,
readiness: readiness_response, readiness: readiness_response,
critical: components_to_hash(critical), critical: components_to_hash(critical),
secondary: components_to_hash(secondary) secondary: components_to_hash(secondary),
}, status: status }, status: status
end end

View File

@ -354,13 +354,4 @@ Rails.configuration.after_initialize do
singleton: "Canvas::LiveEvents#heartbeat" } singleton: "Canvas::LiveEvents#heartbeat" }
) )
end end
# Every minute, ask each region to run HealthChecks.send_to_statsd.
Delayed::Periodic.cron "HealthChecks.send_to_statsd", "* * * * *" do
  # Options forwarded verbatim to DatabaseServer.send_in_each_region.
  region_job_options = {
    run_current_region_asynchronously: true,
    singleton: "HealthChecks#send_to_statsd"
  }
  DatabaseServer.send_in_each_region(HealthChecks, :send_to_statsd, region_job_options)
end
end end

View File

@ -38,12 +38,12 @@ module HealthChecks
{ critical: critical, secondary: secondary } { critical: critical, secondary: secondary }
end end
def send_to_statsd def send_to_statsd(result = nil, additional_tags = {})
result = process_deep_checks.merge({ readiness: process_readiness_checks(true) }) result ||= process_deep_checks.merge({ readiness: process_readiness_checks(true) })
result.each do |check_type, check_values| result.each do |check_type, check_values|
check_values.each do |check_name, check_results| check_values.each do |check_name, check_results|
tags = { type: check_type, key: check_name } tags = { type: check_type, key: check_name, **additional_tags }
InstStatsd::Statsd.timing("canvas.health_checks.response_time_ms", check_results[:time], tags: tags) InstStatsd::Statsd.timing("canvas.health_checks.response_time_ms", check_results[:time], tags: tags)
InstStatsd::Statsd.gauge("canvas.health_checks.status", check_results[:status] ? 1 : 0, tags: tags) InstStatsd::Statsd.gauge("canvas.health_checks.status", check_results[:status] ? 1 : 0, tags: tags)

View File

@ -181,6 +181,16 @@ describe InfoController do
expect(json["readiness"]["components"].count).to be > 0 expect(json["readiness"]["components"].count).to be > 0
end end
it "reports to statsd upon loading the deep endpoint" do
  # Stub both statsd emitters so the calls made during the request can be inspected.
  %i[gauge timing].each { |emitter| allow(InstStatsd::Statsd).to receive(emitter) }
  # Fix the database server id so the expected cluster tag value is known.
  allow(Shard.current).to receive(:database_server_id).and_return("C1")

  get "deep"

  expect(response).to be_successful
  cluster_tagged = { type: :readiness, key: :common_css, cluster: "C1" }
  expect(InstStatsd::Statsd).to have_received(:gauge).with("canvas.health_checks.status", 1, tags: cluster_tagged)
end
it "responds with 503 if a readiness system component is considered down" do it "responds with 503 if a readiness system component is considered down" do
allow(Delayed::Job.connection).to receive(:active?).and_return(false) allow(Delayed::Job.connection).to receive(:active?).and_return(false)
get "deep" get "deep"

View File

@ -169,4 +169,47 @@ describe HealthChecks do
expect(InstStatsd::Statsd).to have_received(:timing).with("canvas.health_checks.response_time_ms", 3, tags: { type: :deep, key: :deep_check_name_error }) expect(InstStatsd::Statsd).to have_received(:timing).with("canvas.health_checks.response_time_ms", 3, tags: { type: :deep, key: :deep_check_name_error })
expect(InstStatsd::Statsd).to have_received(:timing).with("canvas.health_checks.response_time_ms", 4, tags: { type: :deep, key: :deep_check_name_success }) expect(InstStatsd::Statsd).to have_received(:timing).with("canvas.health_checks.response_time_ms", 4, tags: { type: :deep, key: :deep_check_name_success })
end end
it "reports pre-computed metrics to statsd" do
  # Stub both statsd emitters so the calls can be inspected afterwards.
  %i[gauge timing].each { |emitter| allow(InstStatsd::Statsd).to receive(emitter) }

  # Results are supplied by the caller instead of being computed by send_to_statsd.
  precomputed = {
    readiness: {
      readiness_check_name_error: { time: 1, status: false },
      readiness_check_name_success: { time: 2, status: true },
    },
    deep: {
      deep_check_name_error: { time: 3, status: false },
      deep_check_name_success: { time: 4, status: true },
    },
  }
  HealthChecks.send_to_statsd(precomputed)

  # Each row is [expected gauge value, check type, check name].
  [
    [0, :deep, :deep_check_name_error],
    [0, :readiness, :readiness_check_name_error],
    [1, :deep, :deep_check_name_success],
    [1, :readiness, :readiness_check_name_success],
  ].each do |value, type, key|
    expect(InstStatsd::Statsd).to have_received(:gauge).with("canvas.health_checks.status", value, tags: { type: type, key: key })
  end

  # Each row is [expected timing in ms, check type, check name].
  [
    [1, :readiness, :readiness_check_name_error],
    [2, :readiness, :readiness_check_name_success],
    [3, :deep, :deep_check_name_error],
    [4, :deep, :deep_check_name_success],
  ].each do |elapsed, type, key|
    expect(InstStatsd::Statsd).to have_received(:timing).with("canvas.health_checks.response_time_ms", elapsed, tags: { type: type, key: key })
  end
end
it "adds additional tags to the reported metrics" do
  # Stub both statsd emitters so the calls can be inspected afterwards.
  %i[gauge timing].each { |emitter| allow(InstStatsd::Statsd).to receive(emitter) }

  single_failure = {
    readiness: {
      readiness_check_name_error: { time: 1, status: false },
    },
  }
  # The second argument carries the extra tags to attach to every metric.
  HealthChecks.send_to_statsd(single_failure, { cluster: "C1" })

  # The extra tag must appear alongside the standard type/key tags.
  tagged = { type: :readiness, key: :readiness_check_name_error, cluster: "C1" }
  expect(InstStatsd::Statsd).to have_received(:gauge).with("canvas.health_checks.status", 0, tags: tagged)
  expect(InstStatsd::Statsd).to have_received(:timing).with("canvas.health_checks.response_time_ms", 1, tags: tagged)
end
end end