add deep critical and secondary health checks
includes:
• timeout protection (defaults to 5 seconds)
• the ability to run non-blocking component checks for the /deep endpoint
• caching of component deep checks to help mitigate request cost to services; cache expiry is set to 60 seconds

closes FOO-2355
closes FOO-2356
closes FOO-2369
flag = none

test plan:
• navigate to the /readiness & /deep endpoints
• verify a json response is returned
• verify the json response object conforms to:
  • https://app.swaggerhub.com/apis/mycargus/health-checks-api/

Change-Id: Ie12b9260e91de6c3d18f25cd4a81cbfa7ebefcb4
Reviewed-on: https://gerrit.instructure.com/c/canvas-lms/+/273517
Tested-by: Service Cloud Jenkins <svc.cloudjenkins@instructure.com>
Reviewed-by: Simon Williams <simon@instructure.com>
Reviewed-by: Jacob Burroughs <jburroughs@instructure.com>
QA-Review: August Thornton <august@instructure.com>
Product-Review: August Thornton <august@instructure.com>
This commit is contained in:
parent
e9b03d7a31
commit
30f4f304c1
|
@ -19,16 +19,16 @@
|
|||
#
|
||||
|
||||
class InfoController < ApplicationController
|
||||
skip_before_action :load_account, :only => [:health_check, :readiness]
|
||||
skip_before_action :load_user, :only => [:health_check, :readiness, :browserconfig]
|
||||
skip_before_action :load_account, :only => [:health_check, :readiness, :deep]
|
||||
skip_before_action :load_user, :only => [:health_check, :readiness, :deep, :browserconfig]
|
||||
|
||||
# Renders the default template for this action inside the
# "layouts/styleguide" layout.
def styleguide
  render(layout: "layouts/styleguide")
end
|
||||
|
||||
def message_redirect
|
||||
m = AssetSignature.find_by_signature(Message, params[:id])
|
||||
if m && m.url
|
||||
m = AssetSignature.find_by(Message, params[:id])
|
||||
if m&.url
|
||||
redirect_to m.url
|
||||
else
|
||||
redirect_to "http://#{HostUrl.default_host}/"
|
||||
|
@ -37,7 +37,7 @@ class InfoController < ApplicationController
|
|||
|
||||
def help_links
|
||||
current_user_roles = @current_user.try(:roles, @domain_root_account) || []
|
||||
links = @domain_root_account && @domain_root_account.help_links
|
||||
links = @domain_root_account&.help_links
|
||||
|
||||
links = links.select do |link|
|
||||
available_to = link[:available_to] || []
|
||||
|
@ -98,76 +98,6 @@ class InfoController < ApplicationController
|
|||
end
|
||||
end
|
||||
|
||||
# Readiness check.
#
# Gives a clear signal about the system components that are "owned" by Canvas
# and are ultimately responsible for being alive and able to serve consumer
# traffic. Every probe is wrapped in a lambda and handed to readiness_check;
# the response is 200 when every probe produced a truthy status, 503 otherwise.
def readiness
  components = {
    # a string (asset url) comes back when brandable_css_bundles_with_deps
    # exists; treated as truthy
    common_css: readiness_check(-> { css_url_for("common") }),
    # a string comes back when webpack worked; treated as truthy
    common_js: readiness_check(
      -> { ActionController::Base.helpers.javascript_url("#{js_base_url}/common") }
    ),
    # true when the :readiness lookup completes and yields nil
    consul: readiness_check(-> { DynamicSettings.find(tree: :private)[:readiness].nil? }),
    # the block's value (result of the write) is returned; treated as truthy
    filesystem: readiness_check(
      lambda do
        Tempfile.open('readiness', ENV['TMPDIR'] || Dir.tmpdir) { |f| f.write('readiness') }
      end
    ),
    # boolean
    jobs: readiness_check(-> { Delayed::Job.connection.active? }),
    # boolean
    postgresql: readiness_check(-> { Account.connection.active? }),
    # true when the 'readiness' fetch completes and yields nil
    redis: readiness_check(-> { MultiCache.cache.fetch('readiness').nil? }),
    # a string comes back when `gulp rev` has run; treated as truthy
    rev_manifest: readiness_check(-> { Canvas::Cdn::RevManifest.gulp_manifest.values.first }),
    # true when something was retrieved back from Vault
    vault: readiness_check(
      -> { !Canvas::Vault.read("#{Canvas::Vault.kv_mount}/data/secrets").nil? }
    )
  }

  all_up = components.values.all? { |result| result[:status] }
  render_readiness_json(components, all_up ? 200 : 503)
end
|
||||
|
||||
# Calls a single readiness probe and measures how long it took.
#
# component - a callable; its return value becomes the probe's status.
#
# Returns { time:, status: }. When the probe raises a StandardError the
# exception is reported through Canvas::Errors at :error level, status stays
# false and time stays nil (the measurement never completed).
def readiness_check(component)
  status = false
  elapsed_ms = nil

  begin
    elapsed_ms = Benchmark.ms { status = component.call }
  rescue StandardError => error
    Canvas::Errors.capture_exception(:readiness, error, :error)
  end

  { time: elapsed_ms, status: status }
end
|
||||
|
||||
# Renders the readiness report as JSON.
#
# components  - Hash of component name => { status:, time: } results.
# status_code - overall HTTP status; used both in the body and as the
#               response status.
#
# Each component is reported with string keys 'name', 'status' (200 when its
# status is truthy, 503 otherwise) and 'response_time_ms'.
def render_readiness_json(components, status_code)
  component_rows = components.map do |label, result|
    {
      'name' => label,
      'status' => (result[:status] ? 200 : 503),
      'response_time_ms' => result[:time]
    }
  end

  render json: { status: status_code, components: component_rows }, status: status_code
end
|
||||
|
||||
private :readiness_check, :render_readiness_json
|
||||
|
||||
# for windows live tiles
|
||||
def browserconfig
|
||||
cancel_cache_buster
|
||||
|
@ -184,14 +114,14 @@ class InfoController < ApplicationController
|
|||
@needs_cookies = true if params[:reason] == 'needs_cookies'
|
||||
return render_unauthorized_action
|
||||
when 422
|
||||
raise ActionController::InvalidAuthenticityToken.new('test_error')
|
||||
raise ActionController::InvalidAuthenticityToken, 'test_error'
|
||||
else
|
||||
@not_found_message = '(test_error message details)' if params[:message].present?
|
||||
raise RequestError.new('test_error', params[:status].to_i)
|
||||
end
|
||||
end
|
||||
|
||||
render status: 404, template: "shared/errors/404_message"
|
||||
render status: :not_found, template: "shared/errors/404_message"
|
||||
end
|
||||
|
||||
def web_app_manifest
|
||||
|
@ -229,4 +159,134 @@ class InfoController < ApplicationController
|
|||
display: "minimal-ui"
|
||||
}
|
||||
end
|
||||
|
||||
# Readiness check.
#
# Gives a clear signal about the system components that are "owned" by Canvas
# and are ultimately responsible for being alive and able to serve consumer
# traffic. Each probe block is passed to component_check; the overall status
# is 200 when every probe produced a truthy status and 503 otherwise.
#
# is_deep_check - forwarded to component_check and render_readiness_json;
#                 false by default.
#
# Returns whatever render_readiness_json returns.
def readiness(is_deep_check: false)
  probe = ->(&work) { component_check(work, is_deep_check) }

  components = {
    # a string (asset url) comes back when brandable_css_bundles_with_deps
    # exists; treated as truthy
    common_css: probe.call { css_url_for('common') },
    # a string comes back when webpack worked; treated as truthy
    common_js: probe.call do
      ActionController::Base.helpers.javascript_url("#{js_base_url}/common")
    end,
    # true when the :readiness lookup completes and yields nil
    consul: probe.call { DynamicSettings.find(tree: :private)[:readiness].nil? },
    # the block's value (result of the write) is returned; treated as truthy
    filesystem: probe.call do
      scratch_dir = ENV['TMPDIR'] || Dir.tmpdir
      Tempfile.open('readiness', scratch_dir) { |file| file.write('readiness') }
    end,
    # boolean
    jobs: probe.call { Delayed::Job.connection.active? },
    # boolean
    postgresql: probe.call { Account.connection.active? },
    # true when the 'readiness' fetch completes and yields nil
    redis: probe.call { MultiCache.cache.fetch('readiness').nil? },
    # a string comes back when `gulp rev` has run; treated as truthy
    rev_manifest: probe.call { Canvas::Cdn::RevManifest.gulp_manifest.values.first },
    # true when something was retrieved back from Vault
    vault: probe.call { !Canvas::Vault.read("#{Canvas::Vault.kv_mount}/data/secrets").nil? }
  }

  all_up = components.values.all? { |result| result[:status] }
  render_readiness_json(components, all_up ? 200 : 503, is_deep_check)
end
|
||||
|
||||
# Deep check.
#
# Gives a clear signal about the critical and secondary dependencies needed
# to successfully complete consumer requests.
#
# The critical/secondary results are computed inside Rails.cache.fetch under
# :deep_health_check with a 60 second expiry. Each probe block is run in its
# own thread (with that thread's exception reporting switched off), and the
# thread is handed straight to component_check.
#
# Only the critical results decide the status passed on to render_deep_json:
# 503 if any critical status is falsy, 200 otherwise.
def deep
  report =
    Rails.cache.fetch(:deep_health_check, expires_in: 60.seconds) do
      run_in_thread = lambda do |&probe|
        worker = Thread.new do
          Thread.current.report_on_exception = false
          probe.call
        end
        component_check(worker, true)
      end

      critical = {
        default_shard: run_in_thread.call { Shard.connection.active? }
      }

      # secondary checks can be manually or conditionally added
      secondary = {}

      if Canvadocs.enabled?
        secondary[:canvadocs] = run_in_thread.call do
          readiness_url = URI.join(Canvadocs.config['base_url'], '/readiness').to_s
          CanvasHttp.get(readiness_url).is_a?(Net::HTTPSuccess)
        end
      end

      if PageView.pv4?
        secondary[:pv4] = run_in_thread.call do
          health_url = URI.join(ConfigFile.load('pv4')['uri'], '/health_check').to_s
          CanvasHttp.get(health_url).is_a?(Net::HTTPSuccess)
        end
      end

      { critical: critical, secondary: secondary }
    end

  critical_up = report[:critical].values.all? { |result| result[:status] }
  render_deep_json(report[:critical], report[:secondary], critical_up ? 200 : 503)
end
|
||||
|
||||
private
|
||||
|
||||
# Runs one health-check probe under a time limit and reports the outcome.
#
# component     - either a Thread (its #value is awaited, which re-raises any
#                 exception raised inside the thread) or a callable.
# is_deep_check - selects the category used when reporting failures:
#                 :deep_health_check when true, :readiness_health_check
#                 otherwise.
#
# The time limit comes from the 'healthcheck_timelimit' setting (seconds,
# default 5).
#
# Returns { status:, message:, time: } where
#   status  - the probe's return value, or false when it raised / timed out
#   message - 'service is up' for a truthy status, 'service is down' for a
#             falsy status without an exception, otherwise the exception
#             message
#   time    - elapsed milliseconds, including time spent handling a failure
#
# Timeouts are reported through Canvas::Errors at :warn level (message only);
# any other StandardError is reported at :error level.
def component_check(component, is_deep_check)
  status = false
  message = nil
  exception_type = is_deep_check ? :deep_health_check : :readiness_health_check

  default_timeout = 5.seconds.to_s
  timeout = Setting.get('healthcheck_timelimit', default_timeout).to_f
  # Timeout.timeout treats 0 as "no time limit", and String#to_f turns an
  # unparsable setting into 0.0; fall back to the default so a bad setting
  # cannot silently switch the protection off.
  timeout = default_timeout.to_f unless timeout.positive?

  response_time_ms =
    Benchmark.ms do
      Timeout.timeout(timeout, Timeout::Error) do
        status = component.is_a?(Thread) ? component.value : component.call
      end
    rescue Timeout::Error => e
      message = e.message
      Canvas::Errors.capture_exception(exception_type, e.message, :warn)
    rescue => e
      message = e.message
      Canvas::Errors.capture_exception(exception_type, e, :error)
    end

  # No exception was raised: describe the outcome from the status itself so a
  # failing probe is never labelled as being up.
  message ||= status ? 'service is up' : 'service is down'

  { status: status, message: message, time: response_time_ms }
end
|
||||
|
||||
# Builds the readiness report and, unless it is wanted as data, renders it.
#
# components    - Hash of component name => result, converted with
#                 components_to_hash.
# status_code   - overall status; placed in the body and used as the HTTP
#                 status when rendering.
# is_deep_check - when truthy the report hash is returned without rendering.
#
# Returns the report hash for a deep check, otherwise the result of render.
def render_readiness_json(components, status_code, is_deep_check)
  report = { status: status_code, components: components_to_hash(components) }

  if is_deep_check
    report
  else
    render json: report, status: status_code
  end
end
|
||||
|
||||
# Renders the deep-check report as JSON.
#
# critical    - Hash of critical dependency results.
# secondary   - Hash of secondary dependency results.
# status_code - status derived from the critical dependencies.
#
# The readiness report is obtained via readiness(is_deep_check: true) and
# embedded in the body. A readiness status of 503 overrides status_code; the
# resulting status is used both in the body and as the HTTP status.
def render_deep_json(critical, secondary, status_code)
  readiness_report = readiness(is_deep_check: true)
  overall_status = readiness_report[:status] == 503 ? 503 : status_code

  body = {
    status: overall_status,
    readiness: readiness_report,
    critical: components_to_hash(critical),
    secondary: components_to_hash(secondary)
  }

  render json: body, status: overall_status
end
|
||||
|
||||
# Converts component results into the list of entries used in the JSON body.
#
# components - Hash of name => { status:, message:, time: }.
#
# Returns an Array with one Hash per component: the name, an HTTP-style
# status (200 when the component's status is truthy, 503 otherwise), the
# message and the response time in milliseconds.
def components_to_hash(components)
  components.map do |label, result|
    {
      name: label,
      status: (result[:status] ? 200 : 503),
      message: result[:message],
      response_time_ms: result[:time]
    }
  end
end
|
||||
end
|
||||
|
|
|
@ -934,6 +934,8 @@ CanvasRails::Application.routes.draw do
|
|||
get 'health_check' => 'info#health_check'
|
||||
get 'health_prognosis' => 'info#health_prognosis'
|
||||
get 'readiness' => 'info#readiness'
|
||||
get 'deep' => 'info#deep'
|
||||
|
||||
get 'web-app-manifest/manifest.json' => 'info#web_app_manifest'
|
||||
|
||||
get 'browserconfig.xml', to: 'info#browserconfig', defaults: { format: 'xml' }
|
||||
|
|
|
@ -80,11 +80,14 @@ describe InfoController do
|
|||
end
|
||||
|
||||
describe "GET 'readiness'" do
|
||||
it 'responds with 200 if all system components are alive and serving' do
|
||||
before(:each) do
|
||||
allow(Account.connection).to receive(:active?).and_return(true)
|
||||
allow(MultiCache.cache).to receive(:fetch).and_call_original
|
||||
allow(MultiCache.cache).to receive(:fetch).with('readiness').and_return(nil)
|
||||
allow(Delayed::Job.connection).to receive(:active?).and_return(true)
|
||||
end
|
||||
|
||||
it 'responds with 200 if all system components are alive and serving' do
|
||||
get 'readiness'
|
||||
expect(response).to be_successful
|
||||
json = JSON.parse(response.body)
|
||||
|
@ -92,9 +95,6 @@ describe InfoController do
|
|||
end
|
||||
|
||||
it 'responds with 503 if a system component is considered down' do
|
||||
allow(Account.connection).to receive(:active?).and_return(true)
|
||||
allow(MultiCache.cache).to receive(:fetch).and_call_original
|
||||
allow(MultiCache.cache).to receive(:fetch).with('readiness').and_return(nil)
|
||||
allow(Delayed::Job.connection).to receive(:active?).and_return(false)
|
||||
get 'readiness'
|
||||
expect(response.code).to eq '503'
|
||||
|
@ -102,11 +102,8 @@ describe InfoController do
|
|||
expect(json['status']).to eq 503
|
||||
end
|
||||
|
||||
it 'catches any exceptions thrown and log them as errors' do
|
||||
allow(Account.connection).to receive(:active?).and_return(true)
|
||||
allow(MultiCache.cache).to receive(:fetch).and_call_original
|
||||
it 'catchs any exceptions thrown and logs them as errors' do
|
||||
allow(MultiCache.cache).to receive(:fetch).with('readiness').and_raise(Redis::TimeoutError)
|
||||
allow(Delayed::Job.connection).to receive(:active?).and_return(true)
|
||||
expect(Canvas::Errors).to receive(:capture_exception).once
|
||||
get 'readiness'
|
||||
expect(response.code).to eq '503'
|
||||
|
@ -115,11 +112,18 @@ describe InfoController do
|
|||
expect(redis['status']).to eq 503
|
||||
end
|
||||
|
||||
it 'catchs any timeouts thrown and logs them as warnings' do
|
||||
allow(Timeout).to receive(:timeout).and_raise(Timeout::Error)
|
||||
expect(Canvas::Errors).to receive(:capture_exception)
|
||||
.at_least(:once)
|
||||
.with(:readiness_health_check, 'Timeout::Error', :warn)
|
||||
get 'readiness'
|
||||
expect(response.code).to eq '503'
|
||||
json = JSON.parse(response.body)
|
||||
expect(json['status']).to eq 503
|
||||
end
|
||||
|
||||
it 'returns all dependent system components in json response' do
|
||||
allow(Account.connection).to receive(:active?).and_return(true)
|
||||
allow(MultiCache.cache).to receive(:fetch).and_call_original
|
||||
allow(MultiCache.cache).to receive(:fetch).with('readiness').and_return(nil)
|
||||
allow(Delayed::Job.connection).to receive(:active?).and_return(true)
|
||||
get 'readiness'
|
||||
expect(response).to be_successful
|
||||
components = JSON.parse(response.body)['components']
|
||||
|
@ -130,6 +134,103 @@ describe InfoController do
|
|||
end
|
||||
end
|
||||
|
||||
describe "GET 'deep'" do
  # A real response object, so the controller's `is_a?(Net::HTTPSuccess)`
  # test on the value returned by CanvasHttp.get is true.
  let(:success_response) { Net::HTTPOK.new('1.1', '200', 'OK') }

  before(:each) do
    allow(Account.connection).to receive(:active?).and_return(true)
    allow(MultiCache.cache).to receive(:fetch).and_call_original
    allow(MultiCache.cache).to receive(:fetch).with('readiness').and_return(nil)
    allow(Delayed::Job.connection).to receive(:active?).and_return(true)
    allow(Shard.connection).to receive(:active?).and_return(true)
    allow(Canvadocs).to receive(:enabled?).and_return(true)
    allow(PageView).to receive(:pv4?).and_return(true)
    allow(Canvadocs).to receive(:config)
      .and_return({ 'base_url' => 'https://canvadocs.instructure.com/' })
    allow(ConfigFile).to receive(:load).and_call_original
    allow(ConfigFile).to receive(:load)
      .with('pv4').and_return({ 'uri' => 'https://pv4.instructure.com/api/123/' })
    # Stub the HTTP client itself: the secondary checks call CanvasHttp.get
    # directly, so a detached class_double would never be consulted.
    allow(CanvasHttp).to receive(:get).and_return(success_response)
  end

  it 'renders readiness check within json response' do
    get 'deep'
    expect(response).to be_successful
    json = JSON.parse(response.body)
    expect(json).to have_key('readiness')
    expect(json['readiness']['components'].count).to be > 0
  end

  it 'responds with 503 if a readiness system component is considered down' do
    allow(Delayed::Job.connection).to receive(:active?).and_return(false)
    get 'deep'
    expect(response.code).to eq '503'
    json = JSON.parse(response.body)
    expect(json['status']).to eq 503
  end

  it 'returns 503 if critical dependency check fails and readiness response is 200' do
    allow(Shard.connection).to receive(:active?).and_return(false)
    get 'deep'
    expect(response.code).to eq '503'
    json = JSON.parse(response.body)
    expect(json['status']).to eq 503
  end

  it 'catches any secondary dependency check exceptions without failing the deep check' do
    allow(CanvasHttp).to receive(:get).and_raise(Timeout::Error)
    expect(Canvas::Errors).to receive(:capture_exception)
      .at_least(:twice)
      .with(:deep_health_check, 'Timeout::Error', :warn)
    get 'deep'
    expect(response.code).to eq '200'
    secondary = JSON.parse(response.body)['secondary']
    canvadocs = secondary.find { |c| c['name'] == 'canvadocs' }
    expect(canvadocs['status']).to eq 503
  end

  it 'catches any timeouts thrown and logs them as warnings' do
    allow(Timeout).to receive(:timeout).and_raise(Timeout::Error)
    expect(Canvas::Errors).to receive(:capture_exception)
      .at_least(:once)
      .with(:deep_health_check, 'Timeout::Error', :warn)
    get 'deep'
    expect(response.code).to eq '503'
    json = JSON.parse(response.body)
    expect(json['status']).to eq 503
  end

  it 'returns critical dependencies in json response' do
    get 'deep'
    expect(response).to be_successful
    critical = JSON.parse(response.body)['critical']
    critical.each do |dep|
      expect(dep['name']).to be_truthy
      expect(dep['status']).to eq 200
    end
  end

  it 'returns secondary dependencies in json response' do
    get 'deep'
    expect(response).to be_successful
    secondary = JSON.parse(response.body)['secondary']
    secondary.each do |dep|
      expect(dep['name']).to be_truthy
      expect(dep['status']).to eq 200
    end
  end

  it 'returns secondary dependencies in json response only if enabled' do
    allow(Canvadocs).to receive(:enabled?).and_return(false)
    allow(PageView).to receive(:pv4?).and_return(false)
    get 'deep'
    expect(response).to be_successful
    secondary = JSON.parse(response.body)['secondary']
    expect(secondary).to eq []
  end
end
|
||||
|
||||
describe "GET 'help_links'" do
|
||||
it "works" do
|
||||
get 'help_links'
|
||||
|
@ -178,7 +279,7 @@ describe InfoController do
|
|||
|
||||
get 'help_links'
|
||||
links = json_parse(response.body)
|
||||
expect(links.select { |link| link[:text] == 'Ask Your Instructor a Question' }.size).to eq 0
|
||||
expect(links.count { |link| link[:text] == 'Ask Your Instructor a Question' }).to eq 0
|
||||
end
|
||||
end
|
||||
|
||||
|
|
Loading…
Reference in New Issue