add deep critical and secondary health checks

includes:
 • timeout protection (defaults to 5 seconds)
 • the ability to run non-blocking component checks for the
   /deep endpoint
 • caches the deep component check results for 60 seconds to
   reduce the request load placed on dependent services

closes FOO-2355
closes FOO-2356
closes FOO-2369

flag = none

test plan:
 • navigate to /readiness & /deep endpoints
 • verify json response is returned
 • verify json response object conforms to:
   • https://app.swaggerhub.com/apis/mycargus/health-checks-api/

Change-Id: Ie12b9260e91de6c3d18f25cd4a81cbfa7ebefcb4
Reviewed-on: https://gerrit.instructure.com/c/canvas-lms/+/273517
Tested-by: Service Cloud Jenkins <svc.cloudjenkins@instructure.com>
Reviewed-by: Simon Williams <simon@instructure.com>
Reviewed-by: Jacob Burroughs <jburroughs@instructure.com>
QA-Review: August Thornton <august@instructure.com>
Product-Review: August Thornton <august@instructure.com>
This commit is contained in:
August Thornton 2021-09-14 08:29:29 -06:00
parent e9b03d7a31
commit 30f4f304c1
3 changed files with 253 additions and 90 deletions

View File

@ -19,16 +19,16 @@
#
class InfoController < ApplicationController
skip_before_action :load_account, :only => [:health_check, :readiness]
skip_before_action :load_user, :only => [:health_check, :readiness, :browserconfig]
skip_before_action :load_account, :only => [:health_check, :readiness, :deep]
skip_before_action :load_user, :only => [:health_check, :readiness, :deep, :browserconfig]
def styleguide
render :layout => "layouts/styleguide"
end
def message_redirect
m = AssetSignature.find_by_signature(Message, params[:id])
if m && m.url
m = AssetSignature.find_by(Message, params[:id])
if m&.url
redirect_to m.url
else
redirect_to "http://#{HostUrl.default_host}/"
@ -37,7 +37,7 @@ class InfoController < ApplicationController
def help_links
current_user_roles = @current_user.try(:roles, @domain_root_account) || []
links = @domain_root_account && @domain_root_account.help_links
links = @domain_root_account&.help_links
links = links.select do |link|
available_to = link[:available_to] || []
@ -98,76 +98,6 @@ class InfoController < ApplicationController
end
end
def readiness
# This action provides a clear signal for assessing system components that are "owned"
# by Canvas and are ultimately responsible for being alive and able to serve consumer traffic
#
# Readiness Checks
#
# returns a PrefixProxy instance, treated as truthy
consul = -> { DynamicSettings.find(tree: :private)[:readiness].nil? }
# ensures brandable_css_bundles_with_deps exists, returns a string (path), treated as truthy
css = -> { css_url_for("common") }
# returns the value of the block <integer>, treated as truthy
filesystem = -> do
Tempfile.open('readiness', ENV['TMPDIR'] || Dir.tmpdir) { |f| f.write('readiness') }
end
# returns a boolean
jobs = -> { Delayed::Job.connection.active? }
# ensures webpack worked; returns a string, treated as truthy
js = -> { ActionController::Base.helpers.javascript_url("#{js_base_url}/common") }
# returns a boolean
postgres = -> { Account.connection.active? }
# nil response treated as truthy
redis = -> { MultiCache.cache.fetch('readiness').nil? }
# ensures `gulp rev` has ran; returns a string, treated as truthy
rev_manifest = -> { Canvas::Cdn::RevManifest.gulp_manifest.values.first }
# ensures we retrieved something back from Vault; returns a boolean
vault = -> { !Canvas::Vault.read("#{Canvas::Vault.kv_mount}/data/secrets").nil? }
components = {
common_css: readiness_check(css),
common_js: readiness_check(js),
consul: readiness_check(consul),
filesystem: readiness_check(filesystem),
jobs: readiness_check(jobs),
postgresql: readiness_check(postgres),
redis: readiness_check(redis),
rev_manifest: readiness_check(rev_manifest),
vault: readiness_check(vault)
}
failed = components.reject { |_k, v| v[:status] }.map(&:first)
render_readiness_json(components, failed.any? ? 503 : 200)
end
def readiness_check(component)
begin
status = false
time = Benchmark.ms { status = component.call }
rescue => e
Canvas::Errors.capture_exception(:readiness, e, :error)
end
{ time: time, status: status }
end
def render_readiness_json(components, status_code)
render json: {
status: status_code,
components:
components.map do |k, v|
name = k
status = v[:status] ? 200 : 503
time = v[:time]
{ 'name' => name, 'status' => status, 'response_time_ms' => time }
end
},
status: status_code
end
private :readiness_check, :render_readiness_json
# for windows live tiles
def browserconfig
cancel_cache_buster
@ -184,14 +114,14 @@ class InfoController < ApplicationController
@needs_cookies = true if params[:reason] == 'needs_cookies'
return render_unauthorized_action
when 422
raise ActionController::InvalidAuthenticityToken.new('test_error')
raise ActionController::InvalidAuthenticityToken, 'test_error'
else
@not_found_message = '(test_error message details)' if params[:message].present?
raise RequestError.new('test_error', params[:status].to_i)
end
end
render status: 404, template: "shared/errors/404_message"
render status: :not_found, template: "shared/errors/404_message"
end
def web_app_manifest
@ -229,4 +159,134 @@ class InfoController < ApplicationController
display: "minimal-ui"
}
end
def readiness(is_deep_check: false)
# This action provides a clear signal for assessing system components that are "owned"
# by Canvas and are ultimately responsible for being alive and able to serve consumer traffic
#
# Readiness Checks
#
check = ->(&proc) { component_check(proc, is_deep_check) }
components = {
# ensures brandable_css_bundles_with_deps exists, returns a string (path), treated as truthy
common_css: check.call { css_url_for('common') },
# ensures webpack worked; returns a string, treated as truthy
common_js: check.call do
ActionController::Base.helpers.javascript_url("#{js_base_url}/common")
end,
# returns a PrefixProxy instance, treated as truthy
consul: check.call { DynamicSettings.find(tree: :private)[:readiness].nil? },
# returns the value of the block <integer>, treated as truthy
filesystem: check.call do
Tempfile.open('readiness', ENV['TMPDIR'] || Dir.tmpdir) { |f| f.write('readiness') }
end,
# returns a boolean
jobs: check.call { Delayed::Job.connection.active? },
# returns a boolean
postgresql: check.call { Account.connection.active? },
# nil response treated as truthy
redis: check.call { MultiCache.cache.fetch('readiness').nil? },
# ensures `gulp rev` has ran; returns a string, treated as truthy
rev_manifest: check.call { Canvas::Cdn::RevManifest.gulp_manifest.values.first },
# ensures we retrieved something back from Vault; returns a boolean
vault: check.call { !Canvas::Vault.read("#{Canvas::Vault.kv_mount}/data/secrets").nil? }
}
failed = components.reject { |_k, v| v[:status] }.map(&:first)
render_readiness_json(components, failed.any? ? 503 : 200, is_deep_check)
end
def deep
# This action provides a clear signal for assessing our critical and secondary dependencies
# such that we can successfully complete consumer requests
#
# Deep Checks
#
deep_check =
Rails.cache.fetch(:deep_health_check, expires_in: 60.seconds) do
check = ->(&proc) do
thread = Thread.new do
Thread.current.report_on_exception = false
proc.call
end
component_check(thread, true)
end
critical_checks = {
default_shard: check.call { Shard.connection.active? }
}
secondary_checks = {} # can be manually or conditionally added
if Canvadocs.enabled?
secondary_checks[:canvadocs] = check.call do
CanvasHttp
.get(URI.join(Canvadocs.config['base_url'], '/readiness').to_s)
.is_a?(Net::HTTPSuccess)
end
end
if PageView.pv4?
secondary_checks[:pv4] = check.call do
CanvasHttp
.get(URI.join(ConfigFile.load('pv4')['uri'], '/health_check').to_s)
.is_a?(Net::HTTPSuccess)
end
end
{ critical: critical_checks, secondary: secondary_checks }
end
failed = deep_check[:critical].reject { |_k, v| v[:status] }.map(&:first)
render_deep_json(deep_check[:critical], deep_check[:secondary], failed.any? ? 503 : 200)
end
private
def component_check(component, is_deep_check)
status = false
message = 'service is up'
exception_type = is_deep_check ? :deep_health_check : :readiness_health_check
timeout = Setting.get('healthcheck_timelimit', 5.seconds.to_s).to_f
response_time_ms =
Benchmark.ms do
Timeout.timeout(timeout, Timeout::Error) do
status = component.is_a?(Thread) ? component.value : component.call
end
rescue Timeout::Error => e
message = e.message
Canvas::Errors.capture_exception(exception_type, e.message, :warn)
rescue => e
message = e.message
Canvas::Errors.capture_exception(exception_type, e, :error)
end
{ status: status, message: message, time: response_time_ms }
end
def render_readiness_json(components, status_code, is_deep_check)
readiness_json = { status: status_code, components: components_to_hash(components) }
return readiness_json if is_deep_check
render json: readiness_json, status: status_code
end
def render_deep_json(critical, secondary, status_code)
readiness_response = readiness(is_deep_check: true)
status = readiness_response[:status] == 503 ? readiness_response[:status] : status_code
render json: {
status: status,
readiness: readiness_response,
critical: components_to_hash(critical),
secondary: components_to_hash(secondary)
}, status: status
end
def components_to_hash(components)
components.map do |name, value|
status = value[:status] ? 200 : 503
message = value[:message]
time = value[:time]
{ name: name, status: status, message: message, response_time_ms: time }
end
end
end

View File

@ -934,6 +934,8 @@ CanvasRails::Application.routes.draw do
get 'health_check' => 'info#health_check'
get 'health_prognosis' => 'info#health_prognosis'
get 'readiness' => 'info#readiness'
get 'deep' => 'info#deep'
get 'web-app-manifest/manifest.json' => 'info#web_app_manifest'
get 'browserconfig.xml', to: 'info#browserconfig', defaults: { format: 'xml' }

View File

@ -80,11 +80,14 @@ describe InfoController do
end
describe "GET 'readiness'" do
it 'responds with 200 if all system components are alive and serving' do
before(:each) do
allow(Account.connection).to receive(:active?).and_return(true)
allow(MultiCache.cache).to receive(:fetch).and_call_original
allow(MultiCache.cache).to receive(:fetch).with('readiness').and_return(nil)
allow(Delayed::Job.connection).to receive(:active?).and_return(true)
end
it 'responds with 200 if all system components are alive and serving' do
get 'readiness'
expect(response).to be_successful
json = JSON.parse(response.body)
@ -92,9 +95,6 @@ describe InfoController do
end
it 'responds with 503 if a system component is considered down' do
allow(Account.connection).to receive(:active?).and_return(true)
allow(MultiCache.cache).to receive(:fetch).and_call_original
allow(MultiCache.cache).to receive(:fetch).with('readiness').and_return(nil)
allow(Delayed::Job.connection).to receive(:active?).and_return(false)
get 'readiness'
expect(response.code).to eq '503'
@ -102,11 +102,8 @@ describe InfoController do
expect(json['status']).to eq 503
end
it 'catches any exceptions thrown and log them as errors' do
allow(Account.connection).to receive(:active?).and_return(true)
allow(MultiCache.cache).to receive(:fetch).and_call_original
it 'catchs any exceptions thrown and logs them as errors' do
allow(MultiCache.cache).to receive(:fetch).with('readiness').and_raise(Redis::TimeoutError)
allow(Delayed::Job.connection).to receive(:active?).and_return(true)
expect(Canvas::Errors).to receive(:capture_exception).once
get 'readiness'
expect(response.code).to eq '503'
@ -115,11 +112,18 @@ describe InfoController do
expect(redis['status']).to eq 503
end
it 'catchs any timeouts thrown and logs them as warnings' do
allow(Timeout).to receive(:timeout).and_raise(Timeout::Error)
expect(Canvas::Errors).to receive(:capture_exception)
.at_least(:once)
.with(:readiness_health_check, 'Timeout::Error', :warn)
get 'readiness'
expect(response.code).to eq '503'
json = JSON.parse(response.body)
expect(json['status']).to eq 503
end
it 'returns all dependent system components in json response' do
allow(Account.connection).to receive(:active?).and_return(true)
allow(MultiCache.cache).to receive(:fetch).and_call_original
allow(MultiCache.cache).to receive(:fetch).with('readiness').and_return(nil)
allow(Delayed::Job.connection).to receive(:active?).and_return(true)
get 'readiness'
expect(response).to be_successful
components = JSON.parse(response.body)['components']
@ -130,6 +134,103 @@ describe InfoController do
end
end
describe "GET 'deep'" do
let(:canvas_http) { class_double(CanvasHttp) }
before(:each) do
allow(Account.connection).to receive(:active?).and_return(true)
allow(MultiCache.cache).to receive(:fetch).and_call_original
allow(MultiCache.cache).to receive(:fetch).with('readiness').and_return(nil)
allow(Delayed::Job.connection).to receive(:active?).and_return(true)
allow(Shard.connection).to receive(:active?).and_return(true)
allow(Canvadocs).to receive(:enabled?).and_return(true)
allow(PageView).to receive(:pv4?).and_return(true)
allow(Canvadocs).to receive(:config)
.and_return({ 'base_url' => 'https://canvadocs.instructure.com/' })
allow(ConfigFile).to receive(:load).and_call_original
allow(ConfigFile).to receive(:load)
.with('pv4').and_return({ 'uri' => 'https://pv4.instructure.com/api/123/' })
end
it 'renders readiness check within json response' do
get 'deep'
expect(response).to be_successful
json = JSON.parse(response.body)
expect(json).to have_key('readiness')
expect(json['readiness']['components'].count).to be > 0
end
it 'responds with 503 if a readiness system component is considered down' do
allow(Delayed::Job.connection).to receive(:active?).and_return(false)
get 'deep'
expect(response.code).to eq '503'
json = JSON.parse(response.body)
expect(json['status']).to eq 503
end
it 'returns 503 if critical dependency check fails and readiness response is 200' do
allow(Shard.connection).to receive(:active?).and_return(false)
get 'deep'
expect(response.code).to eq '503'
json = JSON.parse(response.body)
expect(json['status']).to eq 503
end
it 'catches any secondary dependency check exceptions without failing the deep check' do
allow(CanvasHttp).to receive(:get).and_raise(Timeout::Error)
expect(Canvas::Errors).to receive(:capture_exception)
.at_least(:twice)
.with(:deep_health_check, 'Timeout::Error', :warn)
get 'deep'
expect(response.code).to eq '200'
secondary = JSON.parse(response.body)['secondary']
canvadocs = secondary.find { |c| c['name'] == 'canvadocs' }
expect(canvadocs['status']).to eq 503
end
it 'catches any timeouts thrown and logs them as warnings' do
allow(Timeout).to receive(:timeout).and_raise(Timeout::Error)
expect(Canvas::Errors).to receive(:capture_exception)
.at_least(:once)
.with(:deep_health_check, 'Timeout::Error', :warn)
get 'deep'
expect(response.code).to eq '503'
json = JSON.parse(response.body)
expect(json['status']).to eq 503
end
it 'returns critical dependencies in json response' do
get 'deep'
expect(response).to be_successful
critical = JSON.parse(response.body)['critical']
critical.each do |dep|
expect(dep['name']).to be_truthy
expect(dep['status']).to eq 200
end
end
it 'returns secondary dependencies in json response' do
allow(canvas_http).to receive(:get).and_return(Net::HTTPSuccess)
get 'deep'
expect(response).to be_successful
secondary = JSON.parse(response.body)['secondary']
secondary.each do |dep|
expect(dep['name']).to be_truthy
expect(dep['status']).to eq 200
end
end
it 'returns secondary dependencies in json response only if enabled' do
allow(Canvadocs).to receive(:enabled?).and_return(false)
allow(PageView).to receive(:pv4?).and_return(false)
allow(canvas_http).to receive(:get).and_return(Net::HTTPSuccess)
get 'deep'
expect(response).to be_successful
secondary = JSON.parse(response.body)['secondary']
expect(secondary).to eq []
end
end
describe "GET 'help_links'" do
it "works" do
get 'help_links'
@ -178,7 +279,7 @@ describe InfoController do
get 'help_links'
links = json_parse(response.body)
expect(links.select { |link| link[:text] == 'Ask Your Instructor a Question' }.size).to eq 0
expect(links.count { |link| link[:text] == 'Ask Your Instructor a Question' }).to eq 0
end
end