switch to redis server side script for failure percent counter

This movies from using multiple redis calls in ruby to redis server
side lua scripts for the rolling counter and the math contained in
FailurePercentCounter. This provides atomic operation over all the set
of redis calls and should provides efficiency over multiple separate
call for the data.

closes CNVS-34236
ref CNVS-34031

test plan:
 - Ensure canvas is configured to use redis
 - In the rails console do the following:
    Setting.set('service_qatesting_timeout', 1)
    Setting.set('service_qatesting_timeout_protection_method',
                'percentage')
    Setting.set('service_qatesting_min_samples', 10)
 - The following must be completed in the rails console in under a
   minute:
    Canvas.timeout_protection('qatesting') { }
    Canvas.timeout_protection('qatesting') { }
    Canvas.timeout_protection('qatesting') { }
    Canvas.timeout_protection('qatesting') { sleep 2 }
    Canvas.timeout_protection('qatesting') { }
    Canvas.timeout_protection('qatesting') { sleep 2 }
    Canvas.timeout_protection('qatesting') { }
    Canvas.timeout_protection('qatesting') { }
    Canvas.timeout_protection('qatesting') { }
    Canvas.timeout_protection('qatesting') { }
    Canvas.timeout_protection('qatesting') { }
    Canvas.redis.ttl("service:timeouts:qatesting:percent_counter:protection_activated")
 - Note that after each 'sleep' above that an error report of type
   'service_timeout' was generated
 - Note that after 10 samples, we went into timeout protection by
   the log message of "Skipping service call due to error count: qatesting 0.2"
 - Note that the Canvas.redis call returns a number between 0 and 60

Change-Id: Ic04eaab4edb49518e47538feda06dd32a32b49ec
Reviewed-on: https://gerrit.instructure.com/99764
Tested-by: Jenkins
Reviewed-by: Shahbaz Javeed <sjaveed@instructure.com>
Reviewed-by: Spencer Olson <solson@instructure.com>
Reviewed-by: Brian Palmer <brianp@instructure.com>
QA-Review: KC Naegle <knaegle@instructure.com>
Product-Review: Keith T. Garner <kgarner@instructure.com>
This commit is contained in:
Keith Garner 2017-01-17 10:28:27 -06:00 committed by Keith T. Garner
parent f011a67d26
commit 080d4e2e26
4 changed files with 48 additions and 30 deletions

View File

@ -25,6 +25,11 @@ module Canvas
@min_samples = min_samples
end
def self.lua
@lua ||= ::Redis::Scripting::Module.new(nil,
File.join(File.dirname(__FILE__), "failure_percent_counter"))
end
def increment_count
increment(@count_key)
end
@ -35,32 +40,16 @@ module Canvas
def failure_rate
now = Time.now.utc.to_i
# ideally we'd want to do all the redis calls in a redis.multi
# so they are atomic, but canvas uses an abstraction layer that
# doesn't expose that
count = cleanup_and_get(@count_key, now)
failure = cleanup_and_get(@fail_key, now)
# If our sample size is too small, let's claim total success
return 0.0 if count < @min_samples
failure.fdiv(count)
result = FailurePercentCounter.lua.run(:failure_rate, [@count_key],
[@fail_key, now, @period, @min_samples], @redis)
result.to_f
end
private
def increment(key)
now = Time.now.utc.to_i
# ideally we'd want to do all the redis calls in a redis.multi
# so they are atomic, but canvas uses an abstraction layer that
# doesn't expose that
@redis.zadd(key, now, SecureRandom.uuid)
@redis.expire(key, @period.ceil)
cleanup_and_get(key, now)
end
def cleanup_and_get(key, now)
cleanup_time = now - @period
@redis.zremrangebyscore(key, 0, cleanup_time)
@redis.zcount(key, cleanup_time, now)
FailurePercentCounter.lua.run(:increment_counter, [@count_key],
[key, now, SecureRandom.uuid, @period.ceil], @redis)
end
end
end

View File

@ -0,0 +1,26 @@
-- This assumes the ordered sets are being used as a rolling counter
-- and then does math on the size of the two sets
local count_key = KEYS[1]
-- Because we have a distributed redis ring, we'll have issues if both
-- keys are in the key field, so we'll use count, which should always
-- be bigger as the main key, and pass the fail_key as our first
-- argument.
local fail_key = ARGV[1]
local current_time = tonumber(ARGV[2])
local period = tonumber(ARGV[3])
local min_samples = tonumber(ARGV[4])
local cleanup_time = current_time - period
redis.call('ZREMRANGEBYSCORE', count_key, 0, cleanup_time)
local count = redis.call('ZCOUNT', count_key, cleanup_time, current_time)
if count < min_samples then
return '0.0'
end
redis.call('ZREMRANGEBYSCORE', fail_key, 0, cleanup_time)
local failures = redis.call('ZCOUNT', fail_key, cleanup_time, current_time)
-- redis converts lua numbers to ints, so we have to return a float as a string
return tostring(failures / count)

View File

@ -0,0 +1,12 @@
-- This implements a rolling counter in redis using ordered sets
local cache_key = ARGV[1]
local current_time = tonumber(ARGV[2])
local random_value = ARGV[3]
local period = tonumber(ARGV[4])
local cleanup_time = current_time - period
redis.call('ZADD', cache_key, 'NX', current_time, random_value)
redis.call('EXPIRE', cache_key, period)
redis.call('ZREMRANGEBYSCORE', cache_key, 0, cleanup_time)
return redis.call('ZCOUNT', cache_key, cleanup_time, current_time)

View File

@ -27,15 +27,6 @@ if Canvas.redis_enabled?
expect(counter.increment_count).to eq(3)
end
it "has a functional rolling counter implementation" do
# This four calls in order is what makes a rolling counter
Canvas.redis.expects(:zadd)
Canvas.redis.expects(:expire)
Canvas.redis.expects(:zremrangebyscore)
Canvas.redis.expects(:zcount).returns(1)
counter.increment_count
end
it "increments the failure counter" do
counter.increment_failure
expect(counter.increment_failure).to eq(2)