switch to redis server side script for failure percent counter
This moves the rolling counter and the math contained in
FailurePercentCounter from multiple redis calls made in ruby to redis
server-side lua scripts. This makes the set of redis calls atomic and
should be more efficient than multiple separate round trips for the
data.

closes CNVS-34236
ref CNVS-34031

test plan:
- Ensure canvas is configured to use redis
- In the rails console do the following:
    Setting.set('service_qatesting_timeout', 1)
    Setting.set('service_qatesting_timeout_protection_method', 'percentage')
    Setting.set('service_qatesting_min_samples', 10)
- The following must be completed in the rails console in under a minute:
    Canvas.timeout_protection('qatesting') { }
    Canvas.timeout_protection('qatesting') { }
    Canvas.timeout_protection('qatesting') { }
    Canvas.timeout_protection('qatesting') { sleep 2 }
    Canvas.timeout_protection('qatesting') { }
    Canvas.timeout_protection('qatesting') { sleep 2 }
    Canvas.timeout_protection('qatesting') { }
    Canvas.timeout_protection('qatesting') { }
    Canvas.timeout_protection('qatesting') { }
    Canvas.timeout_protection('qatesting') { }
    Canvas.timeout_protection('qatesting') { }
    Canvas.redis.ttl("service:timeouts:qatesting:percent_counter:protection_activated")
- Note that after each 'sleep' above an error report of type
  'service_timeout' was generated
- Note that after 10 samples we went into timeout protection, shown by
  the log message "Skipping service call due to error count: qatesting 0.2"
- Note that the Canvas.redis.ttl call returns a number between 0 and 60

Change-Id: Ic04eaab4edb49518e47538feda06dd32a32b49ec
Reviewed-on: https://gerrit.instructure.com/99764
Tested-by: Jenkins
Reviewed-by: Shahbaz Javeed <sjaveed@instructure.com>
Reviewed-by: Spencer Olson <solson@instructure.com>
Reviewed-by: Brian Palmer <brianp@instructure.com>
QA-Review: KC Naegle <knaegle@instructure.com>
Product-Review: Keith T. Garner <kgarner@instructure.com>
parent f011a67d26
commit 080d4e2e26
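For reference, the "0.2" in the test plan's log message is just the rolling-counter arithmetic applied to the calls listed above: two timed-out samples out of ten recorded inside the period. A minimal sketch of that calculation in plain Ruby (illustrative only, not the Canvas implementation; the numbers come straight from the test plan):

    # Illustrative sketch of the failure-rate check, not the Canvas code.
    min_samples = 10   # Setting.set('service_qatesting_min_samples', 10)
    samples     = 10   # calls recorded in the rolling window when the check runs
    failures    = 2    # the two blocks that slept past the 1 second timeout

    failure_rate = samples < min_samples ? 0.0 : failures.fdiv(samples)
    puts failure_rate  # => 0.2, matching "Skipping service call due to error count: qatesting 0.2"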
@@ -25,6 +25,11 @@ module Canvas
       @min_samples = min_samples
     end
 
+    def self.lua
+      @lua ||= ::Redis::Scripting::Module.new(nil,
+        File.join(File.dirname(__FILE__), "failure_percent_counter"))
+    end
+
     def increment_count
       increment(@count_key)
     end
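As an aside on the hunk above: the memoized self.lua accessor presumably loads the *.lua files from the sibling failure_percent_counter directory and exposes each script by name, with the nil first argument apparently deferring the redis connection to the per-call argument. A hedged usage sketch, inferred only from how this diff calls it (key names, the 60 second period, and the min_samples value of 10 are placeholders/assumptions):

    # Hypothetical call sequence mirroring the invocations later in this diff.
    require 'securerandom'

    redis     = Canvas.redis
    count_key = "service:timeouts:qatesting:percent_counter:count" # assumed key name
    fail_key  = "service:timeouts:qatesting:percent_counter:fail"  # assumed key name
    now       = Time.now.utc.to_i

    # Record one sample; the script returns the rolling count for the period.
    Canvas::FailurePercentCounter.lua.run(
      :increment_counter, [count_key], [count_key, now, SecureRandom.uuid, 60], redis)

    # Compute the failure percentage; it comes back as a string, hence .to_f.
    rate = Canvas::FailurePercentCounter.lua.run(
      :failure_rate, [count_key], [fail_key, now, 60, 10], redis).to_f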
@@ -35,32 +40,16 @@ module Canvas
 
     def failure_rate
       now = Time.now.utc.to_i
-      # ideally we'd want to do all the redis calls in a redis.multi
-      # so they are atomic, but canvas uses an abstraction layer that
-      # doesn't expose that
-      count = cleanup_and_get(@count_key, now)
-      failure = cleanup_and_get(@fail_key, now)
-
-      # If our sample size is too small, let's claim total success
-      return 0.0 if count < @min_samples
-      failure.fdiv(count)
+      result = FailurePercentCounter.lua.run(:failure_rate, [@count_key],
+        [@fail_key, now, @period, @min_samples], @redis)
+      result.to_f
     end
 
     private
     def increment(key)
       now = Time.now.utc.to_i
-      # ideally we'd want to do all the redis calls in a redis.multi
-      # so they are atomic, but canvas uses an abstraction layer that
-      # doesn't expose that
-      @redis.zadd(key, now, SecureRandom.uuid)
-      @redis.expire(key, @period.ceil)
-      cleanup_and_get(key, now)
-    end
-
-    def cleanup_and_get(key, now)
-      cleanup_time = now - @period
-      @redis.zremrangebyscore(key, 0, cleanup_time)
-      @redis.zcount(key, cleanup_time, now)
+      FailurePercentCounter.lua.run(:increment_counter, [@count_key],
+        [key, now, SecureRandom.uuid, @period.ceil], @redis)
     end
   end
 end
@@ -0,0 +1,26 @@
+-- This assumes the ordered sets are being used as a rolling counter
+-- and then does math on the size of the two sets
+local count_key = KEYS[1]
+-- Because we have a distributed redis ring, we'll have issues if both
+-- keys are in the key field, so we'll use count, which should always
+-- be bigger as the main key, and pass the fail_key as our first
+-- argument.
+local fail_key = ARGV[1]
+local current_time = tonumber(ARGV[2])
+local period = tonumber(ARGV[3])
+local min_samples = tonumber(ARGV[4])
+
+local cleanup_time = current_time - period
+
+redis.call('ZREMRANGEBYSCORE', count_key, 0, cleanup_time)
+local count = redis.call('ZCOUNT', count_key, cleanup_time, current_time)
+
+if count < min_samples then
+  return '0.0'
+end
+
+redis.call('ZREMRANGEBYSCORE', fail_key, 0, cleanup_time)
+local failures = redis.call('ZCOUNT', fail_key, cleanup_time, current_time)
+
+-- redis converts lua numbers to ints, so we have to return a float as a string
+return tostring(failures / count)
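The KEYS/ARGV split in the new script above is the notable design choice: only the count key rides in KEYS[1], so a distributed redis ring can route the script by a single key, while the fail key travels as a plain argument. Assuming the script lands at failure_percent_counter/failure_rate.lua (a path inferred from the directory passed to Redis::Scripting::Module, not stated in this diff), it could be exercised directly with the stock redis-rb client roughly like this:

    # Hedged sketch: run the failure_rate script with redis-rb directly.
    # File path and key names are assumptions; the KEYS/ARGV layout matches the script.
    require 'redis'

    redis  = Redis.new
    script = File.read("lib/canvas/failure_percent_counter/failure_rate.lua") # assumed path
    now    = Time.now.utc.to_i

    rate = redis.eval(
      script,
      keys: ["myservice:percent_counter:count"],             # count_key = KEYS[1]
      argv: ["myservice:percent_counter:fail", now, 60, 10]) # fail_key, time, period, min_samples
    puts rate.to_f  # "0.0" until the count set holds at least min_samples members

The trade-off the script's own comment hints at is that both sets still need to live on the same node for the two ZCOUNTs to describe the same window of traffic.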
@@ -0,0 +1,12 @@
+-- This implements a rolling counter in redis using ordered sets
+local cache_key = ARGV[1]
+local current_time = tonumber(ARGV[2])
+local random_value = ARGV[3]
+local period = tonumber(ARGV[4])
+
+local cleanup_time = current_time - period
+
+redis.call('ZADD', cache_key, 'NX', current_time, random_value)
+redis.call('EXPIRE', cache_key, period)
+redis.call('ZREMRANGEBYSCORE', cache_key, 0, cleanup_time)
+return redis.call('ZCOUNT', cache_key, cleanup_time, current_time)
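Each run of this second script adds one uniquely named member scored with the current time, trims anything older than the period, and returns whatever is left, so back-to-back increments inside the window count 1, 2, 3 and so on, which is what the existing spec's increment_count expectation relies on. A small sketch with the plain redis-rb client (path and key name are, again, assumptions):

    # Hedged sketch of the rolling-counter behaviour using redis-rb directly.
    require 'redis'
    require 'securerandom'

    redis  = Redis.new
    script = File.read("lib/canvas/failure_percent_counter/increment_counter.lua") # assumed path

    def bump(redis, script)
      redis.eval(script,
                 keys: ["myservice:percent_counter:count"],
                 argv: ["myservice:percent_counter:count",   # cache_key = ARGV[1]
                        Time.now.utc.to_i, SecureRandom.uuid, 60])
    end

    3.times { puts bump(redis, script) }  # => 1, 2, 3 within a 60 second period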
@@ -27,15 +27,6 @@ if Canvas.redis_enabled?
       expect(counter.increment_count).to eq(3)
     end
 
-    it "has a functional rolling counter implementation" do
-      # This four calls in order is what makes a rolling counter
-      Canvas.redis.expects(:zadd)
-      Canvas.redis.expects(:expire)
-      Canvas.redis.expects(:zremrangebyscore)
-      Canvas.redis.expects(:zcount).returns(1)
-      counter.increment_count
-    end
-
     it "increments the failure counter" do
       counter.increment_failure
       expect(counter.increment_failure).to eq(2)