MSFT Sync: count retries per-step, and backoff

refs INTEROP-6638
flag=microsoft_group_enrollments_syncing

Test plan:
- Run a new sync (make sure a group doesn't exist for your course on the
  MS side, and use mygroup.syncer_job.run_later. NOTE: after deleting a
  group it takes a couple seconds for this change to settle so you may
  want to wait a few seconds after deleting so the new sync job doesn't
  find the old group)
- While the job is sleeping/retrying, check the state and make sure
  retries_by_step is being set for whichever step is is retrying on
- Edit GraphService#team_exists? and add
  "raise MicrosoftSync::Errors::GroupHasNoOwners"
- Run another sync and make sure it is scheduling a retry for 30 seconds
  then 90, then 270. (You should be able to watch the job logs and see
  the run_at time for the job it schedules -- look for a blue INSERT
  block with 'INSERT INTO "public"."delayed_jobs"', 'object:
  !ruby/object:MicrosoftSync::StateMachineJob', and timestamps at the
  end)

Change-Id: Ic7d7f69de9e24a140daf41c54a6409b0a58f5334
Reviewed-on: https://gerrit.instructure.com/c/canvas-lms/+/262954
Tested-by: Service Cloud Jenkins <svc.cloudjenkins@instructure.com>
Reviewed-by: Wagner Goncalves <wagner.goncalves@instructure.com>
QA-Review: Wagner Goncalves <wagner.goncalves@instructure.com>
Product-Review: Evan Battaglia <ebattaglia@instructure.com>
This commit is contained in:
Evan Battaglia 2021-04-14 19:29:20 -04:00
parent a0624c197c
commit 8e2eb69e49
4 changed files with 116 additions and 38 deletions

View File

@ -186,9 +186,9 @@ module MicrosoftSync
memory_state = nil
loop do
# TODO: consider checking if group object is deleted before every step (INTEROP-6621)
log { "running step #{current_step}" }
begin
# TODO: consider checking if group object is deleted before every step (INTEROP-6621)
log { "running step #{current_step}" }
result = steps_object.send(current_step.to_sym, memory_state, job_state_data)
rescue => e
update_state_record_to_errored_and_cleanup(e)
@ -211,9 +211,7 @@ module MicrosoftSync
current_step, memory_state = result.next_step, result.memory_state
job_state_data = nil
when Retry
if update_state_record_to_retrying(result, current_step)
run_with_delay(current_step, result.delay_amount, synchronous: synchronous)
end
handle_retry(result, current_step, synchronous)
return
else
raise InternalError, "Step returned #{result.inspect}, expected COMPLETE/NextStep/Retry"
@ -236,7 +234,7 @@ module MicrosoftSync
return
end
self.delay(strand: strand, run_at: delay_amount&.from_now).run(step)
self.delay(strand: strand, run_at: delay_amount&.seconds&.from_now).run(step)
end
def update_state_record_to_errored_and_cleanup(error)
@ -247,13 +245,16 @@ module MicrosoftSync
steps_object.after_failure
end
# Returns false if workflow_state has since been set to deleted (so we
# should stop)
# Raises the error if we have passed the retry limit
# Otherwise sets the job_state to keep track of (step, data, retries)
def update_state_record_to_retrying(retry_object, current_step)
# Does nothing if workflow_state has since been set to deleted
# Otherwise sets the job_state to keep track of (step, data, retries) and
# kicks off a retry
def handle_retry(retry_object, current_step, synchronous)
job_state = job_state_record.reload.job_state
retries = job_state&.dig(:retries) || 0
retries_by_step = job_state&.dig(:retries_by_step) || {}
retries = retries_by_step[current_step.to_s] || 0
if retries >= steps_object.max_retries
update_state_record_to_errored_and_cleanup(retry_object.error)
raise retry_object.error
@ -265,14 +266,20 @@ module MicrosoftSync
step: current_step,
updated_at: Time.zone.now,
data: retry_object.job_state_data,
retries: retries + 1,
retries_by_step: retries_by_step.merge(current_step.to_s => retries + 1),
# for debugging only:
retried_on_error: "#{retry_object.error.class}: #{retry_object.error.message}",
}
job_state_record&.update_unless_deleted(
return unless job_state_record&.update_unless_deleted(
workflow_state: :retrying, job_state: new_job_state
)
delay_amount = retry_object.delay_amount
delay_amount = delay_amount[retries] || delay_amount.last if delay_amount.is_a?(Array)
log { "handle_retry #{current_step} - #{delay_amount}" }
run_with_delay(current_step, delay_amount, synchronous: synchronous)
end
end
end

View File

@ -47,7 +47,7 @@ module MicrosoftSync
end
def max_retries
4
3
end
def restart_job_after_inactivity
@ -106,7 +106,7 @@ module MicrosoftSync
group.update! ms_group_id: group_id
StateMachineJob::NextStep.new(:step_ensure_enrollments_user_mappings_filled)
rescue Errors::HTTPNotFound => e
StateMachineJob::Retry.new(error: e, delay_amount: 45.seconds, job_state_data: group_id)
StateMachineJob::Retry.new(error: e, delay_amount: [5, 20, 100], job_state_data: group_id)
end
# Gets users enrolled in course, get UPNs (e.g. email addresses) for them,
@ -128,11 +128,12 @@ module MicrosoftSync
end
StateMachineJob::NextStep.new(:step_generate_diff)
rescue Errors::HTTPNotFound => e
StateMachineJob::Retry.new(error: e, delay_amount: [5, 20, 100])
end
# Get group members/owners from the API and local enrollments and calculate
# what needs to be done. This could also be combined with execute_diff()
# but is conceptually different and makes testing easier.
# what needs to be done.
def step_generate_diff(_mem_data, _job_state_data)
members = graph_service_helpers.get_group_users_aad_ids(group.ms_group_id)
owners = graph_service_helpers.get_group_users_aad_ids(group.ms_group_id, owners: true)
@ -143,6 +144,8 @@ module MicrosoftSync
end
StateMachineJob::NextStep.new(:step_execute_diff, diff)
rescue Errors::HTTPNotFound => e
StateMachineJob::Retry.new(error: e, delay_amount: [5, 20, 100])
end
# Run the API calls to add/remove users.
@ -183,7 +186,7 @@ module MicrosoftSync
# enrollments are in the DB) since we last calculated the diff added them
# in the generate_diff step. This is rare, but we can also sleep in that
# case.
StateMachineJob::Retry.new(error: e, delay_amount: 1.minute)
StateMachineJob::Retry.new(error: e, delay_amount: [30, 90, 270])
end
def tenant

View File

@ -34,7 +34,7 @@ module MicrosoftSync
end
def max_retries
3
4
end
def restart_job_after_inactivity
@ -87,8 +87,20 @@ module MicrosoftSync
end
class StateMachineJobTestSteps2 < StateMachineJobTestStepsBase
def initialize(step_first_retries)
@step_first_retries = step_first_retries
end
def step_first(_mem_data, _job_state_data)
StateMachineJob::Retry.new(error: Errors::PublicError.new('foo')) { steps_run << :stash }
if (@step_first_retries -= 1) >= 0
StateMachineJob::Retry.new(error: Errors::PublicError.new('foo')) { steps_run << :stash }
else
StateMachineJob::NextStep.new(:step_second)
end
end
def step_second(_mem_data, _job_state_data)
StateMachineJob::Retry.new(error: Errors::PublicError.new('foo'), delay_amount: [1,2,3])
end
end
@ -188,22 +200,21 @@ module MicrosoftSync
end
describe 'retry counting' do
let(:steps_object) { StateMachineJobTestSteps2.new }
let(:steps_object) { StateMachineJobTestSteps2.new(5) }
it 'counts retries and stores in job_state' do
it 'counts retries for each step and stores in job_state' do
subject.send(:run, nil)
expect(state_record.reload.job_state[:retries]).to eq(1)
expect(state_record.reload.job_state[:retries_by_step]['step_first']).to eq(1)
subject.send(:run, :step_first)
expect(state_record.reload.job_state[:retries]).to eq(2)
expect(state_record.reload.job_state[:retries_by_step]['step_first']).to eq(2)
subject.send(:run, :step_first)
expect(state_record.reload.job_state[:retries]).to eq(3)
expect(state_record.reload.job_state[:retries_by_step]['step_first']).to eq(3)
end
context 'when the number of retries is exceeded' do
context 'when the number of retries for a step is exceeded' do
before do
subject.send(:run, nil)
subject.send(:run, :step_first)
subject.send(:run, :step_first)
3.times { subject.send(:run, :step_first) }
end
it 're-raises the error and sets the record state to errored' do
@ -216,11 +227,44 @@ module MicrosoftSync
it "doesn't run the stash block on the last failure" do
expect { subject.send(:run, :step_first) }.to raise_error(Errors::PublicError, 'foo')
expect(steps_object.steps_run.count(:stash)).to eq(3)
expect(steps_object.steps_run.count(:stash)).to eq(4)
steps_object.steps_run.clear
expect(steps_object.steps_run).to be_empty
end
end
context 'when multiple steps fail' do
let(:steps_object) { StateMachineJobTestSteps2.new(2) }
before do
subject.send(:run, nil)
2.times { subject.send(:run, :step_first) }
3.times { subject.send(:run, :step_second) }
end
it 'counts retries per-step' do
expect { subject.send(:run, :step_second) }.to raise_error(Errors::PublicError, 'foo')
expect(state_record.reload.job_state).to eq(nil)
expect(state_record.workflow_state).to eq('errored')
expect(state_record.last_error).to \
eq(Errors.user_facing_message(Errors::PublicError.new('foo')))
end
context 'when delay is an array of integers' do
it 'uses delays based on the per-step retry count' do
delays = steps_object.steps_run.select{|step| step.is_a?(Array)}
expect(delays).to eq([
[:delay_run, [{run_at: nil, strand: strand}], [:step_first]],
[:delay_run, [{run_at: nil, strand: strand}], [:step_first]],
[:delay_run, [{run_at: 1.second.from_now, strand: strand}], [:step_second]],
[:delay_run, [{run_at: 2.seconds.from_now, strand: strand}], [:step_second]],
[:delay_run, [{run_at: 3.seconds.from_now, strand: strand}], [:step_second]],
# Uses last value once past end of array:
[:delay_run, [{run_at: 3.seconds.from_now, strand: strand}], [:step_second]],
])
end
end
end
end
context 'when an unhandled error occurs' do
@ -333,14 +377,14 @@ module MicrosoftSync
it 'restarts the job (in-progress job has stalled)' do
Timecop.travel((6.hours + 1.second).from_now)
expect(state_record.job_state[:step]).to eq(:step_first)
expect(state_record.job_state[:retries]).to eq(2)
expect(state_record.job_state[:retries_by_step]['step_first']).to eq(2)
expect(steps_object).to receive(:step_first) do
expect(state_record.job_state).to eq(nil)
expect(state_record.workflow_state).to eq('running')
described_class::Retry.new(error: StandardError.new)
end
expect { subject.send(:run, nil) }.to change{ state_record.job_state[:updated_at] }
expect(state_record.job_state[:retries]).to eq(1)
expect(state_record.job_state[:retries_by_step]['step_first']).to eq(1)
end
end

View File

@ -41,7 +41,10 @@ describe MicrosoftSync::SyncerSteps do
expect(result).to be_a(MicrosoftSync::StateMachineJob::Retry)
expect(result.error.class).to eq(error_class)
expect(result.delay_amount).to eq(delay_amount)
expect(result.delay_amount.to_i).to be < syncer_steps.restart_job_after_inactivity.to_i
# Check that we haven't specified any delays to big for our restart_job_after_inactivity
[result.delay_amount].flatten.each do |delay|
expect(delay.to_i).to be < syncer_steps.restart_job_after_inactivity.to_i
end
expect(result.job_state_data).to eq(job_state_data)
end
@ -75,7 +78,7 @@ describe MicrosoftSync::SyncerSteps do
describe '#max_retries' do
subject { syncer_steps.max_retries }
it { is_expected.to eq(4) }
it { is_expected.to eq(3) }
end
describe '#after_failure' do
@ -206,7 +209,7 @@ describe MicrosoftSync::SyncerSteps do
expect { subject }.to_not change { group.reload.ms_group_id }
expect_retry(
subject, error_class: MicrosoftSync::Errors::HTTPNotFound,
delay_amount: 45.seconds, job_state_data: 'newid'
delay_amount: [5, 20, 100], job_state_data: 'newid'
)
end
end
@ -314,6 +317,16 @@ describe MicrosoftSync::SyncerSteps do
expect(upns_looked_up).to include("student1@example.com")
end
end
context 'on 404' do
it 'retries with a delay' do
expect(graph_service_helpers).to receive(:users_upns_to_aads).and_raise(new_http_error(404))
expect_retry(
subject, error_class: MicrosoftSync::Errors::HTTPNotFound,
delay_amount: [5, 20, 100]
)
end
end
end
describe '#step_generate_diff' do
@ -344,6 +357,17 @@ describe MicrosoftSync::SyncerSteps do
expect_next_step(subject, :step_execute_diff, diff)
expect(members_and_enrollment_types).to eq([['0', 'TeacherEnrollment']])
end
context 'on 404' do
it 'retries with a delay' do
expect(graph_service_helpers).to receive(:get_group_users_aad_ids)
.and_raise(new_http_error(404))
expect_retry(
subject, error_class: MicrosoftSync::Errors::HTTPNotFound,
delay_amount: [5, 20, 100]
)
end
end
end
describe '#execute_diff' do
@ -409,25 +433,25 @@ describe MicrosoftSync::SyncerSteps do
end
context 'when the Microsoft API errors with "group has no owners"' do
it "retries in 1 minute" do
it "retries in (30, 90, 270 seconds)" do
expect(graph_service).to receive(:create_education_class_team).with('mygroupid').
and_raise(MicrosoftSync::Errors::GroupHasNoOwners)
expect_retry(
subject,
error_class: MicrosoftSync::Errors::GroupHasNoOwners,
delay_amount: 1.minute
delay_amount: [30, 90, 270]
)
end
end
context "when the Microsoft API errors with a 404 (e.g., group doesn't exist)" do
it "retries in 1 minute" do
it "retries in (30, 90, 270 seconds)" do
expect(graph_service).to \
receive(:create_education_class_team).with('mygroupid').and_raise(new_http_error(404))
expect_retry(
subject,
error_class: MicrosoftSync::Errors::HTTPNotFound,
delay_amount: 1.minute
delay_amount: [30, 90, 270]
)
end
end