MSFT Sync: wait to run steps likely to fail

* Don't try to update group properties immediately after creation, since
  it is likely to fail until the Microsoft API settles
* Don't try to create a team the immediately after group creation, since
  it is also likely to fail until the Microsoft API creates

refs INTEROP-6638
flag=microsoft_group_enrollments_syncing

Test plan:
- Delete existing groups on Microsoft side and run a syncer job. Watch
  logs to see that:
  1. it enqueues a job for step_ensure_class_group_exists (delay of 2
     seconds) before ever hitting the PATCH endpoint (you can see
     endpoints hit with logs looking like "MicrosoftSync::GraphClient:
     get ..."), and that when it does so, it updates the group to have
     workflow_state=retrying and job_state with empty retries_by_step
  2. it enqueues a job for step_create_team (delay of 10 seconds) before
     ever hitting the endpoint to create the team (post
     https://graph.microsoft.com/v1.0/teams)
- After it's done, check that the job completed successfully and a group
  and team were created on the Microsoft side.
- From a console run
  group.syncer_job.steps_object.step_check_team_exists(nil, nil)
  again and make sure it returns COMPLETE.
- From a console run
  group.syncer_job.steps_object.step_create_team(nil, nil)
  again and make sure it returns COMPLETE.
- For completeness, run a sync job on the same group again to make sure
  a sync after completion still works. This time,
  step_update_group_with_course_data and step_create_team should never
  run. If you want, you run the job synchronously to see the steps easier.

Change-Id: Ic2b697dd7cc0d1bbf75ad55d3593ee64975e3ab9
Reviewed-on: https://gerrit.instructure.com/c/canvas-lms/+/263012
Product-Review: Evan Battaglia <ebattaglia@instructure.com>
Reviewed-by: Wagner Goncalves <wagner.goncalves@instructure.com>
QA-Review: Wagner Goncalves <wagner.goncalves@instructure.com>
Tested-by: Service Cloud Jenkins <svc.cloudjenkins@instructure.com>
This commit is contained in:
Evan Battaglia 2021-04-15 14:27:32 -04:00
parent 4df7b6b1e9
commit 0baf7b10ab
7 changed files with 203 additions and 114 deletions

View File

@ -44,6 +44,7 @@ module MicrosoftSync
class InvalidRemoteState < PublicError; end
class GroupHasNoOwners < PublicError; end
class TeamAlreadyExists < PublicError; end
# Makes public the status code but not anything about the response body.
# The internal error message has the response body (truncated)
@ -57,7 +58,8 @@ module MicrosoftSync
def self.subclasses_by_status_code
@subclasses_by_status_code ||= {
400 => HTTPBadRequest,
404 => HTTPNotFound
404 => HTTPNotFound,
409 => HTTPConflict,
}
end
@ -76,5 +78,6 @@ module MicrosoftSync
class HTTPNotFound < HTTPInvalidStatus; end
class HTTPBadRequest < HTTPInvalidStatus; end
class HTTPConflict < HTTPInvalidStatus; end
end
end

View File

@ -118,11 +118,13 @@ module MicrosoftSync
}
request(:post, 'teams', body: body)
rescue MicrosoftSync::Errors::HTTPBadRequest => e
if e.response_body =~ /must have one or more owners in order to create a Team/
raise MicrosoftSync::Errors::GroupHasNoOwners
end
raise unless e.response_body =~ /must have one or more owners in order to create a Team/i
raise
raise MicrosoftSync::Errors::GroupHasNoOwners
rescue MicrosoftSync::Errors::HTTPConflict => e
raise unless e.response_body =~ /group is already provisioned/i
raise MicrosoftSync::Errors::TeamAlreadyExists
end
# === Users ===

View File

@ -92,13 +92,22 @@ module MicrosoftSync
# Signals that this step of the job succeeded.
# memory_state is passed into the next step.
class NextStep
attr_reader :next_step, :memory_state
def initialize(next_step, memory_state=nil)
@next_step = next_step or raise InternalError
attr_reader :step, :memory_state
def initialize(step, memory_state=nil)
@step = step or raise InternalError
@memory_state = memory_state
end
end
class DelayedNextStep
attr_reader :step, :delay_amount, :job_state_data
def initialize(step, delay_amount, job_state_data=nil)
@step = step or raise InternalError
@delay_amount = delay_amount
@job_state_data = job_state_data
end
end
class Complete; end
# Return this when your job is done:
@ -152,7 +161,7 @@ module MicrosoftSync
step_from_job_state = job_state&.dig(:step)
# Normal case: job continuation, or new job (step==nil) and no other job in-progress
if step == step_from_job_state
if step&.to_s == step_from_job_state&.to_s
return run_main_loop(step, job_state&.dig(:data), synchronous)
end
@ -208,8 +217,11 @@ module MicrosoftSync
steps_object.after_complete
return
when NextStep
current_step, memory_state = result.next_step, result.memory_state
current_step, memory_state = result.step, result.memory_state
job_state_data = nil
when DelayedNextStep
handle_delayed_next_step(result, synchronous)
return
when Retry
handle_retry(result, current_step, synchronous)
return
@ -245,6 +257,25 @@ module MicrosoftSync
steps_object.after_failure
end
def update_state_record_to_retrying(new_job_state)
job_state_record&.update_unless_deleted(
workflow_state: :retrying,
job_state: new_job_state.merge(updated_at: Time.zone.now)
)
end
def handle_delayed_next_step(delayed_next_step, synchronous)
return unless update_state_record_to_retrying(
step: delayed_next_step.step,
data: delayed_next_step.job_state_data,
retries_by_step: job_state_record.reload.job_state&.dig(:retries_by_step),
)
run_with_delay(
delayed_next_step.step, delayed_next_step.delay_amount, synchronous: synchronous
)
end
# Raises the error if we have passed the retry limit
# Does nothing if workflow_state has since been set to deleted
# Otherwise sets the job_state to keep track of (step, data, retries) and
@ -262,17 +293,12 @@ module MicrosoftSync
retry_object.stash_block&.call
new_job_state = {
return unless update_state_record_to_retrying(
step: current_step,
updated_at: Time.zone.now,
data: retry_object.job_state_data,
retries_by_step: retries_by_step.merge(current_step.to_s => retries + 1),
# for debugging only:
retried_on_error: "#{retry_object.error.class}: #{retry_object.error.message}",
}
return unless job_state_record&.update_unless_deleted(
workflow_state: :retrying, job_state: new_job_state
)
delay_amount = retry_object.delay_amount

View File

@ -97,11 +97,12 @@ module MicrosoftSync
new_group_id = graph_service_helpers.create_education_class(course)['id']
end
StateMachineJob::NextStep.new(:step_update_group_with_course_data, new_group_id)
StateMachineJob::DelayedNextStep.new(
:step_update_group_with_course_data, 2.seconds, new_group_id
)
end
def step_update_group_with_course_data(group_id_from_mem, group_id_from_state)
group_id = group_id_from_mem || group_id_from_state
def step_update_group_with_course_data(_mem_state, group_id)
graph_service_helpers.update_group_with_course_data(group_id, course)
group.update! ms_group_id: group_id
StateMachineJob::NextStep.new(:step_ensure_enrollments_user_mappings_filled)
@ -168,24 +169,31 @@ module MicrosoftSync
graph_service.remove_group_owner(group.ms_group_id, aad)
end
StateMachineJob::NextStep.new(:step_ensure_team_exists)
StateMachineJob::NextStep.new(:step_check_team_exists)
end
def step_ensure_team_exists(_mem_data, _job_state_data)
def step_check_team_exists(_mem_data, _job_state_data)
if course.enrollments.where(type: MembershipDiff::OWNER_ENROLLMENT_TYPES).any? \
&& !graph_service.team_exists?(group.ms_group_id)
graph_service.create_education_class_team(group.ms_group_id)
StateMachineJob::DelayedNextStep.new(:step_create_team, 10.seconds)
else
StateMachineJob::COMPLETE
end
end
def step_create_team(_mem_data, _job_state_data)
graph_service.create_education_class_team(group.ms_group_id)
StateMachineJob::COMPLETE
rescue MicrosoftSync::Errors::TeamAlreadyExists
StateMachineJob::COMPLETE
rescue MicrosoftSync::Errors::GroupHasNoOwners, MicrosoftSync::Errors::HTTPNotFound => e
# API is eventually consistent: We often have to wait a couple minutes
# after creating the group and adding owners for the Teams API to see the
# group and owners.
# It's also possible for the course to have added owners (so the
# enrollments are in the DB) since we last calculated the diff added them
# enrollments are in the DB) since we last calculated the diff and added them
# in the generate_diff step. This is rare, but we can also sleep in that
# case.
# case. We'll eventually fail but the team will be created next time we sync.
StateMachineJob::Retry.new(error: e, delay_amount: [30, 90, 270])
end

View File

@ -59,14 +59,16 @@ describe MicrosoftSync::GraphService do
end
end
context 'with a non-200 status code' do
let(:response) { json_response(403, error: {message: 'uh-oh!'}) }
[400, 403, 409].each do |code|
context "with a #{code} status code" do
let(:response) { json_response(code, error: {message: 'uh-oh!'}) }
it 'raises an HTTPInvalidStatus with the code and message' do
expect { subject }.to raise_error(
MicrosoftSync::Errors::HTTPInvalidStatus,
/Graph service returned 403 for tenant mytenant.*uh-oh!/
)
it 'raises an HTTPInvalidStatus with the code and message' do
expect { subject }.to raise_error(
MicrosoftSync::Errors::HTTPInvalidStatus,
/Graph service returned #{code} for tenant mytenant.*uh-oh!/
)
end
end
end
end
@ -394,6 +396,19 @@ describe MicrosoftSync::GraphService do
expect { subject }.to raise_error(MicrosoftSync::Errors::GroupHasNoOwners)
end
end
context 'when Microsoft returns a 409 saying "group is already provisioned"' do
let(:response) do
{
status: 409,
body: "{\r\n \"error\": {\r\n \"code\": \"Conflict\",\r\n \"message\": \"Failed to execute Templates backend request CreateTeamFromGroupWithTemplateRequest. Request Url: https://teams.microsoft.com/fabric/amer/templates/api/groups/16786176-b111-1111-1111-111111111110/team, Request Method: PUT, Response Status Code: Conflict, Response Headers: Strict-Transport-Security: max-age=2592000\\r\\nx-operationid: 11111111111111111111111111111111\\r\\nx-telemetryid: 00-11111111111111111111111111111111-111111111111111b-00\\r\\nX-MSEdge-Ref: Ref A: 11111111111111111111111111111111 Ref B: BLUEDGE1111 Ref C: 2021-04-15T23:08:28Z\\r\\nDate: Thu, 15 Apr 2021 23:08:28 GMT\\r\\n, ErrorMessage : {\\\"errors\\\":[{\\\"message\\\":\\\"The group is already provisioned\\\",\\\"errorCode\\\":\\\"Unknown\\\"}],\\\"operationId\\\":\\\"11111111111111111111111111111111\\\"}\",\r\n \"innerError\": {\r\n \"date\": \"2021-04-15T23:08:28\",\r\n \"request-id\": \"11111111-1111-1111-1111-111111111111\",\r\n \"client-request-id\": \"11111111-1111-1111-1111-111111111111\"\r\n }\r\n }\r\n}"
}
end
it 'raises a TeamAlreadyExists error' do
expect { subject }.to raise_error(MicrosoftSync::Errors::TeamAlreadyExists)
end
end
end
describe '#list_users' do

View File

@ -267,6 +267,36 @@ module MicrosoftSync
end
end
context 'when the step returns a DelayedNextStep' do
before do
subject.send(:run, nil)
allow(steps_object).to receive(:step_first).and_return \
described_class::DelayedNextStep.new(:step_second, 1.minute, 'abc123')
steps_object.steps_run.clear
end
it 'enqueues a job for that step' do
subject.send(:run, :step_first)
expect(steps_object.steps_run).to eq([
[:delay_run, [{run_at: 1.minute.from_now, strand: strand}], [:step_second]],
])
end
it 'leaves retries_by_step untouched' do
expect { subject.send(:run, :step_first) }.not_to \
change { state_record.reload.job_state[:retries_by_step] }.from('step_first' => 1)
end
it 'sets job_state step, updated_at, and data' do
Timecop.freeze(2.minutes.from_now) do
expect { subject.send(:run, :step_first) }
.to change { state_record.reload.job_state[:data] }.to('abc123')
.and change { state_record.reload.job_state[:step] }.to(:step_second)
.and change { state_record.reload.job_state[:updated_at] }.to(Time.zone.now)
end
end
end
context 'when an unhandled error occurs' do
it 'bubbles up the error, sets the record state to errored, and calls after_failure' do
subject.send(:run, nil)

View File

@ -30,13 +30,21 @@ describe MicrosoftSync::SyncerSteps do
let(:tenant) { 'mytenant123' }
let(:sync_enabled) { true }
def expect_next_step(result, next_step, memory_state=nil)
def expect_next_step(result, step, memory_state=nil)
expect(result).to be_a(MicrosoftSync::StateMachineJob::NextStep)
expect { syncer_steps.method(next_step.to_sym) }.to_not raise_error
expect(result.next_step).to eq(next_step)
expect { syncer_steps.method(step.to_sym) }.to_not raise_error
expect(result.step).to eq(step)
expect(result.memory_state).to eq(memory_state)
end
def expect_delayed_next_step(result, step, delay_amount, job_state_data=nil)
expect(result).to be_a(MicrosoftSync::StateMachineJob::DelayedNextStep)
expect { syncer_steps.method(step.to_sym) }.to_not raise_error
expect(result.step).to eq(step)
expect(result.delay_amount).to eq(delay_amount)
expect(result.job_state_data).to eq(job_state_data)
end
def expect_retry(result, error_class:, delay_amount: nil, job_state_data: nil)
expect(result).to be_a(MicrosoftSync::StateMachineJob::Retry)
expect(result.error.class).to eq(error_class)
@ -116,7 +124,9 @@ describe MicrosoftSync::SyncerSteps do
expect(graph_service_helpers).to \
receive(:create_education_class).with(course).and_return('id' => 'newid')
expect_next_step(subject, :step_update_group_with_course_data, 'newid')
expect_delayed_next_step(
subject, :step_update_group_with_course_data, 2.seconds, 'newid'
)
end
end
@ -126,7 +136,9 @@ describe MicrosoftSync::SyncerSteps do
it 'goes to the "update group" step with the remote group ID' do
expect(graph_service_helpers).to_not receive(:create_education_class)
expect_next_step(subject, :step_update_group_with_course_data, 'newid2')
expect_delayed_next_step(
subject, :step_update_group_with_course_data, 2.seconds, 'newid2'
)
end
end
@ -190,52 +202,40 @@ describe MicrosoftSync::SyncerSteps do
end
describe '#step_update_group_with_course_data' do
shared_examples_for 'updating group with course data' do
context 'on success' do
it 'updates the LMS metadata, sets ms_group_id, and goes to the next step' do
expect(graph_service_helpers).to \
receive(:update_group_with_course_data).with('newid', course)
subject do
syncer_steps.step_update_group_with_course_data(nil, 'newid')
end
expect { subject }.to change { group.reload.ms_group_id }.to('newid')
expect_next_step(subject, :step_ensure_enrollments_user_mappings_filled)
end
end
context 'on success' do
it 'updates the LMS metadata, sets ms_group_id, and goes to the next step' do
expect(graph_service_helpers).to \
receive(:update_group_with_course_data).with('newid', course)
context 'on 404' do
it 'retries with a delay' do
expect(graph_service_helpers).to \
receive(:update_group_with_course_data).with('newid', course)
.and_raise(new_http_error(404))
expect { subject }.to_not change { group.reload.ms_group_id }
expect_retry(
subject, error_class: MicrosoftSync::Errors::HTTPNotFound,
delay_amount: [5, 20, 100], job_state_data: 'newid'
)
end
end
context 'on other failure' do
it 'bubbles up the error' do
expect(graph_service_helpers).to \
receive(:update_group_with_course_data).with('newid', course)
.and_raise(new_http_error(400))
expect { subject }.to raise_error(MicrosoftSync::Errors::HTTPBadRequest)
end
expect { subject }.to change { group.reload.ms_group_id }.to('newid')
expect_next_step(subject, :step_ensure_enrollments_user_mappings_filled)
end
end
context 'when run for the first time with a new_group_id' do
subject { syncer_steps.step_update_group_with_course_data('newid', nil) }
it_behaves_like 'updating group with course data'
context 'on 404' do
it 'retries with a delay' do
expect(graph_service_helpers).to \
receive(:update_group_with_course_data).with('newid', course)
.and_raise(new_http_error(404))
expect { subject }.to_not change { group.reload.ms_group_id }
expect_retry(
subject, error_class: MicrosoftSync::Errors::HTTPNotFound,
delay_amount: [5, 20, 100], job_state_data: 'newid'
)
end
end
context 'when retrying' do
subject do
syncer_steps.step_update_group_with_course_data(nil, 'newid')
context 'on other failure' do
it 'bubbles up the error' do
expect(graph_service_helpers).to \
receive(:update_group_with_course_data).with('newid', course)
.and_raise(new_http_error(400))
expect { subject }.to raise_error(MicrosoftSync::Errors::HTTPBadRequest)
end
it_behaves_like 'updating group with course data'
end
end
@ -370,7 +370,7 @@ describe MicrosoftSync::SyncerSteps do
end
end
describe '#execute_diff' do
describe '#step_execute_diff' do
subject { syncer_steps.step_execute_diff(diff, nil) }
let(:diff) { double('MembershipDiff') }
@ -394,12 +394,12 @@ describe MicrosoftSync::SyncerSteps do
expect(graph_service).to \
receive(:add_users_to_group).with('mygroup', members: %w[o3])
expect_next_step(subject, :step_ensure_team_exists)
expect_next_step(subject, :step_check_team_exists)
end
end
describe '#ensure_team_exists' do
subject { syncer_steps.step_ensure_team_exists(nil, nil) }
describe '#step_check_team_exists' do
subject { syncer_steps.step_check_team_exists(nil, nil) }
before { group.update!(ms_group_id: 'mygroupid') }
@ -409,60 +409,65 @@ describe MicrosoftSync::SyncerSteps do
e.destroy if e.type =~ /^Teacher|Ta|Designer/
end
expect(graph_service).to_not receive(:team_exists?)
expect(graph_service).to_not receive(:create_team)
expect(subject).to eq(MicrosoftSync::StateMachineJob::COMPLETE)
end
end
context 'when the team already exists' do
it "doesn't create a team" do
it "returns COMPLETE" do
expect(graph_service).to receive(:team_exists?).with('mygroupid').and_return(true)
expect(graph_service).to_not receive(:create_team)
expect(subject).to eq(MicrosoftSync::StateMachineJob::COMPLETE)
end
end
context "when the team doesn't exist" do
before do
allow(graph_service).to receive(:team_exists?).with('mygroupid').and_return(false)
it "moves on to step_create_team after a delay" do
expect(graph_service).to receive(:team_exists?).with('mygroupid').and_return(false)
expect_delayed_next_step(subject, :step_create_team, 10.seconds)
end
end
end
it 'creates the team' do
describe '#step_create_team' do
subject { syncer_steps.step_create_team(nil, nil) }
before { group.update!(ms_group_id: 'mygroupid') }
it 'creates the team' do
expect(graph_service).to receive(:create_education_class_team).with('mygroupid')
expect(subject).to eq(MicrosoftSync::StateMachineJob::COMPLETE)
end
context 'when the Microsoft API errors with "group has no owners"' do
it "retries in (30, 90, 270 seconds)" do
expect(graph_service).to receive(:create_education_class_team).with('mygroupid')
expect(subject).to eq(MicrosoftSync::StateMachineJob::COMPLETE)
.and_raise(MicrosoftSync::Errors::GroupHasNoOwners)
expect_retry(
subject,
error_class: MicrosoftSync::Errors::GroupHasNoOwners,
delay_amount: [30, 90, 270]
)
end
end
context 'when the Microsoft API errors with "group has no owners"' do
it "retries in (30, 90, 270 seconds)" do
expect(graph_service).to receive(:create_education_class_team).with('mygroupid').
and_raise(MicrosoftSync::Errors::GroupHasNoOwners)
expect_retry(
subject,
error_class: MicrosoftSync::Errors::GroupHasNoOwners,
delay_amount: [30, 90, 270]
)
end
context "when the Microsoft API errors with a 404 (e.g., group doesn't exist)" do
it "retries in (30, 90, 270 seconds)" do
expect(graph_service).to \
receive(:create_education_class_team).with('mygroupid').and_raise(new_http_error(404))
expect_retry(
subject,
error_class: MicrosoftSync::Errors::HTTPNotFound,
delay_amount: [30, 90, 270]
)
end
end
context "when the Microsoft API errors with a 404 (e.g., group doesn't exist)" do
it "retries in (30, 90, 270 seconds)" do
expect(graph_service).to \
receive(:create_education_class_team).with('mygroupid').and_raise(new_http_error(404))
expect_retry(
subject,
error_class: MicrosoftSync::Errors::HTTPNotFound,
delay_amount: [30, 90, 270]
)
end
end
context 'when the Microsoft API errors with some other error' do
it "bubbles up the error" do
expect(graph_service).to \
receive(:create_education_class_team).with('mygroupid').
and_raise(new_http_error(400))
expect { subject }.to raise_error(MicrosoftSync::Errors::HTTPBadRequest)
end
context 'when the Microsoft API errors with some other error' do
it "bubbles up the error" do
expect(graph_service).to \
receive(:create_education_class_team).with('mygroupid')
.and_raise(new_http_error(400))
expect { subject }.to raise_error(MicrosoftSync::Errors::HTTPBadRequest)
end
end
end