MSFT Sync: wait to run steps likely to fail

* Don't try to update group properties immediately after creation, since
  it is likely to fail until the Microsoft API settles
* Don't try to create a team the immediately after group creation, since
  it is also likely to fail until the Microsoft API creates

refs INTEROP-6638
flag=microsoft_group_enrollments_syncing

Test plan:
- Delete existing groups on Microsoft side and run a syncer job. Watch
  logs to see that:
  1. it enqueues a job for step_ensure_class_group_exists (delay of 2
     seconds) before ever hitting the PATCH endpoint (you can see
     endpoints hit with logs looking like "MicrosoftSync::GraphClient:
     get ..."), and that when it does so, it updates the group to have
     workflow_state=retrying and job_state with empty retries_by_step
  2. it enqueues a job for step_create_team (delay of 10 seconds) before
     ever hitting the endpoint to create the team (post
     https://graph.microsoft.com/v1.0/teams)
- After it's done, check that the job completed successfully and a group
  and team were created on the Microsoft side.
- From a console run
  group.syncer_job.steps_object.step_check_team_exists(nil, nil)
  again and make sure it returns COMPLETE.
- From a console run
  group.syncer_job.steps_object.step_create_team(nil, nil)
  again and make sure it returns COMPLETE.
- For completeness, run a sync job on the same group again to make sure
  a sync after completion still works. This time,
  step_update_group_with_course_data and step_create_team should never
  run. If you want, you run the job synchronously to see the steps easier.

Change-Id: Ic2b697dd7cc0d1bbf75ad55d3593ee64975e3ab9
Reviewed-on: https://gerrit.instructure.com/c/canvas-lms/+/263012
Product-Review: Evan Battaglia <ebattaglia@instructure.com>
Reviewed-by: Wagner Goncalves <wagner.goncalves@instructure.com>
QA-Review: Wagner Goncalves <wagner.goncalves@instructure.com>
Tested-by: Service Cloud Jenkins <svc.cloudjenkins@instructure.com>
This commit is contained in:
Evan Battaglia 2021-04-15 14:27:32 -04:00
parent 4df7b6b1e9
commit 0baf7b10ab
7 changed files with 203 additions and 114 deletions

View File

@ -44,6 +44,7 @@ module MicrosoftSync
class InvalidRemoteState < PublicError; end class InvalidRemoteState < PublicError; end
class GroupHasNoOwners < PublicError; end class GroupHasNoOwners < PublicError; end
class TeamAlreadyExists < PublicError; end
# Makes public the status code but not anything about the response body. # Makes public the status code but not anything about the response body.
# The internal error message has the response body (truncated) # The internal error message has the response body (truncated)
@ -57,7 +58,8 @@ module MicrosoftSync
def self.subclasses_by_status_code def self.subclasses_by_status_code
@subclasses_by_status_code ||= { @subclasses_by_status_code ||= {
400 => HTTPBadRequest, 400 => HTTPBadRequest,
404 => HTTPNotFound 404 => HTTPNotFound,
409 => HTTPConflict,
} }
end end
@ -76,5 +78,6 @@ module MicrosoftSync
class HTTPNotFound < HTTPInvalidStatus; end class HTTPNotFound < HTTPInvalidStatus; end
class HTTPBadRequest < HTTPInvalidStatus; end class HTTPBadRequest < HTTPInvalidStatus; end
class HTTPConflict < HTTPInvalidStatus; end
end end
end end

View File

@ -118,11 +118,13 @@ module MicrosoftSync
} }
request(:post, 'teams', body: body) request(:post, 'teams', body: body)
rescue MicrosoftSync::Errors::HTTPBadRequest => e rescue MicrosoftSync::Errors::HTTPBadRequest => e
if e.response_body =~ /must have one or more owners in order to create a Team/ raise unless e.response_body =~ /must have one or more owners in order to create a Team/i
raise MicrosoftSync::Errors::GroupHasNoOwners
end
raise raise MicrosoftSync::Errors::GroupHasNoOwners
rescue MicrosoftSync::Errors::HTTPConflict => e
raise unless e.response_body =~ /group is already provisioned/i
raise MicrosoftSync::Errors::TeamAlreadyExists
end end
# === Users === # === Users ===

View File

@ -92,13 +92,22 @@ module MicrosoftSync
# Signals that this step of the job succeeded. # Signals that this step of the job succeeded.
# memory_state is passed into the next step. # memory_state is passed into the next step.
class NextStep class NextStep
attr_reader :next_step, :memory_state attr_reader :step, :memory_state
def initialize(next_step, memory_state=nil) def initialize(step, memory_state=nil)
@next_step = next_step or raise InternalError @step = step or raise InternalError
@memory_state = memory_state @memory_state = memory_state
end end
end end
class DelayedNextStep
attr_reader :step, :delay_amount, :job_state_data
def initialize(step, delay_amount, job_state_data=nil)
@step = step or raise InternalError
@delay_amount = delay_amount
@job_state_data = job_state_data
end
end
class Complete; end class Complete; end
# Return this when your job is done: # Return this when your job is done:
@ -152,7 +161,7 @@ module MicrosoftSync
step_from_job_state = job_state&.dig(:step) step_from_job_state = job_state&.dig(:step)
# Normal case: job continuation, or new job (step==nil) and no other job in-progress # Normal case: job continuation, or new job (step==nil) and no other job in-progress
if step == step_from_job_state if step&.to_s == step_from_job_state&.to_s
return run_main_loop(step, job_state&.dig(:data), synchronous) return run_main_loop(step, job_state&.dig(:data), synchronous)
end end
@ -208,8 +217,11 @@ module MicrosoftSync
steps_object.after_complete steps_object.after_complete
return return
when NextStep when NextStep
current_step, memory_state = result.next_step, result.memory_state current_step, memory_state = result.step, result.memory_state
job_state_data = nil job_state_data = nil
when DelayedNextStep
handle_delayed_next_step(result, synchronous)
return
when Retry when Retry
handle_retry(result, current_step, synchronous) handle_retry(result, current_step, synchronous)
return return
@ -245,6 +257,25 @@ module MicrosoftSync
steps_object.after_failure steps_object.after_failure
end end
def update_state_record_to_retrying(new_job_state)
job_state_record&.update_unless_deleted(
workflow_state: :retrying,
job_state: new_job_state.merge(updated_at: Time.zone.now)
)
end
def handle_delayed_next_step(delayed_next_step, synchronous)
return unless update_state_record_to_retrying(
step: delayed_next_step.step,
data: delayed_next_step.job_state_data,
retries_by_step: job_state_record.reload.job_state&.dig(:retries_by_step),
)
run_with_delay(
delayed_next_step.step, delayed_next_step.delay_amount, synchronous: synchronous
)
end
# Raises the error if we have passed the retry limit # Raises the error if we have passed the retry limit
# Does nothing if workflow_state has since been set to deleted # Does nothing if workflow_state has since been set to deleted
# Otherwise sets the job_state to keep track of (step, data, retries) and # Otherwise sets the job_state to keep track of (step, data, retries) and
@ -262,17 +293,12 @@ module MicrosoftSync
retry_object.stash_block&.call retry_object.stash_block&.call
new_job_state = { return unless update_state_record_to_retrying(
step: current_step, step: current_step,
updated_at: Time.zone.now,
data: retry_object.job_state_data, data: retry_object.job_state_data,
retries_by_step: retries_by_step.merge(current_step.to_s => retries + 1), retries_by_step: retries_by_step.merge(current_step.to_s => retries + 1),
# for debugging only: # for debugging only:
retried_on_error: "#{retry_object.error.class}: #{retry_object.error.message}", retried_on_error: "#{retry_object.error.class}: #{retry_object.error.message}",
}
return unless job_state_record&.update_unless_deleted(
workflow_state: :retrying, job_state: new_job_state
) )
delay_amount = retry_object.delay_amount delay_amount = retry_object.delay_amount

View File

@ -97,11 +97,12 @@ module MicrosoftSync
new_group_id = graph_service_helpers.create_education_class(course)['id'] new_group_id = graph_service_helpers.create_education_class(course)['id']
end end
StateMachineJob::NextStep.new(:step_update_group_with_course_data, new_group_id) StateMachineJob::DelayedNextStep.new(
:step_update_group_with_course_data, 2.seconds, new_group_id
)
end end
def step_update_group_with_course_data(group_id_from_mem, group_id_from_state) def step_update_group_with_course_data(_mem_state, group_id)
group_id = group_id_from_mem || group_id_from_state
graph_service_helpers.update_group_with_course_data(group_id, course) graph_service_helpers.update_group_with_course_data(group_id, course)
group.update! ms_group_id: group_id group.update! ms_group_id: group_id
StateMachineJob::NextStep.new(:step_ensure_enrollments_user_mappings_filled) StateMachineJob::NextStep.new(:step_ensure_enrollments_user_mappings_filled)
@ -168,24 +169,31 @@ module MicrosoftSync
graph_service.remove_group_owner(group.ms_group_id, aad) graph_service.remove_group_owner(group.ms_group_id, aad)
end end
StateMachineJob::NextStep.new(:step_ensure_team_exists) StateMachineJob::NextStep.new(:step_check_team_exists)
end end
def step_ensure_team_exists(_mem_data, _job_state_data) def step_check_team_exists(_mem_data, _job_state_data)
if course.enrollments.where(type: MembershipDiff::OWNER_ENROLLMENT_TYPES).any? \ if course.enrollments.where(type: MembershipDiff::OWNER_ENROLLMENT_TYPES).any? \
&& !graph_service.team_exists?(group.ms_group_id) && !graph_service.team_exists?(group.ms_group_id)
graph_service.create_education_class_team(group.ms_group_id) StateMachineJob::DelayedNextStep.new(:step_create_team, 10.seconds)
else
StateMachineJob::COMPLETE
end
end end
def step_create_team(_mem_data, _job_state_data)
graph_service.create_education_class_team(group.ms_group_id)
StateMachineJob::COMPLETE
rescue MicrosoftSync::Errors::TeamAlreadyExists
StateMachineJob::COMPLETE StateMachineJob::COMPLETE
rescue MicrosoftSync::Errors::GroupHasNoOwners, MicrosoftSync::Errors::HTTPNotFound => e rescue MicrosoftSync::Errors::GroupHasNoOwners, MicrosoftSync::Errors::HTTPNotFound => e
# API is eventually consistent: We often have to wait a couple minutes # API is eventually consistent: We often have to wait a couple minutes
# after creating the group and adding owners for the Teams API to see the # after creating the group and adding owners for the Teams API to see the
# group and owners. # group and owners.
# It's also possible for the course to have added owners (so the # It's also possible for the course to have added owners (so the
# enrollments are in the DB) since we last calculated the diff added them # enrollments are in the DB) since we last calculated the diff and added them
# in the generate_diff step. This is rare, but we can also sleep in that # in the generate_diff step. This is rare, but we can also sleep in that
# case. # case. We'll eventually fail but the team will be created next time we sync.
StateMachineJob::Retry.new(error: e, delay_amount: [30, 90, 270]) StateMachineJob::Retry.new(error: e, delay_amount: [30, 90, 270])
end end

View File

@ -59,17 +59,19 @@ describe MicrosoftSync::GraphService do
end end
end end
context 'with a non-200 status code' do [400, 403, 409].each do |code|
let(:response) { json_response(403, error: {message: 'uh-oh!'}) } context "with a #{code} status code" do
let(:response) { json_response(code, error: {message: 'uh-oh!'}) }
it 'raises an HTTPInvalidStatus with the code and message' do it 'raises an HTTPInvalidStatus with the code and message' do
expect { subject }.to raise_error( expect { subject }.to raise_error(
MicrosoftSync::Errors::HTTPInvalidStatus, MicrosoftSync::Errors::HTTPInvalidStatus,
/Graph service returned 403 for tenant mytenant.*uh-oh!/ /Graph service returned #{code} for tenant mytenant.*uh-oh!/
) )
end end
end end
end end
end
shared_examples_for 'a paginated list endpoint' do shared_examples_for 'a paginated list endpoint' do
subject { service.send(method_name, *method_args) } subject { service.send(method_name, *method_args) }
@ -394,6 +396,19 @@ describe MicrosoftSync::GraphService do
expect { subject }.to raise_error(MicrosoftSync::Errors::GroupHasNoOwners) expect { subject }.to raise_error(MicrosoftSync::Errors::GroupHasNoOwners)
end end
end end
context 'when Microsoft returns a 409 saying "group is already provisioned"' do
let(:response) do
{
status: 409,
body: "{\r\n \"error\": {\r\n \"code\": \"Conflict\",\r\n \"message\": \"Failed to execute Templates backend request CreateTeamFromGroupWithTemplateRequest. Request Url: https://teams.microsoft.com/fabric/amer/templates/api/groups/16786176-b111-1111-1111-111111111110/team, Request Method: PUT, Response Status Code: Conflict, Response Headers: Strict-Transport-Security: max-age=2592000\\r\\nx-operationid: 11111111111111111111111111111111\\r\\nx-telemetryid: 00-11111111111111111111111111111111-111111111111111b-00\\r\\nX-MSEdge-Ref: Ref A: 11111111111111111111111111111111 Ref B: BLUEDGE1111 Ref C: 2021-04-15T23:08:28Z\\r\\nDate: Thu, 15 Apr 2021 23:08:28 GMT\\r\\n, ErrorMessage : {\\\"errors\\\":[{\\\"message\\\":\\\"The group is already provisioned\\\",\\\"errorCode\\\":\\\"Unknown\\\"}],\\\"operationId\\\":\\\"11111111111111111111111111111111\\\"}\",\r\n \"innerError\": {\r\n \"date\": \"2021-04-15T23:08:28\",\r\n \"request-id\": \"11111111-1111-1111-1111-111111111111\",\r\n \"client-request-id\": \"11111111-1111-1111-1111-111111111111\"\r\n }\r\n }\r\n}"
}
end
it 'raises a TeamAlreadyExists error' do
expect { subject }.to raise_error(MicrosoftSync::Errors::TeamAlreadyExists)
end
end
end end
describe '#list_users' do describe '#list_users' do

View File

@ -267,6 +267,36 @@ module MicrosoftSync
end end
end end
context 'when the step returns a DelayedNextStep' do
before do
subject.send(:run, nil)
allow(steps_object).to receive(:step_first).and_return \
described_class::DelayedNextStep.new(:step_second, 1.minute, 'abc123')
steps_object.steps_run.clear
end
it 'enqueues a job for that step' do
subject.send(:run, :step_first)
expect(steps_object.steps_run).to eq([
[:delay_run, [{run_at: 1.minute.from_now, strand: strand}], [:step_second]],
])
end
it 'leaves retries_by_step untouched' do
expect { subject.send(:run, :step_first) }.not_to \
change { state_record.reload.job_state[:retries_by_step] }.from('step_first' => 1)
end
it 'sets job_state step, updated_at, and data' do
Timecop.freeze(2.minutes.from_now) do
expect { subject.send(:run, :step_first) }
.to change { state_record.reload.job_state[:data] }.to('abc123')
.and change { state_record.reload.job_state[:step] }.to(:step_second)
.and change { state_record.reload.job_state[:updated_at] }.to(Time.zone.now)
end
end
end
context 'when an unhandled error occurs' do context 'when an unhandled error occurs' do
it 'bubbles up the error, sets the record state to errored, and calls after_failure' do it 'bubbles up the error, sets the record state to errored, and calls after_failure' do
subject.send(:run, nil) subject.send(:run, nil)

View File

@ -30,13 +30,21 @@ describe MicrosoftSync::SyncerSteps do
let(:tenant) { 'mytenant123' } let(:tenant) { 'mytenant123' }
let(:sync_enabled) { true } let(:sync_enabled) { true }
def expect_next_step(result, next_step, memory_state=nil) def expect_next_step(result, step, memory_state=nil)
expect(result).to be_a(MicrosoftSync::StateMachineJob::NextStep) expect(result).to be_a(MicrosoftSync::StateMachineJob::NextStep)
expect { syncer_steps.method(next_step.to_sym) }.to_not raise_error expect { syncer_steps.method(step.to_sym) }.to_not raise_error
expect(result.next_step).to eq(next_step) expect(result.step).to eq(step)
expect(result.memory_state).to eq(memory_state) expect(result.memory_state).to eq(memory_state)
end end
def expect_delayed_next_step(result, step, delay_amount, job_state_data=nil)
expect(result).to be_a(MicrosoftSync::StateMachineJob::DelayedNextStep)
expect { syncer_steps.method(step.to_sym) }.to_not raise_error
expect(result.step).to eq(step)
expect(result.delay_amount).to eq(delay_amount)
expect(result.job_state_data).to eq(job_state_data)
end
def expect_retry(result, error_class:, delay_amount: nil, job_state_data: nil) def expect_retry(result, error_class:, delay_amount: nil, job_state_data: nil)
expect(result).to be_a(MicrosoftSync::StateMachineJob::Retry) expect(result).to be_a(MicrosoftSync::StateMachineJob::Retry)
expect(result.error.class).to eq(error_class) expect(result.error.class).to eq(error_class)
@ -116,7 +124,9 @@ describe MicrosoftSync::SyncerSteps do
expect(graph_service_helpers).to \ expect(graph_service_helpers).to \
receive(:create_education_class).with(course).and_return('id' => 'newid') receive(:create_education_class).with(course).and_return('id' => 'newid')
expect_next_step(subject, :step_update_group_with_course_data, 'newid') expect_delayed_next_step(
subject, :step_update_group_with_course_data, 2.seconds, 'newid'
)
end end
end end
@ -126,7 +136,9 @@ describe MicrosoftSync::SyncerSteps do
it 'goes to the "update group" step with the remote group ID' do it 'goes to the "update group" step with the remote group ID' do
expect(graph_service_helpers).to_not receive(:create_education_class) expect(graph_service_helpers).to_not receive(:create_education_class)
expect_next_step(subject, :step_update_group_with_course_data, 'newid2') expect_delayed_next_step(
subject, :step_update_group_with_course_data, 2.seconds, 'newid2'
)
end end
end end
@ -190,7 +202,10 @@ describe MicrosoftSync::SyncerSteps do
end end
describe '#step_update_group_with_course_data' do describe '#step_update_group_with_course_data' do
shared_examples_for 'updating group with course data' do subject do
syncer_steps.step_update_group_with_course_data(nil, 'newid')
end
context 'on success' do context 'on success' do
it 'updates the LMS metadata, sets ms_group_id, and goes to the next step' do it 'updates the LMS metadata, sets ms_group_id, and goes to the next step' do
expect(graph_service_helpers).to \ expect(graph_service_helpers).to \
@ -224,21 +239,6 @@ describe MicrosoftSync::SyncerSteps do
end end
end end
context 'when run for the first time with a new_group_id' do
subject { syncer_steps.step_update_group_with_course_data('newid', nil) }
it_behaves_like 'updating group with course data'
end
context 'when retrying' do
subject do
syncer_steps.step_update_group_with_course_data(nil, 'newid')
end
it_behaves_like 'updating group with course data'
end
end
describe '#step_ensure_enrollments_user_mappings_filled' do describe '#step_ensure_enrollments_user_mappings_filled' do
subject { syncer_steps.step_ensure_enrollments_user_mappings_filled(nil, nil) } subject { syncer_steps.step_ensure_enrollments_user_mappings_filled(nil, nil) }
@ -370,7 +370,7 @@ describe MicrosoftSync::SyncerSteps do
end end
end end
describe '#execute_diff' do describe '#step_execute_diff' do
subject { syncer_steps.step_execute_diff(diff, nil) } subject { syncer_steps.step_execute_diff(diff, nil) }
let(:diff) { double('MembershipDiff') } let(:diff) { double('MembershipDiff') }
@ -394,12 +394,12 @@ describe MicrosoftSync::SyncerSteps do
expect(graph_service).to \ expect(graph_service).to \
receive(:add_users_to_group).with('mygroup', members: %w[o3]) receive(:add_users_to_group).with('mygroup', members: %w[o3])
expect_next_step(subject, :step_ensure_team_exists) expect_next_step(subject, :step_check_team_exists)
end end
end end
describe '#ensure_team_exists' do describe '#step_check_team_exists' do
subject { syncer_steps.step_ensure_team_exists(nil, nil) } subject { syncer_steps.step_check_team_exists(nil, nil) }
before { group.update!(ms_group_id: 'mygroupid') } before { group.update!(ms_group_id: 'mygroupid') }
@ -409,23 +409,29 @@ describe MicrosoftSync::SyncerSteps do
e.destroy if e.type =~ /^Teacher|Ta|Designer/ e.destroy if e.type =~ /^Teacher|Ta|Designer/
end end
expect(graph_service).to_not receive(:team_exists?) expect(graph_service).to_not receive(:team_exists?)
expect(graph_service).to_not receive(:create_team)
expect(subject).to eq(MicrosoftSync::StateMachineJob::COMPLETE) expect(subject).to eq(MicrosoftSync::StateMachineJob::COMPLETE)
end end
end end
context 'when the team already exists' do context 'when the team already exists' do
it "doesn't create a team" do it "returns COMPLETE" do
expect(graph_service).to receive(:team_exists?).with('mygroupid').and_return(true) expect(graph_service).to receive(:team_exists?).with('mygroupid').and_return(true)
expect(graph_service).to_not receive(:create_team)
expect(subject).to eq(MicrosoftSync::StateMachineJob::COMPLETE) expect(subject).to eq(MicrosoftSync::StateMachineJob::COMPLETE)
end end
end end
context "when the team doesn't exist" do context "when the team doesn't exist" do
before do it "moves on to step_create_team after a delay" do
allow(graph_service).to receive(:team_exists?).with('mygroupid').and_return(false) expect(graph_service).to receive(:team_exists?).with('mygroupid').and_return(false)
expect_delayed_next_step(subject, :step_create_team, 10.seconds)
end end
end
end
describe '#step_create_team' do
subject { syncer_steps.step_create_team(nil, nil) }
before { group.update!(ms_group_id: 'mygroupid') }
it 'creates the team' do it 'creates the team' do
expect(graph_service).to receive(:create_education_class_team).with('mygroupid') expect(graph_service).to receive(:create_education_class_team).with('mygroupid')
@ -434,8 +440,8 @@ describe MicrosoftSync::SyncerSteps do
context 'when the Microsoft API errors with "group has no owners"' do context 'when the Microsoft API errors with "group has no owners"' do
it "retries in (30, 90, 270 seconds)" do it "retries in (30, 90, 270 seconds)" do
expect(graph_service).to receive(:create_education_class_team).with('mygroupid'). expect(graph_service).to receive(:create_education_class_team).with('mygroupid')
and_raise(MicrosoftSync::Errors::GroupHasNoOwners) .and_raise(MicrosoftSync::Errors::GroupHasNoOwners)
expect_retry( expect_retry(
subject, subject,
error_class: MicrosoftSync::Errors::GroupHasNoOwners, error_class: MicrosoftSync::Errors::GroupHasNoOwners,
@ -459,11 +465,10 @@ describe MicrosoftSync::SyncerSteps do
context 'when the Microsoft API errors with some other error' do context 'when the Microsoft API errors with some other error' do
it "bubbles up the error" do it "bubbles up the error" do
expect(graph_service).to \ expect(graph_service).to \
receive(:create_education_class_team).with('mygroupid'). receive(:create_education_class_team).with('mygroupid')
and_raise(new_http_error(400)) .and_raise(new_http_error(400))
expect { subject }.to raise_error(MicrosoftSync::Errors::HTTPBadRequest) expect { subject }.to raise_error(MicrosoftSync::Errors::HTTPBadRequest)
end end
end end
end end
end end
end