MSFT Sync: wait to run steps likely to fail
* Don't try to update group properties immediately after creation, since it is likely to fail until the Microsoft API settles * Don't try to create a team the immediately after group creation, since it is also likely to fail until the Microsoft API creates refs INTEROP-6638 flag=microsoft_group_enrollments_syncing Test plan: - Delete existing groups on Microsoft side and run a syncer job. Watch logs to see that: 1. it enqueues a job for step_ensure_class_group_exists (delay of 2 seconds) before ever hitting the PATCH endpoint (you can see endpoints hit with logs looking like "MicrosoftSync::GraphClient: get ..."), and that when it does so, it updates the group to have workflow_state=retrying and job_state with empty retries_by_step 2. it enqueues a job for step_create_team (delay of 10 seconds) before ever hitting the endpoint to create the team (post https://graph.microsoft.com/v1.0/teams) - After it's done, check that the job completed successfully and a group and team were created on the Microsoft side. - From a console run group.syncer_job.steps_object.step_check_team_exists(nil, nil) again and make sure it returns COMPLETE. - From a console run group.syncer_job.steps_object.step_create_team(nil, nil) again and make sure it returns COMPLETE. - For completeness, run a sync job on the same group again to make sure a sync after completion still works. This time, step_update_group_with_course_data and step_create_team should never run. If you want, you run the job synchronously to see the steps easier. Change-Id: Ic2b697dd7cc0d1bbf75ad55d3593ee64975e3ab9 Reviewed-on: https://gerrit.instructure.com/c/canvas-lms/+/263012 Product-Review: Evan Battaglia <ebattaglia@instructure.com> Reviewed-by: Wagner Goncalves <wagner.goncalves@instructure.com> QA-Review: Wagner Goncalves <wagner.goncalves@instructure.com> Tested-by: Service Cloud Jenkins <svc.cloudjenkins@instructure.com>
This commit is contained in:
parent
4df7b6b1e9
commit
0baf7b10ab
|
@ -44,6 +44,7 @@ module MicrosoftSync
|
|||
|
||||
class InvalidRemoteState < PublicError; end
|
||||
class GroupHasNoOwners < PublicError; end
|
||||
class TeamAlreadyExists < PublicError; end
|
||||
|
||||
# Makes public the status code but not anything about the response body.
|
||||
# The internal error message has the response body (truncated)
|
||||
|
@ -57,7 +58,8 @@ module MicrosoftSync
|
|||
def self.subclasses_by_status_code
|
||||
@subclasses_by_status_code ||= {
|
||||
400 => HTTPBadRequest,
|
||||
404 => HTTPNotFound
|
||||
404 => HTTPNotFound,
|
||||
409 => HTTPConflict,
|
||||
}
|
||||
end
|
||||
|
||||
|
@ -76,5 +78,6 @@ module MicrosoftSync
|
|||
|
||||
class HTTPNotFound < HTTPInvalidStatus; end
|
||||
class HTTPBadRequest < HTTPInvalidStatus; end
|
||||
class HTTPConflict < HTTPInvalidStatus; end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -118,11 +118,13 @@ module MicrosoftSync
|
|||
}
|
||||
request(:post, 'teams', body: body)
|
||||
rescue MicrosoftSync::Errors::HTTPBadRequest => e
|
||||
if e.response_body =~ /must have one or more owners in order to create a Team/
|
||||
raise MicrosoftSync::Errors::GroupHasNoOwners
|
||||
end
|
||||
raise unless e.response_body =~ /must have one or more owners in order to create a Team/i
|
||||
|
||||
raise
|
||||
raise MicrosoftSync::Errors::GroupHasNoOwners
|
||||
rescue MicrosoftSync::Errors::HTTPConflict => e
|
||||
raise unless e.response_body =~ /group is already provisioned/i
|
||||
|
||||
raise MicrosoftSync::Errors::TeamAlreadyExists
|
||||
end
|
||||
|
||||
# === Users ===
|
||||
|
|
|
@ -92,13 +92,22 @@ module MicrosoftSync
|
|||
# Signals that this step of the job succeeded.
|
||||
# memory_state is passed into the next step.
|
||||
class NextStep
|
||||
attr_reader :next_step, :memory_state
|
||||
def initialize(next_step, memory_state=nil)
|
||||
@next_step = next_step or raise InternalError
|
||||
attr_reader :step, :memory_state
|
||||
def initialize(step, memory_state=nil)
|
||||
@step = step or raise InternalError
|
||||
@memory_state = memory_state
|
||||
end
|
||||
end
|
||||
|
||||
class DelayedNextStep
|
||||
attr_reader :step, :delay_amount, :job_state_data
|
||||
def initialize(step, delay_amount, job_state_data=nil)
|
||||
@step = step or raise InternalError
|
||||
@delay_amount = delay_amount
|
||||
@job_state_data = job_state_data
|
||||
end
|
||||
end
|
||||
|
||||
class Complete; end
|
||||
|
||||
# Return this when your job is done:
|
||||
|
@ -152,7 +161,7 @@ module MicrosoftSync
|
|||
step_from_job_state = job_state&.dig(:step)
|
||||
|
||||
# Normal case: job continuation, or new job (step==nil) and no other job in-progress
|
||||
if step == step_from_job_state
|
||||
if step&.to_s == step_from_job_state&.to_s
|
||||
return run_main_loop(step, job_state&.dig(:data), synchronous)
|
||||
end
|
||||
|
||||
|
@ -208,8 +217,11 @@ module MicrosoftSync
|
|||
steps_object.after_complete
|
||||
return
|
||||
when NextStep
|
||||
current_step, memory_state = result.next_step, result.memory_state
|
||||
current_step, memory_state = result.step, result.memory_state
|
||||
job_state_data = nil
|
||||
when DelayedNextStep
|
||||
handle_delayed_next_step(result, synchronous)
|
||||
return
|
||||
when Retry
|
||||
handle_retry(result, current_step, synchronous)
|
||||
return
|
||||
|
@ -245,6 +257,25 @@ module MicrosoftSync
|
|||
steps_object.after_failure
|
||||
end
|
||||
|
||||
def update_state_record_to_retrying(new_job_state)
|
||||
job_state_record&.update_unless_deleted(
|
||||
workflow_state: :retrying,
|
||||
job_state: new_job_state.merge(updated_at: Time.zone.now)
|
||||
)
|
||||
end
|
||||
|
||||
def handle_delayed_next_step(delayed_next_step, synchronous)
|
||||
return unless update_state_record_to_retrying(
|
||||
step: delayed_next_step.step,
|
||||
data: delayed_next_step.job_state_data,
|
||||
retries_by_step: job_state_record.reload.job_state&.dig(:retries_by_step),
|
||||
)
|
||||
|
||||
run_with_delay(
|
||||
delayed_next_step.step, delayed_next_step.delay_amount, synchronous: synchronous
|
||||
)
|
||||
end
|
||||
|
||||
# Raises the error if we have passed the retry limit
|
||||
# Does nothing if workflow_state has since been set to deleted
|
||||
# Otherwise sets the job_state to keep track of (step, data, retries) and
|
||||
|
@ -262,17 +293,12 @@ module MicrosoftSync
|
|||
|
||||
retry_object.stash_block&.call
|
||||
|
||||
new_job_state = {
|
||||
return unless update_state_record_to_retrying(
|
||||
step: current_step,
|
||||
updated_at: Time.zone.now,
|
||||
data: retry_object.job_state_data,
|
||||
retries_by_step: retries_by_step.merge(current_step.to_s => retries + 1),
|
||||
# for debugging only:
|
||||
retried_on_error: "#{retry_object.error.class}: #{retry_object.error.message}",
|
||||
}
|
||||
|
||||
return unless job_state_record&.update_unless_deleted(
|
||||
workflow_state: :retrying, job_state: new_job_state
|
||||
)
|
||||
|
||||
delay_amount = retry_object.delay_amount
|
||||
|
|
|
@ -97,11 +97,12 @@ module MicrosoftSync
|
|||
new_group_id = graph_service_helpers.create_education_class(course)['id']
|
||||
end
|
||||
|
||||
StateMachineJob::NextStep.new(:step_update_group_with_course_data, new_group_id)
|
||||
StateMachineJob::DelayedNextStep.new(
|
||||
:step_update_group_with_course_data, 2.seconds, new_group_id
|
||||
)
|
||||
end
|
||||
|
||||
def step_update_group_with_course_data(group_id_from_mem, group_id_from_state)
|
||||
group_id = group_id_from_mem || group_id_from_state
|
||||
def step_update_group_with_course_data(_mem_state, group_id)
|
||||
graph_service_helpers.update_group_with_course_data(group_id, course)
|
||||
group.update! ms_group_id: group_id
|
||||
StateMachineJob::NextStep.new(:step_ensure_enrollments_user_mappings_filled)
|
||||
|
@ -168,24 +169,31 @@ module MicrosoftSync
|
|||
graph_service.remove_group_owner(group.ms_group_id, aad)
|
||||
end
|
||||
|
||||
StateMachineJob::NextStep.new(:step_ensure_team_exists)
|
||||
StateMachineJob::NextStep.new(:step_check_team_exists)
|
||||
end
|
||||
|
||||
def step_ensure_team_exists(_mem_data, _job_state_data)
|
||||
def step_check_team_exists(_mem_data, _job_state_data)
|
||||
if course.enrollments.where(type: MembershipDiff::OWNER_ENROLLMENT_TYPES).any? \
|
||||
&& !graph_service.team_exists?(group.ms_group_id)
|
||||
graph_service.create_education_class_team(group.ms_group_id)
|
||||
StateMachineJob::DelayedNextStep.new(:step_create_team, 10.seconds)
|
||||
else
|
||||
StateMachineJob::COMPLETE
|
||||
end
|
||||
end
|
||||
|
||||
def step_create_team(_mem_data, _job_state_data)
|
||||
graph_service.create_education_class_team(group.ms_group_id)
|
||||
StateMachineJob::COMPLETE
|
||||
rescue MicrosoftSync::Errors::TeamAlreadyExists
|
||||
StateMachineJob::COMPLETE
|
||||
rescue MicrosoftSync::Errors::GroupHasNoOwners, MicrosoftSync::Errors::HTTPNotFound => e
|
||||
# API is eventually consistent: We often have to wait a couple minutes
|
||||
# after creating the group and adding owners for the Teams API to see the
|
||||
# group and owners.
|
||||
# It's also possible for the course to have added owners (so the
|
||||
# enrollments are in the DB) since we last calculated the diff added them
|
||||
# enrollments are in the DB) since we last calculated the diff and added them
|
||||
# in the generate_diff step. This is rare, but we can also sleep in that
|
||||
# case.
|
||||
# case. We'll eventually fail but the team will be created next time we sync.
|
||||
StateMachineJob::Retry.new(error: e, delay_amount: [30, 90, 270])
|
||||
end
|
||||
|
||||
|
|
|
@ -59,17 +59,19 @@ describe MicrosoftSync::GraphService do
|
|||
end
|
||||
end
|
||||
|
||||
context 'with a non-200 status code' do
|
||||
let(:response) { json_response(403, error: {message: 'uh-oh!'}) }
|
||||
[400, 403, 409].each do |code|
|
||||
context "with a #{code} status code" do
|
||||
let(:response) { json_response(code, error: {message: 'uh-oh!'}) }
|
||||
|
||||
it 'raises an HTTPInvalidStatus with the code and message' do
|
||||
expect { subject }.to raise_error(
|
||||
MicrosoftSync::Errors::HTTPInvalidStatus,
|
||||
/Graph service returned 403 for tenant mytenant.*uh-oh!/
|
||||
/Graph service returned #{code} for tenant mytenant.*uh-oh!/
|
||||
)
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
shared_examples_for 'a paginated list endpoint' do
|
||||
subject { service.send(method_name, *method_args) }
|
||||
|
@ -394,6 +396,19 @@ describe MicrosoftSync::GraphService do
|
|||
expect { subject }.to raise_error(MicrosoftSync::Errors::GroupHasNoOwners)
|
||||
end
|
||||
end
|
||||
|
||||
context 'when Microsoft returns a 409 saying "group is already provisioned"' do
|
||||
let(:response) do
|
||||
{
|
||||
status: 409,
|
||||
body: "{\r\n \"error\": {\r\n \"code\": \"Conflict\",\r\n \"message\": \"Failed to execute Templates backend request CreateTeamFromGroupWithTemplateRequest. Request Url: https://teams.microsoft.com/fabric/amer/templates/api/groups/16786176-b111-1111-1111-111111111110/team, Request Method: PUT, Response Status Code: Conflict, Response Headers: Strict-Transport-Security: max-age=2592000\\r\\nx-operationid: 11111111111111111111111111111111\\r\\nx-telemetryid: 00-11111111111111111111111111111111-111111111111111b-00\\r\\nX-MSEdge-Ref: Ref A: 11111111111111111111111111111111 Ref B: BLUEDGE1111 Ref C: 2021-04-15T23:08:28Z\\r\\nDate: Thu, 15 Apr 2021 23:08:28 GMT\\r\\n, ErrorMessage : {\\\"errors\\\":[{\\\"message\\\":\\\"The group is already provisioned\\\",\\\"errorCode\\\":\\\"Unknown\\\"}],\\\"operationId\\\":\\\"11111111111111111111111111111111\\\"}\",\r\n \"innerError\": {\r\n \"date\": \"2021-04-15T23:08:28\",\r\n \"request-id\": \"11111111-1111-1111-1111-111111111111\",\r\n \"client-request-id\": \"11111111-1111-1111-1111-111111111111\"\r\n }\r\n }\r\n}"
|
||||
}
|
||||
end
|
||||
|
||||
it 'raises a TeamAlreadyExists error' do
|
||||
expect { subject }.to raise_error(MicrosoftSync::Errors::TeamAlreadyExists)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
describe '#list_users' do
|
||||
|
|
|
@ -267,6 +267,36 @@ module MicrosoftSync
|
|||
end
|
||||
end
|
||||
|
||||
context 'when the step returns a DelayedNextStep' do
|
||||
before do
|
||||
subject.send(:run, nil)
|
||||
allow(steps_object).to receive(:step_first).and_return \
|
||||
described_class::DelayedNextStep.new(:step_second, 1.minute, 'abc123')
|
||||
steps_object.steps_run.clear
|
||||
end
|
||||
|
||||
it 'enqueues a job for that step' do
|
||||
subject.send(:run, :step_first)
|
||||
expect(steps_object.steps_run).to eq([
|
||||
[:delay_run, [{run_at: 1.minute.from_now, strand: strand}], [:step_second]],
|
||||
])
|
||||
end
|
||||
|
||||
it 'leaves retries_by_step untouched' do
|
||||
expect { subject.send(:run, :step_first) }.not_to \
|
||||
change { state_record.reload.job_state[:retries_by_step] }.from('step_first' => 1)
|
||||
end
|
||||
|
||||
it 'sets job_state step, updated_at, and data' do
|
||||
Timecop.freeze(2.minutes.from_now) do
|
||||
expect { subject.send(:run, :step_first) }
|
||||
.to change { state_record.reload.job_state[:data] }.to('abc123')
|
||||
.and change { state_record.reload.job_state[:step] }.to(:step_second)
|
||||
.and change { state_record.reload.job_state[:updated_at] }.to(Time.zone.now)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
context 'when an unhandled error occurs' do
|
||||
it 'bubbles up the error, sets the record state to errored, and calls after_failure' do
|
||||
subject.send(:run, nil)
|
||||
|
|
|
@ -30,13 +30,21 @@ describe MicrosoftSync::SyncerSteps do
|
|||
let(:tenant) { 'mytenant123' }
|
||||
let(:sync_enabled) { true }
|
||||
|
||||
def expect_next_step(result, next_step, memory_state=nil)
|
||||
def expect_next_step(result, step, memory_state=nil)
|
||||
expect(result).to be_a(MicrosoftSync::StateMachineJob::NextStep)
|
||||
expect { syncer_steps.method(next_step.to_sym) }.to_not raise_error
|
||||
expect(result.next_step).to eq(next_step)
|
||||
expect { syncer_steps.method(step.to_sym) }.to_not raise_error
|
||||
expect(result.step).to eq(step)
|
||||
expect(result.memory_state).to eq(memory_state)
|
||||
end
|
||||
|
||||
def expect_delayed_next_step(result, step, delay_amount, job_state_data=nil)
|
||||
expect(result).to be_a(MicrosoftSync::StateMachineJob::DelayedNextStep)
|
||||
expect { syncer_steps.method(step.to_sym) }.to_not raise_error
|
||||
expect(result.step).to eq(step)
|
||||
expect(result.delay_amount).to eq(delay_amount)
|
||||
expect(result.job_state_data).to eq(job_state_data)
|
||||
end
|
||||
|
||||
def expect_retry(result, error_class:, delay_amount: nil, job_state_data: nil)
|
||||
expect(result).to be_a(MicrosoftSync::StateMachineJob::Retry)
|
||||
expect(result.error.class).to eq(error_class)
|
||||
|
@ -116,7 +124,9 @@ describe MicrosoftSync::SyncerSteps do
|
|||
expect(graph_service_helpers).to \
|
||||
receive(:create_education_class).with(course).and_return('id' => 'newid')
|
||||
|
||||
expect_next_step(subject, :step_update_group_with_course_data, 'newid')
|
||||
expect_delayed_next_step(
|
||||
subject, :step_update_group_with_course_data, 2.seconds, 'newid'
|
||||
)
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -126,7 +136,9 @@ describe MicrosoftSync::SyncerSteps do
|
|||
it 'goes to the "update group" step with the remote group ID' do
|
||||
expect(graph_service_helpers).to_not receive(:create_education_class)
|
||||
|
||||
expect_next_step(subject, :step_update_group_with_course_data, 'newid2')
|
||||
expect_delayed_next_step(
|
||||
subject, :step_update_group_with_course_data, 2.seconds, 'newid2'
|
||||
)
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -190,7 +202,10 @@ describe MicrosoftSync::SyncerSteps do
|
|||
end
|
||||
|
||||
describe '#step_update_group_with_course_data' do
|
||||
shared_examples_for 'updating group with course data' do
|
||||
subject do
|
||||
syncer_steps.step_update_group_with_course_data(nil, 'newid')
|
||||
end
|
||||
|
||||
context 'on success' do
|
||||
it 'updates the LMS metadata, sets ms_group_id, and goes to the next step' do
|
||||
expect(graph_service_helpers).to \
|
||||
|
@ -224,21 +239,6 @@ describe MicrosoftSync::SyncerSteps do
|
|||
end
|
||||
end
|
||||
|
||||
context 'when run for the first time with a new_group_id' do
|
||||
subject { syncer_steps.step_update_group_with_course_data('newid', nil) }
|
||||
|
||||
it_behaves_like 'updating group with course data'
|
||||
end
|
||||
|
||||
context 'when retrying' do
|
||||
subject do
|
||||
syncer_steps.step_update_group_with_course_data(nil, 'newid')
|
||||
end
|
||||
|
||||
it_behaves_like 'updating group with course data'
|
||||
end
|
||||
end
|
||||
|
||||
describe '#step_ensure_enrollments_user_mappings_filled' do
|
||||
subject { syncer_steps.step_ensure_enrollments_user_mappings_filled(nil, nil) }
|
||||
|
||||
|
@ -370,7 +370,7 @@ describe MicrosoftSync::SyncerSteps do
|
|||
end
|
||||
end
|
||||
|
||||
describe '#execute_diff' do
|
||||
describe '#step_execute_diff' do
|
||||
subject { syncer_steps.step_execute_diff(diff, nil) }
|
||||
|
||||
let(:diff) { double('MembershipDiff') }
|
||||
|
@ -394,12 +394,12 @@ describe MicrosoftSync::SyncerSteps do
|
|||
expect(graph_service).to \
|
||||
receive(:add_users_to_group).with('mygroup', members: %w[o3])
|
||||
|
||||
expect_next_step(subject, :step_ensure_team_exists)
|
||||
expect_next_step(subject, :step_check_team_exists)
|
||||
end
|
||||
end
|
||||
|
||||
describe '#ensure_team_exists' do
|
||||
subject { syncer_steps.step_ensure_team_exists(nil, nil) }
|
||||
describe '#step_check_team_exists' do
|
||||
subject { syncer_steps.step_check_team_exists(nil, nil) }
|
||||
|
||||
before { group.update!(ms_group_id: 'mygroupid') }
|
||||
|
||||
|
@ -409,23 +409,29 @@ describe MicrosoftSync::SyncerSteps do
|
|||
e.destroy if e.type =~ /^Teacher|Ta|Designer/
|
||||
end
|
||||
expect(graph_service).to_not receive(:team_exists?)
|
||||
expect(graph_service).to_not receive(:create_team)
|
||||
expect(subject).to eq(MicrosoftSync::StateMachineJob::COMPLETE)
|
||||
end
|
||||
end
|
||||
|
||||
context 'when the team already exists' do
|
||||
it "doesn't create a team" do
|
||||
it "returns COMPLETE" do
|
||||
expect(graph_service).to receive(:team_exists?).with('mygroupid').and_return(true)
|
||||
expect(graph_service).to_not receive(:create_team)
|
||||
expect(subject).to eq(MicrosoftSync::StateMachineJob::COMPLETE)
|
||||
end
|
||||
end
|
||||
|
||||
context "when the team doesn't exist" do
|
||||
before do
|
||||
allow(graph_service).to receive(:team_exists?).with('mygroupid').and_return(false)
|
||||
it "moves on to step_create_team after a delay" do
|
||||
expect(graph_service).to receive(:team_exists?).with('mygroupid').and_return(false)
|
||||
expect_delayed_next_step(subject, :step_create_team, 10.seconds)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
describe '#step_create_team' do
|
||||
subject { syncer_steps.step_create_team(nil, nil) }
|
||||
|
||||
before { group.update!(ms_group_id: 'mygroupid') }
|
||||
|
||||
it 'creates the team' do
|
||||
expect(graph_service).to receive(:create_education_class_team).with('mygroupid')
|
||||
|
@ -434,8 +440,8 @@ describe MicrosoftSync::SyncerSteps do
|
|||
|
||||
context 'when the Microsoft API errors with "group has no owners"' do
|
||||
it "retries in (30, 90, 270 seconds)" do
|
||||
expect(graph_service).to receive(:create_education_class_team).with('mygroupid').
|
||||
and_raise(MicrosoftSync::Errors::GroupHasNoOwners)
|
||||
expect(graph_service).to receive(:create_education_class_team).with('mygroupid')
|
||||
.and_raise(MicrosoftSync::Errors::GroupHasNoOwners)
|
||||
expect_retry(
|
||||
subject,
|
||||
error_class: MicrosoftSync::Errors::GroupHasNoOwners,
|
||||
|
@ -459,11 +465,10 @@ describe MicrosoftSync::SyncerSteps do
|
|||
context 'when the Microsoft API errors with some other error' do
|
||||
it "bubbles up the error" do
|
||||
expect(graph_service).to \
|
||||
receive(:create_education_class_team).with('mygroupid').
|
||||
and_raise(new_http_error(400))
|
||||
receive(:create_education_class_team).with('mygroupid')
|
||||
.and_raise(new_http_error(400))
|
||||
expect { subject }.to raise_error(MicrosoftSync::Errors::HTTPBadRequest)
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
Loading…
Reference in New Issue