MSFT Sync: wait to run steps likely to fail

* Don't try to update group properties immediately after creation, since it is likely to fail until the Microsoft API settles * Don't try to create a team the immediately after group creation, since it is also likely to fail until the Microsoft API creates refs INTEROP-6638 flag=microsoft_group_enrollments_syncing Test plan: - Delete existing groups on Microsoft side and run a syncer job. Watch logs to see that: 1. it enqueues a job for step_ensure_class_group_exists (delay of 2 seconds) before ever hitting the PATCH endpoint (you can see endpoints hit with logs looking like "MicrosoftSync::GraphClient: get ..."), and that when it does so, it updates the group to have workflow_state=retrying and job_state with empty retries_by_step 2. it enqueues a job for step_create_team (delay of 10 seconds) before ever hitting the endpoint to create the team (post https://graph.microsoft.com/v1.0/teams) - After it's done, check that the job completed successfully and a group and team were created on the Microsoft side. - From a console run group.syncer_job.steps_object.step_check_team_exists(nil, nil) again and make sure it returns COMPLETE. - From a console run group.syncer_job.steps_object.step_create_team(nil, nil) again and make sure it returns COMPLETE. - For completeness, run a sync job on the same group again to make sure a sync after completion still works. This time, step_update_group_with_course_data and step_create_team should never run. If you want, you run the job synchronously to see the steps easier. Change-Id: Ic2b697dd7cc0d1bbf75ad55d3593ee64975e3ab9 Reviewed-on: https://gerrit.instructure.com/c/canvas-lms/+/263012 Product-Review: Evan Battaglia <ebattaglia@instructure.com> Reviewed-by: Wagner Goncalves <wagner.goncalves@instructure.com> QA-Review: Wagner Goncalves <wagner.goncalves@instructure.com> Tested-by: Service Cloud Jenkins <svc.cloudjenkins@instructure.com>
2021-04-15 14:27:32 -04:00 · 2021-04-15 14:27:32 -04:00 · 0baf7b10ab
parent 4df7b6b1e9
commit 0baf7b10ab
7 changed files with 203 additions and 114 deletions
--- a/lib/microsoft_sync/errors.rb
+++ b/lib/microsoft_sync/errors.rb
@ -44,6 +44,7 @@ module MicrosoftSync

    class InvalidRemoteState < PublicError; end
    class GroupHasNoOwners < PublicError; end
+    class TeamAlreadyExists < PublicError; end

    # Makes public the status code but not anything about the response body.
    # The internal error message has the response body (truncated)
@ -57,7 +58,8 @@ module MicrosoftSync
      def self.subclasses_by_status_code
        @subclasses_by_status_code ||= {
          400 => HTTPBadRequest,
-          404 => HTTPNotFound
+          404 => HTTPNotFound,
+          409 => HTTPConflict,
        }
      end

@ -76,5 +78,6 @@ module MicrosoftSync

    class HTTPNotFound < HTTPInvalidStatus; end
    class HTTPBadRequest < HTTPInvalidStatus; end
+    class HTTPConflict < HTTPInvalidStatus; end
  end
 end
--- a/lib/microsoft_sync/graph_service.rb
+++ b/lib/microsoft_sync/graph_service.rb
@ -118,11 +118,13 @@ module MicrosoftSync
      }
      request(:post, 'teams', body: body)
    rescue MicrosoftSync::Errors::HTTPBadRequest => e
-      if e.response_body =~ /must have one or more owners in order to create a Team/
-        raise MicrosoftSync::Errors::GroupHasNoOwners
-      end
+      raise unless e.response_body =~ /must have one or more owners in order to create a Team/i

-      raise
+      raise MicrosoftSync::Errors::GroupHasNoOwners
+    rescue MicrosoftSync::Errors::HTTPConflict => e
+      raise unless e.response_body =~ /group is already provisioned/i
+
+      raise MicrosoftSync::Errors::TeamAlreadyExists
    end

    # === Users ===
--- a/lib/microsoft_sync/state_machine_job.rb
+++ b/lib/microsoft_sync/state_machine_job.rb
@ -92,13 +92,22 @@ module MicrosoftSync
    # Signals that this step of the job succeeded.
    # memory_state is passed into the next step.
    class NextStep
-      attr_reader :next_step, :memory_state
-      def initialize(next_step, memory_state=nil)
-        @next_step = next_step or raise InternalError
+      attr_reader :step, :memory_state
+      def initialize(step, memory_state=nil)
+        @step = step or raise InternalError
        @memory_state = memory_state
      end
    end

+    class DelayedNextStep
+      attr_reader :step, :delay_amount, :job_state_data
+      def initialize(step, delay_amount, job_state_data=nil)
+        @step = step or raise InternalError
+        @delay_amount = delay_amount
+        @job_state_data = job_state_data
+      end
+    end
+
    class Complete; end

    # Return this when your job is done:
@ -152,7 +161,7 @@ module MicrosoftSync
      step_from_job_state = job_state&.dig(:step)

      # Normal case: job continuation, or new job (step==nil) and no other job in-progress
-      if step == step_from_job_state
+      if step&.to_s == step_from_job_state&.to_s
        return run_main_loop(step, job_state&.dig(:data), synchronous)
      end

@ -208,8 +217,11 @@ module MicrosoftSync
          steps_object.after_complete
          return
        when NextStep
-          current_step, memory_state = result.next_step, result.memory_state
+          current_step, memory_state = result.step, result.memory_state
          job_state_data = nil
+        when DelayedNextStep
+          handle_delayed_next_step(result, synchronous)
+          return
        when Retry
          handle_retry(result, current_step, synchronous)
          return
@ -245,6 +257,25 @@ module MicrosoftSync
      steps_object.after_failure
    end

+    def update_state_record_to_retrying(new_job_state)
+      job_state_record&.update_unless_deleted(
+        workflow_state: :retrying,
+        job_state: new_job_state.merge(updated_at: Time.zone.now)
+      )
+    end
+
+    def handle_delayed_next_step(delayed_next_step, synchronous)
+      return unless update_state_record_to_retrying(
+        step: delayed_next_step.step,
+        data: delayed_next_step.job_state_data,
+        retries_by_step: job_state_record.reload.job_state&.dig(:retries_by_step),
+      )
+
+      run_with_delay(
+        delayed_next_step.step, delayed_next_step.delay_amount, synchronous: synchronous
+      )
+    end
+
    # Raises the error if we have passed the retry limit
    # Does nothing if workflow_state has since been set to deleted
    # Otherwise sets the job_state to keep track of (step, data, retries) and
@ -262,17 +293,12 @@ module MicrosoftSync

      retry_object.stash_block&.call

-      new_job_state = {
+      return unless update_state_record_to_retrying(
        step: current_step,
-        updated_at: Time.zone.now,
        data: retry_object.job_state_data,
        retries_by_step: retries_by_step.merge(current_step.to_s => retries + 1),
        # for debugging only:
        retried_on_error: "#{retry_object.error.class}: #{retry_object.error.message}",
-      }
-
-      return unless job_state_record&.update_unless_deleted(
-        workflow_state: :retrying, job_state: new_job_state
      )

      delay_amount = retry_object.delay_amount
--- a/lib/microsoft_sync/syncer_steps.rb
+++ b/lib/microsoft_sync/syncer_steps.rb
@ -97,11 +97,12 @@ module MicrosoftSync
        new_group_id = graph_service_helpers.create_education_class(course)['id']
      end

-      StateMachineJob::NextStep.new(:step_update_group_with_course_data, new_group_id)
+      StateMachineJob::DelayedNextStep.new(
+        :step_update_group_with_course_data, 2.seconds, new_group_id
+      )
    end

-    def step_update_group_with_course_data(group_id_from_mem, group_id_from_state)
-      group_id = group_id_from_mem || group_id_from_state
+    def step_update_group_with_course_data(_mem_state, group_id)
      graph_service_helpers.update_group_with_course_data(group_id, course)
      group.update! ms_group_id: group_id
      StateMachineJob::NextStep.new(:step_ensure_enrollments_user_mappings_filled)
@ -168,24 +169,31 @@ module MicrosoftSync
        graph_service.remove_group_owner(group.ms_group_id, aad)
      end

-      StateMachineJob::NextStep.new(:step_ensure_team_exists)
+      StateMachineJob::NextStep.new(:step_check_team_exists)
    end

-    def step_ensure_team_exists(_mem_data, _job_state_data)
+    def step_check_team_exists(_mem_data, _job_state_data)
      if course.enrollments.where(type: MembershipDiff::OWNER_ENROLLMENT_TYPES).any? \
        && !graph_service.team_exists?(group.ms_group_id)
-        graph_service.create_education_class_team(group.ms_group_id)
+        StateMachineJob::DelayedNextStep.new(:step_create_team, 10.seconds)
+      else
+        StateMachineJob::COMPLETE
      end
+    end

+    def step_create_team(_mem_data, _job_state_data)
+      graph_service.create_education_class_team(group.ms_group_id)
+      StateMachineJob::COMPLETE
+    rescue MicrosoftSync::Errors::TeamAlreadyExists
      StateMachineJob::COMPLETE
    rescue MicrosoftSync::Errors::GroupHasNoOwners, MicrosoftSync::Errors::HTTPNotFound => e
      # API is eventually consistent: We often have to wait a couple minutes
      # after creating the group and adding owners for the Teams API to see the
      # group and owners.
      # It's also possible for the course to have added owners (so the
-      # enrollments are in the DB) since we last calculated the diff added them
+      # enrollments are in the DB) since we last calculated the diff and added them
      # in the generate_diff step. This is rare, but we can also sleep in that
-      # case.
+      # case. We'll eventually fail but the team will be created next time we sync.
      StateMachineJob::Retry.new(error: e, delay_amount: [30, 90, 270])
    end

--- a/spec/lib/microsoft_sync/graph_service_spec.rb
+++ b/spec/lib/microsoft_sync/graph_service_spec.rb
@ -59,14 +59,16 @@ describe MicrosoftSync::GraphService do
      end
    end

-    context 'with a non-200 status code' do
-      let(:response) { json_response(403, error: {message: 'uh-oh!'}) }
+    [400, 403, 409].each do |code|
+      context "with a #{code} status code" do
+        let(:response) { json_response(code, error: {message: 'uh-oh!'}) }

-      it 'raises an HTTPInvalidStatus with the code and message' do
-        expect { subject }.to raise_error(
-          MicrosoftSync::Errors::HTTPInvalidStatus,
-          /Graph service returned 403 for tenant mytenant.*uh-oh!/
-        )
+        it 'raises an HTTPInvalidStatus with the code and message' do
+          expect { subject }.to raise_error(
+            MicrosoftSync::Errors::HTTPInvalidStatus,
+            /Graph service returned #{code} for tenant mytenant.*uh-oh!/
+          )
+        end
      end
    end
  end
@ -394,6 +396,19 @@ describe MicrosoftSync::GraphService do
        expect { subject }.to raise_error(MicrosoftSync::Errors::GroupHasNoOwners)
      end
    end
+
+    context 'when Microsoft returns a 409 saying "group is already provisioned"' do
+      let(:response) do
+        {
+          status: 409,
+          body: "{\r\n  \"error\": {\r\n    \"code\": \"Conflict\",\r\n    \"message\": \"Failed to execute Templates backend request CreateTeamFromGroupWithTemplateRequest. Request Url: https://teams.microsoft.com/fabric/amer/templates/api/groups/16786176-b111-1111-1111-111111111110/team, Request Method: PUT, Response Status Code: Conflict, Response Headers: Strict-Transport-Security: max-age=2592000\\r\\nx-operationid: 11111111111111111111111111111111\\r\\nx-telemetryid: 00-11111111111111111111111111111111-111111111111111b-00\\r\\nX-MSEdge-Ref: Ref A: 11111111111111111111111111111111 Ref B: BLUEDGE1111 Ref C: 2021-04-15T23:08:28Z\\r\\nDate: Thu, 15 Apr 2021 23:08:28 GMT\\r\\n, ErrorMessage : {\\\"errors\\\":[{\\\"message\\\":\\\"The group is already provisioned\\\",\\\"errorCode\\\":\\\"Unknown\\\"}],\\\"operationId\\\":\\\"11111111111111111111111111111111\\\"}\",\r\n    \"innerError\": {\r\n      \"date\": \"2021-04-15T23:08:28\",\r\n      \"request-id\": \"11111111-1111-1111-1111-111111111111\",\r\n      \"client-request-id\": \"11111111-1111-1111-1111-111111111111\"\r\n    }\r\n  }\r\n}"
+        }
+      end
+
+      it 'raises a TeamAlreadyExists error' do
+        expect { subject }.to raise_error(MicrosoftSync::Errors::TeamAlreadyExists)
+      end
+    end
  end

  describe '#list_users' do
--- a/spec/lib/microsoft_sync/state_machine_job_spec.rb
+++ b/spec/lib/microsoft_sync/state_machine_job_spec.rb
@ -267,6 +267,36 @@ module MicrosoftSync
        end
      end

+      context 'when the step returns a DelayedNextStep' do
+        before do
+          subject.send(:run, nil)
+          allow(steps_object).to receive(:step_first).and_return \
+            described_class::DelayedNextStep.new(:step_second, 1.minute, 'abc123')
+          steps_object.steps_run.clear
+        end
+
+        it 'enqueues a job for that step' do
+          subject.send(:run, :step_first)
+          expect(steps_object.steps_run).to eq([
+            [:delay_run, [{run_at: 1.minute.from_now, strand: strand}], [:step_second]],
+          ])
+        end
+
+        it 'leaves retries_by_step untouched' do
+          expect { subject.send(:run, :step_first) }.not_to \
+            change { state_record.reload.job_state[:retries_by_step] }.from('step_first' => 1)
+        end
+
+        it 'sets job_state step, updated_at, and data' do
+          Timecop.freeze(2.minutes.from_now) do
+            expect { subject.send(:run, :step_first) }
+              .to change { state_record.reload.job_state[:data] }.to('abc123')
+              .and change { state_record.reload.job_state[:step] }.to(:step_second)
+              .and change { state_record.reload.job_state[:updated_at] }.to(Time.zone.now)
+          end
+        end
+      end
+
      context 'when an unhandled error occurs' do
        it 'bubbles up the error, sets the record state to errored, and calls after_failure' do
          subject.send(:run, nil)
--- a/spec/lib/microsoft_sync/syncer_steps_spec.rb
+++ b/spec/lib/microsoft_sync/syncer_steps_spec.rb
@ -30,13 +30,21 @@ describe MicrosoftSync::SyncerSteps do
  let(:tenant) { 'mytenant123' }
  let(:sync_enabled) { true }

-  def expect_next_step(result, next_step, memory_state=nil)
+  def expect_next_step(result, step, memory_state=nil)
    expect(result).to be_a(MicrosoftSync::StateMachineJob::NextStep)
-    expect { syncer_steps.method(next_step.to_sym) }.to_not raise_error
-    expect(result.next_step).to eq(next_step)
+    expect { syncer_steps.method(step.to_sym) }.to_not raise_error
+    expect(result.step).to eq(step)
    expect(result.memory_state).to eq(memory_state)
  end

+  def expect_delayed_next_step(result, step, delay_amount, job_state_data=nil)
+    expect(result).to be_a(MicrosoftSync::StateMachineJob::DelayedNextStep)
+    expect { syncer_steps.method(step.to_sym) }.to_not raise_error
+    expect(result.step).to eq(step)
+    expect(result.delay_amount).to eq(delay_amount)
+    expect(result.job_state_data).to eq(job_state_data)
+  end
+
  def expect_retry(result, error_class:, delay_amount: nil, job_state_data: nil)
    expect(result).to be_a(MicrosoftSync::StateMachineJob::Retry)
    expect(result.error.class).to eq(error_class)
@ -116,7 +124,9 @@ describe MicrosoftSync::SyncerSteps do
          expect(graph_service_helpers).to \
            receive(:create_education_class).with(course).and_return('id' => 'newid')

-          expect_next_step(subject, :step_update_group_with_course_data, 'newid')
+          expect_delayed_next_step(
+            subject, :step_update_group_with_course_data, 2.seconds, 'newid'
+          )
        end
      end

@ -126,7 +136,9 @@ describe MicrosoftSync::SyncerSteps do
        it 'goes to the "update group" step with the remote group ID' do
          expect(graph_service_helpers).to_not receive(:create_education_class)

-          expect_next_step(subject, :step_update_group_with_course_data, 'newid2')
+          expect_delayed_next_step(
+            subject, :step_update_group_with_course_data, 2.seconds, 'newid2'
+          )
        end
      end

@ -190,52 +202,40 @@ describe MicrosoftSync::SyncerSteps do
  end

  describe '#step_update_group_with_course_data' do
-    shared_examples_for 'updating group with course data' do
-      context 'on success' do
-        it 'updates the LMS metadata, sets ms_group_id, and goes to the next step' do
-          expect(graph_service_helpers).to \
-            receive(:update_group_with_course_data).with('newid', course)
+    subject do
+      syncer_steps.step_update_group_with_course_data(nil, 'newid')
+    end

-          expect { subject }.to change { group.reload.ms_group_id }.to('newid')
-          expect_next_step(subject, :step_ensure_enrollments_user_mappings_filled)
-        end
-      end
+    context 'on success' do
+      it 'updates the LMS metadata, sets ms_group_id, and goes to the next step' do
+        expect(graph_service_helpers).to \
+          receive(:update_group_with_course_data).with('newid', course)

-      context 'on 404' do
-        it 'retries with a delay' do
-          expect(graph_service_helpers).to \
-            receive(:update_group_with_course_data).with('newid', course)
-            .and_raise(new_http_error(404))
-          expect { subject }.to_not change { group.reload.ms_group_id }
-          expect_retry(
-            subject, error_class: MicrosoftSync::Errors::HTTPNotFound,
-            delay_amount: [5, 20, 100], job_state_data: 'newid'
-          )
-        end
-      end
-
-      context 'on other failure' do
-        it 'bubbles up the error' do
-          expect(graph_service_helpers).to \
-            receive(:update_group_with_course_data).with('newid', course)
-            .and_raise(new_http_error(400))
-          expect { subject }.to raise_error(MicrosoftSync::Errors::HTTPBadRequest)
-        end
+        expect { subject }.to change { group.reload.ms_group_id }.to('newid')
+        expect_next_step(subject, :step_ensure_enrollments_user_mappings_filled)
      end
    end

-    context 'when run for the first time with a new_group_id' do
-      subject { syncer_steps.step_update_group_with_course_data('newid', nil) }
-
-      it_behaves_like 'updating group with course data'
+    context 'on 404' do
+      it 'retries with a delay' do
+        expect(graph_service_helpers).to \
+          receive(:update_group_with_course_data).with('newid', course)
+          .and_raise(new_http_error(404))
+        expect { subject }.to_not change { group.reload.ms_group_id }
+        expect_retry(
+          subject, error_class: MicrosoftSync::Errors::HTTPNotFound,
+          delay_amount: [5, 20, 100], job_state_data: 'newid'
+        )
+      end
    end

-    context 'when retrying' do
-      subject do
-        syncer_steps.step_update_group_with_course_data(nil, 'newid')
+    context 'on other failure' do
+      it 'bubbles up the error' do
+        expect(graph_service_helpers).to \
+          receive(:update_group_with_course_data).with('newid', course)
+          .and_raise(new_http_error(400))
+        expect { subject }.to raise_error(MicrosoftSync::Errors::HTTPBadRequest)
      end
-
-      it_behaves_like 'updating group with course data'
    end
  end

@ -370,7 +370,7 @@ describe MicrosoftSync::SyncerSteps do
    end
  end

-  describe '#execute_diff' do
+  describe '#step_execute_diff' do
    subject { syncer_steps.step_execute_diff(diff, nil) }

    let(:diff) { double('MembershipDiff') }
@ -394,12 +394,12 @@ describe MicrosoftSync::SyncerSteps do
      expect(graph_service).to \
        receive(:add_users_to_group).with('mygroup', members: %w[o3])

-      expect_next_step(subject, :step_ensure_team_exists)
+      expect_next_step(subject, :step_check_team_exists)
    end
  end

-  describe '#ensure_team_exists' do
-    subject { syncer_steps.step_ensure_team_exists(nil, nil) }
+  describe '#step_check_team_exists' do
+    subject { syncer_steps.step_check_team_exists(nil, nil) }

    before { group.update!(ms_group_id: 'mygroupid') }

@ -409,60 +409,65 @@ describe MicrosoftSync::SyncerSteps do
          e.destroy if e.type =~ /^Teacher|Ta|Designer/
        end
        expect(graph_service).to_not receive(:team_exists?)
-        expect(graph_service).to_not receive(:create_team)
        expect(subject).to eq(MicrosoftSync::StateMachineJob::COMPLETE)
      end
    end

    context 'when the team already exists' do
-      it "doesn't create a team" do
+      it "returns COMPLETE" do
        expect(graph_service).to receive(:team_exists?).with('mygroupid').and_return(true)
-        expect(graph_service).to_not receive(:create_team)
        expect(subject).to eq(MicrosoftSync::StateMachineJob::COMPLETE)
      end
    end

    context "when the team doesn't exist" do
-      before do
-        allow(graph_service).to receive(:team_exists?).with('mygroupid').and_return(false)
+      it "moves on to step_create_team after a delay" do
+        expect(graph_service).to receive(:team_exists?).with('mygroupid').and_return(false)
+        expect_delayed_next_step(subject, :step_create_team, 10.seconds)
      end
+    end
+  end

-      it 'creates the team' do
+  describe '#step_create_team' do
+    subject { syncer_steps.step_create_team(nil, nil) }
+
+    before { group.update!(ms_group_id: 'mygroupid') }
+
+    it 'creates the team' do
+      expect(graph_service).to receive(:create_education_class_team).with('mygroupid')
+      expect(subject).to eq(MicrosoftSync::StateMachineJob::COMPLETE)
+    end
+
+    context 'when the Microsoft API errors with "group has no owners"' do
+      it "retries in (30, 90, 270 seconds)" do
        expect(graph_service).to receive(:create_education_class_team).with('mygroupid')
-        expect(subject).to eq(MicrosoftSync::StateMachineJob::COMPLETE)
+          .and_raise(MicrosoftSync::Errors::GroupHasNoOwners)
+        expect_retry(
+          subject,
+          error_class: MicrosoftSync::Errors::GroupHasNoOwners,
+          delay_amount: [30, 90, 270]
+        )
      end
+    end

-      context 'when the Microsoft API errors with "group has no owners"' do
-        it "retries in (30, 90, 270 seconds)" do
-          expect(graph_service).to receive(:create_education_class_team).with('mygroupid').
-            and_raise(MicrosoftSync::Errors::GroupHasNoOwners)
-          expect_retry(
-            subject,
-            error_class: MicrosoftSync::Errors::GroupHasNoOwners,
-            delay_amount: [30, 90, 270]
-          )
-        end
+    context "when the Microsoft API errors with a 404 (e.g., group doesn't exist)" do
+      it "retries in (30, 90, 270 seconds)" do
+        expect(graph_service).to \
+          receive(:create_education_class_team).with('mygroupid').and_raise(new_http_error(404))
+        expect_retry(
+          subject,
+          error_class: MicrosoftSync::Errors::HTTPNotFound,
+          delay_amount: [30, 90, 270]
+        )
      end
+    end

-      context "when the Microsoft API errors with a 404 (e.g., group doesn't exist)" do
-        it "retries in (30, 90, 270 seconds)" do
-          expect(graph_service).to \
-            receive(:create_education_class_team).with('mygroupid').and_raise(new_http_error(404))
-          expect_retry(
-            subject,
-            error_class: MicrosoftSync::Errors::HTTPNotFound,
-            delay_amount: [30, 90, 270]
-          )
-        end
-      end
-
-      context 'when the Microsoft API errors with some other error' do
-        it "bubbles up the error" do
-          expect(graph_service).to \
-            receive(:create_education_class_team).with('mygroupid').
-            and_raise(new_http_error(400))
-          expect { subject }.to raise_error(MicrosoftSync::Errors::HTTPBadRequest)
-        end
+    context 'when the Microsoft API errors with some other error' do
+      it "bubbles up the error" do
+        expect(graph_service).to \
+          receive(:create_education_class_team).with('mygroupid')
+          .and_raise(new_http_error(400))
+        expect { subject }.to raise_error(MicrosoftSync::Errors::HTTPBadRequest)
      end
    end
  end