460 lines
19 KiB
Ruby
460 lines
19 KiB
Ruby
# frozen_string_literal: true
|
|
|
|
#
|
|
# Copyright (C) 2021 - present Instructure, Inc.
|
|
#
|
|
# This file is part of Canvas.
|
|
#
|
|
# Canvas is free software: you can redistribute it and/or modify it under
|
|
# the terms of the GNU Affero General Public License as published by the Free
|
|
# Software Foundation, version 3 of the License.
|
|
#
|
|
# Canvas is distributed in the hope that it will be useful, but WITHOUT ANY
|
|
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
|
|
# A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
|
|
# details.
|
|
#
|
|
# You should have received a copy of the GNU Affero General Public License along
|
|
# with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
#
|
|
|
|
#
|
|
# Code which syncs course enrollments to Microsoft groups/teams. This is a good
# starting place to understand the syncing process.
# See also the MicrosoftSync::Group model.
#
# Ideally this class shouldn't contain much job plumbing (retry logic, strands,
# state storage, etc.), but should focus on the business logic of what to do in
# each step of a sync. For job plumbing, see StateMachineJob. This class should
# normally be used by creating a StateMachineJob with it as the steps_object;
# see MicrosoftSync::Group#syncer_job:
#   group.syncer_job.run_later
#   group.syncer_job.run_synchronously # e.g. manually in a console
|
|
#
|
|
module MicrosoftSync
|
|
class SyncerSteps
|
|
# Number of users without AAD ids loaded from the database per batch. Should
# be an even multiple of GraphServiceHelpers::USERS_ULUVS_TO_AADS_BATCH_SIZE.
ENROLLMENTS_ULUV_FETCHING_BATCH_SIZE = 750

# Delays used for intermittent errors and to give Microsoft's
# eventually-consistent API time to settle.
STANDARD_RETRY_DELAY = [15, 60, 300].freeze
DELAY_BEFORE_UPDATE_GROUP = 8.seconds
DELAY_BEFORE_CREATE_TEAM = 24.seconds

# Upper bound on queued changes a partial sync will handle. The more changes
# there are, the more likely some are duplicative and cause retries when
# adding/removing, which makes a partial sync less efficient than simply doing
# a full sync. The number is fairly arbitrary, but it should be small enough
# that all the enrollments can be fetched from the DB in one go. Given a max
# course size of 25k, the worst case for triggering a full sync is (25k/20)
# extra requests to get the current user list, or about 3 extra requests per
# user added -- not too bad.
MAX_PARTIAL_SYNC_CHANGES = 500

# StatsD metric name prefix and the names derived from it.
STATSD_NAME = 'microsoft_sync.syncer_steps'
STATSD_NAME_SKIPPED_BATCHES = "#{STATSD_NAME}.skipped_batches"
STATSD_NAME_SKIPPED_TOTAL = "#{STATSD_NAME}.skipped_total"
|
|
|
|
# Raised when the root account has no tenant configured or has sync turned
# off. Can happen when a user disables sync at the account level while jobs
# are running.
class TenantMissingOrSyncDisabled < Errors::GracefulCancelError
  # User-facing (translated) description of the error.
  def self.public_message
    I18n.t(
      'Tenant missing or sync disabled. ' \
      'Check the Microsoft sync integration settings for the course and account.'
    )
  end
end
|
|
|
|
# Raised by step_ensure_class_group_exists when Microsoft reports more than
# one education class for the course.
class MultipleEducationClasses < Errors::GracefulCancelError
  # User-facing (translated) description of the error.
  def self.public_message
    I18n.t('Multiple Microsoft education classes already exist for the course.')
  end
end
|
|
|
|
# Raised (via raise_and_disable_group) when the course has more members than
# a Microsoft team may hold.
class MaxMemberEnrollmentsReached < Errors::GracefulCancelError
  # User-facing (translated) message template; %{max} is supplied by
  # #public_interpolated_values.
  def self.public_message
    I18n.t('Microsoft 365 allows a maximum of %{max} members in a team.')
  end

  # Values to interpolate into the public message.
  def public_interpolated_values
    member_limit = MicrosoftSync::MembershipDiff::MAX_ENROLLMENT_MEMBERS
    { max: member_limit }
  end
end
|
|
|
|
# Raised (via raise_and_disable_group) when the course has more owners than
# a Microsoft team may hold.
class MaxOwnerEnrollmentsReached < Errors::GracefulCancelError
  # User-facing (translated) message template; %{max} is supplied by
  # #public_interpolated_values.
  def self.public_message
    I18n.t('Microsoft 365 allows a maximum of %{max} owners in a team.')
  end

  # Values to interpolate into the public message.
  def public_interpolated_values
    owner_limit = MicrosoftSync::MembershipDiff::MAX_ENROLLMENT_OWNERS
    { max: owner_limit }
  end
end
|
|
|
|
# The group object handed to #initialize; every step reads its state from it.
attr_reader :group
|
|
|
|
# Makes group.course available directly as #course.
delegate :course, to: :group
|
|
|
|
# Builds the steps object for one group.
#
# @param group the group whose course is to be synced (exposed via #group)
def initialize(group)
  @group = group
end
|
|
|
|
# Retry limit for a step (presumably read by StateMachineJob -- confirm there).
#
# @return [Integer]
def max_retries
  3
end
|
|
|
|
# Longest delay allowed (presumably enforced by StateMachineJob -- confirm there).
#
# @return a six-hour duration
def max_delay
  6.hours
end
|
|
|
|
# Failure hook; intentionally does nothing at the moment.
def after_failure
  # We can clean up here e.g. (MicrosoftSync::GroupMember.delete_all)
  # when we have retry in getting owners & executing diff
end
|
|
|
|
# Completion hook: stamps the group with the current time as last_synced_at.
# Raises if the update fails (update!).
def after_complete
  group.update!(
    last_synced_at: Time.zone.now
  )
end
|
|
|
|
# Wraps an error in a StateMachineJob::Retry.
#
# For Errors::Throttled the delay is the error's retry_after_seconds (when
# present); otherwise STANDARD_RETRY_DELAY is used.
#
# @param e the error that triggered the retry
# @param extra_args additional keyword arguments forwarded to Retry.new
# @return [StateMachineJob::Retry]
def retry_object_for_error(e, **extra_args)
  throttle_delay = e.is_a?(Errors::Throttled) ? e.retry_after_seconds : nil

  StateMachineJob::Retry.new(
    error: e,
    delay_amount: throttle_delay || STANDARD_RETRY_DELAY,
    **extra_args
  )
end
|
|
|
|
# Entry step: routes to the partial sync when job_type is 'partial' (string
# or symbol), otherwise to the start of the full sync.
#
# @return [StateMachineJob::NextStep]
def step_initial(job_type, _job_state_data)
  first_step =
    if job_type.to_s == 'partial'
      :step_partial_sync
    else
      :step_full_sync_prerequisites
    end

  StateMachineJob::NextStep.new(first_step)
end
|
|
|
|
# Checks if there are too many enrollments to sync and, if so, aborts (the
# member limit is checked first, then the owner limit).
# Also deletes PartialSyncChanges that are unnecessary now that we will
# be processing all enrollments. (Deletes only records that have been
# replicated to the secondary. We read Enrollments from the secondary so we
# don't want to delete PartialSyncChanges corresponding to enrollments not
# yet replicated.)
def step_full_sync_prerequisites(_mem_data, _job_state_data)
  limit_error =
    if CanvasModelsHelpers.max_enrollment_members_reached?(course)
      MaxMemberEnrollmentsReached
    elsif CanvasModelsHelpers.max_enrollment_owners_reached?(course)
      MaxOwnerEnrollmentsReached
    end
  raise_and_disable_group(limit_error) if limit_error

  PartialSyncChange.delete_all_replicated_to_secondary_for_course(course.id)

  StateMachineJob::NextStep.new(:step_ensure_class_group_exists)
end
|
|
|
|
# Second step of a full sync. Makes sure exactly one education class (group)
# exists on the Microsoft side for the course.
#
# * If the only remote class is the one already stored on the group, move on
#   to filling in user mappings.
# * If more than one remote class exists, raise MultipleEducationClasses.
# * Otherwise reuse the single remote class, or create one, and schedule
#   step_update_group_with_course_data after DELAY_BEFORE_UPDATE_GROUP.
#
# Errors in Errors::INTERMITTENT become a retry.
def step_ensure_class_group_exists(_mem_data, _job_state_data)
  education_classes = graph_service_helpers.list_education_classes_for_course(course)
  remote_ids = education_classes.map { |education_class| education_class['id'] }
  known_id = group.ms_group_id

  # If we've created the group previously, we're good to go
  if known_id && remote_ids == [known_id]
    return StateMachineJob::NextStep.new(:step_ensure_enrollments_user_mappings_filled)
  end

  raise MultipleEducationClasses if remote_ids.length > 1

  # Create a group if needed. If there is already a group but we do not
  # have it in the Group record, use it but first update it with course
  # data in case it was never done.
  new_group_id = remote_ids.first || graph_service_helpers.create_education_class(course)['id']

  StateMachineJob::DelayedNextStep.new(
    :step_update_group_with_course_data,
    DELAY_BEFORE_UPDATE_GROUP,
    new_group_id
  )
rescue *Errors::INTERMITTENT => e
  retry_object_for_error(e)
end
|
|
|
|
# Pushes course data to the Microsoft group, then records the group id on the
# local group record and continues with the user-mapping step.
#
# Errors in Errors::INTERMITTENT_AND_NOTFOUND become a retry that carries
# group_id along as job_state_data.
#
# @param group_id the Microsoft group id passed on by the previous step
def step_update_group_with_course_data(_mem_state, group_id)
  graph_service_helpers.update_group_with_course_data(group_id, course)
  group.update!(ms_group_id: group_id)

  following_step = :step_ensure_enrollments_user_mappings_filled
  StateMachineJob::NextStep.new(following_step)
rescue *Errors::INTERMITTENT_AND_NOTFOUND => e
  retry_object_for_error(e, job_state_data: group_id)
end
|
|
|
|
# Gets users enrolled in the course, gets ULUVs (user lookup values, e.g.
# addresses, username) for them, looks up the AADs (Azure Active Directory
# object IDs -- Microsoft's internal ID for the user) from Microsoft, and
# writes the User->AAD mapping into the UserMapping table. If a user
# doesn't have whatever we use to build the ULUV (e.g. email or SIS id, as
# specified by the microsoft_sync_login_attribute Account setting), or
# Microsoft doesn't have a user for the calculated ULUV, skips that user.
#
# Users are processed in batches of ENROLLMENTS_ULUV_FETCHING_BATCH_SIZE.
# Errors in Errors::INTERMITTENT_AND_NOTFOUND become a retry.
def step_ensure_enrollments_user_mappings_filled(_mem_data, _job_state_data)
  MicrosoftSync::UserMapping.find_enrolled_user_ids_without_mappings(
    course: course,
    batch_size: ENROLLMENTS_ULUV_FETCHING_BATCH_SIZE
  ) { |unmapped_user_ids| ensure_user_mappings(unmapped_user_ids) }

  StateMachineJob::NextStep.new(:step_generate_diff)
rescue *Errors::INTERMITTENT_AND_NOTFOUND => e
  retry_object_for_error(e)
end
|
|
|
|
# Looks up AAD ids for the given users and stores the resulting mappings.
#
# ULUVs are computed for user_ids via UsersUluvsFinder, resolved to AAD ids
# in slices of GraphServiceHelpers::USERS_ULUVS_TO_AADS_BATCH_SIZE, and each
# slice's user_id => aad_id pairs (those with an AAD id) are bulk-inserted.
#
# @param user_ids ids of users lacking a mapping
def ensure_user_mappings(user_ids)
  users_and_uluvs = MicrosoftSync::UsersUluvsFinder.new(user_ids, group.root_account).call
  remote_attr = account_settings[:microsoft_sync_remote_attribute]
  slice_size = GraphServiceHelpers::USERS_ULUVS_TO_AADS_BATCH_SIZE

  # If some users in different slices have the same ULUVs, this could end up
  # looking up the same ULUV multiple times; but this should be very rare
  users_and_uluvs.each_slice(slice_size) do |pairs|
    uluvs = pairs.map(&:last)
    aads_by_uluv = graph_service_helpers.users_uluvs_to_aads(remote_attr, uluvs)
    # Users whose ULUV has no AAD id map to nil and are dropped by #compact.
    aads_by_user_id = pairs.to_h { |user_id, uluv| [user_id, aads_by_uluv[uluv]] }.compact

    # NOTE: root_account here must be the same (values loaded into memory at the same time)
    # as passed into UsersUluvsFinder AND as used in #tenant, for the "have settings changed?"
    # check to work. For example, using course.root_account here would NOT be correct.
    UserMapping.bulk_insert_for_root_account(group.root_account, aads_by_user_id)
  end
end
|
|
|
|
# Gets the group's members and owners from the API, combines them with the
# local enrollments (with AAD ids) for the course, and hands the resulting
# MembershipDiff to step_execute_diff.
#
# Errors in Errors::INTERMITTENT_AND_NOTFOUND become a retry.
def step_generate_diff(_mem_data, _job_state_data)
  helpers = graph_service_helpers
  remote_members = helpers.get_group_users_aad_ids(group.ms_group_id)
  remote_owners = helpers.get_group_users_aad_ids(group.ms_group_id, owners: true)

  diff = MembershipDiff.new(remote_members, remote_owners)

  UserMapping.enrollments_and_aads(course).find_each do |row|
    diff.set_local_member(row.aad_id, row.type)
  end

  StateMachineJob::NextStep.new(:step_execute_diff, diff)
rescue *Errors::INTERMITTENT_AND_NOTFOUND => e
  retry_object_for_error(e)
end
|
|
|
|
# Logs and counts users that a batch add/remove call skipped.
#
# @param type [Symbol] the operation, e.g. :add or :remove (used in the log
#   line and as a metric-name suffix)
# @param users hash whose values are lists of skipped users, or nil
def log_batch_skipped(type, users)
  # GraphService batch functions return nil if all successful
  return unless users

  n_total = users.values.sum(&:length)
  log_line = "#{self.class.name} (#{group.global_id}): " \
             "Skipping redundant #{type} for #{n_total}: #{users.to_json}"
  Rails.logger.warn(log_line)

  InstStatsd::Statsd.increment(
    "#{STATSD_NAME_SKIPPED_BATCHES}.#{type}", tags: { sync_type: sync_type }
  )
  InstStatsd::Statsd.count(
    "#{STATSD_NAME_SKIPPED_TOTAL}.#{type}", n_total, tags: { sync_type: sync_type }
  )
end
|
|
|
|
# Runs the API calls to add/remove users described by the diff, then moves on
# to step_check_team_exists.
#
# Raises Errors::MissingOwners when the diff has no local owners, and
# disables the group (raise_and_disable_group) when the diff reports that the
# member or owner limit is reached. Errors in
# Errors::INTERMITTENT_AND_NOTFOUND become a retry that restarts at
# step_generate_diff.
#
# @param diff the MembershipDiff produced by step_generate_diff
def step_execute_diff(diff, _job_state_data)
  # TODO: If there are no instructor enrollments, we actually want to
  # remove the group on the Microsoft side (INTEROP-6672)
  raise Errors::MissingOwners if diff.local_owners.empty?

  if diff.max_enrollment_members_reached?
    raise_and_disable_group(MaxMemberEnrollmentsReached)
  end
  if diff.max_enrollment_owners_reached?
    raise_and_disable_group(MaxOwnerEnrollmentsReached)
  end

  execute_diff(diff)

  StateMachineJob::NextStep.new(:step_check_team_exists)
rescue *Errors::INTERMITTENT_AND_NOTFOUND => e
  retry_object_for_error(e, step: :step_generate_diff)
end
|
|
|
|
# Records the error on the group, destroys the group, and raises the error.
#
# @param error_class the error class to instantiate (with no arguments)
# @raise an instance of error_class, always
def raise_and_disable_group(error_class)
  error = error_class.new

  # Need to manually update last_error; StateMachineJob won't do it since the group
  # will be in a 'deleted' state
  serialized_error = MicrosoftSync::Errors.serialize(error)
  group.update(last_error: serialized_error)
  group.destroy

  raise error
end
|
|
|
|
# Executes a MembershipDiff or PartialMembershipDiff -- removes and then adds
# users in batches.
#
# If removal raises Errors::MissingOwners, the additions are still performed
# and the error is then re-raised.
def execute_diff(diff)
  begin
    execute_diff_remove_users(diff)
    execute_diff_add_users(diff)
  rescue Errors::MissingOwners
    # If the group is close to the max number of users, we might need to
    # remove users first to make room for new users.
    # e.g.: group has 25000 users, course has 100 removed but 1 added. Need
    # to remove at least 1 user before we can add 1.
    #
    # However, Microsoft will not let you remove the last owner in a group.
    # So if a course has 1 owner and it is swapped out for a different owner,
    # we should add the new one first. This is a rare scenario and because
    # the Microsoft API is eventually consistent, we'd have to wait a bit to
    # remove the old owner. So just add the new owners, raise the error and
    # have them manually re-sync.
    execute_diff_add_users(diff)
    raise
  end
end
|
|
|
|
# Adds the diff's additions to the Microsoft group in slices of
# GraphService::GROUP_USERS_BATCH_SIZE, logging whatever each batch skipped.
#
# Quota errors from Microsoft disable the group via raise_and_disable_group:
# MembersQuotaExceeded -> MaxMemberEnrollmentsReached,
# OwnersQuotaExceeded -> MaxOwnerEnrollmentsReached.
def execute_diff_add_users(diff)
  batch_size = GraphService::GROUP_USERS_BATCH_SIZE

  diff.additions_in_slices_of(batch_size) do |members_and_owners|
    skipped_users = graph_service.add_users_to_group_ignore_duplicates(
      group.ms_group_id,
      **members_and_owners
    )
    log_batch_skipped(:add, skipped_users)
  end
rescue Errors::MembersQuotaExceeded
  raise_and_disable_group(MaxMemberEnrollmentsReached)
rescue Errors::OwnersQuotaExceeded
  raise_and_disable_group(MaxOwnerEnrollmentsReached)
end
|
|
|
|
# Removes the diff's removals from the Microsoft group in slices of
# GraphService::GROUP_USERS_BATCH_SIZE, logging whatever each batch skipped.
def execute_diff_remove_users(diff)
  batch_size = GraphService::GROUP_USERS_BATCH_SIZE

  diff.removals_in_slices_of(batch_size) do |members_and_owners|
    skipped_users = graph_service.remove_group_users_ignore_missing(
      group.ms_group_id,
      **members_and_owners
    )
    log_batch_skipped(:remove, skipped_users)
  end
end
|
|
|
|
# Decides whether a team still has to be created for the group.
#
# Schedules step_create_team (after DELAY_BEFORE_CREATE_TEAM) only when the
# course has at least one owner-type enrollment and Microsoft reports no team
# for the group; otherwise the sync is complete. The team lookup is skipped
# entirely when there are no owner enrollments.
#
# Errors in Errors::INTERMITTENT become a retry.
def step_check_team_exists(_mem_data, _job_state_data)
  owner_enrollments = course.enrollments.where(type: MembershipDiff::OWNER_ENROLLMENT_TYPES)
  return StateMachineJob::COMPLETE unless owner_enrollments.any?
  return StateMachineJob::COMPLETE if graph_service.team_exists?(group.ms_group_id)

  StateMachineJob::DelayedNextStep.new(:step_create_team, DELAY_BEFORE_CREATE_TEAM)
rescue *Errors::INTERMITTENT => e
  retry_object_for_error(e)
end
|
|
|
|
# Creates the education class team for the group and completes the sync.
#
# * TeamAlreadyExists is treated as success.
# * GroupHasNoOwners / HTTPNotFound are retried after 30, 90, then 270
#   seconds.
# * Errors in Errors::INTERMITTENT become a standard retry.
def step_create_team(_mem_data, _job_state_data)
  graph_service.create_education_class_team(group.ms_group_id)
  StateMachineJob::COMPLETE
rescue MicrosoftSync::Errors::TeamAlreadyExists
  StateMachineJob::COMPLETE
rescue MicrosoftSync::Errors::GroupHasNoOwners, MicrosoftSync::Errors::HTTPNotFound => e
  # API is eventually consistent: We often have to wait a couple minutes
  # after creating the group and adding owners for the Teams API to see the
  # group and owners.
  # It's also possible for the course to have added owners (so the
  # enrollments are in the DB) since we last calculated the diff and added them
  # in the generate_diff step. This is rare, but we can also sleep in that
  # case. We'll eventually fail but the team will be created next time we sync.
  propagation_delays = [30, 90, 270]
  StateMachineJob::Retry.new(error: e, delay_amount: propagation_delays)
rescue *Errors::INTERMITTENT => e
  retry_object_for_error(e)
end
|
|
|
|
# Loads up to MAX_PARTIAL_SYNC_CHANGES + 1 queued changes for the course (one
# more than the limit, so callers can tell that the limit was exceeded).
#
# Returns array of arrays of [user_id, change_enrollment_type, updated_at]
# [[234, 'member', timestamp1], [456, 'member', timestamp2], [456, 'owner', timestamp3]]
def load_partial_sync_changes
  fetch_limit = MAX_PARTIAL_SYNC_CHANGES + 1
  course_changes = PartialSyncChange.where(course: course).limit(fetch_limit)
  course_changes.pluck(:user_id, :enrollment_type, :updated_at)
end
|
|
|
|
# Applies the queued PartialSyncChanges for the course without listing the
# whole remote group.
#
# Returns one of:
# * StateMachineJob::NextStep(:step_full_sync_prerequisites) when the group
#   has no ms_group_id yet or more than MAX_PARTIAL_SYNC_CHANGES changes are
#   queued;
# * StateMachineJob::COMPLETE when nothing is queued or the diff was applied;
# * StateMachineJob::DelayedNextStep(:step_full_sync_prerequisites, delay)
#   when throttled -- delay is the error's retry_after_seconds, or the first
#   STANDARD_RETRY_DELAY entry when the error carries none;
# * a retry object for errors in Errors::INTERMITTENT_AND_NOTFOUND.
def step_partial_sync(_mem_state, _job_state)
  # Step 1. Kick off a full sync if we haven't created a group yet, or if
  # there are too many changes to effectively handle here.
  if group.ms_group_id.nil? ||
     (changes = load_partial_sync_changes).length > MAX_PARTIAL_SYNC_CHANGES
    InstStatsd::Statsd.increment("#{STATSD_NAME}.partial_into_full")
    return StateMachineJob::NextStep.new(:step_full_sync_prerequisites)
  end

  return StateMachineJob::COMPLETE if changes.empty?

  # Set sync_type before graph_service used (created) but after we may switch to full sync:
  self.sync_type = 'partial'

  # Step 2. ensure users have aad object ids:
  # changes_by_user_id is a hash from user_id ->
  # array of arrays of [user_id, change_enrollment_type, updated_at]:
  changes_by_user_id = changes.group_by(&:first)
  user_ids = changes_by_user_id.keys
  ensure_user_mappings(UserMapping.user_ids_without_mappings(user_ids, course.root_account_id))

  # Step 3. Generate PartialMembership diff by combining three mappings:
  #   user_id -> change type from PartialSyncChanges
  #   user_id <-> aad id from UserMappings
  #   user_id -> enrollment types from Enrollments (Enrollment.microsoft_sync_relevant)
  # We need to look at all Enrollments for each user, not just those of the change type
  # because the user could be removed as an owner but not as a member, etc.

  # e.g., { 123 => ['member', 'owner'], 1230000000000099 => ['owner']}
  users_to_msft_role_types = changes_by_user_id.transform_values { |chgs| chgs.map(&:second) }
  diff = PartialMembershipDiff.new(users_to_msft_role_types)

  mappings = UserMapping.where(root_account_id: course.root_account_id, user_id: user_ids)
                        .pluck(:user_id, :aad_id)
  mappings.each { |user_id, aad_id| diff.set_member_mapping(user_id, aad_id) }

  users_with_mappings = mappings.map(&:first)
  enrollments = Enrollment.microsoft_sync_relevant
                          .where(course: course, user_id: users_with_mappings)
                          .pluck(:user_id, :type)
  enrollments.each { |user_id, enrollment_type| diff.set_local_member(user_id, enrollment_type) }

  # Step 4. Execute diff and delete changes. Match additionally on
  # updated_at to avoid deleting any changes that have updated since we
  # started (and will get executed on the next run)
  diff.log_all_actions
  execute_diff(diff)
  PartialSyncChange
    .where(course: course)
    .with_values_in(%w[user_id enrollment_type updated_at], changes)
    .delete_all

  StateMachineJob::COMPLETE
rescue Errors::Throttled => e
  # If throttled, it's likely due to adds/removes. The fact we didn't
  # delete changes will mean retrying will cause us to fail more and probably
  # get throttled more. It will save quota to do a full sync, even if it means
  # incurring more read quota from getting the list of users in a group
  # (generally, cheaper).
  #
  # STANDARD_RETRY_DELAY is a list of per-attempt delays meant for Retry
  # objects. A DelayedNextStep takes one delay (as with
  # DELAY_BEFORE_UPDATE_GROUP / DELAY_BEFORE_CREATE_TEAM), so fall back to the
  # first entry rather than passing the whole array.
  full_sync_after = e.retry_after_seconds || STANDARD_RETRY_DELAY.first
  Rails.logger.info 'MicrosoftSync::SyncerSteps: partial sync throttled, ' \
                    "full sync in #{full_sync_after}"
  InstStatsd::Statsd.increment("#{STATSD_NAME}.partial_into_full_throttled")
  StateMachineJob::DelayedNextStep.new(:step_full_sync_prerequisites, full_sync_after)
rescue *Errors::INTERMITTENT_AND_NOTFOUND => e
  retry_object_for_error(e)
end
|
|
|
|
# Serialization hook: only the Group (an AR model, so really just the Group
# id) is written when a job is enqueued. All other instance variables are
# left out and get rebuilt when the job starts again.
#
# @param coder the coder to populate; receives the group under 'group'
def encode_with(coder)
  coder['group'] = @group
end
|
|
|
|
private
|
|
|
|
# Setter for the sync type; step_partial_sync assigns 'partial' through it.
attr_writer :sync_type
|
|
|
|
# Current sync type; 'full' unless one has been assigned.
#
# @return [String]
def sync_type
  @sync_type || 'full'
end
|
|
|
|
# The Microsoft tenant from the account settings (memoized).
#
# @raise [TenantMissingOrSyncDisabled] when sync is not enabled or no tenant
#   is configured
def tenant
  return @tenant if @tenant

  settings = account_settings
  sync_enabled = settings[:microsoft_sync_enabled]
  configured_tenant = settings[:microsoft_sync_tenant]
  raise TenantMissingOrSyncDisabled unless sync_enabled && configured_tenant

  @tenant = configured_tenant
end
|
|
|
|
# Settings of the group's root account, loaded once and then reused.
def account_settings
  @account_settings ||= begin
    root_account = group.root_account
    root_account.settings
  end
end
|
|
|
|
# GraphServiceHelpers for the current tenant and sync type (memoized).
# Reading #tenant raises TenantMissingOrSyncDisabled when sync is unavailable.
def graph_service_helpers
  @graph_service_helpers ||= begin
    current_tenant = tenant
    current_tenant && GraphServiceHelpers.new(current_tenant, sync_type: sync_type)
  end
end
|
|
|
|
# The graph service object exposed by graph_service_helpers (memoized).
def graph_service
  @graph_service ||= begin
    helpers = graph_service_helpers
    helpers.graph_service
  end
end
|
|
end
|
|
end
|