smart search: copy embeddings in course copy/blueprint sync

test plan:
 - have a course with searchable objects of each type
   (page, assignment, discussion topic, announcement)
 - do a search from this course to ensure the source
   course embeddings are up to date
 - copy the course to a destination course where the
   smart search feature flag is enabled
 - run a search in the destination course and confirm
   the progress bar quickly finishes if you see it at all
   (we still have to check it because it's possible to
   do a course copy into an existing shell that is
   missing embeddings)
 - copy the course into a destination course where the
   smart search feature flag is *not* enabled
 - verify there are no embeddings created or copied
   into the new destination

flag=smart_search
closes ADV-57

Change-Id: Ia8d040506359d2c4bd7718e19ba135c04a458af2
Reviewed-on: https://gerrit.instructure.com/c/canvas-lms/+/348353
Reviewed-by: Mysti Lilla <mysti@instructure.com>
Reviewed-by: Jonathan Featherstone <jfeatherstone@instructure.com>
Tested-by: Service Cloud Jenkins <svc.cloudjenkins@instructure.com>
QA-Review: Jeremy Stanley <jeremy@instructure.com>
Product-Review: Jeremy Stanley <jeremy@instructure.com>
This commit is contained in:
Jeremy Stanley 2024-05-24 14:07:31 -06:00
parent 669c4cabac
commit 65f566dbb3
6 changed files with 225 additions and 16 deletions

View File

@ -644,6 +644,7 @@ class ContentMigration < ActiveRecord::Base
import!(data)
process_master_deletions(deletions.slice("LearningOutcome", "AssignmentGroup")) if deletions
SmartSearch.copy_embeddings(self) if for_master_course_import?
unless import_immediately?
update_import_progress(100)
@ -1354,6 +1355,53 @@ class ContentMigration < ActiveRecord::Base
save!
end
# this is a much-simplified subset of asset_id_mapping that includes only ids,
# for only the items imported in this migration
def imported_asset_id_map
return nil unless imported? && for_course_copy?
mapping = {}
master_template = migration_type == "master_course_import" &&
master_course_subscription&.master_template
global_ids = master_template.present? || use_global_identifiers?
migration_settings[:imported_assets].each do |asset_type, dest_ids|
klass = asset_type.constantize
next unless klass.column_names.include? "migration_id"
dest_ids = dest_ids.split(",").map(&:to_i)
mig_id_to_dest_id = context.shard.activate do
scope = klass.where(context:, id: dest_ids)
scope = scope.only_discussion_topics if asset_type == "DiscussionTopic"
scope.where.not(migration_id: nil)
.pluck(:migration_id, :id)
.to_h
end
mapping[asset_type] ||= {}
if master_template
master_template.master_content_tags
.where(migration_id: mig_id_to_dest_id.keys)
.pluck(:content_id, :migration_id)
.each do |src_id, mig_id|
mapping[asset_type][src_id] = mig_id_to_dest_id[mig_id]
end
else
source_course.shard.activate do
src_ids = klass.where(context: source_course).pluck(:id)
src_ids.each do |src_id|
asset_string = klass.asset_string(src_id)
mig_id = CC::CCHelper.create_key(asset_string, global: global_ids)
dest_id = mig_id_to_dest_id[mig_id]
mapping[asset_type][src_id] = dest_id if dest_id
end
end
end
end
mapping
end
def destination_hosts
return [] unless context

View File

@ -87,6 +87,10 @@ module MasterCourses::Restrictor
def mark_as_importing!(cm)
@importing_migration = cm if cm&.master_course_subscription
# if we are doing a course copy and the source course has up-to-date search embeddings,
# we will copy those embeddings in batches instead of regenerating them
self.skip_embeddings = true if cm&.for_course_copy? && respond_to?(:skip_embeddings=) &&
SmartSearch.up_to_date?(cm.source_course)
end
def skip_downstream_changes!

View File

@ -52,6 +52,8 @@ class Canvas::Migration::Worker::CourseCopyWorker < Canvas::Migration::Worker::B
cm.update_import_progress(20)
cm.import_content
SmartSearch.copy_embeddings(cm)
cm.workflow_state = :imported
cm.save
cm.update_import_progress(100)

View File

@ -162,6 +162,10 @@ module SmartSearch
end
end
def up_to_date?(course)
smart_search_available?(course) && course.search_embedding_version == SmartSearch::EMBEDDING_VERSION
end
# returns [ready, progress]
# progress may be < 100 while ready if upgrading embeddings
def check_course(course)
@ -221,5 +225,45 @@ module SmartSearch
(indexed * 100.0 / total).to_i
end
def copy_embeddings(content_migration)
return unless content_migration.for_course_copy? &&
content_migration.source_course&.search_embedding_version == EMBEDDING_VERSION &&
SmartSearch.smart_search_available?(content_migration.context)
content_migration.imported_asset_id_map&.each do |class_name, id_mapping|
klass = class_name.constantize
next unless klass.respond_to?(:embedding_class)
fk = klass.embedding_foreign_key # i.e. :wiki_page_id
content_migration.context.shard.activate do
klass.embedding_class
.where(:version => EMBEDDING_VERSION, fk => id_mapping.values)
.in_batches
.delete_all
end
content_migration.source_course.shard.activate do
klass.embedding_class.where(:version => EMBEDDING_VERSION, fk => id_mapping.keys)
.find_in_batches(batch_size: 50) do |src_embeddings|
dest_embeddings = src_embeddings.map do |src_embedding|
{
:embedding => src_embedding.embedding,
fk => id_mapping[src_embedding[fk]],
:version => EMBEDDING_VERSION,
:root_account_id => content_migration.context.root_account_id,
:created_at => Time.now.utc,
:updated_at => Time.now.utc
}
end
content_migration.context.shard.activate do
klass.embedding_class.insert_all(dest_embeddings)
end
end
end
end
end
end
end

View File

@ -41,6 +41,8 @@ module SmartSearchable
include HtmlTextHelper
has_many :embeddings, class_name: embedding_class_name, inverse_of: table_name.singularize.to_sym
cattr_accessor :search_title_column, :search_body_column
attr_accessor :skip_embeddings
after_save :generate_embeddings, if: :should_generate_embeddings?
after_save :delete_embeddings, if: -> { deleted? && saved_change_to_workflow_state? }
end
@ -63,7 +65,7 @@ module SmartSearchable
end
def should_generate_embeddings?
return false if deleted?
return false if deleted? || skip_embeddings
return false unless SmartSearch.smart_search_available?(context)
saved_changes.key?(self.class.search_title_column) || saved_changes.key?(self.class.search_body_column) ||

View File

@ -21,26 +21,29 @@
require_relative "../spec_helper"
describe SmartSearch do
before do
skip "not available" unless ActiveRecord::Base.connection.table_exists?("wiki_page_embeddings")
allow(SmartSearch).to receive(:generate_embedding) { |input| input.chars.map(&:ord).fill(0, input.size...1024).slice(0...1024) }
allow(SmartSearch).to receive(:bedrock_client).and_return(double)
end
before :once do
course_factory
@course.wiki_pages.create! title: "red pandas", body: "foo " * 400
@course.assignments.create! name: "horse feathers", description: "..."
@course.assignments.create! name: "hungry hippos", description: "..."
@course.discussion_topics.create! title: "!!!", message: "..."
@course.announcements.create! title: "hear ye", message: "..."
@course.enable_feature! :smart_search
end
describe "#index_course" do
before do
skip "not available" unless ActiveRecord::Base.connection.table_exists?("wiki_page_embeddings")
allow(SmartSearch).to receive(:generate_embedding).and_return([1] * 1024)
end
before :once do
course_factory
@course.wiki_pages.create! title: "red pandas", body: "foo " * 400
@course.assignments.create! name: "horse feathers", description: "..."
@course.assignments.create! name: "hungry hippos", description: "..."
@course.discussion_topics.create! title: "!!!", message: "..."
@course.enable_feature! :smart_search
end
it "indexes a new course" do
SmartSearch.index_course(@course)
expect(WikiPageEmbedding.where(wiki_page_id: @course.wiki_pages.select(:id)).count).to eq 2
expect(AssignmentEmbedding.where(assignment_id: @course.assignments.select(:id)).count).to eq 2
expect(DiscussionTopicEmbedding.where(discussion_topic_id: @course.discussion_topics.select(:id)).count).to eq 1
expect(DiscussionTopicEmbedding.where(discussion_topic_id: @course.discussion_topics.select(:id)).count).to eq 2
end
it "indexes only missing items" do
@ -59,4 +62,110 @@ describe SmartSearch do
expect(@course.assignments.first.embeddings.first.version).to eq SmartSearch::EMBEDDING_VERSION
end
end
describe "course copy" do
def run_course_copy(copy_from, copy_to)
@cm = ContentMigration.new(context: copy_to,
source_course: copy_from,
migration_type: "course_copy_importer",
copy_options: { everything: "1" })
@cm.migration_settings[:import_immediately] = true
@cm.set_default_settings
@cm.save!
worker = Canvas::Migration::Worker::CourseCopyWorker.new
worker.perform(@cm)
end
before :once do
next unless ActiveRecord::Base.connection.table_exists?("wiki_page_embeddings")
@copy_from = @course
SmartSearch.index_course(@copy_from)
@copy_to = course_factory
end
context "destination does not enable smart search feature" do
it "does not create or copy embeddings" do
run_course_copy(@copy_from, @copy_to)
expect(WikiPageEmbedding.where(wiki_page_id: @copy_to.wiki_pages.select(:id)).count).to eq 0
expect(AssignmentEmbedding.where(assignment_id: @copy_to.assignments.select(:id)).count).to eq 0
expect(DiscussionTopicEmbedding.where(discussion_topic_id: @copy_to.discussion_topics.select(:id)).count).to eq 0
end
end
context "destination enables smart search feature" do
before :once do
@copy_to&.enable_feature! :smart_search
end
it "copies embeddings" do
# create some existing embeddings first so we can verify they are overwritten or ignored as appropriate
# we will simulate an assignment that was copied previously, and one that is not part of the migration
src_assignment = @copy_from.assignments.first
@copy_to.assignments.create! name: src_assignment.name, description: src_assignment.description, migration_id: CC::CCHelper.create_key(src_assignment, global: true)
distractor = @copy_to.assignments.create! name: "original", description: "..."
run_jobs
expect(distractor.embeddings.count).to eq 1
expect(SmartSearch).not_to receive(:generate_embedding)
run_course_copy(@copy_from, @copy_to)
expect(@copy_from.wiki_pages.first.embeddings.map(&:embedding).sort).to eq @copy_to.wiki_pages.first.embeddings.map(&:embedding).sort
expect(@copy_from.assignments.first.embeddings.map(&:embedding).sort).to eq @copy_to.assignments.first.embeddings.map(&:embedding).sort
expect(@copy_from.assignments.last.embeddings.map(&:embedding).sort).to eq @copy_to.assignments.last.embeddings.map(&:embedding).sort
expect(@copy_from.discussion_topics.only_discussion_topics.first.embeddings.map(&:embedding).sort).to eq @copy_to.discussion_topics.only_discussion_topics.first.embeddings.map(&:embedding).sort
expect(@copy_from.announcements.first.embeddings.map(&:embedding).sort).to eq @copy_to.announcements.first.embeddings.map(&:embedding).sort
expect(distractor.reload.embeddings.count).to eq 1
end
it "generates embeddings in the destination if the source course doesn't have them or is out of date" do
@copy_from.disable_feature! :smart_search
run_course_copy(@copy_from, @copy_to)
run_jobs
expect(SmartSearch).to have_received(:generate_embedding).at_least(6).times
expect(WikiPageEmbedding.where(wiki_page_id: @copy_to.wiki_pages.select(:id)).count).to eq 2
expect(AssignmentEmbedding.where(assignment_id: @copy_to.assignments.select(:id)).count).to eq 2
expect(DiscussionTopicEmbedding.where(discussion_topic_id: @copy_to.discussion_topics.select(:id)).count).to eq 2
end
end
end
describe "blueprint sync" do
def run_blueprint_sync(template)
@mm = MasterCourses::MasterMigration.start_new_migration!(template, nil)
run_jobs
@mm.reload
end
before :once do
next unless ActiveRecord::Base.connection.table_exists?("wiki_page_embeddings")
@blueprint = @course
@template = MasterCourses::MasterTemplate.set_as_master_course(@blueprint)
@assoc1 = course_factory
@assoc2 = course_factory
@template.add_child_course!(@assoc1)
@template.add_child_course!(@assoc2)
@blueprint.enable_feature! :smart_search
@assoc1.enable_feature! :smart_search
SmartSearch.index_course(@blueprint)
end
it "copies embeddings to destinations that enable smart search" do
expect(SmartSearch).not_to receive(:generate_embedding)
run_blueprint_sync(@template)
expect(@blueprint.wiki_pages.first.embeddings.map(&:embedding).sort).to eq @assoc1.wiki_pages.first.embeddings.map(&:embedding).sort
expect(@blueprint.assignments.first.embeddings.map(&:embedding).sort).to eq @assoc1.assignments.first.embeddings.map(&:embedding).sort
expect(@blueprint.assignments.last.embeddings.map(&:embedding).sort).to eq @assoc1.assignments.last.embeddings.map(&:embedding).sort
expect(@blueprint.discussion_topics.only_discussion_topics.first.embeddings.map(&:embedding).sort).to eq @assoc1.discussion_topics.only_discussion_topics.first.embeddings.map(&:embedding).sort
expect(@blueprint.announcements.first.embeddings.map(&:embedding).sort).to eq @assoc1.announcements.first.embeddings.map(&:embedding).sort
expect(@assoc2.wiki_pages.first.embeddings).to be_empty
expect(@assoc2.assignments.first.embeddings).to be_empty
expect(@assoc2.assignments.last.embeddings).to be_empty
expect(@assoc2.discussion_topics.only_discussion_topics.first.embeddings).to be_empty
expect(@assoc2.announcements.first.embeddings).to be_empty
end
end
end