extract smart_search embeddings functionality from WikiPage

in preparation for supporting other object types

test plan:
 - smoke test smart search functionality

flag=smart_search
closes ADV-30

Change-Id: I6350a45ccfe905afb0723d119edc3d58d5a13507
Reviewed-on: https://gerrit.instructure.com/c/canvas-lms/+/342544
Tested-by: Service Cloud Jenkins <svc.cloudjenkins@instructure.com>
QA-Review: Jeremy Stanley <jeremy@instructure.com>
Product-Review: Jeremy Stanley <jeremy@instructure.com>
Reviewed-by: Jonathan Featherstone <jfeatherstone@instructure.com>
This commit is contained in:
Jeremy Stanley 2024-03-08 15:38:47 -07:00
parent 9a89ade898
commit 782895e6c0
6 changed files with 206 additions and 118 deletions

View File

@ -46,6 +46,9 @@ class WikiPage < ActiveRecord::Base
restrict_columns :state, [:workflow_state]
restrict_columns :availability_dates, [:publish_at]
include SmartSearchable
use_smart_search :title, :body
after_update :post_to_pandapub_when_revised
belongs_to :wiki, touch: true
@ -56,7 +59,6 @@ class WikiPage < ActiveRecord::Base
belongs_to :current_lookup, class_name: "WikiPageLookup"
has_many :wiki_page_lookups, inverse_of: :wiki_page
has_many :wiki_page_embeddings, inverse_of: :wiki_page
has_one :master_content_tag, class_name: "MasterCourses::MasterContentTag", inverse_of: :wiki_page
has_one :block_editor, as: :context, dependent: :destroy
accepts_nested_attributes_for :block_editor, allow_destroy: true
@ -77,8 +79,6 @@ class WikiPage < ActiveRecord::Base
if: proc { context.try(:conditional_release?) }
after_save :create_lookup, if: :should_create_lookup?
after_save :delete_lookups, if: -> { !Account.site_admin.feature_enabled?(:permanent_page_links) && saved_change_to_workflow_state? && deleted? }
after_save :generate_embeddings, if: :should_generate_embeddings?
after_save :delete_embeddings, if: -> { deleted? && saved_change_to_workflow_state? }
scope :starting_with_title, lambda { |title|
where("title ILIKE ?", "#{title}%")
@ -106,60 +106,6 @@ class WikiPage < ActiveRecord::Base
self.wiki_id ||= context.wiki_id || context.wiki.id
end
# Decides whether the after_save hook should (re)build this page's embeddings.
#
# Returns false for deleted pages and when smart search is unavailable for
# the page's context; otherwise truthy when the last save changed the body
# or the title, or moved the page out of the "deleted" workflow state.
def should_generate_embeddings?
  return false if deleted? || !SmartSearch.smart_search_available?(context)

  content_changed = saved_change_to_body? || saved_change_to_title?
  content_changed || (saved_change_to_workflow_state? && workflow_state_before_last_save == "deleted")
end
# Yields the page content in pieces suitable for embedding, each prefixed
# with the title and a newline.
#
# max_character_length - upper bound on the body characters per chunk
#                        (values below 1 are treated as 1).
#
# Content that fits within the limit is yielded as a single chunk (even when
# empty). Longer content is split at the last word boundary at or before the
# limit; when that boundary falls in the first half of the window (or there
# is none) the text is hard-split at the limit instead.
def chunk_content(max_character_length = 4000)
  limit = [max_character_length, 1].max
  remaining_text = body_text

  while remaining_text.length > limit
    split_at = remaining_text.rindex(/\b/, limit)
    # a zero offset would produce an empty chunk and never make progress
    split_at = limit if split_at.nil? || split_at.zero? || split_at < limit / 2

    # exclusive slice: a boundary at the *start* of a word must leave the
    # whole word for the next chunk rather than splitting off its first letter
    yield "#{title}\n#{remaining_text[0...split_at]}"

    remaining_text = remaining_text[split_at..].lstrip
    # nothing but whitespace was left over; don't emit a title-only chunk
    return if remaining_text.empty?
  end

  yield "#{title}\n#{remaining_text}"
end
# Plain-text rendering of the page body, produced by html_to_text.
def body_text
  raw_html = body
  html_to_text(raw_html)
end
# Rebuilds this page's embeddings from scratch: existing rows are removed
# first, then one wiki_page_embeddings row is created per content chunk.
def generate_embeddings
  delete_embeddings
  chunk_content do |text_chunk|
    wiki_page_embeddings.create!(embedding: SmartSearch.generate_embedding(text_chunk))
  end
end
# Register generate_embeddings with handle_asynchronously at low priority
# (presumably the delayed-jobs macro that queues the call as a background job;
# its definition is not shown here). index_course passes `synchronous: true` to it.
handle_asynchronously :generate_embeddings, priority: Delayed::LOW_PRIORITY
# Removes every embedding row belonging to this page. Does nothing (and
# returns nil) when the wiki_page_embeddings table does not exist.
def delete_embeddings
  if ActiveRecord::Base.connection.table_exists?("wiki_page_embeddings")
    # TODO: delete via the association once pgvector is available everywhere
    # (without :dependent, that would try to nullify the fk in violation of the constraint
    # but with :dependent, instances without pgvector would try to access the nonexistent table when a page is deleted)
    shard.activate { WikiPageEmbedding.where(wiki_page_id: self).delete_all }
  end
end
def context
if !association(:context).loaded? &&
association(:wiki).loaded? &&

View File

@ -17,7 +17,7 @@
# with this program. If not, see <http://www.gnu.org/licenses/>.
class WikiPageEmbedding < ApplicationRecord
belongs_to :wiki_page, inverse_of: :wiki_page_embeddings
belongs_to :wiki_page, inverse_of: :embeddings
has_neighbors :embedding # TODO: Implement has_neighbors on wikipage object instead (with through?)
extend RootAccountResolver

View File

@ -75,7 +75,7 @@ module SmartSearch
def index_course(course)
# index non-deleted pages (that have not already been indexed)
course.wiki_pages.not_deleted
.where.missing(:wiki_page_embeddings)
.where.missing(:embeddings)
.find_each do |page|
page.generate_embeddings(synchronous: true)
end

119
lib/smart_searchable.rb Normal file
View File

@ -0,0 +1,119 @@
# frozen_string_literal: true
# Copyright (C) 2024 - present Instructure, Inc.
#
# This file is part of Canvas.
#
# Canvas is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, version 3 of the License.
#
# Canvas is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
# A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <http://www.gnu.org/licenses/>.
# to add smart search to a model:
# 1. `include SmartSearchable`
# 2. `use_smart_search :title, :body` # replace with the actual column names in the model
# 3. create the embeddings model and table (with fk/index)
# 4. add to `index_course` in lib/smart_search.rb
# 5. add a query in SmartSearchController
# model requirements:
# 1. Course association
# 2. uses Workflow
# 3. SoftDeletable
module SmartSearchable
  # Hook run when the module is included: adds the ClassMethods below
  # (notably `use_smart_search`) to the including class.
  def self.included(klass)
    klass.class_eval do
      extend ClassMethods
    end
  end

  module ClassMethods
    # Wires smart search into the including model.
    #
    # title_column - name of the attribute holding the title
    # body_column  - name of the attribute holding the (HTML) body
    #
    # Declares the `embeddings` association, records the two column names
    # (as strings) on the class, and installs after_save callbacks that
    # regenerate or delete embeddings.
    def use_smart_search(title_column, body_column)
      class_eval do
        include HtmlTextHelper
        has_many :embeddings, class_name: embedding_class_name, inverse_of: table_name.singularize.to_sym
        cattr_accessor :smart_search_title_column, :smart_search_body_column

        after_save :generate_embeddings, if: :should_generate_embeddings?
        after_save :delete_embeddings, if: -> { deleted? && saved_change_to_workflow_state? }
      end

      self.smart_search_title_column = title_column.to_s
      self.smart_search_body_column = body_column.to_s
    end

    # Name of the embeddings model: the model's class_name with "Embedding" appended.
    def embedding_class_name
      "#{class_name}Embedding"
    end

    # The embeddings model class itself (memoized).
    def embedding_class
      @embedding_class ||= embedding_class_name.constantize
    end

    # Foreign key column on the embeddings table that points back at this
    # model, derived from the singularized table name (memoized).
    def embedding_foreign_key
      @embedding_fk ||= :"#{table_name.singularize}_id"
    end
  end

  # Decides whether the after_save hook should (re)build embeddings.
  #
  # Returns false for deleted records and when smart search is unavailable
  # for the record's context; otherwise truthy when the last save changed the
  # configured title or body column, or moved the record out of "deleted".
  def should_generate_embeddings?
    return false if deleted?
    return false unless SmartSearch.smart_search_available?(context)

    saved_changes.key?(self.class.smart_search_title_column) ||
      saved_changes.key?(self.class.smart_search_body_column) ||
      (saved_change_to_workflow_state? && workflow_state_before_last_save == "deleted")
  end

  # Rebuilds the record's embeddings from scratch: existing rows are removed
  # first, then one embedding row is created per content chunk.
  def generate_embeddings
    delete_embeddings
    chunk_content do |chunk|
      embedding = SmartSearch.generate_embedding(chunk)
      embeddings.create!(embedding:)
    end
  end
  handle_asynchronously :generate_embeddings, priority: Delayed::LOW_PRIORITY

  # Yields the record's text in pieces suitable for embedding, each prefixed
  # with the title and a newline.
  #
  # max_character_length - upper bound on the body characters per chunk
  #                        (values below 1 are treated as 1).
  #
  # Content that fits within the limit is yielded as a single chunk (even
  # when empty). Longer content is split at the last word boundary at or
  # before the limit; when that boundary falls in the first half of the
  # window (or there is none) the text is hard-split at the limit instead.
  def chunk_content(max_character_length = 4000)
    limit = [max_character_length, 1].max
    # interpolated below so that a nil title does not raise
    title = attributes[self.class.smart_search_title_column]
    remaining_text = body_text

    while remaining_text.length > limit
      split_at = remaining_text.rindex(/\b/, limit)
      # a zero offset would produce an empty chunk and never make progress
      split_at = limit if split_at.nil? || split_at.zero? || split_at < limit / 2

      # include the title in each chunk; the slice is exclusive so that a
      # boundary at the *start* of a word leaves the whole word for the next
      # chunk rather than splitting off its first letter
      yield "#{title}\n#{remaining_text[0...split_at]}"

      remaining_text = remaining_text[split_at..].lstrip
      # nothing but whitespace was left over; don't emit a title-only chunk
      return if remaining_text.empty?
    end

    yield "#{title}\n#{remaining_text}"
  end

  # Plain-text rendering of the configured body column, produced by html_to_text.
  def body_text
    html_to_text(attributes[self.class.smart_search_body_column])
  end

  # Removes every embedding row belonging to this record. Does nothing when
  # the embeddings table does not exist.
  def delete_embeddings
    return unless ActiveRecord::Base.connection.table_exists?(self.class.embedding_class.table_name)

    # TODO: delete via the association once pgvector is available everywhere
    # (without :dependent, that would try to nullify the fk in violation of the constraint
    # but with :dependent, instances without pgvector would try to access the nonexistent table when a page is deleted)
    shard.activate do
      self.class.embedding_class.where(self.class.embedding_foreign_key => self).delete_all
    end
  end
end

View File

@ -0,0 +1,82 @@
# frozen_string_literal: true
#
# Copyright (C) 2024 - present Instructure, Inc.
#
# This file is part of Canvas.
#
# Canvas is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, version 3 of the License.
#
# Canvas is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
# A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <http://www.gnu.org/licenses/>.
#
require_relative "../spec_helper"
# Specs for the SmartSearchable mixin, exercised through wiki pages created with wiki_page_model.
describe SmartSearchable do
describe "#generate_embeddings" do
# Per-example setup: skip when the embeddings table is absent, and stub the
# SmartSearch calls so every embedding is a fixed 1536-element vector.
before do
skip "not available" unless ActiveRecord::Base.connection.table_exists?("wiki_page_embeddings")
allow(SmartSearch).to receive(:generate_embedding).and_return([1] * 1536)
expect(SmartSearch).to receive(:api_key).at_least(:once).and_return("fake_api_key")
end
# Shared fixture: a course with the smart_search feature enabled.
before :once do
course_factory
@course.enable_feature! :smart_search
end
it "generates an embedding when creating a page" do
wiki_page_model(title: "test", body: "foo")
run_jobs
expect(@page.reload.embeddings.count).to eq 1
end
it "replaces an embedding if it already exists" do
wiki_page_model(title: "test", body: "foo")
run_jobs
@page.update body: "bar"
run_jobs
expect(@page.reload.embeddings.count).to eq 1
end
it "strips HTML from the body before indexing" do
wiki_page_model(title: "test", body: "<ul><li>foo</li></ul>")
# the embedded text is the title, a newline, then the plain-text body
expect(SmartSearch).to receive(:generate_embedding).with("test\n* foo")
run_jobs
end
it "deletes embeddings when a page is deleted (and regenerates them when undeleted)" do
wiki_page_model(title: "test", body: "foo")
run_jobs
@page.destroy
expect(@page.reload.embeddings.count).to eq 0
@page.restore
run_jobs
expect(@page.reload.embeddings.count).to eq 1
end
it "generates multiple embeddings for a page with long content" do
# 6000 characters with no spaces: exceeds the default 4000-character chunk size
wiki_page_model(title: "test", body: "foo" * 2000)
run_jobs
expect(@page.reload.embeddings.count).to eq 2
end
it "generates multiple embeddings and doesn't split words" do
# 7997 characters in total, would fit into two 4000-character chunks,
# but splitting on word boundaries will push it into 3
# NOTE(review): only the chunk count is asserted, not the chunk contents
wiki_page_model(title: "test", body: "testing123 " * 727)
run_jobs
expect(@page.reload.embeddings.count).to eq 3
end
end
end

View File

@ -1098,63 +1098,4 @@ describe WikiPage do
end
end
end
# Embedding-generation specs for WikiPage, using the wiki_page_embeddings association.
describe "#generate_embeddings" do
# Per-example setup: skip when the embeddings table is absent, and stub the
# SmartSearch calls so every embedding is a fixed 1536-element vector.
before do
skip "not available" unless ActiveRecord::Base.connection.table_exists?("wiki_page_embeddings")
allow(SmartSearch).to receive(:generate_embedding).and_return([1] * 1536)
expect(SmartSearch).to receive(:api_key).at_least(:once).and_return("fake_api_key")
end
# Shared fixture: a course with the smart_search feature enabled.
before :once do
course_factory
@course.enable_feature! :smart_search
end
it "generates an embedding when creating a page" do
wiki_page_model(title: "test", body: "foo")
run_jobs
expect(@page.reload.wiki_page_embeddings.count).to eq 1
end
it "replaces an embedding if it already exists" do
wiki_page_model(title: "test", body: "foo")
run_jobs
@page.update body: "bar"
run_jobs
expect(@page.reload.wiki_page_embeddings.count).to eq 1
end
it "strips HTML from the body before indexing" do
wiki_page_model(title: "test", body: "<ul><li>foo</li></ul>")
# the embedded text is the title, a newline, then the plain-text body
expect(SmartSearch).to receive(:generate_embedding).with("test\n* foo")
run_jobs
end
it "deletes embeddings when a page is deleted (and regenerates them when undeleted)" do
wiki_page_model(title: "test", body: "foo")
run_jobs
@page.destroy
expect(@page.reload.wiki_page_embeddings.count).to eq 0
@page.restore
run_jobs
expect(@page.reload.wiki_page_embeddings.count).to eq 1
end
it "generates multiple embeddings for a page with long content" do
# 6000 characters with no spaces: exceeds the default 4000-character chunk size
wiki_page_model(title: "test", body: "foo" * 2000)
run_jobs
expect(@page.reload.wiki_page_embeddings.count).to eq 2
end
it "generates multiple embeddings and doesn't split words" do
# 7997 characters in total, would fit into two 4000-character chunks,
# but splitting on word boundaries will push it into 3
# NOTE(review): only the chunk count is asserted, not the chunk contents
wiki_page_model(title: "test", body: "testing123 " * 727)
run_jobs
expect(@page.reload.wiki_page_embeddings.count).to eq 3
end
end
end