Extract smart_search embeddings functionality from WikiPage
in preparation for supporting other object types.

Test plan:
- smoke test smart search functionality

flag=smart_search
closes ADV-30

Change-Id: I6350a45ccfe905afb0723d119edc3d58d5a13507
Reviewed-on: https://gerrit.instructure.com/c/canvas-lms/+/342544
Tested-by: Service Cloud Jenkins <svc.cloudjenkins@instructure.com>
QA-Review: Jeremy Stanley <jeremy@instructure.com>
Product-Review: Jeremy Stanley <jeremy@instructure.com>
Reviewed-by: Jonathan Featherstone <jfeatherstone@instructure.com>
This commit is contained in:
parent
9a89ade898
commit
782895e6c0
|
@ -46,6 +46,9 @@ class WikiPage < ActiveRecord::Base
|
|||
restrict_columns :state, [:workflow_state]
|
||||
restrict_columns :availability_dates, [:publish_at]
|
||||
|
||||
include SmartSearchable
|
||||
use_smart_search :title, :body
|
||||
|
||||
after_update :post_to_pandapub_when_revised
|
||||
|
||||
belongs_to :wiki, touch: true
|
||||
|
@ -56,7 +59,6 @@ class WikiPage < ActiveRecord::Base
|
|||
|
||||
belongs_to :current_lookup, class_name: "WikiPageLookup"
|
||||
has_many :wiki_page_lookups, inverse_of: :wiki_page
|
||||
has_many :wiki_page_embeddings, inverse_of: :wiki_page
|
||||
has_one :master_content_tag, class_name: "MasterCourses::MasterContentTag", inverse_of: :wiki_page
|
||||
has_one :block_editor, as: :context, dependent: :destroy
|
||||
accepts_nested_attributes_for :block_editor, allow_destroy: true
|
||||
|
@ -77,8 +79,6 @@ class WikiPage < ActiveRecord::Base
|
|||
if: proc { context.try(:conditional_release?) }
|
||||
after_save :create_lookup, if: :should_create_lookup?
|
||||
after_save :delete_lookups, if: -> { !Account.site_admin.feature_enabled?(:permanent_page_links) && saved_change_to_workflow_state? && deleted? }
|
||||
after_save :generate_embeddings, if: :should_generate_embeddings?
|
||||
after_save :delete_embeddings, if: -> { deleted? && saved_change_to_workflow_state? }
|
||||
|
||||
scope :starting_with_title, lambda { |title|
|
||||
where("title ILIKE ?", "#{title}%")
|
||||
|
@ -106,60 +106,6 @@ class WikiPage < ActiveRecord::Base
|
|||
self.wiki_id ||= context.wiki_id || context.wiki.id
|
||||
end
|
||||
|
||||
# Decides whether the after_save hook should (re)build this page's embeddings.
#
# Never for a deleted page, and never when SmartSearch reports that smart
# search is unavailable for the page's context. Otherwise embeddings are
# rebuilt when the last save changed the body or the title, or when it moved
# the page out of the "deleted" workflow state.
def should_generate_embeddings?
  return false if deleted? || !SmartSearch.smart_search_available?(context)
  return true if saved_change_to_body? || saved_change_to_title?

  # page was just restored from the "deleted" state
  saved_change_to_workflow_state? && workflow_state_before_last_save == "deleted"
end
|
||||
|
||||
# Yields the page content as one or more embedding-sized chunks of text.
#
# The text returned by `body_text` is cut into pieces of at most
# `max_character_length` characters. Each cut is moved back to the closest
# word boundary, unless that boundary lies in the first half of the window,
# in which case the text is cut hard at the limit. Every yielded chunk is
# prefixed with the title and a newline, so the title is part of each chunk.
#
# @param max_character_length [Integer] maximum body characters per chunk
# @yieldparam chunk [String] the title, a newline, and one slice of the body
# @return [Enumerator] when called without a block
# @raise [ArgumentError] if max_character_length is not positive
def chunk_content(max_character_length = 4000)
  raise ArgumentError, "max_character_length must be positive" unless max_character_length.positive?
  return enum_for(__method__, max_character_length) unless block_given?

  remaining_text = body_text
  # Only text that is still too long gets split; whatever is left afterwards
  # fits in a single chunk and is emitted as-is below.
  while remaining_text.length > max_character_length
    split_at = remaining_text.rindex(/\b/, max_character_length)
    # No boundary found, or only one so early that the chunk would be mostly
    # empty (index 0 would make no progress at all): cut at the limit instead.
    split_at = max_character_length if split_at.nil? || split_at < [max_character_length / 2, 1].max

    # Exclusive range: the character sitting on the boundary opens the next
    # chunk, so a word that starts there is not cut after its first letter.
    yield "#{title}\n#{remaining_text[0...split_at]}"
    remaining_text = remaining_text[split_at..]
  end

  yield "#{title}\n#{remaining_text}"
end
|
||||
|
||||
# The page body passed through `html_to_text`, i.e. the text that is
# chunked and sent off for embedding instead of the raw HTML.
def body_text = html_to_text(body)
|
||||
|
||||
# Rebuilds the embeddings for this page from scratch: existing rows are
# removed first, then one embedding row is created per chunk that
# `chunk_content` yields.
def generate_embeddings
  delete_embeddings
  chunk_content do |text|
    wiki_page_embeddings.create!(embedding: SmartSearch.generate_embedding(text))
  end
end
# Registered with handle_asynchronously at low priority; callers elsewhere in
# this change pass `synchronous: true` to run it inline.
handle_asynchronously :generate_embeddings, priority: Delayed::LOW_PRIORITY
|
||||
|
||||
# Removes every embedding row that belongs to this page. Does nothing (and
# returns nil) when the wiki_page_embeddings table does not exist.
def delete_embeddings
  if ActiveRecord::Base.connection.table_exists?("wiki_page_embeddings")
    # TODO: delete via the association once pgvector is available everywhere
    # (without :dependent, that would try to nullify the fk in violation of the constraint
    # but with :dependent, instances without pgvector would try to access the nonexistent table when a page is deleted)
    shard.activate { WikiPageEmbedding.where(wiki_page_id: self).delete_all }
  end
end
|
||||
|
||||
def context
|
||||
if !association(:context).loaded? &&
|
||||
association(:wiki).loaded? &&
|
||||
|
|
|
@ -17,7 +17,7 @@
|
|||
# with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
class WikiPageEmbedding < ApplicationRecord
|
||||
belongs_to :wiki_page, inverse_of: :wiki_page_embeddings
|
||||
belongs_to :wiki_page, inverse_of: :embeddings
|
||||
has_neighbors :embedding # TODO: Implement has_neighbors on wikipage object instead (with through?)
|
||||
|
||||
extend RootAccountResolver
|
||||
|
|
|
@ -75,7 +75,7 @@ module SmartSearch
|
|||
def index_course(course)
|
||||
# index non-deleted pages (that have not already been indexed)
|
||||
course.wiki_pages.not_deleted
|
||||
.where.missing(:wiki_page_embeddings)
|
||||
.where.missing(:embeddings)
|
||||
.find_each do |page|
|
||||
page.generate_embeddings(synchronous: true)
|
||||
end
|
||||
|
|
|
@ -0,0 +1,119 @@
|
|||
# frozen_string_literal: true
|
||||
|
||||
# Copyright (C) 2024 - present Instructure, Inc.
|
||||
#
|
||||
# This file is part of Canvas.
|
||||
#
|
||||
# Canvas is free software: you can redistribute it and/or modify it under
|
||||
# the terms of the GNU Affero General Public License as published by the Free
|
||||
# Software Foundation, version 3 of the License.
|
||||
#
|
||||
# Canvas is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
|
||||
# A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
|
||||
# details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License along
|
||||
# with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
# to add smart search to a model:
|
||||
# 1. `include SmartSearchable`
|
||||
# 2. `use_smart_search :title, :body` # replace with the actual column names in the model
|
||||
# 3. create the embeddings model and table (with fk/index)
|
||||
# 4. add to `index_course` in lib/smart_search.rb
|
||||
# 5. add a query in SmartSearchController
|
||||
|
||||
# model requirements:
|
||||
# 1. Course association
|
||||
# 2. uses Workflow
|
||||
# 3. SoftDeletable
|
||||
|
||||
# Mixin that keeps a model's smart-search embeddings in sync with its content.
#
# Including the module adds the `use_smart_search` class macro. Calling that
# macro records which columns hold the title and body, declares the
# `embeddings` association, and installs after_save callbacks that regenerate
# or delete the embeddings.
module SmartSearchable
  # Include hook: makes the ClassMethods below available on the including class.
  def self.included(klass)
    klass.extend(ClassMethods)
  end

  module ClassMethods
    # Enables smart search for the model.
    #
    # @param title_column [Symbol, String] column holding the title
    # @param body_column [Symbol, String] column holding the HTML body
    def use_smart_search(title_column, body_column)
      class_eval do
        include HtmlTextHelper
        has_many :embeddings, class_name: embedding_class_name, inverse_of: table_name.singularize.to_sym
        cattr_accessor :smart_search_title_column, :smart_search_body_column
        after_save :generate_embeddings, if: :should_generate_embeddings?
        after_save :delete_embeddings, if: -> { deleted? && saved_change_to_workflow_state? }
      end
      # Stored as strings because they are used as keys into `saved_changes`
      # and `attributes` below.
      self.smart_search_title_column = title_column.to_s
      self.smart_search_body_column = body_column.to_s
    end

    # Name of the embeddings model: the model's `class_name` followed by
    # "Embedding". NOTE(review): relies on the including class responding to
    # `class_name`; that method is not defined in this module.
    def embedding_class_name
      "#{class_name}Embedding"
    end

    # The embeddings model class, resolved from its name once and memoized.
    def embedding_class
      @embedding_class ||= embedding_class_name.constantize
    end

    # Foreign-key column on the embeddings table that points back at this
    # model, derived from the singularized table name (memoized).
    def embedding_foreign_key
      @embedding_fk ||= :"#{table_name.singularize}_id"
    end
  end

  # Decides whether the after_save hook should (re)build the embeddings.
  #
  # Never for a deleted record, and never when SmartSearch reports that smart
  # search is unavailable for the record's context. Otherwise embeddings are
  # rebuilt when the last save changed the title or body column, or when it
  # moved the record out of the "deleted" workflow state.
  def should_generate_embeddings?
    return false if deleted?
    return false unless SmartSearch.smart_search_available?(context)

    saved_changes.key?(self.class.smart_search_title_column) || saved_changes.key?(self.class.smart_search_body_column) ||
      (saved_change_to_workflow_state? && workflow_state_before_last_save == "deleted")
  end

  # Rebuilds the embeddings from scratch: existing rows are removed first,
  # then one embedding row is created per chunk yielded by `chunk_content`.
  def generate_embeddings
    delete_embeddings
    chunk_content do |chunk|
      embedding = SmartSearch.generate_embedding(chunk)
      embeddings.create!(embedding:)
    end
  end
  handle_asynchronously :generate_embeddings, priority: Delayed::LOW_PRIORITY

  # Yields the record's content as one or more embedding-sized chunks of text.
  #
  # The text returned by `body_text` is cut into pieces of at most
  # `max_character_length` characters. Each cut is moved back to the closest
  # word boundary, unless that boundary lies in the first half of the window,
  # in which case the text is cut hard at the limit. Every yielded chunk is
  # prefixed with the title and a newline, so the title is part of each chunk.
  #
  # @param max_character_length [Integer] maximum body characters per chunk
  # @yieldparam chunk [String] the title, a newline, and one slice of the body
  # @return [Enumerator] when called without a block
  # @raise [ArgumentError] if max_character_length is not positive
  def chunk_content(max_character_length = 4000)
    raise ArgumentError, "max_character_length must be positive" unless max_character_length.positive?
    return enum_for(__method__, max_character_length) unless block_given?

    title = attributes[self.class.smart_search_title_column]
    remaining_text = body_text
    # Only text that is still too long gets split; whatever is left afterwards
    # fits in a single chunk and is emitted as-is below.
    while remaining_text.length > max_character_length
      split_at = remaining_text.rindex(/\b/, max_character_length)
      # No boundary found, or only one so early that the chunk would be mostly
      # empty (index 0 would make no progress at all): cut at the limit instead.
      split_at = max_character_length if split_at.nil? || split_at < [max_character_length / 2, 1].max

      # Exclusive range: the character sitting on the boundary opens the next
      # chunk, so a word that starts there is not cut after its first letter.
      yield "#{title}\n#{remaining_text[0...split_at]}"
      remaining_text = remaining_text[split_at..]
    end

    yield "#{title}\n#{remaining_text}"
  end

  # The configured body column passed through `html_to_text`, i.e. the text
  # that is chunked and embedded instead of the raw HTML.
  def body_text
    html_to_text(attributes[self.class.smart_search_body_column])
  end

  # Removes every embedding row that belongs to this record. Does nothing
  # when the embeddings table does not exist.
  def delete_embeddings
    return unless ActiveRecord::Base.connection.table_exists?(self.class.embedding_class.table_name)

    # TODO: delete via the association once pgvector is available everywhere
    # (without :dependent, that would try to nullify the fk in violation of the constraint
    # but with :dependent, instances without pgvector would try to access the nonexistent table when a page is deleted)
    shard.activate do
      self.class.embedding_class.where(self.class.embedding_foreign_key => self).delete_all
    end
  end
end
|
|
@ -0,0 +1,82 @@
|
|||
# frozen_string_literal: true
|
||||
|
||||
#
|
||||
# Copyright (C) 2024 - present Instructure, Inc.
|
||||
#
|
||||
# This file is part of Canvas.
|
||||
#
|
||||
# Canvas is free software: you can redistribute it and/or modify it under
|
||||
# the terms of the GNU Affero General Public License as published by the Free
|
||||
# Software Foundation, version 3 of the License.
|
||||
#
|
||||
# Canvas is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
|
||||
# A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
|
||||
# details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License along
|
||||
# with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
#
|
||||
|
||||
require_relative "../spec_helper"
|
||||
|
||||
describe SmartSearchable do
  describe "#generate_embeddings" do
    # Reloads @page and returns how many embedding rows it currently has.
    def stored_embedding_count
      @page.reload.embeddings.count
    end

    before do
      # these examples need the embeddings table to exist
      skip "not available" unless ActiveRecord::Base.connection.table_exists?("wiki_page_embeddings")

      # stub the embedding call with a fixed vector of 1536 ones
      allow(SmartSearch).to receive(:generate_embedding).and_return(Array.new(1536, 1))
      expect(SmartSearch).to receive(:api_key).at_least(:once).and_return("fake_api_key")
    end

    before :once do
      course_factory
      @course.enable_feature!(:smart_search)
    end

    it "generates an embedding when creating a page" do
      wiki_page_model(title: "test", body: "foo")
      run_jobs

      expect(stored_embedding_count).to eq(1)
    end

    it "replaces an embedding if it already exists" do
      wiki_page_model(title: "test", body: "foo")
      run_jobs

      @page.update(body: "bar")
      run_jobs

      expect(stored_embedding_count).to eq(1)
    end

    it "strips HTML from the body before indexing" do
      wiki_page_model(title: "test", body: "<ul><li>foo</li></ul>")

      expect(SmartSearch).to receive(:generate_embedding).with("test\n* foo")
      run_jobs
    end

    it "deletes embeddings when a page is deleted (and regenerates them when undeleted)" do
      wiki_page_model(title: "test", body: "foo")
      run_jobs

      @page.destroy
      expect(stored_embedding_count).to eq(0)

      @page.restore
      run_jobs
      expect(stored_embedding_count).to eq(1)
    end

    it "generates multiple embeddings for a page with long content" do
      wiki_page_model(title: "test", body: "foo" * 2000)
      run_jobs

      expect(stored_embedding_count).to eq(2)
    end

    it "generates multiple embeddings and doesn't split words" do
      # 7997 bytes in total, would fit into two 4000-byte pages,
      # but word splitting will push it into 3
      wiki_page_model(title: "test", body: "testing123 " * 727)
      run_jobs

      expect(stored_embedding_count).to eq(3)
    end
  end
end
|
|
@ -1098,63 +1098,4 @@ describe WikiPage do
|
|||
end
|
||||
end
|
||||
end
|
||||
|
||||
describe "#generate_embeddings" do
  # Reloads @page and returns how many embedding rows it currently has.
  def stored_embedding_count
    @page.reload.wiki_page_embeddings.count
  end

  before do
    # these examples need the embeddings table to exist
    skip "not available" unless ActiveRecord::Base.connection.table_exists?("wiki_page_embeddings")

    # stub the embedding call with a fixed vector of 1536 ones
    allow(SmartSearch).to receive(:generate_embedding).and_return(Array.new(1536, 1))
    expect(SmartSearch).to receive(:api_key).at_least(:once).and_return("fake_api_key")
  end

  before :once do
    course_factory
    @course.enable_feature!(:smart_search)
  end

  it "generates an embedding when creating a page" do
    wiki_page_model(title: "test", body: "foo")
    run_jobs

    expect(stored_embedding_count).to eq(1)
  end

  it "replaces an embedding if it already exists" do
    wiki_page_model(title: "test", body: "foo")
    run_jobs

    @page.update(body: "bar")
    run_jobs

    expect(stored_embedding_count).to eq(1)
  end

  it "strips HTML from the body before indexing" do
    wiki_page_model(title: "test", body: "<ul><li>foo</li></ul>")

    expect(SmartSearch).to receive(:generate_embedding).with("test\n* foo")
    run_jobs
  end

  it "deletes embeddings when a page is deleted (and regenerates them when undeleted)" do
    wiki_page_model(title: "test", body: "foo")
    run_jobs

    @page.destroy
    expect(stored_embedding_count).to eq(0)

    @page.restore
    run_jobs
    expect(stored_embedding_count).to eq(1)
  end

  it "generates multiple embeddings for a page with long content" do
    wiki_page_model(title: "test", body: "foo" * 2000)
    run_jobs

    expect(stored_embedding_count).to eq(2)
  end

  it "generates multiple embeddings and doesn't split words" do
    # 7997 bytes in total, would fit into two 4000-byte pages,
    # but word splitting will push it into 3
    wiki_page_model(title: "test", body: "testing123 " * 727)
    run_jobs

    expect(stored_embedding_count).to eq(3)
  end
end
|
||||
end
|
||||
|
|
Loading…
Reference in New Issue