canvas-lms/lib/course_link_validator.rb

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

329 lines
11 KiB
Ruby
Raw Normal View History

# frozen_string_literal: true
#
# Copyright (C) 2014 - present Instructure, Inc.
#
# This file is part of Canvas.
#
# Canvas is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, version 3 of the License.
#
# Canvas is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
# A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <http://www.gnu.org/licenses/>.
require "nokogiri"
class CourseLinkValidator
# Progress tag under which link-validation jobs are created and looked up
TAG = "link_validation"
# Looks up the validation job for a course.
#
# Returns the last Progress row tagged with TAG whose context is the given
# course, or nil when the query yields nothing.
def self.current_progress(course)
  matching = Progress.where(tag: TAG, context_type: "Course", context_id: course.id)
  matching.last
end
# Queues a validation job for the course.
#
# When the course's current job is still pending it is returned untouched.
# Otherwise the existing Progress (or a freshly built one) is reset and handed
# a job that will invoke +process+ on this class; that Progress is returned.
def self.queue_course(course)
  existing = current_progress(course)
  return existing if existing&.pending?

  job = existing || Progress.new(tag: TAG, context: course)
  job.reset!
  job.process_job(self, :process, {})
  job
end
# Job entry point: validates every link in the progress' context course and
# stores the outcome on the progress.
#
# On success the results hold the validator's issues, a completion timestamp
# and the results format version. Any StandardError is reported through
# Canvas::Errors, the progress is marked "failed", and the error report id is
# stored instead.
def self.process(progress)
  validator = new(progress.context)
  validator.check_course(progress)
  progress.set_results({ issues: validator.issues, completed_at: Time.now.utc, version: 2 })
rescue => e
  # bind the exception explicitly rather than reading $ERROR_INFO, which is
  # only populated when the "English" library has been required
  report_id = Canvas::Errors.capture_exception(:course_link_validation, e)[:error_report]
  progress.workflow_state = "failed"
  progress.set_results({ error_report_id: report_id, completed_at: Time.now.utc })
end
# course:       the course whose content is being validated
# domain_regex: matches absolute URLs on the root account's domain (unset when the account has no domain)
# issues:       accumulated list of resources with invalid links
# visited_urls: cache of url => validation result, so each URL is checked once
attr_accessor :course, :domain_regex, :issues, :visited_urls
# Builds a validator for the given course with an empty issue list and an
# empty visited-URL cache.
#
# When the course's root account has a domain, +domain_regex+ is set to match
# absolute URLs on that domain so they can be checked as internal links.
def initialize(course)
  self.course = course
  domain = course.root_account.domain
  # escape the domain so "." (and any other metacharacter) only matches
  # literally; otherwise look-alike hosts would be treated as internal
  self.domain_regex = %r{\w+:?//#{Regexp.escape(domain)}/} if domain
  self.issues = []
  self.visited_urls = {}
end
# ****************************************************************
# this is where the magic happens
#
# Scans each kind of link-bearing content in the course in turn. Every
# resource with at least one invalid link gets an entry appended to +issues+
# (name, type, content_url, invalid_links), and +progress+ is advanced after
# each section.
def check_course(progress)
  # Course card image
  if course.image_url.present?
    find_invalid_link(course.image_url) do |link|
      issues << { name: I18n.t("Course Card Image"),
                  type: :course_card_image,
                  content_url: "/courses/#{course.id}/settings",
                  invalid_links: [link.merge(image: true)] }
    end
    progress.update_completion!(1)
  end

  # Syllabus
  find_invalid_links(course.syllabus_body) do |links|
    issues << { name: I18n.t(:syllabus, "Course Syllabus"),
                type: :syllabus,
                content_url: "/courses/#{course.id}/assignments/syllabus",
                invalid_links: links }
  end
  progress.update_completion!(5)

  # Assessment questions (questions in deleted banks are skipped)
  course.assessment_questions.active.each do |aq|
    next if aq.assessment_question_bank.deleted?

    check_question(aq)
  end
  progress.update_completion!(15)

  # Assignments (those backed by a quiz or a discussion topic are skipped here)
  course.assignments.active.each do |assignment|
    next if assignment.quiz || assignment.discussion_topic

    find_invalid_links(assignment.description) do |links|
      issues << { name: assignment.title,
                  type: :assignment,
                  content_url: "/courses/#{course.id}/assignments/#{assignment.id}",
                  invalid_links: links }
    end
  end
  progress.update_completion!(25)

  # Calendar events
  course.calendar_events.active.each do |event|
    find_invalid_links(event.description) do |links|
      issues << { name: event.title,
                  type: :calendar_event,
                  content_url: "/courses/#{course.id}/calendar_events/#{event.id}",
                  invalid_links: links }
    end
  end
  progress.update_completion!(35)

  # Discussion topics
  course.discussion_topics.active.each do |topic|
    find_invalid_links(topic.message) do |links|
      issues << { name: topic.title,
                  type: :discussion_topic,
                  content_url: "/courses/#{course.id}/discussion_topics/#{topic.id}",
                  invalid_links: links }
    end
  end
  progress.update_completion!(55)

  # External URL Module items (almost forgot about these)
  # broken items are grouped by their module so each module is reported once
  invalid_module_links = {}
  course.context_module_tags.not_deleted.where(content_type: "ExternalUrl").preload(:context_module).each do |ct|
    find_invalid_link(ct.url) do |invalid_link|
      (invalid_module_links[ct.context_module] ||= []) << invalid_link.merge(link_text: ct.title)
    end
  end
  invalid_module_links.each do |mod, links|
    issues << { name: mod.name,
                type: :module,
                content_url: "/courses/#{course.id}/modules#module_#{mod.id}",
                invalid_links: links }
  end
  progress.update_completion!(65)

  # Quizzes (description first, then each active question)
  course.quizzes.active.each do |quiz|
    find_invalid_links(quiz.description) do |links|
      issues << { name: quiz.title,
                  type: :quiz,
                  content_url: "/courses/#{course.id}/quizzes/#{quiz.id}",
                  invalid_links: links }
    end
    quiz.quiz_questions.active.each do |qq|
      check_question(qq)
    end
  end
  progress.update_completion!(85)

  # Wiki pages
  course.wiki_pages.not_deleted.each do |page|
    find_invalid_links(page.body) do |links|
      issues << { name: page.title,
                  type: :wiki_page,
                  content_url: "/courses/#{course.id}/pages/#{page.url}",
                  invalid_links: links }
    end
  end
  progress.update_completion!(99)
end
# Assessment/Quiz Questions
#
# Gathers the invalid links from a question's text, its comment fields and
# each answer's HTML fields. When any are found, a single entry is appended to
# +issues+ whose type and content_url depend on whether the question is an
# AssessmentQuestion or a Quizzes::QuizQuestion.
def check_question(question)
  data = question.question_data
  links = []

  %i[question_text correct_comments_html incorrect_comments_html neutral_comments_html more_comments_html].each do |field|
    find_invalid_links(data[field]) { |found| links.concat(found) }
  end

  answers = data[:answers] || []
  answers.each do |answer|
    %i[html comments_html left_html].each do |field|
      find_invalid_links(answer[field]) { |found| links.concat(found) }
    end
  end

  return if links.empty?

  entry = { name: data[:question_name], invalid_links: links }
  case question
  when AssessmentQuestion
    entry[:type] = :assessment_question
    entry[:content_url] = "/courses/#{course.id}/question_banks/#{question.assessment_question_bank_id}#question_#{question.id}_question_text"
  when Quizzes::QuizQuestion
    entry[:type] = :quiz_question
    entry[:content_url] = "/courses/#{course.id}/quizzes/#{question.quiz_id}/take?preview=1#question_#{question.id}"
  end
  issues << entry
end
# pretty much copied from ImportedHtmlConverter
#
# Parses an HTML fragment (nil is treated as empty) and checks the href, src,
# data and value attributes of every element. Yields the array of invalid-link
# hashes once, and only when at least one invalid link was found. Each hash is
# annotated with the element's text (when present) and an image flag for
# <img> elements.
def find_invalid_links(html)
  links = []
  doc = Nokogiri::HTML5(html || "")
  attrs = %w[href src data value]

  doc.search("*").each do |node|
    attrs.each do |attr|
      url = node[attr]
      next unless url.present?
      # a "value" attribute is only treated as a link when the element's
      # name attribute is "src"
      next if attr == "value" && node["name"] != "src"

      find_invalid_link(url) do |invalid_link|
        link_text = node.text.presence
        invalid_link[:link_text] = link_text if link_text
        invalid_link[:image] = true if node.name == "img"
        links << invalid_link
      end
    end
  end

  yield links if links.any?
end
# yields a hash containing the url and an error type if the url is invalid
#
# mailto: links are ignored. Results are cached in +visited_urls+, so each
# distinct URL is evaluated only once. Relative URLs and URLs matching
# +domain_regex+ are checked against routes and course objects; everything
# else is checked for reachability. Nothing is yielded for a valid URL.
def find_invalid_link(url)
  return if url.start_with?("mailto:")

  status = visited_urls[url]
  unless status
    begin
      internal = ImportedHtmlConverter.relative_url?(url) || (domain_regex && url.match(domain_regex))
      if internal
        if !valid_route?(url)
          status = :unreachable
        elsif (course_ref = url.match(%r{/courses/(\d+)})) && course.id.to_s != course_ref[1]
          # points into a different course than the one being validated
          status = :course_mismatch
        else
          status = check_object_status(url)
        end
      elsif !reachable_url?(url)
        status = :unreachable
      end
    rescue URI::Error
      status = :unparsable
    end
    status ||= :success
    visited_urls[url] = status
  end

  return if status == :success

  yield({ url: url, reason: status })
end
# checks against the Rails routes to see if the url matches anything
#
# A URL is accepted when its normalized path matches any GET route, or when
# it names an existing file under the application's public directory (paths
# containing ".." segments are never accepted as public files).
def valid_route?(url)
  raw_path = URI.parse(url).path
  path = ActionDispatch::Journey::Router::Utils.normalize_path(raw_path)
  @route_set ||= ::Rails.application.routes.set.routes.select { |r| r.verb == "GET" }

  return true if @route_set.any? { |r| r.path.match(path) }
  return false if Pathname(path).each_filename.include?("..")

  Rails.root.join("public", path.delete_prefix("/")).file?
end
# makes sure that links to course objects exist and are in a visible state
#
# Returns nil when the link is fine, otherwise one of :missing_item, :deleted
# or :unpublished_item. The target may be passed in via +object+; otherwise it
# is looked up from the URL. Any error raised along the way is reported as
# :missing_item.
def check_object_status(url, object: nil)
  return :missing_item if !valid_route?(url) || url.include?("/test_error")

  object ||= Context.find_asset_by_url(url)
  unless object
    # with no object found, only index-style URLs (nothing after the
    # collection segment) and the syllabus are accepted
    trailing = url.match(%r{/courses/\d+/\w+/(.+)})&.[](1)
    dangling = ![nil, "syllabus"].include?(trailing) || url.include?("/media_objects_iframe/")
    return dangling ? :missing_item : nil
  end

  return :deleted if object.deleted?

  hidden =
    case object
    when Attachment
      object.locked?
    when Quizzes::Quiz
      %w[created unpublished].include?(object.workflow_state)
    else
      object.workflow_state == "unpublished"
    end
  hidden ? :unpublished_item : nil
rescue
  :missing_item
end
# whitelisted hosts will never be flagged as unavailable
#
# The host list is read once from the "link_validator_whitelisted_hosts"
# setting (comma-separated) and memoized. Returns false when the list is
# empty or the URL cannot be parsed.
def whitelisted?(url)
  @whitelist ||= Setting.get("link_validator_whitelisted_hosts", "").split(",")
  !@whitelist.empty? && @whitelist.include?(URI.parse(url).host)
rescue URI::InvalidURIError
  false
end
# ping the url and make sure it responds acceptably
#
# Issues a HEAD request (falling back to GET when HEAD answers 404 or 405) and
# returns true for any 2xx response as well as 401, 403, 429 and 503.
# Whitelisted hosts are always considered reachable. A redirect whose
# Location matches the "unavailable photo" pattern, or any error raised
# during the request, makes the URL unreachable.
def reachable_url?(url)
  return true if whitelisted?(url)

  @unavailable_photo_redirect_pattern ||= Regexp.new(Setting.get("unavailable_photo_redirect_pattern", "yimg\\.com/.+/photo_unavailable.png$"))
  redirect_proc = lambda do |response|
    # flickr does a redirect to this file when a photo is deleted/not found;
    # treat this as a broken image instead of following the redirect
    #
    # use a lambda-local name here: assigning to `url` would overwrite the
    # method's argument and send the GET fallback below to the wrong address
    location = response["Location"]
    raise "photo unavailable" if location&.match?(@unavailable_photo_redirect_pattern)
  end

  begin
    response = CanvasHttp.head(url, { "Accept-Encoding" => "gzip" }, redirect_limit: 9, redirect_spy: redirect_proc)
    if %w[404 405].include?(response.code)
      # some servers reject or mishandle HEAD; retry once with GET
      response = CanvasHttp.get(url, { "Accept-Encoding" => "gzip" }, redirect_limit: 9, redirect_spy: redirect_proc) do
        # don't read the response body
      end
    end

    case response.code
    when /^2/, "401", "403", "429", "503"
      # we accept unauthorized and forbidden codes here because sometimes servers refuse to serve our requests
      # and someone can link to a site that requires authentication anyway - doesn't necessarily make it invalid
      true
    else
      false
    end
  rescue
    false
  end
end
end