2020-10-27 00:50:13 +08:00
|
|
|
# frozen_string_literal: true
|
|
|
|
|
2017-04-28 04:01:09 +08:00
|
|
|
#
|
|
|
|
# Copyright (C) 2014 - present Instructure, Inc.
|
|
|
|
#
|
|
|
|
# This file is part of Canvas.
|
|
|
|
#
|
|
|
|
# Canvas is free software: you can redistribute it and/or modify it under
|
|
|
|
# the terms of the GNU Affero General Public License as published by the Free
|
|
|
|
# Software Foundation, version 3 of the License.
|
|
|
|
#
|
|
|
|
# Canvas is distributed in the hope that it will be useful, but WITHOUT ANY
|
|
|
|
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
|
|
|
|
# A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
|
|
|
|
# details.
|
|
|
|
#
|
|
|
|
# You should have received a copy of the GNU Affero General Public License along
|
|
|
|
# with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
|
2015-04-09 01:21:08 +08:00
|
|
|
require "nokogiri"
|
|
|
|
|
2014-10-16 03:10:46 +08:00
|
|
|
class CourseLinkValidator
|
|
|
|
TAG = "link_validation"
|
|
|
|
|
|
|
|
# retrieves the validation job
|
|
|
|
# Finds the link-validation Progress record for +course+.
#
# Filters Progress rows by this validator's TAG and by the course as context,
# then returns the last matching row (nil when there is none).
def self.current_progress(course)
  validation_jobs = Progress.where(tag: TAG, context_type: "Course", context_id: course.id)
  validation_jobs.last
end
|
|
|
|
|
|
|
|
# creates a new validation job
|
|
|
|
# Schedules a link-validation run for +course+ and returns its Progress.
#
# A Progress that is still pending is returned untouched so the same course is
# not queued twice. Otherwise the existing Progress (or a freshly built one) is
# reset and handed to +process_job+, which will invoke +process+ on this class.
def self.queue_course(course)
  existing = current_progress(course)
  return existing if existing&.pending?

  job = existing || Progress.new(tag: TAG, context: course)
  job.reset!
  job.process_job(self, :process, {})
  job
end
|
|
|
|
|
|
|
|
# Job entry point: validates every link in +progress.context+ and stores the
# outcome on +progress+.
#
# On success the results hash carries the collected issues, a UTC completion
# time and +version: 2+. If anything raises a StandardError, the error is
# reported through Canvas::Errors, the progress is marked "failed", and the
# results carry the error report id instead.
#
# The exception is bound explicitly (+rescue => e+) rather than read from
# +$ERROR_INFO+, which is only defined when the +English+ library has been
# required elsewhere.
def self.process(progress)
  validator = new(progress.context)
  validator.check_course(progress)
  progress.set_results({ issues: validator.issues, completed_at: Time.now.utc, version: 2 })
rescue => e
  report_id = Canvas::Errors.capture_exception(:course_link_validation, e)[:error_report]
  progress.workflow_state = "failed"
  progress.set_results({ error_report_id: report_id, completed_at: Time.now.utc })
end
|
|
|
|
|
2015-12-05 03:25:51 +08:00
|
|
|
attr_accessor :course, :domain_regex, :issues, :visited_urls
|
2014-10-16 03:10:46 +08:00
|
|
|
|
|
|
|
# Prepares a validator for +course+.
#
# Sets up:
# * +domain_regex+ - matches absolute URLs that point at the course's root
#   account domain; left nil when the account has no domain.
# * +issues+ - the list that the check_* methods append findings to.
# * +visited_urls+ - per-run cache of url => verdict.
def initialize(course)
  self.course = course
  domain = course.root_account.domain
  # Escape the domain so regex metacharacters in it (notably ".") are matched
  # literally; unescaped, "canvas.example.com" would also match look-alike
  # hosts such as "canvasXexample.com".
  self.domain_regex = %r{\w+:?//#{Regexp.escape(domain)}/} if domain
  self.issues = []
  self.visited_urls = {}
end
|
|
|
|
|
|
|
|
# ****************************************************************
|
|
|
|
# this is where the magic happens
|
|
|
|
# Runs the full scan of the course, one content area at a time, appending a
# hash to +issues+ for every piece of content that contains broken links.
#
# Each issue hash has +name+, +type+, +content_url+ and +invalid_links+.
# After each area the completion percentage on +progress+ is bumped
# (1, 5, 15, 25, 35, 55, 65, 85, 99).
def check_course(progress)
  # Course card image
  card_image_url = course.image_url
  if card_image_url.present?
    find_invalid_link(card_image_url) do |broken_link|
      issues << {
        name: I18n.t("Course Card Image"),
        type: :course_card_image,
        content_url: "/courses/#{course.id}/settings",
        invalid_links: [broken_link.merge(image: true)]
      }
    end
    progress.update_completion! 1
  end

  # Syllabus
  find_invalid_links(course.syllabus_body) do |broken_links|
    issues << {
      name: I18n.t(:syllabus, "Course Syllabus"),
      type: :syllabus,
      content_url: "/courses/#{course.id}/assignments/syllabus",
      invalid_links: broken_links
    }
  end
  progress.update_completion! 5

  # Assessment questions (questions in deleted banks are skipped)
  course.assessment_questions.active.each do |bank_question|
    check_question(bank_question) unless bank_question.assessment_question_bank.deleted?
  end
  progress.update_completion! 15

  # Assignments; quiz- and discussion-backed assignments are skipped here
  # because quizzes and discussion topics get their own pass below
  course.assignments.active.each do |assignment|
    unless assignment.quiz || assignment.discussion_topic
      find_invalid_links(assignment.description) do |broken_links|
        issues << {
          name: assignment.title,
          type: :assignment,
          content_url: "/courses/#{course.id}/assignments/#{assignment.id}",
          invalid_links: broken_links
        }
      end
    end
  end
  progress.update_completion! 25

  # Calendar events
  course.calendar_events.active.each do |event|
    find_invalid_links(event.description) do |broken_links|
      issues << {
        name: event.title,
        type: :calendar_event,
        content_url: "/courses/#{course.id}/calendar_events/#{event.id}",
        invalid_links: broken_links
      }
    end
  end
  progress.update_completion! 35

  # Discussion topics
  course.discussion_topics.active.each do |topic|
    find_invalid_links(topic.message) do |broken_links|
      issues << {
        name: topic.title,
        type: :discussion_topic,
        content_url: "/courses/#{course.id}/discussion_topics/#{topic.id}",
        invalid_links: broken_links
      }
    end
  end
  progress.update_completion! 55

  # External URL module items, grouped so each module yields a single issue
  links_by_module = Hash.new { |grouped, mod| grouped[mod] = [] }
  external_url_tags = course.context_module_tags.not_deleted.where(content_type: "ExternalUrl").preload(:context_module)
  external_url_tags.each do |tag|
    find_invalid_link(tag.url) do |broken_link|
      links_by_module[tag.context_module] << broken_link.merge(link_text: tag.title)
    end
  end
  links_by_module.each do |mod, broken_links|
    issues << {
      name: mod.name,
      type: :module,
      content_url: "/courses/#{course.id}/modules#module_#{mod.id}",
      invalid_links: broken_links
    }
  end
  progress.update_completion! 65

  # Quizzes: the description first, then every active question
  course.quizzes.active.each do |quiz|
    find_invalid_links(quiz.description) do |broken_links|
      issues << {
        name: quiz.title,
        type: :quiz,
        content_url: "/courses/#{course.id}/quizzes/#{quiz.id}",
        invalid_links: broken_links
      }
    end
    quiz.quiz_questions.active.each { |quiz_question| check_question(quiz_question) }
  end
  progress.update_completion! 85

  # Wiki pages
  course.wiki_pages.not_deleted.each do |page|
    find_invalid_links(page.body) do |broken_links|
      issues << {
        name: page.title,
        type: :wiki_page,
        content_url: "/courses/#{course.id}/pages/#{page.url}",
        invalid_links: broken_links
      }
    end
  end
  progress.update_completion! 99
end
|
|
|
|
|
|
|
|
# Checks one assessment or quiz question for broken links.
#
# Scans the question-level HTML fields and the HTML fields of every answer in
# +question.question_data+. When at least one broken link is found, a single
# issue hash covering the whole question is appended to +issues+; its +type+
# and +content_url+ depend on whether +question+ is an AssessmentQuestion or a
# Quizzes::QuizQuestion (any other class gets neither key).
def check_question(question)
  question_fields = %i[question_text correct_comments_html incorrect_comments_html neutral_comments_html more_comments_html]
  answer_fields = %i[html comments_html left_html]
  broken_links = []

  question_fields.each do |field|
    find_invalid_links(question.question_data[field]) { |found| broken_links.concat(found) }
  end

  answers = question.question_data[:answers] || []
  answers.each do |answer|
    answer_fields.each do |field|
      find_invalid_links(answer[field]) { |found| broken_links.concat(found) }
    end
  end

  return unless broken_links.any?

  issue = { name: question.question_data[:question_name], invalid_links: broken_links }
  case question
  when AssessmentQuestion
    issue[:type] = :assessment_question
    issue[:content_url] = "/courses/#{course.id}/question_banks/#{question.assessment_question_bank_id}#question_#{question.id}_question_text"
  when Quizzes::QuizQuestion
    issue[:type] = :quiz_question
    issue[:content_url] = "/courses/#{course.id}/quizzes/#{question.quiz_id}/take?preview=1#question_#{question.id}"
  end
  issues << issue
end
|
|
|
|
|
|
|
|
# pretty much copied from ImportedHtmlConverter
|
|
|
|
# Parses +html+ (nil is treated as an empty document) and checks every
# +href+, +src+, +data+ and +value+ attribute on every node.
#
# Each broken link hash produced by +find_invalid_link+ is decorated with
# +:link_text+ (the node's text, when non-blank) and +:image+ (for <img>
# nodes). The block is called once with the full array, and only when at
# least one broken link was found.
def find_invalid_links(html)
  broken_links = []
  document = Nokogiri::HTML5(html || "")

  document.search("*").each do |node|
    %w[href src data value].each do |attr|
      target = node[attr]
      next unless target.present?
      # a "value" attribute is only treated as a link on nodes whose name
      # attribute is "src" (presumably <param name="src" value="..."> embeds)
      next if attr == "value" && node["name"] != "src"

      find_invalid_link(target) do |broken_link|
        text = node.text.presence
        broken_link[:link_text] = text if text
        broken_link[:image] = true if node.name == "img"
        broken_links << broken_link
      end
    end
  end

  yield broken_links if broken_links.any?
end
|
|
|
|
|
|
|
|
# yields a hash containing the url and an error type if the url is invalid
|
|
|
|
# Validates a single +url+ and yields +{ url:, reason: }+ when it is broken.
#
# +mailto:+ links are ignored. Verdicts are cached in +visited_urls+, so each
# distinct url is only checked once per validator. Possible reasons:
# * +:unreachable+     - no matching route (internal) or the remote check failed
# * +:course_mismatch+ - internal link that points at a different course id
# * +:unparsable+      - a URI::Error was raised while checking the url
# * whatever +check_object_status+ reports for internal links
#
# Nothing is yielded when the verdict is +:success+.
def find_invalid_link(url)
  return if url.start_with?("mailto:")

  verdict = visited_urls[url]
  unless verdict
    verdict =
      begin
        if ImportedHtmlConverter.relative_url?(url) || (domain_regex && url.match(domain_regex))
          # link into this Canvas install: verify it against routes and objects
          if !valid_route?(url)
            :unreachable
          elsif (linked_course_id = url[%r{/courses/(\d+)}, 1]) && course.id.to_s != linked_course_id
            :course_mismatch
          else
            check_object_status(url)
          end
        elsif !reachable_url?(url)
          # external link: fall back to an HTTP probe
          :unreachable
        end
      rescue URI::Error
        :unparsable
      end
    verdict ||= :success
    visited_urls[url] = verdict
  end

  return if verdict == :success

  yield({ url: url, reason: verdict })
end
|
|
|
|
|
2016-01-07 23:32:27 +08:00
|
|
|
# checks against the Rails routes to see if the url matches anything
|
|
|
|
# Returns true when the path of +url+ is something this app could serve:
# either it matches one of the application's GET routes, or it names an
# existing file under the +public+ directory.
#
# The list of GET routes is memoized in +@route_set+. URI.parse may raise for
# malformed urls; that is left to the caller to handle.
def valid_route?(url)
  request_path = ActionDispatch::Journey::Router::Utils.normalize_path(URI.parse(url).path)

  @route_set ||= ::Rails.application.routes.set.routes.select { |route| route.verb == "GET" }
  return true if @route_set.any? { |route| route.path.match(request_path) }

  # never look outside public/ via ".." path segments
  return false if Pathname(request_path).each_filename.include?("..")

  Rails.root.join("public", request_path.delete_prefix("/")).file?
end
|
|
|
|
|
2015-12-10 22:47:19 +08:00
|
|
|
# makes sure that links to course objects exist and are in a visible state
|
2019-01-19 05:11:28 +08:00
|
|
|
# Checks that the course object an internal +url+ points at exists and is
# visible.
#
# +object+ may be supplied by the caller; otherwise it is looked up with
# Context.find_asset_by_url. Returns nil when the link is fine, or one of
# +:missing_item+, +:deleted+, +:unpublished_item+. Any StandardError raised
# along the way is reported as +:missing_item+.
def check_object_status(url, object: nil)
  return :missing_item unless valid_route?(url)
  return :missing_item if url.include?("/test_error")

  object ||= Context.find_asset_by_url(url)
  unless object
    # With no object found, only urls with nothing after the
    # /courses/:id/:section/ prefix (or the syllabus) are acceptable.
    trailing_segment = url[%r{/courses/\d+/\w+/(.+)}, 1]
    return :missing_item unless trailing_segment.nil? || trailing_segment == "syllabus"
    return :missing_item if url.include?("/media_objects_iframe/")

    return nil
  end

  return :deleted if object.deleted?

  hidden =
    case object
    when Attachment
      object.locked?
    when Quizzes::Quiz
      object.workflow_state == "created" || object.workflow_state == "unpublished"
    else
      object.workflow_state == "unpublished"
    end
  hidden ? :unpublished_item : nil
rescue
  :missing_item
end
|
|
|
|
|
2019-12-17 00:30:05 +08:00
|
|
|
# whitelisted hosts will never be flagged as unavailable
|
|
|
|
# Returns true when the host of +url+ appears in the
# "link_validator_whitelisted_hosts" setting (a comma-separated host list).
#
# Entries are stripped of surrounding whitespace and compared
# case-insensitively, so a setting such as "Example.com, files.example.org"
# matches "http://example.com/..." and "https://FILES.example.org/...".
# Blank entries are ignored. The parsed list is memoized in +@whitelist+.
#
# Returns false when the list is empty, the url has no host, or the url cannot
# be parsed.
def whitelisted?(url)
  @whitelist ||= Setting.get("link_validator_whitelisted_hosts", "")
                        .split(",")
                        .map { |entry| entry.strip.downcase }
                        .reject(&:empty?)
  return false if @whitelist.empty?

  host = URI.parse(url).host
  return false unless host

  @whitelist.include?(host.downcase)
rescue URI::InvalidURIError
  false
end
|
|
|
|
|
2014-10-16 03:10:46 +08:00
|
|
|
# ping the url and make sure we get a 200
|
|
|
|
# Probes an external +url+ over HTTP and reports whether it looks alive.
#
# Whitelisted hosts are accepted without a request. Otherwise a HEAD request is
# sent (following up to 9 redirects); if the server answers 404 or 405 the
# same url is retried with GET, without reading the body.
#
# Returns true for any 2xx status and for 401/403/429/503, false for every
# other status and for any error raised while making the request.
def reachable_url?(url)
  return true if whitelisted?(url)

  @unavailable_photo_redirect_pattern ||= Regexp.new(Setting.get("unavailable_photo_redirect_pattern", "yimg\\.com/.+/photo_unavailable.png$"))
  redirect_proc = lambda do |response|
    # flickr does a redirect to this file when a photo is deleted/not found;
    # treat this as a broken image instead of following the redirect.
    # A separate local is used for the redirect target so the outer +url+
    # (needed for the GET fallback below) is not overwritten by the closure.
    location = response["Location"]
    raise "photo unavailable" if location&.match?(@unavailable_photo_redirect_pattern)
  end

  begin
    response = CanvasHttp.head(url, { "Accept-Encoding" => "gzip" }, redirect_limit: 9, redirect_spy: redirect_proc)
    if %w[404 405].include?(response.code)
      # some servers reject or mishandle HEAD; retry the original url with GET
      response = CanvasHttp.get(url, { "Accept-Encoding" => "gzip" }, redirect_limit: 9, redirect_spy: redirect_proc) do
        # don't read the response body
      end
    end

    case response.code
    when /\A2/, "401", "403", "429", "503"
      # we accept unauthorized and forbidden codes here because sometimes servers refuse to serve our requests
      # and someone can link to a site that requires authentication anyway - doesn't necessarily make it invalid
      # (429 and 503 are accepted as well - presumably because they signal throttling/temporary unavailability)
      true
    else
      false
    end
  rescue
    # best effort: any failure while probing counts as unreachable
    false
  end
end
|
2015-04-05 10:39:49 +08:00
|
|
|
end
|