bump sanitize to latest

[fsc-max-nodes=12]
[fsc-timeout=60]

 * switch lots of parsing to Nokogumbo to keep things consistent
 * deep CSS sanitization is now built in, and with a proper parser (meaning
   we can drop our code to do it, and adjust some specs to account for things
   that _are_ valid)

lots of changes because gumbo parsing<->serialization cycle is slightly different:
 * better job preserving original whitespace
 * literal non-breaking space characters are converted to &nbsp; entities
 * <p> tags aren't inserted for the heck of it
 * several _other_ entities are unnecessary, and output as literal characters
 * some elements no longer have a closing tag

Change-Id: I7c5e36cbd04b8a05f64c9e0af00868dd6b00f4ce
Reviewed-on: https://gerrit.instructure.com/c/canvas-lms/+/256444
Tested-by: Service Cloud Jenkins <svc.cloudjenkins@instructure.com>
Reviewed-by: Jacob Burroughs <jburroughs@instructure.com>
QA-Review: Cody Cutrer <cody@instructure.com>
Product-Review: Cody Cutrer <cody@instructure.com>
This commit is contained in:
Cody Cutrer 2021-01-11 11:24:13 -07:00
parent 6aed7fbc6e
commit 160ff3c899
205 changed files with 528 additions and 597 deletions

View File

@ -95,6 +95,7 @@ gem 'mini_magick', '4.11.0'
gem 'multi_json', '1.15.0'
gem 'net-ldap', '0.16.3', require: false
gem 'nokogiri', '1.10.10', require: false
gem 'nokogumbo', '2.0.4'
gem 'oauth', '0.5.4', require: false
gem 'oauth2', '1.4.4', require: false
gem 'oj', '3.10.16'
@ -117,7 +118,7 @@ gem 'rubyzip', '2.3.0', require: 'zip'
gem 'safe_yaml', '1.0.5', require: false
gem 'saml2', '3.0.9'
gem 'nokogiri-xmlsec-instructure', '0.9.7', require: false
gem 'sanitize', '2.1.1', require: false
gem 'sanitize', '5.2.3', require: false
gem 'sentry-raven', '2.13.0', require: false
gem 'guardrail', '2.0.1'
gem 'simple_oauth', '0.3.1', require: false

View File

@ -26,7 +26,7 @@ module BrokenLinkHelper
record = Context.find_asset_by_url(from_url)
record ||= Context.get_front_wiki_page_for_course_from_url(from_url)
return false unless record
body = Nokogiri::HTML(Context.asset_body(record))
body = Nokogiri::HTML5(Context.asset_body(record))
anchor = body.css("a[href$='#{request.fullpath}']").text
return false if anchor.blank?

View File

@ -467,7 +467,7 @@ module QuizzesHelper
editable = hash_get(options, :editable)
res = user_content hash_get(question, :question_text)
index = 0
doc = Nokogiri::HTML.fragment(res)
doc = Nokogiri::HTML5.fragment(res)
selects = doc.css(".question_input")
selects.each do |s|
if answer_list && !answer_list.empty?
@ -486,9 +486,9 @@ module QuizzesHelper
else
# If existing answer is one of the options, replace it with a span
if (opt_tag = s.children.css("option[value='#{a}']").first)
s.replace(<<-HTML)
<span>#{opt_tag.content}</span>
HTML
span = doc.fragment("<span />").children.first
span.children = opt_tag.children
s.swap(span)
end
end

View File

@ -199,7 +199,7 @@ class DiscussionTopic::MaterializedView < ActiveRecord::Base
def self.include_mobile_overrides(entries, overrides)
entries.each do |entry|
if entry["message"]
parsed_html = Nokogiri::HTML::DocumentFragment.parse(entry["message"])
parsed_html = Nokogiri::HTML5.fragment(entry["message"])
Api::Html::Content.add_overrides_to_html(parsed_html, overrides)
entry["message"] = parsed_html.to_s
end

View File

@ -88,7 +88,7 @@ module Importers
# replace the entire node with a placeholder
result[:old_value] = node.to_xml
result[:placeholder] = placeholder(result[:old_value])
placeholder_node = Nokogiri::HTML::DocumentFragment.parse(result[:placeholder])
placeholder_node = Nokogiri::HTML5.fragment(result[:placeholder])
node.replace(placeholder_node)
else

View File

@ -80,7 +80,7 @@ module Importers
# see LinkParser for details
rel_path = link[:rel_path]
node = Nokogiri::HTML::DocumentFragment.parse(link[:old_value]).children.first
node = Nokogiri::HTML5.fragment(link[:old_value]).children.first
new_url = resolve_media_comment_data(node, rel_path)
new_url ||= resolve_relative_file_url(rel_path)

View File

@ -14,9 +14,11 @@ Gem::Specification.new do |spec|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
spec.require_paths = ["lib"]
spec.add_dependency "sanitize", "2.1.1"
# 4.6.3 breaks protocol checking on data-url attributes if all data attributes are allowed
spec.add_dependency "sanitize", "~> 5.2", ">= 5.2.3"
spec.add_development_dependency "bundler", "~> 1.5"
spec.add_development_dependency "byebug"
spec.add_development_dependency "rake"
spec.add_development_dependency "rspec", "~> 3.5.0"
end

View File

@ -42,56 +42,6 @@ module CanvasSanitize #:nodoc:
klass.extend(ClassMethods)
end
# modified from sanitize.rb to support mid-value matching
REGEX_STYLE_PROTOCOL = /([A-Za-z0-9\+\-\.\&\;\#\s]*?)(?:\:|&#0*58|&#x0*3a)/i
REGEX_STYLE_METHOD = /([A-Za-z0-9\+\-\.\&\;\#\s]*?)(?:\(|&#0*40|&#x0*28)/i
# used as a sanitize.rb transformer, below
def self.sanitize_style(env)
node = env[:node]
styles = []
style = node['style'] || ""
# taken from https://github.com/flavorjones/loofah/blob/master/lib/loofah/html5/scrub.rb
# the gauntlet
style = '' unless style =~ /\A([-:,\;#%.\(\)\/\sa-zA-Z0-9!]|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*\z/
style = '' unless style =~ /\A\s*([-\w]+\s*:[^\;]*(\;\s*|$))*\z/
config = env[:config]
style.scan(/([-\w]+)\s*:\s*([^;]*)/) do |property, value|
property = property.downcase
valid = (config[:style_properties] || []).include?(property)
valid ||= (config[:style_expressions] || []).any? { |e| property.match(e) }
if valid
styles << [property, clean_style_value(config, value)]
end
end
node['style'] = styles.select { |k, v| v }.map { |k, v| "#{k}: #{v}" }.join('; ') + ";"
end
def self.clean_style_value(config, value)
# checks for any colons anywhere in the string
# to make sure they're preceded by a valid protocol
protocols = config[:protocols]['style']['any']
# no idea what these are called in css, but it's
# a name followed by open-paren
# (i.e. url(...) or expression(...))
methods = config[:style_methods]
if methods
value.to_s.downcase.scan(REGEX_STYLE_METHOD) do |match|
return nil if !methods.include?(match[0].downcase)
end
end
if protocols
value.to_s.downcase.scan(REGEX_STYLE_PROTOCOL) do |match|
return nil if !protocols.include?(match[0].downcase)
end
end
value
end
DEFAULT_PROTOCOLS = ['http', 'https', :relative].freeze
SANITIZE = {
:elements => [
@ -99,7 +49,7 @@ module CanvasSanitize #:nodoc:
'hr', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
'del', 'ins', 'iframe', 'font',
'colgroup', 'dd', 'div', 'dl', 'dt', 'em', 'figure', 'figcaption', 'i', 'img', 'li', 'ol', 'p', 'pre',
'q', 'small', 'source', 'span', 'strike', 'strong', 'sub', 'sup', 'abbr', 'table', 'tbody', 'td',
'q', 'small', 'source', 'span', 'strike', 'strong', 'style', 'sub', 'sup', 'abbr', 'table', 'tbody', 'td',
'tfoot', 'th', 'thead', 'tr', 'u', 'ul', 'object', 'embed', 'param', 'video', 'track', 'audio',
# added to unify tinymce and canvas_sanitize whitelists
'address', 'acronym', 'map', 'area','bdo', 'dfn', 'kbd', 'legend', 'samp', 'tt', 'var', 'big',
@ -339,8 +289,8 @@ module CanvasSanitize #:nodoc:
'none' => { 'href' => DEFAULT_PROTOCOLS }.freeze,
'semantics' => { 'href' => DEFAULT_PROTOCOLS }.freeze,
}.freeze,
:style_methods => ['url'].freeze,
:style_properties => [
css: {
properties: ([
'background', 'border', 'clear', 'color',
'cursor', 'direction', 'display', 'float',
'font', 'height', 'left', 'line-height',
@ -353,20 +303,17 @@ module CanvasSanitize #:nodoc:
'top', 'vertical-align',
'visibility', 'white-space', 'width',
'z-index', 'zoom'
].freeze,
:style_expressions => [
/\Abackground-(?:attachment|color|image|position|repeat)\z/,
/\Abackground-position-(?:x|y)\z/,
/\Aborder-(?:bottom|collapse|color|left|right|spacing|style|top|width)\z/,
/\Aborder-(?:bottom|left|right|top)-(?:color|style|width)\z/,
/\Afont-(?:family|size|stretch|style|variant|weight)\z/,
/\Alist-style-(?:image|position|type)\z/,
/\Amargin-(?:bottom|left|right|top|offset)\z/,
/\Apadding-(?:bottom|left|right|top)\z/
].freeze,
:transformers => lambda { |env|
CanvasSanitize.sanitize_style(env) if env[:node]['style']
Sanitize.clean_node!(env[:node], {:remove_contents => true}) if env[:node_name] == 'style'
] +
%w{attachment color image position repeat}.map { |i| "background-#{i}"} +
%w{x y}.map { |i| "background-position-#{i}" } +
%w{bottom collapse color left right spacing style top width}.map { |i| "border-#{i}" } +
%w{bottom left right top}.map { |i| %w{color style width}.map { |j| "border-#{i}-#{j}" } }.flatten +
%w{family size stretch style variant width}.map { |i| "font-#{i}" } +
%w{image position type}.map { |i| "list-style-#{i}" } +
%w{bottom left right top offset}.map { |i| "margin-#{i}" } +
%w{bottom left right top}.map { |i| "padding-#{i}" }
).to_set.freeze,
protocols: DEFAULT_PROTOCOLS
}
}.freeze

View File

@ -19,6 +19,7 @@
#
require 'spec_helper'
require 'timeout'
describe CanvasSanitize do
describe "#clean" do
@ -56,7 +57,7 @@ describe CanvasSanitize do
it "does not strip track elements" do
cleaned = Sanitize.clean("<track src=\"http://google.com\"></track>", CanvasSanitize::SANITIZE)
expect(cleaned).to eq("<track src=\"http://google.com\"></track>")
expect(cleaned).to eq("<track src=\"http://google.com\">")
end
it "sanitizes javascript protocol in mathml" do
@ -78,4 +79,106 @@ describe CanvasSanitize do
cleaned = Sanitize.clean("<a data-item-href=\"javascript:alert('bad')\">Link</a>", CanvasSanitize::SANITIZE)
expect(cleaned).to eq("<a>Link</a>")
end
it "should sanitize style attributes width invalid url protocols" do
str = "<div style='width: 200px; background: url(httpx://www.google.com) no-repeat left center; height: 10px;'></div>"
res = Sanitize.clean(str, CanvasSanitize::SANITIZE)
expect(res).not_to match(/background/)
expect(res).not_to match(/google/)
expect(res).to match(/width/)
expect(res).to match(/height/)
end
it "handles some tricky urls" do
str = "<div style=\"width: 200px; background:url('java\nscript:alert(1)'); height: 10px;\"></div>"
res = Sanitize.clean(str, CanvasSanitize::SANITIZE)
expect(res).not_to match(/background/)
expect(res).not_to match(/alert/)
expect(res).not_to match(/height/)
expect(res).to match(/width/)
str = "<div style=\"width: 200px; background:url('javascript\n:alert(1)'); height: 10px;\"></div>"
res = Sanitize.clean(str, CanvasSanitize::SANITIZE)
expect(res).not_to match(/background/)
expect(res).not_to match(/alert/)
expect(res).not_to match(/height/)
expect(res).to match(/width/)
str = "<div style=\"width: 200px; background:url('&#106;avascript:alert(5)'); height: 10px;\"></div>"
res = Sanitize.clean(str, CanvasSanitize::SANITIZE)
expect(res).not_to match(/background/)
expect(res).not_to match(/alert/)
expect(res).to match(/height/)
expect(res).to match(/width/)
end
it "should sanitize style attributes with invalid methods" do
str = "<div style=\"width: 200px; background:expression(); height: 10px;\"></div>"
res = Sanitize.clean(str, CanvasSanitize::SANITIZE)
expect(res).not_to match(/background/)
expect(res).not_to match(/\(/)
expect(res).to match(/height/)
expect(res).to match(/width/)
end
it "should allow negative values" do
str = "<div style='margin: -18px;height: 10px;'></div>"
res = Sanitize.clean(str, CanvasSanitize::SANITIZE)
expect(res).to match(/margin/)
expect(res).to match(/height/)
end
it "should remove non-whitelisted css attributes" do
str = "<div style='bacon: 5px; border-left-color: #fff;'></div>"
res = Sanitize.clean(str, CanvasSanitize::SANITIZE)
expect(res).to match(/border-left-color/)
expect(res).not_to match(/bacon/)
end
it "should allow valid css methods with valid css protocols" do
str = %{<div style="width: 200px; background: url(http://www.google.com) no-repeat left center; height: 10px;"></div>}
res = Sanitize.clean(str, CanvasSanitize::SANITIZE)
expect(res).to eq str
end
it "should allow font tags with valid attributes" do
str = %{<font face="Comic Sans MS" color="blue" size="3" bacon="yes">hello</font>}
res = Sanitize.clean(str, CanvasSanitize::SANITIZE)
expect(res).to eq %{<font face="Comic Sans MS" color="blue" size="3">hello</font>}
end
it "should allow valid MathML" do
str = %{<math xmlns="http://www.w3.org/1998/Math/MathML"><mrow><mi>a</mi><mo>+</mo><mi>b</mi></mrow></math>}
res = Sanitize.clean(str, CanvasSanitize::SANITIZE)
expect(res).to eq str
end
it "should strip invalid attributes from MathML" do
str = %{<math xmlns="http://www.w3.org/1998/Math/MathML"><mrow><mi foo="bar">a</mi><mo>+</mo><mi>b</mi></mrow></math>}
res = Sanitize.clean(str, CanvasSanitize::SANITIZE)
expect(res).not_to match(/foo/)
end
it "should remove and not escape contents of style tags" do
str = %{<p><style type="text/css">pleaseignoreme: blahblahblah</style>but not me</p>}
res = Sanitize.clean(str, CanvasSanitize::SANITIZE)
expect(res).to eq "<p>but not me</p>"
end
it "should not be extremely slow with long, weird microsoft styles" do
str = %{<span lang="EN" style="font-family: 'Times New Roman','serif'; color: #17375e; font-size: 12pt; mso-fareast-font-family: 'Times New Roman'; mso-themecolor: text2; mso-themeshade: 191; mso-style-textfill-fill-color: #17375E; mso-style-textfill-fill-themecolor: text2; mso-style-textfill-fill-alpha: 100.0%; mso-ansi-language: EN; mso-style-textfill-fill-colortransforms: lumm=75000"><p></p></span>}
# the above string took over a minute to sanitize as of 8ae4ba8e
Timeout.timeout(1) { Sanitize.clean(str, CanvasSanitize::SANITIZE) }
end
Dir.glob(File.expand_path(File.join(__FILE__, '..', '..', 'fixtures', 'xss', '*.xss'))) do |filename|
name = File.split(filename).last
it "should sanitize xss attempts for #{name}" do
f = File.open(filename)
check = f.readline.strip
str = f.read
res = Sanitize.clean(str, CanvasSanitize::SANITIZE)
expect(res.downcase).not_to match(Regexp.new(check.downcase))
end
end
end

View File

@ -1,2 +1,2 @@
style
binding
<STYLE>BODY{-moz-binding:url("http://ha.ckers.org/xssmoz.xml#xss")}</STYLE>

View File

@ -1,2 +1,2 @@
<style>
javascript
<STYLE>li {list-style-image: url("javascript:alert('XSS')");}</STYLE><UL><LI>XSS

View File

@ -1,2 +1,2 @@
style
javascript|alert
<STYLE>.XSS{background-image:url("javascript:alert('XSS')");}</STYLE><A CLASS=XSS></A>

View File

@ -1,2 +1,2 @@
style|css
alert|javascript
<STYLE type="text/css">BODY{background:url("javascript:alert('XSS')")}</STYLE>

Some files were not shown because too many files have changed in this diff Show More