strip invalid utf-8 from AR yaml columns serialized as binary by ruby 1.8
fixes CNVS-4174 Ruby 1.8 somtimes inserted invalid utf-8 into these columns using a !binary tag, which 1.9 syck deserializes as ASCII-8BIT encoded. But then it later blows up trying to append to a UTF-8 erb view or json output. This adds a whitelisted set of columns for which we want to not only strip invalid utf-8, but convert ascii encoded strings to utf-8 strings and strip those too. These columns don't contain any binary data, so that's an OK conversion to make. Once Canvas only supports ruby 1.9, we shouldn't get any invalid utf-8 inserted even into serialized columns (YAML in 1.9 raises an exception on serialization), so we won't need to keep this list up to date with newly added columns, and we'll avoid accidentally converting a serialized column that does contain binary data to utf-8. test plan: rather than switching to ruby 1.8 and inserting bad data, it'll be easier to just munge the database directly. create a quiz_questions row with question_data that contains something like: question_name: !binary | oHRleHSg then load the quiz in the UI using ruby 1.9. verify that you see a question name "test" rather than a page error. Change-Id: I8054026a06f28b1b61e05ba187bca8403a07fbb5 Reviewed-on: https://gerrit.instructure.com/18059 Reviewed-by: Paul Hinze <paulh@instructure.com> QA-Review: Brian Palmer <brianp@instructure.com> Tested-by: Jenkins <jenkins@instructure.com>
This commit is contained in:
parent
facf63b34d
commit
b9692005b2
|
@ -110,9 +110,31 @@ else
|
|||
end
|
||||
|
||||
class ActiveRecord::Base
|
||||
# this is basically all potentially affected AR serialized columns that
|
||||
# existed in the DB before Canvas was Ruby 1.9 only. We've verified that
|
||||
# none of these columns should legitimately contain binary data, only text.
|
||||
SERIALIZED_COLUMNS_WITH_POTENTIALLY_INVALID_UTF8 = {
|
||||
'AssessmentQuestion' => %w[question_data],
|
||||
'ContextExternalTool' => %w[settings],
|
||||
'EportfolioEntry' => %w[content],
|
||||
'ErrorReport' => %w[http_env data],
|
||||
'LearningOutcome' => %w[data],
|
||||
'Profile' => %w[data],
|
||||
'Quiz' => %w[quiz_data],
|
||||
'QuizQuestion' => %w[question_data],
|
||||
'QuizSubmission' => %w[quiz_data submission_data],
|
||||
'QuizSubmissionSnapshot' => %w[data],
|
||||
'Rubric' => %w[data],
|
||||
'RubricAssessment' => %w[data],
|
||||
'SisBatch' => %w[processing_errors processing_warnings],
|
||||
'StreamItem' => %w[data],
|
||||
}
|
||||
|
||||
def unserialize_attribute_with_utf8_check(attr_name)
|
||||
value = unserialize_attribute_without_utf8_check(attr_name)
|
||||
TextHelper.recursively_strip_invalid_utf8(value)
|
||||
if SERIALIZED_COLUMNS_WITH_POTENTIALLY_INVALID_UTF8[self.class.name].try(:include?, attr_name.to_s)
|
||||
TextHelper.recursively_strip_invalid_utf8!(value, true)
|
||||
end
|
||||
value
|
||||
end
|
||||
alias_method_chain :unserialize_attribute, :utf8_check
|
||||
|
|
|
@ -439,19 +439,27 @@ def self.date_component(start_date, style=:normal)
|
|||
# make sure we won't get an invalid utf-8 error trying to save the error
|
||||
# report to the db.
|
||||
def self.strip_invalid_utf8(string)
|
||||
return string if string.nil?
|
||||
return string if string.nil?
|
||||
# add four spaces to the end of the string, because iconv with the //IGNORE
|
||||
# option will still fail on incomplete byte sequences at the end of the input
|
||||
Iconv.conv('UTF-8//IGNORE', 'UTF-8', string + ' ')[0...-4]
|
||||
# we force_encoding on the returned string because Iconv.conv returns binary.
|
||||
string = Iconv.conv('UTF-8//IGNORE', 'UTF-8', string + ' ')[0...-4]
|
||||
if string.respond_to?(:force_encoding)
|
||||
string.force_encoding(Encoding::UTF_8)
|
||||
end
|
||||
string
|
||||
end
|
||||
|
||||
def self.recursively_strip_invalid_utf8(object)
|
||||
def self.recursively_strip_invalid_utf8!(object, force_utf8 = false)
|
||||
case object
|
||||
when Hash
|
||||
object.each_value { |o| self.recursively_strip_invalid_utf8(o) }
|
||||
object.each_value { |o| self.recursively_strip_invalid_utf8!(o, force_utf8) }
|
||||
when Array
|
||||
object.each { |o| self.recursively_strip_invalid_utf8(o) }
|
||||
object.each { |o| self.recursively_strip_invalid_utf8!(o, force_utf8) }
|
||||
when String
|
||||
if object.encoding == Encoding::ASCII_8BIT && force_utf8
|
||||
object.force_encoding(Encoding::UTF_8)
|
||||
end
|
||||
if !object.valid_encoding?
|
||||
object.replace(self.strip_invalid_utf8(object))
|
||||
end
|
||||
|
|
|
@ -346,40 +346,73 @@ Ad dolore andouille meatball irure, ham hock tail exercitation minim ribeye sint
|
|||
end
|
||||
end
|
||||
|
||||
it "should recursively strip out invalid utf-8" do
|
||||
pending("ruby 1.9 only") if RUBY_VERSION < "1.9.0"
|
||||
qd = %{
|
||||
describe "YAML invalid UTF8 stripping" do
|
||||
before do
|
||||
pending("ruby 1.9 only") if RUBY_VERSION < "1.9"
|
||||
end
|
||||
|
||||
it "should recursively strip out invalid utf-8" do
|
||||
data = YAML.load(%{
|
||||
---
|
||||
answers:
|
||||
- !map:HashWithIndifferentAccess
|
||||
id: 2
|
||||
text: "t\xEAwo"
|
||||
valid_ascii: !binary |
|
||||
oHRleHSg
|
||||
}.strip)
|
||||
answer = data['answers'][0]['text']
|
||||
answer.valid_encoding?.should be_false
|
||||
TextHelper.recursively_strip_invalid_utf8!(data, true)
|
||||
answer.should == "two"
|
||||
answer.encoding.should == Encoding::UTF_8
|
||||
answer.valid_encoding?.should be_true
|
||||
|
||||
# in some edge cases, Syck will return a string as ASCII-8BIT if it's not valid UTF-8
|
||||
# so we added a force_encoding step to recursively_strip_invalid_utf8!
|
||||
ascii = data['answers'][0]['valid_ascii']
|
||||
ascii.should == 'text'
|
||||
ascii.encoding.should == Encoding::UTF_8
|
||||
end
|
||||
|
||||
it "should strip out invalid utf-8 when deserializing a column" do
|
||||
# non-binary invalid utf-8 can't even be inserted into the db in this environment,
|
||||
# so we only test the !binary case here
|
||||
yaml_blob = %{
|
||||
---
|
||||
answers:
|
||||
- !map:HashWithIndifferentAccess
|
||||
weight: 0
|
||||
id: 1
|
||||
text: one
|
||||
migration_id: QUE_1
|
||||
- !map:HashWithIndifferentAccess
|
||||
weight: 0
|
||||
id: 2
|
||||
html: abêcd.
|
||||
text: "t\xEAwo"
|
||||
valid_ascii: !binary |
|
||||
oHRleHSg
|
||||
migration_id: QUE_2
|
||||
- !map:HashWithIndifferentAccess
|
||||
weight: 100
|
||||
id: 4685
|
||||
text: three
|
||||
migration_id: QUE_0_7_6554E77AEBFE42C7B02DAE225141AB51_A2
|
||||
question_text: What is the answer
|
||||
position: 2
|
||||
}.force_encoding('binary')
|
||||
data = YAML.load(qd)
|
||||
answer = data['answers'][1]['text']
|
||||
answer.valid_encoding?.should be_false
|
||||
TextHelper.recursively_strip_invalid_utf8(data)
|
||||
answer.should == "two"
|
||||
answer.valid_encoding?.should be_true
|
||||
}.force_encoding('binary').strip
|
||||
# now actually insert it into an AR column
|
||||
aq = assessment_question_model
|
||||
AssessmentQuestion.update_all({ :question_data => yaml_blob }, { :id => aq.id })
|
||||
text = aq.reload.question_data['answers'][0]['valid_ascii']
|
||||
text.should == "text"
|
||||
text.encoding.should == Encoding::UTF_8
|
||||
end
|
||||
|
||||
# now check that the method is called on serialized AR columns
|
||||
Account.default.update_attribute(:settings, "test")
|
||||
a = Account.find(Account.default.id)
|
||||
TextHelper.expects(:recursively_strip_invalid_utf8).with({})
|
||||
a.settings # deserialization is lazy, trigger it
|
||||
describe "unserialize_attribute_with_utf8_check" do
|
||||
it "should not strip columns not on the list" do
|
||||
TextHelper.expects(:recursively_strip_invalid_utf8!).never
|
||||
a = Account.find(Account.default.id)
|
||||
a.settings # deserialization is lazy, trigger it
|
||||
end
|
||||
|
||||
it "should strip columns on the list" do
|
||||
TextHelper.unstub(:recursively_strip_invalid_utf8!)
|
||||
aq = assessment_question_model
|
||||
TextHelper.expects(:recursively_strip_invalid_utf8!).with(instance_of(HashWithIndifferentAccess), true)
|
||||
aq = AssessmentQuestion.find(aq)
|
||||
aq.question_data
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
Loading…
Reference in New Issue