strip invalid utf-8 from AR yaml columns serialized as binary by ruby 1.8

fixes CNVS-4174

Ruby 1.8 somtimes inserted invalid utf-8 into these columns using a
!binary tag, which 1.9 syck deserializes as ASCII-8BIT encoded. But then
it later blows up trying to append to a UTF-8 erb view or json output.

This adds a whitelisted set of columns for which we want to not only
strip invalid utf-8, but convert ascii encoded strings to utf-8 strings
and strip those too. These columns don't contain any binary data, so
that's an OK conversion to make.

Once Canvas only supports ruby 1.9, we shouldn't get any invalid utf-8
inserted even into serialized columns (YAML in 1.9 raises an exception
on serialization), so we won't need to keep this list up to date with
newly added columns, and we'll avoid accidentally converting a
serialized column that does contain binary data to utf-8.

test plan: rather than switching to ruby 1.8 and inserting bad data,
it'll be easier to just munge the database directly. create a
quiz_questions row with question_data that contains something like:

   question_name: !binary |
     oHRleHSg

then load the quiz in the UI using ruby 1.9. verify that you see a
question name "test" rather than a page error.

Change-Id: I8054026a06f28b1b61e05ba187bca8403a07fbb5
Reviewed-on: https://gerrit.instructure.com/18059
Reviewed-by: Paul Hinze <paulh@instructure.com>
QA-Review: Brian Palmer <brianp@instructure.com>
Tested-by: Jenkins <jenkins@instructure.com>
This commit is contained in:
Brian Palmer 2013-02-25 12:34:47 -07:00
parent facf63b34d
commit b9692005b2
3 changed files with 95 additions and 32 deletions

View File

@ -110,9 +110,31 @@ else
end
class ActiveRecord::Base
# this is basically all potentially affected AR serialized columns that
# existed in the DB before Canvas was Ruby 1.9 only. We've verified that
# none of these columns should legitimately contain binary data, only text.
SERIALIZED_COLUMNS_WITH_POTENTIALLY_INVALID_UTF8 = {
'AssessmentQuestion' => %w[question_data],
'ContextExternalTool' => %w[settings],
'EportfolioEntry' => %w[content],
'ErrorReport' => %w[http_env data],
'LearningOutcome' => %w[data],
'Profile' => %w[data],
'Quiz' => %w[quiz_data],
'QuizQuestion' => %w[question_data],
'QuizSubmission' => %w[quiz_data submission_data],
'QuizSubmissionSnapshot' => %w[data],
'Rubric' => %w[data],
'RubricAssessment' => %w[data],
'SisBatch' => %w[processing_errors processing_warnings],
'StreamItem' => %w[data],
}
def unserialize_attribute_with_utf8_check(attr_name)
value = unserialize_attribute_without_utf8_check(attr_name)
TextHelper.recursively_strip_invalid_utf8(value)
if SERIALIZED_COLUMNS_WITH_POTENTIALLY_INVALID_UTF8[self.class.name].try(:include?, attr_name.to_s)
TextHelper.recursively_strip_invalid_utf8!(value, true)
end
value
end
alias_method_chain :unserialize_attribute, :utf8_check

View File

@ -439,19 +439,27 @@ def self.date_component(start_date, style=:normal)
# make sure we won't get an invalid utf-8 error trying to save the error
# report to the db.
def self.strip_invalid_utf8(string)
return string if string.nil?
return string if string.nil?
# add four spaces to the end of the string, because iconv with the //IGNORE
# option will still fail on incomplete byte sequences at the end of the input
Iconv.conv('UTF-8//IGNORE', 'UTF-8', string + ' ')[0...-4]
# we force_encoding on the returned string because Iconv.conv returns binary.
string = Iconv.conv('UTF-8//IGNORE', 'UTF-8', string + ' ')[0...-4]
if string.respond_to?(:force_encoding)
string.force_encoding(Encoding::UTF_8)
end
string
end
def self.recursively_strip_invalid_utf8(object)
def self.recursively_strip_invalid_utf8!(object, force_utf8 = false)
case object
when Hash
object.each_value { |o| self.recursively_strip_invalid_utf8(o) }
object.each_value { |o| self.recursively_strip_invalid_utf8!(o, force_utf8) }
when Array
object.each { |o| self.recursively_strip_invalid_utf8(o) }
object.each { |o| self.recursively_strip_invalid_utf8!(o, force_utf8) }
when String
if object.encoding == Encoding::ASCII_8BIT && force_utf8
object.force_encoding(Encoding::UTF_8)
end
if !object.valid_encoding?
object.replace(self.strip_invalid_utf8(object))
end

View File

@ -346,40 +346,73 @@ Ad dolore andouille meatball irure, ham hock tail exercitation minim ribeye sint
end
end
it "should recursively strip out invalid utf-8" do
pending("ruby 1.9 only") if RUBY_VERSION < "1.9.0"
qd = %{
describe "YAML invalid UTF8 stripping" do
before do
pending("ruby 1.9 only") if RUBY_VERSION < "1.9"
end
it "should recursively strip out invalid utf-8" do
data = YAML.load(%{
---
answers:
- !map:HashWithIndifferentAccess
id: 2
text: "t\xEAwo"
valid_ascii: !binary |
oHRleHSg
}.strip)
answer = data['answers'][0]['text']
answer.valid_encoding?.should be_false
TextHelper.recursively_strip_invalid_utf8!(data, true)
answer.should == "two"
answer.encoding.should == Encoding::UTF_8
answer.valid_encoding?.should be_true
# in some edge cases, Syck will return a string as ASCII-8BIT if it's not valid UTF-8
# so we added a force_encoding step to recursively_strip_invalid_utf8!
ascii = data['answers'][0]['valid_ascii']
ascii.should == 'text'
ascii.encoding.should == Encoding::UTF_8
end
it "should strip out invalid utf-8 when deserializing a column" do
# non-binary invalid utf-8 can't even be inserted into the db in this environment,
# so we only test the !binary case here
yaml_blob = %{
---
answers:
- !map:HashWithIndifferentAccess
weight: 0
id: 1
text: one
migration_id: QUE_1
- !map:HashWithIndifferentAccess
weight: 0
id: 2
html: ab&ecirc;cd.
text: "t\xEAwo"
valid_ascii: !binary |
oHRleHSg
migration_id: QUE_2
- !map:HashWithIndifferentAccess
weight: 100
id: 4685
text: three
migration_id: QUE_0_7_6554E77AEBFE42C7B02DAE225141AB51_A2
question_text: What is the answer
position: 2
}.force_encoding('binary')
data = YAML.load(qd)
answer = data['answers'][1]['text']
answer.valid_encoding?.should be_false
TextHelper.recursively_strip_invalid_utf8(data)
answer.should == "two"
answer.valid_encoding?.should be_true
}.force_encoding('binary').strip
# now actually insert it into an AR column
aq = assessment_question_model
AssessmentQuestion.update_all({ :question_data => yaml_blob }, { :id => aq.id })
text = aq.reload.question_data['answers'][0]['valid_ascii']
text.should == "text"
text.encoding.should == Encoding::UTF_8
end
# now check that the method is called on serialized AR columns
Account.default.update_attribute(:settings, "test")
a = Account.find(Account.default.id)
TextHelper.expects(:recursively_strip_invalid_utf8).with({})
a.settings # deserialization is lazy, trigger it
describe "unserialize_attribute_with_utf8_check" do
it "should not strip columns not on the list" do
TextHelper.expects(:recursively_strip_invalid_utf8!).never
a = Account.find(Account.default.id)
a.settings # deserialization is lazy, trigger it
end
it "should strip columns on the list" do
TextHelper.unstub(:recursively_strip_invalid_utf8!)
aq = assessment_question_model
TextHelper.expects(:recursively_strip_invalid_utf8!).with(instance_of(HashWithIndifferentAccess), true)
aq = AssessmentQuestion.find(aq)
aq.question_data
end
end
end
end