Add converter to utf-8 for imports

flag=none
Closes SOS-2428

Test plan:
 - Upload archive with Users.csv in ISO encoding
 - Upload archive with Users.csv in ASCII encoding

Change-Id: I680c0b343c3b0374fd7e207eb1f43e063b1c104a
Reviewed-on: https://gerrit.instructure.com/c/canvas-lms/+/287235
Tested-by: Service Cloud Jenkins <svc.cloudjenkins@instructure.com>
QA-Review: Ievgenii Matkivskiy <ievgenii.matkivskiy@instructure.com>
Product-Review: Ievgenii Matkivskiy <ievgenii.matkivskiy@instructure.com>
Reviewed-by: August Thornton <august@instructure.com>
Reviewed-by: Balazs Komaromi <balazs.komaromi@instructure.com>
Reviewed-by: Tamas Nagy <tamas.nagy@instructure.com>
This commit is contained in:
Ievgenii Matkivskyi 2022-03-16 20:55:50 +02:00 committed by Ievgenii Matkivskiy
parent 8055a971d2
commit b95dafe9b0
6 changed files with 102 additions and 7 deletions

View File

@ -124,6 +124,7 @@ gem "rack-test", "1.1.0"
gem "rake", "13.0.3"
gem "rails-observers", "0.1.5"
gem "ratom-nokogiri", "0.10.11", require: false
gem "rchardet", "1.8.0"
gem "redcarpet", "3.5.0", require: false
gem "retriable", "1.4.1"
gem "ritex", "1.0.1", require: false

View File

@ -20,6 +20,9 @@
require "atom"
require "crocodoc"
require "fileutils"
require "securerandom"
require "rchardet"
# See the uploads controller and views for examples on how to use this model.
class Attachment < ActiveRecord::Base
@ -261,6 +264,67 @@ class Attachment < ActiveRecord::Base
READ_FILE_CHUNK_SIZE
end
# Transcodes the file at +file_path+ from +file_encoding+ to UTF-8 and
# replaces the original file with the converted copy.
#
# The converted text is first written to a sibling temporary file (same
# directory, random suffix) and is only moved over +file_path+ once the whole
# input has converted cleanly, so a failed conversion leaves the original
# file untouched and no temporary file behind.
#
# @param file_path [String] path of the file to rewrite in place
# @param file_encoding [String, Encoding] encoding the file is currently in
# @return [Boolean] true when the file was replaced by its UTF-8 version,
#   false when Ruby has no converter for +file_encoding+ or the content
#   cannot be converted (any EncodingError). Other errors, such as
#   filesystem failures, are not rescued and propagate to the caller.
def self.convert_attachment_encoding?(file_path, file_encoding)
  # Build the converter before touching the filesystem: an unknown encoding
  # name raises Encoding::ConverterNotFoundError (an EncodingError), and at
  # this point there is nothing to close or delete.
  encoding_converter = Encoding::Converter.new(file_encoding, Encoding::UTF_8)

  # File.extname already includes the leading dot, so it is appended as-is.
  extension = File.extname(file_path)
  converted_name = "#{File.basename(file_path, extension)}_#{SecureRandom.hex}#{extension}"
  converted_file_path = File.join(File.dirname(file_path), converted_name)

  begin
    # Block forms close both handles even when an exception escapes.
    File.open(file_path, "rb") do |source|
      File.open(converted_file_path, "w:UTF-8") do |target|
        # Encoding::Converter#convert treats each call as partial input and
        # keeps an incomplete trailing byte sequence buffered until the next
        # call, so a character split across two chunks needs no special
        # handling here.
        while (chunk = source.read(read_file_chunk_size))
          converted_chunk = encoding_converter.convert(chunk)
          raise EncodingError unless converted_chunk.valid_encoding?

          target.write(converted_chunk)
        end
        # finish flushes any pending output and raises if the input ended in
        # the middle of a multi-byte sequence, instead of silently dropping it.
        target.write(encoding_converter.finish)
      end
    end
    # Moving onto an existing regular file overwrites it; unlinking first
    # would leave a window in which file_path does not exist.
    FileUtils.mv(converted_file_path, file_path)
  ensure
    # After a successful move the temporary path is gone and this is a no-op;
    # after any failure it removes the partially written copy.
    FileUtils.rm_f(converted_file_path)
  end
  true
rescue EncodingError
  false
end
# Reads the whole file at +file_path+ as raw bytes and hands them to
# CharDet.detect, returning whatever the detector returns.
#
# @param file_path [String] path of the file to inspect
# @return [Object] the result of CharDet.detect for the file's bytes
def self.get_file_encoding?(file_path)
  raw_bytes = File.binread(file_path)
  CharDet.detect(raw_bytes)
end
# Detects the encoding of +file+ and, when the detector's confidence is above
# 0.5, asks Attachment.convert_attachment_encoding? to rewrite it as UTF-8.
#
# @param file [#to_path] file object whose path is resolved to an absolute path
# @return [Boolean] true only when the confidence exceeded 0.5 and the
#   conversion call returned a truthy value; false otherwise
def self.convert_to_utf8?(file)
  absolute_path = File.absolute_path(file.to_path)
  detection = Attachment.get_file_encoding?(absolute_path)

  if detection["confidence"].to_d <= 0.5.to_d
    false
  elsif Attachment.convert_attachment_encoding?(absolute_path, detection["encoding"])
    true
  else
    false
  end
end
def self.valid_utf8?(file)
# validate UTF-8
chunk = file.read(read_file_chunk_size)

View File

@ -392,7 +392,7 @@ module SIS
def process_file(base, file, att)
csv = { base: base, file: file, fullpath: File.join(base, file), attachment: att }
if File.file?(csv[:fullpath]) && File.extname(csv[:fullpath]).casecmp?(".csv")
unless Attachment.valid_utf8?(File.open(csv[:fullpath]))
unless Attachment.valid_utf8?(File.open(csv[:fullpath])) || Attachment.convert_to_utf8?(File.open(csv[:fullpath]))
SisBatch.add_error(csv, I18n.t("Invalid UTF-8"), sis_batch: @batch, failure: true)
return
end

9
spec/fixtures/sis/users_utf.csv vendored Normal file
View File

@ -0,0 +1,9 @@
user_id,login_id,first_name,last_name,email,status
10945457,10945457,DiNasia,Berry,Dinasia.M.Berry@live.mercer.edu,active
11010515,11010515,Brandon,De León,Brandon.DeLen@live.mercer.edu,deleted
11011848,11011848,André,Aguilar,,deleted
11013722,11013722,Caleb,Rippé,Caleb.Robert.Ripp@live.mercer.edu,deleted
11015065,11015065,Déja,Daniel,Dja.Alexandra.Daniel@live.mercer.edu,deleted
11017595,11017595,Juan Pablo,Nava Gámez,JuanPablo.NavaGmez@live.mercer.edu,deleted
11022257,11022257,Zephrée,Scott,Zephre.Rene.Scott@live.mercer.edu,active
11023337,11023337,DanAja,Wright,Danaja.Saniya.Wright@live.mercer.edu,active
1 user_id login_id first_name last_name email status
2 10945457 10945457 Di’Nasia Berry Dinasia.M.Berry@live.mercer.edu active
3 11010515 11010515 Brandon De León Brandon.DeLen@live.mercer.edu deleted
4 11011848 11011848 André Aguilar deleted
5 11013722 11013722 Caleb Rippé Caleb.Robert.Ripp@live.mercer.edu deleted
6 11015065 11015065 Déja Daniel Dja.Alexandra.Daniel@live.mercer.edu deleted
7 11017595 11017595 Juan Pablo Nava Gámez JuanPablo.NavaGmez@live.mercer.edu deleted
8 11022257 11022257 Zephrée Scott Zephre.Rene.Scott@live.mercer.edu active
9 11023337 11023337 Dan’Aja Wright Danaja.Saniya.Wright@live.mercer.edu active

View File

@ -29,12 +29,13 @@ describe SIS::CSV::ImportRefactored do
expect(importer.errors.first.last).to eq "Couldn't find Canvas CSV import headers"
end
it "errors files with invalid UTF-8" do
importer = process_csv_data(
"xlist_course_id,section_id,status",
(+"ABC2119_ccutrer_2012201_xlist,26076.20122\xA0,active").force_encoding("UTF-8")
it "does not raise error for UTF-8 encoded texts" do
expect do
process_csv_data(
"abstract_course_id,short_name,long_name,account_id,term_id,status",
(+"C001,Hu\u0000m101,Humanities\xA0,A001,T001,active").force_encoding("UTF-8")
)
expect(importer.errors.first.last).to eq "Invalid UTF-8"
end.not_to raise_error
end
it "works with valid UTF-8 when split across bytes" do

View File

@ -1873,4 +1873,24 @@ describe SIS::CSV::UserImporter do
expect(@shard1_user.email).to eq "shard1@example.com"
end
end
# Feeds the users_utf.csv fixture through the importer and expects the whole
# read-and-import step to complete without raising.
it "parses utf encoding" do
  expect do
    fixture_path = File.expand_path("../../../fixtures/sis/users_utf.csv", File.dirname(__FILE__))
    process_csv_data_cleanly(File.read(fixture_path))
  end.not_to raise_error
end
# Imports a user row containing raw \xDD / \xD1 bytes, expects the import not
# to raise, and checks the stored email carries the converted character.
it "parses iso-8859-1 encoding" do
  header_row = "user_id,login_id,password,first_name,last_name,short_name,email,status,building"
  user_row = +"P88430,SarimarCruz\xDD.21@icloud.com,,EMILIE,CRU PE\xD1A,,SarimarCruz\xDD.21@icloud.com,active,5015"

  expect { process_csv_data(header_row, user_row) }.not_to raise_error

  imported_user = Pseudonym.where(sis_user_id: "P88430").first.user
  expect(imported_user.email).to eql("SarimarCruzÝ.21@icloud.com")
end
end