Add converter to utf-8 for imports
flag=none

Closes SOS-2428

Test plan:
- Upload archive with Users.csv in ISO encoding
- Upload archive with Users.csv in ASCII encoding

Change-Id: I680c0b343c3b0374fd7e207eb1f43e063b1c104a
Reviewed-on: https://gerrit.instructure.com/c/canvas-lms/+/287235
Tested-by: Service Cloud Jenkins <svc.cloudjenkins@instructure.com>
QA-Review: Ievgenii Matkivskiy <ievgenii.matkivskiy@instructure.com>
Product-Review: Ievgenii Matkivskiy <ievgenii.matkivskiy@instructure.com>
Reviewed-by: August Thornton <august@instructure.com>
Reviewed-by: Balazs Komaromi <balazs.komaromi@instructure.com>
Reviewed-by: Tamas Nagy <tamas.nagy@instructure.com>
This commit is contained in:
parent
8055a971d2
commit
b95dafe9b0
|
@ -124,6 +124,7 @@ gem "rack-test", "1.1.0"
|
|||
gem "rake", "13.0.3"
|
||||
gem "rails-observers", "0.1.5"
|
||||
gem "ratom-nokogiri", "0.10.11", require: false
|
||||
gem "rchardet", "1.8.0"
|
||||
gem "redcarpet", "3.5.0", require: false
|
||||
gem "retriable", "1.4.1"
|
||||
gem "ritex", "1.0.1", require: false
|
||||
|
|
|
@ -20,6 +20,9 @@
|
|||
|
||||
require "atom"
|
||||
require "crocodoc"
|
||||
require "fileutils"
|
||||
require "securerandom"
|
||||
require "rchardet"
|
||||
|
||||
# See the uploads controller and views for examples on how to use this model.
|
||||
class Attachment < ActiveRecord::Base
|
||||
|
@ -261,6 +264,67 @@ class Attachment < ActiveRecord::Base
|
|||
READ_FILE_CHUNK_SIZE
|
||||
end
|
||||
|
||||
# Rewrites the file at +file_path+ in place as UTF-8, treating its current
# contents as being encoded in +file_encoding+.
#
# The file is streamed in chunks of +read_file_chunk_size+ bytes through an
# Encoding::Converter into a temporary sibling file, which then replaces the
# original. The original is only replaced once the whole file has converted.
#
# @param file_path [String] path of the file to convert
# @param file_encoding [String] name of the encoding the file is currently in
# @return [Boolean] true when the file was converted and replaced; false when
#   an EncodingError occurred (unknown encoding, no converter available,
#   invalid or truncated byte sequence), in which case the original file is
#   left untouched
def self.convert_attachment_encoding?(file_path, file_encoding)
  # File.extname already includes the leading dot, so it must not be re-added.
  extension = File.extname(file_path)
  converted_name = "#{File.basename(file_path, extension)}_#{SecureRandom.hex}#{extension}"
  converted_file_path = File.join(File.dirname(file_path), converted_name)

  # Build the converter before opening anything so that an unknown encoding
  # cannot leave a file handle or a temporary file behind.
  encoding_converter = Encoding::Converter.new(file_encoding, Encoding::UTF_8)

  File.open(file_path) do |file|
    File.open(converted_file_path, "w:UTF-8") do |converted_file|
      while (chunk = file.read(read_file_chunk_size))
        # #convert treats each chunk as partial input: a multibyte character
        # split across two chunks is buffered inside the converter and emitted
        # with the next chunk, so no manual re-reading is needed.
        converted_chunk = encoding_converter.convert(chunk)
        # Defensive check kept from the original implementation.
        raise EncodingError unless converted_chunk.valid_encoding?

        converted_file.write(converted_chunk)
      end

      # Flush the converter. This raises if the input ended in the middle of a
      # character instead of silently dropping the dangling bytes.
      converted_file.write(encoding_converter.finish)
    end
  end

  # Moving over the original replaces it in one step; unlinking it first would
  # leave a window in which the file does not exist at all.
  FileUtils.mv(converted_file_path, file_path)

  true
rescue EncodingError
  false
ensure
  # No-op after a successful move; removes the partial output on any failure.
  FileUtils.rm_f(converted_file_path) if converted_file_path
end
|
||||
|
||||
# Reads the whole file at +file_path+ as raw bytes and returns whatever
# CharDet.detect reports for that content.
def self.get_file_encoding?(file_path)
  raw_bytes = File.binread(file_path)
  CharDet.detect(raw_bytes)
end
|
||||
|
||||
# Detects the encoding of +file+ (anything responding to +to_path+) and, when
# the detection confidence is above 0.5, converts the file on disk to UTF-8.
#
# Returns true only when the conversion reported success; false when the
# confidence is 0.5 or lower or the conversion did not succeed.
def self.convert_to_utf8?(file)
  absolute_path = File.absolute_path(file.to_path)
  detection = Attachment.get_file_encoding?(absolute_path)

  if detection["confidence"].to_d <= 0.5.to_d
    # Detection too uncertain to risk rewriting the file.
    false
  elsif Attachment.convert_attachment_encoding?(absolute_path, detection["encoding"])
    true
  else
    false
  end
end
|
||||
|
||||
def self.valid_utf8?(file)
|
||||
# validate UTF-8
|
||||
chunk = file.read(read_file_chunk_size)
|
||||
|
|
|
@ -392,7 +392,7 @@ module SIS
|
|||
def process_file(base, file, att)
|
||||
csv = { base: base, file: file, fullpath: File.join(base, file), attachment: att }
|
||||
if File.file?(csv[:fullpath]) && File.extname(csv[:fullpath]).casecmp?(".csv")
|
||||
unless Attachment.valid_utf8?(File.open(csv[:fullpath]))
|
||||
unless Attachment.valid_utf8?(File.open(csv[:fullpath])) || Attachment.convert_to_utf8?(File.open(csv[:fullpath]))
|
||||
SisBatch.add_error(csv, I18n.t("Invalid UTF-8"), sis_batch: @batch, failure: true)
|
||||
return
|
||||
end
|
||||
|
|
|
@ -0,0 +1,9 @@
|
|||
user_id,login_id,first_name,last_name,email,status
|
||||
10945457,10945457,Di’Nasia,Berry,Dinasia.M.Berry@live.mercer.edu,active
|
||||
11010515,11010515,Brandon,De León,Brandon.DeLen@live.mercer.edu,deleted
|
||||
11011848,11011848,André,Aguilar,,deleted
|
||||
11013722,11013722,Caleb,Rippé,Caleb.Robert.Ripp@live.mercer.edu,deleted
|
||||
11015065,11015065,Déja,Daniel,Dja.Alexandra.Daniel@live.mercer.edu,deleted
|
||||
11017595,11017595,Juan Pablo,Nava Gámez,JuanPablo.NavaGmez@live.mercer.edu,deleted
|
||||
11022257,11022257,Zephrée,Scott,Zephre.Rene.Scott@live.mercer.edu,active
|
||||
11023337,11023337,Dan’Aja,Wright,Danaja.Saniya.Wright@live.mercer.edu,active
|
|
|
@ -29,12 +29,13 @@ describe SIS::CSV::ImportRefactored do
|
|||
expect(importer.errors.first.last).to eq "Couldn't find Canvas CSV import headers"
|
||||
end
|
||||
|
||||
it "errors files with invalid UTF-8" do
|
||||
importer = process_csv_data(
|
||||
"xlist_course_id,section_id,status",
|
||||
(+"ABC2119_ccutrer_2012201_xlist,26076.20122\xA0,active").force_encoding("UTF-8")
|
||||
it "does not raise error for UTF-8 encoded texts" do
|
||||
expect do
|
||||
process_csv_data(
|
||||
"abstract_course_id,short_name,long_name,account_id,term_id,status",
|
||||
(+"C001,Hu\u0000m101,Humanities\xA0,A001,T001,active").force_encoding("UTF-8")
|
||||
)
|
||||
expect(importer.errors.first.last).to eq "Invalid UTF-8"
|
||||
end.not_to raise_error
|
||||
end
|
||||
|
||||
it "works with valid UTF-8 when split across bytes" do
|
||||
|
|
|
@ -1873,4 +1873,24 @@ describe SIS::CSV::UserImporter do
|
|||
expect(@shard1_user.email).to eq "shard1@example.com"
|
||||
end
|
||||
end
|
||||
|
||||
# A UTF-8 fixture file must import without raising.
it "parses utf encoding" do
  fixture_path = File.expand_path("../../../fixtures/sis/users_utf.csv", File.dirname(__FILE__))

  expect { process_csv_data_cleanly(File.read(fixture_path)) }.not_to raise_error
end
|
||||
|
||||
# A row carrying raw ISO-8859-1 bytes (\xDD, \xD1) must import without raising,
# and the stored e-mail must contain the corresponding character.
it "parses iso-8859-1 encoding" do
  expect do
    header = "user_id,login_id,password,first_name,last_name,short_name,email,status,building"
    latin1_row = +"P88430,SarimarCruz\xDD.21@icloud.com,,EMILIE,CRU PE\xD1A,,SarimarCruz\xDD.21@icloud.com,active,5015"
    process_csv_data(header, latin1_row)
  end.not_to raise_error

  imported_user = Pseudonym.where(sis_user_id: "P88430").first.user
  expect(imported_user.email).to eql("SarimarCruzÝ.21@icloud.com")
end
|
||||
end
|
||||
|
|
Loading…
Reference in New Issue