gradebook importer files through S3

closes CNVS-27034

stops dumping the file to the delayed job parameter, which was never a
good idea.  This sends it to S3 or file storage and streams it from
there on import. With the current setup, way more memory than necessary
is consumed while the job is running, and this is the way we process
large files in places like SIS anyway so this makes it more consistent.

TEST PLAN:
 1) gradebook imports should still work
 2) gradebook import jobs should not eat memory like popcorn

Change-Id: If5bc1aab2eba01c617d6c37480fe777000d80165
Reviewed-on: https://gerrit.instructure.com/71414
Reviewed-by: Derek Bender <djbender@instructure.com>
Tested-by: Jenkins
QA-Review: KC Naegle <knaegle@instructure.com>
Product-Review: Ethan Vizitei <evizitei@instructure.com>
This commit is contained in:
Ethan Vizitei 2016-02-03 14:38:08 -07:00 committed by Ethan Vizitei
parent 0364c12169
commit db3ba60b11
6 changed files with 245 additions and 60 deletions

View File

@ -55,8 +55,7 @@ class GradebookUploadsController < ApplicationController
def create
if authorized_action(@context, @current_user, :manage_grades)
if params[:gradebook_upload]
attachment = params[:gradebook_upload][:uploaded_data]
@progress = GradebookUpload.queue_from(@context, @current_user, attachment.read)
@progress = GradebookUpload.queue_from(@context, @current_user, gradebook_upload_params)
js_env gradebook_env(@progress)
render :show
else
@ -95,4 +94,8 @@ class GradebookUploadsController < ApplicationController
user_id: @current_user
).first
end
def gradebook_upload_params
strong_params.require(:gradebook_upload).permit(:uploaded_data)
end
end

View File

@ -22,12 +22,15 @@ class GradebookUpload < ActiveRecord::Base
belongs_to :course
belongs_to :user
belongs_to :progress
has_many :attachments, as: :context, dependent: :destroy
serialize :gradebook, JSON
def self.queue_from(course, user, params)
progress = Progress.create!(:context => course, :tag => "gradebook_upload", user: user)
progress.process_job(GradebookImporter, :create_from, {}, course, user, params)
def self.queue_from(course, user, attachment_data)
progress = Progress.create!(context: course, tag: "gradebook_upload", user: user)
gradebook_upload = GradebookUpload.create!(course: course, user: user, progress: progress)
gradebook_upload_attachment = gradebook_upload.attachments.create!(attachment_data)
progress.process_job(GradebookImporter, :create_from, {}, gradebook_upload, user, gradebook_upload_attachment)
progress
end

View File

@ -37,22 +37,42 @@ class GradebookImporter
end
end
attr_reader :context, :contents, :assignments, :students, :submissions, :missing_assignments, :missing_students
attr_reader :context, :contents, :attachment, :assignments, :students,
:submissions, :missing_assignments, :missing_students, :upload
def self.create_from(progress, course, user, attachment)
uploaded_gradebook = new(course, attachment, user, progress)
uploaded_gradebook.parse!
# TODO: reduce these to "gradebook_upload" and "attachment" once job queue
# is clear of old enqueued jobs
def self.create_from(progress, gradebook_upload_or_course, user, attachment_or_contents)
self.new(gradebook_upload_or_course, attachment_or_contents, user, progress).parse!
end
def initialize(context=nil, csv=nil, user=nil, progress=nil)
raise ArgumentError, "Must provide a valid context for this gradebook." unless valid_context?(context)
raise ArgumentError, "Must provide CSV contents." unless csv
@context = context
def initialize(context=nil, csv_contents_or_attachment=nil, user=nil, progress=nil)
# TODO: change the parameter to "gradebook_upload", and remove these conditionals
# once we know the S3 pipeline works and old jobs are cleared
if context.is_a?(GradebookUpload)
@upload = context
@context = @upload.course
else
@context = context
end
raise ArgumentError, "Must provide a valid context for this gradebook." unless valid_context?(@context)
# TODO: Change this error to "Must provide attachment id when we're sure S3 pipeline works"
raise ArgumentError, "Must provide CSV contents or attachment." unless csv_contents_or_attachment
@user = user
@contents = csv
# TODO: change the parameter to "attachment", and remove these conditionals
# once we know the S3 pipeline works and old jobs are cleared
if csv_contents_or_attachment.is_a?(Attachment)
@attachment = csv_contents_or_attachment
else
@contents = csv_contents_or_attachment
end
@progress = progress
@upload = GradebookUpload.new course: @context, user: @user, progress: @progress
# TODO: remove this line entirely
# once we know the S3 pipeline works and old jobs are cleared
@upload ||= GradebookUpload.new course: @context, user: @user, progress: @progress
if @context.feature_enabled?(:differentiated_assignments)
@visible_assignments = AssignmentStudentVisibility.visible_assignment_ids_in_course_by_user(
@ -80,24 +100,19 @@ class GradebookImporter
.select(['users.id', :name, :sortable_name])
.index_by(&:id)
csv = CSV.parse(contents, :converters => :nil)
@assignments = process_header(csv)
@assignments = nil
@root_accounts = {}
@pseudonyms_by_sis_id = {}
@pseudonyms_by_login_id = {}
@students = []
@pp_row = []
csv.each do |row|
if row[0] =~ /Points Possible/
row.shift(@student_columns)
process_pp(row)
next
csv_stream do |row|
already_processed = check_for_non_student_row(row)
unless already_processed
@students << process_student(row)
process_submissions(row, @students.last)
end
next if row.compact.all? { |c| c.strip =~ /^(Muted|)$/i }
@students << process_student(row)
process_submissions(row, @students.last)
end
@assignments_outside_current_periods = []
@ -202,8 +217,7 @@ class GradebookImporter
end
end
def process_header(csv)
row = csv.shift
def process_header(row)
raise "Couldn't find header row" unless header?(row)
row = strip_non_assignment_columns(row)
@ -357,6 +371,53 @@ class GradebookImporter
end
protected
CSV_PARSE_OPTIONS = {
converters: :nil,
skip_lines: /^[, ]*$/
}.freeze
def check_for_non_student_row(row)
# check if this is the first row, a header row
if @assignments.nil?
@assignments = process_header(row)
return true
end
if row[0] =~ /Points Possible/
# this row is describing the assignment, has no student data
row.shift(@student_columns)
process_pp(row)
return true
end
if row.compact.all? { |c| c.strip =~ /^(Muted|)$/i }
# this row is muted or empty and should not be processed at all
return true
end
false # nothing unusual, signal to process as a student row
end
def csv_stream
if contents # TODO: remove this branch entirely when S3 is proved to work
CSV.parse(contents, CSV_PARSE_OPTIONS) do |row|
yield row
end
elsif attachment
csv_file = attachment.open(need_local_file: true)
# using "foreach" rather than "parse" processes a chunk of the
# file at a time rather than loading the whole file into memory
# at once, a boon for memory consumption
CSV.foreach(csv_file.path, CSV_PARSE_OPTIONS) do |row|
yield row
end
else
raise "We have no gradebook input to iterate over..."
end
end
def add_root_account_to_pseudonym_cache(root_account)
pseudonyms = root_account.shard.activate do
root_account.pseudonyms

View File

@ -32,17 +32,21 @@ describe GradebookUploadsController do
@course.reload
end
def generate_file(include_sis_id)
def generate_file(include_sis_id=false)
file = Tempfile.new("csv.csv")
file.puts(GradebookExporter.new(@course, @teacher, :include_sis_id => include_sis_id).to_csv)
file.close
file
end
def upload_gradebook_import(course, file)
data = Rack::Test::UploadedFile.new(file.path, 'text/csv', true)
post 'create', course_id: course.id, gradebook_upload: {uploaded_data: data}
end
def check_create_response(include_sis_id=false)
file = generate_file(include_sis_id)
data = Rack::Test::UploadedFile.new(file.path, 'text/csv', true)
post 'create', :course_id => @course.id, :gradebook_upload => {:uploaded_data => data}
upload_gradebook_import(@course, file)
expect(response).to be_success
end
@ -77,23 +81,34 @@ describe GradebookUploadsController do
assert_unauthorized
end
it "should accept a valid csv upload" do
user_session(@teacher)
check_create_response
end
context "with authorized teacher" do
before(:each) { user_session(@teacher) }
it "should accept a valid csv upload with a final grade column" do
user_session(@teacher)
@course.grading_standard_id = 0
@course.save!
check_create_response
end
it "should accept a valid csv upload" do
check_create_response
end
it "should accept a valid csv upload with sis id columns" do
user_session(@teacher)
@course.grading_standard_id = 0
@course.save!
check_create_response(true)
it "puts the uploaded data into a durable attachment so it's recoverable" do
gradebook_import_file = generate_file
upload_gradebook_import(@course, gradebook_import_file)
attachment = Attachment.last
expect(gradebook_import_file.path).to include(attachment.filename)
end
context "and final grade column" do
before(:each) do
@course.grading_standard_id = 0
@course.save!
end
it "should accept a valid csv upload with a final grade column" do
check_create_response
end
it "should accept a valid csv upload with sis id columns" do
check_create_response(true)
end
end
end
end

View File

@ -21,34 +21,35 @@ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper.rb')
require 'csv'
describe GradebookImporter do
let(:gradebook_user){ user_model }
context "construction" do
let!(:gradebook_course){ course_model }
it "should require a context, usually a course" do
course = course_model
user = user_model
progress = Progress.create!(tag: "test", context: @user)
expect{GradebookImporter.new(1)}.to raise_error(ArgumentError, "Must provide a valid context for this gradebook.")
expect{GradebookImporter.new(course, valid_gradebook_contents, user, progress)}.not_to raise_error
expect{GradebookImporter.new(gradebook_course, valid_gradebook_contents, user, progress)}.not_to raise_error
end
it "should store the context and make it available" do
course_model
new_gradebook_importer
expect(@gi.context).to be_is_a(Course)
end
it "should require the contents of an upload" do
expect{GradebookImporter.new(course_model)}.to raise_error(ArgumentError, "Must provide CSV contents.")
expect{ GradebookImporter.new(gradebook_course) }.
to raise_error(ArgumentError, "Must provide CSV contents or attachment.")
end
it "should store the contents and make them available" do
course_model
new_gradebook_importer
expect(@gi.contents).not_to be_nil
end
it "should handle points possible being sorted in weird places" do
course_model
importer_with_rows(
'Student,ID,Section,Assignment 1,Final Score',
'"Blend, Bill",6,My Course,-,',
@ -60,7 +61,6 @@ describe GradebookImporter do
end
it "should handle muted line and being sorted in weird places" do
course_model
importer_with_rows(
'Student,ID,Section,Assignment 1,Final Score',
'"Blend, Bill",6,My Course,-,',
@ -73,10 +73,47 @@ describe GradebookImporter do
end
it "creates a GradebookUpload" do
course_model
new_gradebook_importer
expect(GradebookUpload.where(course_id: @course, user_id: @user)).not_to be_empty
end
context "when attachment and gradebook_upload is provided" do
let(:attachment) do
a = attachment_model
file = Tempfile.new("gradebook.csv")
file.puts("'Student,ID,Section,Assignment 1,Final Score'\n")
file.puts("\"Blend, Bill\",6,My Course,-,\n")
file.close
a.stubs(:open).returns(file)
return a
end
let(:progress){ Progress.create!(tag: "test", context: gradebook_user) }
let(:upload) do
GradebookUpload.create!(course: gradebook_course, user: gradebook_user, progress: progress)
end
let(:importer) { new_gradebook_importer(attachment, upload, gradebook_user, progress) }
it "hangs onto the provided model for streaming" do
expect(importer.attachment).to eq(attachment)
end
it "nils out contents when using an attachment (saves on memory to not parse all at once)" do
expect(importer.contents).to be_nil
end
it "keeps the provided upload rather than creating a new one" do
expect(importer.upload).to eq(upload)
end
it "sets the uploads course as the importer context" do
expect(importer.context).to eq(gradebook_course)
end
end
end
context "User lookup" do
@ -593,11 +630,11 @@ CSV
end
end
def new_gradebook_importer(contents = valid_gradebook_contents)
@user = user_model
@progress = Progress.create!(tag: "test", context: @user)
def new_gradebook_importer(contents = valid_gradebook_contents, context = @course, user = gradebook_user, progress = nil)
@user = user
@progress = progress || Progress.create!(tag: "test", context: @user)
@gi = GradebookImporter.new(@course, contents, @user, @progress)
@gi = GradebookImporter.new(context, contents, @user, @progress)
@gi.parse!
@gi
end
@ -611,5 +648,12 @@ def valid_gradebook_contents_with_sis_login_id
end
def importer_with_rows(*rows)
new_gradebook_importer(rows.join("\n"))
a = attachment_model
file = Tempfile.new("gradebook_import.csv")
rows.each do |row|
file.puts(row)
end
file.close
a.stubs(:open).returns(file)
new_gradebook_importer(a)
end

View File

@ -0,0 +1,59 @@
#
# Copyright (C) 2016 Instructure, Inc.
#
# This file is part of Canvas.
#
# Canvas is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, version 3 of the License.
#
# Canvas is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
# A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <http://www.gnu.org/licenses/>.
#
require_relative '../spec_helper'
describe GradebookUpload do
describe ".queue_from" do
let(:teacher_enrollment){ teacher_in_course }
let(:teacher){ teacher_enrollment.user }
let(:gradebook_course){ teacher_enrollment.course }
let(:attachment_data){ {dummy: "data"} }
before(:each) do
# actual attachment integration covered in gradebook_uploads_controller_spec;
# that means in the spec the dummy hash will be enqueued instead of a real attachment
# object
GradebookUpload.any_instance.stubs(attachments: stub(create!: attachment_data))
end
it "builds a progress object to track the import" do
progress = GradebookUpload.queue_from(gradebook_course, teacher, attachment_data)
expect(progress.workflow_state).to eq("queued")
expect(progress.tag).to eq("gradebook_upload")
expect(progress.context).to eq(gradebook_course)
end
it "queues a job to run the import" do
GradebookUpload.queue_from(gradebook_course, teacher, attachment_data)
job = Delayed::Job.last
expect(job.tag).to match(/GradebookImporter/)
end
it "stores the input data in an attachment on the gradebook upload" do
GradebookUpload.queue_from(gradebook_course, teacher, attachment_data)
job = Delayed::Job.last
expect(job.handler).to include("dummy: data")
end
it "creates a GradebookUpload object to represent the upload process" do
GradebookUpload.queue_from(gradebook_course, teacher, attachment_data)
upload = GradebookUpload.last
expect(upload.course).to eq(gradebook_course)
end
end
end