From db3ba60b11c353c04275372bc7713b2a3b732510 Mon Sep 17 00:00:00 2001 From: Ethan Vizitei Date: Wed, 3 Feb 2016 14:38:08 -0700 Subject: [PATCH] gradebook importer files through S3 closes CNVS-27034 stops dumping the file to the delayed job parameter, which was never a good idea. This sends it to S3 or file storage and streams it from there on import. With the current setup, way more memory than necessary is consumed while the job is running, and this is the way we process large files in places like SIS anyway so this makes it more consistent. TEST PLAN: 1) gradebook imports should still work 2) gradebook import jobs should not eat memory like popcorn Change-Id: If5bc1aab2eba01c617d6c37480fe777000d80165 Reviewed-on: https://gerrit.instructure.com/71414 Reviewed-by: Derek Bender Tested-by: Jenkins QA-Review: KC Naegle Product-Review: Ethan Vizitei --- .../gradebook_uploads_controller.rb | 7 +- app/models/gradebook_upload.rb | 9 +- lib/gradebook_importer.rb | 109 ++++++++++++++---- .../gradebook_uploads_controller_spec.rb | 51 +++++--- spec/lib/gradebook_importer_spec.rb | 70 ++++++++--- spec/models/gradebook_upload_spec.rb | 59 ++++++++++ 6 files changed, 245 insertions(+), 60 deletions(-) create mode 100644 spec/models/gradebook_upload_spec.rb diff --git a/app/controllers/gradebook_uploads_controller.rb b/app/controllers/gradebook_uploads_controller.rb index a526b4af811..408fb8e3a82 100644 --- a/app/controllers/gradebook_uploads_controller.rb +++ b/app/controllers/gradebook_uploads_controller.rb @@ -55,8 +55,7 @@ class GradebookUploadsController < ApplicationController def create if authorized_action(@context, @current_user, :manage_grades) if params[:gradebook_upload] - attachment = params[:gradebook_upload][:uploaded_data] - @progress = GradebookUpload.queue_from(@context, @current_user, attachment.read) + @progress = GradebookUpload.queue_from(@context, @current_user, gradebook_upload_params) js_env gradebook_env(@progress) render :show else @@ -95,4 +94,8 @@ class GradebookUploadsController < ApplicationController user_id: @current_user ).first end + + def gradebook_upload_params + strong_params.require(:gradebook_upload).permit(:uploaded_data) + end end diff --git a/app/models/gradebook_upload.rb b/app/models/gradebook_upload.rb index 0f7efe61717..997a7e2b3d7 100644 --- a/app/models/gradebook_upload.rb +++ b/app/models/gradebook_upload.rb @@ -22,12 +22,15 @@ class GradebookUpload < ActiveRecord::Base belongs_to :course belongs_to :user belongs_to :progress + has_many :attachments, as: :context, dependent: :destroy serialize :gradebook, JSON - def self.queue_from(course, user, params) - progress = Progress.create!(:context => course, :tag => "gradebook_upload", user: user) - progress.process_job(GradebookImporter, :create_from, {}, course, user, params) + def self.queue_from(course, user, attachment_data) + progress = Progress.create!(context: course, tag: "gradebook_upload", user: user) + gradebook_upload = GradebookUpload.create!(course: course, user: user, progress: progress) + gradebook_upload_attachment = gradebook_upload.attachments.create!(attachment_data) + progress.process_job(GradebookImporter, :create_from, {}, gradebook_upload, user, gradebook_upload_attachment) progress end diff --git a/lib/gradebook_importer.rb b/lib/gradebook_importer.rb index a86a312070d..722c8dc4597 100644 --- a/lib/gradebook_importer.rb +++ b/lib/gradebook_importer.rb @@ -37,22 +37,42 @@ class GradebookImporter end end - attr_reader :context, :contents, :assignments, :students, :submissions, :missing_assignments, :missing_students + attr_reader :context, :contents, :attachment, :assignments, :students, + :submissions, :missing_assignments, :missing_students, :upload - def self.create_from(progress, course, user, attachment) - uploaded_gradebook = new(course, attachment, user, progress) - uploaded_gradebook.parse! + # TODO: reduce these to "gradebook_upload" and "attachment" once job queue + # is clear of old enqueued jobs + def self.create_from(progress, gradebook_upload_or_course, user, attachment_or_contents) + self.new(gradebook_upload_or_course, attachment_or_contents, user, progress).parse! end - def initialize(context=nil, csv=nil, user=nil, progress=nil) - raise ArgumentError, "Must provide a valid context for this gradebook." unless valid_context?(context) - raise ArgumentError, "Must provide CSV contents." unless csv - @context = context + def initialize(context=nil, csv_contents_or_attachment=nil, user=nil, progress=nil) + # TODO: change the parameter to "gradebook_upload", and remove these conditionals + # once we know the S3 pipeline works and old jobs are cleared + if context.is_a?(GradebookUpload) + @upload = context + @context = @upload.course + else + @context = context + end + + raise ArgumentError, "Must provide a valid context for this gradebook." unless valid_context?(@context) + # TODO: Change this error to "Must provide attachment id when we're sure S3 pipeline works" + raise ArgumentError, "Must provide CSV contents or attachment." unless csv_contents_or_attachment + @user = user - @contents = csv + # TODO: change the parameter to "attachment", and remove these conditionals + # once we know the S3 pipeline works and old jobs are cleared + if csv_contents_or_attachment.is_a?(Attachment) + @attachment = csv_contents_or_attachment + else + @contents = csv_contents_or_attachment + end @progress = progress - @upload = GradebookUpload.new course: @context, user: @user, progress: @progress + # TODO: remove this line entirely + # once we know the S3 pipeline works and old jobs are cleared + @upload ||= GradebookUpload.new course: @context, user: @user, progress: @progress if @context.feature_enabled?(:differentiated_assignments) @visible_assignments = AssignmentStudentVisibility.visible_assignment_ids_in_course_by_user( @@ -80,24 +100,19 @@ class GradebookImporter .select(['users.id', :name, :sortable_name]) .index_by(&:id) - csv = CSV.parse(contents, :converters => :nil) - @assignments = process_header(csv) + @assignments = nil @root_accounts = {} @pseudonyms_by_sis_id = {} @pseudonyms_by_login_id = {} @students = [] @pp_row = [] - csv.each do |row| - if row[0] =~ /Points Possible/ - row.shift(@student_columns) - process_pp(row) - next + + csv_stream do |row| + already_processed = check_for_non_student_row(row) + unless already_processed + @students << process_student(row) + process_submissions(row, @students.last) end - - next if row.compact.all? { |c| c.strip =~ /^(Muted|)$/i } - - @students << process_student(row) - process_submissions(row, @students.last) end @assignments_outside_current_periods = [] @@ -202,8 +217,7 @@ class GradebookImporter end end - def process_header(csv) - row = csv.shift + def process_header(row) raise "Couldn't find header row" unless header?(row) row = strip_non_assignment_columns(row) @@ -357,6 +371,53 @@ class GradebookImporter end protected + + CSV_PARSE_OPTIONS = { + converters: :nil, + skip_lines: /^[, ]*$/ + }.freeze + + def check_for_non_student_row(row) + # check if this is the first row, a header row + if @assignments.nil? + @assignments = process_header(row) + return true + end + + + if row[0] =~ /Points Possible/ + # this row is describing the assignment, has no student data + row.shift(@student_columns) + process_pp(row) + return true + end + + if row.compact.all? { |c| c.strip =~ /^(Muted|)$/i } + # this row is muted or empty and should not be processed at all + return true + end + + false # nothing unusual, signal to process as a student row + end + + def csv_stream + if contents # TODO: remove this branch entirely when S3 is proved to work + CSV.parse(contents, CSV_PARSE_OPTIONS) do |row| + yield row + end + elsif attachment + csv_file = attachment.open(need_local_file: true) + # using "foreach" rather than "parse" processes a chunk of the + # file at a time rather than loading the whole file into memory + # at once, a boon for memory consumption + CSV.foreach(csv_file.path, CSV_PARSE_OPTIONS) do |row| + yield row + end + else + raise "We have no gradebook input to iterate over..." + end + end + def add_root_account_to_pseudonym_cache(root_account) pseudonyms = root_account.shard.activate do root_account.pseudonyms diff --git a/spec/controllers/gradebook_uploads_controller_spec.rb b/spec/controllers/gradebook_uploads_controller_spec.rb index 55dc29cc3ef..3f56aa224d7 100644 --- a/spec/controllers/gradebook_uploads_controller_spec.rb +++ b/spec/controllers/gradebook_uploads_controller_spec.rb @@ -32,17 +32,21 @@ describe GradebookUploadsController do @course.reload end - def generate_file(include_sis_id) + def generate_file(include_sis_id=false) file = Tempfile.new("csv.csv") file.puts(GradebookExporter.new(@course, @teacher, :include_sis_id => include_sis_id).to_csv) file.close file end + def upload_gradebook_import(course, file) + data = Rack::Test::UploadedFile.new(file.path, 'text/csv', true) + post 'create', course_id: course.id, gradebook_upload: {uploaded_data: data} + end + def check_create_response(include_sis_id=false) file = generate_file(include_sis_id) - data = Rack::Test::UploadedFile.new(file.path, 'text/csv', true) - post 'create', :course_id => @course.id, :gradebook_upload => {:uploaded_data => data} + upload_gradebook_import(@course, file) expect(response).to be_success end @@ -77,23 +81,34 @@ describe GradebookUploadsController do assert_unauthorized end - it "should accept a valid csv upload" do - user_session(@teacher) - check_create_response - end + context "with authorized teacher" do + before(:each) { user_session(@teacher) } - it "should accept a valid csv upload with a final grade column" do - user_session(@teacher) - @course.grading_standard_id = 0 - @course.save! - check_create_response - end + it "should accept a valid csv upload" do + check_create_response + end - it "should accept a valid csv upload with sis id columns" do - user_session(@teacher) - @course.grading_standard_id = 0 - @course.save! - check_create_response(true) + it "puts the uploaded data into a durable attachment so it's recoverable" do + gradebook_import_file = generate_file + upload_gradebook_import(@course, gradebook_import_file) + attachment = Attachment.last + expect(gradebook_import_file.path).to include(attachment.filename) + end + + context "and final grade column" do + before(:each) do + @course.grading_standard_id = 0 + @course.save! + end + + it "should accept a valid csv upload with a final grade column" do + check_create_response + end + + it "should accept a valid csv upload with sis id columns" do + check_create_response(true) + end + end end end diff --git a/spec/lib/gradebook_importer_spec.rb b/spec/lib/gradebook_importer_spec.rb index 5a774985749..9a76ed27341 100644 --- a/spec/lib/gradebook_importer_spec.rb +++ b/spec/lib/gradebook_importer_spec.rb @@ -21,34 +21,35 @@ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper.rb') require 'csv' describe GradebookImporter do + let(:gradebook_user){ user_model } + context "construction" do + let!(:gradebook_course){ course_model } + it "should require a context, usually a course" do - course = course_model user = user_model progress = Progress.create!(tag: "test", context: @user) expect{GradebookImporter.new(1)}.to raise_error(ArgumentError, "Must provide a valid context for this gradebook.") - expect{GradebookImporter.new(course, valid_gradebook_contents, user, progress)}.not_to raise_error + expect{GradebookImporter.new(gradebook_course, valid_gradebook_contents, user, progress)}.not_to raise_error end it "should store the context and make it available" do - course_model new_gradebook_importer expect(@gi.context).to be_is_a(Course) end it "should require the contents of an upload" do - expect{GradebookImporter.new(course_model)}.to raise_error(ArgumentError, "Must provide CSV contents.") + expect{ GradebookImporter.new(gradebook_course) }. + to raise_error(ArgumentError, "Must provide CSV contents or attachment.") end it "should store the contents and make them available" do - course_model new_gradebook_importer expect(@gi.contents).not_to be_nil end it "should handle points possible being sorted in weird places" do - course_model importer_with_rows( 'Student,ID,Section,Assignment 1,Final Score', '"Blend, Bill",6,My Course,-,', @@ -60,7 +61,6 @@ describe GradebookImporter do end it "should handle muted line and being sorted in weird places" do - course_model importer_with_rows( 'Student,ID,Section,Assignment 1,Final Score', '"Blend, Bill",6,My Course,-,', @@ -73,10 +73,47 @@ describe GradebookImporter do end it "creates a GradebookUpload" do - course_model new_gradebook_importer expect(GradebookUpload.where(course_id: @course, user_id: @user)).not_to be_empty end + + context "when attachment and gradebook_upload is provided" do + + let(:attachment) do + a = attachment_model + file = Tempfile.new("gradebook.csv") + file.puts("'Student,ID,Section,Assignment 1,Final Score'\n") + file.puts("\"Blend, Bill\",6,My Course,-,\n") + file.close + a.stubs(:open).returns(file) + return a + end + + let(:progress){ Progress.create!(tag: "test", context: gradebook_user) } + + let(:upload) do + GradebookUpload.create!(course: gradebook_course, user: gradebook_user, progress: progress) + end + + let(:importer) { new_gradebook_importer(attachment, upload, gradebook_user, progress) } + + it "hangs onto the provided model for streaming" do + expect(importer.attachment).to eq(attachment) + end + + it "nils out contents when using an attachment (saves on memory to not parse all at once)" do + expect(importer.contents).to be_nil + end + + it "keeps the provided upload rather than creating a new one" do + expect(importer.upload).to eq(upload) + end + + it "sets the uploads course as the importer context" do + expect(importer.context).to eq(gradebook_course) + end + + end end context "User lookup" do @@ -593,11 +630,11 @@ CSV end end -def new_gradebook_importer(contents = valid_gradebook_contents) - @user = user_model - @progress = Progress.create!(tag: "test", context: @user) +def new_gradebook_importer(contents = valid_gradebook_contents, context = @course, user = gradebook_user, progress = nil) + @user = user + @progress = progress || Progress.create!(tag: "test", context: @user) - @gi = GradebookImporter.new(@course, contents, @user, @progress) + @gi = GradebookImporter.new(context, contents, @user, @progress) @gi.parse! @gi end @@ -611,5 +648,12 @@ def valid_gradebook_contents_with_sis_login_id end def importer_with_rows(*rows) - new_gradebook_importer(rows.join("\n")) + a = attachment_model + file = Tempfile.new("gradebook_import.csv") + rows.each do |row| + file.puts(row) + end + file.close + a.stubs(:open).returns(file) + new_gradebook_importer(a) end diff --git a/spec/models/gradebook_upload_spec.rb b/spec/models/gradebook_upload_spec.rb new file mode 100644 index 00000000000..506d9864314 --- /dev/null +++ b/spec/models/gradebook_upload_spec.rb @@ -0,0 +1,59 @@ +# +# Copyright (C) 2016 Instructure, Inc. +# +# This file is part of Canvas. +# +# Canvas is free software: you can redistribute it and/or modify it under +# the terms of the GNU Affero General Public License as published by the Free +# Software Foundation, version 3 of the License. +# +# Canvas is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR +# A PARTICULAR PURPOSE. See the GNU Affero General Public License for more +# details. +# +# You should have received a copy of the GNU Affero General Public License along +# with this program. If not, see . +# +require_relative '../spec_helper' + +describe GradebookUpload do + describe ".queue_from" do + let(:teacher_enrollment){ teacher_in_course } + let(:teacher){ teacher_enrollment.user } + let(:gradebook_course){ teacher_enrollment.course } + let(:attachment_data){ {dummy: "data"} } + + before(:each) do + # actual attachment integration covered in gradebook_uploads_controller_spec; + # that means in the spec the dummy hash will be enqueued instead of a real attachment + # object + GradebookUpload.any_instance.stubs(attachments: stub(create!: attachment_data)) + end + + it "builds a progress object to track the import" do + progress = GradebookUpload.queue_from(gradebook_course, teacher, attachment_data) + expect(progress.workflow_state).to eq("queued") + expect(progress.tag).to eq("gradebook_upload") + expect(progress.context).to eq(gradebook_course) + end + + it "queues a job to run the import" do + GradebookUpload.queue_from(gradebook_course, teacher, attachment_data) + job = Delayed::Job.last + expect(job.tag).to match(/GradebookImporter/) + end + + it "stores the input data in an attachment on the gradebook upload" do + GradebookUpload.queue_from(gradebook_course, teacher, attachment_data) + job = Delayed::Job.last + expect(job.handler).to include("dummy: data") + end + + it "creates a GradebookUpload object to represent the upload process" do + GradebookUpload.queue_from(gradebook_course, teacher, attachment_data) + upload = GradebookUpload.last + expect(upload.course).to eq(gradebook_course) + end + end +end