From df0793abb929e66606fa25f3875ff1b89de5ad32 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 19 Jan 2012 09:20:09 +0100 Subject: [PATCH 01/16] block,cfq: change code order cfq_slice_expired will change saved_workload_slice. It should be called first so saved_workload_slice is correctly set to 0 after workload type is changed. This fixes the code order changed by 54b466e44b1c7. Tested-by: Tetsuo Handa Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index ee55019066a1..da21c24dbed3 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3117,17 +3117,18 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, */ static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) { + enum wl_type_t old_type = cfqq_type(cfqd->active_queue); + cfq_log_cfqq(cfqd, cfqq, "preempt"); + cfq_slice_expired(cfqd, 1); /* * workload type is changed, don't save slice, otherwise preempt * doesn't happen */ - if (cfqq_type(cfqd->active_queue) != cfqq_type(cfqq)) + if (old_type != cfqq_type(cfqq)) cfqq->cfqg->saved_workload_slice = 0; - cfq_slice_expired(cfqd, 1); - /* * Put the new queue at the front of the of the current list, * so we know that it will be selected next. From 05c30b9551f1904d9950ad0d28e65fc4ff3c8a8e Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 19 Jan 2012 09:20:10 +0100 Subject: [PATCH 02/16] block: fix NULL icq_cache reference Vivek reported a kernel crash: [ 94.217015] BUG: unable to handle kernel NULL pointer dereference at 000000000000001c [ 94.218004] IP: [] kmem_cache_free+0x5e/0x200 [ 94.218004] PGD 13abda067 PUD 137d52067 PMD 0 [ 94.218004] Oops: 0000 [#1] SMP DEBUG_PAGEALLOC [ 94.218004] CPU 0 [ 94.218004] Modules linked in: [last unloaded: scsi_wait_scan] [ 94.218004] [ 94.218004] Pid: 0, comm: swapper/0 Not tainted 3.2.0+ #16 Hewlett-Packard HP xw6600 Workstation/0A9Ch [ 94.218004] RIP: 0010:[] [] kmem_cache_free+0x5e/0x200 [ 94.218004] RSP: 0018:ffff88013fc03de0 EFLAGS: 00010006 [ 94.218004] RAX: ffffffff81e0d020 RBX: ffff880138b3c680 RCX: 00000001801c001b [ 94.218004] RDX: 00000000003aac1d RSI: ffff880138b3c680 RDI: ffffffff81142fae [ 94.218004] RBP: ffff88013fc03e10 R08: ffff880137830238 R09: 0000000000000001 [ 94.218004] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 [ 94.218004] R13: ffffea0004e2cf00 R14: ffffffff812f6eb6 R15: 0000000000000246 [ 94.218004] FS: 0000000000000000(0000) GS:ffff88013fc00000(0000) knlGS:0000000000000000 [ 94.218004] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 94.218004] CR2: 000000000000001c CR3: 00000001395ab000 CR4: 00000000000006f0 [ 94.218004] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 94.218004] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 94.218004] Process swapper/0 (pid: 0, threadinfo ffffffff81e00000, task ffffffff81e0d020) [ 94.218004] Stack: [ 94.218004] 0000000000000102 ffff88013fc0db20 ffffffff81e22700 ffff880139500f00 [ 94.218004] 0000000000000001 000000000000000a ffff88013fc03e20 ffffffff812f6eb6 [ 94.218004] ffff88013fc03e90 ffffffff810c8da2 ffffffff81e01fd8 ffff880137830240 [ 94.218004] Call Trace: [ 94.218004] [ 94.218004] [] icq_free_icq_rcu+0x16/0x20 [ 94.218004] [] __rcu_process_callbacks+0x1c2/0x420 [ 94.218004] [] rcu_process_callbacks+0x38/0x250 [ 94.218004] [] __do_softirq+0xce/0x3e0 [ 94.218004] [] ? clockevents_program_event+0x74/0x100 [ 94.218004] [] ? 
tick_program_event+0x24/0x30 [ 94.218004] [] call_softirq+0x1c/0x30 [ 94.218004] [] do_softirq+0x8d/0xc0 [ 94.218004] [] irq_exit+0xae/0xe0 [ 94.218004] [] smp_apic_timer_interrupt+0x6e/0x99 [ 94.218004] [] apic_timer_interrupt+0x70/0x80 Once a queue is quiesced, it's not supposed to have any elvpriv data or icq's, and elevator switching depends on that. The request alloc path followed the rule for elvpriv data but forgot to apply it to icq's, leading to the following crash during elevator switch. Fix it by not allocating icq's if ELVPRIV is not set for the request. Reported-by: Vivek Goyal Tested-by: Vivek Goyal Signed-off-by: Shaohua Li Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-core.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index e6c05a97ee2b..636702575118 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -872,13 +872,15 @@ retry: spin_unlock_irq(q->queue_lock); /* create icq if missing */ - if (unlikely(et->icq_cache && !icq)) + if ((rw_flags & REQ_ELVPRIV) && unlikely(et->icq_cache && !icq)) { icq = ioc_create_icq(q, gfp_mask); + if (!icq) + goto fail_icq; + } - /* rqs are guaranteed to have icq on elv_set_request() if requested */ - if (likely(!et->icq_cache || icq)) - rq = blk_alloc_request(q, icq, rw_flags, gfp_mask); + rq = blk_alloc_request(q, icq, rw_flags, gfp_mask); +fail_icq: if (unlikely(!rq)) { /* * Allocation failed presumably due to memory. Undo anything From 9fa73472ddbcd3da87d35a7f4566eaaf345f798e Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 6 Feb 2012 08:57:29 +0100 Subject: [PATCH 03/16] block: fix ioc locking warning Meelis reported a warning: WARNING: at kernel/timer.c:1122 run_timer_softirq+0x199/0x1ec() Hardware name: 939Dual-SATA2 timer: cfq_idle_slice_timer+0x0/0xaa preempt leak: 00000102 -> 00000103 Modules linked in: sr_mod cdrom videodev media drm_kms_helper ohci_hcd ehci_hcd v4l2_compat_ioctl32 usbcore i2c_ali15x3 snd_seq drm snd_timer snd_seq Pid: 0, comm: swapper Not tainted 3.3.0-rc2-00110-gd125666 #176 Call Trace: [] warn_slowpath_common+0x7e/0x96 [] ? cfq_slice_expired+0x1d/0x1d [] warn_slowpath_fmt+0x41/0x43 [] ? cfq_idle_slice_timer+0xa1/0xaa [] ? cfq_slice_expired+0x1d/0x1d [] run_timer_softirq+0x199/0x1ec [] ? timekeeping_get_ns+0x12/0x31 [] ? apic_write+0x11/0x13 [] __do_softirq+0x74/0xfa [] call_softirq+0x1a/0x30 [] do_softirq+0x31/0x68 [] irq_exit+0x3d/0xa3 [] smp_apic_timer_interrupt+0x6b/0x77 [] apic_timer_interrupt+0x69/0x70 [] ? sched_clock_cpu+0x73/0x7d [] ? sched_clock_cpu+0x73/0x7d [] ? default_idle+0x1e/0x32 [] ? default_idle+0x18/0x32 [] cpu_idle+0x87/0xd1 [] rest_init+0x85/0x89 [] start_kernel+0x2eb/0x2f8 [] x86_64_start_reservations+0x7e/0x82 [] x86_64_start_kernel+0xf0/0xf7 this_q == locked_q is possible. There are two problems here: 1. In the UP case, there is a preemption counter issue, as spin_trylock() always succeeds. 2. In the SMP case, the loop breaks too early.
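For readers unfamiliar with the UP quirk behind problem 1: on CONFIG_SMP=n kernels, spin_trylock() reduces to preempt_disable() plus an unconditional success, so "trying" a lock the caller already holds still succeeds and bumps the preemption count a second time, while the early break path drops it only once. A minimal userspace model of this, where the helpers are illustrative stand-ins and not kernel APIs:

#include <stdio.h>

static int preempt_count;          /* stand-in for the per-CPU preempt counter */

static int up_spin_trylock(void)
{
        preempt_count++;           /* preempt_disable() */
        return 1;                  /* UP: no lock word, always "succeeds" */
}

static void up_spin_unlock(void)
{
        preempt_count--;           /* preempt_enable() */
}

int main(void)
{
        up_spin_trylock();         /* caller already holds the queue lock */
        up_spin_trylock();         /* this_q == locked_q: "succeeds" again */
        up_spin_unlock();          /* but only one matching unlock runs */
        printf("preempt count leaked: %d\n", preempt_count);  /* prints 1 */
        return 0;
}

This is exactly the 00000102 -> 00000103 preempt leak reported in the warning above.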
Signed-off-by: Shaohua Li Reported-by: Meelis Roos Reported-by: Knut Petersen Tested-by: Knut Petersen Signed-off-by: Jens Axboe --- block/blk-ioc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 27a06e00eaec..7490b6da2453 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -204,7 +204,9 @@ void put_io_context(struct io_context *ioc, struct request_queue *locked_q) spin_unlock(last_q->queue_lock); last_q = NULL; - if (!spin_trylock(this_q->queue_lock)) + /* spin_trylock() always successes in UP case */ + if (this_q != locked_q && + !spin_trylock(this_q->queue_lock)) break; last_q = this_q; continue; From 822bfa51ce44f2c63c300fdb76dc99c4d5a5ca9f Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 6 Feb 2012 10:20:45 +0100 Subject: [PATCH 04/16] cdrom: use copy_to_user() without the underscores "nframes" comes from the user and "nframes * CD_FRAMESIZE_RAW" can wrap on 32 bit systems. That would have been ok if we used the same wrapped value for the copy, but we use a shifted value. We should just use the checked version of copy_to_user() because it's not going to make a difference to the speed. Cc: stable@vger.kernel.com Signed-off-by: Dan Carpenter Signed-off-by: Jens Axboe --- drivers/cdrom/cdrom.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index 55eaf474d32c..8fefe59f52a7 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -2119,11 +2119,6 @@ static int cdrom_read_cdda_old(struct cdrom_device_info *cdi, __u8 __user *ubuf, if (!nr) return -ENOMEM; - if (!access_ok(VERIFY_WRITE, ubuf, nframes * CD_FRAMESIZE_RAW)) { - ret = -EFAULT; - goto out; - } - cgc.data_direction = CGC_DATA_READ; while (nframes > 0) { if (nr > nframes) @@ -2132,7 +2127,7 @@ static int cdrom_read_cdda_old(struct cdrom_device_info *cdi, __u8 __user *ubuf, ret = cdrom_read_block(cdi, &cgc, lba, nr, 1, CD_FRAMESIZE_RAW); if (ret) break; - if (__copy_to_user(ubuf, cgc.buffer, CD_FRAMESIZE_RAW * nr)) { + if (copy_to_user(ubuf, cgc.buffer, CD_FRAMESIZE_RAW * nr)) { ret = -EFAULT; break; } @@ -2140,7 +2135,6 @@ static int cdrom_read_cdda_old(struct cdrom_device_info *cdi, __u8 __user *ubuf, nframes -= nr; lba += nr; } -out: kfree(cgc.buffer); return ret; } From 11a3122f6cf2d988a77eb8883d0fc49cd013a6d5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 7 Feb 2012 07:51:30 +0100 Subject: [PATCH 05/16] block: strip out locking optimization in put_io_context() put_io_context() performed a complex trylock dance to avoid deferring ioc release to workqueue. It was also broken on UP because trylock was always assumed to succeed, which resulted in an unbalanced preemption count. While there are ways to fix the UP breakage, even the most pathological microbench (forced ioc allocation and tight fork/exit loop) fails to show any appreciable performance benefit of the optimization. Strip it out. If there turns out to be workloads which are affected by this change, simpler optimization from the discussion thread can be applied later.
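The resulting release path is simple enough to model in a few lines. The sketch below illustrates the pattern only; struct ioc_model and its helpers are invented names, not the kernel's types. Drop a reference, and only the final put either frees inline or defers the lock-heavy teardown to a worker:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct ioc_model {
        _Atomic long refcount;
        bool icq_list_nonempty;    /* stand-in for !hlist_empty(&ioc->icq_list) */
};

/* stand-in for schedule_work(&ioc->release_work) */
static void schedule_release_work(struct ioc_model *ioc)
{
        printf("teardown for %p deferred to process context\n", (void *)ioc);
}

static void put_ioc_model(struct ioc_model *ioc)
{
        /* drop a reference; only the last one triggers any teardown */
        if (atomic_fetch_sub(&ioc->refcount, 1) != 1)
                return;

        /*
         * No trylock dance: reverse-order double locking is always
         * punted to a worker, which can take the locks in a safe order.
         */
        if (ioc->icq_list_nonempty)
                schedule_release_work(ioc);
        else
                printf("freed inline\n");
}

int main(void)
{
        struct ioc_model ioc = { .refcount = 2, .icq_list_nonempty = true };

        put_ioc_model(&ioc);       /* not the last ref: nothing happens */
        put_ioc_model(&ioc);       /* last ref: defer to the workqueue */
        return 0;
}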
Signed-off-by: Tejun Heo LKML-Reference: <1328514611.21268.66.camel@sli10-conroe> Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 2 +- block/blk-core.c | 2 +- block/blk-ioc.c | 92 +++++---------------------------------- block/cfq-iosched.c | 2 +- fs/ioprio.c | 2 +- include/linux/blkdev.h | 3 -- include/linux/iocontext.h | 5 +-- kernel/fork.c | 2 +- 8 files changed, 18 insertions(+), 92 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index fa8f26309444..75642a352a8f 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1659,7 +1659,7 @@ static void blkiocg_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); if (ioc) { ioc_cgroup_changed(ioc); - put_io_context(ioc, NULL); + put_io_context(ioc); } } } diff --git a/block/blk-core.c b/block/blk-core.c index 636702575118..532b3a21b383 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -642,7 +642,7 @@ static inline void blk_free_request(struct request_queue *q, struct request *rq) if (rq->cmd_flags & REQ_ELVPRIV) { elv_put_request(q, rq); if (rq->elv.icq) - put_io_context(rq->elv.icq->ioc, q); + put_io_context(rq->elv.icq->ioc); } mempool_free(rq, q->rq.rq_pool); diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 7490b6da2453..9884fd7427fe 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -29,21 +29,6 @@ void get_io_context(struct io_context *ioc) } EXPORT_SYMBOL(get_io_context); -/* - * Releasing ioc may nest into another put_io_context() leading to nested - * fast path release. As the ioc's can't be the same, this is okay but - * makes lockdep whine. Keep track of nesting and use it as subclass. - */ -#ifdef CONFIG_LOCKDEP -#define ioc_release_depth(q) ((q) ? (q)->ioc_release_depth : 0) -#define ioc_release_depth_inc(q) (q)->ioc_release_depth++ -#define ioc_release_depth_dec(q) (q)->ioc_release_depth-- -#else -#define ioc_release_depth(q) 0 -#define ioc_release_depth_inc(q) do { } while (0) -#define ioc_release_depth_dec(q) do { } while (0) -#endif - static void icq_free_icq_rcu(struct rcu_head *head) { struct io_cq *icq = container_of(head, struct io_cq, __rcu_head); @@ -75,11 +60,8 @@ static void ioc_exit_icq(struct io_cq *icq) if (rcu_dereference_raw(ioc->icq_hint) == icq) rcu_assign_pointer(ioc->icq_hint, NULL); - if (et->ops.elevator_exit_icq_fn) { - ioc_release_depth_inc(q); + if (et->ops.elevator_exit_icq_fn) et->ops.elevator_exit_icq_fn(icq); - ioc_release_depth_dec(q); - } /* * @icq->q might have gone away by the time RCU callback runs @@ -149,81 +131,29 @@ static void ioc_release_fn(struct work_struct *work) /** * put_io_context - put a reference of io_context * @ioc: io_context to put - * @locked_q: request_queue the caller is holding queue_lock of (hint) * * Decrement reference count of @ioc and release it if the count reaches - * zero. If the caller is holding queue_lock of a queue, it can indicate - * that with @locked_q. This is an optimization hint and the caller is - * allowed to pass in %NULL even when it's holding a queue_lock. + * zero. */ -void put_io_context(struct io_context *ioc, struct request_queue *locked_q) +void put_io_context(struct io_context *ioc) { - struct request_queue *last_q = locked_q; unsigned long flags; if (ioc == NULL) return; BUG_ON(atomic_long_read(&ioc->refcount) <= 0); - if (locked_q) - lockdep_assert_held(locked_q->queue_lock); - - if (!atomic_long_dec_and_test(&ioc->refcount)) - return; /* - * Destroy @ioc. 
This is a bit messy because icq's are chained - * from both ioc and queue, and ioc->lock nests inside queue_lock. - * The inner ioc->lock should be held to walk our icq_list and then - * for each icq the outer matching queue_lock should be grabbed. - * ie. We need to do reverse-order double lock dancing. - * - * Another twist is that we are often called with one of the - * matching queue_locks held as indicated by @locked_q, which - * prevents performing double-lock dance for other queues. - * - * So, we do it in two stages. The fast path uses the queue_lock - * the caller is holding and, if other queues need to be accessed, - * uses trylock to avoid introducing locking dependency. This can - * handle most cases, especially if @ioc was performing IO on only - * single device. - * - * If trylock doesn't cut it, we defer to @ioc->release_work which - * can do all the double-locking dancing. + * Releasing ioc requires reverse order double locking and we may + * already be holding a queue_lock. Do it asynchronously from wq. */ - spin_lock_irqsave_nested(&ioc->lock, flags, - ioc_release_depth(locked_q)); - - while (!hlist_empty(&ioc->icq_list)) { - struct io_cq *icq = hlist_entry(ioc->icq_list.first, - struct io_cq, ioc_node); - struct request_queue *this_q = icq->q; - - if (this_q != last_q) { - if (last_q && last_q != locked_q) - spin_unlock(last_q->queue_lock); - last_q = NULL; - - /* spin_trylock() always successes in UP case */ - if (this_q != locked_q && - !spin_trylock(this_q->queue_lock)) - break; - last_q = this_q; - continue; - } - ioc_exit_icq(icq); + if (atomic_long_dec_and_test(&ioc->refcount)) { + spin_lock_irqsave(&ioc->lock, flags); + if (!hlist_empty(&ioc->icq_list)) + schedule_work(&ioc->release_work); + spin_unlock_irqrestore(&ioc->lock, flags); } - - if (last_q && last_q != locked_q) - spin_unlock(last_q->queue_lock); - - spin_unlock_irqrestore(&ioc->lock, flags); - - /* if no icq is left, we're done; otherwise, kick release_work */ - if (hlist_empty(&ioc->icq_list)) - kmem_cache_free(iocontext_cachep, ioc); - else - schedule_work(&ioc->release_work); } EXPORT_SYMBOL(put_io_context); @@ -238,7 +168,7 @@ void exit_io_context(struct task_struct *task) task_unlock(task); atomic_dec(&ioc->nr_tasks); - put_io_context(ioc, NULL); + put_io_context(ioc); } /** diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index da21c24dbed3..5684df6848bc 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1794,7 +1794,7 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfqd->active_queue = NULL; if (cfqd->active_cic) { - put_io_context(cfqd->active_cic->icq.ioc, cfqd->queue); + put_io_context(cfqd->active_cic->icq.ioc); cfqd->active_cic = NULL; } } diff --git a/fs/ioprio.c b/fs/ioprio.c index f84b380d65e5..0f1b9515213b 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -51,7 +51,7 @@ int set_task_ioprio(struct task_struct *task, int ioprio) ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); if (ioc) { ioc_ioprio_changed(ioc, ioprio); - put_io_context(ioc, NULL); + put_io_context(ioc); } return err; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6c6a1f008065..606cf339bb56 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -399,9 +399,6 @@ struct request_queue { /* Throttle data */ struct throtl_data *td; #endif -#ifdef CONFIG_LOCKDEP - int ioc_release_depth; -#endif }; #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 
7e1371c4bccf..119773eebe31 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -133,7 +133,7 @@ static inline struct io_context *ioc_task_link(struct io_context *ioc) struct task_struct; #ifdef CONFIG_BLOCK -void put_io_context(struct io_context *ioc, struct request_queue *locked_q); +void put_io_context(struct io_context *ioc); void exit_io_context(struct task_struct *task); struct io_context *get_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node); @@ -141,8 +141,7 @@ void ioc_ioprio_changed(struct io_context *ioc, int ioprio); void ioc_cgroup_changed(struct io_context *ioc); #else struct io_context; -static inline void put_io_context(struct io_context *ioc, - struct request_queue *locked_q) { } +static inline void put_io_context(struct io_context *ioc) { } static inline void exit_io_context(struct task_struct *task) { } #endif diff --git a/kernel/fork.c b/kernel/fork.c index 051f090d40c1..c574aefa8d1b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -890,7 +890,7 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk) return -ENOMEM; new_ioc->ioprio = ioc->ioprio; - put_io_context(new_ioc, NULL); + put_io_context(new_ioc); } #endif return 0; From 4e8670e26135d8fbfd5e084fddc1a8ed9f8eb4cb Mon Sep 17 00:00:00 2001 From: Asai Thambi S P Date: Tue, 7 Feb 2012 07:54:31 +0100 Subject: [PATCH 06/16] mtip32xx: removed the irrelevant argument of mtip_hw_submit_io() and the unused member of struct driver_data Removed the following: * irrelevant argument 'barrier' of mtip_hw_submit_io() * unused member 'eh_active' of struct driver_data Signed-off-by: Asai Thambi S P Signed-off-by: Sam Bradshaw Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 11 +++++------ drivers/block/mtip32xx/mtip32xx.h | 5 ----- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index b74eab70c3d0..8eb81c96608f 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -2068,8 +2068,6 @@ static int mtip_hw_ioctl(struct driver_data *dd, unsigned int cmd, * when the read completes. * @data Callback data passed to the callback function * when the read completes. - * @barrier If non-zero, this command must be completed before - * issuing any other commands. * @dir Direction (read or write) * * return value @@ -2077,7 +2075,7 @@ static int mtip_hw_ioctl(struct driver_data *dd, unsigned int cmd, */ static void mtip_hw_submit_io(struct driver_data *dd, sector_t start, int nsect, int nents, int tag, void *callback, - void *data, int barrier, int dir) + void *data, int dir) { struct host_to_dev_fis *fis; struct mtip_port *port = dd->port; @@ -2108,8 +2106,6 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t start, *((unsigned int *) &fis->lba_low) = (start & 0xFFFFFF); *((unsigned int *) &fis->lba_low_ex) = ((start >> 24) & 0xFFFFFF); fis->device = 1 << 6; - if (barrier) - fis->device |= FUA_BIT; fis->features = nsect & 0xFF; fis->features_ex = (nsect >> 8) & 0xFF; fis->sect_count = ((tag << 3) | (tag >> 5)); @@ -3087,7 +3083,6 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio) tag, bio_endio, bio, - bio->bi_rw & REQ_FUA, bio_data_dir(bio)); } else bio_io_error(bio); @@ -3187,6 +3182,10 @@ skip_create_disk: blk_queue_max_segments(dd->queue, MTIP_MAX_SG); blk_queue_physical_block_size(dd->queue, 4096); blk_queue_io_min(dd->queue, 4096); + /* + * write back cache is not supported in the device. 
FUA depends on + * write back cache support, hence setting flush support to zero. + */ blk_queue_flush(dd->queue, 0); /* Set the capacity of the device in 512 byte sectors. */ diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h index 723d7c4946dc..e0554a8f2233 100644 --- a/drivers/block/mtip32xx/mtip32xx.h +++ b/drivers/block/mtip32xx/mtip32xx.h @@ -104,9 +104,6 @@ /* BAR number used to access the HBA registers. */ #define MTIP_ABAR 5 -/* Forced Unit Access Bit */ -#define FUA_BIT 0x80 - #ifdef DEBUG #define dbg_printk(format, arg...) \ printk(pr_fmt(format), ##arg); @@ -415,8 +412,6 @@ struct driver_data { atomic_t resumeflag; /* Atomic variable to track suspend/resume */ - atomic_t eh_active; /* Flag for error handling tracking */ - struct task_struct *mtip_svc_handler; /* task_struct of svc thd */ }; From 050c8ea80e3e90019d9e981c6a117ef614e882ed Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 8 Feb 2012 09:19:38 +0100 Subject: [PATCH 07/16] block: separate out blk_rq_merge_ok() and blk_try_merge() from elevator functions blk_rq_merge_ok() is the elevator-neutral part of merge eligibility test. blk_try_merge() determines merge direction and expects the caller to have tested elv_rq_merge_ok() previously. elv_rq_merge_ok() now wraps blk_rq_merge_ok() and then calls elv_iosched_allow_merge(). elv_try_merge() is removed and the two callers are updated to call elv_rq_merge_ok() explicitly followed by blk_try_merge(). While at it, make rq_merge_ok() functions return bool. This is to prepare for plug merge update and doesn't introduce any behavior change. This is based on Jens' patch to skip elevator_allow_merge_fn() from plug merge. Signed-off-by: Tejun Heo LKML-Reference: <4F16F3CA.90904@kernel.dk> Original-patch-by: Jens Axboe Signed-off-by: Jens Axboe --- block/blk-core.c | 4 +-- block/blk-merge.c | 37 +++++++++++++++++++++++++++ block/blk.h | 2 ++ block/elevator.c | 55 +++------------------------------------- include/linux/elevator.h | 3 +-- 5 files changed, 46 insertions(+), 55 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 532b3a21b383..fa697bf691eb 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1282,10 +1282,10 @@ static bool attempt_plug_merge(struct request_queue *q, struct bio *bio, (*request_count)++; - if (rq->q != q) + if (rq->q != q || !elv_rq_merge_ok(rq, bio)) continue; - el_ret = elv_try_merge(rq, bio); + el_ret = blk_try_merge(rq, bio); if (el_ret == ELEVATOR_BACK_MERGE) { ret = bio_attempt_back_merge(q, rq, bio); if (ret) diff --git a/block/blk-merge.c b/block/blk-merge.c index cfcc37cb222b..160035f54882 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -471,3 +471,40 @@ int blk_attempt_req_merge(struct request_queue *q, struct request *rq, { return attempt_merge(q, rq, next); } + +bool blk_rq_merge_ok(struct request *rq, struct bio *bio) +{ + if (!rq_mergeable(rq)) + return false; + + /* don't merge file system requests and discard requests */ + if ((bio->bi_rw & REQ_DISCARD) != (rq->bio->bi_rw & REQ_DISCARD)) + return false; + + /* don't merge discard requests and secure discard requests */ + if ((bio->bi_rw & REQ_SECURE) != (rq->bio->bi_rw & REQ_SECURE)) + return false; + + /* different data direction or already started, don't merge */ + if (bio_data_dir(bio) != rq_data_dir(rq)) + return false; + + /* must be same device and not a special request */ + if (rq->rq_disk != bio->bi_bdev->bd_disk || rq->special) + return false; + + /* only merge integrity protected bio into ditto rq */ + if 
(bio_integrity(bio) != blk_integrity_rq(rq)) + return false; + + return true; +} + +int blk_try_merge(struct request *rq, struct bio *bio) +{ + if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_sector) + return ELEVATOR_BACK_MERGE; + else if (blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_sector) + return ELEVATOR_FRONT_MERGE; + return ELEVATOR_NO_MERGE; +} diff --git a/block/blk.h b/block/blk.h index 7efd772336de..9c12f80882b0 100644 --- a/block/blk.h +++ b/block/blk.h @@ -137,6 +137,8 @@ int blk_attempt_req_merge(struct request_queue *q, struct request *rq, struct request *next); void blk_recalc_rq_segments(struct request *rq); void blk_rq_set_mixed_merge(struct request *rq); +bool blk_rq_merge_ok(struct request *rq, struct bio *bio); +int blk_try_merge(struct request *rq, struct bio *bio); void blk_queue_congestion_threshold(struct request_queue *q); diff --git a/block/elevator.c b/block/elevator.c index 91e18f8af9be..f016855a46b0 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -70,39 +70,9 @@ static int elv_iosched_allow_merge(struct request *rq, struct bio *bio) /* * can we safely merge with this request? */ -int elv_rq_merge_ok(struct request *rq, struct bio *bio) +bool elv_rq_merge_ok(struct request *rq, struct bio *bio) { - if (!rq_mergeable(rq)) - return 0; - - /* - * Don't merge file system requests and discard requests - */ - if ((bio->bi_rw & REQ_DISCARD) != (rq->bio->bi_rw & REQ_DISCARD)) - return 0; - - /* - * Don't merge discard requests and secure discard requests - */ - if ((bio->bi_rw & REQ_SECURE) != (rq->bio->bi_rw & REQ_SECURE)) - return 0; - - /* - * different data direction or already started, don't merge - */ - if (bio_data_dir(bio) != rq_data_dir(rq)) - return 0; - - /* - * must be same device and not a special request - */ - if (rq->rq_disk != bio->bi_bdev->bd_disk || rq->special) - return 0; - - /* - * only merge integrity protected bio into ditto rq - */ - if (bio_integrity(bio) != blk_integrity_rq(rq)) + if (!blk_rq_merge_ok(rq, bio)) return 0; if (!elv_iosched_allow_merge(rq, bio)) @@ -112,23 +82,6 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio) } EXPORT_SYMBOL(elv_rq_merge_ok); -int elv_try_merge(struct request *__rq, struct bio *bio) -{ - int ret = ELEVATOR_NO_MERGE; - - /* - * we can merge and sequence is ok, check if it's possible - */ - if (elv_rq_merge_ok(__rq, bio)) { - if (blk_rq_pos(__rq) + blk_rq_sectors(__rq) == bio->bi_sector) - ret = ELEVATOR_BACK_MERGE; - else if (blk_rq_pos(__rq) - bio_sectors(bio) == bio->bi_sector) - ret = ELEVATOR_FRONT_MERGE; - } - - return ret; -} - static struct elevator_type *elevator_find(const char *name) { struct elevator_type *e; @@ -478,8 +431,8 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) /* * First try one-hit cache. 
*/ - if (q->last_merge) { - ret = elv_try_merge(q->last_merge, bio); + if (q->last_merge && elv_rq_merge_ok(q->last_merge, bio)) { + ret = blk_try_merge(q->last_merge, bio); if (ret != ELEVATOR_NO_MERGE) { *req = q->last_merge; return ret; diff --git a/include/linux/elevator.h b/include/linux/elevator.h index c24f3d7fbf1e..d6dfb65c8885 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -122,7 +122,6 @@ extern void elv_dispatch_add_tail(struct request_queue *, struct request *); extern void elv_add_request(struct request_queue *, struct request *, int); extern void __elv_add_request(struct request_queue *, struct request *, int); extern int elv_merge(struct request_queue *, struct request **, struct bio *); -extern int elv_try_merge(struct request *, struct bio *); extern void elv_merge_requests(struct request_queue *, struct request *, struct request *); extern void elv_merged_request(struct request_queue *, struct request *, int); @@ -155,7 +154,7 @@ extern ssize_t elv_iosched_store(struct request_queue *, const char *, size_t); extern int elevator_init(struct request_queue *, char *); extern void elevator_exit(struct elevator_queue *); extern int elevator_change(struct request_queue *, const char *); -extern int elv_rq_merge_ok(struct request *, struct bio *); +extern bool elv_rq_merge_ok(struct request *, struct bio *); /* * Helper functions. From 07c2bd37350c9b1af71b35d05f16e300a6602948 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 8 Feb 2012 09:19:42 +0100 Subject: [PATCH 08/16] block: don't call elevator callbacks for plug merges Plug merge calls two elevator callbacks outside queue lock - elevator_allow_merge_fn() and elevator_bio_merged_fn(). Although attempt_plug_merge() suggests that the elevator is guaranteed to be there through the existing request on the plug list, nothing prevents plug merge from calling into a dying or initializing elevator. For regular merges, bypass ensures that the elvpriv count reaches zero, which in turn prevents merges, as all !ELVPRIV requests get REQ_SOFTBARRIER from forced back insertion. Plug merge doesn't check ELVPRIV, and, as the requests haven't gone through elevator insertion yet, they don't have SOFTBARRIER set, allowing merges on a bypassed queue. This, for example, leads to the following crash during elevator switch.
BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 IP: [] cfq_allow_merge+0x49/0xa0 PGD 112cbc067 PUD 115d5c067 PMD 0 Oops: 0000 [#1] PREEMPT SMP CPU 1 Modules linked in: deadline_iosched Pid: 819, comm: dd Not tainted 3.3.0-rc2-work+ #76 Bochs Bochs RIP: 0010:[] [] cfq_allow_merge+0x49/0xa0 RSP: 0018:ffff8801143a38f8 EFLAGS: 00010297 RAX: 0000000000000000 RBX: ffff88011817ce28 RCX: ffff880116eb6cc0 RDX: 0000000000000000 RSI: ffff880118056e20 RDI: ffff8801199512f8 RBP: ffff8801143a3908 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000000 R12: ffff880118195708 R13: ffff880118052aa0 R14: ffff8801143a3d50 R15: ffff880118195708 FS: 00007f19f82cb700(0000) GS:ffff88011fc80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000000008 CR3: 0000000112c6a000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process dd (pid: 819, threadinfo ffff8801143a2000, task ffff880116eb6cc0) Stack: ffff88011817ce28 ffff880118195708 ffff8801143a3928 ffffffff81391bba ffff88011817ce28 ffff880118195708 ffff8801143a3948 ffffffff81391bf1 ffff88011817ce28 0000000000000000 ffff8801143a39a8 ffffffff81398e3e Call Trace: [] elv_rq_merge_ok+0x4a/0x60 [] elv_try_merge+0x21/0x40 [] blk_queue_bio+0x8e/0x390 [] generic_make_request+0xca/0x100 [] submit_bio+0x74/0x100 [] __blockdev_direct_IO+0x1ce2/0x3450 [] blkdev_direct_IO+0x57/0x60 [] generic_file_aio_read+0x6d5/0x760 [] do_sync_read+0xe2/0x120 [] vfs_read+0xc5/0x180 [] sys_read+0x51/0x90 [] system_call_fastpath+0x16/0x1b There are multiple ways to fix this including making plug merge check ELVPRIV; however, * Calling into elevator outside queue lock is confusing and error-prone. * Requests on plug list aren't known to the elevator. They aren't on the elevator yet, so there's no elevator specific state to update. * Given the nature of plug merges - collecting bio's for the same purpose from the same issuer - elevator specific restrictions aren't applicable. So, simply don't call into elevator methods from plug merge by moving elv_bio_merged() from bio_attempt_*_merge() to blk_queue_bio(), and using blk_try_merge() in attempt_plug_merge(). This is based on Jens' patch to skip elevator_allow_merge_fn() from plug merge. Note that this makes per-cgroup merged stats skip plug merging. Signed-off-by: Tejun Heo LKML-Reference: <4F16F3CA.90904@kernel.dk> Original-patch-by: Jens Axboe Signed-off-by: Jens Axboe --- block/blk-core.c | 19 +++++++++---------- block/cfq-iosched.c | 15 ++++----------- include/linux/elevator.h | 6 ------ 3 files changed, 13 insertions(+), 27 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index fa697bf691eb..3a78b00edd71 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1212,7 +1212,6 @@ static bool bio_attempt_back_merge(struct request_queue *q, struct request *req, req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); drive_stat_acct(req, 0); - elv_bio_merged(q, req, bio); return true; } @@ -1243,7 +1242,6 @@ static bool bio_attempt_front_merge(struct request_queue *q, req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); drive_stat_acct(req, 0); - elv_bio_merged(q, req, bio); return true; } @@ -1257,13 +1255,12 @@ static bool bio_attempt_front_merge(struct request_queue *q, * on %current's plugged list. Returns %true if merge was successful, * otherwise %false. 
* - * This function is called without @q->queue_lock; however, elevator is - * accessed iff there already are requests on the plugged list which in - * turn guarantees validity of the elevator. - * - * Note that, on successful merge, elevator operation - * elevator_bio_merged_fn() will be called without queue lock. Elevator - * must be ready for this. + * Plugging coalesces IOs from the same issuer for the same purpose without + * going through @q->queue_lock. As such it's more of an issuing mechanism + * than scheduling, and the request, while may have elvpriv data, is not + * added on the elevator at this point. In addition, we don't have + * reliable access to the elevator outside queue lock. Only check basic + * merging parameters without querying the elevator. */ static bool attempt_plug_merge(struct request_queue *q, struct bio *bio, unsigned int *request_count) @@ -1282,7 +1279,7 @@ static bool attempt_plug_merge(struct request_queue *q, struct bio *bio, (*request_count)++; - if (rq->q != q || !elv_rq_merge_ok(rq, bio)) + if (rq->q != q || !blk_rq_merge_ok(rq, bio)) continue; el_ret = blk_try_merge(rq, bio); @@ -1347,12 +1344,14 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio) el_ret = elv_merge(q, &req, bio); if (el_ret == ELEVATOR_BACK_MERGE) { if (bio_attempt_back_merge(q, req, bio)) { + elv_bio_merged(q, req, bio); if (!attempt_back_merge(q, req)) elv_merged_request(q, req, el_ret); goto out_unlock; } } else if (el_ret == ELEVATOR_FRONT_MERGE) { if (bio_attempt_front_merge(q, req, bio)) { + elv_bio_merged(q, req, bio); if (!attempt_front_merge(q, req)) elv_merged_request(q, req, el_ret); goto out_unlock; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 5684df6848bc..d0ba50533668 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1699,18 +1699,11 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq, /* * Lookup the cfqq that this bio will be queued with and allow - * merge only if rq is queued there. This function can be called - * from plug merge without queue_lock. In such cases, ioc of @rq - * and %current are guaranteed to be equal. Avoid lookup which - * requires queue_lock by using @rq's cic. + * merge only if rq is queued there. */ - if (current->io_context == RQ_CIC(rq)->icq.ioc) { - cic = RQ_CIC(rq); - } else { - cic = cfq_cic_lookup(cfqd, current->io_context); - if (!cic) - return false; - } + cic = cfq_cic_lookup(cfqd, current->io_context); + if (!cic) + return false; cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio)); return cfqq == RQ_CFQQ(rq); diff --git a/include/linux/elevator.h b/include/linux/elevator.h index d6dfb65c8885..7d4e0356f329 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -42,12 +42,6 @@ struct elevator_ops elevator_merged_fn *elevator_merged_fn; elevator_merge_req_fn *elevator_merge_req_fn; elevator_allow_merge_fn *elevator_allow_merge_fn; - - /* - * Used for both plugged list and elevator merging and in the - * former case called without queue_lock. Read comment on top of - * attempt_plug_merge() for details. - */ elevator_bio_merged_fn *elevator_bio_merged_fn; elevator_dispatch_fn *elevator_dispatch_fn; From 37b40adf2d1b4a5e51323be73ccf8ddcf3f15dd3 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Wed, 8 Feb 2012 20:02:03 +0100 Subject: [PATCH 09/16] bsg: fix sysfs link remove warning We create "bsg" link if q->kobj.sd is not NULL, so remove it only when the same condition is true. 
Fixes: WARNING: at fs/sysfs/inode.c:323 sysfs_hash_and_remove+0x2b/0x77() sysfs: can not remove 'bsg', no directory Call Trace: [] warn_slowpath_common+0x6a/0x7f [] ? sysfs_hash_and_remove+0x2b/0x77 [] warn_slowpath_fmt+0x2b/0x2f [] sysfs_hash_and_remove+0x2b/0x77 [] sysfs_remove_link+0x20/0x23 [] bsg_unregister_queue+0x40/0x6d [] __scsi_remove_device+0x31/0x9d [] scsi_forget_host+0x41/0x52 [] scsi_remove_host+0x71/0xe0 [] quiesce_and_remove_host+0x51/0x83 [usb_storage] [] usb_stor_disconnect+0x18/0x22 [usb_storage] [] usb_unbind_interface+0x4e/0x109 [] __device_release_driver+0x6b/0xa6 [] device_release_driver+0x17/0x22 [] bus_remove_device+0xd6/0xe6 [] device_del+0xf2/0x137 [] usb_disable_device+0x94/0x1a0 Signed-off-by: Stanislaw Gruszka Signed-off-by: Jens Axboe --- block/bsg.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/block/bsg.c b/block/bsg.c index 4cf703fd98bb..ff64ae3bacee 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -983,7 +983,8 @@ void bsg_unregister_queue(struct request_queue *q) mutex_lock(&bsg_mutex); idr_remove(&bsg_minor_idr, bcd->minor); - sysfs_remove_link(&q->kobj, "bsg"); + if (q->kobj.sd) + sysfs_remove_link(&q->kobj, "bsg"); device_unregister(bcd->class_dev); bcd->class_dev = NULL; kref_put(&bcd->ref, bsg_kref_release_function); From cdccaa9467b982d57b139818d15e1e994feca372 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 8 Feb 2012 20:03:14 +0100 Subject: [PATCH 10/16] cdrom: move shared static to cdrom_device_info The keeplocked variable in the cdrom driver is shared across multiple drives, but set in per-device ioctls. Move it to the per-device struct, so that the setting on one drive does not affect the driver's behavior when closing another. [ Impact: limit udev's confusion to one drive when a CD burning program unlocks the CD door at the end of burning.
] Signed-off-by: Paolo Bonzini Signed-off-by: Jens Axboe --- drivers/cdrom/cdrom.c | 12 +++++------- include/linux/cdrom.h | 3 ++- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index 8fefe59f52a7..d620b4495745 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -286,8 +286,6 @@ /* used to tell the module to turn on full debugging messages */ static bool debug; -/* used to keep tray locked at all times */ -static int keeplocked; /* default compatibility mode */ static bool autoclose=1; static bool autoeject; @@ -1204,7 +1202,7 @@ void cdrom_release(struct cdrom_device_info *cdi, fmode_t mode) cdinfo(CD_CLOSE, "Use count for \"/dev/%s\" now zero\n", cdi->name); cdrom_dvd_rw_close_write(cdi); - if ((cdo->capability & CDC_LOCK) && !keeplocked) { + if ((cdo->capability & CDC_LOCK) && !cdi->keeplocked) { cdinfo(CD_CLOSE, "Unlocking door!\n"); cdo->lock_door(cdi, 0); } @@ -1371,7 +1369,7 @@ static int cdrom_select_disc(struct cdrom_device_info *cdi, int slot) curslot = info->hdr.curslot; kfree(info); - if (cdi->use_count > 1 || keeplocked) { + if (cdi->use_count > 1 || cdi->keeplocked) { if (slot == CDSL_CURRENT) { return curslot; } else { @@ -2289,7 +2287,7 @@ static int cdrom_ioctl_eject(struct cdrom_device_info *cdi) if (!CDROM_CAN(CDC_OPEN_TRAY)) return -ENOSYS; - if (cdi->use_count != 1 || keeplocked) + if (cdi->use_count != 1 || cdi->keeplocked) return -EBUSY; if (CDROM_CAN(CDC_LOCK)) { int ret = cdi->ops->lock_door(cdi, 0); @@ -2316,7 +2314,7 @@ static int cdrom_ioctl_eject_sw(struct cdrom_device_info *cdi, if (!CDROM_CAN(CDC_OPEN_TRAY)) return -ENOSYS; - if (keeplocked) + if (cdi->keeplocked) return -EBUSY; cdi->options &= ~(CDO_AUTO_CLOSE | CDO_AUTO_EJECT); @@ -2447,7 +2445,7 @@ static int cdrom_ioctl_lock_door(struct cdrom_device_info *cdi, if (!CDROM_CAN(CDC_LOCK)) return -EDRIVE_CANT_DO_THIS; - keeplocked = arg ? 1 : 0; + cdi->keeplocked = arg ? 1 : 0; /* * Don't unlock the door on multiple opens by default, but allow diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h index 35eae4b67503..7c48029dffe6 100644 --- a/include/linux/cdrom.h +++ b/include/linux/cdrom.h @@ -952,7 +952,8 @@ struct cdrom_device_info { char name[20]; /* name of the device type */ /* per-device flags */ __u8 sanyo_slot : 2; /* Sanyo 3 CD changer support */ - __u8 reserved : 6; /* not used yet */ + __u8 keeplocked : 1; /* CDROM_LOCKDOOR status */ + __u8 reserved : 5; /* not used yet */ int cdda_method; /* see flags */ __u8 last_sense; __u8 media_written; /* dirty flag, DVD+RW bookkeeping */ From 3f9a5aabd0a9fe0e0cd308506f48963d79169aa7 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 8 Feb 2012 20:03:38 +0100 Subject: [PATCH 11/16] floppy: Cleanup disk->queue before calling put_disk() if add_disk() was never called add_disk() takes a gendisk reference on the request queue. If the driver failed during initialization and never called add_disk(), then that extra reference is not taken. That reference is put in put_disk(). The floppy driver allocates the disk, allocates the queue, sets disk->queue and then realizes that the floppy controller is not present. It tries to tear down everything and tries to put a reference down in put_disk() which was never taken. In such error cases, clean up disk->queue before calling put_disk() so that we never try to put down a reference which was never taken in the first place.
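The reference imbalance is easier to see in a toy model. Everything below is hypothetical scaffolding (queue_model, disk_model), not driver code; it only demonstrates why the error path must clear disk->queue before put_disk() when add_disk() never ran:

#include <stdio.h>

struct queue_model { int refs; };
struct disk_model  { struct queue_model *queue; };

static void get_queue(struct queue_model *q) { q->refs++; }

static void put_queue(struct queue_model *q)
{
        if (--q->refs == 0)
                printf("queue freed\n");
        else if (q->refs < 0)
                printf("BUG: reference underflow (%d)\n", q->refs);
}

/* add_disk() takes an extra queue reference on behalf of the disk */
static void add_disk_model(struct disk_model *d) { get_queue(d->queue); }

/* put_disk() unconditionally drops that reference if disk->queue is set */
static void put_disk_model(struct disk_model *d)
{
        if (d->queue)
                put_queue(d->queue);
}

int main(void)
{
        struct queue_model q = { .refs = 1 };  /* the allocation's own reference */
        struct disk_model d = { .queue = &q };

        /* probe failed before add_disk_model(&d) was ever called */
        d.queue = NULL;       /* the fix: don't let put_disk drop an untaken ref */
        put_disk_model(&d);   /* now a no-op instead of an underflow */
        put_queue(&q);        /* the blk_cleanup_queue() analogue drops the last ref */
        return 0;
}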
Reported-and-tested-by: Suresh Jayaraman Tested-by: Dirk Gouders Signed-off-by: Vivek Goyal Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- drivers/block/floppy.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 510fb10ec45a..401ba7833212 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -4368,8 +4368,14 @@ out_unreg_blkdev: out_put_disk: while (dr--) { del_timer_sync(&motor_off_timer[dr]); - if (disks[dr]->queue) + if (disks[dr]->queue) { blk_cleanup_queue(disks[dr]->queue); + /* + * put_disk() is not paired with add_disk() and + * will put queue reference one extra time. fix it. + */ + disks[dr]->queue = NULL; + } put_disk(disks[dr]); } return err; From 4609dff6b5d11e1ed5ff935e15f9f6022acb312b Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 8 Feb 2012 20:03:39 +0100 Subject: [PATCH 12/16] floppy: Fix a crash during rmmod The floppy driver does not call add_disk() on all the drives, hence we don't take a gendisk reference on the request queue for these drives. Don't call put_disk() with disk->queue set, otherwise we try to put the reference we never took. Reported-and-tested-by: Dirk Gouders Signed-off-by: Vivek Goyal Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- drivers/block/floppy.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 401ba7833212..9baf11e86362 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -4585,6 +4585,15 @@ static void __exit floppy_module_exit(void) platform_device_unregister(&floppy_device[drive]); } blk_cleanup_queue(disks[drive]->queue); + + /* + * These disks have not called add_disk(). Don't put down + * queue reference in put_disk(). + */ + if (!(allowed_drive_mask & (1 << drive)) || + fdc_state[FDC(drive)].version == FDC_NONE) + disks[drive]->queue = NULL; + put_disk(disks[drive]); } From 5abebfdd02450fa1349daacf242e70b3736581e3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 8 Feb 2012 22:07:18 +0100 Subject: [PATCH 13/16] bio: don't overflow in bio_get_nr_vecs() There were two places bio_get_nr_vecs() could overflow: First, it did a left shift to convert from sectors to bytes immediately before dividing by PAGE_SIZE. If PAGE_SIZE ever was less than 512, a great many things would break, so dividing by PAGE_SIZE >> 9 is safe and will generate smaller code too. The nastier overflow was in the DIV_ROUND_UP() (that's what the code was effectively doing, anyways). If n + d overflowed, the whole thing would return 0, which breaks things rather effectively. bio_get_nr_vecs() doesn't claim to give an exact value anyways, so the DIV_ROUND_UP() is silly; we could do a straight divide, except that if a device's queue_max_sectors was less than PAGE_SIZE we'd return 0. So we just add 1; this should always be safe - things will break badly if bio_get_nr_vecs() returns > BIO_MAX_PAGES (bio_alloc() will suddenly start failing), but it's queue_max_segments that must guard against this; if queue_max_sectors is all that's preventing this from happening, things are going to explode on architectures with a different PAGE_SIZE.
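Both overflows are easy to reproduce in userspace. Assuming a 32-bit unsigned int and a 4K page (so PAGE_SIZE >> 9 is 8 sectors per page), an expression mirroring the old code wraps to 0 for a pathologically large queue_max_sectors, while the new form merely returns a large number:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE  4096u
#define PAGE_SHIFT 12

int main(void)
{
        uint32_t max_sectors = UINT32_MAX;  /* pathological queue limit */

        /* old code: shift to bytes, then round-up divide; both can wrap */
        uint32_t old = ((max_sectors << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT;

        /* new code: divide sectors by sectors-per-page, then add 1 */
        uint32_t new = max_sectors / (PAGE_SIZE >> 9) + 1;

        printf("old: %u\n", old);           /* 0: wrapped around */
        printf("new: %u\n", new);           /* 536870912: merely large */
        return 0;
}

In the real function the result is then clamped by queue_max_segments(), which is the limit that actually matters.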
Signed-off-by: Kent Overstreet Cc: Tejun Heo Acked-by: Valdis Kletnieks Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- fs/bio.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/fs/bio.c b/fs/bio.c index b1fe82cf88cf..b980ecde026a 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -505,13 +505,9 @@ EXPORT_SYMBOL(bio_clone); int bio_get_nr_vecs(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); - int nr_pages; - - nr_pages = ((queue_max_sectors(q) << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (nr_pages > queue_max_segments(q)) - nr_pages = queue_max_segments(q); - - return nr_pages; + return min_t(unsigned, + queue_max_segments(q), + queue_max_sectors(q) / (PAGE_SIZE >> 9) + 1); } EXPORT_SYMBOL(bio_get_nr_vecs); From 306df0716aa285d378cc948fafed53c6ed895fe6 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Wed, 8 Feb 2012 22:07:19 +0100 Subject: [PATCH 14/16] loop: zero fill bio instead of return -EIO for partial read commit 8268f5a741 ("deny partial write for loop dev fd") tried to fix the loop device partial read information leak problem. But it changed the semantics of read behavior. When we read beyond the end of the device, we should get 0 bytes, which is normal behavior; we should not just return -EIO. Instead of returning -EIO, zero out the bio to avoid an information leak in case of a partial read. Signed-off-by: Dave Young Reviewed-by: Jeff Moyer Tested-by: Jeff Moyer Cc: Dmitry Monakhov Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- drivers/block/loop.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index f00257782fcc..cd504353b278 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -356,14 +356,14 @@ lo_direct_splice_actor(struct pipe_inode_info *pipe, struct splice_desc *sd) return __splice_from_pipe(pipe, sd, lo_splice_actor); } -static int +static ssize_t do_lo_receive(struct loop_device *lo, struct bio_vec *bvec, int bsize, loff_t pos) { struct lo_read_data cookie; struct splice_desc sd; struct file *file; - long retval; + ssize_t retval; cookie.lo = lo; cookie.page = bvec->bv_page; @@ -379,26 +379,28 @@ do_lo_receive(struct loop_device *lo, file = lo->lo_backing_file; retval = splice_direct_to_actor(file, &sd, lo_direct_splice_actor); - if (retval < 0) - return retval; - if (retval != bvec->bv_len) - return -EIO; - return 0; + return retval; } static int lo_receive(struct loop_device *lo, struct bio *bio, int bsize, loff_t pos) { struct bio_vec *bvec; - int i, ret = 0; + ssize_t s; + int i; bio_for_each_segment(bvec, bio, i) { - ret = do_lo_receive(lo, bvec, bsize, pos); - if (ret < 0) + s = do_lo_receive(lo, bvec, bsize, pos); + if (s < 0) + return s; + + if (s != bvec->bv_len) { + zero_fill_bio(bio); break; + } pos += bvec->bv_len; } - return ret; + return 0; } static int do_bio_filebacked(struct loop_device *lo, struct bio *bio) From f6302f1bcd75a042df69866d98b8d775a668f8f1 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 10 Feb 2012 09:03:58 +0100 Subject: [PATCH 15/16] relay: prevent integer overflow in relay_open() "subbuf_size" and "n_subbufs" come from the user and they need to be capped to prevent an integer overflow.
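The guard added below is the usual divide-the-limit idiom for rejecting a user-controlled multiplication before it wraps. A small userspace sketch, assuming a 32-bit unsigned int:

#include <limits.h>
#include <stdio.h>

/* true iff a * b fits in unsigned int (equivalent to the relay_open() check) */
static int mul_fits_uint(unsigned int a, unsigned int b)
{
        return b == 0 || a <= UINT_MAX / b;
}

int main(void)
{
        unsigned int subbuf_size = 0x10000, n_subbufs = 0x10000;

        /* 0x10000 * 0x10000 == 2^32: wraps to 0 in 32-bit unsigned arithmetic */
        printf("wrapped product: %u\n", subbuf_size * n_subbufs);

        if (!mul_fits_uint(subbuf_size, n_subbufs))
                printf("rejected: %u * %u would overflow\n",
                       subbuf_size, n_subbufs);
        return 0;
}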
Signed-off-by: Dan Carpenter Cc: stable@kernel.org Signed-off-by: Jens Axboe --- kernel/relay.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/kernel/relay.c b/kernel/relay.c index 4335e1d7ee2d..ab56a1764d4d 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -164,10 +164,14 @@ depopulate: */ static struct rchan_buf *relay_create_buf(struct rchan *chan) { - struct rchan_buf *buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); - if (!buf) + struct rchan_buf *buf; + + if (chan->n_subbufs > UINT_MAX / sizeof(size_t *)) return NULL; + buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); + if (!buf) + return NULL; buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL); if (!buf->padding) goto free_buf; @@ -574,6 +578,8 @@ struct rchan *relay_open(const char *base_filename, if (!(subbuf_size && n_subbufs)) return NULL; + if (subbuf_size > UINT_MAX / n_subbufs) + return NULL; chan = kzalloc(sizeof(struct rchan), GFP_KERNEL); if (!chan) From d8c66c5d59247e25a69428aced0b79d33b9c66d6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 11 Feb 2012 12:37:25 +0100 Subject: [PATCH 16/16] block: fix lockdep warning on io_context release put_io_context() 11a3122f6c "block: strip out locking optimization in put_io_context()" removed the ioc_lock depth lockdep annotation along with the locking optimization; however, while recursing from put_io_context() is no longer possible, ioc_release_fn() may still end up putting the last reference of another ioc through the elevator, which will grab ioc->lock, triggering a spurious (as the ioc is always a different one) A-A deadlock warning. As this can only happen one time from ioc_release_fn(), using a non-zero subclass from ioc_release_fn() is enough. Use subclass 1. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-ioc.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 9884fd7427fe..8b782a63c297 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -80,8 +80,15 @@ static void ioc_release_fn(struct work_struct *work) struct io_context *ioc = container_of(work, struct io_context, release_work); struct request_queue *last_q = NULL; + unsigned long flags; - spin_lock_irq(&ioc->lock); + /* + * Exiting icq may call into put_io_context() through elevator + * which will trigger lockdep warning. The ioc's are guaranteed to + * be different, use a different locking subclass here. Use + * irqsave variant as there's no spin_lock_irq_nested(). + */ + spin_lock_irqsave_nested(&ioc->lock, flags, 1); while (!hlist_empty(&ioc->icq_list)) { struct io_cq *icq = hlist_entry(ioc->icq_list.first, @@ -103,15 +110,15 @@ static void ioc_release_fn(struct work_struct *work) */ if (last_q) { spin_unlock(last_q->queue_lock); - spin_unlock_irq(&ioc->lock); + spin_unlock_irqrestore(&ioc->lock, flags); blk_put_queue(last_q); } else { - spin_unlock_irq(&ioc->lock); + spin_unlock_irqrestore(&ioc->lock, flags); } last_q = this_q; - spin_lock_irq(this_q->queue_lock); - spin_lock(&ioc->lock); + spin_lock_irqsave(this_q->queue_lock, flags); + spin_lock_nested(&ioc->lock, 1); continue; } ioc_exit_icq(icq); @@ -119,10 +126,10 @@ static void ioc_release_fn(struct work_struct *work) if (last_q) { spin_unlock(last_q->queue_lock); - spin_unlock_irq(&ioc->lock); + spin_unlock_irqrestore(&ioc->lock, flags); blk_put_queue(last_q); } else { - spin_unlock_irq(&ioc->lock); + spin_unlock_irqrestore(&ioc->lock, flags); } kmem_cache_free(iocontext_cachep, ioc);
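As background on the annotation this last patch relies on: spin_lock_nested() tells lockdep that a second lock of the same lock class is a provably different instance, suppressing the false A-A report. A kernel-style sketch of the pattern, in which struct thing and lock_pair() are hypothetical and only the locking calls are real kernel API:

#include <linux/spinlock.h>

struct thing {
        spinlock_t lock;
};

/*
 * Taking two locks of the same lock class nests safely only if lockdep
 * is told the instances are distinct; subclass 1 marks the inner one.
 */
static void lock_pair(struct thing *outer, struct thing *inner)
{
        spin_lock(&outer->lock);
        spin_lock_nested(&inner->lock, 1);  /* SINGLE_DEPTH_NESTING (== 1) also works */

        /* ... work on both objects ... */

        spin_unlock(&inner->lock);
        spin_unlock(&outer->lock);
}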