From 06447ae5e33bfbc5a777cc06d9854a31f3912833 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Wed, 14 Jul 2021 21:56:55 +0200 Subject: [PATCH 001/129] ioprio: move user space relevant ioprio bits to UAPI includes systemd added a modified copy of include/linux/ioprio.h into its code to get the relevant content definitions for the exposed ioprio_[get|set] system calls. Move the user space relevant ioprio bits to the UAPI includes to be able to use the ioprio_[get|set] syscalls as intended. Cc: Kay Sievers Cc: Greg Kroah-Hartman Cc: Jens Axboe Cc: linux-block@vger.kernel.org Signed-off-by: Oliver Hartkopp Link: https://lore.kernel.org/r/20210714195655.181943-1-socketcan@hartkopp.net Signed-off-by: Jens Axboe --- include/linux/ioprio.h | 41 +-------------------------------- include/uapi/linux/ioprio.h | 46 +++++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 40 deletions(-) create mode 100644 include/uapi/linux/ioprio.h diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h index e9bfe6972aed..ef9ad4fb245f 100644 --- a/include/linux/ioprio.h +++ b/include/linux/ioprio.h @@ -6,46 +6,7 @@ #include #include -/* - * Gives us 8 prio classes with 13-bits of data for each class - */ -#define IOPRIO_CLASS_SHIFT (13) -#define IOPRIO_PRIO_MASK ((1UL << IOPRIO_CLASS_SHIFT) - 1) - -#define IOPRIO_PRIO_CLASS(mask) ((mask) >> IOPRIO_CLASS_SHIFT) -#define IOPRIO_PRIO_DATA(mask) ((mask) & IOPRIO_PRIO_MASK) -#define IOPRIO_PRIO_VALUE(class, data) (((class) << IOPRIO_CLASS_SHIFT) | data) - -#define ioprio_valid(mask) (IOPRIO_PRIO_CLASS((mask)) != IOPRIO_CLASS_NONE) - -/* - * These are the io priority groups as implemented by CFQ. RT is the realtime - * class, it always gets premium service. BE is the best-effort scheduling - * class, the default for any process. IDLE is the idle scheduling class, it - * is only served when no one else is using the disk. - */ -enum { - IOPRIO_CLASS_NONE, - IOPRIO_CLASS_RT, - IOPRIO_CLASS_BE, - IOPRIO_CLASS_IDLE, -}; - -/* - * 8 best effort priority levels are supported - */ -#define IOPRIO_BE_NR (8) - -enum { - IOPRIO_WHO_PROCESS = 1, - IOPRIO_WHO_PGRP, - IOPRIO_WHO_USER, -}; - -/* - * Fallback BE priority - */ -#define IOPRIO_NORM (4) +#include /* * if process has set io priority explicitly, use that. if not, convert diff --git a/include/uapi/linux/ioprio.h b/include/uapi/linux/ioprio.h new file mode 100644 index 000000000000..77b17e08b0da --- /dev/null +++ b/include/uapi/linux/ioprio.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_IOPRIO_H +#define _UAPI_LINUX_IOPRIO_H + +/* + * Gives us 8 prio classes with 13-bits of data for each class + */ +#define IOPRIO_CLASS_SHIFT (13) +#define IOPRIO_PRIO_MASK ((1UL << IOPRIO_CLASS_SHIFT) - 1) + +#define IOPRIO_PRIO_CLASS(mask) ((mask) >> IOPRIO_CLASS_SHIFT) +#define IOPRIO_PRIO_DATA(mask) ((mask) & IOPRIO_PRIO_MASK) +#define IOPRIO_PRIO_VALUE(class, data) (((class) << IOPRIO_CLASS_SHIFT) | data) + +/* + * These are the io priority groups as implemented by CFQ. RT is the realtime + * class, it always gets premium service. BE is the best-effort scheduling + * class, the default for any process. IDLE is the idle scheduling class, it + * is only served when no one else is using the disk. + */ +enum { + IOPRIO_CLASS_NONE, + IOPRIO_CLASS_RT, + IOPRIO_CLASS_BE, + IOPRIO_CLASS_IDLE, +}; + +#define ioprio_valid(mask) (IOPRIO_PRIO_CLASS((mask)) != IOPRIO_CLASS_NONE) + +/* + * 8 best effort priority levels are supported + */ +#define IOPRIO_BE_NR (8) + +enum { + IOPRIO_WHO_PROCESS = 1, + IOPRIO_WHO_PGRP, + IOPRIO_WHO_USER, +}; + +/* + * Fallback BE priority + */ +#define IOPRIO_NORM (4) + +#endif /* _UAPI_LINUX_IOPRIO_H */ From 4c7251e1b576d884046e62d23505e75486f88c1f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 07:56:32 +0200 Subject: [PATCH 002/129] MIPS: don't include in There is no need to include genhd.h from a random arch header, and not doing so prevents the possibility for nasty include loops. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Martin K. Petersen Reviewed-by: Ira Weiny Link: https://lore.kernel.org/r/20210727055646.118787-2-hch@lst.de Signed-off-by: Jens Axboe --- arch/mips/include/asm/mach-rc32434/rb.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/mips/include/asm/mach-rc32434/rb.h b/arch/mips/include/asm/mach-rc32434/rb.h index d502673a4f6c..34d179ca020b 100644 --- a/arch/mips/include/asm/mach-rc32434/rb.h +++ b/arch/mips/include/asm/mach-rc32434/rb.h @@ -7,8 +7,6 @@ #ifndef __ASM_RC32434_RB_H #define __ASM_RC32434_RB_H -#include - #define REGBASE 0x18000000 #define IDT434_REG_BASE ((volatile void *) KSEG1ADDR(REGBASE)) #define UART0BASE 0x58000 From e45cef51dba9765a6e1df1be724f3d26323512c8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 07:56:33 +0200 Subject: [PATCH 003/129] bvec: fix the include guards for bvec.h Fix the include guards to match the file naming. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Chaitanya Kulkarni Reviewed-by: Martin K. Petersen Reviewed-by: Ira Weiny Link: https://lore.kernel.org/r/20210727055646.118787-3-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bvec.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/bvec.h b/include/linux/bvec.h index ff832e698efb..883faf5f1523 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -4,8 +4,8 @@ * * Copyright (C) 2001 Ming Lei */ -#ifndef __LINUX_BVEC_ITER_H -#define __LINUX_BVEC_ITER_H +#ifndef __LINUX_BVEC_H +#define __LINUX_BVEC_H #include #include @@ -183,4 +183,4 @@ static inline void bvec_advance(const struct bio_vec *bvec, } } -#endif /* __LINUX_BVEC_ITER_H */ +#endif /* __LINUX_BVEC_H */ From e6e7471706dc42cbe0e01278540c0730138d43e5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 07:56:34 +0200 Subject: [PATCH 004/129] bvec: add a bvec_kmap_local helper Add a helper to call kmap_local_page on a bvec. There is no need for an unmap helper given that kunmap_local accept any address in the mapped page. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Martin K. Petersen Reviewed-by: Ira Weiny Link: https://lore.kernel.org/r/20210727055646.118787-4-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bvec.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/include/linux/bvec.h b/include/linux/bvec.h index 883faf5f1523..f8710af18eef 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -7,6 +7,7 @@ #ifndef __LINUX_BVEC_H #define __LINUX_BVEC_H +#include #include #include #include @@ -183,4 +184,16 @@ static inline void bvec_advance(const struct bio_vec *bvec, } } +/** + * bvec_kmap_local - map a bvec into the kernel virtual address space + * @bvec: bvec to map + * + * Must be called on single-page bvecs only. Call kunmap_local on the returned + * address to unmap. + */ +static inline void *bvec_kmap_local(struct bio_vec *bvec) +{ + return kmap_local_page(bvec->bv_page) + bvec->bv_offset; +} + #endif /* __LINUX_BVEC_H */ From f93a181af40b159aabea2ccf1a0496e9280be2d5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 07:56:35 +0200 Subject: [PATCH 005/129] bvec: add memcpy_{from,to}_bvec and memzero_bvec helper Add helpers to perform common memory operation on a bvec. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Martin K. Petersen Reviewed-by: Ira Weiny Link: https://lore.kernel.org/r/20210727055646.118787-5-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bvec.h | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/include/linux/bvec.h b/include/linux/bvec.h index f8710af18eef..f9fa43b940ff 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -196,4 +196,37 @@ static inline void *bvec_kmap_local(struct bio_vec *bvec) return kmap_local_page(bvec->bv_page) + bvec->bv_offset; } +/** + * memcpy_from_bvec - copy data from a bvec + * @bvec: bvec to copy from + * + * Must be called on single-page bvecs only. + */ +static inline void memcpy_from_bvec(char *to, struct bio_vec *bvec) +{ + memcpy_from_page(to, bvec->bv_page, bvec->bv_offset, bvec->bv_len); +} + +/** + * memcpy_to_bvec - copy data to a bvec + * @bvec: bvec to copy to + * + * Must be called on single-page bvecs only. + */ +static inline void memcpy_to_bvec(struct bio_vec *bvec, const char *from) +{ + memcpy_to_page(bvec->bv_page, bvec->bv_offset, from, bvec->bv_len); +} + +/** + * memzero_bvec - zero all data in a bvec + * @bvec: bvec to zero + * + * Must be called on single-page bvecs only. + */ +static inline void memzero_bvec(struct bio_vec *bvec) +{ + memzero_page(bvec->bv_page, bvec->bv_offset, bvec->bv_len); +} + #endif /* __LINUX_BVEC_H */ From ab6c340eeac426fb649ddb4f23b7c752f0092204 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 07:56:36 +0200 Subject: [PATCH 006/129] block: use memzero_page in zero_fill_bio Use memzero_bvec to zero each segment in the bio instead of manually mapping and zeroing the data. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Martin K. Petersen Reviewed-by: Ira Weiny Link: https://lore.kernel.org/r/20210727055646.118787-6-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/block/bio.c b/block/bio.c index 1fab762e079b..2e436bccb1e2 100644 --- a/block/bio.c +++ b/block/bio.c @@ -495,16 +495,11 @@ EXPORT_SYMBOL(bio_kmalloc); void zero_fill_bio(struct bio *bio) { - unsigned long flags; struct bio_vec bv; struct bvec_iter iter; - bio_for_each_segment(bv, bio, iter) { - char *data = bvec_kmap_irq(&bv, &flags); - memset(data, 0, bv.bv_len); - flush_dcache_page(bv.bv_page); - bvec_kunmap_irq(data, &flags); - } + bio_for_each_segment(bv, bio, iter) + memzero_bvec(&bv); } EXPORT_SYMBOL(zero_fill_bio); From 732022b86a37e816718786ce0b2cebc2b1739fa3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 07:56:37 +0200 Subject: [PATCH 007/129] rbd: use memzero_bvec Use memzero_bvec instead of reimplementing it. Signed-off-by: Christoph Hellwig Acked-by: Ilya Dryomov Reviewed-by: Martin K. Petersen Reviewed-by: Ira Weiny Link: https://lore.kernel.org/r/20210727055646.118787-7-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/rbd.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 90b947c96402..6d596c2c2cd6 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1219,24 +1219,13 @@ static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev) rbd_dev->mapping.size = 0; } -static void zero_bvec(struct bio_vec *bv) -{ - void *buf; - unsigned long flags; - - buf = bvec_kmap_irq(bv, &flags); - memset(buf, 0, bv->bv_len); - flush_dcache_page(bv->bv_page); - bvec_kunmap_irq(buf, &flags); -} - static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes) { struct ceph_bio_iter it = *bio_pos; ceph_bio_iter_advance(&it, off); ceph_bio_iter_advance_step(&it, bytes, ({ - zero_bvec(&bv); + memzero_bvec(&bv); })); } @@ -1246,7 +1235,7 @@ static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes) ceph_bvec_iter_advance(&it, off); ceph_bvec_iter_advance_step(&it, bytes, ({ - zero_bvec(&bv); + memzero_bvec(&bv); })); } From 18a6234ccf0661401f07b6316a25d4adbba1d4bd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 07:56:38 +0200 Subject: [PATCH 008/129] dm-writecache: use bvec_kmap_local instead of bvec_kmap_irq There is no need to disable interrupts in bio_copy_block, and the local only mappings helps to avoid any sort of problems with stray writes into the bio data. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Ira Weiny Link: https://lore.kernel.org/r/20210727055646.118787-8-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/dm-writecache.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index e21e29e81bbf..3d2cf811ec3e 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -1214,14 +1214,13 @@ static void memcpy_flushcache_optimized(void *dest, void *source, size_t size) static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data) { void *buf; - unsigned long flags; unsigned size; int rw = bio_data_dir(bio); unsigned remaining_size = wc->block_size; do { struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter); - buf = bvec_kmap_irq(&bv, &flags); + buf = bvec_kmap_local(&bv); size = bv.bv_len; if (unlikely(size > remaining_size)) size = remaining_size; @@ -1239,7 +1238,7 @@ static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data memcpy_flushcache_optimized(data, buf, size); } - bvec_kunmap_irq(buf, &flags); + kunmap_local(buf); data = (char *)data + size; remaining_size -= size; From 6e0a48552b8cfc3767b98e3e8beed3f4cbafc9f4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 07:56:39 +0200 Subject: [PATCH 009/129] ps3disk: use memcpy_{from,to}_bvec Use the bvec helpers instead of open coding the copy. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Tested-by: Geoff Levand Link: https://lore.kernel.org/r/20210727055646.118787-9-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/ps3disk.c | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index f374ea2c67ce..8d51efbe045d 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -83,26 +83,12 @@ static void ps3disk_scatter_gather(struct ps3_storage_device *dev, unsigned int offset = 0; struct req_iterator iter; struct bio_vec bvec; - unsigned int i = 0; - size_t size; - void *buf; rq_for_each_segment(bvec, req, iter) { - unsigned long flags; - dev_dbg(&dev->sbd.core, "%s:%u: bio %u: %u sectors from %llu\n", - __func__, __LINE__, i, bio_sectors(iter.bio), - iter.bio->bi_iter.bi_sector); - - size = bvec.bv_len; - buf = bvec_kmap_irq(&bvec, &flags); if (gather) - memcpy(dev->bounce_buf+offset, buf, size); + memcpy_from_bvec(dev->bounce_buf + offset, &bvec); else - memcpy(buf, dev->bounce_buf+offset, size); - offset += size; - flush_kernel_dcache_page(bvec.bv_page); - bvec_kunmap_irq(buf, &flags); - i++; + memcpy_to_bvec(&bvec, dev->bounce_buf + offset); } } From bda135d9c03fae64c910a8c8d751eccd8408f400 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 07:56:40 +0200 Subject: [PATCH 010/129] block: remove bvec_kmap_irq and bvec_kunmap_irq These two helpers are entirely unused now. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20210727055646.118787-10-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bio.h | 42 ------------------------------------------ 1 file changed, 42 deletions(-) diff --git a/include/linux/bio.h b/include/linux/bio.h index 2203b686e1f0..7b5f65a81f2b 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -5,7 +5,6 @@ #ifndef __LINUX_BIO_H #define __LINUX_BIO_H -#include #include #include /* struct bio, bio_vec and BIO_* flags are defined in blk_types.h */ @@ -519,47 +518,6 @@ static inline void bio_clone_blkg_association(struct bio *dst, struct bio *src) { } #endif /* CONFIG_BLK_CGROUP */ -#ifdef CONFIG_HIGHMEM -/* - * remember never ever reenable interrupts between a bvec_kmap_irq and - * bvec_kunmap_irq! - */ -static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags) -{ - unsigned long addr; - - /* - * might not be a highmem page, but the preempt/irq count - * balancing is a lot nicer this way - */ - local_irq_save(*flags); - addr = (unsigned long) kmap_atomic(bvec->bv_page); - - BUG_ON(addr & ~PAGE_MASK); - - return (char *) addr + bvec->bv_offset; -} - -static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags) -{ - unsigned long ptr = (unsigned long) buffer & PAGE_MASK; - - kunmap_atomic((void *) ptr); - local_irq_restore(*flags); -} - -#else -static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags) -{ - return page_address(bvec->bv_page) + bvec->bv_offset; -} - -static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags) -{ - *flags = 0; -} -#endif - /* * BIO list management for use by remapping drivers (e.g. DM or MD) and loop. * From f8b679a070c536600c64a78c83b96aa617f8fa71 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 07:56:41 +0200 Subject: [PATCH 011/129] block: rewrite bio_copy_data_iter to use bvec_kmap_local and memcpy_to_bvec Use the proper helpers instead of open coding the copy. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20210727055646.118787-11-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/block/bio.c b/block/bio.c index 2e436bccb1e2..0c89fa2f7a85 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1186,27 +1186,15 @@ EXPORT_SYMBOL(bio_advance); void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, struct bio *src, struct bvec_iter *src_iter) { - struct bio_vec src_bv, dst_bv; - void *src_p, *dst_p; - unsigned bytes; - while (src_iter->bi_size && dst_iter->bi_size) { - src_bv = bio_iter_iovec(src, *src_iter); - dst_bv = bio_iter_iovec(dst, *dst_iter); + struct bio_vec src_bv = bio_iter_iovec(src, *src_iter); + struct bio_vec dst_bv = bio_iter_iovec(dst, *dst_iter); + unsigned int bytes = min(src_bv.bv_len, dst_bv.bv_len); + void *src_buf; - bytes = min(src_bv.bv_len, dst_bv.bv_len); - - src_p = kmap_atomic(src_bv.bv_page); - dst_p = kmap_atomic(dst_bv.bv_page); - - memcpy(dst_p + dst_bv.bv_offset, - src_p + src_bv.bv_offset, - bytes); - - kunmap_atomic(dst_p); - kunmap_atomic(src_p); - - flush_dcache_page(dst_bv.bv_page); + src_buf = bvec_kmap_local(&src_bv); + memcpy_to_bvec(&dst_bv, src_buf); + kunmap_local(src_buf); bio_advance_iter_single(src, src_iter, bytes); bio_advance_iter_single(dst, dst_iter, bytes); From f434cdc78e01e40fcfb8ef7e6752e3e405b84b58 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 07:56:42 +0200 Subject: [PATCH 012/129] block: use memcpy_to_bvec in copy_to_high_bio_irq Use memcpy_to_bvec instead of opencoding the logic. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20210727055646.118787-12-hch@lst.de Signed-off-by: Jens Axboe --- block/bounce.c | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/block/bounce.c b/block/bounce.c index 94081e013c58..7e9e666c04dc 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -67,18 +67,6 @@ static __init int init_emergency_pool(void) __initcall(init_emergency_pool); -/* - * highmem version, map in to vec - */ -static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom) -{ - unsigned char *vto; - - vto = kmap_atomic(to->bv_page); - memcpy(vto + to->bv_offset, vfrom, to->bv_len); - kunmap_atomic(vto); -} - /* * Simple bounce buffer support for highmem pages. Depending on the * queue gfp mask set, *to may or may not be a highmem page. kmap it @@ -86,7 +74,6 @@ static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom) */ static void copy_to_high_bio_irq(struct bio *to, struct bio *from) { - unsigned char *vfrom; struct bio_vec tovec, fromvec; struct bvec_iter iter; /* @@ -104,11 +91,8 @@ static void copy_to_high_bio_irq(struct bio *to, struct bio *from) * been modified by the block layer, so use the original * copy, bounce_copy_vec already uses tovec->bv_len */ - vfrom = page_address(fromvec.bv_page) + - tovec.bv_offset; - - bounce_copy_vec(&tovec, vfrom); - flush_dcache_page(tovec.bv_page); + memcpy_to_bvec(&tovec, page_address(fromvec.bv_page) + + tovec.bv_offset); } bio_advance_iter(from, &from_iter, tovec.bv_len); } From d24920e20ca66780d4059e2ece9f858cbae02310 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 07:56:43 +0200 Subject: [PATCH 013/129] block: use memcpy_from_bvec in bio_copy_kern_endio_read Use memcpy_from_bvec instead of open coding the logic. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20210727055646.118787-13-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-map.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-map.c b/block/blk-map.c index 3743158ddaeb..d1448aaad980 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -400,7 +400,7 @@ static void bio_copy_kern_endio_read(struct bio *bio) struct bvec_iter_all iter_all; bio_for_each_segment_all(bvec, bio, iter_all) { - memcpy(p, page_address(bvec->bv_page), bvec->bv_len); + memcpy_from_bvec(p, bvec); p += bvec->bv_len; } From 4aebe8596ab77b0b7125e3584ed0259c4657a06d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 07:56:44 +0200 Subject: [PATCH 014/129] block: use memcpy_from_bvec in __blk_queue_bounce Rewrite the actual bounce buffering loop in __blk_queue_bounce to that the memcpy_to_bvec helper can be used to perform the data copies. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20210727055646.118787-14-hch@lst.de Signed-off-by: Jens Axboe --- block/bounce.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/block/bounce.c b/block/bounce.c index 7e9e666c04dc..05fc7148489d 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -239,24 +239,19 @@ void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) * because the 'bio' is single-page bvec. */ for (i = 0, to = bio->bi_io_vec; i < bio->bi_vcnt; to++, i++) { - struct page *page = to->bv_page; + struct page *bounce_page; - if (!PageHighMem(page)) + if (!PageHighMem(to->bv_page)) continue; - to->bv_page = mempool_alloc(&page_pool, GFP_NOIO); - inc_zone_page_state(to->bv_page, NR_BOUNCE); + bounce_page = mempool_alloc(&page_pool, GFP_NOIO); + inc_zone_page_state(bounce_page, NR_BOUNCE); if (rw == WRITE) { - char *vto, *vfrom; - - flush_dcache_page(page); - - vto = page_address(to->bv_page) + to->bv_offset; - vfrom = kmap_atomic(page) + to->bv_offset; - memcpy(vto, vfrom, to->bv_len); - kunmap_atomic(vfrom); + flush_dcache_page(to->bv_page); + memcpy_from_bvec(page_address(bounce_page), to); } + to->bv_page = bounce_page; } trace_block_bio_bounce(*bio_orig); From 8aec120a9ca80c14ce002505cea1e1639f8e9ea5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 07:56:45 +0200 Subject: [PATCH 015/129] block: use bvec_kmap_local in t10_pi_type1_{prepare,complete} Using local kmaps slightly reduces the chances to stray writes, and the bvec interface cleans up the code a little bit. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20210727055646.118787-15-hch@lst.de Signed-off-by: Jens Axboe --- block/t10-pi.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/block/t10-pi.c b/block/t10-pi.c index d910534b3a41..00c203b2a921 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -147,11 +147,10 @@ static void t10_pi_type1_prepare(struct request *rq) break; bip_for_each_vec(iv, bip, iter) { - void *p, *pmap; unsigned int j; + void *p; - pmap = kmap_atomic(iv.bv_page); - p = pmap + iv.bv_offset; + p = bvec_kmap_local(&iv); for (j = 0; j < iv.bv_len; j += tuple_sz) { struct t10_pi_tuple *pi = p; @@ -161,8 +160,7 @@ static void t10_pi_type1_prepare(struct request *rq) ref_tag++; p += tuple_sz; } - - kunmap_atomic(pmap); + kunmap_local(p); } bip->bip_flags |= BIP_MAPPED_INTEGRITY; @@ -195,11 +193,10 @@ static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes) struct bvec_iter iter; bip_for_each_vec(iv, bip, iter) { - void *p, *pmap; unsigned int j; + void *p; - pmap = kmap_atomic(iv.bv_page); - p = pmap + iv.bv_offset; + p = bvec_kmap_local(&iv); for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) { struct t10_pi_tuple *pi = p; @@ -210,8 +207,7 @@ static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes) intervals--; p += tuple_sz; } - - kunmap_atomic(pmap); + kunmap_local(p); } } } From 503469b5b30f76169c6302d1469e69a2fb67faf9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 07:56:46 +0200 Subject: [PATCH 016/129] block: use bvec_kmap_local in bio_integrity_process Using local kmaps slightly reduces the chances to stray writes, and the bvec interface cleans up the code a little bit. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20210727055646.118787-16-hch@lst.de Signed-off-by: Jens Axboe --- block/bio-integrity.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 4b4eb8964a6f..8f54d49dc500 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -172,18 +172,16 @@ static blk_status_t bio_integrity_process(struct bio *bio, iter.prot_buf = prot_buf; __bio_for_each_segment(bv, bio, bviter, *proc_iter) { - void *kaddr = kmap_atomic(bv.bv_page); + void *kaddr = bvec_kmap_local(&bv); - iter.data_buf = kaddr + bv.bv_offset; + iter.data_buf = kaddr; iter.data_size = bv.bv_len; - ret = proc_fn(&iter); - if (ret) { - kunmap_atomic(kaddr); - return ret; - } + kunmap_local(kaddr); + + if (ret) + break; - kunmap_atomic(kaddr); } return ret; } From a45e43cad798173b41e0d6f119784826d3ead02c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Jul 2021 09:53:55 +0200 Subject: [PATCH 017/129] block: assert the locking state in delete_partition Add a lockdep assert instead of the outdated locking comment. Signed-off-by: Christoph Hellwig Reviewed-by: Josef Bacik Reviewed-by: Ming Lei Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210722075402.983367-3-hch@lst.de Signed-off-by: Jens Axboe --- block/partitions/core.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/block/partitions/core.c b/block/partitions/core.c index 4230d4f71879..9902b1635b7d 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -281,12 +281,10 @@ struct device_type part_type = { .uevent = part_uevent, }; -/* - * Must be called either with open_mutex held, before a disk can be opened or - * after all disk users are gone. - */ static void delete_partition(struct block_device *part) { + lockdep_assert_held(&part->bd_disk->open_mutex); + fsync_bdev(part); __invalidate_device(part, true); From d7a66574b34e0b354442140927f9b787efccabfd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Jul 2021 09:53:56 +0200 Subject: [PATCH 018/129] block: unhash the whole device inode earlier Unhash the whole device inode early in del_gendisk. This allows to remove the first GENHD_FL_UP check in the open path as we simply won't find a just removed inode. The second non-racy check after taking open_mutex is still kept. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210722075402.983367-4-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 7 +------ fs/block_dev.c | 2 +- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 298ee78c1bda..716f5ca479ad 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -585,6 +585,7 @@ void del_gendisk(struct gendisk *disk) disk_del_events(disk); mutex_lock(&disk->open_mutex); + remove_inode_hash(disk->part0->bd_inode); disk->flags &= ~GENHD_FL_UP; blk_drop_partitions(disk); mutex_unlock(&disk->open_mutex); @@ -592,12 +593,6 @@ void del_gendisk(struct gendisk *disk) fsync_bdev(disk->part0); __invalidate_device(disk->part0, true); - /* - * Unhash the bdev inode for this device so that it can't be looked - * up any more even if openers still hold references to it. - */ - remove_inode_hash(disk->part0->bd_inode); - set_capacity(disk, 0); if (!(disk->flags & GENHD_FL_HIDDEN)) { diff --git a/fs/block_dev.c b/fs/block_dev.c index 9ef4f1fc2cb0..932f4034ad66 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1340,7 +1340,7 @@ struct block_device *blkdev_get_no_open(dev_t dev) disk = bdev->bd_disk; if (!kobject_get_unless_zero(&disk_to_dev(disk)->kobj)) goto bdput; - if ((disk->flags & (GENHD_FL_UP | GENHD_FL_HIDDEN)) != GENHD_FL_UP) + if (disk->flags & GENHD_FL_HIDDEN) goto put_disk; if (!try_module_get(bdev->bd_disk->fops->owner)) goto put_disk; From 0468c5323413c6903e4cbcef841a55e6c5578cd2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Jul 2021 09:53:57 +0200 Subject: [PATCH 019/129] block: allocate bd_meta_info later in add_partitions Move the allocation of bd_meta_info after initializing the struct device to avoid the special bdput error handling path. Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20210722075402.983367-5-hch@lst.de Signed-off-by: Jens Axboe --- block/partitions/core.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/block/partitions/core.c b/block/partitions/core.c index 9902b1635b7d..09c58a110a89 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -356,13 +356,6 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, bdev->bd_start_sect = start; bdev_set_nr_sectors(bdev, len); - if (info) { - err = -ENOMEM; - bdev->bd_meta_info = kmemdup(info, sizeof(*info), GFP_KERNEL); - if (!bdev->bd_meta_info) - goto out_bdput; - } - pdev = &bdev->bd_device; dname = dev_name(ddev); if (isdigit(dname[strlen(dname) - 1])) @@ -386,6 +379,13 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, } pdev->devt = devt; + if (info) { + err = -ENOMEM; + bdev->bd_meta_info = kmemdup(info, sizeof(*info), GFP_KERNEL); + if (!bdev->bd_meta_info) + goto out_put; + } + /* delay uevent until 'holders' subdir is created */ dev_set_uevent_suppress(pdev, 1); err = device_add(pdev); @@ -415,9 +415,6 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, kobject_uevent(&pdev->kobj, KOBJ_ADD); return bdev; -out_bdput: - bdput(bdev); - return ERR_PTR(err); out_del: kobject_put(bdev->bd_holder_dir); device_del(pdev); From 9d3b8813895d737fcef4ec8df518f67e5cc381b8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Jul 2021 09:53:58 +0200 Subject: [PATCH 020/129] block: change the refcounting for partitions Instead of acquiring an inode reference on open make sure partitions always hold device model references to the disk while alive, and switch open to grab only a device model reference to the opened block device. If that is a partition the disk reference is transitively held by the partition already. Link: https://lore.kernel.org/r/20210722075402.983367-6-hch@lst.de Signed-off-by: Jens Axboe --- block/partitions/core.c | 9 ++++++- fs/block_dev.c | 60 ++++++++++++++++------------------------- 2 files changed, 31 insertions(+), 38 deletions(-) diff --git a/block/partitions/core.c b/block/partitions/core.c index 09c58a110a89..4f7a1a9cd544 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -261,6 +261,7 @@ static void part_release(struct device *dev) { if (MAJOR(dev->devt) == BLOCK_EXT_MAJOR) blk_free_ext_minor(MINOR(dev->devt)); + put_disk(dev_to_bdev(dev)->bd_disk); bdput(dev_to_bdev(dev)); } @@ -349,9 +350,13 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, if (xa_load(&disk->part_tbl, partno)) return ERR_PTR(-EBUSY); + /* ensure we always have a reference to the whole disk */ + get_device(disk_to_dev(disk)); + + err = -ENOMEM; bdev = bdev_alloc(disk, partno); if (!bdev) - return ERR_PTR(-ENOMEM); + goto out_put_disk; bdev->bd_start_sect = start; bdev_set_nr_sectors(bdev, len); @@ -420,6 +425,8 @@ out_del: device_del(pdev); out_put: put_device(pdev); +out_put_disk: + put_disk(disk); return ERR_PTR(err); } diff --git a/fs/block_dev.c b/fs/block_dev.c index 932f4034ad66..4a6c8c0a3bc9 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -921,16 +921,6 @@ void bdev_add(struct block_device *bdev, dev_t dev) insert_inode_hash(bdev->bd_inode); } -static struct block_device *bdget(dev_t dev) -{ - struct inode *inode; - - inode = ilookup(blockdev_superblock, dev); - if (!inode) - return NULL; - return &BDEV_I(inode)->bdev; -} - /** * bdgrab -- Grab a reference to an already referenced block device * @bdev: Block device to grab a reference to. @@ -1282,16 +1272,14 @@ static void blkdev_put_whole(struct block_device *bdev, fmode_t mode) static int blkdev_get_part(struct block_device *part, fmode_t mode) { struct gendisk *disk = part->bd_disk; - struct block_device *whole; int ret; if (part->bd_openers) goto done; - whole = bdgrab(disk->part0); - ret = blkdev_get_whole(whole, mode); + ret = blkdev_get_whole(bdev_whole(part), mode); if (ret) - goto out_put_whole; + return ret; ret = -ENXIO; if (!bdev_nr_sectors(part)) @@ -1306,9 +1294,7 @@ done: return 0; out_blkdev_put: - blkdev_put_whole(whole, mode); -out_put_whole: - bdput(whole); + blkdev_put_whole(bdev_whole(part), mode); return ret; } @@ -1321,42 +1307,42 @@ static void blkdev_put_part(struct block_device *part, fmode_t mode) blkdev_flush_mapping(part); whole->bd_disk->open_partitions--; blkdev_put_whole(whole, mode); - bdput(whole); } struct block_device *blkdev_get_no_open(dev_t dev) { struct block_device *bdev; - struct gendisk *disk; + struct inode *inode; - bdev = bdget(dev); - if (!bdev) { + inode = ilookup(blockdev_superblock, dev); + if (!inode) { blk_request_module(dev); - bdev = bdget(dev); - if (!bdev) + inode = ilookup(blockdev_superblock, dev); + if (!inode) return NULL; } - disk = bdev->bd_disk; - if (!kobject_get_unless_zero(&disk_to_dev(disk)->kobj)) - goto bdput; - if (disk->flags & GENHD_FL_HIDDEN) - goto put_disk; - if (!try_module_get(bdev->bd_disk->fops->owner)) - goto put_disk; + /* switch from the inode reference to a device mode one: */ + bdev = &BDEV_I(inode)->bdev; + if (!kobject_get_unless_zero(&bdev->bd_device.kobj)) + bdev = NULL; + iput(inode); + + if (!bdev) + return NULL; + if ((bdev->bd_disk->flags & GENHD_FL_HIDDEN) || + !try_module_get(bdev->bd_disk->fops->owner)) { + put_device(&bdev->bd_device); + return NULL; + } + return bdev; -put_disk: - put_disk(disk); -bdput: - bdput(bdev); - return NULL; } void blkdev_put_no_open(struct block_device *bdev) { module_put(bdev->bd_disk->fops->owner); - put_disk(bdev->bd_disk); - bdput(bdev); + put_device(&bdev->bd_device); } /** From 4b2731226d7de4302e4d8766c86e3a21c56dc3b1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Jul 2021 09:54:00 +0200 Subject: [PATCH 021/129] loop: don't grab a reference to the block device The whole device block device won't be removed while the disk is still alive, so don't bother to grab a reference to it. Signed-off-by: Christoph Hellwig Reviewed-by: Josef Bacik Reviewed-by: Ming Lei Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210722075402.983367-8-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/loop.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index f0cdff0c5fbf..a03ef5025f27 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1304,10 +1304,6 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, if (partscan) lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN; - /* Grab the block_device to prevent its destruction after we - * put /dev/loopXX inode. Later in __loop_clr_fd() we bdput(bdev). - */ - bdgrab(bdev); loop_global_unlock(lo, is_loop); if (partscan) loop_reread_partitions(lo); @@ -1398,7 +1394,6 @@ static int __loop_clr_fd(struct loop_device *lo, bool release) blk_queue_physical_block_size(lo->lo_queue, 512); blk_queue_io_min(lo->lo_queue, 512); if (bdev) { - bdput(bdev); invalidate_bdev(bdev); bdev->bd_inode->i_mapping->wb_err = 0; } From 14cf1dbb55bb07427babee425fd2a8a9300737cc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Jul 2021 09:54:01 +0200 Subject: [PATCH 022/129] block: remove bdgrab All callers are gone, and no one should grab a pure inode reference to a block device anymore. Signed-off-by: Christoph Hellwig Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20210722075402.983367-9-hch@lst.de Signed-off-by: Jens Axboe --- fs/block_dev.c | 15 --------------- include/linux/blkdev.h | 1 - 2 files changed, 16 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 4a6c8c0a3bc9..4f2c4e9e84f5 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -921,21 +921,6 @@ void bdev_add(struct block_device *bdev, dev_t dev) insert_inode_hash(bdev->bd_inode); } -/** - * bdgrab -- Grab a reference to an already referenced block device - * @bdev: Block device to grab a reference to. - * - * Returns the block_device with an additional reference when successful, - * or NULL if the inode is already beeing freed. - */ -struct block_device *bdgrab(struct block_device *bdev) -{ - if (!igrab(bdev->bd_inode)) - return NULL; - return bdev; -} -EXPORT_SYMBOL(bdgrab); - long nr_blockdev_pages(void) { struct inode *inode; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d3afea47ade6..eb1289a58917 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1984,7 +1984,6 @@ void blkdev_put_no_open(struct block_device *bdev); struct block_device *bdev_alloc(struct gendisk *disk, u8 partno); void bdev_add(struct block_device *bdev, dev_t dev); struct block_device *I_BDEV(struct inode *inode); -struct block_device *bdgrab(struct block_device *bdev); void bdput(struct block_device *); int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart, loff_t lend); From 2f4731dcd0bb73379fbb9e3eb07ae7324125caef Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Jul 2021 09:54:02 +0200 Subject: [PATCH 023/129] block: remove bdput Now that we've stopped using inode references for anything meaninful in the block layer get rid of the helper to put it and just open code the call to iput on the block_device inode. Signed-off-by: Christoph Hellwig Reviewed-by: Josef Bacik Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210722075402.983367-10-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 4 ++-- block/partitions/core.c | 2 +- fs/block_dev.c | 6 ------ include/linux/blkdev.h | 1 - 4 files changed, 3 insertions(+), 10 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 716f5ca479ad..5dbb99b57b33 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1076,7 +1076,7 @@ static void disk_release(struct device *dev) xa_destroy(&disk->part_tbl); if (test_bit(GD_QUEUE_REF, &disk->state) && disk->queue) blk_put_queue(disk->queue); - bdput(disk->part0); /* frees the disk */ + iput(disk->part0->bd_inode); /* frees the disk */ } struct class block_class = { .name = "block", @@ -1261,7 +1261,7 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) out_destroy_part_tbl: xa_destroy(&disk->part_tbl); - bdput(disk->part0); + iput(disk->part0->bd_inode); out_free_disk: kfree(disk); return NULL; diff --git a/block/partitions/core.c b/block/partitions/core.c index 4f7a1a9cd544..2415bffc2771 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -262,7 +262,7 @@ static void part_release(struct device *dev) if (MAJOR(dev->devt) == BLOCK_EXT_MAJOR) blk_free_ext_minor(MINOR(dev->devt)); put_disk(dev_to_bdev(dev)->bd_disk); - bdput(dev_to_bdev(dev)); + iput(dev_to_bdev(dev)->bd_inode); } static int part_uevent(struct device *dev, struct kobj_uevent_env *env) diff --git a/fs/block_dev.c b/fs/block_dev.c index 4f2c4e9e84f5..6658f40ae492 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -934,12 +934,6 @@ long nr_blockdev_pages(void) return ret; } -void bdput(struct block_device *bdev) -{ - iput(bdev->bd_inode); -} -EXPORT_SYMBOL(bdput); - /** * bd_may_claim - test whether a block device can be claimed * @bdev: block device of interest diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index eb1289a58917..b5c033cf5f26 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1984,7 +1984,6 @@ void blkdev_put_no_open(struct block_device *bdev); struct block_device *bdev_alloc(struct gendisk *disk, u8 partno); void bdev_add(struct block_device *bdev, dev_t dev); struct block_device *I_BDEV(struct inode *inode); -void bdput(struct block_device *); int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart, loff_t lend); From 26e2d7a362f6a83146ea3eaa8f17ca9ce35388d3 Mon Sep 17 00:00:00 2001 From: Abd-Alrhman Masalkhi Date: Tue, 27 Jul 2021 08:25:13 +0200 Subject: [PATCH 024/129] block: reduce stack usage in diskstats_show MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I have compiled the kernel with a cross compiler "hppa-linux-gnu-" v9.3.0 on x86-64 host machine. I got the following warning: block/genhd.c: In function ‘diskstats_show’: block/genhd.c:1227:1: warning: the frame size of 1688 bytes is larger than 1280 bytes [-Wframe-larger-than=] 1227 | } By Reduced the stack footprint by using the %pg printk specifier instead of disk_name to remove the need for the on-stack buffer. Signed-off-by: Abd-Alrhman Masalkhi Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20210727062518.122108-2-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 5dbb99b57b33..cf705cf95440 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1111,7 +1111,6 @@ static int diskstats_show(struct seq_file *seqf, void *v) { struct gendisk *gp = v; struct block_device *hd; - char buf[BDEVNAME_SIZE]; unsigned int inflight; struct disk_stats stat; unsigned long idx; @@ -1134,15 +1133,14 @@ static int diskstats_show(struct seq_file *seqf, void *v) else inflight = part_in_flight(hd); - seq_printf(seqf, "%4d %7d %s " + seq_printf(seqf, "%4d %7d %pg " "%lu %lu %lu %u " "%lu %lu %lu %u " "%u %u %u " "%lu %lu %lu %u " "%lu %u" "\n", - MAJOR(hd->bd_dev), MINOR(hd->bd_dev), - disk_name(gp, hd->bd_partno, buf), + MAJOR(hd->bd_dev), MINOR(hd->bd_dev), hd, stat.ios[STAT_READ], stat.merges[STAT_READ], stat.sectors[STAT_READ], From a9e7bc3de4051d037a8e6f2d30448c347263737e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 08:25:14 +0200 Subject: [PATCH 025/129] block: use the %pg format specifier in printk_all_partitions Simplify printing the partition name by using the %pg format specifier that is equivalent to a bdevname call. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20210727062518.122108-3-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index cf705cf95440..770f21b4fd1a 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -678,7 +678,6 @@ void __init printk_all_partitions(void) while ((dev = class_dev_iter_next(&iter))) { struct gendisk *disk = dev_to_disk(dev); struct block_device *part; - char name_buf[BDEVNAME_SIZE]; char devt_buf[BDEVT_SIZE]; unsigned long idx; @@ -698,11 +697,10 @@ void __init printk_all_partitions(void) xa_for_each(&disk->part_tbl, idx, part) { if (!bdev_nr_sectors(part)) continue; - printk("%s%s %10llu %s %s", + printk("%s%s %10llu %pg %s", bdev_is_partition(part) ? " " : "", bdevt_str(part->bd_dev, devt_buf), - bdev_nr_sectors(part) >> 1, - disk_name(disk, part->bd_partno, name_buf), + bdev_nr_sectors(part) >> 1, part, part->bd_meta_info ? part->bd_meta_info->uuid : ""); if (bdev_is_partition(part)) From a291bb43e5c9fdedc4be3dfd496e64e7c5a78b1f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 08:25:15 +0200 Subject: [PATCH 026/129] block: use the %pg format specifier in show_partition Simplify printing the partition name by using the %pg format specifier that is equivalent to a bdevname call. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20210727062518.122108-4-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 770f21b4fd1a..6ed58fda2c05 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -778,7 +778,6 @@ static int show_partition(struct seq_file *seqf, void *v) struct gendisk *sgp = v; struct block_device *part; unsigned long idx; - char buf[BDEVNAME_SIZE]; /* Don't show non-partitionable removeable devices or empty devices */ if (!get_capacity(sgp) || (!disk_max_parts(sgp) && @@ -791,10 +790,9 @@ static int show_partition(struct seq_file *seqf, void *v) xa_for_each(&sgp->part_tbl, idx, part) { if (!bdev_nr_sectors(part)) continue; - seq_printf(seqf, "%4d %7d %10llu %s\n", + seq_printf(seqf, "%4d %7d %10llu %pg\n", MAJOR(part->bd_dev), MINOR(part->bd_dev), - bdev_nr_sectors(part) >> 1, - disk_name(sgp, part->bd_partno, buf)); + bdev_nr_sectors(part) >> 1, part); } rcu_read_unlock(); return 0; From 453b8ab696b32cfd8bad80a5501937440d1cf214 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 08:25:16 +0200 Subject: [PATCH 027/129] block: simplify printing the device names disk_stack_limits Printk ->disk_name directly for the disk and use the %pg format specifier for the block device, which is equivalent to a bdevname call. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20210727062518.122108-5-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index 902c40d67120..109012719aa0 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -661,15 +661,9 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, struct request_queue *t = disk->queue; if (blk_stack_limits(&t->limits, &bdev_get_queue(bdev)->limits, - get_start_sect(bdev) + (offset >> 9)) < 0) { - char top[BDEVNAME_SIZE], bottom[BDEVNAME_SIZE]; - - disk_name(disk, 0, top); - bdevname(bdev, bottom); - - printk(KERN_NOTICE "%s: Warning: Device %s is misaligned\n", - top, bottom); - } + get_start_sect(bdev) + (offset >> 9)) < 0) + pr_notice("%s: Warning: Device %pg is misaligned\n", + disk->disk_name, bdev); blk_queue_update_readahead(disk->queue); } From 1d7035478f64c040441c9cb2aa32e0d7fae526d2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 08:25:17 +0200 Subject: [PATCH 028/129] block: simplify disk name formatting in check_partition disk_name for partition 0 just copies out the disk_name field. Replace the call to disk_name with a %s format specifier. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20210727062518.122108-6-hch@lst.de Signed-off-by: Jens Axboe --- block/partitions/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/partitions/core.c b/block/partitions/core.c index 2415bffc2771..fb3a556cacce 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -136,7 +136,7 @@ static struct parsed_partitions *check_partition(struct gendisk *hd) state->pp_buf[0] = '\0'; state->bdev = hd->part0; - disk_name(hd, 0, state->name); + snprintf(state->name, BDEVNAME_SIZE, "%s", hd->disk_name); snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name); if (isdigit(state->name[strlen(state->name)-1])) sprintf(state->name, "p"); From abd2864a3e46368a58f3718491521779099bfc14 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 08:25:18 +0200 Subject: [PATCH 029/129] block: remove disk_name() Remove the disk_name function now that all users are gone. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20210727062518.122108-7-hch@lst.de Signed-off-by: Jens Axboe --- block/blk.h | 1 - block/genhd.c | 17 +++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/block/blk.h b/block/blk.h index 4b885c0f6708..56f33fbcde59 100644 --- a/block/blk.h +++ b/block/blk.h @@ -344,7 +344,6 @@ static inline void blk_queue_clear_zone_settings(struct request_queue *q) {} int blk_alloc_ext_minor(void); void blk_free_ext_minor(unsigned int minor); -char *disk_name(struct gendisk *hd, int partno, char *buf); #define ADDPART_FLAG_NONE 0 #define ADDPART_FLAG_RAID 1 #define ADDPART_FLAG_WHOLEDISK 2 diff --git a/block/genhd.c b/block/genhd.c index 6ed58fda2c05..38f053074159 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -78,11 +78,17 @@ bool set_capacity_and_notify(struct gendisk *disk, sector_t size) EXPORT_SYMBOL_GPL(set_capacity_and_notify); /* - * Format the device name of the indicated disk into the supplied buffer and - * return a pointer to that same buffer for convenience. + * Format the device name of the indicated block device into the supplied buffer + * and return a pointer to that same buffer for convenience. + * + * Note: do not use this in new code, use the %pg specifier to sprintf and + * printk insted. */ -char *disk_name(struct gendisk *hd, int partno, char *buf) +const char *bdevname(struct block_device *bdev, char *buf) { + struct gendisk *hd = bdev->bd_disk; + int partno = bdev->bd_partno; + if (!partno) snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name); else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1])) @@ -92,11 +98,6 @@ char *disk_name(struct gendisk *hd, int partno, char *buf) return buf; } - -const char *bdevname(struct block_device *bdev, char *buf) -{ - return disk_name(bdev->bd_disk, bdev->bd_partno, buf); -} EXPORT_SYMBOL(bdevname); static void part_stat_read_all(struct block_device *part, From 2164877c7f373e14e55fca20b7c4a9c436fe4462 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 28 Jul 2021 07:37:56 +0200 Subject: [PATCH 030/129] block: remove cmdline-parser.c cmdline-parser.c is only used by the cmdline faux partition format, so merge the code into that and avoid an indirect call. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210728053756.409654-1-hch@lst.de Signed-off-by: Jens Axboe --- arch/m68k/configs/stmark2_defconfig | 1 - block/Kconfig | 10 -- block/Makefile | 1 - block/cmdline-parser.c | 255 -------------------------- block/partitions/Kconfig | 1 - block/partitions/cmdline.c | 269 +++++++++++++++++++++++++++- include/linux/cmdline-parser.h | 46 ----- 7 files changed, 263 insertions(+), 320 deletions(-) delete mode 100644 block/cmdline-parser.c delete mode 100644 include/linux/cmdline-parser.h diff --git a/arch/m68k/configs/stmark2_defconfig b/arch/m68k/configs/stmark2_defconfig index d92306472fce..8898ae321779 100644 --- a/arch/m68k/configs/stmark2_defconfig +++ b/arch/m68k/configs/stmark2_defconfig @@ -22,7 +22,6 @@ CONFIG_RAMSIZE=0x8000000 CONFIG_VECTORBASE=0x40000000 CONFIG_KERNELBASE=0x40001000 # CONFIG_BLK_DEV_BSG is not set -CONFIG_BLK_CMDLINE_PARSER=y CONFIG_BINFMT_FLAT=y CONFIG_BINFMT_ZFLAT=y CONFIG_BINFMT_MISC=y diff --git a/block/Kconfig b/block/Kconfig index fd732aede922..15dfb7660645 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -114,16 +114,6 @@ config BLK_DEV_THROTTLING_LOW Note, this is an experimental interface and could be changed someday. -config BLK_CMDLINE_PARSER - bool "Block device command line partition parser" - help - Enabling this option allows you to specify the partition layout from - the kernel boot args. This is typically of use for embedded devices - which don't otherwise have any standardized method for listing the - partitions on a block device. - - See Documentation/block/cmdline-partition.rst for more information. - config BLK_WBT bool "Enable support for block device writeback throttling" help diff --git a/block/Makefile b/block/Makefile index bfbe4e13ca1e..c72592b4cf31 100644 --- a/block/Makefile +++ b/block/Makefile @@ -28,7 +28,6 @@ obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o obj-$(CONFIG_IOSCHED_BFQ) += bfq.o -obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o obj-$(CONFIG_BLK_DEV_INTEGRITY_T10) += t10-pi.o obj-$(CONFIG_BLK_MQ_PCI) += blk-mq-pci.o diff --git a/block/cmdline-parser.c b/block/cmdline-parser.c deleted file mode 100644 index f2a14571882b..000000000000 --- a/block/cmdline-parser.c +++ /dev/null @@ -1,255 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Parse command line, get partition information - * - * Written by Cai Zhiyong - * - */ -#include -#include - -static int parse_subpart(struct cmdline_subpart **subpart, char *partdef) -{ - int ret = 0; - struct cmdline_subpart *new_subpart; - - *subpart = NULL; - - new_subpart = kzalloc(sizeof(struct cmdline_subpart), GFP_KERNEL); - if (!new_subpart) - return -ENOMEM; - - if (*partdef == '-') { - new_subpart->size = (sector_t)(~0ULL); - partdef++; - } else { - new_subpart->size = (sector_t)memparse(partdef, &partdef); - if (new_subpart->size < (sector_t)PAGE_SIZE) { - pr_warn("cmdline partition size is invalid."); - ret = -EINVAL; - goto fail; - } - } - - if (*partdef == '@') { - partdef++; - new_subpart->from = (sector_t)memparse(partdef, &partdef); - } else { - new_subpart->from = (sector_t)(~0ULL); - } - - if (*partdef == '(') { - int length; - char *next = strchr(++partdef, ')'); - - if (!next) { - pr_warn("cmdline partition format is invalid."); - ret = -EINVAL; - goto fail; - } - - length = min_t(int, next - partdef, - sizeof(new_subpart->name) - 1); - strncpy(new_subpart->name, partdef, length); - new_subpart->name[length] = '\0'; - - partdef = ++next; - } else - new_subpart->name[0] = '\0'; - - new_subpart->flags = 0; - - if (!strncmp(partdef, "ro", 2)) { - new_subpart->flags |= PF_RDONLY; - partdef += 2; - } - - if (!strncmp(partdef, "lk", 2)) { - new_subpart->flags |= PF_POWERUP_LOCK; - partdef += 2; - } - - *subpart = new_subpart; - return 0; -fail: - kfree(new_subpart); - return ret; -} - -static void free_subpart(struct cmdline_parts *parts) -{ - struct cmdline_subpart *subpart; - - while (parts->subpart) { - subpart = parts->subpart; - parts->subpart = subpart->next_subpart; - kfree(subpart); - } -} - -static int parse_parts(struct cmdline_parts **parts, const char *bdevdef) -{ - int ret = -EINVAL; - char *next; - int length; - struct cmdline_subpart **next_subpart; - struct cmdline_parts *newparts; - char buf[BDEVNAME_SIZE + 32 + 4]; - - *parts = NULL; - - newparts = kzalloc(sizeof(struct cmdline_parts), GFP_KERNEL); - if (!newparts) - return -ENOMEM; - - next = strchr(bdevdef, ':'); - if (!next) { - pr_warn("cmdline partition has no block device."); - goto fail; - } - - length = min_t(int, next - bdevdef, sizeof(newparts->name) - 1); - strncpy(newparts->name, bdevdef, length); - newparts->name[length] = '\0'; - newparts->nr_subparts = 0; - - next_subpart = &newparts->subpart; - - while (next && *(++next)) { - bdevdef = next; - next = strchr(bdevdef, ','); - - length = (!next) ? (sizeof(buf) - 1) : - min_t(int, next - bdevdef, sizeof(buf) - 1); - - strncpy(buf, bdevdef, length); - buf[length] = '\0'; - - ret = parse_subpart(next_subpart, buf); - if (ret) - goto fail; - - newparts->nr_subparts++; - next_subpart = &(*next_subpart)->next_subpart; - } - - if (!newparts->subpart) { - pr_warn("cmdline partition has no valid partition."); - ret = -EINVAL; - goto fail; - } - - *parts = newparts; - - return 0; -fail: - free_subpart(newparts); - kfree(newparts); - return ret; -} - -void cmdline_parts_free(struct cmdline_parts **parts) -{ - struct cmdline_parts *next_parts; - - while (*parts) { - next_parts = (*parts)->next_parts; - free_subpart(*parts); - kfree(*parts); - *parts = next_parts; - } -} -EXPORT_SYMBOL(cmdline_parts_free); - -int cmdline_parts_parse(struct cmdline_parts **parts, const char *cmdline) -{ - int ret; - char *buf; - char *pbuf; - char *next; - struct cmdline_parts **next_parts; - - *parts = NULL; - - next = pbuf = buf = kstrdup(cmdline, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - next_parts = parts; - - while (next && *pbuf) { - next = strchr(pbuf, ';'); - if (next) - *next = '\0'; - - ret = parse_parts(next_parts, pbuf); - if (ret) - goto fail; - - if (next) - pbuf = ++next; - - next_parts = &(*next_parts)->next_parts; - } - - if (!*parts) { - pr_warn("cmdline partition has no valid partition."); - ret = -EINVAL; - goto fail; - } - - ret = 0; -done: - kfree(buf); - return ret; - -fail: - cmdline_parts_free(parts); - goto done; -} -EXPORT_SYMBOL(cmdline_parts_parse); - -struct cmdline_parts *cmdline_parts_find(struct cmdline_parts *parts, - const char *bdev) -{ - while (parts && strncmp(bdev, parts->name, sizeof(parts->name))) - parts = parts->next_parts; - return parts; -} -EXPORT_SYMBOL(cmdline_parts_find); - -/* - * add_part() - * 0 success. - * 1 can not add so many partitions. - */ -int cmdline_parts_set(struct cmdline_parts *parts, sector_t disk_size, - int slot, - int (*add_part)(int, struct cmdline_subpart *, void *), - void *param) -{ - sector_t from = 0; - struct cmdline_subpart *subpart; - - for (subpart = parts->subpart; subpart; - subpart = subpart->next_subpart, slot++) { - if (subpart->from == (sector_t)(~0ULL)) - subpart->from = from; - else - from = subpart->from; - - if (from >= disk_size) - break; - - if (subpart->size > (disk_size - from)) - subpart->size = disk_size - from; - - from += subpart->size; - - if (add_part(slot, subpart, param)) - break; - } - - return slot; -} -EXPORT_SYMBOL(cmdline_parts_set); diff --git a/block/partitions/Kconfig b/block/partitions/Kconfig index 6e2a649669e5..278593b8e4e9 100644 --- a/block/partitions/Kconfig +++ b/block/partitions/Kconfig @@ -264,7 +264,6 @@ config SYSV68_PARTITION config CMDLINE_PARTITION bool "Command line partition support" if PARTITION_ADVANCED - select BLK_CMDLINE_PARSER help Say Y here if you want to read the partition table from bootargs. The format for the command line is just like mtdparts. diff --git a/block/partitions/cmdline.c b/block/partitions/cmdline.c index 8f545c36cde4..482a29e95dbd 100644 --- a/block/partitions/cmdline.c +++ b/block/partitions/cmdline.c @@ -14,20 +14,248 @@ * For further information, see "Documentation/block/cmdline-partition.rst" * */ - -#include - +#include +#include +#include #include "check.h" + +/* partition flags */ +#define PF_RDONLY 0x01 /* Device is read only */ +#define PF_POWERUP_LOCK 0x02 /* Always locked after reset */ + +struct cmdline_subpart { + char name[BDEVNAME_SIZE]; /* partition name, such as 'rootfs' */ + sector_t from; + sector_t size; + int flags; + struct cmdline_subpart *next_subpart; +}; + +struct cmdline_parts { + char name[BDEVNAME_SIZE]; /* block device, such as 'mmcblk0' */ + unsigned int nr_subparts; + struct cmdline_subpart *subpart; + struct cmdline_parts *next_parts; +}; + +static int parse_subpart(struct cmdline_subpart **subpart, char *partdef) +{ + int ret = 0; + struct cmdline_subpart *new_subpart; + + *subpart = NULL; + + new_subpart = kzalloc(sizeof(struct cmdline_subpart), GFP_KERNEL); + if (!new_subpart) + return -ENOMEM; + + if (*partdef == '-') { + new_subpart->size = (sector_t)(~0ULL); + partdef++; + } else { + new_subpart->size = (sector_t)memparse(partdef, &partdef); + if (new_subpart->size < (sector_t)PAGE_SIZE) { + pr_warn("cmdline partition size is invalid."); + ret = -EINVAL; + goto fail; + } + } + + if (*partdef == '@') { + partdef++; + new_subpart->from = (sector_t)memparse(partdef, &partdef); + } else { + new_subpart->from = (sector_t)(~0ULL); + } + + if (*partdef == '(') { + int length; + char *next = strchr(++partdef, ')'); + + if (!next) { + pr_warn("cmdline partition format is invalid."); + ret = -EINVAL; + goto fail; + } + + length = min_t(int, next - partdef, + sizeof(new_subpart->name) - 1); + strncpy(new_subpart->name, partdef, length); + new_subpart->name[length] = '\0'; + + partdef = ++next; + } else + new_subpart->name[0] = '\0'; + + new_subpart->flags = 0; + + if (!strncmp(partdef, "ro", 2)) { + new_subpart->flags |= PF_RDONLY; + partdef += 2; + } + + if (!strncmp(partdef, "lk", 2)) { + new_subpart->flags |= PF_POWERUP_LOCK; + partdef += 2; + } + + *subpart = new_subpart; + return 0; +fail: + kfree(new_subpart); + return ret; +} + +static void free_subpart(struct cmdline_parts *parts) +{ + struct cmdline_subpart *subpart; + + while (parts->subpart) { + subpart = parts->subpart; + parts->subpart = subpart->next_subpart; + kfree(subpart); + } +} + +static int parse_parts(struct cmdline_parts **parts, const char *bdevdef) +{ + int ret = -EINVAL; + char *next; + int length; + struct cmdline_subpart **next_subpart; + struct cmdline_parts *newparts; + char buf[BDEVNAME_SIZE + 32 + 4]; + + *parts = NULL; + + newparts = kzalloc(sizeof(struct cmdline_parts), GFP_KERNEL); + if (!newparts) + return -ENOMEM; + + next = strchr(bdevdef, ':'); + if (!next) { + pr_warn("cmdline partition has no block device."); + goto fail; + } + + length = min_t(int, next - bdevdef, sizeof(newparts->name) - 1); + strncpy(newparts->name, bdevdef, length); + newparts->name[length] = '\0'; + newparts->nr_subparts = 0; + + next_subpart = &newparts->subpart; + + while (next && *(++next)) { + bdevdef = next; + next = strchr(bdevdef, ','); + + length = (!next) ? (sizeof(buf) - 1) : + min_t(int, next - bdevdef, sizeof(buf) - 1); + + strncpy(buf, bdevdef, length); + buf[length] = '\0'; + + ret = parse_subpart(next_subpart, buf); + if (ret) + goto fail; + + newparts->nr_subparts++; + next_subpart = &(*next_subpart)->next_subpart; + } + + if (!newparts->subpart) { + pr_warn("cmdline partition has no valid partition."); + ret = -EINVAL; + goto fail; + } + + *parts = newparts; + + return 0; +fail: + free_subpart(newparts); + kfree(newparts); + return ret; +} + +static void cmdline_parts_free(struct cmdline_parts **parts) +{ + struct cmdline_parts *next_parts; + + while (*parts) { + next_parts = (*parts)->next_parts; + free_subpart(*parts); + kfree(*parts); + *parts = next_parts; + } +} + +static int cmdline_parts_parse(struct cmdline_parts **parts, + const char *cmdline) +{ + int ret; + char *buf; + char *pbuf; + char *next; + struct cmdline_parts **next_parts; + + *parts = NULL; + + next = pbuf = buf = kstrdup(cmdline, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + next_parts = parts; + + while (next && *pbuf) { + next = strchr(pbuf, ';'); + if (next) + *next = '\0'; + + ret = parse_parts(next_parts, pbuf); + if (ret) + goto fail; + + if (next) + pbuf = ++next; + + next_parts = &(*next_parts)->next_parts; + } + + if (!*parts) { + pr_warn("cmdline partition has no valid partition."); + ret = -EINVAL; + goto fail; + } + + ret = 0; +done: + kfree(buf); + return ret; + +fail: + cmdline_parts_free(parts); + goto done; +} + +static struct cmdline_parts *cmdline_parts_find(struct cmdline_parts *parts, + const char *bdev) +{ + while (parts && strncmp(bdev, parts->name, sizeof(parts->name))) + parts = parts->next_parts; + return parts; +} + static char *cmdline; static struct cmdline_parts *bdev_parts; -static int add_part(int slot, struct cmdline_subpart *subpart, void *param) +static int add_part(int slot, struct cmdline_subpart *subpart, + struct parsed_partitions *state) { int label_min; struct partition_meta_info *info; char tmp[sizeof(info->volname) + 4]; - struct parsed_partitions *state = (struct parsed_partitions *)param; if (slot >= state->limit) return 1; @@ -50,6 +278,35 @@ static int add_part(int slot, struct cmdline_subpart *subpart, void *param) return 0; } +static int cmdline_parts_set(struct cmdline_parts *parts, sector_t disk_size, + struct parsed_partitions *state) +{ + sector_t from = 0; + struct cmdline_subpart *subpart; + int slot = 1; + + for (subpart = parts->subpart; subpart; + subpart = subpart->next_subpart, slot++) { + if (subpart->from == (sector_t)(~0ULL)) + subpart->from = from; + else + from = subpart->from; + + if (from >= disk_size) + break; + + if (subpart->size > (disk_size - from)) + subpart->size = disk_size - from; + + from += subpart->size; + + if (add_part(slot, subpart, state)) + break; + } + + return slot; +} + static int __init cmdline_parts_setup(char *s) { cmdline = s; @@ -147,7 +404,7 @@ int cmdline_partition(struct parsed_partitions *state) disk_size = get_capacity(state->bdev->bd_disk) << 9; - cmdline_parts_set(parts, disk_size, 1, add_part, (void *)state); + cmdline_parts_set(parts, disk_size, state); cmdline_parts_verifier(1, state); strlcat(state->pp_buf, "\n", PAGE_SIZE); diff --git a/include/linux/cmdline-parser.h b/include/linux/cmdline-parser.h deleted file mode 100644 index 68a541807bdf..000000000000 --- a/include/linux/cmdline-parser.h +++ /dev/null @@ -1,46 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Parsing command line, get the partitions information. - * - * Written by Cai Zhiyong - * - */ -#ifndef CMDLINEPARSEH -#define CMDLINEPARSEH - -#include -#include -#include - -/* partition flags */ -#define PF_RDONLY 0x01 /* Device is read only */ -#define PF_POWERUP_LOCK 0x02 /* Always locked after reset */ - -struct cmdline_subpart { - char name[BDEVNAME_SIZE]; /* partition name, such as 'rootfs' */ - sector_t from; - sector_t size; - int flags; - struct cmdline_subpart *next_subpart; -}; - -struct cmdline_parts { - char name[BDEVNAME_SIZE]; /* block device, such as 'mmcblk0' */ - unsigned int nr_subparts; - struct cmdline_subpart *subpart; - struct cmdline_parts *next_parts; -}; - -void cmdline_parts_free(struct cmdline_parts **parts); - -int cmdline_parts_parse(struct cmdline_parts **parts, const char *cmdline); - -struct cmdline_parts *cmdline_parts_find(struct cmdline_parts *parts, - const char *bdev); - -int cmdline_parts_set(struct cmdline_parts *parts, sector_t disk_size, - int slot, - int (*add_part)(int, struct cmdline_subpart *, void *), - void *param); - -#endif /* CMDLINEPARSEH */ From cf179948554a2e0d2b622317bf6bf33138ac36e5 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Tue, 13 Jul 2021 01:05:25 +0200 Subject: [PATCH 031/129] block: add disk sequence number Associating uevents with block devices in userspace is difficult and racy: the uevent netlink socket is lossy, and on slow and overloaded systems has a very high latency. Block devices do not have exclusive owners in userspace, any process can set one up (e.g. loop devices). Moreover, device names can be reused (e.g. loop0 can be reused again and again). A userspace process setting up a block device and watching for its events cannot thus reliably tell whether an event relates to the device it just set up or another earlier instance with the same name. Being able to set a UUID on a loop device would solve the race conditions. But it does not allow to derive orderings from uevents: if you see a uevent with a UUID that does not match the device you are waiting for, you cannot tell whether it's because the right uevent has not arrived yet, or it was already sent and you missed it. So you cannot tell whether you should wait for it or not. Associating a unique, monotonically increasing sequential number to the lifetime of each block device, which can be retrieved with an ioctl immediately upon setting it up, allows to solve the race conditions with uevents, and also allows userspace processes to know whether they should wait for the uevent they need or if it was dropped and thus they should move on. Additionally, increment the disk sequence number when the media change, i.e. on DISK_EVENT_MEDIA_CHANGE event. Reviewed-by: Christoph Hellwig Signed-off-by: Matteo Croce Tested-by: Luca Boccassi Link: https://lore.kernel.org/r/20210712230530.29323-2-mcroce@linux.microsoft.com Signed-off-by: Jens Axboe --- block/disk-events.c | 3 +++ block/genhd.c | 24 ++++++++++++++++++++++++ include/linux/genhd.h | 2 ++ 3 files changed, 29 insertions(+) diff --git a/block/disk-events.c b/block/disk-events.c index a75931ff5da4..04c52f3992ed 100644 --- a/block/disk-events.c +++ b/block/disk-events.c @@ -190,6 +190,9 @@ static void disk_check_events(struct disk_events *ev, spin_unlock_irq(&ev->lock); + if (events & DISK_EVENT_MEDIA_CHANGE) + inc_diskseq(disk); + /* * Tell userland about new events. Only the events listed in * @disk->events are reported, and only if DISK_EVENT_FLAG_UEVENT diff --git a/block/genhd.c b/block/genhd.c index 38f053074159..ceb08af72c1a 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -29,6 +29,23 @@ static struct kobject *block_depr; +/* + * Unique, monotonically increasing sequential number associated with block + * devices instances (i.e. incremented each time a device is attached). + * Associating uevents with block devices in userspace is difficult and racy: + * the uevent netlink socket is lossy, and on slow and overloaded systems has + * a very high latency. + * Block devices do not have exclusive owners in userspace, any process can set + * one up (e.g. loop devices). Moreover, device names can be reused (e.g. loop0 + * can be reused again and again). + * A userspace process setting up a block device and watching for its events + * cannot thus reliably tell whether an event relates to the device it just set + * up or another earlier instance with the same name. + * This sequential number allows userspace processes to solve this problem, and + * uniquely associate an uevent to the lifetime to a device. + */ +static atomic64_t diskseq; + /* for extended dynamic devt allocation, currently only one major is used */ #define NR_EXT_DEVT (1 << MINORBITS) static DEFINE_IDA(ext_devt_ida); @@ -1252,6 +1269,8 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) disk_to_dev(disk)->class = &block_class; disk_to_dev(disk)->type = &disk_type; device_initialize(disk_to_dev(disk)); + inc_diskseq(disk); + return disk; out_destroy_part_tbl: @@ -1352,3 +1371,8 @@ int bdev_read_only(struct block_device *bdev) return bdev->bd_read_only || get_disk_ro(bdev->bd_disk); } EXPORT_SYMBOL(bdev_read_only); + +void inc_diskseq(struct gendisk *disk) +{ + disk->diskseq = atomic64_inc_return(&diskseq); +} diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 13b34177cc85..140c028845af 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -172,6 +172,7 @@ struct gendisk { int node_id; struct badblocks *bb; struct lockdep_map lockdep_map; + u64 diskseq; }; /* @@ -332,6 +333,7 @@ static inline void bd_unlink_disk_holder(struct block_device *bdev, #endif /* CONFIG_SYSFS */ dev_t part_devt(struct gendisk *disk, u8 partno); +void inc_diskseq(struct gendisk *disk); dev_t blk_lookup_devt(const char *name, int partno); void blk_request_module(dev_t devt); #ifdef CONFIG_BLOCK From 87eb710747126ca6606f064deef93d045486ebbe Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Tue, 13 Jul 2021 01:05:26 +0200 Subject: [PATCH 032/129] block: export the diskseq in uevents Export the newly introduced diskseq in uevents: $ udevadm info /sys/class/block/* |grep -e DEVNAME -e DISKSEQ E: DEVNAME=/dev/loop0 E: DISKSEQ=1 E: DEVNAME=/dev/loop1 E: DISKSEQ=2 E: DEVNAME=/dev/loop2 E: DISKSEQ=3 E: DEVNAME=/dev/loop3 E: DISKSEQ=4 E: DEVNAME=/dev/loop4 E: DISKSEQ=5 E: DEVNAME=/dev/loop5 E: DISKSEQ=6 E: DEVNAME=/dev/loop6 E: DISKSEQ=7 E: DEVNAME=/dev/loop7 E: DISKSEQ=8 E: DEVNAME=/dev/nvme0n1 E: DISKSEQ=9 E: DEVNAME=/dev/nvme0n1p1 E: DISKSEQ=9 E: DEVNAME=/dev/nvme0n1p2 E: DISKSEQ=9 E: DEVNAME=/dev/nvme0n1p3 E: DISKSEQ=9 E: DEVNAME=/dev/nvme0n1p4 E: DISKSEQ=9 E: DEVNAME=/dev/nvme0n1p5 E: DISKSEQ=9 E: DEVNAME=/dev/sda E: DISKSEQ=10 E: DEVNAME=/dev/sda1 E: DISKSEQ=10 E: DEVNAME=/dev/sda2 E: DISKSEQ=10 Reviewed-by: Christoph Hellwig Signed-off-by: Matteo Croce Tested-by: Luca Boccassi Link: https://lore.kernel.org/r/20210712230530.29323-3-mcroce@linux.microsoft.com Signed-off-by: Jens Axboe --- block/genhd.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/block/genhd.c b/block/genhd.c index ceb08af72c1a..e1b2f898d790 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1092,8 +1092,17 @@ static void disk_release(struct device *dev) blk_put_queue(disk->queue); iput(disk->part0->bd_inode); /* frees the disk */ } + +static int block_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + struct gendisk *disk = dev_to_disk(dev); + + return add_uevent_var(env, "DISKSEQ=%llu", disk->diskseq); +} + struct class block_class = { .name = "block", + .dev_uevent = block_uevent, }; static char *block_devnode(struct device *dev, umode_t *mode, From 7957d93bf32bc211415827e44fdd9cdf1388df59 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Tue, 13 Jul 2021 01:05:27 +0200 Subject: [PATCH 033/129] block: add ioctl to read the disk sequence number Add a new BLKGETDISKSEQ ioctl which retrieves the disk sequence number from the genhd structure. # ./getdiskseq /dev/loop* /dev/loop0: 13 /dev/loop0p1: 13 /dev/loop0p2: 13 /dev/loop0p3: 13 /dev/loop1: 14 /dev/loop1p1: 14 /dev/loop1p2: 14 /dev/loop2: 5 /dev/loop3: 6 Reviewed-by: Christoph Hellwig Signed-off-by: Matteo Croce Tested-by: Luca Boccassi Link: https://lore.kernel.org/r/20210712230530.29323-4-mcroce@linux.microsoft.com Signed-off-by: Jens Axboe --- block/ioctl.c | 2 ++ include/uapi/linux/fs.h | 1 + 2 files changed, 3 insertions(+) diff --git a/block/ioctl.c b/block/ioctl.c index 24beec9ca9c9..0c3a4a53fa11 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -469,6 +469,8 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, BLKDEV_DISCARD_SECURE); case BLKZEROOUT: return blk_ioctl_zeroout(bdev, mode, arg); + case BLKGETDISKSEQ: + return put_u64(argp, bdev->bd_disk->diskseq); case BLKREPORTZONE: return blkdev_report_zones_ioctl(bdev, mode, cmd, arg); case BLKRESETZONE: diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 4c32e97dcdf0..bdf7b404b3e7 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -184,6 +184,7 @@ struct fsxattr { #define BLKSECDISCARD _IO(0x12,125) #define BLKROTATIONAL _IO(0x12,126) #define BLKZEROOUT _IO(0x12,127) +#define BLKGETDISKSEQ _IOR(0x12,128,__u64) /* * A jump here: 130-136 are reserved for zoned block devices * (see uapi/linux/blkzoned.h) From 13927b31b13f3c6556221eff3487247bd3c7a245 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Tue, 13 Jul 2021 01:05:28 +0200 Subject: [PATCH 034/129] block: export diskseq in sysfs Add a new sysfs handle to export the new diskseq value. Place it in /block//diskseq and document it. $ grep . /sys/class/block/*/diskseq /sys/class/block/loop0/diskseq:13 /sys/class/block/loop1/diskseq:14 /sys/class/block/loop2/diskseq:5 /sys/class/block/loop3/diskseq:6 /sys/class/block/ram0/diskseq:1 /sys/class/block/ram1/diskseq:2 /sys/class/block/vda/diskseq:7 Reviewed-by: Christoph Hellwig Signed-off-by: Matteo Croce Tested-by: Luca Boccassi Link: https://lore.kernel.org/r/20210712230530.29323-5-mcroce@linux.microsoft.com Signed-off-by: Jens Axboe --- Documentation/ABI/testing/sysfs-block | 12 ++++++++++++ block/genhd.c | 10 ++++++++++ 2 files changed, 22 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block index e34cdeeeb9d4..a0ed87386639 100644 --- a/Documentation/ABI/testing/sysfs-block +++ b/Documentation/ABI/testing/sysfs-block @@ -28,6 +28,18 @@ Description: For more details refer Documentation/admin-guide/iostats.rst +What: /sys/block//diskseq +Date: February 2021 +Contact: Matteo Croce +Description: + The /sys/block//diskseq files reports the disk + sequence number, which is a monotonically increasing + number assigned to every drive. + Some devices, like the loop device, refresh such number + every time the backing file is changed. + The value type is 64 bit unsigned. + + What: /sys/block///stat Date: February 2008 Contact: Jerome Marchand diff --git a/block/genhd.c b/block/genhd.c index e1b2f898d790..a4817e42f3a3 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -977,6 +977,14 @@ static ssize_t disk_discard_alignment_show(struct device *dev, return sprintf(buf, "%d\n", queue_discard_alignment(disk->queue)); } +static ssize_t diskseq_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%llu\n", disk->diskseq); +} + static DEVICE_ATTR(range, 0444, disk_range_show, NULL); static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL); static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL); @@ -989,6 +997,7 @@ static DEVICE_ATTR(capability, 0444, disk_capability_show, NULL); static DEVICE_ATTR(stat, 0444, part_stat_show, NULL); static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL); static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store); +static DEVICE_ATTR(diskseq, 0444, diskseq_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST ssize_t part_fail_show(struct device *dev, @@ -1034,6 +1043,7 @@ static struct attribute *disk_attrs[] = { &dev_attr_events.attr, &dev_attr_events_async.attr, &dev_attr_events_poll_msecs.attr, + &dev_attr_diskseq.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif From e6138dc12de9df17cbda9c40314d69592855ac5e Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Tue, 13 Jul 2021 01:05:29 +0200 Subject: [PATCH 035/129] block: add a helper to raise a media changed event Refactor disk_check_events() and move some code into disk_event_uevent(). Then add disk_force_media_change(), a helper which will be used by devices to force issuing a DISK_EVENT_MEDIA_CHANGE event. Co-developed-by: Christoph Hellwig Signed-off-by: Christoph Hellwig Signed-off-by: Matteo Croce Tested-by: Luca Boccassi Link: https://lore.kernel.org/r/20210712230530.29323-6-mcroce@linux.microsoft.com Signed-off-by: Jens Axboe --- block/disk-events.c | 61 ++++++++++++++++++++++++++++++++----------- include/linux/genhd.h | 1 + 2 files changed, 47 insertions(+), 15 deletions(-) diff --git a/block/disk-events.c b/block/disk-events.c index 04c52f3992ed..7445b8ff2775 100644 --- a/block/disk-events.c +++ b/block/disk-events.c @@ -163,15 +163,31 @@ void disk_flush_events(struct gendisk *disk, unsigned int mask) spin_unlock_irq(&ev->lock); } +/* + * Tell userland about new events. Only the events listed in @disk->events are + * reported, and only if DISK_EVENT_FLAG_UEVENT is set. Otherwise, events are + * processed internally but never get reported to userland. + */ +static void disk_event_uevent(struct gendisk *disk, unsigned int events) +{ + char *envp[ARRAY_SIZE(disk_uevents) + 1] = { }; + int nr_events = 0, i; + + for (i = 0; i < ARRAY_SIZE(disk_uevents); i++) + if (events & disk->events & (1 << i)) + envp[nr_events++] = disk_uevents[i]; + + if (nr_events) + kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); +} + static void disk_check_events(struct disk_events *ev, unsigned int *clearing_ptr) { struct gendisk *disk = ev->disk; - char *envp[ARRAY_SIZE(disk_uevents) + 1] = { }; unsigned int clearing = *clearing_ptr; unsigned int events; unsigned long intv; - int nr_events = 0, i; /* check events */ events = disk->fops->check_events(disk, clearing); @@ -193,19 +209,8 @@ static void disk_check_events(struct disk_events *ev, if (events & DISK_EVENT_MEDIA_CHANGE) inc_diskseq(disk); - /* - * Tell userland about new events. Only the events listed in - * @disk->events are reported, and only if DISK_EVENT_FLAG_UEVENT - * is set. Otherwise, events are processed internally but never - * get reported to userland. - */ - for (i = 0; i < ARRAY_SIZE(disk_uevents); i++) - if ((events & disk->events & (1 << i)) && - (disk->event_flags & DISK_EVENT_FLAG_UEVENT)) - envp[nr_events++] = disk_uevents[i]; - - if (nr_events) - kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); + if (disk->event_flags & DISK_EVENT_FLAG_UEVENT) + disk_event_uevent(disk, events); } /** @@ -284,6 +289,32 @@ bool bdev_check_media_change(struct block_device *bdev) } EXPORT_SYMBOL(bdev_check_media_change); +/** + * disk_force_media_change - force a media change event + * @disk: the disk which will raise the event + * @events: the events to raise + * + * Generate uevents for the disk. If DISK_EVENT_MEDIA_CHANGE is present, + * attempt to free all dentries and inodes and invalidates all block + * device page cache entries in that case. + * + * Returns %true if DISK_EVENT_MEDIA_CHANGE was raised, or %false if not. + */ +bool disk_force_media_change(struct gendisk *disk, unsigned int events) +{ + disk_event_uevent(disk, events); + + if (!(events & DISK_EVENT_MEDIA_CHANGE)) + return false; + + if (__invalidate_device(disk->part0, true)) + pr_warn("VFS: busy inodes on changed media %s\n", + disk->disk_name); + set_bit(GD_NEED_PART_SCAN, &disk->state); + return true; +} +EXPORT_SYMBOL_GPL(disk_force_media_change); + /* * Separate this part out so that a different pointer for clearing_ptr can be * passed in for disk_clear_events. diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 140c028845af..849486de81c6 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -237,6 +237,7 @@ extern void disk_block_events(struct gendisk *disk); extern void disk_unblock_events(struct gendisk *disk); extern void disk_flush_events(struct gendisk *disk, unsigned int mask); bool set_capacity_and_notify(struct gendisk *disk, sector_t size); +bool disk_force_media_change(struct gendisk *disk, unsigned int events); /* drivers/char/random.c */ extern void add_disk_randomness(struct gendisk *disk) __latent_entropy; From 9f65c489b68d42427dc0651488dd260d678f525d Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Tue, 13 Jul 2021 01:05:30 +0200 Subject: [PATCH 036/129] loop: raise media_change event Make the loop device raise a DISK_MEDIA_CHANGE event on attach or detach. # udevadm monitor -up |grep -e DISK_MEDIA_CHANGE -e DEVNAME & # losetup -f zero [ 7.454235] loop0: detected capacity change from 0 to 16384 DISK_MEDIA_CHANGE=1 DEVNAME=/dev/loop0 DEVNAME=/dev/loop0 DEVNAME=/dev/loop0 # losetup -f zero [ 10.205245] loop1: detected capacity change from 0 to 16384 DISK_MEDIA_CHANGE=1 DEVNAME=/dev/loop1 DEVNAME=/dev/loop1 DEVNAME=/dev/loop1 # losetup -f zero2 [ 13.532368] loop2: detected capacity change from 0 to 40960 DISK_MEDIA_CHANGE=1 DEVNAME=/dev/loop2 DEVNAME=/dev/loop2 # losetup -D DEVNAME=/dev/loop1 DISK_MEDIA_CHANGE=1 DEVNAME=/dev/loop1 DEVNAME=/dev/loop2 DISK_MEDIA_CHANGE=1 DEVNAME=/dev/loop2 DEVNAME=/dev/loop0 DISK_MEDIA_CHANGE=1 DEVNAME=/dev/loop0 Signed-off-by: Matteo Croce Reviewed-by: Christoph Hellwig Tested-by: Luca Boccassi Link: https://lore.kernel.org/r/20210712230530.29323-7-mcroce@linux.microsoft.com Signed-off-by: Jens Axboe --- drivers/block/loop.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index a03ef5025f27..f8486d9b75a4 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -774,6 +774,7 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, goto out_err; /* and ... switch */ + disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE); blk_mq_freeze_queue(lo->lo_queue); mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask); lo->lo_backing_file = file; @@ -1257,6 +1258,7 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, goto out_unlock; } + disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE); set_disk_ro(lo->lo_disk, (lo->lo_flags & LO_FLAGS_READ_ONLY) != 0); INIT_WORK(&lo->rootcg_work, loop_rootcg_workfn); @@ -1410,6 +1412,7 @@ static int __loop_clr_fd(struct loop_device *lo, bool release) partscan = lo->lo_flags & LO_FLAGS_PARTSCAN && bdev; lo_number = lo->lo_number; + disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE); out_unlock: mutex_unlock(&lo->lo_mutex); if (partscan) { @@ -2386,6 +2389,8 @@ static int loop_add(int i) disk->fops = &lo_fops; disk->private_data = lo; disk->queue = lo->lo_queue; + disk->events = DISK_EVENT_MEDIA_CHANGE; + disk->event_flags = DISK_EVENT_FLAG_UEVENT; sprintf(disk->disk_name, "loop%d", i); add_disk(disk); mutex_unlock(&loop_ctl_mutex); From 2bc1f6e442eec88fa60f1ee6bef2c9871227cf8a Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 13 Jul 2021 17:18:37 +0900 Subject: [PATCH 037/129] block: remove blk-mq-sysfs dead code In block/blk-mq-sysfs.c, struct blk_mq_ctx_sysfs_entry is not used to define any attribute since the "mq" sysfs directory contains only sub-directories (no attribute files). As a result, blk_mq_sysfs_show(), blk_mq_sysfs_store(), and struct sysfs_ops blk_mq_sysfs_ops are all unused and unnecessary. Remove all this unused code. Signed-off-by: Damien Le Moal Link: https://lore.kernel.org/r/20210713081837.524422-1-damien.lemoal@wdc.com Signed-off-by: Jens Axboe --- block/blk-mq-sysfs.c | 55 -------------------------------------------- 1 file changed, 55 deletions(-) diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 7b52e7657b2d..253c857cba47 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -45,60 +45,12 @@ static void blk_mq_hw_sysfs_release(struct kobject *kobj) kfree(hctx); } -struct blk_mq_ctx_sysfs_entry { - struct attribute attr; - ssize_t (*show)(struct blk_mq_ctx *, char *); - ssize_t (*store)(struct blk_mq_ctx *, const char *, size_t); -}; - struct blk_mq_hw_ctx_sysfs_entry { struct attribute attr; ssize_t (*show)(struct blk_mq_hw_ctx *, char *); ssize_t (*store)(struct blk_mq_hw_ctx *, const char *, size_t); }; -static ssize_t blk_mq_sysfs_show(struct kobject *kobj, struct attribute *attr, - char *page) -{ - struct blk_mq_ctx_sysfs_entry *entry; - struct blk_mq_ctx *ctx; - struct request_queue *q; - ssize_t res; - - entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr); - ctx = container_of(kobj, struct blk_mq_ctx, kobj); - q = ctx->queue; - - if (!entry->show) - return -EIO; - - mutex_lock(&q->sysfs_lock); - res = entry->show(ctx, page); - mutex_unlock(&q->sysfs_lock); - return res; -} - -static ssize_t blk_mq_sysfs_store(struct kobject *kobj, struct attribute *attr, - const char *page, size_t length) -{ - struct blk_mq_ctx_sysfs_entry *entry; - struct blk_mq_ctx *ctx; - struct request_queue *q; - ssize_t res; - - entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr); - ctx = container_of(kobj, struct blk_mq_ctx, kobj); - q = ctx->queue; - - if (!entry->store) - return -EIO; - - mutex_lock(&q->sysfs_lock); - res = entry->store(ctx, page, length); - mutex_unlock(&q->sysfs_lock); - return res; -} - static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj, struct attribute *attr, char *page) { @@ -198,23 +150,16 @@ static struct attribute *default_hw_ctx_attrs[] = { }; ATTRIBUTE_GROUPS(default_hw_ctx); -static const struct sysfs_ops blk_mq_sysfs_ops = { - .show = blk_mq_sysfs_show, - .store = blk_mq_sysfs_store, -}; - static const struct sysfs_ops blk_mq_hw_sysfs_ops = { .show = blk_mq_hw_sysfs_show, .store = blk_mq_hw_sysfs_store, }; static struct kobj_type blk_mq_ktype = { - .sysfs_ops = &blk_mq_sysfs_ops, .release = blk_mq_sysfs_release, }; static struct kobj_type blk_mq_ctx_ktype = { - .sysfs_ops = &blk_mq_sysfs_ops, .release = blk_mq_ctx_sysfs_release, }; From 90b7198001f23ea37d3b46dc631bdaa2357a20b1 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 5 Aug 2021 10:41:59 -0700 Subject: [PATCH 038/129] blk-mq: Introduce the BLK_MQ_F_NO_SCHED_BY_DEFAULT flag elevator_get_default() uses the following algorithm to select an I/O scheduler from inside add_disk(): - In case of a single hardware queue or if sharing hardware queues across multiple request queues (BLK_MQ_F_TAG_HCTX_SHARED), use mq-deadline. - Otherwise, use 'none'. This is a good choice for most but not for all block drivers. Make it possible to override the selection of mq-deadline with a new flag, namely BLK_MQ_F_NO_SCHED_BY_DEFAULT. Cc: Christoph Hellwig Cc: Ming Lei Cc: Tetsuo Handa Cc: Martijn Coenen Cc: Jaegeuk Kim Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20210805174200.3250718-2-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/elevator.c | 3 +++ include/linux/blk-mq.h | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/block/elevator.c b/block/elevator.c index 52ada14cfe45..d0295e68f481 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -630,6 +630,9 @@ static inline bool elv_support_iosched(struct request_queue *q) */ static struct elevator_type *elevator_get_default(struct request_queue *q) { + if (q->tag_set && q->tag_set->flags & BLK_MQ_F_NO_SCHED_BY_DEFAULT) + return NULL; + if (q->nr_hw_queues != 1 && !blk_mq_is_sbitmap_shared(q->tag_set->flags)) return NULL; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 1d18447ebebc..22215db36122 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -404,7 +404,13 @@ enum { BLK_MQ_F_STACKING = 1 << 2, BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3, BLK_MQ_F_BLOCKING = 1 << 5, + /* Do not allow an I/O scheduler to be configured. */ BLK_MQ_F_NO_SCHED = 1 << 6, + /* + * Select 'none' during queue registration in case of a single hwq + * or shared hwqs instead of 'mq-deadline'. + */ + BLK_MQ_F_NO_SCHED_BY_DEFAULT = 1 << 7, BLK_MQ_F_ALLOC_POLICY_START_BIT = 8, BLK_MQ_F_ALLOC_POLICY_BITS = 1, From 2112f5c1330a671fa852051d85cb9eadc05d7eb7 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 5 Aug 2021 10:42:00 -0700 Subject: [PATCH 039/129] loop: Select I/O scheduler 'none' from inside add_disk() We noticed that the user interface of Android devices becomes very slow under memory pressure. This is because Android uses the zram driver on top of the loop driver for swapping, because under memory pressure the swap code alternates reads and writes quickly, because mq-deadline is the default scheduler for loop devices and because mq-deadline delays writes by five seconds for such a workload with default settings. Fix this by making the kernel select I/O scheduler 'none' from inside add_disk() for loop devices. This default can be overridden at any time from user space, e.g. via a udev rule. This approach has an advantage compared to changing the I/O scheduler from userspace from 'mq-deadline' into 'none', namely that synchronize_rcu() does not get called. This patch changes the default I/O scheduler for loop devices from 'mq-deadline' into 'none'. Additionally, this patch reduces the Android boot time on my test setup with 0.5 seconds compared to configuring the loop I/O scheduler from user space. Cc: Christoph Hellwig Cc: Ming Lei Cc: Tetsuo Handa Cc: Martijn Coenen Cc: Jaegeuk Kim Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20210805174200.3250718-3-bvanassche@acm.org Signed-off-by: Jens Axboe --- drivers/block/loop.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index f8486d9b75a4..fa1c298a8cfb 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -2333,7 +2333,8 @@ static int loop_add(int i) lo->tag_set.queue_depth = 128; lo->tag_set.numa_node = NUMA_NO_NODE; lo->tag_set.cmd_size = sizeof(struct loop_cmd); - lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING; + lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING | + BLK_MQ_F_NO_SCHED_BY_DEFAULT; lo->tag_set.driver_data = lo; err = blk_mq_alloc_tag_set(&lo->tag_set); From c66fd019713e9cf7d6f1243c378cd177d01fe18a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:41:40 +0200 Subject: [PATCH 040/129] block: make the block holder code optional Move the block holder code into a separate file as it is not in any way related to the other block_dev.c code, and add a new selectable config option for it so that we don't have to build it without any remapped drivers selected. The Kconfig symbol contains a _DEPRECATED suffix to match the comments added in commit 49731baa41df ("block: restore multiple bd_link_disk_holder() support"). Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20210804094147.459763-2-hch@lst.de Signed-off-by: Jens Axboe --- block/Kconfig | 4 ++ block/Makefile | 1 + block/holder.c | 139 ++++++++++++++++++++++++++++++++++++ drivers/md/Kconfig | 2 + drivers/md/bcache/Kconfig | 1 + fs/block_dev.c | 144 +------------------------------------- include/linux/blk_types.h | 2 +- include/linux/genhd.h | 4 +- 8 files changed, 151 insertions(+), 146 deletions(-) create mode 100644 block/holder.c diff --git a/block/Kconfig b/block/Kconfig index 15dfb7660645..bac87d773c54 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -241,4 +241,8 @@ config BLK_MQ_RDMA config BLK_PM def_bool BLOCK && PM +# do not use in new code +config BLOCK_HOLDER_DEPRECATED + bool + source "block/Kconfig.iosched" diff --git a/block/Makefile b/block/Makefile index c72592b4cf31..0d951adce796 100644 --- a/block/Makefile +++ b/block/Makefile @@ -41,3 +41,4 @@ obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o obj-$(CONFIG_BLK_PM) += blk-pm.o obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += keyslot-manager.o blk-crypto.o obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) += blk-crypto-fallback.o +obj-$(CONFIG_BLOCK_HOLDER_DEPRECATED) += holder.o diff --git a/block/holder.c b/block/holder.c new file mode 100644 index 000000000000..904a1dcd5c12 --- /dev/null +++ b/block/holder.c @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include + +struct bd_holder_disk { + struct list_head list; + struct gendisk *disk; + int refcnt; +}; + +static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev, + struct gendisk *disk) +{ + struct bd_holder_disk *holder; + + list_for_each_entry(holder, &bdev->bd_holder_disks, list) + if (holder->disk == disk) + return holder; + return NULL; +} + +static int add_symlink(struct kobject *from, struct kobject *to) +{ + return sysfs_create_link(from, to, kobject_name(to)); +} + +static void del_symlink(struct kobject *from, struct kobject *to) +{ + sysfs_remove_link(from, kobject_name(to)); +} + +/** + * bd_link_disk_holder - create symlinks between holding disk and slave bdev + * @bdev: the claimed slave bdev + * @disk: the holding disk + * + * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. + * + * This functions creates the following sysfs symlinks. + * + * - from "slaves" directory of the holder @disk to the claimed @bdev + * - from "holders" directory of the @bdev to the holder @disk + * + * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is + * passed to bd_link_disk_holder(), then: + * + * /sys/block/dm-0/slaves/sda --> /sys/block/sda + * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 + * + * The caller must have claimed @bdev before calling this function and + * ensure that both @bdev and @disk are valid during the creation and + * lifetime of these symlinks. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) +{ + struct bd_holder_disk *holder; + int ret = 0; + + mutex_lock(&bdev->bd_disk->open_mutex); + + WARN_ON_ONCE(!bdev->bd_holder); + + /* FIXME: remove the following once add_disk() handles errors */ + if (WARN_ON(!disk->slave_dir || !bdev->bd_holder_dir)) + goto out_unlock; + + holder = bd_find_holder_disk(bdev, disk); + if (holder) { + holder->refcnt++; + goto out_unlock; + } + + holder = kzalloc(sizeof(*holder), GFP_KERNEL); + if (!holder) { + ret = -ENOMEM; + goto out_unlock; + } + + INIT_LIST_HEAD(&holder->list); + holder->disk = disk; + holder->refcnt = 1; + + ret = add_symlink(disk->slave_dir, bdev_kobj(bdev)); + if (ret) + goto out_free; + + ret = add_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); + if (ret) + goto out_del; + /* + * bdev could be deleted beneath us which would implicitly destroy + * the holder directory. Hold on to it. + */ + kobject_get(bdev->bd_holder_dir); + + list_add(&holder->list, &bdev->bd_holder_disks); + goto out_unlock; + +out_del: + del_symlink(disk->slave_dir, bdev_kobj(bdev)); +out_free: + kfree(holder); +out_unlock: + mutex_unlock(&bdev->bd_disk->open_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(bd_link_disk_holder); + +/** + * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder() + * @bdev: the calimed slave bdev + * @disk: the holding disk + * + * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. + * + * CONTEXT: + * Might sleep. + */ +void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) +{ + struct bd_holder_disk *holder; + + mutex_lock(&bdev->bd_disk->open_mutex); + holder = bd_find_holder_disk(bdev, disk); + if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { + del_symlink(disk->slave_dir, bdev_kobj(bdev)); + del_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); + kobject_put(bdev->bd_holder_dir); + list_del_init(&holder->list); + kfree(holder); + } + mutex_unlock(&bdev->bd_disk->open_mutex); +} +EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 0602e82a9516..f821dae101a9 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -15,6 +15,7 @@ if MD config BLK_DEV_MD tristate "RAID support" + select BLOCK_HOLDER_DEPRECATED if SYSFS help This driver lets you combine several hard disk partitions into one logical block device. This can be used to simply append one @@ -201,6 +202,7 @@ config BLK_DEV_DM_BUILTIN config BLK_DEV_DM tristate "Device mapper support" + select BLOCK_HOLDER_DEPRECATED if SYSFS select BLK_DEV_DM_BUILTIN depends on DAX || DAX=n help diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig index d1ca4d059c20..cf3e8096942a 100644 --- a/drivers/md/bcache/Kconfig +++ b/drivers/md/bcache/Kconfig @@ -2,6 +2,7 @@ config BCACHE tristate "Block device as cache" + select BLOCK_HOLDER_DEPRECATED if SYSFS select CRC64 help Allows a block device to be used as cache for other devices; uses diff --git a/fs/block_dev.c b/fs/block_dev.c index 6658f40ae492..ae9651cad923 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -902,7 +902,7 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) bdev->bd_disk = disk; bdev->bd_partno = partno; bdev->bd_inode = inode; -#ifdef CONFIG_SYSFS +#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED INIT_LIST_HEAD(&bdev->bd_holder_disks); #endif bdev->bd_stats = alloc_percpu(struct disk_stats); @@ -1063,148 +1063,6 @@ void bd_abort_claiming(struct block_device *bdev, void *holder) } EXPORT_SYMBOL(bd_abort_claiming); -#ifdef CONFIG_SYSFS -struct bd_holder_disk { - struct list_head list; - struct gendisk *disk; - int refcnt; -}; - -static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev, - struct gendisk *disk) -{ - struct bd_holder_disk *holder; - - list_for_each_entry(holder, &bdev->bd_holder_disks, list) - if (holder->disk == disk) - return holder; - return NULL; -} - -static int add_symlink(struct kobject *from, struct kobject *to) -{ - return sysfs_create_link(from, to, kobject_name(to)); -} - -static void del_symlink(struct kobject *from, struct kobject *to) -{ - sysfs_remove_link(from, kobject_name(to)); -} - -/** - * bd_link_disk_holder - create symlinks between holding disk and slave bdev - * @bdev: the claimed slave bdev - * @disk: the holding disk - * - * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. - * - * This functions creates the following sysfs symlinks. - * - * - from "slaves" directory of the holder @disk to the claimed @bdev - * - from "holders" directory of the @bdev to the holder @disk - * - * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is - * passed to bd_link_disk_holder(), then: - * - * /sys/block/dm-0/slaves/sda --> /sys/block/sda - * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 - * - * The caller must have claimed @bdev before calling this function and - * ensure that both @bdev and @disk are valid during the creation and - * lifetime of these symlinks. - * - * CONTEXT: - * Might sleep. - * - * RETURNS: - * 0 on success, -errno on failure. - */ -int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) -{ - struct bd_holder_disk *holder; - int ret = 0; - - mutex_lock(&bdev->bd_disk->open_mutex); - - WARN_ON_ONCE(!bdev->bd_holder); - - /* FIXME: remove the following once add_disk() handles errors */ - if (WARN_ON(!disk->slave_dir || !bdev->bd_holder_dir)) - goto out_unlock; - - holder = bd_find_holder_disk(bdev, disk); - if (holder) { - holder->refcnt++; - goto out_unlock; - } - - holder = kzalloc(sizeof(*holder), GFP_KERNEL); - if (!holder) { - ret = -ENOMEM; - goto out_unlock; - } - - INIT_LIST_HEAD(&holder->list); - holder->disk = disk; - holder->refcnt = 1; - - ret = add_symlink(disk->slave_dir, bdev_kobj(bdev)); - if (ret) - goto out_free; - - ret = add_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); - if (ret) - goto out_del; - /* - * bdev could be deleted beneath us which would implicitly destroy - * the holder directory. Hold on to it. - */ - kobject_get(bdev->bd_holder_dir); - - list_add(&holder->list, &bdev->bd_holder_disks); - goto out_unlock; - -out_del: - del_symlink(disk->slave_dir, bdev_kobj(bdev)); -out_free: - kfree(holder); -out_unlock: - mutex_unlock(&bdev->bd_disk->open_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(bd_link_disk_holder); - -/** - * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder() - * @bdev: the calimed slave bdev - * @disk: the holding disk - * - * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. - * - * CONTEXT: - * Might sleep. - */ -void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) -{ - struct bd_holder_disk *holder; - - mutex_lock(&bdev->bd_disk->open_mutex); - - holder = bd_find_holder_disk(bdev, disk); - - if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { - del_symlink(disk->slave_dir, bdev_kobj(bdev)); - del_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); - kobject_put(bdev->bd_holder_dir); - list_del_init(&holder->list); - kfree(holder); - } - - mutex_unlock(&bdev->bd_disk->open_mutex); -} -EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); -#endif - static void blkdev_flush_mapping(struct block_device *bdev) { WARN_ON_ONCE(bdev->bd_holders); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 290f9061b29a..7a4e139d24ef 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -34,7 +34,7 @@ struct block_device { void * bd_holder; int bd_holders; bool bd_write_holder; -#ifdef CONFIG_SYSFS +#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED struct list_head bd_holder_disks; #endif struct kobject *bd_holder_dir; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 849486de81c6..e21a91c16a79 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -318,7 +318,7 @@ void set_capacity(struct gendisk *disk, sector_t size); int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long); long compat_blkdev_ioctl(struct file *, unsigned, unsigned long); -#ifdef CONFIG_SYSFS +#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk); void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk); #else @@ -331,7 +331,7 @@ static inline void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) { } -#endif /* CONFIG_SYSFS */ +#endif /* CONFIG_BLOCK_HOLDER_DEPRECATED */ dev_t part_devt(struct gendisk *disk, u8 partno); void inc_diskseq(struct gendisk *disk); From fbd9a39542ecdd2ade55869c13856b2590db3df8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:41:41 +0200 Subject: [PATCH 041/129] block: remove the extra kobject reference in bd_link_disk_holder Since commit 0d02129e76ed ("block: merge struct block_device and struct hd_struct") there is no way for the bdev to go away as long as there is a holder, so remove the extra references. Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20210804094147.459763-3-hch@lst.de Signed-off-by: Jens Axboe --- block/holder.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/block/holder.c b/block/holder.c index 904a1dcd5c12..960654a71342 100644 --- a/block/holder.c +++ b/block/holder.c @@ -92,11 +92,6 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) ret = add_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); if (ret) goto out_del; - /* - * bdev could be deleted beneath us which would implicitly destroy - * the holder directory. Hold on to it. - */ - kobject_get(bdev->bd_holder_dir); list_add(&holder->list, &bdev->bd_holder_disks); goto out_unlock; @@ -130,7 +125,6 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { del_symlink(disk->slave_dir, bdev_kobj(bdev)); del_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); - kobject_put(bdev->bd_holder_dir); list_del_init(&holder->list); kfree(holder); } From 0dbcfe247f22a6d73302dfa691c48b3c14d31c4c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:41:42 +0200 Subject: [PATCH 042/129] block: look up holders by bdev Invert they way the holder relations are tracked. This very slightly reduces the memory overhead for partitioned devices. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210804094147.459763-4-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 4 +++- block/holder.c | 18 +++++++++--------- fs/block_dev.c | 3 --- include/linux/blk_types.h | 3 --- include/linux/genhd.h | 4 +++- 5 files changed, 15 insertions(+), 17 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index a4817e42f3a3..cd4eab744667 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1289,7 +1289,9 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) disk_to_dev(disk)->type = &disk_type; device_initialize(disk_to_dev(disk)); inc_diskseq(disk); - +#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED + INIT_LIST_HEAD(&disk->slave_bdevs); +#endif return disk; out_destroy_part_tbl: diff --git a/block/holder.c b/block/holder.c index 960654a71342..11e65d99a9fb 100644 --- a/block/holder.c +++ b/block/holder.c @@ -3,7 +3,7 @@ struct bd_holder_disk { struct list_head list; - struct gendisk *disk; + struct block_device *bdev; int refcnt; }; @@ -12,8 +12,8 @@ static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev, { struct bd_holder_disk *holder; - list_for_each_entry(holder, &bdev->bd_holder_disks, list) - if (holder->disk == disk) + list_for_each_entry(holder, &disk->slave_bdevs, list) + if (holder->bdev == bdev) return holder; return NULL; } @@ -61,7 +61,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) struct bd_holder_disk *holder; int ret = 0; - mutex_lock(&bdev->bd_disk->open_mutex); + mutex_lock(&disk->open_mutex); WARN_ON_ONCE(!bdev->bd_holder); @@ -82,7 +82,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) } INIT_LIST_HEAD(&holder->list); - holder->disk = disk; + holder->bdev = bdev; holder->refcnt = 1; ret = add_symlink(disk->slave_dir, bdev_kobj(bdev)); @@ -93,7 +93,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) if (ret) goto out_del; - list_add(&holder->list, &bdev->bd_holder_disks); + list_add(&holder->list, &disk->slave_bdevs); goto out_unlock; out_del: @@ -101,7 +101,7 @@ out_del: out_free: kfree(holder); out_unlock: - mutex_unlock(&bdev->bd_disk->open_mutex); + mutex_unlock(&disk->open_mutex); return ret; } EXPORT_SYMBOL_GPL(bd_link_disk_holder); @@ -120,7 +120,7 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) { struct bd_holder_disk *holder; - mutex_lock(&bdev->bd_disk->open_mutex); + mutex_lock(&disk->open_mutex); holder = bd_find_holder_disk(bdev, disk); if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { del_symlink(disk->slave_dir, bdev_kobj(bdev)); @@ -128,6 +128,6 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) list_del_init(&holder->list); kfree(holder); } - mutex_unlock(&bdev->bd_disk->open_mutex); + mutex_unlock(&disk->open_mutex); } EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); diff --git a/fs/block_dev.c b/fs/block_dev.c index ae9651cad923..cc801767a377 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -902,9 +902,6 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) bdev->bd_disk = disk; bdev->bd_partno = partno; bdev->bd_inode = inode; -#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED - INIT_LIST_HEAD(&bdev->bd_holder_disks); -#endif bdev->bd_stats = alloc_percpu(struct disk_stats); if (!bdev->bd_stats) { iput(inode); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 7a4e139d24ef..e92735655684 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -34,9 +34,6 @@ struct block_device { void * bd_holder; int bd_holders; bool bd_write_holder; -#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED - struct list_head bd_holder_disks; -#endif struct kobject *bd_holder_dir; u8 bd_partno; spinlock_t bd_size_lock; /* for bd_inode->i_size updates */ diff --git a/include/linux/genhd.h b/include/linux/genhd.h index e21a91c16a79..0721807d76ee 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -159,7 +159,9 @@ struct gendisk { unsigned open_partitions; /* number of open partitions */ struct kobject *slave_dir; - +#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED + struct list_head slave_bdevs; +#endif struct timer_rand_state *random; atomic_t sync_io; /* RAID */ struct disk_events *ev; From d626338735909bc2b2e7cafc332f44ed41cfdeee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:41:43 +0200 Subject: [PATCH 043/129] block: support delayed holder registration device mapper needs to register holders before it is ready to do I/O. Currently it does so by registering the disk early, which can leave the disk and queue in a weird half state where the queue is registered with the disk, except for sysfs and the elevator. And this state has been a bit promlematic before, and will get more so when sorting out the responsibilities between the queue and the disk. Support registering holders on an initialized but not registered disk instead by delaying the sysfs registration until the disk is registered. Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20210804094147.459763-5-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 10 +++++++ block/holder.c | 68 ++++++++++++++++++++++++++++++++----------- include/linux/genhd.h | 5 ++++ 3 files changed, 66 insertions(+), 17 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index cd4eab744667..db916f779077 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -447,6 +447,16 @@ static void register_disk(struct device *parent, struct gendisk *disk, kobject_create_and_add("holders", &ddev->kobj); disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); + /* + * XXX: this is a mess, can't wait for real error handling in add_disk. + * Make sure ->slave_dir is NULL if we failed some of the registration + * so that the cleanup in bd_unlink_disk_holder works properly. + */ + if (bd_register_pending_holders(disk) < 0) { + kobject_put(disk->slave_dir); + disk->slave_dir = NULL; + } + if (disk->flags & GENHD_FL_HIDDEN) return; diff --git a/block/holder.c b/block/holder.c index 11e65d99a9fb..4568cc4f6827 100644 --- a/block/holder.c +++ b/block/holder.c @@ -28,6 +28,19 @@ static void del_symlink(struct kobject *from, struct kobject *to) sysfs_remove_link(from, kobject_name(to)); } +static int __link_disk_holder(struct block_device *bdev, struct gendisk *disk) +{ + int ret; + + ret = add_symlink(disk->slave_dir, bdev_kobj(bdev)); + if (ret) + return ret; + ret = add_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); + if (ret) + del_symlink(disk->slave_dir, bdev_kobj(bdev)); + return ret; +} + /** * bd_link_disk_holder - create symlinks between holding disk and slave bdev * @bdev: the claimed slave bdev @@ -66,7 +79,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) WARN_ON_ONCE(!bdev->bd_holder); /* FIXME: remove the following once add_disk() handles errors */ - if (WARN_ON(!disk->slave_dir || !bdev->bd_holder_dir)) + if (WARN_ON(!bdev->bd_holder_dir)) goto out_unlock; holder = bd_find_holder_disk(bdev, disk); @@ -84,28 +97,28 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) INIT_LIST_HEAD(&holder->list); holder->bdev = bdev; holder->refcnt = 1; - - ret = add_symlink(disk->slave_dir, bdev_kobj(bdev)); - if (ret) - goto out_free; - - ret = add_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); - if (ret) - goto out_del; + if (disk->slave_dir) { + ret = __link_disk_holder(bdev, disk); + if (ret) { + kfree(holder); + goto out_unlock; + } + } list_add(&holder->list, &disk->slave_bdevs); - goto out_unlock; - -out_del: - del_symlink(disk->slave_dir, bdev_kobj(bdev)); -out_free: - kfree(holder); out_unlock: mutex_unlock(&disk->open_mutex); return ret; } EXPORT_SYMBOL_GPL(bd_link_disk_holder); +static void __unlink_disk_holder(struct block_device *bdev, + struct gendisk *disk) +{ + del_symlink(disk->slave_dir, bdev_kobj(bdev)); + del_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); +} + /** * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder() * @bdev: the calimed slave bdev @@ -123,11 +136,32 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) mutex_lock(&disk->open_mutex); holder = bd_find_holder_disk(bdev, disk); if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { - del_symlink(disk->slave_dir, bdev_kobj(bdev)); - del_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); + if (disk->slave_dir) + __unlink_disk_holder(bdev, disk); list_del_init(&holder->list); kfree(holder); } mutex_unlock(&disk->open_mutex); } EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); + +int bd_register_pending_holders(struct gendisk *disk) +{ + struct bd_holder_disk *holder; + int ret; + + mutex_lock(&disk->open_mutex); + list_for_each_entry(holder, &disk->slave_bdevs, list) { + ret = __link_disk_holder(holder->bdev, disk); + if (ret) + goto out_undo; + } + mutex_unlock(&disk->open_mutex); + return 0; + +out_undo: + list_for_each_entry_continue_reverse(holder, &disk->slave_bdevs, list) + __unlink_disk_holder(holder->bdev, disk); + mutex_unlock(&disk->open_mutex); + return ret; +} diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 0721807d76ee..80952f038d79 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -323,6 +323,7 @@ long compat_blkdev_ioctl(struct file *, unsigned, unsigned long); #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk); void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk); +int bd_register_pending_holders(struct gendisk *disk); #else static inline int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) @@ -333,6 +334,10 @@ static inline void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) { } +static inline int bd_register_pending_holders(struct gendisk *disk) +{ + return 0; +} #endif /* CONFIG_BLOCK_HOLDER_DEPRECATED */ dev_t part_devt(struct gendisk *disk, u8 partno); From 74a2b6ec9380959546d95ecc01a8fe6c7157add9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:41:44 +0200 Subject: [PATCH 044/129] dm: cleanup cleanup_mapped_device md->queue is now always set when md->disk is set, so simplify the conditionals a bit. Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20210804094147.459763-6-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/dm.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 2c5f9e585211..7971ec8ce677 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1694,13 +1694,9 @@ static void cleanup_mapped_device(struct mapped_device *md) md->disk->private_data = NULL; spin_unlock(&_minor_lock); del_gendisk(md->disk); - } - - if (md->queue) dm_queue_destroy_keyslot_manager(md->queue); - - if (md->disk) blk_cleanup_disk(md->disk); + } cleanup_srcu_struct(&md->io_barrier); From ba30585936b0b88f0fb2b19be279b346a6cc87eb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:41:45 +0200 Subject: [PATCH 045/129] dm: move setting md->type into dm_setup_md_queue Move setting md->type from both callers into dm_setup_md_queue. This ensures that md->type is only set to a valid value after the queue has been fully setup, something we'll rely on future changes. Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20210804094147.459763-7-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/dm-ioctl.c | 4 ---- drivers/md/dm.c | 5 +++-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 2209cbcd84db..2575074a2204 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1436,9 +1436,6 @@ static int table_load(struct file *filp, struct dm_ioctl *param, size_t param_si } if (dm_get_md_type(md) == DM_TYPE_NONE) { - /* Initial table load: acquire type of table. */ - dm_set_md_type(md, dm_table_get_type(t)); - /* setup md->queue to reflect md's type (may block) */ r = dm_setup_md_queue(md, t); if (r) { @@ -2187,7 +2184,6 @@ int __init dm_early_create(struct dm_ioctl *dmi, if (r) goto err_destroy_table; - md->type = dm_table_get_type(t); /* setup md->queue to reflect md's type (may block) */ r = dm_setup_md_queue(md, t); if (r) { diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 7971ec8ce677..f003bd5b93ce 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2052,9 +2052,9 @@ EXPORT_SYMBOL_GPL(dm_get_queue_limits); */ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) { - int r; + enum dm_queue_mode type = dm_table_get_type(t); struct queue_limits limits; - enum dm_queue_mode type = dm_get_md_type(md); + int r; switch (type) { case DM_TYPE_REQUEST_BASED: @@ -2081,6 +2081,7 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) r = dm_table_set_restrictions(t, md->queue, &limits); if (r) return r; + md->type = type; blk_register_queue(md->disk); From 89f871af1b26d98d983cba7ed0e86effa45ba5f8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:41:46 +0200 Subject: [PATCH 046/129] dm: delay registering the gendisk device mapper is currently the only outlier that tries to call register_disk after add_disk, leading to fairly inconsistent state of these block layer data structures. Instead change device-mapper to just register the gendisk later now that the holder mechanism can cope with that. Note that this introduces a user visible change: the dm kobject is now only visible after the initial table has been loaded. Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20210804094147.459763-8-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/dm-rq.c | 1 - drivers/md/dm.c | 25 ++++++++++++------------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 0dbd48cbdff9..5b95eea517d1 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -559,7 +559,6 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t) err = blk_mq_init_allocated_queue(md->tag_set, md->queue); if (err) goto out_tag_set; - elevator_init_mq(md->queue); return 0; out_tag_set: diff --git a/drivers/md/dm.c b/drivers/md/dm.c index f003bd5b93ce..7981b7287628 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1693,7 +1693,10 @@ static void cleanup_mapped_device(struct mapped_device *md) spin_lock(&_minor_lock); md->disk->private_data = NULL; spin_unlock(&_minor_lock); - del_gendisk(md->disk); + if (dm_get_md_type(md) != DM_TYPE_NONE) { + dm_sysfs_exit(md); + del_gendisk(md->disk); + } dm_queue_destroy_keyslot_manager(md->queue); blk_cleanup_disk(md->disk); } @@ -1788,7 +1791,6 @@ static struct mapped_device *alloc_dev(int minor) goto bad; } - add_disk_no_queue_reg(md->disk); format_dev_t(md->name, MKDEV(_major, minor)); md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0); @@ -1989,19 +1991,12 @@ static struct dm_table *__unbind(struct mapped_device *md) */ int dm_create(int minor, struct mapped_device **result) { - int r; struct mapped_device *md; md = alloc_dev(minor); if (!md) return -ENXIO; - r = dm_sysfs_init(md); - if (r) { - free_dev(md); - return r; - } - *result = md; return 0; } @@ -2081,10 +2076,15 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) r = dm_table_set_restrictions(t, md->queue, &limits); if (r) return r; + + add_disk(md->disk); + + r = dm_sysfs_init(md); + if (r) { + del_gendisk(md->disk); + return r; + } md->type = type; - - blk_register_queue(md->disk); - return 0; } @@ -2190,7 +2190,6 @@ static void __dm_destroy(struct mapped_device *md, bool wait) DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)", dm_device_name(md), atomic_read(&md->holders)); - dm_sysfs_exit(md); dm_table_destroy(__unbind(md)); free_dev(md); } From d1254a8749711e0d7441036a74ce592341f89697 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:41:47 +0200 Subject: [PATCH 047/129] block: remove support for delayed queue registrations Now that device mapper has been changed to register the disk once it is fully ready all this code is unused. Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20210804094147.459763-9-hch@lst.de Signed-off-by: Jens Axboe --- block/elevator.c | 1 - block/genhd.c | 29 +++++++---------------------- include/linux/genhd.h | 6 ------ 3 files changed, 7 insertions(+), 29 deletions(-) diff --git a/block/elevator.c b/block/elevator.c index d0295e68f481..9beaafd238e0 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -705,7 +705,6 @@ void elevator_init_mq(struct request_queue *q) elevator_put(e); } } -EXPORT_SYMBOL_GPL(elevator_init_mq); /* only for dm-rq */ /* * switch to new_e io scheduler. be careful not to introduce deadlocks - diff --git a/block/genhd.c b/block/genhd.c index db916f779077..b0b6e0caa389 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -475,20 +475,20 @@ static void register_disk(struct device *parent, struct gendisk *disk, } /** - * __device_add_disk - add disk information to kernel list + * device_add_disk - add disk information to kernel list * @parent: parent device for the disk * @disk: per-device partitioning information * @groups: Additional per-device sysfs groups - * @register_queue: register the queue if set to true * * This function registers the partitioning information in @disk * with the kernel. * * FIXME: error handling */ -static void __device_add_disk(struct device *parent, struct gendisk *disk, - const struct attribute_group **groups, - bool register_queue) + +void device_add_disk(struct device *parent, struct gendisk *disk, + const struct attribute_group **groups) + { int ret; @@ -498,8 +498,7 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, * elevator if one is needed, that is, for devices requesting queue * registration. */ - if (register_queue) - elevator_init_mq(disk->queue); + elevator_init_mq(disk->queue); /* * If the driver provides an explicit major number it also must provide @@ -553,8 +552,7 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, bdev_add(disk->part0, dev->devt); } register_disk(parent, disk, groups); - if (register_queue) - blk_register_queue(disk); + blk_register_queue(disk); /* * Take an extra ref on queue which will be put on disk_release() @@ -568,21 +566,8 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, disk_add_events(disk); blk_integrity_add(disk); } - -void device_add_disk(struct device *parent, struct gendisk *disk, - const struct attribute_group **groups) - -{ - __device_add_disk(parent, disk, groups, true); -} EXPORT_SYMBOL(device_add_disk); -void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk) -{ - __device_add_disk(parent, disk, NULL, false); -} -EXPORT_SYMBOL(device_add_disk_no_queue_reg); - /** * del_gendisk - remove the gendisk * @disk: the struct gendisk to remove diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 80952f038d79..473d93c6ebda 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -219,12 +219,6 @@ static inline void add_disk(struct gendisk *disk) { device_add_disk(NULL, disk, NULL); } -extern void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk); -static inline void add_disk_no_queue_reg(struct gendisk *disk) -{ - device_add_disk_no_queue_reg(NULL, disk); -} - extern void del_gendisk(struct gendisk *gp); void set_disk_ro(struct gendisk *disk, bool read_only); From 5ed964f8e54eb3191b8b7b45aeb52672a0c995dc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Aug 2021 16:17:40 +0200 Subject: [PATCH 048/129] mm: hide laptop_mode_wb_timer entirely behind the BDI API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Don't leak the detaіls of the timer into the block layer, instead initialize the timer in bdi_alloc and delete it in bdi_unregister. Note that this means the timer is initialized (but not armed) for non-block queues as well now. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20210809141744.1203023-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-core.c | 5 ----- mm/backing-dev.c | 3 +++ mm/page-writeback.c | 2 -- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 04477697ee4b..5897bc37467d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -394,10 +394,7 @@ void blk_cleanup_queue(struct request_queue *q) /* for synchronous bio-based driver finish in-flight integrity i/o */ blk_flush_integrity(); - /* @q won't process any more request, flush async actions */ - del_timer_sync(&q->backing_dev_info->laptop_mode_wb_timer); blk_sync_queue(q); - if (queue_is_mq(q)) blk_mq_exit_queue(q); @@ -546,8 +543,6 @@ struct request_queue *blk_alloc_queue(int node_id) atomic_set(&q->nr_active_requests_shared_sbitmap, 0); - timer_setup(&q->backing_dev_info->laptop_mode_wb_timer, - laptop_mode_timer_fn, 0); timer_setup(&q->timeout, blk_rq_timed_out_timer, 0); INIT_WORK(&q->timeout_work, blk_timeout_work); INIT_LIST_HEAD(&q->icq_list); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index f5561ea7d90a..cd06dca232c3 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -807,6 +807,7 @@ struct backing_dev_info *bdi_alloc(int node_id) bdi->capabilities = BDI_CAP_WRITEBACK | BDI_CAP_WRITEBACK_ACCT; bdi->ra_pages = VM_READAHEAD_PAGES; bdi->io_pages = VM_READAHEAD_PAGES; + timer_setup(&bdi->laptop_mode_wb_timer, laptop_mode_timer_fn, 0); return bdi; } EXPORT_SYMBOL(bdi_alloc); @@ -928,6 +929,8 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi) void bdi_unregister(struct backing_dev_info *bdi) { + del_timer_sync(&bdi->laptop_mode_wb_timer); + /* make sure nobody finds us on the bdi_list anymore */ bdi_remove_from_list(bdi); wb_shutdown(&bdi->wb); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 9f63548f247c..c12f67cbfa19 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2010,7 +2010,6 @@ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, return ret; } -#ifdef CONFIG_BLOCK void laptop_mode_timer_fn(struct timer_list *t) { struct backing_dev_info *backing_dev_info = @@ -2045,7 +2044,6 @@ void laptop_sync_completion(void) rcu_read_unlock(); } -#endif /* * If ratelimit_pages is too high then we can get into dirty-data overload From 471aa704db4904f7af5a50019ca3b5b018c0cf62 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Aug 2021 16:17:41 +0200 Subject: [PATCH 049/129] block: pass a gendisk to blk_queue_update_readahead .. and rename the function to disk_update_readahead. This is in preparation for moving the BDI from the request_queue to the gendisk. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20210809141744.1203023-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 8 +++++--- block/blk-sysfs.c | 2 +- drivers/block/drbd/drbd_nl.c | 2 +- drivers/md/dm-table.c | 2 +- drivers/nvme/host/core.c | 2 +- include/linux/blkdev.h | 2 +- 6 files changed, 10 insertions(+), 8 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index 109012719aa0..44aaef9bf736 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -380,8 +380,10 @@ void blk_queue_alignment_offset(struct request_queue *q, unsigned int offset) } EXPORT_SYMBOL(blk_queue_alignment_offset); -void blk_queue_update_readahead(struct request_queue *q) +void disk_update_readahead(struct gendisk *disk) { + struct request_queue *q = disk->queue; + /* * For read-ahead of large files to be effective, we need to read ahead * at least twice the optimal I/O size. @@ -391,7 +393,7 @@ void blk_queue_update_readahead(struct request_queue *q) q->backing_dev_info->io_pages = queue_max_sectors(q) >> (PAGE_SHIFT - 9); } -EXPORT_SYMBOL_GPL(blk_queue_update_readahead); +EXPORT_SYMBOL_GPL(disk_update_readahead); /** * blk_limits_io_min - set minimum request size for a device @@ -665,7 +667,7 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, pr_notice("%s: Warning: Device %pg is misaligned\n", disk->disk_name, bdev); - blk_queue_update_readahead(disk->queue); + disk_update_readahead(disk); } EXPORT_SYMBOL(disk_stack_limits); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 370d83c18057..3af2ab7d5086 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -866,7 +866,7 @@ int blk_register_queue(struct gendisk *disk) "%s is registering an already registered queue\n", kobject_name(&dev->kobj)); - blk_queue_update_readahead(q); + disk_update_readahead(disk); ret = blk_trace_init_sysfs(dev); if (ret) diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index e7d0e637e632..44ccf8b4f4b2 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1364,7 +1364,7 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi if (b) { blk_stack_limits(&q->limits, &b->limits, 0); - blk_queue_update_readahead(q); + disk_update_readahead(device->vdisk); } fixup_discard_if_not_supported(q); fixup_write_zeroes(device, q); diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 0543cdf89e92..b03eabc1ed7c 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -2076,7 +2076,7 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, } dm_update_keyslot_manager(q, t); - blk_queue_update_readahead(q); + disk_update_readahead(t->md->disk); return 0; } diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index dfd9dec0c1f6..f6c0a59c4b53 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1890,7 +1890,7 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id) nvme_update_disk_info(ns->head->disk, ns, id); blk_stack_limits(&ns->head->disk->queue->limits, &ns->queue->limits, 0); - blk_queue_update_readahead(ns->head->disk->queue); + disk_update_readahead(ns->head->disk); blk_mq_unfreeze_queue(ns->head->disk->queue); } return 0; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b5c033cf5f26..ac3642c88a4d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1139,7 +1139,7 @@ void blk_queue_zone_write_granularity(struct request_queue *q, unsigned int size); extern void blk_queue_alignment_offset(struct request_queue *q, unsigned int alignment); -void blk_queue_update_readahead(struct request_queue *q); +void disk_update_readahead(struct gendisk *disk); extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min); extern void blk_queue_io_min(struct request_queue *q, unsigned int min); extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt); From 1008162b2782a3624d12b0aee8da58bc75d12e19 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Aug 2021 16:17:42 +0200 Subject: [PATCH 050/129] block: add a queue_has_disk helper Add a helper to check if a gendisk is associated with a request_queue. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20210809141744.1203023-4-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ac3642c88a4d..96f3d9617cd8 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -664,6 +664,7 @@ extern void blk_clear_pm_only(struct request_queue *q); dma_map_page_attrs(dev, (bv)->bv_page, (bv)->bv_offset, (bv)->bv_len, \ (dir), (attrs)) +#define queue_has_disk(q) ((q)->kobj.parent != NULL) #define queue_to_disk(q) (dev_to_disk(kobj_to_dev((q)->kobj.parent))) static inline bool queue_is_mq(struct request_queue *q) From edb0872f44ec9976ea6d052cb4b93cd2d23ac2ba Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Aug 2021 16:17:43 +0200 Subject: [PATCH 051/129] block: move the bdi from the request_queue to the gendisk The backing device information only makes sense for file system I/O, and thus belongs into the gendisk and not the lower level request_queue structure. Move it there. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20210809141744.1203023-5-hch@lst.de Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 4 ++-- block/blk-cgroup.c | 7 +++---- block/blk-core.c | 13 +++---------- block/blk-mq.c | 2 +- block/blk-settings.c | 14 +++++++++----- block/blk-sysfs.c | 26 ++++++++++++-------------- block/blk-wbt.c | 10 +++++----- block/genhd.c | 23 ++++++++++++++--------- drivers/block/drbd/drbd_req.c | 5 ++--- drivers/block/pktcdvd.c | 8 +++----- fs/block_dev.c | 4 ++-- fs/fat/fatent.c | 1 + include/linux/blkdev.h | 3 --- include/linux/genhd.h | 1 + 14 files changed, 58 insertions(+), 63 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 727955918563..1576e858d3a5 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5266,8 +5266,8 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) switch (ioprio_class) { default: pr_err("bdi %s: bfq: bad prio class %d\n", - bdi_dev_name(bfqq->bfqd->queue->backing_dev_info), - ioprio_class); + bdi_dev_name(queue_to_disk(bfqq->bfqd->queue)->bdi), + ioprio_class); fallthrough; case IOPRIO_CLASS_NONE: /* diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 575d7a2e7203..db034e35ae20 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -489,10 +489,9 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, const char *blkg_dev_name(struct blkcg_gq *blkg) { - /* some drivers (floppy) instantiate a queue w/o disk registered */ - if (blkg->q->backing_dev_info->dev) - return bdi_dev_name(blkg->q->backing_dev_info); - return NULL; + if (!queue_has_disk(blkg->q) || !queue_to_disk(blkg->q)->bdi->dev) + return NULL; + return bdi_dev_name(queue_to_disk(blkg->q)->bdi); } /** diff --git a/block/blk-core.c b/block/blk-core.c index 5897bc37467d..0874bc2fcdb4 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -14,7 +14,6 @@ */ #include #include -#include #include #include #include @@ -531,13 +530,9 @@ struct request_queue *blk_alloc_queue(int node_id) if (ret) goto fail_id; - q->backing_dev_info = bdi_alloc(node_id); - if (!q->backing_dev_info) - goto fail_split; - q->stats = blk_alloc_queue_stats(); if (!q->stats) - goto fail_stats; + goto fail_split; q->node = node_id; @@ -567,7 +562,7 @@ struct request_queue *blk_alloc_queue(int node_id) if (percpu_ref_init(&q->q_usage_counter, blk_queue_usage_counter_release, PERCPU_REF_INIT_ATOMIC, GFP_KERNEL)) - goto fail_bdi; + goto fail_stats; if (blkcg_init_queue(q)) goto fail_ref; @@ -580,10 +575,8 @@ struct request_queue *blk_alloc_queue(int node_id) fail_ref: percpu_ref_exit(&q->q_usage_counter); -fail_bdi: - blk_free_queue_stats(q->stats); fail_stats: - bdi_put(q->backing_dev_info); + blk_free_queue_stats(q->stats); fail_split: bioset_exit(&q->bio_split); fail_id: diff --git a/block/blk-mq.c b/block/blk-mq.c index 2c4ac51e54eb..d2725f94491d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -525,7 +525,7 @@ void blk_mq_free_request(struct request *rq) __blk_mq_dec_active_requests(hctx); if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) - laptop_io_completion(q->backing_dev_info); + laptop_io_completion(queue_to_disk(q)->bdi); rq_qos_done(q, rq); diff --git a/block/blk-settings.c b/block/blk-settings.c index 44aaef9bf736..3613d2cc0688 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -140,7 +141,9 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto limits->logical_block_size >> SECTOR_SHIFT); limits->max_sectors = max_sectors; - q->backing_dev_info->io_pages = max_sectors >> (PAGE_SHIFT - 9); + if (!queue_has_disk(q)) + return; + queue_to_disk(q)->bdi->io_pages = max_sectors >> (PAGE_SHIFT - 9); } EXPORT_SYMBOL(blk_queue_max_hw_sectors); @@ -388,10 +391,9 @@ void disk_update_readahead(struct gendisk *disk) * For read-ahead of large files to be effective, we need to read ahead * at least twice the optimal I/O size. */ - q->backing_dev_info->ra_pages = + disk->bdi->ra_pages = max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES); - q->backing_dev_info->io_pages = - queue_max_sectors(q) >> (PAGE_SHIFT - 9); + disk->bdi->io_pages = queue_max_sectors(q) >> (PAGE_SHIFT - 9); } EXPORT_SYMBOL_GPL(disk_update_readahead); @@ -473,7 +475,9 @@ EXPORT_SYMBOL(blk_limits_io_opt); void blk_queue_io_opt(struct request_queue *q, unsigned int opt) { blk_limits_io_opt(&q->limits, opt); - q->backing_dev_info->ra_pages = + if (!queue_has_disk(q)) + return; + queue_to_disk(q)->bdi->ra_pages = max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES); } EXPORT_SYMBOL(blk_queue_io_opt); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 3af2ab7d5086..1832587dce3a 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -88,9 +88,11 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) static ssize_t queue_ra_show(struct request_queue *q, char *page) { - unsigned long ra_kb = q->backing_dev_info->ra_pages << - (PAGE_SHIFT - 10); + unsigned long ra_kb; + if (!queue_has_disk(q)) + return -EINVAL; + ra_kb = queue_to_disk(q)->bdi->ra_pages << (PAGE_SHIFT - 10); return queue_var_show(ra_kb, page); } @@ -98,13 +100,14 @@ static ssize_t queue_ra_store(struct request_queue *q, const char *page, size_t count) { unsigned long ra_kb; - ssize_t ret = queue_var_store(&ra_kb, page, count); + ssize_t ret; + if (!queue_has_disk(q)) + return -EINVAL; + ret = queue_var_store(&ra_kb, page, count); if (ret < 0) return ret; - - q->backing_dev_info->ra_pages = ra_kb >> (PAGE_SHIFT - 10); - + queue_to_disk(q)->bdi->ra_pages = ra_kb >> (PAGE_SHIFT - 10); return ret; } @@ -251,7 +254,9 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) spin_lock_irq(&q->queue_lock); q->limits.max_sectors = max_sectors_kb << 1; - q->backing_dev_info->io_pages = max_sectors_kb >> (PAGE_SHIFT - 10); + if (queue_has_disk(q)) + queue_to_disk(q)->bdi->io_pages = + max_sectors_kb >> (PAGE_SHIFT - 10); spin_unlock_irq(&q->queue_lock); return ret; @@ -766,13 +771,6 @@ static void blk_exit_queue(struct request_queue *q) * e.g. blkcg_print_blkgs() to crash. */ blkcg_exit_queue(q); - - /* - * Since the cgroup code may dereference the @q->backing_dev_info - * pointer, only decrease its reference count after having removed the - * association with the block cgroup controller. - */ - bdi_put(q->backing_dev_info); } /** diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 3ed71b8da887..31086afaad9c 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -97,7 +97,7 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) */ static bool wb_recent_wait(struct rq_wb *rwb) { - struct bdi_writeback *wb = &rwb->rqos.q->backing_dev_info->wb; + struct bdi_writeback *wb = &queue_to_disk(rwb->rqos.q)->bdi->wb; return time_before(jiffies, wb->dirty_sleep + HZ); } @@ -234,7 +234,7 @@ enum { static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) { - struct backing_dev_info *bdi = rwb->rqos.q->backing_dev_info; + struct backing_dev_info *bdi = queue_to_disk(rwb->rqos.q)->bdi; struct rq_depth *rqd = &rwb->rq_depth; u64 thislat; @@ -287,7 +287,7 @@ static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) static void rwb_trace_step(struct rq_wb *rwb, const char *msg) { - struct backing_dev_info *bdi = rwb->rqos.q->backing_dev_info; + struct backing_dev_info *bdi = queue_to_disk(rwb->rqos.q)->bdi; struct rq_depth *rqd = &rwb->rq_depth; trace_wbt_step(bdi, msg, rqd->scale_step, rwb->cur_win_nsec, @@ -359,8 +359,8 @@ static void wb_timer_fn(struct blk_stat_callback *cb) status = latency_exceeded(rwb, cb->stat); - trace_wbt_timer(rwb->rqos.q->backing_dev_info, status, rqd->scale_step, - inflight); + trace_wbt_timer(queue_to_disk(rwb->rqos.q)->bdi, status, + rqd->scale_step, inflight); /* * If we exceeded the latency target, step down. If we did not, diff --git a/block/genhd.c b/block/genhd.c index b0b6e0caa389..f8def1129501 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -466,10 +466,9 @@ static void register_disk(struct device *parent, struct gendisk *disk, dev_set_uevent_suppress(ddev, 0); disk_uevent(disk, KOBJ_ADD); - if (disk->queue->backing_dev_info->dev) { - err = sysfs_create_link(&ddev->kobj, - &disk->queue->backing_dev_info->dev->kobj, - "bdi"); + if (disk->bdi->dev) { + err = sysfs_create_link(&ddev->kobj, &disk->bdi->dev->kobj, + "bdi"); WARN_ON(err); } } @@ -540,15 +539,14 @@ void device_add_disk(struct device *parent, struct gendisk *disk, disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO; disk->flags |= GENHD_FL_NO_PART_SCAN; } else { - struct backing_dev_info *bdi = disk->queue->backing_dev_info; struct device *dev = disk_to_dev(disk); /* Register BDI before referencing it from bdev */ dev->devt = MKDEV(disk->major, disk->first_minor); - ret = bdi_register(bdi, "%u:%u", + ret = bdi_register(disk->bdi, "%u:%u", disk->major, disk->first_minor); WARN_ON(ret); - bdi_set_owner(bdi, dev); + bdi_set_owner(disk->bdi, dev); bdev_add(disk->part0, dev->devt); } register_disk(parent, disk, groups); @@ -615,7 +613,7 @@ void del_gendisk(struct gendisk *disk) * Unregister bdi before releasing device numbers (as they can * get reused and we'd get clashes in sysfs). */ - bdi_unregister(disk->queue->backing_dev_info); + bdi_unregister(disk->bdi); } blk_unregister_queue(disk); @@ -1088,6 +1086,7 @@ static void disk_release(struct device *dev) might_sleep(); + bdi_put(disk->bdi); if (MAJOR(dev->devt) == BLOCK_EXT_MAJOR) blk_free_ext_minor(MINOR(dev->devt)); disk_release_events(disk); @@ -1268,9 +1267,13 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) if (!disk) return NULL; + disk->bdi = bdi_alloc(node_id); + if (!disk->bdi) + goto out_free_disk; + disk->part0 = bdev_alloc(disk, 0); if (!disk->part0) - goto out_free_disk; + goto out_free_bdi; disk->node_id = node_id; mutex_init(&disk->open_mutex); @@ -1292,6 +1295,8 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) out_destroy_part_tbl: xa_destroy(&disk->part_tbl); iput(disk->part0->bd_inode); +out_free_bdi: + bdi_put(disk->bdi); out_free_disk: kfree(disk); return NULL; diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 13beb98a7c5a..5ca233644d70 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -905,13 +905,12 @@ static bool drbd_may_do_local_read(struct drbd_device *device, sector_t sector, static bool remote_due_to_read_balancing(struct drbd_device *device, sector_t sector, enum drbd_read_balancing rbm) { - struct backing_dev_info *bdi; int stripe_shift; switch (rbm) { case RB_CONGESTED_REMOTE: - bdi = device->ldev->backing_bdev->bd_disk->queue->backing_dev_info; - return bdi_read_congested(bdi); + return bdi_read_congested( + device->ldev->backing_bdev->bd_disk->bdi); case RB_LEAST_PENDING: return atomic_read(&device->local_cnt) > atomic_read(&device->ap_pending_cnt) + atomic_read(&device->rs_pending_cnt); diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 538446b652de..0f26b2510a75 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -1183,10 +1183,8 @@ try_next_bio: wakeup = (pd->write_congestion_on > 0 && pd->bio_queue_size <= pd->write_congestion_off); spin_unlock(&pd->lock); - if (wakeup) { - clear_bdi_congested(pd->disk->queue->backing_dev_info, - BLK_RW_ASYNC); - } + if (wakeup) + clear_bdi_congested(pd->disk->bdi, BLK_RW_ASYNC); pkt->sleep_time = max(PACKET_WAIT_TIME, 1); pkt_set_state(pkt, PACKET_WAITING_STATE); @@ -2366,7 +2364,7 @@ static void pkt_make_request_write(struct request_queue *q, struct bio *bio) spin_lock(&pd->lock); if (pd->write_congestion_on > 0 && pd->bio_queue_size >= pd->write_congestion_on) { - set_bdi_congested(q->backing_dev_info, BLK_RW_ASYNC); + set_bdi_congested(bio->bi_bdev->bd_disk->bdi, BLK_RW_ASYNC); do { spin_unlock(&pd->lock); congestion_wait(BLK_RW_ASYNC, HZ); diff --git a/fs/block_dev.c b/fs/block_dev.c index cc801767a377..43be5463a4c4 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1087,7 +1087,7 @@ static int blkdev_get_whole(struct block_device *bdev, fmode_t mode) if (!bdev->bd_openers) { set_init_blocksize(bdev); if (bdev->bd_bdi == &noop_backing_dev_info) - bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info); + bdev->bd_bdi = bdi_get(disk->bdi); } if (test_bit(GD_NEED_PART_SCAN, &disk->state)) bdev_disk_changed(disk, false); @@ -1122,7 +1122,7 @@ static int blkdev_get_part(struct block_device *part, fmode_t mode) disk->open_partitions++; set_init_blocksize(part); if (part->bd_bdi == &noop_backing_dev_info) - part->bd_bdi = bdi_get(disk->queue->backing_dev_info); + part->bd_bdi = bdi_get(disk->bdi); done: part->bd_openers++; return 0; diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index 860e884e56e8..978ac6751aeb 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -5,6 +5,7 @@ #include #include +#include #include "fat.h" struct fatent_operations { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 96f3d9617cd8..23e1253a8d88 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include @@ -398,8 +397,6 @@ struct request_queue { struct blk_mq_hw_ctx **queue_hw_ctx; unsigned int nr_hw_queues; - struct backing_dev_info *backing_dev_info; - /* * The queue owner gets to use this for whatever they like. * ll_rw_blk doesn't touch it. diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 473d93c6ebda..b3bab578f03a 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -158,6 +158,7 @@ struct gendisk { struct mutex open_mutex; /* open/close mutex */ unsigned open_partitions; /* number of open partitions */ + struct backing_dev_info *bdi; struct kobject *slave_dir; #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED struct list_head slave_bdevs; From a11d7fc2d05fb509cd9e33d4093507d6eda3ad53 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Aug 2021 16:17:44 +0200 Subject: [PATCH 052/129] block: remove the bd_bdi in struct block_device Just retrieve the bdi from the disk. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20210809141744.1203023-6-hch@lst.de Signed-off-by: Jens Axboe --- block/ioctl.c | 7 ++++--- fs/block_dev.c | 13 +------------ fs/nilfs2/super.c | 2 +- fs/super.c | 2 +- fs/xfs/xfs_buf.c | 2 +- include/linux/backing-dev.h | 2 +- include/linux/blk_types.h | 1 - 7 files changed, 9 insertions(+), 20 deletions(-) diff --git a/block/ioctl.c b/block/ioctl.c index 0c3a4a53fa11..fff161eaab42 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -506,7 +506,7 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, case BLKFRASET: if(!capable(CAP_SYS_ADMIN)) return -EACCES; - bdev->bd_bdi->ra_pages = (arg * 512) / PAGE_SIZE; + bdev->bd_disk->bdi->ra_pages = (arg * 512) / PAGE_SIZE; return 0; case BLKRRPART: return blkdev_reread_part(bdev, mode); @@ -556,7 +556,8 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case BLKFRAGET: if (!argp) return -EINVAL; - return put_long(argp, (bdev->bd_bdi->ra_pages*PAGE_SIZE) / 512); + return put_long(argp, + (bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512); case BLKGETSIZE: size = i_size_read(bdev->bd_inode); if ((size >> 9) > ~0UL) @@ -628,7 +629,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) if (!argp) return -EINVAL; return compat_put_long(argp, - (bdev->bd_bdi->ra_pages * PAGE_SIZE) / 512); + (bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512); case BLKGETSIZE: size = i_size_read(bdev->bd_inode); if ((size >> 9) > ~0UL) diff --git a/fs/block_dev.c b/fs/block_dev.c index 43be5463a4c4..e1c14c2e0504 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -801,7 +801,6 @@ static struct inode *bdev_alloc_inode(struct super_block *sb) if (!ei) return NULL; memset(&ei->bdev, 0, sizeof(ei->bdev)); - ei->bdev.bd_bdi = &noop_backing_dev_info; return &ei->vfs_inode; } @@ -826,16 +825,11 @@ static void init_once(void *data) static void bdev_evict_inode(struct inode *inode) { - struct block_device *bdev = &BDEV_I(inode)->bdev; truncate_inode_pages_final(&inode->i_data); invalidate_inode_buffers(inode); /* is it needed here? */ clear_inode(inode); /* Detach inode from wb early as bdi_put() may free bdi->wb */ inode_detach_wb(inode); - if (bdev->bd_bdi != &noop_backing_dev_info) { - bdi_put(bdev->bd_bdi); - bdev->bd_bdi = &noop_backing_dev_info; - } } static const struct super_operations bdev_sops = { @@ -1084,11 +1078,8 @@ static int blkdev_get_whole(struct block_device *bdev, fmode_t mode) } } - if (!bdev->bd_openers) { + if (!bdev->bd_openers) set_init_blocksize(bdev); - if (bdev->bd_bdi == &noop_backing_dev_info) - bdev->bd_bdi = bdi_get(disk->bdi); - } if (test_bit(GD_NEED_PART_SCAN, &disk->state)) bdev_disk_changed(disk, false); bdev->bd_openers++; @@ -1121,8 +1112,6 @@ static int blkdev_get_part(struct block_device *part, fmode_t mode) disk->open_partitions++; set_init_blocksize(part); - if (part->bd_bdi == &noop_backing_dev_info) - part->bd_bdi = bdi_get(disk->bdi); done: part->bd_openers++; return 0; diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 4abd928b0bc8..f6b2d280aab5 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1053,7 +1053,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_time_gran = 1; sb->s_max_links = NILFS_LINK_MAX; - sb->s_bdi = bdi_get(sb->s_bdev->bd_bdi); + sb->s_bdi = bdi_get(sb->s_bdev->bd_disk->bdi); err = load_nilfs(nilfs, sb); if (err) diff --git a/fs/super.c b/fs/super.c index 91b7f156735b..bcef3a6f4c4b 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1203,7 +1203,7 @@ static int set_bdev_super(struct super_block *s, void *data) { s->s_bdev = data; s->s_dev = s->s_bdev->bd_dev; - s->s_bdi = bdi_get(s->s_bdev->bd_bdi); + s->s_bdi = bdi_get(s->s_bdev->bd_disk->bdi); if (blk_queue_stable_writes(s->s_bdev->bd_disk->queue)) s->s_iflags |= SB_I_STABLE_WRITES; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 8ff42b3585e0..3ab73567a0f5 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -844,7 +844,7 @@ xfs_buf_readahead_map( { struct xfs_buf *bp; - if (bdi_read_congested(target->bt_bdev->bd_bdi)) + if (bdi_read_congested(target->bt_bdev->bd_disk->bdi)) return; xfs_buf_read_map(target, map, nmaps, diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 44df4fcef65c..29530859d9ff 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -143,7 +143,7 @@ static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) sb = inode->i_sb; #ifdef CONFIG_BLOCK if (sb_is_blkdev_sb(sb)) - return I_BDEV(inode)->bd_bdi; + return I_BDEV(inode)->bd_disk->bdi; #endif return sb->s_bdi; } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index e92735655684..1335efa8a1db 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -38,7 +38,6 @@ struct block_device { u8 bd_partno; spinlock_t bd_size_lock; /* for bd_inode->i_size updates */ struct gendisk * bd_disk; - struct backing_dev_info *bd_bdi; /* The counter of freeze processes */ int bd_fsfreeze_count; From 866663b7b52d2da267b28e12eed89ee781b8fed1 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 29 Jul 2021 11:42:26 +0800 Subject: [PATCH 053/129] block: return ELEVATOR_DISCARD_MERGE if possible When merging one bio to request, if they are discard IO and the queue supports multi-range discard, we need to return ELEVATOR_DISCARD_MERGE because both block core and related drivers(nvme, virtio-blk) doesn't handle mixed discard io merge(traditional IO merge together with discard merge) well. Fix the issue by returning ELEVATOR_DISCARD_MERGE in this situation, so both blk-mq and drivers just need to handle multi-range discard. Reported-by: Oleksandr Natalenko Signed-off-by: Ming Lei Tested-by: Oleksandr Natalenko Fixes: 2705dfb20947 ("block: fix discard request merge") Link: https://lore.kernel.org/r/20210729034226.1591070-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 3 +++ block/blk-merge.c | 16 ---------------- block/elevator.c | 3 +++ block/mq-deadline-main.c | 2 ++ include/linux/blkdev.h | 16 ++++++++++++++++ 5 files changed, 24 insertions(+), 16 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 1576e858d3a5..e4a61eda2d0f 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2361,6 +2361,9 @@ static int bfq_request_merge(struct request_queue *q, struct request **req, __rq = bfq_find_rq_fmerge(bfqd, bio, q); if (__rq && elv_bio_merge_ok(__rq, bio)) { *req = __rq; + + if (blk_discard_mergable(__rq)) + return ELEVATOR_DISCARD_MERGE; return ELEVATOR_FRONT_MERGE; } diff --git a/block/blk-merge.c b/block/blk-merge.c index a11b3b53717e..f8707ff7e2fc 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -705,22 +705,6 @@ static void blk_account_io_merge_request(struct request *req) } } -/* - * Two cases of handling DISCARD merge: - * If max_discard_segments > 1, the driver takes every bio - * as a range and send them to controller together. The ranges - * needn't to be contiguous. - * Otherwise, the bios/requests will be handled as same as - * others which should be contiguous. - */ -static inline bool blk_discard_mergable(struct request *req) -{ - if (req_op(req) == REQ_OP_DISCARD && - queue_max_discard_segments(req->q) > 1) - return true; - return false; -} - static enum elv_merge blk_try_req_merge(struct request *req, struct request *next) { diff --git a/block/elevator.c b/block/elevator.c index 9beaafd238e0..ff45d8388f48 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -336,6 +336,9 @@ enum elv_merge elv_merge(struct request_queue *q, struct request **req, __rq = elv_rqhash_find(q, bio->bi_iter.bi_sector); if (__rq && elv_bio_merge_ok(__rq, bio)) { *req = __rq; + + if (blk_discard_mergable(__rq)) + return ELEVATOR_DISCARD_MERGE; return ELEVATOR_BACK_MERGE; } diff --git a/block/mq-deadline-main.c b/block/mq-deadline-main.c index 6f612e6dc82b..294be0c0db65 100644 --- a/block/mq-deadline-main.c +++ b/block/mq-deadline-main.c @@ -677,6 +677,8 @@ static int dd_request_merge(struct request_queue *q, struct request **rq, if (elv_bio_merge_ok(__rq, bio)) { *rq = __rq; + if (blk_discard_mergable(__rq)) + return ELEVATOR_DISCARD_MERGE; return ELEVATOR_FRONT_MERGE; } } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 23e1253a8d88..07eef02325b4 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1519,6 +1519,22 @@ static inline int queue_limit_discard_alignment(struct queue_limits *lim, sector return offset << SECTOR_SHIFT; } +/* + * Two cases of handling DISCARD merge: + * If max_discard_segments > 1, the driver takes every bio + * as a range and send them to controller together. The ranges + * needn't to be contiguous. + * Otherwise, the bios/requests will be handled as same as + * others which should be contiguous. + */ +static inline bool blk_discard_mergable(struct request *req) +{ + if (req_op(req) == REQ_OP_DISCARD && + queue_max_discard_segments(req->q) > 1) + return true; + return false; +} + static inline int bdev_discard_alignment(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); From 99d26de2f6d79badc80f55b54bd90d4cb9d1ad90 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021 14:39:24 +0200 Subject: [PATCH 054/129] writeback: make the laptop_mode prototypes available unconditionally Fix the !CONFIG_BLOCK build after the recent cleanup. Fixes: 5ed964f8e54e ("mm: hide laptop_mode_wb_timer entirely behind the BDI API") Reported-by: Stephen Rothwell Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/writeback.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 667e86cfbdcf..270677dc4f36 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -336,14 +336,9 @@ static inline void cgroup_writeback_umount(void) /* * mm/page-writeback.c */ -#ifdef CONFIG_BLOCK void laptop_io_completion(struct backing_dev_info *info); void laptop_sync_completion(void); -void laptop_mode_sync(struct work_struct *work); void laptop_mode_timer_fn(struct timer_list *t); -#else -static inline void laptop_sync_completion(void) { } -#endif bool node_dirty_ok(struct pglist_data *pgdat); int wb_domain_init(struct wb_domain *dom, gfp_t gfp); #ifdef CONFIG_CGROUP_WRITEBACK From 018eca456c4b4dca56aaf1ec27f309c74d0fe246 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Wed, 21 Jul 2021 10:53:15 +0800 Subject: [PATCH 055/129] block: move some macros to blkdev.h Move them (PAGE_SECTORS_SHIFT, PAGE_SECTORS and SECTOR_MASK) to the generic header file to remove redundancy. Signed-off-by: Guoqing Jiang Link: https://lore.kernel.org/r/20210721025315.1729118-1-guoqing.jiang@linux.dev Signed-off-by: Jens Axboe --- drivers/block/brd.c | 3 --- drivers/block/null_blk/main.c | 4 ---- drivers/md/bcache/util.h | 2 -- include/linux/blkdev.h | 4 ++++ include/linux/device-mapper.h | 1 - 5 files changed, 4 insertions(+), 10 deletions(-) diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 95694113e38e..58ec167aa018 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -27,9 +27,6 @@ #include -#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) -#define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT) - /* * Each block ramdisk device has a radix_tree brd_pages of pages that stores * the pages containing the block device's contents. A brd page's ->index is diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index d734e9ee1546..f128242d1170 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -11,10 +11,6 @@ #include #include "null_blk.h" -#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) -#define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT) -#define SECTOR_MASK (PAGE_SECTORS - 1) - #define FREE_BATCH 16 #define TICKS_PER_SEC 50ULL diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index bca4a7c97da7..b64460a76267 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -15,8 +15,6 @@ #include "closure.h" -#define PAGE_SECTORS (PAGE_SIZE / 512) - struct closure; #ifdef CONFIG_BCACHE_DEBUG diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 07eef02325b4..df404c1fb087 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -939,6 +939,10 @@ static inline struct request_queue *bdev_get_queue(struct block_device *bdev) #define SECTOR_SIZE (1 << SECTOR_SHIFT) #endif +#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) +#define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT) +#define SECTOR_MASK (PAGE_SECTORS - 1) + /* * blk_rq_pos() : the current sector * blk_rq_bytes() : bytes left in the entire request diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 7457d49acf9a..94f2cd6a8e83 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -151,7 +151,6 @@ typedef size_t (*dm_dax_copy_iter_fn)(struct dm_target *ti, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i); typedef int (*dm_dax_zero_page_range_fn)(struct dm_target *ti, pgoff_t pgoff, size_t nr_pages); -#define PAGE_SECTORS (PAGE_SIZE / 512) void dm_error(const char *message); From 29e6a5e01d0adae52a2859ed39cb9e607430e011 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Aug 2021 08:40:21 +0200 Subject: [PATCH 056/129] mmc: block: let device_add_disk create disk attributes Pass the attribute group for the attributes on the gendisk to device_add_disk so that they are created atomically with the disk creation. Signed-off-by: Christoph Hellwig Acked-by: Ulf Hansson Link: https://lore.kernel.org/r/20210809064028.1198327-2-hch@lst.de Signed-off-by: Jens Axboe --- drivers/mmc/core/block.c | 102 +++++++++++++++++---------------------- 1 file changed, 45 insertions(+), 57 deletions(-) diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index ce8aed562929..4ac3e1b93e7e 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -128,8 +128,6 @@ struct mmc_blk_data { * track of the current selected device partition. */ unsigned int part_curr; - struct device_attribute force_ro; - struct device_attribute power_ro_lock; int area_type; /* debugfs files (only in main mmc_blk_data) */ @@ -281,6 +279,9 @@ out_put: return count; } +static DEVICE_ATTR(ro_lock_until_next_power_on, 0, + power_ro_lock_show, power_ro_lock_store); + static ssize_t force_ro_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -313,6 +314,44 @@ out: return ret; } +static DEVICE_ATTR(force_ro, 0644, force_ro_show, force_ro_store); + +static struct attribute *mmc_disk_attrs[] = { + &dev_attr_force_ro.attr, + &dev_attr_ro_lock_until_next_power_on.attr, + NULL, +}; + +static umode_t mmc_disk_attrs_is_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct mmc_blk_data *md = mmc_blk_get(dev_to_disk(dev)); + umode_t mode = a->mode; + + if (a == &dev_attr_ro_lock_until_next_power_on.attr && + (md->area_type & MMC_BLK_DATA_AREA_BOOT) && + md->queue.card->ext_csd.boot_ro_lockable) { + mode = S_IRUGO; + if (!(md->queue.card->ext_csd.boot_ro_lock & + EXT_CSD_BOOT_WP_B_PWR_WP_DIS)) + mode |= S_IWUSR; + } + + mmc_blk_put(md); + return mode; +} + +static const struct attribute_group mmc_disk_attr_group = { + .is_visible = mmc_disk_attrs_is_visible, + .attrs = mmc_disk_attrs, +}; + +static const struct attribute_group *mmc_disk_attr_groups[] = { + &mmc_disk_attr_group, + NULL, +}; + static int mmc_blk_open(struct block_device *bdev, fmode_t mode) { struct mmc_blk_data *md = mmc_blk_get(bdev->bd_disk); @@ -2644,15 +2683,8 @@ static void mmc_blk_remove_req(struct mmc_blk_data *md) * from being accepted. */ card = md->queue.card; - if (md->disk->flags & GENHD_FL_UP) { - device_remove_file(disk_to_dev(md->disk), &md->force_ro); - if ((md->area_type & MMC_BLK_DATA_AREA_BOOT) && - card->ext_csd.boot_ro_lockable) - device_remove_file(disk_to_dev(md->disk), - &md->power_ro_lock); - + if (md->disk->flags & GENHD_FL_UP) del_gendisk(md->disk); - } mmc_cleanup_queue(&md->queue); mmc_blk_put(md); } @@ -2679,51 +2711,6 @@ static void mmc_blk_remove_parts(struct mmc_card *card, } } -static int mmc_add_disk(struct mmc_blk_data *md) -{ - int ret; - struct mmc_card *card = md->queue.card; - - device_add_disk(md->parent, md->disk, NULL); - md->force_ro.show = force_ro_show; - md->force_ro.store = force_ro_store; - sysfs_attr_init(&md->force_ro.attr); - md->force_ro.attr.name = "force_ro"; - md->force_ro.attr.mode = S_IRUGO | S_IWUSR; - ret = device_create_file(disk_to_dev(md->disk), &md->force_ro); - if (ret) - goto force_ro_fail; - - if ((md->area_type & MMC_BLK_DATA_AREA_BOOT) && - card->ext_csd.boot_ro_lockable) { - umode_t mode; - - if (card->ext_csd.boot_ro_lock & EXT_CSD_BOOT_WP_B_PWR_WP_DIS) - mode = S_IRUGO; - else - mode = S_IRUGO | S_IWUSR; - - md->power_ro_lock.show = power_ro_lock_show; - md->power_ro_lock.store = power_ro_lock_store; - sysfs_attr_init(&md->power_ro_lock.attr); - md->power_ro_lock.attr.mode = mode; - md->power_ro_lock.attr.name = - "ro_lock_until_next_power_on"; - ret = device_create_file(disk_to_dev(md->disk), - &md->power_ro_lock); - if (ret) - goto power_ro_lock_fail; - } - return ret; - -power_ro_lock_fail: - device_remove_file(disk_to_dev(md->disk), &md->force_ro); -force_ro_fail: - del_gendisk(md->disk); - - return ret; -} - #ifdef CONFIG_DEBUG_FS static int mmc_dbg_card_status_get(void *data, u64 *val) @@ -2919,12 +2906,13 @@ static int mmc_blk_probe(struct mmc_card *card) dev_set_drvdata(&card->dev, md); - ret = mmc_add_disk(md); + device_add_disk(md->parent, md->disk, mmc_disk_attr_groups); if (ret) goto out; list_for_each_entry(part_md, &md->part, part) { - ret = mmc_add_disk(part_md); + device_add_disk(part_md->parent, part_md->disk, + mmc_disk_attr_groups); if (ret) goto out; } From a94dcfce70d3f4f6cd99f3b43d74305e3a4f3983 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Aug 2021 08:40:22 +0200 Subject: [PATCH 057/129] mmc: block: cleanup gendisk creation Restructure mmc_blk_probe to avoid a failure path with a half created disk. Signed-off-by: Christoph Hellwig Acked-by: Ulf Hansson Link: https://lore.kernel.org/r/20210809064028.1198327-3-hch@lst.de Signed-off-by: Jens Axboe --- drivers/mmc/core/block.c | 49 ++++++++++++++-------------------------- 1 file changed, 17 insertions(+), 32 deletions(-) diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 4ac3e1b93e7e..4c11f171e56d 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -2328,7 +2328,8 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card, sector_t size, bool default_ro, const char *subname, - int area_type) + int area_type, + unsigned int part_type) { struct mmc_blk_data *md; int devidx, ret; @@ -2375,6 +2376,7 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card, kref_init(&md->kref); md->queue.blkdata = md; + md->part_type = part_type; md->disk->major = MMC_BLOCK_MAJOR; md->disk->minors = perdev_minors; @@ -2427,6 +2429,10 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card, md->disk->disk_name, mmc_card_id(card), mmc_card_name(card), cap_str, md->read_only ? "(ro)" : ""); + /* used in ->open, must be set before add_disk: */ + if (area_type == MMC_BLK_DATA_AREA_MAIN) + dev_set_drvdata(&card->dev, md); + device_add_disk(md->parent, md->disk, mmc_disk_attr_groups); return md; err_kfree: @@ -2456,7 +2462,7 @@ static struct mmc_blk_data *mmc_blk_alloc(struct mmc_card *card) } return mmc_blk_alloc_req(card, &card->dev, size, false, NULL, - MMC_BLK_DATA_AREA_MAIN); + MMC_BLK_DATA_AREA_MAIN, 0); } static int mmc_blk_alloc_part(struct mmc_card *card, @@ -2470,10 +2476,9 @@ static int mmc_blk_alloc_part(struct mmc_card *card, struct mmc_blk_data *part_md; part_md = mmc_blk_alloc_req(card, disk_to_dev(md->disk), size, default_ro, - subname, area_type); + subname, area_type, part_type); if (IS_ERR(part_md)) return PTR_ERR(part_md); - part_md->part_type = part_type; list_add(&part_md->part, &md->part); return 0; @@ -2674,20 +2679,13 @@ static int mmc_blk_alloc_parts(struct mmc_card *card, struct mmc_blk_data *md) static void mmc_blk_remove_req(struct mmc_blk_data *md) { - struct mmc_card *card; - - if (md) { - /* - * Flush remaining requests and free queues. It - * is freeing the queue that stops new requests - * from being accepted. - */ - card = md->queue.card; - if (md->disk->flags & GENHD_FL_UP) - del_gendisk(md->disk); - mmc_cleanup_queue(&md->queue); - mmc_blk_put(md); - } + /* + * Flush remaining requests and free queues. It is freeing the queue + * that stops new requests from being accepted. + */ + del_gendisk(md->disk); + mmc_cleanup_queue(&md->queue); + mmc_blk_put(md); } static void mmc_blk_remove_parts(struct mmc_card *card, @@ -2876,7 +2874,7 @@ static void mmc_blk_remove_debugfs(struct mmc_card *card, static int mmc_blk_probe(struct mmc_card *card) { - struct mmc_blk_data *md, *part_md; + struct mmc_blk_data *md; int ret = 0; /* @@ -2904,19 +2902,6 @@ static int mmc_blk_probe(struct mmc_card *card) if (ret) goto out; - dev_set_drvdata(&card->dev, md); - - device_add_disk(md->parent, md->disk, mmc_disk_attr_groups); - if (ret) - goto out; - - list_for_each_entry(part_md, &md->part, part) { - device_add_disk(part_md->parent, part_md->disk, - mmc_disk_attr_groups); - if (ret) - goto out; - } - /* Add two debugfs entries */ mmc_blk_add_debugfs(card, md); From 5eba200526ac5fee7659c45b6c23fb2c576f8813 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Aug 2021 08:40:23 +0200 Subject: [PATCH 058/129] nvme: remove the GENHD_FL_UP check in nvme_ns_remove Early probe failure never reaches nvme_ns_remove, so GENHD_FL_UP must be set at this point. Remove the check. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210809064028.1198327-4-hch@lst.de Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index f6c0a59c4b53..dbe7144f0026 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3826,14 +3826,12 @@ static void nvme_ns_remove(struct nvme_ns *ns) nvme_mpath_clear_current_path(ns); synchronize_srcu(&ns->head->srcu); /* wait for concurrent submissions */ - if (ns->disk->flags & GENHD_FL_UP) { - if (!nvme_ns_head_multipath(ns->head)) - nvme_cdev_del(&ns->cdev, &ns->cdev_device); - del_gendisk(ns->disk); - blk_cleanup_queue(ns->queue); - if (blk_get_integrity(ns->disk)) - blk_integrity_unregister(ns->disk); - } + if (!nvme_ns_head_multipath(ns->head)) + nvme_cdev_del(&ns->cdev, &ns->cdev_device); + del_gendisk(ns->disk); + blk_cleanup_queue(ns->queue); + if (blk_get_integrity(ns->disk)) + blk_integrity_unregister(ns->disk); down_write(&ns->ctrl->namespaces_rwsem); list_del_init(&ns->list); From 916a470da02f909cabb65337f65438b8bc3965b2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Aug 2021 08:40:24 +0200 Subject: [PATCH 059/129] nvme: replace the GENHD_FL_UP check in nvme_mpath_shutdown_disk Use the nvme-internal NVME_NSHEAD_DISK_LIVE flag instead of abusing the block layer state. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210809064028.1198327-5-hch@lst.de Signed-off-by: Jens Axboe --- drivers/nvme/host/multipath.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 3f32c5e86bfc..37ce3e8b1db2 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -765,7 +765,7 @@ void nvme_mpath_shutdown_disk(struct nvme_ns_head *head) if (!head->disk) return; kblockd_schedule_work(&head->requeue_work); - if (head->disk->flags & GENHD_FL_UP) { + if (test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) { nvme_cdev_del(&head->cdev, &head->cdev_device); del_gendisk(head->disk); } From 4f9e14aecfbdc6b762d5122489604858c5fec5e7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Aug 2021 08:40:25 +0200 Subject: [PATCH 060/129] sx8: use the internal state machine to check if del_gendisk needs to be called Remove usage of the block layer internal GENHD_FL_UP by just looking at the host state. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210809064028.1198327-6-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/sx8.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c index 7b54353ee92b..420cd952ddc4 100644 --- a/drivers/block/sx8.c +++ b/drivers/block/sx8.c @@ -1373,7 +1373,7 @@ static void carm_free_disk(struct carm_host *host, unsigned int port_no) if (!disk) return; - if (disk->flags & GENHD_FL_UP) + if (host->state > HST_DEV_ACTIVATE) del_gendisk(disk); blk_cleanup_disk(disk); } From 224b0683228c5f332f9cee615d85e75e9a347170 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Aug 2021 08:40:26 +0200 Subject: [PATCH 061/129] bcache: add proper error unwinding in bcache_device_init Except for the IDA none of the allocations in bcache_device_init is unwound on error, fix that. Signed-off-by: Christoph Hellwig Acked-by: Coly Li Link: https://lore.kernel.org/r/20210809064028.1198327-7-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 185246a0d855..d0f08e946453 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -931,20 +931,20 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long); d->full_dirty_stripes = kvzalloc(n, GFP_KERNEL); if (!d->full_dirty_stripes) - return -ENOMEM; + goto out_free_stripe_sectors_dirty; idx = ida_simple_get(&bcache_device_idx, 0, BCACHE_DEVICE_IDX_MAX, GFP_KERNEL); if (idx < 0) - return idx; + goto out_free_full_dirty_stripes; if (bioset_init(&d->bio_split, 4, offsetof(struct bbio, bio), BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER)) - goto err; + goto out_ida_remove; d->disk = blk_alloc_disk(NUMA_NO_NODE); if (!d->disk) - goto err; + goto out_bioset_exit; set_capacity(d->disk, sectors); snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx); @@ -987,8 +987,14 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, return 0; -err: +out_bioset_exit: + bioset_exit(&d->bio_split); +out_ida_remove: ida_simple_remove(&bcache_device_idx, idx); +out_free_full_dirty_stripes: + kvfree(d->full_dirty_stripes); +out_free_stripe_sectors_dirty: + kvfree(d->stripe_sectors_dirty); return -ENOMEM; } From b75f4aed88febe903bd40a6128b74edd2388417e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Aug 2021 08:40:27 +0200 Subject: [PATCH 062/129] bcache: move the del_gendisk call out of bcache_device_free Let the callers call del_gendisk so that we can check if add_disk has been called properly for the cached device case instead of relying on the block layer internal GENHD_FL_UP flag. Signed-off-by: Christoph Hellwig Reviewed-by: Coly Li Link: https://lore.kernel.org/r/20210809064028.1198327-8-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index d0f08e946453..f2874c77ff79 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -885,11 +885,6 @@ static void bcache_device_free(struct bcache_device *d) bcache_device_detach(d); if (disk) { - bool disk_added = (disk->flags & GENHD_FL_UP) != 0; - - if (disk_added) - del_gendisk(disk); - blk_cleanup_disk(disk); ida_simple_remove(&bcache_device_idx, first_minor_to_idx(disk->first_minor)); @@ -1371,8 +1366,10 @@ static void cached_dev_free(struct closure *cl) mutex_lock(&bch_register_lock); - if (atomic_read(&dc->running)) + if (atomic_read(&dc->running)) { bd_unlink_disk_holder(dc->bdev, dc->disk.disk); + del_gendisk(dc->disk.disk); + } bcache_device_free(&dc->disk); list_del(&dc->list); @@ -1518,6 +1515,7 @@ static void flash_dev_free(struct closure *cl) mutex_lock(&bch_register_lock); atomic_long_sub(bcache_dev_sectors_dirty(d), &d->c->flash_dev_dirty_sectors); + del_gendisk(d->disk); bcache_device_free(d); mutex_unlock(&bch_register_lock); kobject_put(&d->kobj); From 50b4aecfbbb09869db967e4a26212a47e10c0088 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Aug 2021 08:40:28 +0200 Subject: [PATCH 063/129] block: remove GENHD_FL_UP Just check inode_unhashed on the whole device bdev inode instead, and provide a helper to check for that information. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210809064028.1198327-9-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 6 ++---- block/partitions/core.c | 4 ++-- drivers/md/md.h | 4 +--- drivers/nvme/host/core.c | 2 +- fs/block_dev.c | 2 +- include/linux/genhd.h | 9 +++++---- 6 files changed, 12 insertions(+), 15 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index f8def1129501..9d6b3aeea288 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -77,7 +77,8 @@ bool set_capacity_and_notify(struct gendisk *disk, sector_t size) * initial capacity during probing. */ if (size == capacity || - (disk->flags & (GENHD_FL_UP | GENHD_FL_HIDDEN)) != GENHD_FL_UP) + !disk_live(disk) || + (disk->flags & GENHD_FL_HIDDEN)) return false; pr_info("%s: detected capacity change from %lld to %lld\n", @@ -527,8 +528,6 @@ void device_add_disk(struct device *parent, struct gendisk *disk, disk->flags |= GENHD_FL_EXT_DEVT; } - disk->flags |= GENHD_FL_UP; - disk_alloc_events(disk); if (disk->flags & GENHD_FL_HIDDEN) { @@ -597,7 +596,6 @@ void del_gendisk(struct gendisk *disk) mutex_lock(&disk->open_mutex); remove_inode_hash(disk->part0->bd_inode); - disk->flags &= ~GENHD_FL_UP; blk_drop_partitions(disk); mutex_unlock(&disk->open_mutex); diff --git a/block/partitions/core.c b/block/partitions/core.c index fb3a556cacce..c6738ccbcee5 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -459,7 +459,7 @@ int bdev_add_partition(struct block_device *bdev, int partno, int ret; mutex_lock(&disk->open_mutex); - if (!(disk->flags & GENHD_FL_UP)) { + if (!disk_live(disk)) { ret = -ENXIO; goto out; } @@ -669,7 +669,7 @@ int bdev_disk_changed(struct gendisk *disk, bool invalidate) lockdep_assert_held(&disk->open_mutex); - if (!(disk->flags & GENHD_FL_UP)) + if (!disk_live(disk)) return -ENXIO; rescan: diff --git a/drivers/md/md.h b/drivers/md/md.h index 832547cf038f..4c96c36bd01a 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -764,9 +764,7 @@ struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev); static inline bool is_mddev_broken(struct md_rdev *rdev, const char *md_type) { - int flags = rdev->bdev->bd_disk->flags; - - if (!(flags & GENHD_FL_UP)) { + if (!disk_live(rdev->bdev->bd_disk)) { if (!test_and_set_bit(MD_BROKEN, &rdev->mddev->flags)) pr_warn("md: %s: %s array has a missing/failed member\n", mdname(rdev->mddev), md_type); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index dbe7144f0026..1478d825011d 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1822,7 +1822,7 @@ static void nvme_update_disk_info(struct gendisk *disk, static inline bool nvme_first_scan(struct gendisk *disk) { /* nvme_alloc_ns() scans the disk prior to adding it */ - return !(disk->flags & GENHD_FL_UP); + return !disk_live(disk); } static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id) diff --git a/fs/block_dev.c b/fs/block_dev.c index e1c14c2e0504..38a8b0e04a0c 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1218,7 +1218,7 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) mutex_lock(&disk->open_mutex); ret = -ENXIO; - if (!(disk->flags & GENHD_FL_UP)) + if (!disk_live(disk)) goto abort_claiming; if (bdev_is_partition(bdev)) ret = blkdev_get_part(bdev, mode); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index b3bab578f03a..b47e297cd551 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -60,9 +60,6 @@ struct partition_meta_info { * device. * Affects responses to the ``CDROM_GET_CAPABILITY`` ioctl. * - * ``GENHD_FL_UP`` (0x0010): indicates that the block device is "up", - * with a similar meaning to network interfaces. - * * ``GENHD_FL_SUPPRESS_PARTITION_INFO`` (0x0020): don't include * partition information in ``/proc/partitions`` or in the output of * printk_all_partitions(). @@ -97,7 +94,6 @@ struct partition_meta_info { /* 2 is unused (used to be GENHD_FL_DRIVERFS) */ /* 4 is unused (used to be GENHD_FL_MEDIA_CHANGE_NOTIFY) */ #define GENHD_FL_CD 0x0008 -#define GENHD_FL_UP 0x0010 #define GENHD_FL_SUPPRESS_PARTITION_INFO 0x0020 #define GENHD_FL_EXT_DEVT 0x0040 #define GENHD_FL_NATIVE_CAPACITY 0x0080 @@ -178,6 +174,11 @@ struct gendisk { u64 diskseq; }; +static inline bool disk_live(struct gendisk *disk) +{ + return !inode_unhashed(disk->part0->bd_inode); +} + /* * The gendisk is refcounted by the part0 block_device, and the bd_device * therein is also used for device model presentation in sysfs. From a08aa9bccdc282b5e8d133bf8c239473f057b464 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021 17:45:09 +0200 Subject: [PATCH 064/129] block: store a gendisk in struct parsed_partitions Partition scanning only happens on the whole device, so pass a struct gendisk instead of the whole device block_device to the scanners. This allows to simplify printing the device name in various places as the disk name is available in disk->name. Signed-off-by: Christoph Hellwig Reviewed-by: Stefan Haberland Link: https://lore.kernel.org/r/20210810154512.1809898-2-hch@lst.de Signed-off-by: Jens Axboe --- block/partitions/acorn.c | 4 ++-- block/partitions/aix.c | 20 ++------------------ block/partitions/amiga.c | 7 +++---- block/partitions/atari.c | 4 ++-- block/partitions/check.h | 2 +- block/partitions/cmdline.c | 6 ++---- block/partitions/core.c | 6 +++--- block/partitions/efi.c | 36 +++++++++++++++++------------------- block/partitions/ibm.c | 4 ++-- block/partitions/ldm.c | 18 +++++++++--------- block/partitions/mac.c | 2 +- block/partitions/msdos.c | 6 ++++-- block/partitions/sgi.c | 5 ++--- block/partitions/sun.c | 5 ++--- 14 files changed, 52 insertions(+), 73 deletions(-) diff --git a/block/partitions/acorn.c b/block/partitions/acorn.c index c64c57b958bf..2c381c694c57 100644 --- a/block/partitions/acorn.c +++ b/block/partitions/acorn.c @@ -275,7 +275,7 @@ int adfspart_check_ADFS(struct parsed_partitions *state) /* * Work out start of non-adfs partition. */ - nr_sects = (state->bdev->bd_inode->i_size >> 9) - start_sect; + nr_sects = get_capacity(state->disk) - start_sect; if (start_sect) { switch (id) { @@ -540,7 +540,7 @@ int adfspart_check_EESOX(struct parsed_partitions *state) if (i != 0) { sector_t size; - size = get_capacity(state->bdev->bd_disk); + size = get_capacity(state->disk); put_partition(state, slot++, start, size - start); strlcat(state->pp_buf, "\n", PAGE_SIZE); } diff --git a/block/partitions/aix.c b/block/partitions/aix.c index c7b4fd1a4a97..85f4b967565e 100644 --- a/block/partitions/aix.c +++ b/block/partitions/aix.c @@ -66,22 +66,6 @@ struct pvd { #define LVM_MAXLVS 256 -/** - * last_lba(): return number of last logical block of device - * @bdev: block device - * - * Description: Returns last LBA value on success, 0 on error. - * This is stored (by sd and ide-geometry) in - * the part[0] entry for this disk, and is the number of - * physical sectors available on the disk. - */ -static u64 last_lba(struct block_device *bdev) -{ - if (!bdev || !bdev->bd_inode) - return 0; - return (bdev->bd_inode->i_size >> 9) - 1ULL; -} - /** * read_lba(): Read bytes from disk, starting at given LBA * @state @@ -89,7 +73,7 @@ static u64 last_lba(struct block_device *bdev) * @buffer * @count * - * Description: Reads @count bytes from @state->bdev into @buffer. + * Description: Reads @count bytes from @state->disk into @buffer. * Returns number of bytes read on success, 0 on error. */ static size_t read_lba(struct parsed_partitions *state, u64 lba, u8 *buffer, @@ -97,7 +81,7 @@ static size_t read_lba(struct parsed_partitions *state, u64 lba, u8 *buffer, { size_t totalreadcount = 0; - if (!buffer || lba + count / 512 > last_lba(state->bdev)) + if (!buffer || lba + count / 512 > get_capacity(state->disk) - 1ULL) return 0; while (count) { diff --git a/block/partitions/amiga.c b/block/partitions/amiga.c index 9526491d9aed..5c8624e26a54 100644 --- a/block/partitions/amiga.c +++ b/block/partitions/amiga.c @@ -34,7 +34,6 @@ int amiga_partition(struct parsed_partitions *state) int start_sect, nr_sects, blk, part, res = 0; int blksize = 1; /* Multiplier for disk block size */ int slot = 1; - char b[BDEVNAME_SIZE]; for (blk = 0; ; blk++, put_dev_sector(sect)) { if (blk == RDB_ALLOCATION_LIMIT) @@ -42,7 +41,7 @@ int amiga_partition(struct parsed_partitions *state) data = read_part_sector(state, blk, §); if (!data) { pr_err("Dev %s: unable to read RDB block %d\n", - bdevname(state->bdev, b), blk); + state->disk->disk_name, blk); res = -1; goto rdb_done; } @@ -64,7 +63,7 @@ int amiga_partition(struct parsed_partitions *state) } pr_err("Dev %s: RDB in block %d has bad checksum\n", - bdevname(state->bdev, b), blk); + state->disk->disk_name, blk); } /* blksize is blocks per 512 byte standard block */ @@ -84,7 +83,7 @@ int amiga_partition(struct parsed_partitions *state) data = read_part_sector(state, blk, §); if (!data) { pr_err("Dev %s: unable to read partition block %d\n", - bdevname(state->bdev, b), blk); + state->disk->disk_name, blk); res = -1; goto rdb_done; } diff --git a/block/partitions/atari.c b/block/partitions/atari.c index 2305840c8522..da5994175416 100644 --- a/block/partitions/atari.c +++ b/block/partitions/atari.c @@ -47,7 +47,7 @@ int atari_partition(struct parsed_partitions *state) * ATARI partition scheme supports 512 lba only. If this is not * the case, bail early to avoid miscalculating hd_size. */ - if (bdev_logical_block_size(state->bdev) != 512) + if (queue_logical_block_size(state->disk->queue) != 512) return 0; rs = read_part_sector(state, 0, §); @@ -55,7 +55,7 @@ int atari_partition(struct parsed_partitions *state) return -1; /* Verify this is an Atari rootsector: */ - hd_size = state->bdev->bd_inode->i_size >> 9; + hd_size = get_capacity(state->disk); if (!VALID_PARTITION(&rs->part[0], hd_size) && !VALID_PARTITION(&rs->part[1], hd_size) && !VALID_PARTITION(&rs->part[2], hd_size) && diff --git a/block/partitions/check.h b/block/partitions/check.h index c577e9ee67f0..d5b28e309d64 100644 --- a/block/partitions/check.h +++ b/block/partitions/check.h @@ -9,7 +9,7 @@ * description. */ struct parsed_partitions { - struct block_device *bdev; + struct gendisk *disk; char name[BDEVNAME_SIZE]; struct { sector_t from; diff --git a/block/partitions/cmdline.c b/block/partitions/cmdline.c index 482a29e95dbd..1af610f0ba8c 100644 --- a/block/partitions/cmdline.c +++ b/block/partitions/cmdline.c @@ -380,7 +380,6 @@ static void cmdline_parts_verifier(int slot, struct parsed_partitions *state) int cmdline_partition(struct parsed_partitions *state) { sector_t disk_size; - char bdev[BDEVNAME_SIZE]; struct cmdline_parts *parts; if (cmdline) { @@ -397,12 +396,11 @@ int cmdline_partition(struct parsed_partitions *state) if (!bdev_parts) return 0; - bdevname(state->bdev, bdev); - parts = cmdline_parts_find(bdev_parts, bdev); + parts = cmdline_parts_find(bdev_parts, state->disk->disk_name); if (!parts) return 0; - disk_size = get_capacity(state->bdev->bd_disk) << 9; + disk_size = get_capacity(state->disk) << 9; cmdline_parts_set(parts, disk_size, state); cmdline_parts_verifier(1, state); diff --git a/block/partitions/core.c b/block/partitions/core.c index c6738ccbcee5..5dd1cd1a163d 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -135,7 +135,7 @@ static struct parsed_partitions *check_partition(struct gendisk *hd) } state->pp_buf[0] = '\0'; - state->bdev = hd->part0; + state->disk = hd; snprintf(state->name, BDEVNAME_SIZE, "%s", hd->disk_name); snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name); if (isdigit(state->name[strlen(state->name)-1])) @@ -717,10 +717,10 @@ EXPORT_SYMBOL_GPL(bdev_disk_changed); void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p) { - struct address_space *mapping = state->bdev->bd_inode->i_mapping; + struct address_space *mapping = state->disk->part0->bd_inode->i_mapping; struct page *page; - if (n >= get_capacity(state->bdev->bd_disk)) { + if (n >= get_capacity(state->disk)) { state->access_beyond_eod = true; return NULL; } diff --git a/block/partitions/efi.c b/block/partitions/efi.c index e2716792ecc1..aaa3dc487cb5 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -124,19 +124,17 @@ efi_crc32(const void *buf, unsigned long len) /** * last_lba(): return number of last logical block of device - * @bdev: block device + * @disk: block device * * Description: Returns last LBA value on success, 0 on error. * This is stored (by sd and ide-geometry) in * the part[0] entry for this disk, and is the number of * physical sectors available on the disk. */ -static u64 last_lba(struct block_device *bdev) +static u64 last_lba(struct gendisk *disk) { - if (!bdev || !bdev->bd_inode) - return 0; - return div_u64(bdev->bd_inode->i_size, - bdev_logical_block_size(bdev)) - 1ULL; + return div_u64(disk->part0->bd_inode->i_size, + queue_logical_block_size(disk->queue)) - 1ULL; } static inline int pmbr_part_valid(gpt_mbr_record *part) @@ -231,17 +229,17 @@ done: * @buffer: destination buffer * @count: bytes to read * - * Description: Reads @count bytes from @state->bdev into @buffer. + * Description: Reads @count bytes from @state->disk into @buffer. * Returns number of bytes read on success, 0 on error. */ static size_t read_lba(struct parsed_partitions *state, u64 lba, u8 *buffer, size_t count) { size_t totalreadcount = 0; - struct block_device *bdev = state->bdev; - sector_t n = lba * (bdev_logical_block_size(bdev) / 512); + sector_t n = lba * + (queue_logical_block_size(state->disk->queue) / 512); - if (!buffer || lba > last_lba(bdev)) + if (!buffer || lba > last_lba(state->disk)) return 0; while (count) { @@ -302,14 +300,14 @@ static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state, * @lba: the Logical Block Address of the partition table * * Description: returns GPT header on success, NULL on error. Allocates - * and fills a GPT header starting at @ from @state->bdev. + * and fills a GPT header starting at @ from @state->disk. * Note: remember to free gpt when finished with it. */ static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state, u64 lba) { gpt_header *gpt; - unsigned ssz = bdev_logical_block_size(state->bdev); + unsigned ssz = queue_logical_block_size(state->disk->queue); gpt = kmalloc(ssz, GFP_KERNEL); if (!gpt) @@ -356,10 +354,10 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba, /* Check the GUID Partition Table header size is too big */ if (le32_to_cpu((*gpt)->header_size) > - bdev_logical_block_size(state->bdev)) { + queue_logical_block_size(state->disk->queue)) { pr_debug("GUID Partition Table Header size is too large: %u > %u\n", le32_to_cpu((*gpt)->header_size), - bdev_logical_block_size(state->bdev)); + queue_logical_block_size(state->disk->queue)); goto fail; } @@ -395,7 +393,7 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba, /* Check the first_usable_lba and last_usable_lba are * within the disk. */ - lastlba = last_lba(state->bdev); + lastlba = last_lba(state->disk); if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) { pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n", (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba), @@ -587,13 +585,13 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, gpt_header *pgpt = NULL, *agpt = NULL; gpt_entry *pptes = NULL, *aptes = NULL; legacy_mbr *legacymbr; - sector_t total_sectors = i_size_read(state->bdev->bd_inode) >> 9; + sector_t total_sectors = get_capacity(state->disk); u64 lastlba; if (!ptes) return 0; - lastlba = last_lba(state->bdev); + lastlba = last_lba(state->disk); if (!force_gpt) { /* This will be added to the EFI Spec. per Intel after v1.02. */ legacymbr = kzalloc(sizeof(*legacymbr), GFP_KERNEL); @@ -705,7 +703,7 @@ int efi_partition(struct parsed_partitions *state) gpt_header *gpt = NULL; gpt_entry *ptes = NULL; u32 i; - unsigned ssz = bdev_logical_block_size(state->bdev) / 512; + unsigned ssz = queue_logical_block_size(state->disk->queue) / 512; if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) { kfree(gpt); @@ -722,7 +720,7 @@ int efi_partition(struct parsed_partitions *state) u64 size = le64_to_cpu(ptes[i].ending_lba) - le64_to_cpu(ptes[i].starting_lba) + 1ULL; - if (!is_pte_valid(&ptes[i], last_lba(state->bdev))) + if (!is_pte_valid(&ptes[i], last_lba(state->disk))) continue; put_partition(state, i+1, start * ssz, size * ssz); diff --git a/block/partitions/ibm.c b/block/partitions/ibm.c index 4b044e620d35..9bca396aef4a 100644 --- a/block/partitions/ibm.c +++ b/block/partitions/ibm.c @@ -290,8 +290,8 @@ static int find_cms1_partitions(struct parsed_partitions *state, int ibm_partition(struct parsed_partitions *state) { int (*fn)(struct gendisk *disk, dasd_information2_t *info); - struct block_device *bdev = state->bdev; - struct gendisk *disk = bdev->bd_disk; + struct gendisk *disk = state->disk; + struct block_device *bdev = disk->part0; int blocksize, res; loff_t i_size, offset, size; dasd_information2_t *info; diff --git a/block/partitions/ldm.c b/block/partitions/ldm.c index cc86534c80ad..a6f0c9eaebe9 100644 --- a/block/partitions/ldm.c +++ b/block/partitions/ldm.c @@ -304,7 +304,7 @@ static bool ldm_validate_privheads(struct parsed_partitions *state, } } - num_sects = state->bdev->bd_inode->i_size >> 9; + num_sects = get_capacity(state->disk); if ((ph[0]->config_start > num_sects) || ((ph[0]->config_start + ph[0]->config_size) > num_sects)) { @@ -339,11 +339,11 @@ out: /** * ldm_validate_tocblocks - Validate the table of contents and its backups * @state: Partition check state including device holding the LDM Database - * @base: Offset, into @state->bdev, of the database + * @base: Offset, into @state->disk, of the database * @ldb: Cache of the database structures * * Find and compare the four tables of contents of the LDM Database stored on - * @state->bdev and return the parsed information into @toc1. + * @state->disk and return the parsed information into @toc1. * * The offsets and sizes of the configs are range-checked against a privhead. * @@ -486,8 +486,8 @@ out: * only likely to happen if the underlying device is strange. If that IS * the case we should return zero to let someone else try. * - * Return: 'true' @state->bdev is a dynamic disk - * 'false' @state->bdev is not a dynamic disk, or an error occurred + * Return: 'true' @state->disk is a dynamic disk + * 'false' @state->disk is not a dynamic disk, or an error occurred */ static bool ldm_validate_partition_table(struct parsed_partitions *state) { @@ -1340,7 +1340,7 @@ static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb) /** * ldm_get_vblks - Read the on-disk database of VBLKs into memory * @state: Partition check state including device holding the LDM Database - * @base: Offset, into @state->bdev, of the database + * @base: Offset, into @state->disk, of the database * @ldb: Cache of the database structures * * To use the information from the VBLKs, they need to be read from the disk, @@ -1432,10 +1432,10 @@ static void ldm_free_vblks (struct list_head *lh) * example, if the device is hda, we would have: hda1: LDM database, hda2, hda3, * and so on: the actual data containing partitions. * - * Return: 1 Success, @state->bdev is a dynamic disk and we handled it - * 0 Success, @state->bdev is not a dynamic disk + * Return: 1 Success, @state->disk is a dynamic disk and we handled it + * 0 Success, @state->disk is not a dynamic disk * -1 An error occurred before enough information had been read - * Or @state->bdev is a dynamic disk, but it may be corrupted + * Or @state->disk is a dynamic disk, but it may be corrupted */ int ldm_partition(struct parsed_partitions *state) { diff --git a/block/partitions/mac.c b/block/partitions/mac.c index b6095335636c..7b521df00a39 100644 --- a/block/partitions/mac.c +++ b/block/partitions/mac.c @@ -133,7 +133,7 @@ int mac_partition(struct parsed_partitions *state) } #ifdef CONFIG_PPC_PMAC if (found_root_goodness) - note_bootable_part(state->bdev->bd_dev, found_root, + note_bootable_part(state->disk->part0->bd_dev, found_root, found_root_goodness); #endif diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c index f5102596a984..b5d5c229cc3b 100644 --- a/block/partitions/msdos.c +++ b/block/partitions/msdos.c @@ -135,11 +135,12 @@ static void parse_extended(struct parsed_partitions *state, Sector sect; unsigned char *data; sector_t this_sector, this_size; - sector_t sector_size = bdev_logical_block_size(state->bdev) / 512; + sector_t sector_size; int loopct = 0; /* number of links followed without finding a data partition */ int i; + sector_size = queue_logical_block_size(state->disk->queue) / 512; this_sector = first_sector; this_size = first_size; @@ -579,7 +580,7 @@ static struct { int msdos_partition(struct parsed_partitions *state) { - sector_t sector_size = bdev_logical_block_size(state->bdev) / 512; + sector_t sector_size; Sector sect; unsigned char *data; struct msdos_partition *p; @@ -587,6 +588,7 @@ int msdos_partition(struct parsed_partitions *state) int slot; u32 disksig; + sector_size = queue_logical_block_size(state->disk->queue) / 512; data = read_part_sector(state, 0, §); if (!data) return -1; diff --git a/block/partitions/sgi.c b/block/partitions/sgi.c index 4273f1bb0515..9cc6b8c1eea4 100644 --- a/block/partitions/sgi.c +++ b/block/partitions/sgi.c @@ -43,7 +43,6 @@ int sgi_partition(struct parsed_partitions *state) Sector sect; struct sgi_disklabel *label; struct sgi_partition *p; - char b[BDEVNAME_SIZE]; label = read_part_sector(state, 0, §); if (!label) @@ -52,7 +51,7 @@ int sgi_partition(struct parsed_partitions *state) magic = label->magic_mushroom; if(be32_to_cpu(magic) != SGI_LABEL_MAGIC) { /*printk("Dev %s SGI disklabel: bad magic %08x\n", - bdevname(bdev, b), be32_to_cpu(magic));*/ + state->disk->disk_name, be32_to_cpu(magic));*/ put_dev_sector(sect); return 0; } @@ -63,7 +62,7 @@ int sgi_partition(struct parsed_partitions *state) } if(csum) { printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n", - bdevname(state->bdev, b)); + state->disk->disk_name); put_dev_sector(sect); return 0; } diff --git a/block/partitions/sun.c b/block/partitions/sun.c index 47dc53eccf77..ddf9e6def4b2 100644 --- a/block/partitions/sun.c +++ b/block/partitions/sun.c @@ -65,7 +65,6 @@ int sun_partition(struct parsed_partitions *state) } * label; struct sun_partition *p; unsigned long spc; - char b[BDEVNAME_SIZE]; int use_vtoc; int nparts; @@ -76,7 +75,7 @@ int sun_partition(struct parsed_partitions *state) p = label->partitions; if (be16_to_cpu(label->magic) != SUN_LABEL_MAGIC) { /* printk(KERN_INFO "Dev %s Sun disklabel: bad magic %04x\n", - bdevname(bdev, b), be16_to_cpu(label->magic)); */ + state->disk->disk_name, be16_to_cpu(label->magic)); */ put_dev_sector(sect); return 0; } @@ -86,7 +85,7 @@ int sun_partition(struct parsed_partitions *state) csum ^= *ush--; if (csum) { printk("Dev %s Sun disklabel: Csum bad, label corrupted\n", - bdevname(state->bdev, b)); + state->disk->disk_name); put_dev_sector(sect); return 0; } From 7f6be3765e113e0d4b8e6b65e1074982de94377e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021 17:45:10 +0200 Subject: [PATCH 065/129] block: pass a gendisk to bdev_add_partition bdev_add_partition can only operate on the whole device. Make that clear by passing a gendisk instead of a block_device. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210810154512.1809898-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk.h | 4 ++-- block/ioctl.c | 3 ++- block/partitions/core.c | 5 ++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/block/blk.h b/block/blk.h index 56f33fbcde59..c0486f609978 100644 --- a/block/blk.h +++ b/block/blk.h @@ -347,8 +347,8 @@ void blk_free_ext_minor(unsigned int minor); #define ADDPART_FLAG_NONE 0 #define ADDPART_FLAG_RAID 1 #define ADDPART_FLAG_WHOLEDISK 2 -int bdev_add_partition(struct block_device *bdev, int partno, - sector_t start, sector_t length); +int bdev_add_partition(struct gendisk *disk, int partno, sector_t start, + sector_t length); int bdev_del_partition(struct block_device *bdev, int partno); int bdev_resize_partition(struct block_device *bdev, int partno, sector_t start, sector_t length); diff --git a/block/ioctl.c b/block/ioctl.c index fff161eaab42..36e0ec76b3b2 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -16,6 +16,7 @@ static int blkpg_do_ioctl(struct block_device *bdev, struct blkpg_partition __user *upart, int op) { + struct gendisk *disk = bdev->bd_disk; struct blkpg_partition p; long long start, length; @@ -40,7 +41,7 @@ static int blkpg_do_ioctl(struct block_device *bdev, /* check if partition is aligned to blocksize */ if (p.start & (bdev_logical_block_size(bdev) - 1)) return -EINVAL; - return bdev_add_partition(bdev, p.pno, start, length); + return bdev_add_partition(disk, p.pno, start, length); case BLKPG_RESIZE_PARTITION: return bdev_resize_partition(bdev, p.pno, start, length); default: diff --git a/block/partitions/core.c b/block/partitions/core.c index 5dd1cd1a163d..7b227c114297 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -451,11 +451,10 @@ static bool partition_overlaps(struct gendisk *disk, sector_t start, return overlap; } -int bdev_add_partition(struct block_device *bdev, int partno, - sector_t start, sector_t length) +int bdev_add_partition(struct gendisk *disk, int partno, sector_t start, + sector_t length) { struct block_device *part; - struct gendisk *disk = bdev->bd_disk; int ret; mutex_lock(&disk->open_mutex); From 926fbb1677e0d963dd96dae3c0305e855590d524 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021 17:45:11 +0200 Subject: [PATCH 066/129] block: pass a gendisk to bdev_del_partition bdev_del_partition can only operate on the whole device. Make that clear by passing a gendisk instead of a block_device. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210810154512.1809898-4-hch@lst.de Signed-off-by: Jens Axboe --- block/blk.h | 2 +- block/ioctl.c | 2 +- block/partitions/core.c | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/block/blk.h b/block/blk.h index c0486f609978..21c441eb6773 100644 --- a/block/blk.h +++ b/block/blk.h @@ -349,7 +349,7 @@ void blk_free_ext_minor(unsigned int minor); #define ADDPART_FLAG_WHOLEDISK 2 int bdev_add_partition(struct gendisk *disk, int partno, sector_t start, sector_t length); -int bdev_del_partition(struct block_device *bdev, int partno); +int bdev_del_partition(struct gendisk *disk, int partno); int bdev_resize_partition(struct block_device *bdev, int partno, sector_t start, sector_t length); diff --git a/block/ioctl.c b/block/ioctl.c index 36e0ec76b3b2..8f57b276b2f1 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -31,7 +31,7 @@ static int blkpg_do_ioctl(struct block_device *bdev, return -EINVAL; if (op == BLKPG_DEL_PARTITION) - return bdev_del_partition(bdev, p.pno); + return bdev_del_partition(disk, p.pno); start = p.start >> SECTOR_SHIFT; length = p.length >> SECTOR_SHIFT; diff --git a/block/partitions/core.c b/block/partitions/core.c index 7b227c114297..8c7abf0ee0ea 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -476,13 +476,13 @@ out: return ret; } -int bdev_del_partition(struct block_device *bdev, int partno) +int bdev_del_partition(struct gendisk *disk, int partno) { struct block_device *part = NULL; int ret = -ENXIO; - mutex_lock(&bdev->bd_disk->open_mutex); - part = xa_load(&bdev->bd_disk->part_tbl, partno); + mutex_lock(&disk->open_mutex); + part = xa_load(&disk->part_tbl, partno); if (!part) goto out_unlock; @@ -493,7 +493,7 @@ int bdev_del_partition(struct block_device *bdev, int partno) delete_partition(part); ret = 0; out_unlock: - mutex_unlock(&bdev->bd_disk->open_mutex); + mutex_unlock(&disk->open_mutex); return ret; } From 3d2e79894bd7adc7d14638a0c72ceb8b722d1fa3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021 17:45:12 +0200 Subject: [PATCH 067/129] block: pass a gendisk to bdev_resize_partition bdev_resize_partition can only operate on the whole device. Make that clear by passing a gendisk instead of a block_device. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210810154512.1809898-5-hch@lst.de Signed-off-by: Jens Axboe --- block/blk.h | 4 ++-- block/ioctl.c | 2 +- block/partitions/core.c | 12 ++++++------ 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/block/blk.h b/block/blk.h index 21c441eb6773..db6f82bbb683 100644 --- a/block/blk.h +++ b/block/blk.h @@ -350,8 +350,8 @@ void blk_free_ext_minor(unsigned int minor); int bdev_add_partition(struct gendisk *disk, int partno, sector_t start, sector_t length); int bdev_del_partition(struct gendisk *disk, int partno); -int bdev_resize_partition(struct block_device *bdev, int partno, - sector_t start, sector_t length); +int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start, + sector_t length); int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, diff --git a/block/ioctl.c b/block/ioctl.c index 8f57b276b2f1..eb0491e90b9a 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -43,7 +43,7 @@ static int blkpg_do_ioctl(struct block_device *bdev, return -EINVAL; return bdev_add_partition(disk, p.pno, start, length); case BLKPG_RESIZE_PARTITION: - return bdev_resize_partition(bdev, p.pno, start, length); + return bdev_resize_partition(disk, p.pno, start, length); default: return -EINVAL; } diff --git a/block/partitions/core.c b/block/partitions/core.c index 8c7abf0ee0ea..9265936df77e 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -497,14 +497,14 @@ out_unlock: return ret; } -int bdev_resize_partition(struct block_device *bdev, int partno, - sector_t start, sector_t length) +int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start, + sector_t length) { struct block_device *part = NULL; int ret = -ENXIO; - mutex_lock(&bdev->bd_disk->open_mutex); - part = xa_load(&bdev->bd_disk->part_tbl, partno); + mutex_lock(&disk->open_mutex); + part = xa_load(&disk->part_tbl, partno); if (!part) goto out_unlock; @@ -513,14 +513,14 @@ int bdev_resize_partition(struct block_device *bdev, int partno, goto out_unlock; ret = -EBUSY; - if (partition_overlaps(bdev->bd_disk, start, length, partno)) + if (partition_overlaps(disk, start, length, partno)) goto out_unlock; bdev_set_nr_sectors(part, length); ret = 0; out_unlock: - mutex_unlock(&bdev->bd_disk->open_mutex); + mutex_unlock(&disk->open_mutex); return ret; } From 4f1e9630afe6332de7286820fedd019f19eac057 Mon Sep 17 00:00:00 2001 From: Chunguang Xu Date: Mon, 2 Aug 2021 11:51:56 +0800 Subject: [PATCH 068/129] blk-throtl: optimize IOPS throttle for large IO scenarios After patch 54efd50 (block: make generic_make_request handle arbitrarily sized bios), the IO through io-throttle may be larger, and these IOs may be further split into more small IOs. However, IOPS throttle does not seem to be aware of this change, which makes the calculation of IOPS of large IOs incomplete, resulting in disk-side IOPS that does not meet expectations. Maybe we should fix this problem. We can reproduce it by set max_sectors_kb of disk to 128, set blkio.write_iops_throttle to 100, run a dd instance inside blkio and use iostat to watch IOPS: dd if=/dev/zero of=/dev/sdb bs=1M count=1000 oflag=direct As a result, without this change the average IOPS is 1995, with this change the IOPS is 98. Signed-off-by: Chunguang Xu Acked-by: Tejun Heo Link: https://lore.kernel.org/r/65869aaad05475797d63b4c3fed4f529febe3c26.1627876014.git.brookxu@tencent.com Signed-off-by: Jens Axboe --- block/blk-merge.c | 2 ++ block/blk-throttle.c | 32 ++++++++++++++++++++++++++++++++ block/blk.h | 2 ++ 3 files changed, 36 insertions(+) diff --git a/block/blk-merge.c b/block/blk-merge.c index f8707ff7e2fc..eeba8422ae82 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -348,6 +348,8 @@ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs) trace_block_split(split, (*bio)->bi_iter.bi_sector); submit_bio_noacct(*bio); *bio = split; + + blk_throtl_charge_bio_split(*bio); } } diff --git a/block/blk-throttle.c b/block/blk-throttle.c index b1b22d863bdf..55c49015e533 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -178,6 +178,9 @@ struct throtl_grp { unsigned int bad_bio_cnt; /* bios exceeding latency threshold */ unsigned long bio_cnt_reset_time; + atomic_t io_split_cnt[2]; + atomic_t last_io_split_cnt[2]; + struct blkg_rwstat stat_bytes; struct blkg_rwstat stat_ios; }; @@ -777,6 +780,8 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, tg->bytes_disp[rw] = 0; tg->io_disp[rw] = 0; + atomic_set(&tg->io_split_cnt[rw], 0); + /* * Previous slice has expired. We must have trimmed it after last * bio dispatch. That means since start of last slice, we never used @@ -799,6 +804,9 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw) tg->io_disp[rw] = 0; tg->slice_start[rw] = jiffies; tg->slice_end[rw] = jiffies + tg->td->throtl_slice; + + atomic_set(&tg->io_split_cnt[rw], 0); + throtl_log(&tg->service_queue, "[%c] new slice start=%lu end=%lu jiffies=%lu", rw == READ ? 'R' : 'W', tg->slice_start[rw], @@ -1031,6 +1039,9 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, jiffies + tg->td->throtl_slice); } + if (iops_limit != UINT_MAX) + tg->io_disp[rw] += atomic_xchg(&tg->io_split_cnt[rw], 0); + if (tg_with_in_bps_limit(tg, bio, bps_limit, &bps_wait) && tg_with_in_iops_limit(tg, bio, iops_limit, &iops_wait)) { if (wait) @@ -2052,12 +2063,14 @@ static void throtl_downgrade_check(struct throtl_grp *tg) } if (tg->iops[READ][LIMIT_LOW]) { + tg->last_io_disp[READ] += atomic_xchg(&tg->last_io_split_cnt[READ], 0); iops = tg->last_io_disp[READ] * HZ / elapsed_time; if (iops >= tg->iops[READ][LIMIT_LOW]) tg->last_low_overflow_time[READ] = now; } if (tg->iops[WRITE][LIMIT_LOW]) { + tg->last_io_disp[WRITE] += atomic_xchg(&tg->last_io_split_cnt[WRITE], 0); iops = tg->last_io_disp[WRITE] * HZ / elapsed_time; if (iops >= tg->iops[WRITE][LIMIT_LOW]) tg->last_low_overflow_time[WRITE] = now; @@ -2176,6 +2189,25 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td) } #endif +void blk_throtl_charge_bio_split(struct bio *bio) +{ + struct blkcg_gq *blkg = bio->bi_blkg; + struct throtl_grp *parent = blkg_to_tg(blkg); + struct throtl_service_queue *parent_sq; + bool rw = bio_data_dir(bio); + + do { + if (!parent->has_rules[rw]) + break; + + atomic_inc(&parent->io_split_cnt[rw]); + atomic_inc(&parent->last_io_split_cnt[rw]); + + parent_sq = parent->service_queue.parent_sq; + parent = sq_to_tg(parent_sq); + } while (parent); +} + bool blk_throtl_bio(struct bio *bio) { struct request_queue *q = bio->bi_bdev->bd_disk->queue; diff --git a/block/blk.h b/block/blk.h index db6f82bbb683..148bdcd3aa08 100644 --- a/block/blk.h +++ b/block/blk.h @@ -293,11 +293,13 @@ int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node); extern int blk_throtl_init(struct request_queue *q); extern void blk_throtl_exit(struct request_queue *q); extern void blk_throtl_register_queue(struct request_queue *q); +extern void blk_throtl_charge_bio_split(struct bio *bio); bool blk_throtl_bio(struct bio *bio); #else /* CONFIG_BLK_DEV_THROTTLING */ static inline int blk_throtl_init(struct request_queue *q) { return 0; } static inline void blk_throtl_exit(struct request_queue *q) { } static inline void blk_throtl_register_queue(struct request_queue *q) { } +static inline void blk_throtl_charge_bio_split(struct bio *bio) { } static inline bool blk_throtl_bio(struct bio *bio) { return false; } #endif /* CONFIG_BLK_DEV_THROTTLING */ #ifdef CONFIG_BLK_DEV_THROTTLING_LOW From 9451aa0aacaf7ea13d1acfd5de8b63a6e0b24fac Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Aug 2021 14:26:13 +0200 Subject: [PATCH 069/129] block: free the extended dev_t minor later The dev_t is used as the inode hash, so we should only released it once then block device inode is gone from the inode cache. Move it to bdev_free_inode to ensure that. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210816122614.601358-2-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 2 -- block/partitions/core.c | 2 -- fs/block_dev.c | 5 +++++ 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 9d6b3aeea288..ed58ddf6258b 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1085,8 +1085,6 @@ static void disk_release(struct device *dev) might_sleep(); bdi_put(disk->bdi); - if (MAJOR(dev->devt) == BLOCK_EXT_MAJOR) - blk_free_ext_minor(MINOR(dev->devt)); disk_release_events(disk); kfree(disk->random); xa_destroy(&disk->part_tbl); diff --git a/block/partitions/core.c b/block/partitions/core.c index 9265936df77e..58c4c362c94f 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -259,8 +259,6 @@ static const struct attribute_group *part_attr_groups[] = { static void part_release(struct device *dev) { - if (MAJOR(dev->devt) == BLOCK_EXT_MAJOR) - blk_free_ext_minor(MINOR(dev->devt)); put_disk(dev_to_bdev(dev)->bd_disk); iput(dev_to_bdev(dev)->bd_inode); } diff --git a/fs/block_dev.c b/fs/block_dev.c index 38a8b0e04a0c..4bd2a632c79c 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -35,6 +35,7 @@ #include #include #include "internal.h" +#include "../block/blk.h" struct bdev_inode { struct block_device bdev; @@ -813,6 +814,10 @@ static void bdev_free_inode(struct inode *inode) if (!bdev_is_partition(bdev)) kfree(bdev->bd_disk); + + if (MAJOR(bdev->bd_dev) == BLOCK_EXT_MAJOR) + blk_free_ext_minor(MINOR(bdev->bd_dev)); + kmem_cache_free(bdev_cachep, BDEV_I(inode)); } From 889c05cc5834a1eef2dbe1e639cfd7a81c4f4c6d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Aug 2021 14:26:14 +0200 Subject: [PATCH 070/129] block: ensure the bdi is freed after inode_detach_wb inode_detach_wb references the "main" bdi of the inode. With the recent change to move the bdi from the request_queue to the gendisk this causes a guaranteed use after free when using certain cgroup configurations. The big itself is older through as any non-default inode reference (e.g. an open file descriptor) could have injected this use after free even before that. Fixes: 52ebea749aae ("writeback: make backing_dev_info host cgroup-specific bdi_writebacks") Reported-by: Qian Cai Reported-by: syzbot Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210816122614.601358-3-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 1 - fs/block_dev.c | 7 ++++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index ed58ddf6258b..731a46063132 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1084,7 +1084,6 @@ static void disk_release(struct device *dev) might_sleep(); - bdi_put(disk->bdi); disk_release_events(disk); kfree(disk->random); xa_destroy(&disk->part_tbl); diff --git a/fs/block_dev.c b/fs/block_dev.c index 4bd2a632c79c..d3a8062302a0 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -812,8 +812,11 @@ static void bdev_free_inode(struct inode *inode) free_percpu(bdev->bd_stats); kfree(bdev->bd_meta_info); - if (!bdev_is_partition(bdev)) + if (!bdev_is_partition(bdev)) { + if (bdev->bd_disk && bdev->bd_disk->bdi) + bdi_put(bdev->bd_disk->bdi); kfree(bdev->bd_disk); + } if (MAJOR(bdev->bd_dev) == BLOCK_EXT_MAJOR) blk_free_ext_minor(MINOR(bdev->bd_dev)); @@ -833,8 +836,6 @@ static void bdev_evict_inode(struct inode *inode) truncate_inode_pages_final(&inode->i_data); invalidate_inode_buffers(inode); /* is it needed here? */ clear_inode(inode); - /* Detach inode from wb early as bdi_put() may free bdi->wb */ - inode_detach_wb(inode); } static const struct super_operations bdev_sops = { From 1113f0b69c6a98ff4e733c306a6658a31f8cbc49 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:56:20 +0200 Subject: [PATCH 071/129] bvec: add a bvec_virt helper Add a helper to get the virtual address for a bvec. This avoids that all callers need to know about the page + offset representation. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20210804095634.460779-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bvec.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/include/linux/bvec.h b/include/linux/bvec.h index f9fa43b940ff..0e9bdd42dafb 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -229,4 +229,16 @@ static inline void memzero_bvec(struct bio_vec *bvec) memzero_page(bvec->bv_page, bvec->bv_offset, bvec->bv_len); } +/** + * bvec_virt - return the virtual address for a bvec + * @bvec: bvec to return the virtual address for + * + * Note: the caller must ensure that @bvec->bv_page is not a highmem page. + */ +static inline void *bvec_virt(struct bio_vec *bvec) +{ + WARN_ON_ONCE(PageHighMem(bvec->bv_page)); + return page_address(bvec->bv_page) + bvec->bv_offset; +} + #endif /* __LINUX_BVEC_H */ From b93ef45350c0119ddc275601438c89231b198414 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:56:21 +0200 Subject: [PATCH 072/129] block: use bvec_virt in bio_integrity_{process,free} Use the bvec_virt helper to clean up the bio integrity processing a little bit. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Acked-by: Martin K. Petersen Link: https://lore.kernel.org/r/20210804095634.460779-3-hch@lst.de Signed-off-by: Jens Axboe --- block/bio-integrity.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 8f54d49dc500..6b47cddbbca1 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -104,8 +104,7 @@ void bio_integrity_free(struct bio *bio) struct bio_set *bs = bio->bi_pool; if (bip->bip_flags & BIP_BLOCK_INTEGRITY) - kfree(page_address(bip->bip_vec->bv_page) + - bip->bip_vec->bv_offset); + kfree(bvec_virt(bip->bip_vec)); __bio_integrity_free(bs, bip); bio->bi_integrity = NULL; @@ -163,13 +162,11 @@ static blk_status_t bio_integrity_process(struct bio *bio, struct bio_vec bv; struct bio_integrity_payload *bip = bio_integrity(bio); blk_status_t ret = BLK_STS_OK; - void *prot_buf = page_address(bip->bip_vec->bv_page) + - bip->bip_vec->bv_offset; iter.disk_name = bio->bi_bdev->bd_disk->disk_name; iter.interval = 1 << bi->interval_exp; iter.seed = proc_iter->bi_sector; - iter.prot_buf = prot_buf; + iter.prot_buf = bvec_virt(bip->bip_vec); __bio_for_each_segment(bv, bio, bviter, *proc_iter) { void *kaddr = bvec_kmap_local(&bv); From 1c277e501334238f6c4f57d16d14e7c911550075 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:56:22 +0200 Subject: [PATCH 073/129] dm: make EBS depend on !HIGHMEM __ebs_rw_bvec use page_address on the submitted bios data, and thus can't deal with highmem. Disable the target on highmem configs. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210804095634.460779-4-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index f821dae101a9..f45fb372e51b 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -342,7 +342,7 @@ config DM_WRITECACHE config DM_EBS tristate "Emulated block size target (EXPERIMENTAL)" - depends on BLK_DEV_DM + depends on BLK_DEV_DM && !HIGHMEM select DM_BUFIO help dm-ebs emulates smaller logical block size on backing devices From 3a8ba33bd71a4126b9e799e8d29d6d5da08c93f0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:56:23 +0200 Subject: [PATCH 074/129] dm-ebs: use bvec_virt Use bvec_virt instead of open coding it. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210804095634.460779-5-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/dm-ebs-target.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c index 71475a2410be..0c509dae0ff8 100644 --- a/drivers/md/dm-ebs-target.c +++ b/drivers/md/dm-ebs-target.c @@ -74,7 +74,7 @@ static int __ebs_rw_bvec(struct ebs_c *ec, int rw, struct bio_vec *bv, struct bv if (unlikely(!bv->bv_page || !bv_len)) return -EIO; - pa = page_address(bv->bv_page) + bv->bv_offset; + pa = bvec_virt(bv); /* Handle overlapping page <-> blocks */ while (bv_len) { From 964cacfdd34cd48e3b5b714c3cc33427001e843f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:56:24 +0200 Subject: [PATCH 075/129] dm-integrity: use bvec_virt Use bvec_virt instead of open coding it. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210804095634.460779-6-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/dm-integrity.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 20f2510db1f6..a9ea361769a7 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -1819,7 +1819,7 @@ again: unsigned this_len; BUG_ON(PageHighMem(biv.bv_page)); - tag = lowmem_page_address(biv.bv_page) + biv.bv_offset; + tag = bvec_virt(&biv); this_len = min(biv.bv_len, data_to_process); r = dm_integrity_rw_tag(ic, tag, &dio->metadata_block, &dio->metadata_offset, this_len, dio->op == REQ_OP_READ ? TAG_READ : TAG_WRITE); @@ -2006,7 +2006,7 @@ retry_kmap: unsigned tag_now = min(biv.bv_len, tag_todo); char *tag_addr; BUG_ON(PageHighMem(biv.bv_page)); - tag_addr = lowmem_page_address(biv.bv_page) + biv.bv_offset; + tag_addr = bvec_virt(&biv); if (likely(dio->op == REQ_OP_WRITE)) memcpy(tag_ptr, tag_addr, tag_now); else From fbc27241e537d3a99d0f843a4080e1d2fb014fb4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:56:25 +0200 Subject: [PATCH 076/129] squashfs: use bvec_virt Use bvec_virt instead of open coding it. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210804095634.460779-7-hch@lst.de Signed-off-by: Jens Axboe --- fs/squashfs/block.c | 7 +++---- fs/squashfs/lz4_wrapper.c | 2 +- fs/squashfs/lzo_wrapper.c | 2 +- fs/squashfs/xz_wrapper.c | 2 +- fs/squashfs/zlib_wrapper.c | 2 +- fs/squashfs/zstd_wrapper.c | 2 +- 6 files changed, 8 insertions(+), 9 deletions(-) diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 855f0e87066d..2db8bcf7ff85 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -49,8 +49,7 @@ static int copy_bio_to_actor(struct bio *bio, bytes_to_copy = min_t(int, bytes_to_copy, req_length - copied_bytes); - memcpy(actor_addr + actor_offset, - page_address(bvec->bv_page) + bvec->bv_offset + offset, + memcpy(actor_addr + actor_offset, bvec_virt(bvec) + offset, bytes_to_copy); actor_offset += bytes_to_copy; @@ -177,7 +176,7 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length, goto out_free_bio; } /* Extract the length of the metadata block */ - data = page_address(bvec->bv_page) + bvec->bv_offset; + data = bvec_virt(bvec); length = data[offset]; if (offset < bvec->bv_len - 1) { length |= data[offset + 1] << 8; @@ -186,7 +185,7 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length, res = -EIO; goto out_free_bio; } - data = page_address(bvec->bv_page) + bvec->bv_offset; + data = bvec_virt(bvec); length |= data[0] << 8; } bio_free_pages(bio); diff --git a/fs/squashfs/lz4_wrapper.c b/fs/squashfs/lz4_wrapper.c index 233d5582fbee..b685b6238316 100644 --- a/fs/squashfs/lz4_wrapper.c +++ b/fs/squashfs/lz4_wrapper.c @@ -101,7 +101,7 @@ static int lz4_uncompress(struct squashfs_sb_info *msblk, void *strm, while (bio_next_segment(bio, &iter_all)) { int avail = min(bytes, ((int)bvec->bv_len) - offset); - data = page_address(bvec->bv_page) + bvec->bv_offset; + data = bvec_virt(bvec); memcpy(buff, data + offset, avail); buff += avail; bytes -= avail; diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c index 97bb7d92ddcd..cb510a631968 100644 --- a/fs/squashfs/lzo_wrapper.c +++ b/fs/squashfs/lzo_wrapper.c @@ -76,7 +76,7 @@ static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm, while (bio_next_segment(bio, &iter_all)) { int avail = min(bytes, ((int)bvec->bv_len) - offset); - data = page_address(bvec->bv_page) + bvec->bv_offset; + data = bvec_virt(bvec); memcpy(buff, data + offset, avail); buff += avail; bytes -= avail; diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c index e80419aed862..68f6d09bb3a2 100644 --- a/fs/squashfs/xz_wrapper.c +++ b/fs/squashfs/xz_wrapper.c @@ -146,7 +146,7 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, } avail = min(length, ((int)bvec->bv_len) - offset); - data = page_address(bvec->bv_page) + bvec->bv_offset; + data = bvec_virt(bvec); length -= avail; stream->buf.in = data + offset; stream->buf.in_size = avail; diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c index bcb881ec47f2..a20e9042146b 100644 --- a/fs/squashfs/zlib_wrapper.c +++ b/fs/squashfs/zlib_wrapper.c @@ -76,7 +76,7 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, } avail = min(length, ((int)bvec->bv_len) - offset); - data = page_address(bvec->bv_page) + bvec->bv_offset; + data = bvec_virt(bvec); length -= avail; stream->next_in = data + offset; stream->avail_in = avail; diff --git a/fs/squashfs/zstd_wrapper.c b/fs/squashfs/zstd_wrapper.c index b7cb1faa652d..0015cf8b5582 100644 --- a/fs/squashfs/zstd_wrapper.c +++ b/fs/squashfs/zstd_wrapper.c @@ -94,7 +94,7 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm, } avail = min(length, ((int)bvec->bv_len) - offset); - data = page_address(bvec->bv_page) + bvec->bv_offset; + data = bvec_virt(bvec); length -= avail; in_buf.src = data + offset; in_buf.size = avail; From cf58b537781df6eee2bbeae0463e45acf727978a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:56:26 +0200 Subject: [PATCH 077/129] rbd: use bvec_virt Use bvec_virt instead of open coding it. Signed-off-by: Christoph Hellwig Reviewed-by: Jeff Layton Link: https://lore.kernel.org/r/20210804095634.460779-8-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/rbd.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 6d596c2c2cd6..e65c9d706f6f 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2986,8 +2986,7 @@ static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes) }; ceph_bvec_iter_advance_step(&it, bytes, ({ - if (memchr_inv(page_address(bv.bv_page) + bv.bv_offset, 0, - bv.bv_len)) + if (memchr_inv(bvec_virt(&bv), 0, bv.bv_len)) return false; })); return true; From 358b348b9197b977276e0f034c474380565879e3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:56:27 +0200 Subject: [PATCH 078/129] virtio_blk: use bvec_virt Use bvec_virt instead of open coding it. Signed-off-by: Christoph Hellwig Reviewed-by: Stefan Hajnoczi Link: https://lore.kernel.org/r/20210804095634.460779-9-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/virtio_blk.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 4b49df2dfd23..767b4f72a54d 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -166,11 +166,8 @@ static inline void virtblk_request_done(struct request *req) { struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); - if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { - kfree(page_address(req->special_vec.bv_page) + - req->special_vec.bv_offset); - } - + if (req->rq_flags & RQF_SPECIAL_PAYLOAD) + kfree(bvec_virt(&req->special_vec)); blk_mq_end_request(req, virtblk_result(vbr)); } From 2fd3e5efe791946be0957c8e1eed9560b541fe46 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:56:28 +0200 Subject: [PATCH 079/129] bcache: use bvec_virt Use bvec_virt instead of open coding it. Note that the existing code is fine despite ignoring bv_offset as the bio is known to contain exactly one page from the page allocator per bio_vec. Signed-off-by: Christoph Hellwig Reviewed-by: Coly Li Link: https://lore.kernel.org/r/20210804095634.460779-10-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/bcache/btree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 183a58c89377..0595559de174 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -378,7 +378,7 @@ static void do_btree_node_write(struct btree *b) struct bvec_iter_all iter_all; bio_for_each_segment_all(bv, b->bio, iter_all) { - memcpy(page_address(bv->bv_page), addr, PAGE_SIZE); + memcpy(bvec_virt(bv), addr, PAGE_SIZE); addr += PAGE_SIZE; } From c3c770563510aa66fd8e84b374daf43e236fa4ba Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:56:29 +0200 Subject: [PATCH 080/129] sd: use bvec_virt Use bvec_virt instead of open coding it. Signed-off-by: Christoph Hellwig Acked-by: Martin K. Petersen Link: https://lore.kernel.org/r/20210804095634.460779-11-hch@lst.de Signed-off-by: Jens Axboe --- drivers/scsi/sd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index b8d55af763f9..5b5b8266e142 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -886,7 +886,7 @@ static blk_status_t sd_setup_unmap_cmnd(struct scsi_cmnd *cmd) cmd->cmnd[0] = UNMAP; cmd->cmnd[8] = 24; - buf = page_address(rq->special_vec.bv_page); + buf = bvec_virt(&rq->special_vec); put_unaligned_be16(6 + 16, &buf[0]); put_unaligned_be16(16, &buf[2]); put_unaligned_be64(lba, &buf[8]); From 25d84545beaae8e9427bbd25feff309363cd0a58 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:56:30 +0200 Subject: [PATCH 081/129] ubd: use bvec_virt Use bvec_virt instead of open coding it. Signed-off-by: Christoph Hellwig Acked-By: Anton Ivanov Link: https://lore.kernel.org/r/20210804095634.460779-12-hch@lst.de Signed-off-by: Jens Axboe --- arch/um/drivers/ubd_kern.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index e497185dd393..cd9dc0556e91 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -1268,8 +1268,7 @@ static void ubd_map_req(struct ubd *dev, struct io_thread_req *io_req, rq_for_each_segment(bvec, req, iter) { BUG_ON(i >= io_req->desc_cnt); - io_req->io_desc[i].buffer = - page_address(bvec.bv_page) + bvec.bv_offset; + io_req->io_desc[i].buffer = bvec_virt(&bvec); io_req->io_desc[i].length = bvec.bv_len; i++; } From 6da525b3ecaea04eaaeb3277f6e16d91ecfdb84a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:56:31 +0200 Subject: [PATCH 082/129] ps3vram: use bvec_virt Use bvec_virt instead of open coding it. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210804095634.460779-13-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/ps3vram.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index 7fbf469651c4..c7b19e128b03 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c @@ -541,7 +541,7 @@ static struct bio *ps3vram_do_bio(struct ps3_system_bus_device *dev, bio_for_each_segment(bvec, bio, iter) { /* PS3 is ppc64, so we don't handle highmem */ - char *ptr = page_address(bvec.bv_page) + bvec.bv_offset; + char *ptr = bvec_virt(&bvec); size_t len = bvec.bv_len, retlen; dev_dbg(&dev->core, " %s %zu bytes at offset %llu\n", op, From bf5fb875b494b32ef81fdfa5530a79fc22486254 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:56:32 +0200 Subject: [PATCH 083/129] dasd: use bvec_virt Use bvec_virt instead of open coding it. Signed-off-by: Christoph Hellwig Reviewed-by: Stefan Haberland Link: https://lore.kernel.org/r/20210804095634.460779-14-hch@lst.de Signed-off-by: Jens Axboe --- drivers/s390/block/dasd_diag.c | 2 +- drivers/s390/block/dasd_eckd.c | 14 +++++++------- drivers/s390/block/dasd_fba.c | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/s390/block/dasd_diag.c b/drivers/s390/block/dasd_diag.c index 6bb775236c16..db5987281010 100644 --- a/drivers/s390/block/dasd_diag.c +++ b/drivers/s390/block/dasd_diag.c @@ -552,7 +552,7 @@ static struct dasd_ccw_req *dasd_diag_build_cp(struct dasd_device *memdev, dbio = dreq->bio; recid = first_rec; rq_for_each_segment(bv, req, iter) { - dst = page_address(bv.bv_page) + bv.bv_offset; + dst = bvec_virt(&bv); for (off = 0; off < bv.bv_len; off += blksize) { memset(dbio, 0, sizeof (struct dasd_diag_bio)); dbio->type = rw_cmd; diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index 0de1a463c509..8610ea414322 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -3267,7 +3267,7 @@ static int dasd_eckd_ese_read(struct dasd_ccw_req *cqr, struct irb *irb) end_blk = (curr_trk + 1) * recs_per_trk; rq_for_each_segment(bv, req, iter) { - dst = page_address(bv.bv_page) + bv.bv_offset; + dst = bvec_virt(&bv); for (off = 0; off < bv.bv_len; off += blksize) { if (first_blk + blk_count >= end_blk) { cqr->proc_bytes = blk_count * blksize; @@ -3999,7 +3999,7 @@ static struct dasd_ccw_req *dasd_eckd_build_cp_cmd_single( last_rec - recid + 1, cmd, basedev, blksize); } rq_for_each_segment(bv, req, iter) { - dst = page_address(bv.bv_page) + bv.bv_offset; + dst = bvec_virt(&bv); if (dasd_page_cache) { char *copy = kmem_cache_alloc(dasd_page_cache, GFP_DMA | __GFP_NOWARN); @@ -4166,7 +4166,7 @@ static struct dasd_ccw_req *dasd_eckd_build_cp_cmd_track( idaw_dst = NULL; idaw_len = 0; rq_for_each_segment(bv, req, iter) { - dst = page_address(bv.bv_page) + bv.bv_offset; + dst = bvec_virt(&bv); seg_len = bv.bv_len; while (seg_len) { if (new_track) { @@ -4509,7 +4509,7 @@ static struct dasd_ccw_req *dasd_eckd_build_cp_tpm_track( new_track = 1; recid = first_rec; rq_for_each_segment(bv, req, iter) { - dst = page_address(bv.bv_page) + bv.bv_offset; + dst = bvec_virt(&bv); seg_len = bv.bv_len; while (seg_len) { if (new_track) { @@ -4542,7 +4542,7 @@ static struct dasd_ccw_req *dasd_eckd_build_cp_tpm_track( } } else { rq_for_each_segment(bv, req, iter) { - dst = page_address(bv.bv_page) + bv.bv_offset; + dst = bvec_virt(&bv); last_tidaw = itcw_add_tidaw(itcw, 0x00, dst, bv.bv_len); if (IS_ERR(last_tidaw)) { @@ -4778,7 +4778,7 @@ static struct dasd_ccw_req *dasd_eckd_build_cp_raw(struct dasd_device *startdev, idaws = idal_create_words(idaws, rawpadpage, PAGE_SIZE); } rq_for_each_segment(bv, req, iter) { - dst = page_address(bv.bv_page) + bv.bv_offset; + dst = bvec_virt(&bv); seg_len = bv.bv_len; if (cmd == DASD_ECKD_CCW_READ_TRACK) memset(dst, 0, seg_len); @@ -4839,7 +4839,7 @@ dasd_eckd_free_cp(struct dasd_ccw_req *cqr, struct request *req) if (private->uses_cdl == 0 || recid > 2*blk_per_trk) ccw++; rq_for_each_segment(bv, req, iter) { - dst = page_address(bv.bv_page) + bv.bv_offset; + dst = bvec_virt(&bv); for (off = 0; off < bv.bv_len; off += blksize) { /* Skip locate record. */ if (private->uses_cdl && recid <= 2*blk_per_trk) diff --git a/drivers/s390/block/dasd_fba.c b/drivers/s390/block/dasd_fba.c index 3ad319aee51e..e084f4dedddd 100644 --- a/drivers/s390/block/dasd_fba.c +++ b/drivers/s390/block/dasd_fba.c @@ -501,7 +501,7 @@ static struct dasd_ccw_req *dasd_fba_build_cp_regular( } recid = first_rec; rq_for_each_segment(bv, req, iter) { - dst = page_address(bv.bv_page) + bv.bv_offset; + dst = bvec_virt(&bv); if (dasd_page_cache) { char *copy = kmem_cache_alloc(dasd_page_cache, GFP_DMA | __GFP_NOWARN); @@ -583,7 +583,7 @@ dasd_fba_free_cp(struct dasd_ccw_req *cqr, struct request *req) if (private->rdc_data.mode.bits.data_chain != 0) ccw++; rq_for_each_segment(bv, req, iter) { - dst = page_address(bv.bv_page) + bv.bv_offset; + dst = bvec_virt(&bv); for (off = 0; off < bv.bv_len; off += blksize) { /* Skip locate record. */ if (private->rdc_data.mode.bits.data_chain == 0) From 2b7a8112212afa90f36391e3ab7df531614bfb6a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:56:33 +0200 Subject: [PATCH 084/129] dcssblk: use bvec_virt Use bvec_virt instead of open coding it. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210804095634.460779-15-hch@lst.de Signed-off-by: Jens Axboe --- drivers/s390/block/dcssblk.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 29180bdf0977..5be3d1c39a78 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -892,8 +892,7 @@ dcssblk_submit_bio(struct bio *bio) index = (bio->bi_iter.bi_sector >> 3); bio_for_each_segment(bvec, bio, iter) { - page_addr = (unsigned long) - page_address(bvec.bv_page) + bvec.bv_offset; + page_addr = (unsigned long)bvec_virt(&bvec); source_addr = dev_info->start + (index<<12) + bytes_done; if (unlikely((page_addr & 4095) != 0) || (bvec.bv_len & 4095) != 0) // More paranoia. From 3973e15fa5342783ce0009ab3a423ae9b811fc63 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:56:34 +0200 Subject: [PATCH 085/129] nvme: use bvec_virt Use bvec_virt instead of open coding it. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20210804095634.460779-16-hch@lst.de Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1478d825011d..9e51266c9e2c 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -968,12 +968,11 @@ void nvme_cleanup_cmd(struct request *req) { if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { struct nvme_ctrl *ctrl = nvme_req(req)->ctrl; - struct page *page = req->special_vec.bv_page; - if (page == ctrl->discard_page) + if (req->special_vec.bv_page == ctrl->discard_page) clear_bit_unlock(0, &ctrl->discard_page_busy); else - kfree(page_address(page) + req->special_vec.bv_offset); + kfree(bvec_virt(&req->special_vec)); } } EXPORT_SYMBOL_GPL(nvme_cleanup_cmd); From 49cb5168a7c6abf9835f9acdce6263bc2deefeb6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021 17:26:22 +0200 Subject: [PATCH 086/129] blk-cgroup: refactor blkcg_print_stat Factor out a helper to deal with a single blkcg_gq to make the code a little bit easier to follow. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20210810152623.1796144-1-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 152 ++++++++++++++++++++++----------------------- 1 file changed, 76 insertions(+), 76 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index db034e35ae20..52aa0540ccaf 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -870,6 +870,81 @@ static void blkcg_fill_root_iostats(void) } } +static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s) +{ + struct blkg_iostat_set *bis = &blkg->iostat; + u64 rbytes, wbytes, rios, wios, dbytes, dios; + bool has_stats = false; + const char *dname; + unsigned seq; + char *buf; + size_t size = seq_get_buf(s, &buf), off = 0; + int i; + + if (!blkg->online) + return; + + dname = blkg_dev_name(blkg); + if (!dname) + return; + + /* + * Hooray string manipulation, count is the size written NOT + * INCLUDING THE \0, so size is now count+1 less than what we + * had before, but we want to start writing the next bit from + * the \0 so we only add count to buf. + */ + off += scnprintf(buf+off, size-off, "%s ", dname); + + do { + seq = u64_stats_fetch_begin(&bis->sync); + + rbytes = bis->cur.bytes[BLKG_IOSTAT_READ]; + wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE]; + dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD]; + rios = bis->cur.ios[BLKG_IOSTAT_READ]; + wios = bis->cur.ios[BLKG_IOSTAT_WRITE]; + dios = bis->cur.ios[BLKG_IOSTAT_DISCARD]; + } while (u64_stats_fetch_retry(&bis->sync, seq)); + + if (rbytes || wbytes || rios || wios) { + has_stats = true; + off += scnprintf(buf+off, size-off, + "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu", + rbytes, wbytes, rios, wios, + dbytes, dios); + } + + if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) { + has_stats = true; + off += scnprintf(buf+off, size-off, " use_delay=%d delay_nsec=%llu", + atomic_read(&blkg->use_delay), + atomic64_read(&blkg->delay_nsec)); + } + + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + size_t written; + + if (!blkg->pd[i] || !pol->pd_stat_fn) + continue; + + written = pol->pd_stat_fn(blkg->pd[i], buf+off, size-off); + if (written) + has_stats = true; + off += written; + } + + if (has_stats) { + if (off < size - 1) { + off += scnprintf(buf+off, size-off, "\n"); + seq_commit(s, off); + } else { + seq_commit(s, -1); + } + } +} + static int blkcg_print_stat(struct seq_file *sf, void *v) { struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); @@ -881,86 +956,11 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) cgroup_rstat_flush(blkcg->css.cgroup); rcu_read_lock(); - hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { - struct blkg_iostat_set *bis = &blkg->iostat; - const char *dname; - char *buf; - u64 rbytes, wbytes, rios, wios, dbytes, dios; - size_t size = seq_get_buf(sf, &buf), off = 0; - int i; - bool has_stats = false; - unsigned seq; - spin_lock_irq(&blkg->q->queue_lock); - - if (!blkg->online) - goto skip; - - dname = blkg_dev_name(blkg); - if (!dname) - goto skip; - - /* - * Hooray string manipulation, count is the size written NOT - * INCLUDING THE \0, so size is now count+1 less than what we - * had before, but we want to start writing the next bit from - * the \0 so we only add count to buf. - */ - off += scnprintf(buf+off, size-off, "%s ", dname); - - do { - seq = u64_stats_fetch_begin(&bis->sync); - - rbytes = bis->cur.bytes[BLKG_IOSTAT_READ]; - wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE]; - dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD]; - rios = bis->cur.ios[BLKG_IOSTAT_READ]; - wios = bis->cur.ios[BLKG_IOSTAT_WRITE]; - dios = bis->cur.ios[BLKG_IOSTAT_DISCARD]; - } while (u64_stats_fetch_retry(&bis->sync, seq)); - - if (rbytes || wbytes || rios || wios) { - has_stats = true; - off += scnprintf(buf+off, size-off, - "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu", - rbytes, wbytes, rios, wios, - dbytes, dios); - } - - if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) { - has_stats = true; - off += scnprintf(buf+off, size-off, - " use_delay=%d delay_nsec=%llu", - atomic_read(&blkg->use_delay), - (unsigned long long)atomic64_read(&blkg->delay_nsec)); - } - - for (i = 0; i < BLKCG_MAX_POLS; i++) { - struct blkcg_policy *pol = blkcg_policy[i]; - size_t written; - - if (!blkg->pd[i] || !pol->pd_stat_fn) - continue; - - written = pol->pd_stat_fn(blkg->pd[i], buf+off, size-off); - if (written) - has_stats = true; - off += written; - } - - if (has_stats) { - if (off < size - 1) { - off += scnprintf(buf+off, size-off, "\n"); - seq_commit(sf, off); - } else { - seq_commit(sf, -1); - } - } - skip: + blkcg_print_one_stat(blkg, sf); spin_unlock_irq(&blkg->q->queue_lock); } - rcu_read_unlock(); return 0; } From 252c651a4c854b328445a536bd1892e999103fca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021 17:26:23 +0200 Subject: [PATCH 087/129] blk-cgroup: stop using seq_get_buf seq_get_buf is a crutch that undoes all the memory safety of the seq_file interface. Use the normal seq_printf interfaces instead. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20210810152623.1796144-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 30 ++++++------------------------ block/blk-iocost.c | 23 +++++++++-------------- block/blk-iolatency.c | 38 +++++++++++++++++++------------------- block/mq-deadline-cgroup.c | 8 +++----- include/linux/blk-cgroup.h | 4 ++-- 5 files changed, 39 insertions(+), 64 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 52aa0540ccaf..b8ec47dcce42 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -877,8 +877,6 @@ static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s) bool has_stats = false; const char *dname; unsigned seq; - char *buf; - size_t size = seq_get_buf(s, &buf), off = 0; int i; if (!blkg->online) @@ -888,13 +886,7 @@ static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s) if (!dname) return; - /* - * Hooray string manipulation, count is the size written NOT - * INCLUDING THE \0, so size is now count+1 less than what we - * had before, but we want to start writing the next bit from - * the \0 so we only add count to buf. - */ - off += scnprintf(buf+off, size-off, "%s ", dname); + seq_printf(s, "%s ", dname); do { seq = u64_stats_fetch_begin(&bis->sync); @@ -909,40 +901,30 @@ static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s) if (rbytes || wbytes || rios || wios) { has_stats = true; - off += scnprintf(buf+off, size-off, - "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu", + seq_printf(s, "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu", rbytes, wbytes, rios, wios, dbytes, dios); } if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) { has_stats = true; - off += scnprintf(buf+off, size-off, " use_delay=%d delay_nsec=%llu", + seq_printf(s, " use_delay=%d delay_nsec=%llu", atomic_read(&blkg->use_delay), atomic64_read(&blkg->delay_nsec)); } for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; - size_t written; if (!blkg->pd[i] || !pol->pd_stat_fn) continue; - written = pol->pd_stat_fn(blkg->pd[i], buf+off, size-off); - if (written) + if (pol->pd_stat_fn(blkg->pd[i], s)) has_stats = true; - off += written; } - if (has_stats) { - if (off < size - 1) { - off += scnprintf(buf+off, size-off, "\n"); - seq_commit(s, off); - } else { - seq_commit(s, -1); - } - } + if (has_stats) + seq_printf(s, "\n"); } static int blkcg_print_stat(struct seq_file *sf, void *v) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 5fac3757e6e0..89b21a360b2c 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -2988,34 +2988,29 @@ static void ioc_pd_free(struct blkg_policy_data *pd) kfree(iocg); } -static size_t ioc_pd_stat(struct blkg_policy_data *pd, char *buf, size_t size) +static bool ioc_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) { struct ioc_gq *iocg = pd_to_iocg(pd); struct ioc *ioc = iocg->ioc; - size_t pos = 0; if (!ioc->enabled) - return 0; + return false; if (iocg->level == 0) { unsigned vp10k = DIV64_U64_ROUND_CLOSEST( ioc->vtime_base_rate * 10000, VTIME_PER_USEC); - pos += scnprintf(buf + pos, size - pos, " cost.vrate=%u.%02u", - vp10k / 100, vp10k % 100); + seq_printf(s, " cost.vrate=%u.%02u", vp10k / 100, vp10k % 100); } - pos += scnprintf(buf + pos, size - pos, " cost.usage=%llu", - iocg->last_stat.usage_us); + seq_printf(s, " cost.usage=%llu", iocg->last_stat.usage_us); if (blkcg_debug_stats) - pos += scnprintf(buf + pos, size - pos, - " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu", - iocg->last_stat.wait_us, - iocg->last_stat.indebt_us, - iocg->last_stat.indelay_us); - - return pos; + seq_printf(s, " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu", + iocg->last_stat.wait_us, + iocg->last_stat.indebt_us, + iocg->last_stat.indelay_us); + return true; } static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd, diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 81be0096411d..4c06fafb7411 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -886,8 +886,7 @@ static int iolatency_print_limit(struct seq_file *sf, void *v) return 0; } -static size_t iolatency_ssd_stat(struct iolatency_grp *iolat, char *buf, - size_t size) +static bool iolatency_ssd_stat(struct iolatency_grp *iolat, struct seq_file *s) { struct latency_stat stat; int cpu; @@ -902,39 +901,40 @@ static size_t iolatency_ssd_stat(struct iolatency_grp *iolat, char *buf, preempt_enable(); if (iolat->rq_depth.max_depth == UINT_MAX) - return scnprintf(buf, size, " missed=%llu total=%llu depth=max", - (unsigned long long)stat.ps.missed, - (unsigned long long)stat.ps.total); - return scnprintf(buf, size, " missed=%llu total=%llu depth=%u", - (unsigned long long)stat.ps.missed, - (unsigned long long)stat.ps.total, - iolat->rq_depth.max_depth); + seq_printf(s, " missed=%llu total=%llu depth=max", + (unsigned long long)stat.ps.missed, + (unsigned long long)stat.ps.total); + else + seq_printf(s, " missed=%llu total=%llu depth=%u", + (unsigned long long)stat.ps.missed, + (unsigned long long)stat.ps.total, + iolat->rq_depth.max_depth); + return true; } -static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf, - size_t size) +static bool iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) { struct iolatency_grp *iolat = pd_to_lat(pd); unsigned long long avg_lat; unsigned long long cur_win; if (!blkcg_debug_stats) - return 0; + return false; if (iolat->ssd) - return iolatency_ssd_stat(iolat, buf, size); + return iolatency_ssd_stat(iolat, s); avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC); cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC); if (iolat->rq_depth.max_depth == UINT_MAX) - return scnprintf(buf, size, " depth=max avg_lat=%llu win=%llu", - avg_lat, cur_win); - - return scnprintf(buf, size, " depth=%u avg_lat=%llu win=%llu", - iolat->rq_depth.max_depth, avg_lat, cur_win); + seq_printf(s, " depth=max avg_lat=%llu win=%llu", + avg_lat, cur_win); + else + seq_printf(s, " depth=%u avg_lat=%llu win=%llu", + iolat->rq_depth.max_depth, avg_lat, cur_win); + return true; } - static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, struct request_queue *q, struct blkcg *blkcg) diff --git a/block/mq-deadline-cgroup.c b/block/mq-deadline-cgroup.c index 3b4bfddec39f..b48a4b962f90 100644 --- a/block/mq-deadline-cgroup.c +++ b/block/mq-deadline-cgroup.c @@ -52,7 +52,7 @@ struct dd_blkcg *dd_blkcg_from_bio(struct bio *bio) return dd_blkcg_from_pd(pd); } -static size_t dd_pd_stat(struct blkg_policy_data *pd, char *buf, size_t size) +static bool dd_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) { static const char *const prio_class_name[] = { [IOPRIO_CLASS_NONE] = "NONE", @@ -61,12 +61,10 @@ static size_t dd_pd_stat(struct blkg_policy_data *pd, char *buf, size_t size) [IOPRIO_CLASS_IDLE] = "IDLE", }; struct dd_blkcg *blkcg = dd_blkcg_from_pd(pd); - int res = 0; u8 prio; for (prio = 0; prio < ARRAY_SIZE(blkcg->stats->stats); prio++) - res += scnprintf(buf + res, size - res, - " [%s] dispatched=%u inserted=%u merged=%u", + seq_printf(s, " [%s] dispatched=%u inserted=%u merged=%u", prio_class_name[prio], ddcg_sum(blkcg, dispatched, prio) + ddcg_sum(blkcg, merged, prio) - @@ -75,7 +73,7 @@ static size_t dd_pd_stat(struct blkg_policy_data *pd, char *buf, size_t size) ddcg_sum(blkcg, completed, prio), ddcg_sum(blkcg, merged, prio)); - return res; + return true; } static struct blkg_policy_data *dd_pd_alloc(gfp_t gfp, struct request_queue *q, diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 37048438872c..b4de2010fba5 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -152,8 +152,8 @@ typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); -typedef size_t (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd, char *buf, - size_t size); +typedef bool (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd, + struct seq_file *s); struct blkcg_policy { int plid; From 69f87cc7086558ad84f20001256474aa611fc0eb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Aug 2021 14:36:49 +0200 Subject: [PATCH 088/129] block: unexport blk_register_queue Not actually used in any modular code. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210816123649.601591-1-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 1 - 1 file changed, 1 deletion(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 1832587dce3a..586507a5b8c2 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -939,7 +939,6 @@ unlock: return ret; } -EXPORT_SYMBOL_GPL(blk_register_queue); /** * blk_unregister_queue - counterpart of blk_register_queue() From a680dd72ec336b81511e3bff48efac6dbfa563e7 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 11 Aug 2021 12:36:57 +0900 Subject: [PATCH 089/129] block: bfq: fix bfq_set_next_ioprio_data() For a request that has a priority level equal to or larger than IOPRIO_BE_NR, bfq_set_next_ioprio_data() prints a critical warning but defaults to setting the request new_ioprio field to IOPRIO_BE_NR. This is not consistent with the warning and the allowed values for priority levels. Fix this by setting the request new_ioprio field to IOPRIO_BE_NR - 1, the lowest priority level allowed. Cc: Fixes: aee69d78dec0 ("block, bfq: introduce the BFQ-v0 I/O scheduler as an extra scheduler") Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210811033702.368488-2-damien.lemoal@wdc.com Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index e4a61eda2d0f..e546a5f4bff9 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5296,7 +5296,7 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) if (bfqq->new_ioprio >= IOPRIO_BE_NR) { pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n", bfqq->new_ioprio); - bfqq->new_ioprio = IOPRIO_BE_NR; + bfqq->new_ioprio = IOPRIO_BE_NR - 1; } bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); From 25bca50e523cbe96c0207fbb92f22ff2bc28e9aa Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 11 Aug 2021 12:36:58 +0900 Subject: [PATCH 090/129] block: improve ioprio class description comment In include/usapi/linux/ioprio.h, change the ioprio class enum comment to remove the outdated reference to CFQ and mention BFQ and mq-deadline instead. Also document the high priority NCQ command use for RT class IOs directed at ATA drives that support NCQ priority. Signed-off-by: Damien Le Moal Link: https://lore.kernel.org/r/20210811033702.368488-3-damien.lemoal@wdc.com Signed-off-by: Jens Axboe --- include/uapi/linux/ioprio.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/ioprio.h b/include/uapi/linux/ioprio.h index 77b17e08b0da..6b735854aebd 100644 --- a/include/uapi/linux/ioprio.h +++ b/include/uapi/linux/ioprio.h @@ -13,10 +13,12 @@ #define IOPRIO_PRIO_VALUE(class, data) (((class) << IOPRIO_CLASS_SHIFT) | data) /* - * These are the io priority groups as implemented by CFQ. RT is the realtime - * class, it always gets premium service. BE is the best-effort scheduling - * class, the default for any process. IDLE is the idle scheduling class, it - * is only served when no one else is using the disk. + * These are the io priority groups as implemented by the BFQ and mq-deadline + * schedulers. RT is the realtime class, it always gets premium service. For + * ATA disks supporting NCQ IO priority, RT class IOs will be processed using + * high priority NCQ commands. BE is the best-effort scheduling class, the + * default for any process. IDLE is the idle scheduling class, it is only + * served when no one else is using the disk. */ enum { IOPRIO_CLASS_NONE, From a553a835ca57668b0d9907d8ec2507ec51292d9a Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 11 Aug 2021 12:36:59 +0900 Subject: [PATCH 091/129] block: change ioprio_valid() to an inline function Change the ioprio_valid() macro in include/usapi/linux/ioprio.h to an inline function declared on the kernel side in include/linux/ioprio.h. Also improve checks on the class value by checking the upper bound value. Signed-off-by: Damien Le Moal Link: https://lore.kernel.org/r/20210811033702.368488-4-damien.lemoal@wdc.com Signed-off-by: Jens Axboe --- include/linux/ioprio.h | 10 ++++++++++ include/uapi/linux/ioprio.h | 2 -- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h index ef9ad4fb245f..2ee3373684b1 100644 --- a/include/linux/ioprio.h +++ b/include/linux/ioprio.h @@ -8,6 +8,16 @@ #include +/* + * Check that a priority value has a valid class. + */ +static inline bool ioprio_valid(unsigned short ioprio) +{ + unsigned short class = IOPRIO_PRIO_CLASS(ioprio); + + return class > IOPRIO_CLASS_NONE && class <= IOPRIO_CLASS_IDLE; +} + /* * if process has set io priority explicitly, use that. if not, convert * the cpu scheduler nice value to an io priority diff --git a/include/uapi/linux/ioprio.h b/include/uapi/linux/ioprio.h index 6b735854aebd..5064e230374c 100644 --- a/include/uapi/linux/ioprio.h +++ b/include/uapi/linux/ioprio.h @@ -27,8 +27,6 @@ enum { IOPRIO_CLASS_IDLE, }; -#define ioprio_valid(mask) (IOPRIO_PRIO_CLASS((mask)) != IOPRIO_CLASS_NONE) - /* * 8 best effort priority levels are supported */ From ba05200fcce0a73fa8db16c514fbaa476d1d9399 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 11 Aug 2021 12:37:00 +0900 Subject: [PATCH 092/129] block: fix IOPRIO_PRIO_CLASS() and IOPRIO_PRIO_VALUE() macros The ki_ioprio field of struct kiocb is 16-bits (u16) but often handled as an int in the block layer. E.g. ioprio_check_cap() takes an int as argument. With such implicit int casting function calls, the upper 16-bits of the int argument may be left uninitialized by the compiler, resulting in invalid values for the IOPRIO_PRIO_CLASS() macro (garbage upper bits) and in an error return for functions such as ioprio_check_cap(). Fix this by masking the result of the shift by IOPRIO_CLASS_SHIFT bits in the IOPRIO_PRIO_CLASS() macro. The new macro IOPRIO_CLASS_MASK defines the 3-bits mask for the priority class. Similarly, apply the IOPRIO_PRIO_MASK mask to the data argument of the IOPRIO_PRIO_VALUE() macro to ignore the upper bits of the data value. The IOPRIO_CLASS_MASK mask is also applied to the class argument of this macro before shifting the result by IOPRIO_CLASS_SHIFT bits. While at it, also change the argument name of the IOPRIO_PRIO_CLASS() and IOPRIO_PRIO_DATA() macros from "mask" to "ioprio" to reflect the fact that a priority value should be passed rather than a mask. Signed-off-by: Damien Le Moal Link: https://lore.kernel.org/r/20210811033702.368488-5-damien.lemoal@wdc.com Signed-off-by: Jens Axboe --- include/uapi/linux/ioprio.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/ioprio.h b/include/uapi/linux/ioprio.h index 5064e230374c..936f0d8f30e1 100644 --- a/include/uapi/linux/ioprio.h +++ b/include/uapi/linux/ioprio.h @@ -5,12 +5,16 @@ /* * Gives us 8 prio classes with 13-bits of data for each class */ -#define IOPRIO_CLASS_SHIFT (13) +#define IOPRIO_CLASS_SHIFT 13 +#define IOPRIO_CLASS_MASK 0x07 #define IOPRIO_PRIO_MASK ((1UL << IOPRIO_CLASS_SHIFT) - 1) -#define IOPRIO_PRIO_CLASS(mask) ((mask) >> IOPRIO_CLASS_SHIFT) -#define IOPRIO_PRIO_DATA(mask) ((mask) & IOPRIO_PRIO_MASK) -#define IOPRIO_PRIO_VALUE(class, data) (((class) << IOPRIO_CLASS_SHIFT) | data) +#define IOPRIO_PRIO_CLASS(ioprio) \ + (((ioprio) >> IOPRIO_CLASS_SHIFT) & IOPRIO_CLASS_MASK) +#define IOPRIO_PRIO_DATA(ioprio) ((ioprio) & IOPRIO_PRIO_MASK) +#define IOPRIO_PRIO_VALUE(class, data) \ + ((((class) & IOPRIO_CLASS_MASK) << IOPRIO_CLASS_SHIFT) | \ + ((data) & IOPRIO_PRIO_MASK)) /* * These are the io priority groups as implemented by the BFQ and mq-deadline From 202bc942c5cd4340d37b06c4e0b8b03f9925d818 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 11 Aug 2021 12:37:01 +0900 Subject: [PATCH 093/129] block: Introduce IOPRIO_NR_LEVELS The BFQ scheduler and ioprio_check_cap() both assume that the RT priority class (IOPRIO_CLASS_RT) can have up to 8 different priority levels, similarly to the BE class (IOPRIO_CLASS_iBE). This is controlled using the IOPRIO_BE_NR macro , which is badly named as the number of levels also applies to the RT class. Introduce the class independent IOPRIO_NR_LEVELS macro, defined to 8, to make things clear. Keep the old IOPRIO_BE_NR macro definition as an alias for IOPRIO_NR_LEVELS. Signed-off-by: Damien Le Moal Link: https://lore.kernel.org/r/20210811033702.368488-6-damien.lemoal@wdc.com Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 8 ++++---- block/bfq-iosched.h | 4 ++-- block/bfq-wf2q.c | 6 +++--- block/ioprio.c | 3 +-- fs/f2fs/sysfs.c | 2 +- include/uapi/linux/ioprio.h | 5 +++-- 6 files changed, 14 insertions(+), 14 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index e546a5f4bff9..4b434369e411 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2508,7 +2508,7 @@ void bfq_end_wr_async_queues(struct bfq_data *bfqd, int i, j; for (i = 0; i < 2; i++) - for (j = 0; j < IOPRIO_BE_NR; j++) + for (j = 0; j < IOPRIO_NR_LEVELS; j++) if (bfqg->async_bfqq[i][j]) bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); if (bfqg->async_idle_bfqq) @@ -5293,10 +5293,10 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) break; } - if (bfqq->new_ioprio >= IOPRIO_BE_NR) { + if (bfqq->new_ioprio >= IOPRIO_NR_LEVELS) { pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n", bfqq->new_ioprio); - bfqq->new_ioprio = IOPRIO_BE_NR - 1; + bfqq->new_ioprio = IOPRIO_NR_LEVELS - 1; } bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); @@ -6825,7 +6825,7 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) int i, j; for (i = 0; i < 2; i++) - for (j = 0; j < IOPRIO_BE_NR; j++) + for (j = 0; j < IOPRIO_NR_LEVELS; j++) __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 99c2a3cb081e..385e28a843d1 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -931,7 +931,7 @@ struct bfq_group { void *bfqd; - struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; + struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS]; struct bfq_queue *async_idle_bfqq; struct bfq_entity *my_entity; @@ -948,7 +948,7 @@ struct bfq_group { struct bfq_entity entity; struct bfq_sched_data sched_data; - struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; + struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS]; struct bfq_queue *async_idle_bfqq; struct rb_root rq_pos_tree; diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index 7a462df71f68..b74cc0da118e 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -505,7 +505,7 @@ static void bfq_active_insert(struct bfq_service_tree *st, */ unsigned short bfq_ioprio_to_weight(int ioprio) { - return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF; + return (IOPRIO_NR_LEVELS - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF; } /** @@ -514,12 +514,12 @@ unsigned short bfq_ioprio_to_weight(int ioprio) * * To preserve as much as possible the old only-ioprio user interface, * 0 is used as an escape ioprio value for weights (numerically) equal or - * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF. + * larger than IOPRIO_NR_LEVELS * BFQ_WEIGHT_CONVERSION_COEFF. */ static unsigned short bfq_weight_to_ioprio(int weight) { return max_t(int, 0, - IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight); + IOPRIO_NR_LEVELS * BFQ_WEIGHT_CONVERSION_COEFF - weight); } static void bfq_get_entity(struct bfq_entity *entity) diff --git a/block/ioprio.c b/block/ioprio.c index bee628f9f1b2..ca6b136c5586 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -74,9 +74,8 @@ int ioprio_check_cap(int ioprio) fallthrough; /* rt has prio field too */ case IOPRIO_CLASS_BE: - if (data >= IOPRIO_BE_NR || data < 0) + if (data >= IOPRIO_NR_LEVELS || data < 0) return -EINVAL; - break; case IOPRIO_CLASS_IDLE: break; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 6642246206bd..daad532a4e2b 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -378,7 +378,7 @@ out: ret = kstrtol(name, 10, &data); if (ret) return ret; - if (data >= IOPRIO_BE_NR || data < 0) + if (data >= IOPRIO_NR_LEVELS || data < 0) return -EINVAL; cprc->ckpt_thread_ioprio = IOPRIO_PRIO_VALUE(class, data); diff --git a/include/uapi/linux/ioprio.h b/include/uapi/linux/ioprio.h index 936f0d8f30e1..aac39338d02c 100644 --- a/include/uapi/linux/ioprio.h +++ b/include/uapi/linux/ioprio.h @@ -32,9 +32,10 @@ enum { }; /* - * 8 best effort priority levels are supported + * The RT and BE priority classes both support up to 8 priority levels. */ -#define IOPRIO_BE_NR (8) +#define IOPRIO_NR_LEVELS 8 +#define IOPRIO_BE_NR IOPRIO_NR_LEVELS enum { IOPRIO_WHO_PROCESS = 1, From e70344c05995a190a56bbd1a23dc2218bcc8c924 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 11 Aug 2021 12:37:02 +0900 Subject: [PATCH 094/129] block: fix default IO priority handling The default IO priority is the best effort (BE) class with the normal priority level IOPRIO_NORM (4). However, get_task_ioprio() returns IOPRIO_CLASS_NONE/IOPRIO_NORM as the default priority and get_current_ioprio() returns IOPRIO_CLASS_NONE/0. Let's be consistent with the defined default and have both of these functions return the default priority IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM) when the user did not define another default IO priority for the task. In include/uapi/linux/ioprio.h, introduce the IOPRIO_BE_NORM macro as an alias to IOPRIO_NORM to clarify that this default level applies to the BE priotity class. In include/linux/ioprio.h, define the macro IOPRIO_DEFAULT as IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_BE_NORM) and use this new macro when setting a priority to the default. Signed-off-by: Damien Le Moal Link: https://lore.kernel.org/r/20210811033702.368488-7-damien.lemoal@wdc.com [axboe: drop unnecessary lightnvm change] Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 2 +- block/ioprio.c | 6 +++--- include/linux/ioprio.h | 7 ++++++- include/uapi/linux/ioprio.h | 5 +++-- 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 4b434369e411..e92bc0348433 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5411,7 +5411,7 @@ static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, case IOPRIO_CLASS_RT: return &bfqg->async_bfqq[0][ioprio]; case IOPRIO_CLASS_NONE: - ioprio = IOPRIO_NORM; + ioprio = IOPRIO_BE_NORM; fallthrough; case IOPRIO_CLASS_BE: return &bfqg->async_bfqq[1][ioprio]; diff --git a/block/ioprio.c b/block/ioprio.c index ca6b136c5586..0e4ff245f2bf 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -170,7 +170,7 @@ static int get_task_ioprio(struct task_struct *p) ret = security_task_getioprio(p); if (ret) goto out; - ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM); + ret = IOPRIO_DEFAULT; task_lock(p); if (p->io_context) ret = p->io_context->ioprio; @@ -182,9 +182,9 @@ out: int ioprio_best(unsigned short aprio, unsigned short bprio) { if (!ioprio_valid(aprio)) - aprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); + aprio = IOPRIO_DEFAULT; if (!ioprio_valid(bprio)) - bprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); + bprio = IOPRIO_DEFAULT; return min(aprio, bprio); } diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h index 2ee3373684b1..3f53bc27a19b 100644 --- a/include/linux/ioprio.h +++ b/include/linux/ioprio.h @@ -8,6 +8,11 @@ #include +/* + * Default IO priority. + */ +#define IOPRIO_DEFAULT IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_BE_NORM) + /* * Check that a priority value has a valid class. */ @@ -51,7 +56,7 @@ static inline int get_current_ioprio(void) if (ioc) return ioc->ioprio; - return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0); + return IOPRIO_DEFAULT; } /* diff --git a/include/uapi/linux/ioprio.h b/include/uapi/linux/ioprio.h index aac39338d02c..f70f2596a6bf 100644 --- a/include/uapi/linux/ioprio.h +++ b/include/uapi/linux/ioprio.h @@ -44,8 +44,9 @@ enum { }; /* - * Fallback BE priority + * Fallback BE priority level. */ -#define IOPRIO_NORM (4) +#define IOPRIO_NORM 4 +#define IOPRIO_BE_NORM IOPRIO_NORM #endif /* _UAPI_LINUX_IOPRIO_H */ From 759e0fd4b67766c96b33a114bba0c7d7521fecd0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 20 Aug 2021 11:49:29 +0200 Subject: [PATCH 095/129] block: add back the bd_holder_dir reference in bd_link_disk_holder This essentially reverts "block: remove the extra kobject reference in bd_link_disk_holder". That commit dropped the extra reference because the condition in the comment can't be true. But it turns out that comment did not actually describe the problematic situation, so add back the extra reference and document it properly. Fixes: fbd9a39542ec ("block: remove the extra kobject reference in bd_link_disk_holder") Reported-by: Tushar Sugandhi Reviewed-by: Mike Snitzer Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/holder.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/block/holder.c b/block/holder.c index 4568cc4f6827..9dc084182337 100644 --- a/block/holder.c +++ b/block/holder.c @@ -106,6 +106,12 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) } list_add(&holder->list, &disk->slave_bdevs); + /* + * del_gendisk drops the initial reference to bd_holder_dir, so we need + * to keep our own here to allow for cleanup past that point. + */ + kobject_get(bdev->bd_holder_dir); + out_unlock: mutex_unlock(&disk->open_mutex); return ret; @@ -138,6 +144,7 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { if (disk->slave_dir) __unlink_disk_holder(bdev, disk); + kobject_put(bdev->bd_holder_dir); list_del_init(&holder->list); kfree(holder); } From 5f432cceb3e9de5223fa50d882c4a43cab39a3ee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Aug 2021 15:19:02 +0200 Subject: [PATCH 096/129] nvme: use blk_mq_alloc_disk Switch to use the blk_mq_alloc_disk helper for allocating the request_queue and gendisk. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Link: https://lore.kernel.org/r/20210816131910.615153-2-hch@lst.de Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 9e51266c9e2c..68acd33c3856 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3728,9 +3728,14 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, if (!ns) goto out_free_id; - ns->queue = blk_mq_init_queue(ctrl->tagset); - if (IS_ERR(ns->queue)) + disk = blk_mq_alloc_disk(ctrl->tagset, ns); + if (IS_ERR(disk)) goto out_free_ns; + disk->fops = &nvme_bdev_ops; + disk->private_data = ns; + + ns->disk = disk; + ns->queue = disk->queue; if (ctrl->opts && ctrl->opts->data_digest) blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, ns->queue); @@ -3739,20 +3744,12 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, if (ctrl->ops->flags & NVME_F_PCI_P2PDMA) blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue); - ns->queue->queuedata = ns; ns->ctrl = ctrl; kref_init(&ns->kref); if (nvme_init_ns_head(ns, nsid, ids, id->nmic & NVME_NS_NMIC_SHARED)) - goto out_free_queue; + goto out_cleanup_disk; - disk = alloc_disk_node(0, node); - if (!disk) - goto out_unlink_ns; - - disk->fops = &nvme_bdev_ops; - disk->private_data = ns; - disk->queue = ns->queue; /* * Without the multipath code enabled, multiple controller per * subsystems are visible as devices and thus we cannot use the @@ -3761,15 +3758,14 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, if (!nvme_mpath_set_disk_name(ns, disk->disk_name, &disk->flags)) sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance); - ns->disk = disk; if (nvme_update_ns_info(ns, id)) - goto out_put_disk; + goto out_unlink_ns; if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) { if (nvme_nvm_register(ns, disk->disk_name, node)) { dev_warn(ctrl->device, "LightNVM init failure\n"); - goto out_put_disk; + goto out_unlink_ns; } } @@ -3788,10 +3784,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, kfree(id); return; - out_put_disk: - /* prevent double queue cleanup */ - ns->disk->queue = NULL; - put_disk(ns->disk); + out_unlink_ns: mutex_lock(&ctrl->subsys->lock); list_del_rcu(&ns->siblings); @@ -3799,8 +3792,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, list_del_init(&ns->head->entry); mutex_unlock(&ctrl->subsys->lock); nvme_put_ns_head(ns->head); - out_free_queue: - blk_cleanup_queue(ns->queue); + out_cleanup_disk: + blk_cleanup_disk(disk); out_free_ns: kfree(ns); out_free_id: From 45938335d0a9773d65a82a7ca722bb76e4b997a8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Aug 2021 15:19:03 +0200 Subject: [PATCH 097/129] st: do not allocate a gendisk st is a character driver and thus does not need to allocate a gendisk, which is only used for file system-like block layer I/O on block devices. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210816131910.615153-3-hch@lst.de Signed-off-by: Jens Axboe --- drivers/scsi/st.c | 49 ++++++++++++----------------------------------- drivers/scsi/st.h | 2 +- 2 files changed, 13 insertions(+), 38 deletions(-) diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index c6f14540ae03..d1abc020f3c0 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -309,13 +309,8 @@ static char * st_incompatible(struct scsi_device* SDp) } -static inline char *tape_name(struct scsi_tape *tape) -{ - return tape->disk->disk_name; -} - #define st_printk(prefix, t, fmt, a...) \ - sdev_prefix_printk(prefix, (t)->device, tape_name(t), fmt, ##a) + sdev_prefix_printk(prefix, (t)->device, (t)->name, fmt, ##a) #ifdef DEBUG #define DEBC_printk(t, fmt, a...) \ if (debugging) { st_printk(ST_DEB_MSG, t, fmt, ##a ); } @@ -363,7 +358,7 @@ static int st_chk_result(struct scsi_tape *STp, struct st_request * SRpnt) int result = SRpnt->result; u8 scode; DEB(const char *stp;) - char *name = tape_name(STp); + char *name = STp->name; struct st_cmdstatus *cmdstatp; if (!result) @@ -3841,8 +3836,9 @@ static long st_ioctl_common(struct file *file, unsigned int cmd_in, void __user !capable(CAP_SYS_RAWIO)) i = -EPERM; else - i = scsi_cmd_ioctl(STp->disk->queue, STp->disk, - file->f_mode, cmd_in, p); + i = scsi_cmd_ioctl(STp->device->request_queue, + NULL, file->f_mode, cmd_in, + p); if (i != -ENOTTY) return i; break; @@ -4216,7 +4212,7 @@ static int create_one_cdev(struct scsi_tape *tape, int mode, int rew) i = mode << (4 - ST_NBR_MODE_BITS); snprintf(name, 10, "%s%s%s", rew ? "n" : "", - tape->disk->disk_name, st_formats[i]); + tape->name, st_formats[i]); dev = device_create(&st_sysfs_class, &tape->device->sdev_gendev, cdev_devno, &tape->modes[mode], "%s", name); @@ -4271,7 +4267,6 @@ static void remove_cdevs(struct scsi_tape *tape) static int st_probe(struct device *dev) { struct scsi_device *SDp = to_scsi_device(dev); - struct gendisk *disk = NULL; struct scsi_tape *tpnt = NULL; struct st_modedef *STm; struct st_partstat *STps; @@ -4301,27 +4296,13 @@ static int st_probe(struct device *dev) goto out; } - disk = alloc_disk(1); - if (!disk) { - sdev_printk(KERN_ERR, SDp, - "st: out of memory. Device not attached.\n"); - goto out_buffer_free; - } - tpnt = kzalloc(sizeof(struct scsi_tape), GFP_KERNEL); if (tpnt == NULL) { sdev_printk(KERN_ERR, SDp, "st: Can't allocate device descriptor.\n"); - goto out_put_disk; + goto out_buffer_free; } kref_init(&tpnt->kref); - tpnt->disk = disk; - disk->private_data = &tpnt->driver; - /* SCSI tape doesn't register this gendisk via add_disk(). Manually - * take queue reference that release_disk() expects. */ - if (!blk_get_queue(SDp->request_queue)) - goto out_put_disk; - disk->queue = SDp->request_queue; tpnt->driver = &st_template; tpnt->device = SDp; @@ -4394,10 +4375,10 @@ static int st_probe(struct device *dev) idr_preload_end(); if (error < 0) { pr_warn("st: idr allocation failed: %d\n", error); - goto out_put_queue; + goto out_free_tape; } tpnt->index = error; - sprintf(disk->disk_name, "st%d", tpnt->index); + sprintf(tpnt->name, "st%d", tpnt->index); tpnt->stats = kzalloc(sizeof(struct scsi_tape_stats), GFP_KERNEL); if (tpnt->stats == NULL) { sdev_printk(KERN_ERR, SDp, @@ -4414,9 +4395,9 @@ static int st_probe(struct device *dev) scsi_autopm_put_device(SDp); sdev_printk(KERN_NOTICE, SDp, - "Attached scsi tape %s\n", tape_name(tpnt)); + "Attached scsi tape %s\n", tpnt->name); sdev_printk(KERN_INFO, SDp, "%s: try direct i/o: %s (alignment %d B)\n", - tape_name(tpnt), tpnt->try_dio ? "yes" : "no", + tpnt->name, tpnt->try_dio ? "yes" : "no", queue_dma_alignment(SDp->request_queue) + 1); return 0; @@ -4428,10 +4409,7 @@ out_idr_remove: spin_lock(&st_index_lock); idr_remove(&st_index_idr, tpnt->index); spin_unlock(&st_index_lock); -out_put_queue: - blk_put_queue(disk->queue); -out_put_disk: - put_disk(disk); +out_free_tape: kfree(tpnt); out_buffer_free: kfree(buffer); @@ -4470,7 +4448,6 @@ static int st_remove(struct device *dev) static void scsi_tape_release(struct kref *kref) { struct scsi_tape *tpnt = to_scsi_tape(kref); - struct gendisk *disk = tpnt->disk; tpnt->device = NULL; @@ -4480,8 +4457,6 @@ static void scsi_tape_release(struct kref *kref) kfree(tpnt->buffer); } - disk->private_data = NULL; - put_disk(disk); kfree(tpnt->stats); kfree(tpnt); return; diff --git a/drivers/scsi/st.h b/drivers/scsi/st.h index 9d3c38bb0794..c0ef0d9aaf8a 100644 --- a/drivers/scsi/st.h +++ b/drivers/scsi/st.h @@ -187,7 +187,7 @@ struct scsi_tape { unsigned char last_cmnd[6]; unsigned char last_sense[16]; #endif - struct gendisk *disk; + char name[DISK_NAME_LEN]; struct kref kref; struct scsi_tape_stats *stats; }; From aebbb5831fbd5352fd9bd2c858bc249026d3c652 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Aug 2021 15:19:04 +0200 Subject: [PATCH 098/129] sg: do not allocate a gendisk sg is a character driver and thus does not need to allocate a gendisk, which is only used for file system-like block layer I/O on block devices. Signed-off-by: Christoph Hellwig Reviewed-by: Luis Chamberlain Link: https://lore.kernel.org/r/20210816131910.615153-4-hch@lst.de Signed-off-by: Jens Axboe --- drivers/scsi/sg.c | 32 +++++++++----------------------- 1 file changed, 9 insertions(+), 23 deletions(-) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 91e2221bbb0d..477267add49d 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -166,7 +166,7 @@ typedef struct sg_device { /* holds the state of each scsi generic device */ bool exclude; /* 1->open(O_EXCL) succeeded and is active */ int open_cnt; /* count of opens (perhaps < num(sfds) ) */ char sgdebug; /* 0->off, 1->sense, 9->dump dev, 10-> all devs */ - struct gendisk *disk; + char name[DISK_NAME_LEN]; struct cdev * cdev; /* char_dev [sysfs: /sys/cdev/major/sg] */ struct kref d_ref; } Sg_device; @@ -202,8 +202,7 @@ static void sg_device_destroy(struct kref *kref); #define SZ_SG_REQ_INFO sizeof(sg_req_info_t) #define sg_printk(prefix, sdp, fmt, a...) \ - sdev_prefix_printk(prefix, (sdp)->device, \ - (sdp)->disk->disk_name, fmt, ##a) + sdev_prefix_printk(prefix, (sdp)->device, (sdp)->name, fmt, ##a) /* * The SCSI interfaces that use read() and write() as an asynchronous variant of @@ -832,7 +831,7 @@ sg_common_write(Sg_fd * sfp, Sg_request * srp, srp->rq->timeout = timeout; kref_get(&sfp->f_ref); /* sg_rq_end_io() does kref_put(). */ - blk_execute_rq_nowait(sdp->disk, srp->rq, at_head, sg_rq_end_io); + blk_execute_rq_nowait(NULL, srp->rq, at_head, sg_rq_end_io); return 0; } @@ -1119,8 +1118,7 @@ sg_ioctl_common(struct file *filp, Sg_device *sdp, Sg_fd *sfp, return put_user(max_sectors_bytes(sdp->device->request_queue), ip); case BLKTRACESETUP: - return blk_trace_setup(sdp->device->request_queue, - sdp->disk->disk_name, + return blk_trace_setup(sdp->device->request_queue, NULL, MKDEV(SCSI_GENERIC_MAJOR, sdp->index), NULL, p); case BLKTRACESTART: @@ -1456,7 +1454,7 @@ static struct class *sg_sysfs_class; static int sg_sysfs_valid = 0; static Sg_device * -sg_alloc(struct gendisk *disk, struct scsi_device *scsidp) +sg_alloc(struct scsi_device *scsidp) { struct request_queue *q = scsidp->request_queue; Sg_device *sdp; @@ -1492,9 +1490,7 @@ sg_alloc(struct gendisk *disk, struct scsi_device *scsidp) SCSI_LOG_TIMEOUT(3, sdev_printk(KERN_INFO, scsidp, "sg_alloc: dev=%d \n", k)); - sprintf(disk->disk_name, "sg%d", k); - disk->first_minor = k; - sdp->disk = disk; + sprintf(sdp->name, "sg%d", k); sdp->device = scsidp; mutex_init(&sdp->open_rel_lock); INIT_LIST_HEAD(&sdp->sfds); @@ -1521,19 +1517,11 @@ static int sg_add_device(struct device *cl_dev, struct class_interface *cl_intf) { struct scsi_device *scsidp = to_scsi_device(cl_dev->parent); - struct gendisk *disk; Sg_device *sdp = NULL; struct cdev * cdev = NULL; int error; unsigned long iflags; - disk = alloc_disk(1); - if (!disk) { - pr_warn("%s: alloc_disk failed\n", __func__); - return -ENOMEM; - } - disk->major = SCSI_GENERIC_MAJOR; - error = -ENOMEM; cdev = cdev_alloc(); if (!cdev) { @@ -1543,7 +1531,7 @@ sg_add_device(struct device *cl_dev, struct class_interface *cl_intf) cdev->owner = THIS_MODULE; cdev->ops = &sg_fops; - sdp = sg_alloc(disk, scsidp); + sdp = sg_alloc(scsidp); if (IS_ERR(sdp)) { pr_warn("%s: sg_alloc failed\n", __func__); error = PTR_ERR(sdp); @@ -1561,7 +1549,7 @@ sg_add_device(struct device *cl_dev, struct class_interface *cl_intf) sg_class_member = device_create(sg_sysfs_class, cl_dev->parent, MKDEV(SCSI_GENERIC_MAJOR, sdp->index), - sdp, "%s", disk->disk_name); + sdp, "%s", sdp->name); if (IS_ERR(sg_class_member)) { pr_err("%s: device_create failed\n", __func__); error = PTR_ERR(sg_class_member); @@ -1589,7 +1577,6 @@ cdev_add_err: kfree(sdp); out: - put_disk(disk); if (cdev) cdev_del(cdev); return error; @@ -1613,7 +1600,6 @@ sg_device_destroy(struct kref *kref) SCSI_LOG_TIMEOUT(3, sg_printk(KERN_INFO, sdp, "sg_device_destroy\n")); - put_disk(sdp->disk); kfree(sdp); } @@ -2606,7 +2592,7 @@ static int sg_proc_seq_show_debug(struct seq_file *s, void *v) goto skip; read_lock(&sdp->sfd_lock); if (!list_empty(&sdp->sfds)) { - seq_printf(s, " >>> device=%s ", sdp->disk->disk_name); + seq_printf(s, " >>> device=%s ", sdp->name); if (atomic_read(&sdp->detaching)) seq_puts(s, "detaching pending close "); else if (sdp->device) { From 4dcc4874deb41a11ece9c6e8858385235463c1ac Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Aug 2021 15:19:05 +0200 Subject: [PATCH 099/129] block: cleanup the lockdep handling in *alloc_disk Pass the lockdep name to the low-level __blk_alloc_disk helper and hardcode the name for it given that the number of minors or node_id are not very useful information. While this passes a pointless argument for non-lockdep builds that is not really an issue as disk allocation is a probe time only slow path. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210816131910.615153-5-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 5 +++-- block/genhd.c | 8 +++++--- include/linux/blk-mq.h | 10 +++------- include/linux/genhd.h | 23 ++++++----------------- 4 files changed, 17 insertions(+), 29 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index d2725f94491d..4c56e43e6992 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3133,7 +3133,8 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) } EXPORT_SYMBOL(blk_mq_init_queue); -struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata) +struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, + struct lock_class_key *lkclass) { struct request_queue *q; struct gendisk *disk; @@ -3142,7 +3143,7 @@ struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata) if (IS_ERR(q)) return ERR_CAST(q); - disk = __alloc_disk_node(0, set->numa_node); + disk = __alloc_disk_node(0, set->numa_node, lkclass); if (!disk) { blk_cleanup_queue(q); return ERR_PTR(-ENOMEM); diff --git a/block/genhd.c b/block/genhd.c index 731a46063132..2ad2b25dfc87 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1254,7 +1254,8 @@ dev_t blk_lookup_devt(const char *name, int partno) return devt; } -struct gendisk *__alloc_disk_node(int minors, int node_id) +struct gendisk *__alloc_disk_node(int minors, int node_id, + struct lock_class_key *lkclass) { struct gendisk *disk; @@ -1282,6 +1283,7 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) disk_to_dev(disk)->type = &disk_type; device_initialize(disk_to_dev(disk)); inc_diskseq(disk); + lockdep_init_map(&disk->lockdep_map, "(bio completion)", lkclass, 0); #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED INIT_LIST_HEAD(&disk->slave_bdevs); #endif @@ -1298,7 +1300,7 @@ out_free_disk: } EXPORT_SYMBOL(__alloc_disk_node); -struct gendisk *__blk_alloc_disk(int node) +struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass) { struct request_queue *q; struct gendisk *disk; @@ -1307,7 +1309,7 @@ struct gendisk *__blk_alloc_disk(int node) if (!q) return NULL; - disk = __alloc_disk_node(0, node); + disk = __alloc_disk_node(0, node, lkclass); if (!disk) { blk_cleanup_queue(q); return NULL; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 22215db36122..13ba1861e688 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -432,18 +432,14 @@ enum { ((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \ << BLK_MQ_F_ALLOC_POLICY_START_BIT) +struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, + struct lock_class_key *lkclass); #define blk_mq_alloc_disk(set, queuedata) \ ({ \ static struct lock_class_key __key; \ - struct gendisk *__disk = __blk_mq_alloc_disk(set, queuedata); \ \ - if (!IS_ERR(__disk)) \ - lockdep_init_map(&__disk->lockdep_map, \ - "(bio completion)", &__key, 0); \ - __disk; \ + __blk_mq_alloc_disk(set, queuedata, &__key); \ }) -struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, - void *queuedata); struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index b47e297cd551..3d2e5ee30677 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -259,27 +259,21 @@ static inline sector_t get_capacity(struct gendisk *disk) int bdev_disk_changed(struct gendisk *disk, bool invalidate); void blk_drop_partitions(struct gendisk *disk); -extern struct gendisk *__alloc_disk_node(int minors, int node_id); +struct gendisk *__alloc_disk_node(int minors, int node_id, + struct lock_class_key *lkclass); extern void put_disk(struct gendisk *disk); #define alloc_disk_node(minors, node_id) \ ({ \ static struct lock_class_key __key; \ - const char *__name; \ - struct gendisk *__disk; \ \ - __name = "(gendisk_completion)"#minors"("#node_id")"; \ - \ - __disk = __alloc_disk_node(minors, node_id); \ - \ - if (__disk) \ - lockdep_init_map(&__disk->lockdep_map, __name, &__key, 0); \ - \ - __disk; \ + __alloc_disk_node(minors, node_id, &__key); \ }) #define alloc_disk(minors) alloc_disk_node(minors, NUMA_NO_NODE) +struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass); + /** * blk_alloc_disk - allocate a gendisk structure * @node_id: numa node to allocate on @@ -291,15 +285,10 @@ extern void put_disk(struct gendisk *disk); */ #define blk_alloc_disk(node_id) \ ({ \ - struct gendisk *__disk = __blk_alloc_disk(node_id); \ static struct lock_class_key __key; \ \ - if (__disk) \ - lockdep_init_map(&__disk->lockdep_map, \ - "(bio completion)", &__key, 0); \ - __disk; \ + __blk_alloc_disk(node_id, &__key); \ }) -struct gendisk *__blk_alloc_disk(int node); void blk_cleanup_disk(struct gendisk *disk); int __register_blkdev(unsigned int major, const char *name, From 9c2b9dbafc067e173db30c4fd0636392d27944e8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Aug 2021 15:19:06 +0200 Subject: [PATCH 100/129] block: remove alloc_disk and alloc_disk_node Most drivers should use and have been converted to use blk_alloc_disk and blk_mq_alloc_disk. Only the scsi ULPs and dasd still allocate a disk separately from the request_queue, so don't bother with convenience macros for something that should not see significant new users and remove these wrappers. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210816131910.615153-6-hch@lst.de Signed-off-by: Jens Axboe --- drivers/s390/block/dasd_genhd.c | 5 ++++- drivers/scsi/sd.c | 3 ++- drivers/scsi/sr.c | 4 +++- include/linux/genhd.h | 10 ---------- 4 files changed, 9 insertions(+), 13 deletions(-) diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c index 493e8469893c..07a69b19dd31 100644 --- a/drivers/s390/block/dasd_genhd.c +++ b/drivers/s390/block/dasd_genhd.c @@ -24,6 +24,8 @@ #include "dasd_int.h" +static struct lock_class_key dasd_bio_compl_lkclass; + /* * Allocate and register gendisk structure for device. */ @@ -38,7 +40,8 @@ int dasd_gendisk_alloc(struct dasd_block *block) if (base->devindex >= DASD_PER_MAJOR) return -EBUSY; - gdp = alloc_disk(1 << DASD_PARTN_BITS); + gdp = __alloc_disk_node(1 << DASD_PARTN_BITS, NUMA_NO_NODE, + &dasd_bio_compl_lkclass); if (!gdp) return -ENOMEM; diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 5b5b8266e142..a9535c6484de 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -129,6 +129,7 @@ static DEFINE_MUTEX(sd_ref_mutex); static struct kmem_cache *sd_cdb_cache; static mempool_t *sd_cdb_pool; static mempool_t *sd_page_pool; +static struct lock_class_key sd_bio_compl_lkclass; static const char *sd_cache_types[] = { "write through", "none", "write back", @@ -3408,7 +3409,7 @@ static int sd_probe(struct device *dev) if (!sdkp) goto out; - gd = alloc_disk(SD_MINORS); + gd = __alloc_disk_node(SD_MINORS, NUMA_NO_NODE, &sd_bio_compl_lkclass); if (!gd) goto out_free; diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index 94c254e9012e..fee2bdfe6132 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -106,6 +106,8 @@ static struct scsi_driver sr_template = { static unsigned long sr_index_bits[SR_DISKS / BITS_PER_LONG]; static DEFINE_SPINLOCK(sr_index_lock); +static struct lock_class_key sr_bio_compl_lkclass; + /* This semaphore is used to mediate the 0->1 reference get in the * face of object destruction (i.e. we can't allow a get on an * object after last put) */ @@ -712,7 +714,7 @@ static int sr_probe(struct device *dev) kref_init(&cd->kref); - disk = alloc_disk(1); + disk = __alloc_disk_node(1, NUMA_NO_NODE, &sr_bio_compl_lkclass); if (!disk) goto fail_free; mutex_init(&cd->lock); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 3d2e5ee30677..ceda9b255dba 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -262,16 +262,6 @@ void blk_drop_partitions(struct gendisk *disk); struct gendisk *__alloc_disk_node(int minors, int node_id, struct lock_class_key *lkclass); extern void put_disk(struct gendisk *disk); - -#define alloc_disk_node(minors, node_id) \ -({ \ - static struct lock_class_key __key; \ - \ - __alloc_disk_node(minors, node_id, &__key); \ -}) - -#define alloc_disk(minors) alloc_disk_node(minors, NUMA_NO_NODE) - struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass); /** From a58bd7683fcb60ae24c8572f932b48bc65719b7c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Aug 2021 15:19:07 +0200 Subject: [PATCH 101/129] block: remove the minors argument to __alloc_disk_node This was a leftover from the legacy alloc_disk interface. Switch the scsi ULPs and dasd to set ->minors directly like all other drivers and remove the argument. Signed-off-by: Christoph Hellwig Reviewed-by: Stefan Haberland [dasd] Link: https://lore.kernel.org/r/20210816131910.615153-7-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 +- block/genhd.c | 6 ++---- drivers/s390/block/dasd_genhd.c | 4 ++-- drivers/scsi/sd.c | 3 ++- drivers/scsi/sr.c | 3 ++- include/linux/genhd.h | 3 +-- 6 files changed, 10 insertions(+), 11 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 4c56e43e6992..8ac30c343c06 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3143,7 +3143,7 @@ struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, if (IS_ERR(q)) return ERR_CAST(q); - disk = __alloc_disk_node(0, set->numa_node, lkclass); + disk = __alloc_disk_node(set->numa_node, lkclass); if (!disk) { blk_cleanup_queue(q); return ERR_PTR(-ENOMEM); diff --git a/block/genhd.c b/block/genhd.c index 2ad2b25dfc87..caeda726189c 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1254,8 +1254,7 @@ dev_t blk_lookup_devt(const char *name, int partno) return devt; } -struct gendisk *__alloc_disk_node(int minors, int node_id, - struct lock_class_key *lkclass) +struct gendisk *__alloc_disk_node(int node_id, struct lock_class_key *lkclass) { struct gendisk *disk; @@ -1277,7 +1276,6 @@ struct gendisk *__alloc_disk_node(int minors, int node_id, if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL)) goto out_destroy_part_tbl; - disk->minors = minors; rand_initialize_disk(disk); disk_to_dev(disk)->class = &block_class; disk_to_dev(disk)->type = &disk_type; @@ -1309,7 +1307,7 @@ struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass) if (!q) return NULL; - disk = __alloc_disk_node(0, node, lkclass); + disk = __alloc_disk_node(node, lkclass); if (!disk) { blk_cleanup_queue(q); return NULL; diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c index 07a69b19dd31..6e44515b4d33 100644 --- a/drivers/s390/block/dasd_genhd.c +++ b/drivers/s390/block/dasd_genhd.c @@ -40,14 +40,14 @@ int dasd_gendisk_alloc(struct dasd_block *block) if (base->devindex >= DASD_PER_MAJOR) return -EBUSY; - gdp = __alloc_disk_node(1 << DASD_PARTN_BITS, NUMA_NO_NODE, - &dasd_bio_compl_lkclass); + gdp = __alloc_disk_node(NUMA_NO_NODE, &dasd_bio_compl_lkclass); if (!gdp) return -ENOMEM; /* Initialize gendisk structure. */ gdp->major = DASD_MAJOR; gdp->first_minor = base->devindex << DASD_PARTN_BITS; + gdp->minors = 1 << DASD_PARTN_BITS; gdp->fops = &dasd_device_operations; /* diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index a9535c6484de..1c6b8f012219 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -3409,7 +3409,7 @@ static int sd_probe(struct device *dev) if (!sdkp) goto out; - gd = __alloc_disk_node(SD_MINORS, NUMA_NO_NODE, &sd_bio_compl_lkclass); + gd = __alloc_disk_node(NUMA_NO_NODE, &sd_bio_compl_lkclass); if (!gd) goto out_free; @@ -3455,6 +3455,7 @@ static int sd_probe(struct device *dev) gd->major = sd_major((index & 0xf0) >> 4); gd->first_minor = ((index & 0xf) << 4) | (index & 0xfff00); + gd->minors = SD_MINORS; gd->fops = &sd_fops; gd->private_data = &sdkp->driver; diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index fee2bdfe6132..2c45b4140e67 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -714,7 +714,7 @@ static int sr_probe(struct device *dev) kref_init(&cd->kref); - disk = __alloc_disk_node(1, NUMA_NO_NODE, &sr_bio_compl_lkclass); + disk = __alloc_disk_node(NUMA_NO_NODE, &sr_bio_compl_lkclass); if (!disk) goto fail_free; mutex_init(&cd->lock); @@ -731,6 +731,7 @@ static int sr_probe(struct device *dev) disk->major = SCSI_CDROM_MAJOR; disk->first_minor = minor; + disk->minors = 1; sprintf(disk->disk_name, "sr%d", minor); disk->fops = &sr_bdops; disk->flags = GENHD_FL_CD | GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index ceda9b255dba..d20f101be758 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -259,8 +259,7 @@ static inline sector_t get_capacity(struct gendisk *disk) int bdev_disk_changed(struct gendisk *disk, bool invalidate); void blk_drop_partitions(struct gendisk *disk); -struct gendisk *__alloc_disk_node(int minors, int node_id, - struct lock_class_key *lkclass); +struct gendisk *__alloc_disk_node(int node_id, struct lock_class_key *lkclass); extern void put_disk(struct gendisk *disk); struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass); From 4a1fa41d304c7129328d4d5c7f31715b95e23b29 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Aug 2021 15:19:08 +0200 Subject: [PATCH 102/129] block: pass a request_queue to __blk_alloc_disk Pass in a request_queue and assign disk->queue in __blk_alloc_disk to ensure struct gendisk always has a valid ->queue pointer. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210816131910.615153-8-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 +-- block/genhd.c | 7 ++++--- drivers/s390/block/dasd_genhd.c | 4 ++-- drivers/scsi/sd.c | 4 ++-- drivers/scsi/sr.c | 4 ++-- include/linux/genhd.h | 3 ++- 6 files changed, 13 insertions(+), 12 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 8ac30c343c06..2ca7e7c94b18 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3143,12 +3143,11 @@ struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, if (IS_ERR(q)) return ERR_CAST(q); - disk = __alloc_disk_node(set->numa_node, lkclass); + disk = __alloc_disk_node(q, set->numa_node, lkclass); if (!disk) { blk_cleanup_queue(q); return ERR_PTR(-ENOMEM); } - disk->queue = q; return disk; } EXPORT_SYMBOL(__blk_mq_alloc_disk); diff --git a/block/genhd.c b/block/genhd.c index caeda726189c..f18122ee2778 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1254,7 +1254,8 @@ dev_t blk_lookup_devt(const char *name, int partno) return devt; } -struct gendisk *__alloc_disk_node(int node_id, struct lock_class_key *lkclass) +struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, + struct lock_class_key *lkclass) { struct gendisk *disk; @@ -1281,6 +1282,7 @@ struct gendisk *__alloc_disk_node(int node_id, struct lock_class_key *lkclass) disk_to_dev(disk)->type = &disk_type; device_initialize(disk_to_dev(disk)); inc_diskseq(disk); + disk->queue = q; lockdep_init_map(&disk->lockdep_map, "(bio completion)", lkclass, 0); #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED INIT_LIST_HEAD(&disk->slave_bdevs); @@ -1307,12 +1309,11 @@ struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass) if (!q) return NULL; - disk = __alloc_disk_node(node, lkclass); + disk = __alloc_disk_node(q, node, lkclass); if (!disk) { blk_cleanup_queue(q); return NULL; } - disk->queue = q; return disk; } EXPORT_SYMBOL(__blk_alloc_disk); diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c index 6e44515b4d33..fa966e0db6ca 100644 --- a/drivers/s390/block/dasd_genhd.c +++ b/drivers/s390/block/dasd_genhd.c @@ -40,7 +40,8 @@ int dasd_gendisk_alloc(struct dasd_block *block) if (base->devindex >= DASD_PER_MAJOR) return -EBUSY; - gdp = __alloc_disk_node(NUMA_NO_NODE, &dasd_bio_compl_lkclass); + gdp = __alloc_disk_node(block->request_queue, NUMA_NO_NODE, + &dasd_bio_compl_lkclass); if (!gdp) return -ENOMEM; @@ -76,7 +77,6 @@ int dasd_gendisk_alloc(struct dasd_block *block) test_bit(DASD_FLAG_DEVICE_RO, &base->flags)) set_disk_ro(gdp, 1); dasd_add_link_to_gendisk(gdp, base); - gdp->queue = block->request_queue; block->gdp = gdp; set_capacity(block->gdp, 0); device_add_disk(&base->cdev->dev, block->gdp, NULL); diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 1c6b8f012219..610ebba0d66e 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -3409,7 +3409,8 @@ static int sd_probe(struct device *dev) if (!sdkp) goto out; - gd = __alloc_disk_node(NUMA_NO_NODE, &sd_bio_compl_lkclass); + gd = __alloc_disk_node(sdp->request_queue, NUMA_NO_NODE, + &sd_bio_compl_lkclass); if (!gd) goto out_free; @@ -3459,7 +3460,6 @@ static int sd_probe(struct device *dev) gd->fops = &sd_fops; gd->private_data = &sdkp->driver; - gd->queue = sdkp->device->request_queue; /* defaults, until the device tells us otherwise */ sdp->sector_size = 512; diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index 2c45b4140e67..a0df27db4d61 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -714,7 +714,8 @@ static int sr_probe(struct device *dev) kref_init(&cd->kref); - disk = __alloc_disk_node(NUMA_NO_NODE, &sr_bio_compl_lkclass); + disk = __alloc_disk_node(sdev->request_queue, NUMA_NO_NODE, + &sr_bio_compl_lkclass); if (!disk) goto fail_free; mutex_init(&cd->lock); @@ -765,7 +766,6 @@ static int sr_probe(struct device *dev) set_capacity(disk, cd->capacity); disk->private_data = &cd->driver; - disk->queue = sdev->request_queue; if (register_cdrom(disk, &cd->cdi)) goto fail_minor; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index d20f101be758..13e90e6231d8 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -259,7 +259,8 @@ static inline sector_t get_capacity(struct gendisk *disk) int bdev_disk_changed(struct gendisk *disk, bool invalidate); void blk_drop_partitions(struct gendisk *disk); -struct gendisk *__alloc_disk_node(int node_id, struct lock_class_key *lkclass); +struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, + struct lock_class_key *lkclass); extern void put_disk(struct gendisk *disk); struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass); From 61a35cfc26334fe1c8e970ca8fafeae2daae257d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Aug 2021 15:19:09 +0200 Subject: [PATCH 103/129] block: hold a request_queue reference for the lifetime of struct gendisk Acquire the queue ref dropped in disk_release in __blk_alloc_disk so any allocate gendisk always has a queue reference. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210816131910.615153-9-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 19 +++++++------------ include/linux/genhd.h | 1 - 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index f18122ee2778..6294517cebe6 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -551,15 +551,6 @@ void device_add_disk(struct device *parent, struct gendisk *disk, register_disk(parent, disk, groups); blk_register_queue(disk); - /* - * Take an extra ref on queue which will be put on disk_release() - * so that it sticks around as long as @disk is there. - */ - if (blk_get_queue(disk->queue)) - set_bit(GD_QUEUE_REF, &disk->state); - else - WARN_ON_ONCE(1); - disk_add_events(disk); blk_integrity_add(disk); } @@ -1087,8 +1078,7 @@ static void disk_release(struct device *dev) disk_release_events(disk); kfree(disk->random); xa_destroy(&disk->part_tbl); - if (test_bit(GD_QUEUE_REF, &disk->state) && disk->queue) - blk_put_queue(disk->queue); + blk_put_queue(disk->queue); iput(disk->part0->bd_inode); /* frees the disk */ } @@ -1259,9 +1249,12 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, { struct gendisk *disk; + if (!blk_get_queue(q)) + return NULL; + disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id); if (!disk) - return NULL; + goto out_put_queue; disk->bdi = bdi_alloc(node_id); if (!disk->bdi) @@ -1296,6 +1289,8 @@ out_free_bdi: bdi_put(disk->bdi); out_free_disk: kfree(disk); +out_put_queue: + blk_put_queue(q); return NULL; } EXPORT_SYMBOL(__alloc_disk_node); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 13e90e6231d8..55acefdd8a20 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -149,7 +149,6 @@ struct gendisk { unsigned long state; #define GD_NEED_PART_SCAN 0 #define GD_READ_ONLY 1 -#define GD_QUEUE_REF 2 struct mutex open_mutex; /* open/close mutex */ unsigned open_partitions; /* number of open partitions */ From d152c682f03ceb65c0d9663d4ba6ee2d46aa784d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Aug 2021 15:46:24 +0200 Subject: [PATCH 104/129] block: add an explicit ->disk backpointer to the request_queue Replace the magic lookup through the kobject tree with an explicit backpointer, given that the device model links are set up and torn down at times when I/O is still possible, leading to potential NULL or invalid pointer dereferences. Fixes: edb0872f44ec ("block: move the bdi from the request_queue to the gendisk") Reported-by: syzbot Signed-off-by: Christoph Hellwig Tested-by: Sven Schnelle Link: https://lore.kernel.org/r/20210816134624.GA24234@lst.de Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 2 +- block/blk-cgroup.c | 4 ++-- block/blk-mq.c | 2 +- block/blk-settings.c | 8 ++++---- block/blk-sysfs.c | 13 ++++++------- block/blk-wbt.c | 10 +++++----- block/genhd.c | 2 ++ include/linux/blkdev.h | 5 ++--- include/trace/events/kyber.h | 6 +++--- 9 files changed, 26 insertions(+), 26 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index e92bc0348433..480e1a134859 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5269,7 +5269,7 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) switch (ioprio_class) { default: pr_err("bdi %s: bfq: bad prio class %d\n", - bdi_dev_name(queue_to_disk(bfqq->bfqd->queue)->bdi), + bdi_dev_name(bfqq->bfqd->queue->disk->bdi), ioprio_class); fallthrough; case IOPRIO_CLASS_NONE: diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index b8ec47dcce42..f575aa42922b 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -489,9 +489,9 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, const char *blkg_dev_name(struct blkcg_gq *blkg) { - if (!queue_has_disk(blkg->q) || !queue_to_disk(blkg->q)->bdi->dev) + if (!blkg->q->disk || !blkg->q->disk->bdi->dev) return NULL; - return bdi_dev_name(queue_to_disk(blkg->q)->bdi); + return bdi_dev_name(blkg->q->disk->bdi); } /** diff --git a/block/blk-mq.c b/block/blk-mq.c index 2ca7e7c94b18..0a33d16a7298 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -525,7 +525,7 @@ void blk_mq_free_request(struct request *rq) __blk_mq_dec_active_requests(hctx); if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) - laptop_io_completion(queue_to_disk(q)->bdi); + laptop_io_completion(q->disk->bdi); rq_qos_done(q, rq); diff --git a/block/blk-settings.c b/block/blk-settings.c index 3613d2cc0688..a7c857ad7d10 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -141,9 +141,9 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto limits->logical_block_size >> SECTOR_SHIFT); limits->max_sectors = max_sectors; - if (!queue_has_disk(q)) + if (!q->disk) return; - queue_to_disk(q)->bdi->io_pages = max_sectors >> (PAGE_SHIFT - 9); + q->disk->bdi->io_pages = max_sectors >> (PAGE_SHIFT - 9); } EXPORT_SYMBOL(blk_queue_max_hw_sectors); @@ -475,9 +475,9 @@ EXPORT_SYMBOL(blk_limits_io_opt); void blk_queue_io_opt(struct request_queue *q, unsigned int opt) { blk_limits_io_opt(&q->limits, opt); - if (!queue_has_disk(q)) + if (!q->disk) return; - queue_to_disk(q)->bdi->ra_pages = + q->disk->bdi->ra_pages = max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES); } EXPORT_SYMBOL(blk_queue_io_opt); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 586507a5b8c2..7fd99487300c 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -90,9 +90,9 @@ static ssize_t queue_ra_show(struct request_queue *q, char *page) { unsigned long ra_kb; - if (!queue_has_disk(q)) + if (!q->disk) return -EINVAL; - ra_kb = queue_to_disk(q)->bdi->ra_pages << (PAGE_SHIFT - 10); + ra_kb = q->disk->bdi->ra_pages << (PAGE_SHIFT - 10); return queue_var_show(ra_kb, page); } @@ -102,12 +102,12 @@ queue_ra_store(struct request_queue *q, const char *page, size_t count) unsigned long ra_kb; ssize_t ret; - if (!queue_has_disk(q)) + if (!q->disk) return -EINVAL; ret = queue_var_store(&ra_kb, page, count); if (ret < 0) return ret; - queue_to_disk(q)->bdi->ra_pages = ra_kb >> (PAGE_SHIFT - 10); + q->disk->bdi->ra_pages = ra_kb >> (PAGE_SHIFT - 10); return ret; } @@ -254,9 +254,8 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) spin_lock_irq(&q->queue_lock); q->limits.max_sectors = max_sectors_kb << 1; - if (queue_has_disk(q)) - queue_to_disk(q)->bdi->io_pages = - max_sectors_kb >> (PAGE_SHIFT - 10); + if (q->disk) + q->disk->bdi->io_pages = max_sectors_kb >> (PAGE_SHIFT - 10); spin_unlock_irq(&q->queue_lock); return ret; diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 31086afaad9c..874c1c37bf0c 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -97,7 +97,7 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) */ static bool wb_recent_wait(struct rq_wb *rwb) { - struct bdi_writeback *wb = &queue_to_disk(rwb->rqos.q)->bdi->wb; + struct bdi_writeback *wb = &rwb->rqos.q->disk->bdi->wb; return time_before(jiffies, wb->dirty_sleep + HZ); } @@ -234,7 +234,7 @@ enum { static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) { - struct backing_dev_info *bdi = queue_to_disk(rwb->rqos.q)->bdi; + struct backing_dev_info *bdi = rwb->rqos.q->disk->bdi; struct rq_depth *rqd = &rwb->rq_depth; u64 thislat; @@ -287,7 +287,7 @@ static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) static void rwb_trace_step(struct rq_wb *rwb, const char *msg) { - struct backing_dev_info *bdi = queue_to_disk(rwb->rqos.q)->bdi; + struct backing_dev_info *bdi = rwb->rqos.q->disk->bdi; struct rq_depth *rqd = &rwb->rq_depth; trace_wbt_step(bdi, msg, rqd->scale_step, rwb->cur_win_nsec, @@ -359,8 +359,8 @@ static void wb_timer_fn(struct blk_stat_callback *cb) status = latency_exceeded(rwb, cb->stat); - trace_wbt_timer(queue_to_disk(rwb->rqos.q)->bdi, status, - rqd->scale_step, inflight); + trace_wbt_timer(rwb->rqos.q->disk->bdi, status, rqd->scale_step, + inflight); /* * If we exceeded the latency target, step down. If we did not, diff --git a/block/genhd.c b/block/genhd.c index 6294517cebe6..02cd9ec93e52 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1078,6 +1078,7 @@ static void disk_release(struct device *dev) disk_release_events(disk); kfree(disk->random); xa_destroy(&disk->part_tbl); + disk->queue->disk = NULL; blk_put_queue(disk->queue); iput(disk->part0->bd_inode); /* frees the disk */ } @@ -1276,6 +1277,7 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, device_initialize(disk_to_dev(disk)); inc_diskseq(disk); disk->queue = q; + q->disk = disk; lockdep_init_map(&disk->lockdep_map, "(bio completion)", lkclass, 0); #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED INIT_LIST_HEAD(&disk->slave_bdevs); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index df404c1fb087..22b5b8502d2a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -421,6 +421,8 @@ struct request_queue { spinlock_t queue_lock; + struct gendisk *disk; + /* * queue kobject */ @@ -661,9 +663,6 @@ extern void blk_clear_pm_only(struct request_queue *q); dma_map_page_attrs(dev, (bv)->bv_page, (bv)->bv_offset, (bv)->bv_len, \ (dir), (attrs)) -#define queue_has_disk(q) ((q)->kobj.parent != NULL) -#define queue_to_disk(q) (dev_to_disk(kobj_to_dev((q)->kobj.parent))) - static inline bool queue_is_mq(struct request_queue *q) { return q->mq_ops; diff --git a/include/trace/events/kyber.h b/include/trace/events/kyber.h index f9802562edf6..491098a0d8ed 100644 --- a/include/trace/events/kyber.h +++ b/include/trace/events/kyber.h @@ -30,7 +30,7 @@ TRACE_EVENT(kyber_latency, ), TP_fast_assign( - __entry->dev = disk_devt(queue_to_disk(q)); + __entry->dev = disk_devt(q->disk); strlcpy(__entry->domain, domain, sizeof(__entry->domain)); strlcpy(__entry->type, type, sizeof(__entry->type)); __entry->percentile = percentile; @@ -59,7 +59,7 @@ TRACE_EVENT(kyber_adjust, ), TP_fast_assign( - __entry->dev = disk_devt(queue_to_disk(q)); + __entry->dev = disk_devt(q->disk); strlcpy(__entry->domain, domain, sizeof(__entry->domain)); __entry->depth = depth; ), @@ -81,7 +81,7 @@ TRACE_EVENT(kyber_throttled, ), TP_fast_assign( - __entry->dev = disk_devt(queue_to_disk(q)); + __entry->dev = disk_devt(q->disk); strlcpy(__entry->domain, domain, sizeof(__entry->domain)); ), From 40b3a52ffc5bc3b5427d5d35b035cfb19d03fdd6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 Aug 2021 16:45:32 +0200 Subject: [PATCH 105/129] block: add a sanity check for a live disk in del_gendisk Add a sanity check to del_gendisk to do nothing when the disk wasn't successfully added. This papers over the complete lack of add_disk error handling, which is about to get fixed gradually. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210818144542.19305-2-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/genhd.c b/block/genhd.c index 02cd9ec93e52..935f74c652f1 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -579,7 +579,7 @@ void del_gendisk(struct gendisk *disk) { might_sleep(); - if (WARN_ON_ONCE(!disk->queue)) + if (WARN_ON_ONCE(!disk_live(disk))) return; blk_integrity_del(disk); From 52b85909f85d06efa69aaf4210e72467f1f58d2b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 Aug 2021 16:45:33 +0200 Subject: [PATCH 106/129] block: fold register_disk into device_add_disk There is no real reason these should be separate. Also simplify the groups assignment a bit. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210818144542.19305-3-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 131 +++++++++++++++++++++++--------------------------- 1 file changed, 60 insertions(+), 71 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 935f74c652f1..ec4be5889fbf 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -409,71 +409,6 @@ static void disk_scan_partitions(struct gendisk *disk) blkdev_put(bdev, FMODE_READ); } -static void register_disk(struct device *parent, struct gendisk *disk, - const struct attribute_group **groups) -{ - struct device *ddev = disk_to_dev(disk); - int err; - - ddev->parent = parent; - - dev_set_name(ddev, "%s", disk->disk_name); - - /* delay uevents, until we scanned partition table */ - dev_set_uevent_suppress(ddev, 1); - - if (groups) { - WARN_ON(ddev->groups); - ddev->groups = groups; - } - if (device_add(ddev)) - return; - if (!sysfs_deprecated) { - err = sysfs_create_link(block_depr, &ddev->kobj, - kobject_name(&ddev->kobj)); - if (err) { - device_del(ddev); - return; - } - } - - /* - * avoid probable deadlock caused by allocating memory with - * GFP_KERNEL in runtime_resume callback of its all ancestor - * devices - */ - pm_runtime_set_memalloc_noio(ddev, true); - - disk->part0->bd_holder_dir = - kobject_create_and_add("holders", &ddev->kobj); - disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); - - /* - * XXX: this is a mess, can't wait for real error handling in add_disk. - * Make sure ->slave_dir is NULL if we failed some of the registration - * so that the cleanup in bd_unlink_disk_holder works properly. - */ - if (bd_register_pending_holders(disk) < 0) { - kobject_put(disk->slave_dir); - disk->slave_dir = NULL; - } - - if (disk->flags & GENHD_FL_HIDDEN) - return; - - disk_scan_partitions(disk); - - /* announce the disk and partitions after all partitions are created */ - dev_set_uevent_suppress(ddev, 0); - disk_uevent(disk, KOBJ_ADD); - - if (disk->bdi->dev) { - err = sysfs_create_link(&ddev->kobj, &disk->bdi->dev->kobj, - "bdi"); - WARN_ON(err); - } -} - /** * device_add_disk - add disk information to kernel list * @parent: parent device for the disk @@ -490,6 +425,7 @@ void device_add_disk(struct device *parent, struct gendisk *disk, const struct attribute_group **groups) { + struct device *ddev = disk_to_dev(disk); int ret; /* @@ -538,17 +474,70 @@ void device_add_disk(struct device *parent, struct gendisk *disk, disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO; disk->flags |= GENHD_FL_NO_PART_SCAN; } else { - struct device *dev = disk_to_dev(disk); - /* Register BDI before referencing it from bdev */ - dev->devt = MKDEV(disk->major, disk->first_minor); + ddev->devt = MKDEV(disk->major, disk->first_minor); ret = bdi_register(disk->bdi, "%u:%u", disk->major, disk->first_minor); WARN_ON(ret); - bdi_set_owner(disk->bdi, dev); - bdev_add(disk->part0, dev->devt); + bdi_set_owner(disk->bdi, ddev); + bdev_add(disk->part0, ddev->devt); } - register_disk(parent, disk, groups); + + /* delay uevents, until we scanned partition table */ + dev_set_uevent_suppress(ddev, 1); + + ddev->parent = parent; + ddev->groups = groups; + dev_set_name(ddev, "%s", disk->disk_name); + if (device_add(ddev)) + return; + if (!sysfs_deprecated) { + ret = sysfs_create_link(block_depr, &ddev->kobj, + kobject_name(&ddev->kobj)); + if (ret) { + device_del(ddev); + return; + } + } + + /* + * avoid probable deadlock caused by allocating memory with + * GFP_KERNEL in runtime_resume callback of its all ancestor + * devices + */ + pm_runtime_set_memalloc_noio(ddev, true); + + disk->part0->bd_holder_dir = + kobject_create_and_add("holders", &ddev->kobj); + disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); + + /* + * XXX: this is a mess, can't wait for real error handling in add_disk. + * Make sure ->slave_dir is NULL if we failed some of the registration + * so that the cleanup in bd_unlink_disk_holder works properly. + */ + if (bd_register_pending_holders(disk) < 0) { + kobject_put(disk->slave_dir); + disk->slave_dir = NULL; + } + + if (!(disk->flags & GENHD_FL_HIDDEN)) { + disk_scan_partitions(disk); + + /* + * Announce the disk and partitions after all partitions are + * created. + */ + dev_set_uevent_suppress(ddev, 0); + disk_uevent(disk, KOBJ_ADD); + + if (disk->bdi->dev) { + ret = sysfs_create_link(&ddev->kobj, + &disk->bdi->dev->kobj, "bdi"); + WARN_ON(ret); + } + } + blk_register_queue(disk); disk_add_events(disk); From 8235b5c1e8c1c0537f03a21a2e380098bed25248 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 Aug 2021 16:45:34 +0200 Subject: [PATCH 107/129] block: call bdev_add later in device_add_disk Once bdev_add is called userspace can open the block device. Ensure that the struct device, which is used for refcounting of the disk besides various other things, is fully setup at that point. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210818144542.19305-4-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index ec4be5889fbf..ab455f110be2 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -466,29 +466,14 @@ void device_add_disk(struct device *parent, struct gendisk *disk, disk_alloc_events(disk); - if (disk->flags & GENHD_FL_HIDDEN) { - /* - * Don't let hidden disks show up in /proc/partitions, - * and don't bother scanning for partitions either. - */ - disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO; - disk->flags |= GENHD_FL_NO_PART_SCAN; - } else { - /* Register BDI before referencing it from bdev */ - ddev->devt = MKDEV(disk->major, disk->first_minor); - ret = bdi_register(disk->bdi, "%u:%u", - disk->major, disk->first_minor); - WARN_ON(ret); - bdi_set_owner(disk->bdi, ddev); - bdev_add(disk->part0, ddev->devt); - } - /* delay uevents, until we scanned partition table */ dev_set_uevent_suppress(ddev, 1); ddev->parent = parent; ddev->groups = groups; dev_set_name(ddev, "%s", disk->disk_name); + if (!(disk->flags & GENHD_FL_HIDDEN)) + ddev->devt = MKDEV(disk->major, disk->first_minor); if (device_add(ddev)) return; if (!sysfs_deprecated) { @@ -521,12 +506,25 @@ void device_add_disk(struct device *parent, struct gendisk *disk, disk->slave_dir = NULL; } - if (!(disk->flags & GENHD_FL_HIDDEN)) { + if (disk->flags & GENHD_FL_HIDDEN) { + /* + * Don't let hidden disks show up in /proc/partitions, + * and don't bother scanning for partitions either. + */ + disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO; + disk->flags |= GENHD_FL_NO_PART_SCAN; + } else { + ret = bdi_register(disk->bdi, "%u:%u", + disk->major, disk->first_minor); + WARN_ON(ret); + bdi_set_owner(disk->bdi, ddev); + bdev_add(disk->part0, ddev->devt); + disk_scan_partitions(disk); /* * Announce the disk and partitions after all partitions are - * created. + * created. (for hidden disks uevents remain suppressed forever) */ dev_set_uevent_suppress(ddev, 0); disk_uevent(disk, KOBJ_ADD); From 9d5ee6767c85762205b788ed1245f21fafd6c504 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 Aug 2021 16:45:35 +0200 Subject: [PATCH 108/129] block: create the bdi link earlier in device_add_disk This will simplify error handling going forward. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210818144542.19305-5-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index ab455f110be2..f05e58f214d2 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -518,8 +518,13 @@ void device_add_disk(struct device *parent, struct gendisk *disk, disk->major, disk->first_minor); WARN_ON(ret); bdi_set_owner(disk->bdi, ddev); - bdev_add(disk->part0, ddev->devt); + if (disk->bdi->dev) { + ret = sysfs_create_link(&ddev->kobj, + &disk->bdi->dev->kobj, "bdi"); + WARN_ON(ret); + } + bdev_add(disk->part0, ddev->devt); disk_scan_partitions(disk); /* @@ -528,12 +533,6 @@ void device_add_disk(struct device *parent, struct gendisk *disk, */ dev_set_uevent_suppress(ddev, 0); disk_uevent(disk, KOBJ_ADD); - - if (disk->bdi->dev) { - ret = sysfs_create_link(&ddev->kobj, - &disk->bdi->dev->kobj, "bdi"); - WARN_ON(ret); - } } blk_register_queue(disk); From bab53f6b617d9f530978d6e3693f88e586d81a8a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 Aug 2021 16:45:36 +0200 Subject: [PATCH 109/129] block: call blk_integrity_add earlier in device_add_disk Doing all the sysfs file creation before adding the bdev and thus allowing it to be opened will simplify the about to be added error handling. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210818144542.19305-6-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/block/genhd.c b/block/genhd.c index f05e58f214d2..75d900e4c82f 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -492,6 +492,8 @@ void device_add_disk(struct device *parent, struct gendisk *disk, */ pm_runtime_set_memalloc_noio(ddev, true); + blk_integrity_add(disk); + disk->part0->bd_holder_dir = kobject_create_and_add("holders", &ddev->kobj); disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); @@ -538,7 +540,6 @@ void device_add_disk(struct device *parent, struct gendisk *disk, blk_register_queue(disk); disk_add_events(disk); - blk_integrity_add(disk); } EXPORT_SYMBOL(device_add_disk); From 75f4dca59694dfe288ae6a48d7b147b60d11c95c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 Aug 2021 16:45:37 +0200 Subject: [PATCH 110/129] block: call blk_register_queue earlier in device_add_disk Ensure that all the sysfs bits are set up before bdev_add is called, as that will make the upcomding error handling much easier. However this means the call to disk_update_readahead has to be split as that requires a bdi. Also remove various sanity checks that don't make sense now that blk_register_queue only has a single caller. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210818144542.19305-7-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 9 --------- block/genhd.c | 5 +++-- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 7fd99487300c..614d9d47de36 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -856,15 +856,6 @@ int blk_register_queue(struct gendisk *disk) struct device *dev = disk_to_dev(disk); struct request_queue *q = disk->queue; - if (WARN_ON(!q)) - return -ENXIO; - - WARN_ONCE(blk_queue_registered(q), - "%s is registering an already registered queue\n", - kobject_name(&dev->kobj)); - - disk_update_readahead(disk); - ret = blk_trace_init_sysfs(dev); if (ret) return ret; diff --git a/block/genhd.c b/block/genhd.c index 75d900e4c82f..a54b4849242c 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -508,6 +508,8 @@ void device_add_disk(struct device *parent, struct gendisk *disk, disk->slave_dir = NULL; } + blk_register_queue(disk); + if (disk->flags & GENHD_FL_HIDDEN) { /* * Don't let hidden disks show up in /proc/partitions, @@ -537,8 +539,7 @@ void device_add_disk(struct device *parent, struct gendisk *disk, disk_uevent(disk, KOBJ_ADD); } - blk_register_queue(disk); - + disk_update_readahead(disk); disk_add_events(disk); } EXPORT_SYMBOL(device_add_disk); From 614310c9c8ca15359f4e71a5bbd9165897b4d54e Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Wed, 18 Aug 2021 16:45:38 +0200 Subject: [PATCH 111/129] block: return errors from blk_integrity_add Prepare for proper error handling in add_disk. Signed-off-by: Luis Chamberlain [hch: split from a larger patch] Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210818144542.19305-8-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-integrity.c | 12 +++++++----- block/blk.h | 5 +++-- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 410da060d1f5..69a12177dfb6 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -431,13 +431,15 @@ void blk_integrity_unregister(struct gendisk *disk) } EXPORT_SYMBOL(blk_integrity_unregister); -void blk_integrity_add(struct gendisk *disk) +int blk_integrity_add(struct gendisk *disk) { - if (kobject_init_and_add(&disk->integrity_kobj, &integrity_ktype, - &disk_to_dev(disk)->kobj, "%s", "integrity")) - return; + int ret; - kobject_uevent(&disk->integrity_kobj, KOBJ_ADD); + ret = kobject_init_and_add(&disk->integrity_kobj, &integrity_ktype, + &disk_to_dev(disk)->kobj, "%s", "integrity"); + if (!ret) + kobject_uevent(&disk->integrity_kobj, KOBJ_ADD); + return ret; } void blk_integrity_del(struct gendisk *disk) diff --git a/block/blk.h b/block/blk.h index 148bdcd3aa08..c9727dd56da7 100644 --- a/block/blk.h +++ b/block/blk.h @@ -132,7 +132,7 @@ static inline bool integrity_req_gap_front_merge(struct request *req, bip_next->bip_vec[0].bv_offset); } -void blk_integrity_add(struct gendisk *); +int blk_integrity_add(struct gendisk *disk); void blk_integrity_del(struct gendisk *); #else /* CONFIG_BLK_DEV_INTEGRITY */ static inline bool blk_integrity_merge_rq(struct request_queue *rq, @@ -166,8 +166,9 @@ static inline bool bio_integrity_endio(struct bio *bio) static inline void bio_integrity_free(struct bio *bio) { } -static inline void blk_integrity_add(struct gendisk *disk) +static inline int blk_integrity_add(struct gendisk *disk) { + return 0; } static inline void blk_integrity_del(struct gendisk *disk) { From 92e7755ebc69233e25a2d1b760aeff536dc4016b Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Wed, 18 Aug 2021 16:45:39 +0200 Subject: [PATCH 112/129] block: return errors from disk_alloc_events Prepare for proper error handling in add_disk. Signed-off-by: Luis Chamberlain [hch: split from a larger patch] Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210818144542.19305-9-hch@lst.de Signed-off-by: Jens Axboe --- block/blk.h | 2 +- block/disk-events.c | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/block/blk.h b/block/blk.h index c9727dd56da7..bbbcc1a64a2d 100644 --- a/block/blk.h +++ b/block/blk.h @@ -362,7 +362,7 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct request_queue *blk_alloc_queue(int node_id); -void disk_alloc_events(struct gendisk *disk); +int disk_alloc_events(struct gendisk *disk); void disk_add_events(struct gendisk *disk); void disk_del_events(struct gendisk *disk); void disk_release_events(struct gendisk *disk); diff --git a/block/disk-events.c b/block/disk-events.c index 7445b8ff2775..8d5496e7592a 100644 --- a/block/disk-events.c +++ b/block/disk-events.c @@ -444,17 +444,17 @@ module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops, /* * disk_{alloc|add|del|release}_events - initialize and destroy disk_events. */ -void disk_alloc_events(struct gendisk *disk) +int disk_alloc_events(struct gendisk *disk) { struct disk_events *ev; if (!disk->fops->check_events || !disk->events) - return; + return 0; ev = kzalloc(sizeof(*ev), GFP_KERNEL); if (!ev) { pr_warn("%s: failed to initialize events\n", disk->disk_name); - return; + return -ENOMEM; } INIT_LIST_HEAD(&ev->node); @@ -466,6 +466,7 @@ void disk_alloc_events(struct gendisk *disk) INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn); disk->ev = ev; + return 0; } void disk_add_events(struct gendisk *disk) From 83cbce9574462c6b4eed6797bdaf18fae6859ab3 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Wed, 18 Aug 2021 16:45:40 +0200 Subject: [PATCH 113/129] block: add error handling for device_add_disk / add_disk Properly unwind on errors in device_add_disk. This is the initial work as drivers are not converted yet, which will follow in separate patches. Signed-off-by: Luis Chamberlain [hch: major rebase. All bugs are probably mine] Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210818144542.19305-10-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 92 +++++++++++++++++++++++++++---------------- include/linux/genhd.h | 8 ++-- 2 files changed, 62 insertions(+), 38 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index a54b4849242c..a925f773145f 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -417,11 +417,8 @@ static void disk_scan_partitions(struct gendisk *disk) * * This function registers the partitioning information in @disk * with the kernel. - * - * FIXME: error handling */ - -void device_add_disk(struct device *parent, struct gendisk *disk, +int device_add_disk(struct device *parent, struct gendisk *disk, const struct attribute_group **groups) { @@ -444,7 +441,8 @@ void device_add_disk(struct device *parent, struct gendisk *disk, * and all partitions from the extended dev_t space. */ if (disk->major) { - WARN_ON(!disk->minors); + if (WARN_ON(!disk->minors)) + return -EINVAL; if (disk->minors > DISK_MAX_PARTS) { pr_err("block: can't allocate more than %d partitions\n", @@ -452,19 +450,20 @@ void device_add_disk(struct device *parent, struct gendisk *disk, disk->minors = DISK_MAX_PARTS; } } else { - WARN_ON(disk->minors); + if (WARN_ON(disk->minors)) + return -EINVAL; ret = blk_alloc_ext_minor(); - if (ret < 0) { - WARN_ON(1); - return; - } + if (ret < 0) + return ret; disk->major = BLOCK_EXT_MAJOR; disk->first_minor = MINOR(ret); disk->flags |= GENHD_FL_EXT_DEVT; } - disk_alloc_events(disk); + ret = disk_alloc_events(disk); + if (ret) + goto out_free_ext_minor; /* delay uevents, until we scanned partition table */ dev_set_uevent_suppress(ddev, 1); @@ -474,15 +473,14 @@ void device_add_disk(struct device *parent, struct gendisk *disk, dev_set_name(ddev, "%s", disk->disk_name); if (!(disk->flags & GENHD_FL_HIDDEN)) ddev->devt = MKDEV(disk->major, disk->first_minor); - if (device_add(ddev)) - return; + ret = device_add(ddev); + if (ret) + goto out_disk_release_events; if (!sysfs_deprecated) { ret = sysfs_create_link(block_depr, &ddev->kobj, kobject_name(&ddev->kobj)); - if (ret) { - device_del(ddev); - return; - } + if (ret) + goto out_device_del; } /* @@ -492,23 +490,25 @@ void device_add_disk(struct device *parent, struct gendisk *disk, */ pm_runtime_set_memalloc_noio(ddev, true); - blk_integrity_add(disk); + ret = blk_integrity_add(disk); + if (ret) + goto out_del_block_link; disk->part0->bd_holder_dir = kobject_create_and_add("holders", &ddev->kobj); + if (!disk->part0->bd_holder_dir) + goto out_del_integrity; disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); + if (!disk->slave_dir) + goto out_put_holder_dir; - /* - * XXX: this is a mess, can't wait for real error handling in add_disk. - * Make sure ->slave_dir is NULL if we failed some of the registration - * so that the cleanup in bd_unlink_disk_holder works properly. - */ - if (bd_register_pending_holders(disk) < 0) { - kobject_put(disk->slave_dir); - disk->slave_dir = NULL; - } + ret = bd_register_pending_holders(disk); + if (ret < 0) + goto out_put_slave_dir; - blk_register_queue(disk); + ret = blk_register_queue(disk); + if (ret) + goto out_put_slave_dir; if (disk->flags & GENHD_FL_HIDDEN) { /* @@ -520,13 +520,13 @@ void device_add_disk(struct device *parent, struct gendisk *disk, } else { ret = bdi_register(disk->bdi, "%u:%u", disk->major, disk->first_minor); - WARN_ON(ret); + if (ret) + goto out_unregister_queue; bdi_set_owner(disk->bdi, ddev); - if (disk->bdi->dev) { - ret = sysfs_create_link(&ddev->kobj, - &disk->bdi->dev->kobj, "bdi"); - WARN_ON(ret); - } + ret = sysfs_create_link(&ddev->kobj, + &disk->bdi->dev->kobj, "bdi"); + if (ret) + goto out_unregister_bdi; bdev_add(disk->part0, ddev->devt); disk_scan_partitions(disk); @@ -541,6 +541,30 @@ void device_add_disk(struct device *parent, struct gendisk *disk, disk_update_readahead(disk); disk_add_events(disk); + return 0; + +out_unregister_bdi: + if (!(disk->flags & GENHD_FL_HIDDEN)) + bdi_unregister(disk->bdi); +out_unregister_queue: + blk_unregister_queue(disk); +out_put_slave_dir: + kobject_put(disk->slave_dir); +out_put_holder_dir: + kobject_put(disk->part0->bd_holder_dir); +out_del_integrity: + blk_integrity_del(disk); +out_del_block_link: + if (!sysfs_deprecated) + sysfs_remove_link(block_depr, dev_name(ddev)); +out_device_del: + device_del(ddev); +out_disk_release_events: + disk_release_events(disk); +out_free_ext_minor: + if (disk->major == BLOCK_EXT_MAJOR) + blk_free_ext_minor(disk->first_minor); + return WARN_ON_ONCE(ret); /* keep until all callers handle errors */ } EXPORT_SYMBOL(device_add_disk); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 55acefdd8a20..c68d83c87f83 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -214,11 +214,11 @@ static inline dev_t disk_devt(struct gendisk *disk) void disk_uevent(struct gendisk *disk, enum kobject_action action); /* block/genhd.c */ -extern void device_add_disk(struct device *parent, struct gendisk *disk, - const struct attribute_group **groups); -static inline void add_disk(struct gendisk *disk) +int device_add_disk(struct device *parent, struct gendisk *disk, + const struct attribute_group **groups); +static inline int add_disk(struct gendisk *disk) { - device_add_disk(NULL, disk, NULL); + return device_add_disk(NULL, disk, NULL); } extern void del_gendisk(struct gendisk *gp); From dbb301f91fc855dccf9bc42fbc4281d89365906d Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Wed, 18 Aug 2021 16:45:41 +0200 Subject: [PATCH 114/129] virtio_blk: add error handling support for add_disk() We never checked for errors on add_disk() as this function returned void. Now that this is fixed, use the shiny new error handling. Signed-off-by: Luis Chamberlain Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210818144542.19305-11-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/virtio_blk.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 767b4f72a54d..63dc121a4c43 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -875,9 +875,14 @@ static int virtblk_probe(struct virtio_device *vdev) virtblk_update_capacity(vblk, false); virtio_device_ready(vdev); - device_add_disk(&vdev->dev, vblk->disk, virtblk_attr_groups); + err = device_add_disk(&vdev->dev, vblk->disk, virtblk_attr_groups); + if (err) + goto out_cleanup_disk; + return 0; +out_cleanup_disk: + blk_cleanup_disk(vblk->disk); out_free_tags: blk_mq_free_tag_set(&vblk->tag_set); out_free_vq: From 10e7123d5551dec0025f70e61604ab57483a6ed2 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Wed, 18 Aug 2021 16:45:42 +0200 Subject: [PATCH 115/129] null_blk: add error handling support for add_disk() We never checked for errors on add_disk() as this function returned void. Now that this is fixed, use the shiny new error handling. The actual cleanup in case of error is already handled by the caller of null_gendisk_register(). Signed-off-by: Luis Chamberlain Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210818144542.19305-12-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index f128242d1170..187d779c8ca0 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1717,8 +1717,7 @@ static int null_gendisk_register(struct nullb *nullb) return ret; } - add_disk(disk); - return 0; + return add_disk(disk); } static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set) From 539711d7d6fe382a73254cc966602e63242a6fb3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Aug 2021 09:52:15 +0200 Subject: [PATCH 116/129] block: remove a pointless call to MINOR() in device_add_disk blk_alloc_ext_minor already returns just a minor number, so no need to mask the high bits. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210824075216.1179406-2-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/genhd.c b/block/genhd.c index a925f773145f..f13b7eb0238b 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -457,7 +457,7 @@ int device_add_disk(struct device *parent, struct gendisk *disk, if (ret < 0) return ret; disk->major = BLOCK_EXT_MAJOR; - disk->first_minor = MINOR(ret); + disk->first_minor = ret; disk->flags |= GENHD_FL_EXT_DEVT; } From c4b2b7d150d2b155b317b3e2f66492c6befab2b5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Aug 2021 09:52:16 +0200 Subject: [PATCH 117/129] block: remove CONFIG_DEBUG_BLOCK_EXT_DEVT This might have been a neat debug aid when the extended dev_t was added, but that time is long gone. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210824075216.1179406-3-hch@lst.de Signed-off-by: Jens Axboe --- arch/riscv/configs/defconfig | 1 - arch/riscv/configs/rv32_defconfig | 1 - block/genhd.c | 43 +++---------------------------- init/do_mounts.c | 4 --- lib/Kconfig.debug | 27 ------------------- 5 files changed, 4 insertions(+), 72 deletions(-) diff --git a/arch/riscv/configs/defconfig b/arch/riscv/configs/defconfig index 1f2be234b11c..bc68231a8fb7 100644 --- a/arch/riscv/configs/defconfig +++ b/arch/riscv/configs/defconfig @@ -132,7 +132,6 @@ CONFIG_DEBUG_PLIST=y CONFIG_DEBUG_SG=y # CONFIG_RCU_TRACE is not set CONFIG_RCU_EQS_DEBUG=y -CONFIG_DEBUG_BLOCK_EXT_DEVT=y # CONFIG_FTRACE is not set # CONFIG_RUNTIME_TESTING_MENU is not set CONFIG_MEMTEST=y diff --git a/arch/riscv/configs/rv32_defconfig b/arch/riscv/configs/rv32_defconfig index 8dd02b842fef..434ef5b64599 100644 --- a/arch/riscv/configs/rv32_defconfig +++ b/arch/riscv/configs/rv32_defconfig @@ -127,7 +127,6 @@ CONFIG_DEBUG_PLIST=y CONFIG_DEBUG_SG=y # CONFIG_RCU_TRACE is not set CONFIG_RCU_EQS_DEBUG=y -CONFIG_DEBUG_BLOCK_EXT_DEVT=y # CONFIG_FTRACE is not set # CONFIG_RUNTIME_TESTING_MENU is not set CONFIG_MEMTEST=y diff --git a/block/genhd.c b/block/genhd.c index f13b7eb0238b..6a5b65c86c4b 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -313,54 +313,19 @@ void unregister_blkdev(unsigned int major, const char *name) EXPORT_SYMBOL(unregister_blkdev); -/** - * blk_mangle_minor - scatter minor numbers apart - * @minor: minor number to mangle - * - * Scatter consecutively allocated @minor number apart if MANGLE_DEVT - * is enabled. Mangling twice gives the original value. - * - * RETURNS: - * Mangled value. - * - * CONTEXT: - * Don't care. - */ -static int blk_mangle_minor(int minor) -{ -#ifdef CONFIG_DEBUG_BLOCK_EXT_DEVT - int i; - - for (i = 0; i < MINORBITS / 2; i++) { - int low = minor & (1 << i); - int high = minor & (1 << (MINORBITS - 1 - i)); - int distance = MINORBITS - 1 - 2 * i; - - minor ^= low | high; /* clear both bits */ - low <<= distance; /* swap the positions */ - high >>= distance; - minor |= low | high; /* and set */ - } -#endif - return minor; -} - int blk_alloc_ext_minor(void) { int idx; idx = ida_alloc_range(&ext_devt_ida, 0, NR_EXT_DEVT, GFP_KERNEL); - if (idx < 0) { - if (idx == -ENOSPC) - return -EBUSY; - return idx; - } - return blk_mangle_minor(idx); + if (idx == -ENOSPC) + return -EBUSY; + return idx; } void blk_free_ext_minor(unsigned int minor) { - ida_free(&ext_devt_ida, blk_mangle_minor(minor)); + ida_free(&ext_devt_ida, minor); } static char *bdevt_str(dev_t devt, char *buf) diff --git a/init/do_mounts.c b/init/do_mounts.c index 74aede860de7..b691d6891e51 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -432,10 +432,6 @@ retry: printk("Please append a correct \"root=\" boot option; here are the available partitions:\n"); printk_all_partitions(); -#ifdef CONFIG_DEBUG_BLOCK_EXT_DEVT - printk("DEBUG_BLOCK_EXT_DEVT is enabled, you need to specify " - "explicit textual name for \"root=\" boot option.\n"); -#endif panic("VFS: Unable to mount root fs on %s", b); } if (!(flags & SB_RDONLY)) { diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 5ddd575159fb..22a4aa51bd58 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1679,33 +1679,6 @@ config DEBUG_WQ_FORCE_RR_CPU feature by default. When enabled, memory and cache locality will be impacted. -config DEBUG_BLOCK_EXT_DEVT - bool "Force extended block device numbers and spread them" - depends on DEBUG_KERNEL - depends on BLOCK - default n - help - BIG FAT WARNING: ENABLING THIS OPTION MIGHT BREAK BOOTING ON - SOME DISTRIBUTIONS. DO NOT ENABLE THIS UNLESS YOU KNOW WHAT - YOU ARE DOING. Distros, please enable this and fix whatever - is broken. - - Conventionally, block device numbers are allocated from - predetermined contiguous area. However, extended block area - may introduce non-contiguous block device numbers. This - option forces most block device numbers to be allocated from - the extended space and spreads them to discover kernel or - userland code paths which assume predetermined contiguous - device number allocation. - - Note that turning on this debug option shuffles all the - device numbers for all IDE and SCSI devices including libata - ones, so root partition specified using device number - directly (via rdev or root=MAJ:MIN) won't work anymore. - Textual device names (root=/dev/sdXn) will continue to work. - - Say N if you are unsure. - config CPU_HOTPLUG_STATE_CONTROL bool "Enable CPU hotplug state control" depends on DEBUG_KERNEL From d9cf3bd531844ffbfe94b16e417037a16efc988d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 19 Jul 2021 11:53:00 +0100 Subject: [PATCH 118/129] bio: fix page leak bio_add_hw_page failure __bio_iov_append_get_pages() doesn't put not appended pages on bio_add_hw_page() failure, so potentially leaking them, fix it. Also, do the same for __bio_iov_iter_get_pages(), even though it looks like it can't be triggered by userspace in this case. Fixes: 0512a75b98f8 ("block: Introduce REQ_OP_ZONE_APPEND") Cc: stable@vger.kernel.org # 5.8+ Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1edfa6a2ffd66d55e6345a477df5387d2c1415d0.1626653825.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- block/bio.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/block/bio.c b/block/bio.c index 0c89fa2f7a85..265bff6b549a 100644 --- a/block/bio.c +++ b/block/bio.c @@ -974,6 +974,14 @@ static int bio_iov_bvec_set_append(struct bio *bio, struct iov_iter *iter) return 0; } +static void bio_put_pages(struct page **pages, size_t size, size_t off) +{ + size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE); + + for (i = 0; i < nr; i++) + put_page(pages[i]); +} + #define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *)) /** @@ -1018,8 +1026,10 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) if (same_page) put_page(page); } else { - if (WARN_ON_ONCE(bio_full(bio, len))) - return -EINVAL; + if (WARN_ON_ONCE(bio_full(bio, len))) { + bio_put_pages(pages + i, left, offset); + return -EINVAL; + } __bio_add_page(bio, page, len, offset); } offset = 0; @@ -1064,6 +1074,7 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) len = min_t(size_t, PAGE_SIZE - offset, left); if (bio_add_hw_page(q, bio, page, len, offset, max_append_sectors, &same_page) != len) { + bio_put_pages(pages + i, left, offset); ret = -EINVAL; break; } From 0bdfbca8a623e262e0f343b143151000a300cbaf Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Fri, 20 Aug 2021 03:45:33 +0300 Subject: [PATCH 119/129] block: Add alternative_gpt_sector() operation Add alternative_gpt_sector() block device operation which specifies alternative location of a GPT entry. This allows us to support Android devices that have GPT entry at a non-standard location and can't be repartitioned easily. Reviewed-by: Christoph Hellwig Suggested-by: Christoph Hellwig Signed-off-by: Dmitry Osipenko Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20210820004536.15791-2-digetx@gmail.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 22b5b8502d2a..c9cb12483e12 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1872,6 +1872,13 @@ struct block_device_operations { char *(*devnode)(struct gendisk *disk, umode_t *mode); struct module *owner; const struct pr_ops *pr_ops; + + /* + * Special callback for probing GPT entry at a given sector. + * Needed by Android devices, used by GPT scanner and MMC blk + * driver. + */ + int (*alternative_gpt_sector)(struct gendisk *disk, sector_t *sector); }; #ifdef CONFIG_COMPAT From 466d9c4904deb25e2e8dcd29d3a998f4e3fa7c17 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Fri, 20 Aug 2021 03:45:34 +0300 Subject: [PATCH 120/129] partitions/efi: Support non-standard GPT location Support looking up GPT at a non-standard location specified by a block device driver. Acked-by: Davidlohr Bueso Reviewed-by: Christoph Hellwig Signed-off-by: Dmitry Osipenko Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20210820004536.15791-3-digetx@gmail.com Signed-off-by: Jens Axboe --- block/partitions/efi.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/block/partitions/efi.c b/block/partitions/efi.c index aaa3dc487cb5..7ca5c4c374d4 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -585,6 +585,8 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, gpt_header *pgpt = NULL, *agpt = NULL; gpt_entry *pptes = NULL, *aptes = NULL; legacy_mbr *legacymbr; + struct gendisk *disk = state->disk; + const struct block_device_operations *fops = disk->fops; sector_t total_sectors = get_capacity(state->disk); u64 lastlba; @@ -619,6 +621,16 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, if (!good_agpt && force_gpt) good_agpt = is_gpt_valid(state, lastlba, &agpt, &aptes); + if (!good_agpt && force_gpt && fops->alternative_gpt_sector) { + sector_t agpt_sector; + int err; + + err = fops->alternative_gpt_sector(disk, &agpt_sector); + if (!err) + good_agpt = is_gpt_valid(state, agpt_sector, + &agpt, &aptes); + } + /* The obviously unsuccessful case */ if (!good_pgpt && !good_agpt) goto fail; From dc913385dd74e625271482c30aefedd1e5af7b8c Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Fri, 20 Aug 2021 03:45:35 +0300 Subject: [PATCH 121/129] mmc: block: Support alternative_gpt_sector() operation Support generic alternative_gpt_sector() block device operation. It calculates location of GPT entry for eMMC of NVIDIA Tegra Android devices. Add new MMC_CAP2_ALT_GPT_TEGRA flag that enables scanning of alternative GPT sector and add raw_boot_mult field to mmc_ext_csd which allows to get size of the boot partitions that is needed for the calculation. Signed-off-by: Dmitry Osipenko Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20210820004536.15791-4-digetx@gmail.com Signed-off-by: Jens Axboe --- drivers/mmc/core/block.c | 21 +++++++++++++++++++++ drivers/mmc/core/core.c | 35 +++++++++++++++++++++++++++++++++++ drivers/mmc/core/core.h | 2 ++ drivers/mmc/core/mmc.c | 2 ++ include/linux/mmc/card.h | 1 + include/linux/mmc/host.h | 1 + 6 files changed, 62 insertions(+) diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 4c11f171e56d..6a15fdf6e5f2 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -831,6 +831,26 @@ static int mmc_blk_compat_ioctl(struct block_device *bdev, fmode_t mode, } #endif +static int mmc_blk_alternative_gpt_sector(struct gendisk *disk, + sector_t *sector) +{ + struct mmc_blk_data *md; + int ret; + + md = mmc_blk_get(disk); + if (!md) + return -EINVAL; + + if (md->queue.card) + ret = mmc_card_alternative_gpt_sector(md->queue.card, sector); + else + ret = -ENODEV; + + mmc_blk_put(md); + + return ret; +} + static const struct block_device_operations mmc_bdops = { .open = mmc_blk_open, .release = mmc_blk_release, @@ -840,6 +860,7 @@ static const struct block_device_operations mmc_bdops = { #ifdef CONFIG_COMPAT .compat_ioctl = mmc_blk_compat_ioctl, #endif + .alternative_gpt_sector = mmc_blk_alternative_gpt_sector, }; static int mmc_blk_part_switch_pre(struct mmc_card *card, diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index 95fedcf56e4a..605f5e8648c1 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -2149,6 +2149,41 @@ int mmc_detect_card_removed(struct mmc_host *host) } EXPORT_SYMBOL(mmc_detect_card_removed); +int mmc_card_alternative_gpt_sector(struct mmc_card *card, sector_t *gpt_sector) +{ + unsigned int boot_sectors_num; + + if ((!(card->host->caps2 & MMC_CAP2_ALT_GPT_TEGRA))) + return -EOPNOTSUPP; + + /* filter out unrelated cards */ + if (card->ext_csd.rev < 3 || + !mmc_card_mmc(card) || + !mmc_card_is_blockaddr(card) || + mmc_card_is_removable(card->host)) + return -ENOENT; + + /* + * eMMC storage has two special boot partitions in addition to the + * main one. NVIDIA's bootloader linearizes eMMC boot0->boot1->main + * accesses, this means that the partition table addresses are shifted + * by the size of boot partitions. In accordance with the eMMC + * specification, the boot partition size is calculated as follows: + * + * boot partition size = 128K byte x BOOT_SIZE_MULT + * + * Calculate number of sectors occupied by the both boot partitions. + */ + boot_sectors_num = card->ext_csd.raw_boot_mult * SZ_128K / + SZ_512 * MMC_NUM_BOOT_PARTITION; + + /* Defined by NVIDIA and used by Android devices. */ + *gpt_sector = card->ext_csd.sectors - boot_sectors_num - 1; + + return 0; +} +EXPORT_SYMBOL(mmc_card_alternative_gpt_sector); + void mmc_rescan(struct work_struct *work) { struct mmc_host *host = diff --git a/drivers/mmc/core/core.h b/drivers/mmc/core/core.h index 0c4de2030b3f..7931a4f0137d 100644 --- a/drivers/mmc/core/core.h +++ b/drivers/mmc/core/core.h @@ -119,6 +119,8 @@ void mmc_release_host(struct mmc_host *host); void mmc_get_card(struct mmc_card *card, struct mmc_ctx *ctx); void mmc_put_card(struct mmc_card *card, struct mmc_ctx *ctx); +int mmc_card_alternative_gpt_sector(struct mmc_card *card, sector_t *sector); + /** * mmc_claim_host - exclusively claim a host * @host: mmc host to claim diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index 838726b68ff3..29e58ffae379 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -418,6 +418,8 @@ static int mmc_decode_ext_csd(struct mmc_card *card, u8 *ext_csd) ext_csd[EXT_CSD_ERASE_TIMEOUT_MULT]; card->ext_csd.raw_hc_erase_grp_size = ext_csd[EXT_CSD_HC_ERASE_GRP_SIZE]; + card->ext_csd.raw_boot_mult = + ext_csd[EXT_CSD_BOOT_MULT]; if (card->ext_csd.rev >= 3) { u8 sa_shift = ext_csd[EXT_CSD_S_A_TIMEOUT]; card->ext_csd.part_config = ext_csd[EXT_CSD_PART_CONFIG]; diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index 74e6c0624d27..37f975875102 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -109,6 +109,7 @@ struct mmc_ext_csd { u8 raw_hc_erase_gap_size; /* 221 */ u8 raw_erase_timeout_mult; /* 223 */ u8 raw_hc_erase_grp_size; /* 224 */ + u8 raw_boot_mult; /* 226 */ u8 raw_sec_trim_mult; /* 229 */ u8 raw_sec_erase_mult; /* 230 */ u8 raw_sec_feature_support;/* 231 */ diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 0abd47e9ef9b..78dadf86b38f 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -398,6 +398,7 @@ struct mmc_host { #else #define MMC_CAP2_CRYPTO 0 #endif +#define MMC_CAP2_ALT_GPT_TEGRA (1 << 28) /* Host with eMMC that has GPT entry at a non-standard location */ int fixed_drv_type; /* fixed driver type for non-removable media */ From 1743fa54c9e8247000e060fcdab406ab3a808223 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Fri, 20 Aug 2021 03:45:36 +0300 Subject: [PATCH 122/129] mmc: sdhci-tegra: Enable MMC_CAP2_ALT_GPT_TEGRA Tegra20/30/114/124 Android devices place GPT at a non-standard location. Enable GPT entry scanning at that location. Signed-off-by: Dmitry Osipenko Acked-by: Thierry Reding Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20210820004536.15791-5-digetx@gmail.com Signed-off-by: Jens Axboe --- drivers/mmc/host/sdhci-tegra.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/mmc/host/sdhci-tegra.c b/drivers/mmc/host/sdhci-tegra.c index 387ce9cdbd7c..a5001875876b 100644 --- a/drivers/mmc/host/sdhci-tegra.c +++ b/drivers/mmc/host/sdhci-tegra.c @@ -116,6 +116,8 @@ */ #define NVQUIRK_HAS_TMCLK BIT(10) +#define NVQUIRK_HAS_ANDROID_GPT_SECTOR BIT(11) + /* SDMMC CQE Base Address for Tegra Host Ver 4.1 and Higher */ #define SDHCI_TEGRA_CQE_BASE_ADDR 0xF000 @@ -1361,6 +1363,7 @@ static const struct sdhci_tegra_soc_data soc_data_tegra20 = { .pdata = &sdhci_tegra20_pdata, .dma_mask = DMA_BIT_MASK(32), .nvquirks = NVQUIRK_FORCE_SDHCI_SPEC_200 | + NVQUIRK_HAS_ANDROID_GPT_SECTOR | NVQUIRK_ENABLE_BLOCK_GAP_DET, }; @@ -1390,6 +1393,7 @@ static const struct sdhci_tegra_soc_data soc_data_tegra30 = { .nvquirks = NVQUIRK_ENABLE_SDHCI_SPEC_300 | NVQUIRK_ENABLE_SDR50 | NVQUIRK_ENABLE_SDR104 | + NVQUIRK_HAS_ANDROID_GPT_SECTOR | NVQUIRK_HAS_PADCALIB, }; @@ -1422,6 +1426,7 @@ static const struct sdhci_pltfm_data sdhci_tegra114_pdata = { static const struct sdhci_tegra_soc_data soc_data_tegra114 = { .pdata = &sdhci_tegra114_pdata, .dma_mask = DMA_BIT_MASK(32), + .nvquirks = NVQUIRK_HAS_ANDROID_GPT_SECTOR, }; static const struct sdhci_pltfm_data sdhci_tegra124_pdata = { @@ -1438,6 +1443,7 @@ static const struct sdhci_pltfm_data sdhci_tegra124_pdata = { static const struct sdhci_tegra_soc_data soc_data_tegra124 = { .pdata = &sdhci_tegra124_pdata, .dma_mask = DMA_BIT_MASK(34), + .nvquirks = NVQUIRK_HAS_ANDROID_GPT_SECTOR, }; static const struct sdhci_ops tegra210_sdhci_ops = { @@ -1616,6 +1622,9 @@ static int sdhci_tegra_probe(struct platform_device *pdev) tegra_host->pad_control_available = false; tegra_host->soc_data = soc_data; + if (soc_data->nvquirks & NVQUIRK_HAS_ANDROID_GPT_SECTOR) + host->mmc->caps2 |= MMC_CAP2_ALT_GPT_TEGRA; + if (soc_data->nvquirks & NVQUIRK_NEEDS_PAD_CONTROL) { rc = tegra_sdhci_init_pinctrl_info(&pdev->dev, tegra_host); if (rc == 0) From 9f2869921f2a102e209297d4f742f34b46ed3d36 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Aug 2021 16:43:10 +0200 Subject: [PATCH 123/129] block: refine the disk_live check in del_gendisk hidden gendisks will never be marked live. Fixes: 40b3a52ffc5b ("block: add a sanity check for a live disk in del_gendisk") Reported-by: Bruno Goncalves Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210824144310.1487816-1-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/genhd.c b/block/genhd.c index 6a5b65c86c4b..567549a011d1 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -556,7 +556,7 @@ void del_gendisk(struct gendisk *disk) { might_sleep(); - if (WARN_ON_ONCE(!disk_live(disk))) + if (WARN_ON_ONCE(!disk_live(disk) && !(disk->flags & GENHD_FL_HIDDEN))) return; blk_integrity_del(disk); From 158ee7b65653d9f841823c249014c2d0dfdeeb8f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Aug 2021 17:18:23 +0200 Subject: [PATCH 124/129] block: mark blkdev_fsync static MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit blkdev_fsync is only used inside of block_dev.c since the removal of the raw drіver. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210824151823.1575100-1-hch@lst.de Signed-off-by: Jens Axboe --- fs/block_dev.c | 4 ++-- include/linux/fs.h | 4 ---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index d3a8062302a0..1f21ac984253 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -687,7 +687,8 @@ static loff_t block_llseek(struct file *file, loff_t offset, int whence) return retval; } -int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) +static int blkdev_fsync(struct file *filp, loff_t start, loff_t end, + int datasync) { struct inode *bd_inode = bdev_file_inode(filp); struct block_device *bdev = I_BDEV(bd_inode); @@ -708,7 +709,6 @@ int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) return error; } -EXPORT_SYMBOL(blkdev_fsync); /** * bdev_read_page() - Start reading a page from a block device diff --git a/include/linux/fs.h b/include/linux/fs.h index 640574294216..9220cdf648b0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3246,10 +3246,6 @@ ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb, ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb, struct iov_iter *iter); -/* fs/block_dev.c */ -extern int blkdev_fsync(struct file *filp, loff_t start, loff_t end, - int datasync); - /* fs/splice.c */ extern ssize_t generic_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); From ead3b768bb51259e3a5f2287ff5fc9041eb6f450 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Wed, 11 Aug 2021 11:05:18 +0000 Subject: [PATCH 125/129] blk-zoned: allow zone management send operations without CAP_SYS_ADMIN Zone management send operations (BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE) should be allowed under the same permissions as write(). (write() does not require CAP_SYS_ADMIN). Additionally, other ioctls like BLKSECDISCARD and BLKZEROOUT only check if the fd was successfully opened with FMODE_WRITE. (They do not require CAP_SYS_ADMIN). Currently, zone management send operations require both CAP_SYS_ADMIN and that the fd was successfully opened with FMODE_WRITE. Remove the CAP_SYS_ADMIN requirement, so that zone management send operations match the access control requirement of write(), BLKSECDISCARD and BLKZEROOUT. Fixes: 3ed05a987e0f ("blk-zoned: implement ioctls") Signed-off-by: Niklas Cassel Reviewed-by: Damien Le Moal Reviewed-by: Aravind Ramesh Reviewed-by: Adam Manzanares Reviewed-by: Himanshu Madhani Reviewed-by: Johannes Thumshirn Cc: stable@vger.kernel.org # v4.10+ Link: https://lore.kernel.org/r/20210811110505.29649-2-Niklas.Cassel@wdc.com Signed-off-by: Jens Axboe --- block/blk-zoned.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 86fce751bb17..8a60dbeb44be 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -421,9 +421,6 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode, if (!blk_queue_is_zoned(q)) return -ENOTTY; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - if (!(mode & FMODE_WRITE)) return -EBADF; From 4d643b66089591b4769bcdb6fd1bfeff2fe301b8 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Wed, 11 Aug 2021 11:05:19 +0000 Subject: [PATCH 126/129] blk-zoned: allow BLKREPORTZONE without CAP_SYS_ADMIN A user space process should not need the CAP_SYS_ADMIN capability set in order to perform a BLKREPORTZONE ioctl. Getting the zone report is required in order to get the write pointer. Neither read() nor write() requires CAP_SYS_ADMIN, so it is reasonable that a user space process that can read/write from/to the device, also can get the write pointer. (Since e.g. writes have to be at the write pointer.) Fixes: 3ed05a987e0f ("blk-zoned: implement ioctls") Signed-off-by: Niklas Cassel Reviewed-by: Damien Le Moal Reviewed-by: Aravind Ramesh Reviewed-by: Adam Manzanares Reviewed-by: Himanshu Madhani Reviewed-by: Johannes Thumshirn Cc: stable@vger.kernel.org # v4.10+ Link: https://lore.kernel.org/r/20210811110505.29649-3-Niklas.Cassel@wdc.com Signed-off-by: Jens Axboe --- block/blk-zoned.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 8a60dbeb44be..1d0c76c18fc5 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -360,9 +360,6 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, if (!blk_queue_is_zoned(q)) return -ENOTTY; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report))) return -EFAULT; From cc40b7225151f611ef837f6403cfaeadc7af214a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 24 Aug 2021 22:59:18 -0700 Subject: [PATCH 127/129] blk-crypto: fix check for too-large dun_bytes dun_bytes needs to be less than or equal to the IV size of the encryption mode, not just less than or equal to BLK_CRYPTO_MAX_IV_SIZE. Currently this doesn't matter since blk_crypto_init_key() is never actually passed invalid values, but we might as well fix this. Fixes: a892c8d52c02 ("block: Inline encryption support for blk-mq") Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20210825055918.51975-1-ebiggers@kernel.org Signed-off-by: Jens Axboe --- block/blk-crypto.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-crypto.c b/block/blk-crypto.c index c5bdaafffa29..103c2e2d50d6 100644 --- a/block/blk-crypto.c +++ b/block/blk-crypto.c @@ -332,7 +332,7 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, if (mode->keysize == 0) return -EINVAL; - if (dun_bytes == 0 || dun_bytes > BLK_CRYPTO_MAX_IV_SIZE) + if (dun_bytes == 0 || dun_bytes > mode->ivsize) return -EINVAL; if (!is_power_of_2(data_unit_size)) From 1e294970fc00f45c1f17fb442c26a7e3fc9789b1 Mon Sep 17 00:00:00 2001 From: Shaokun Zhang Date: Wed, 25 Aug 2021 14:19:51 +0800 Subject: [PATCH 128/129] block, bfq: cleanup the repeated declaration Function 'bfq_entity_to_bfqq' is declared twice, so remove the repeated declaration and blank line. Cc: Paolo Valente Cc: Jens Axboe Signed-off-by: Shaokun Zhang Link: https://lore.kernel.org/r/1629872391-46399-1-git-send-email-zhangshaokun@hisilicon.com Signed-off-by: Jens Axboe --- block/bfq-iosched.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 385e28a843d1..a73488eec8a4 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -955,8 +955,6 @@ struct bfq_group { }; #endif -struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); - /* --------------- main algorithm interface ----------------- */ #define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ From 1d1cf156dc176e30eeaced5cf1450d582d387b81 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Aug 2021 09:54:38 +0200 Subject: [PATCH 129/129] sg: pass the device name to blk_trace_setup Fix a regression that passed a NULL device name to blk_trace_setup accidentally. Fixes: aebbb5831fbd ("sg: do not allocate a gendisk") Reported-by: syzbot+f74aa89114a236643919@syzkaller.appspotmail.com Signed-off-by: Christoph Hellwig Reviewed-by: Max Gurtovoy Link: https://lore.kernel.org/r/20210825075438.1883687-1-hch@lst.de Signed-off-by: Jens Axboe --- drivers/scsi/sg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 477267add49d..d5889b4f0fd4 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1118,7 +1118,7 @@ sg_ioctl_common(struct file *filp, Sg_device *sdp, Sg_fd *sfp, return put_user(max_sectors_bytes(sdp->device->request_queue), ip); case BLKTRACESETUP: - return blk_trace_setup(sdp->device->request_queue, NULL, + return blk_trace_setup(sdp->device->request_queue, sdp->name, MKDEV(SCSI_GENERIC_MAJOR, sdp->index), NULL, p); case BLKTRACESTART: