OpenCloudOS-Kernel/block/disk-events.c

494 lines
13 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0
/*
* Disk events - monitor disk events like media change and eject request.
*/
#include <linux/export.h>
#include <linux/moduleparam.h>
#include <linux/blkdev.h>
#include "blk.h"
struct disk_events {
struct list_head node; /* all disk_event's */
struct gendisk *disk; /* the associated disk */
spinlock_t lock;
struct mutex block_mutex; /* protects blocking */
int block; /* event blocking depth */
unsigned int pending; /* events already sent out */
unsigned int clearing; /* events being cleared */
long poll_msecs; /* interval, -1 for default */
struct delayed_work dwork;
};
static const char *disk_events_strs[] = {
[ilog2(DISK_EVENT_MEDIA_CHANGE)] = "media_change",
[ilog2(DISK_EVENT_EJECT_REQUEST)] = "eject_request",
};
static char *disk_uevents[] = {
[ilog2(DISK_EVENT_MEDIA_CHANGE)] = "DISK_MEDIA_CHANGE=1",
[ilog2(DISK_EVENT_EJECT_REQUEST)] = "DISK_EJECT_REQUEST=1",
};
/* list of all disk_events */
static DEFINE_MUTEX(disk_events_mutex);
static LIST_HEAD(disk_events);
/* disable in-kernel polling by default */
static unsigned long disk_events_dfl_poll_msecs;
static unsigned long disk_events_poll_jiffies(struct gendisk *disk)
{
struct disk_events *ev = disk->ev;
long intv_msecs = 0;
/*
* If device-specific poll interval is set, always use it. If
* the default is being used, poll if the POLL flag is set.
*/
if (ev->poll_msecs >= 0)
intv_msecs = ev->poll_msecs;
else if (disk->event_flags & DISK_EVENT_FLAG_POLL)
intv_msecs = disk_events_dfl_poll_msecs;
return msecs_to_jiffies(intv_msecs);
}
/**
* disk_block_events - block and flush disk event checking
* @disk: disk to block events for
*
* On return from this function, it is guaranteed that event checking
* isn't in progress and won't happen until unblocked by
* disk_unblock_events(). Events blocking is counted and the actual
* unblocking happens after the matching number of unblocks are done.
*
* Note that this intentionally does not block event checking from
* disk_clear_events().
*
* CONTEXT:
* Might sleep.
*/
void disk_block_events(struct gendisk *disk)
{
struct disk_events *ev = disk->ev;
unsigned long flags;
bool cancel;
if (!ev)
return;
/*
* Outer mutex ensures that the first blocker completes canceling
* the event work before further blockers are allowed to finish.
*/
mutex_lock(&ev->block_mutex);
spin_lock_irqsave(&ev->lock, flags);
cancel = !ev->block++;
spin_unlock_irqrestore(&ev->lock, flags);
if (cancel)
cancel_delayed_work_sync(&disk->ev->dwork);
mutex_unlock(&ev->block_mutex);
}
static void __disk_unblock_events(struct gendisk *disk, bool check_now)
{
struct disk_events *ev = disk->ev;
unsigned long intv;
unsigned long flags;
spin_lock_irqsave(&ev->lock, flags);
if (WARN_ON_ONCE(ev->block <= 0))
goto out_unlock;
if (--ev->block)
goto out_unlock;
intv = disk_events_poll_jiffies(disk);
if (check_now)
queue_delayed_work(system_freezable_power_efficient_wq,
&ev->dwork, 0);
else if (intv)
queue_delayed_work(system_freezable_power_efficient_wq,
&ev->dwork, intv);
out_unlock:
spin_unlock_irqrestore(&ev->lock, flags);
}
/**
* disk_unblock_events - unblock disk event checking
* @disk: disk to unblock events for
*
* Undo disk_block_events(). When the block count reaches zero, it
* starts events polling if configured.
*
* CONTEXT:
* Don't care. Safe to call from irq context.
*/
void disk_unblock_events(struct gendisk *disk)
{
if (disk->ev)
__disk_unblock_events(disk, false);
}
/**
* disk_flush_events - schedule immediate event checking and flushing
* @disk: disk to check and flush events for
* @mask: events to flush
*
* Schedule immediate event checking on @disk if not blocked. Events in
* @mask are scheduled to be cleared from the driver. Note that this
* doesn't clear the events from @disk->ev.
*
* CONTEXT:
* If @mask is non-zero must be called with disk->open_mutex held.
*/
void disk_flush_events(struct gendisk *disk, unsigned int mask)
{
struct disk_events *ev = disk->ev;
if (!ev)
return;
spin_lock_irq(&ev->lock);
ev->clearing |= mask;
if (!ev->block)
mod_delayed_work(system_freezable_power_efficient_wq,
&ev->dwork, 0);
spin_unlock_irq(&ev->lock);
}
/*
* Tell userland about new events. Only the events listed in @disk->events are
* reported, and only if DISK_EVENT_FLAG_UEVENT is set. Otherwise, events are
* processed internally but never get reported to userland.
*/
static void disk_event_uevent(struct gendisk *disk, unsigned int events)
{
char *envp[ARRAY_SIZE(disk_uevents) + 1] = { };
int nr_events = 0, i;
for (i = 0; i < ARRAY_SIZE(disk_uevents); i++)
if (events & disk->events & (1 << i))
envp[nr_events++] = disk_uevents[i];
if (nr_events)
kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp);
}
static void disk_check_events(struct disk_events *ev,
unsigned int *clearing_ptr)
{
struct gendisk *disk = ev->disk;
unsigned int clearing = *clearing_ptr;
unsigned int events;
unsigned long intv;
/* check events */
events = disk->fops->check_events(disk, clearing);
/* accumulate pending events and schedule next poll if necessary */
spin_lock_irq(&ev->lock);
events &= ~ev->pending;
ev->pending |= events;
*clearing_ptr &= ~clearing;
intv = disk_events_poll_jiffies(disk);
if (!ev->block && intv)
queue_delayed_work(system_freezable_power_efficient_wq,
&ev->dwork, intv);
spin_unlock_irq(&ev->lock);
block: add disk sequence number Associating uevents with block devices in userspace is difficult and racy: the uevent netlink socket is lossy, and on slow and overloaded systems has a very high latency. Block devices do not have exclusive owners in userspace, any process can set one up (e.g. loop devices). Moreover, device names can be reused (e.g. loop0 can be reused again and again). A userspace process setting up a block device and watching for its events cannot thus reliably tell whether an event relates to the device it just set up or another earlier instance with the same name. Being able to set a UUID on a loop device would solve the race conditions. But it does not allow to derive orderings from uevents: if you see a uevent with a UUID that does not match the device you are waiting for, you cannot tell whether it's because the right uevent has not arrived yet, or it was already sent and you missed it. So you cannot tell whether you should wait for it or not. Associating a unique, monotonically increasing sequential number to the lifetime of each block device, which can be retrieved with an ioctl immediately upon setting it up, allows to solve the race conditions with uevents, and also allows userspace processes to know whether they should wait for the uevent they need or if it was dropped and thus they should move on. Additionally, increment the disk sequence number when the media change, i.e. on DISK_EVENT_MEDIA_CHANGE event. Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Matteo Croce <mcroce@microsoft.com> Tested-by: Luca Boccassi <bluca@debian.org> Link: https://lore.kernel.org/r/20210712230530.29323-2-mcroce@linux.microsoft.com Signed-off-by: Jens Axboe <axboe@kernel.dk>
2021-07-13 07:05:25 +08:00
if (events & DISK_EVENT_MEDIA_CHANGE)
inc_diskseq(disk);
if (disk->event_flags & DISK_EVENT_FLAG_UEVENT)
disk_event_uevent(disk, events);
}
/**
* disk_clear_events - synchronously check, clear and return pending events
* @disk: disk to fetch and clear events from
* @mask: mask of events to be fetched and cleared
*
* Disk events are synchronously checked and pending events in @mask
* are cleared and returned. This ignores the block count.
*
* CONTEXT:
* Might sleep.
*/
static unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask)
{
struct disk_events *ev = disk->ev;
unsigned int pending;
unsigned int clearing = mask;
if (!ev)
return 0;
disk_block_events(disk);
/*
* store the union of mask and ev->clearing on the stack so that the
* race with disk_flush_events does not cause ambiguity (ev->clearing
* can still be modified even if events are blocked).
*/
spin_lock_irq(&ev->lock);
clearing |= ev->clearing;
ev->clearing = 0;
spin_unlock_irq(&ev->lock);
disk_check_events(ev, &clearing);
/*
* if ev->clearing is not 0, the disk_flush_events got called in the
* middle of this function, so we want to run the workfn without delay.
*/
__disk_unblock_events(disk, ev->clearing ? true : false);
/* then, fetch and clear pending events */
spin_lock_irq(&ev->lock);
pending = ev->pending & mask;
ev->pending &= ~mask;
spin_unlock_irq(&ev->lock);
WARN_ON_ONCE(clearing & mask);
return pending;
}
/**
* disk_check_media_change - check if a removable media has been changed
* @disk: gendisk to check
*
* Check whether a removable media has been changed, and attempt to free all
* dentries and inodes and invalidates all block device page cache entries in
* that case.
*
* Returns %true if the media has changed, or %false if not.
*/
bool disk_check_media_change(struct gendisk *disk)
{
unsigned int events;
events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE |
DISK_EVENT_EJECT_REQUEST);
if (!(events & DISK_EVENT_MEDIA_CHANGE))
return false;
bdev_mark_dead(disk->part0, true);
set_bit(GD_NEED_PART_SCAN, &disk->state);
return true;
}
EXPORT_SYMBOL(disk_check_media_change);
/**
* disk_force_media_change - force a media change event
* @disk: the disk which will raise the event
*
* Should be called when the media changes for @disk. Generates a uevent
* and attempts to free all dentries and inodes and invalidates all block
* device page cache entries in that case.
*/
void disk_force_media_change(struct gendisk *disk)
{
disk_event_uevent(disk, DISK_EVENT_MEDIA_CHANGE);
block: increment diskseq on all media change events Currently, associating a loop device with a different file descriptor does not increment its diskseq. This allows the following race condition: 1. Program X opens a loop device 2. Program X gets the diskseq of the loop device. 3. Program X associates a file with the loop device. 4. Program X passes the loop device major, minor, and diskseq to something. 5. Program X exits. 6. Program Y detaches the file from the loop device. 7. Program Y attaches a different file to the loop device. 8. The opener finally gets around to opening the loop device and checks that the diskseq is what it expects it to be. Even though the diskseq is the expected value, the result is that the opener is accessing the wrong file. From discussions with Christoph Hellwig, it appears that disk_force_media_change() was supposed to call inc_diskseq(), but in fact it does not. Adding a Fixes: tag to indicate this. Christoph's Reported-by is because he stated that disk_force_media_change() calls inc_diskseq(), which is what led me to discover that it should but does not. Reported-by: Christoph Hellwig <hch@infradead.org> Signed-off-by: Demi Marie Obenour <demi@invisiblethingslab.com> Fixes: e6138dc12de9 ("block: add a helper to raise a media changed event") Cc: stable@vger.kernel.org # 5.15+ Reviewed-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20230607170837.1559-1-demi@invisiblethingslab.com Signed-off-by: Jens Axboe <axboe@kernel.dk>
2023-06-08 01:08:37 +08:00
inc_diskseq(disk);
bdev_mark_dead(disk->part0, true);
set_bit(GD_NEED_PART_SCAN, &disk->state);
}
EXPORT_SYMBOL_GPL(disk_force_media_change);
/*
* Separate this part out so that a different pointer for clearing_ptr can be
* passed in for disk_clear_events.
*/
static void disk_events_workfn(struct work_struct *work)
{
struct delayed_work *dwork = to_delayed_work(work);
struct disk_events *ev = container_of(dwork, struct disk_events, dwork);
disk_check_events(ev, &ev->clearing);
}
/*
* A disk events enabled device has the following sysfs nodes under
* its /sys/block/X/ directory.
*
* events : list of all supported events
* events_async : list of events which can be detected w/o polling
* (always empty, only for backwards compatibility)
* events_poll_msecs : polling interval, 0: disable, -1: system default
*/
static ssize_t __disk_events_show(unsigned int events, char *buf)
{
const char *delim = "";
ssize_t pos = 0;
int i;
for (i = 0; i < ARRAY_SIZE(disk_events_strs); i++)
if (events & (1 << i)) {
pos += sprintf(buf + pos, "%s%s",
delim, disk_events_strs[i]);
delim = " ";
}
if (pos)
pos += sprintf(buf + pos, "\n");
return pos;
}
static ssize_t disk_events_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct gendisk *disk = dev_to_disk(dev);
if (!(disk->event_flags & DISK_EVENT_FLAG_UEVENT))
return 0;
return __disk_events_show(disk->events, buf);
}
static ssize_t disk_events_async_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
return 0;
}
static ssize_t disk_events_poll_msecs_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
struct gendisk *disk = dev_to_disk(dev);
if (!disk->ev)
return sprintf(buf, "-1\n");
return sprintf(buf, "%ld\n", disk->ev->poll_msecs);
}
static ssize_t disk_events_poll_msecs_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
struct gendisk *disk = dev_to_disk(dev);
long intv;
if (!count || !sscanf(buf, "%ld", &intv))
return -EINVAL;
if (intv < 0 && intv != -1)
return -EINVAL;
if (!disk->ev)
return -ENODEV;
disk_block_events(disk);
disk->ev->poll_msecs = intv;
__disk_unblock_events(disk, true);
return count;
}
DEVICE_ATTR(events, 0444, disk_events_show, NULL);
DEVICE_ATTR(events_async, 0444, disk_events_async_show, NULL);
DEVICE_ATTR(events_poll_msecs, 0644, disk_events_poll_msecs_show,
disk_events_poll_msecs_store);
/*
* The default polling interval can be specified by the kernel
* parameter block.events_dfl_poll_msecs which defaults to 0
* (disable). This can also be modified runtime by writing to
* /sys/module/block/parameters/events_dfl_poll_msecs.
*/
static int disk_events_set_dfl_poll_msecs(const char *val,
const struct kernel_param *kp)
{
struct disk_events *ev;
int ret;
ret = param_set_ulong(val, kp);
if (ret < 0)
return ret;
mutex_lock(&disk_events_mutex);
list_for_each_entry(ev, &disk_events, node)
disk_flush_events(ev->disk, 0);
mutex_unlock(&disk_events_mutex);
return 0;
}
static const struct kernel_param_ops disk_events_dfl_poll_msecs_param_ops = {
.set = disk_events_set_dfl_poll_msecs,
.get = param_get_ulong,
};
#undef MODULE_PARAM_PREFIX
#define MODULE_PARAM_PREFIX "block."
module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops,
&disk_events_dfl_poll_msecs, 0644);
/*
* disk_{alloc|add|del|release}_events - initialize and destroy disk_events.
*/
int disk_alloc_events(struct gendisk *disk)
{
struct disk_events *ev;
if (!disk->fops->check_events || !disk->events)
return 0;
ev = kzalloc(sizeof(*ev), GFP_KERNEL);
if (!ev) {
pr_warn("%s: failed to initialize events\n", disk->disk_name);
return -ENOMEM;
}
INIT_LIST_HEAD(&ev->node);
ev->disk = disk;
spin_lock_init(&ev->lock);
mutex_init(&ev->block_mutex);
ev->block = 1;
ev->poll_msecs = -1;
INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn);
disk->ev = ev;
return 0;
}
void disk_add_events(struct gendisk *disk)
{
if (!disk->ev)
return;
mutex_lock(&disk_events_mutex);
list_add_tail(&disk->ev->node, &disk_events);
mutex_unlock(&disk_events_mutex);
/*
* Block count is initialized to 1 and the following initial
* unblock kicks it into action.
*/
__disk_unblock_events(disk, true);
}
void disk_del_events(struct gendisk *disk)
{
if (disk->ev) {
disk_block_events(disk);
mutex_lock(&disk_events_mutex);
list_del_init(&disk->ev->node);
mutex_unlock(&disk_events_mutex);
}
}
void disk_release_events(struct gendisk *disk)
{
/* the block count should be 1 from disk_del_events() */
WARN_ON_ONCE(disk->ev && disk->ev->block != 1);
kfree(disk->ev);
}