OpenCloudOS-Kernel/include/linux/genhd.h

771 lines
23 KiB
C
Raw Normal View History

#ifndef _LINUX_GENHD_H
#define _LINUX_GENHD_H
/*
* genhd.h Copyright (C) 1992 Drew Eckhardt
* Generic hard disk header file by
* Drew Eckhardt
*
* <drew@colorado.edu>
*/
#include <linux/types.h>
#include <linux/kdev_t.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/percpu-refcount.h>
[PATCH] BLOCK: Make it possible to disable the block layer [try #6] Make it possible to disable the block layer. Not all embedded devices require it, some can make do with just JFFS2, NFS, ramfs, etc - none of which require the block layer to be present. This patch does the following: (*) Introduces CONFIG_BLOCK to disable the block layer, buffering and blockdev support. (*) Adds dependencies on CONFIG_BLOCK to any configuration item that controls an item that uses the block layer. This includes: (*) Block I/O tracing. (*) Disk partition code. (*) All filesystems that are block based, eg: Ext3, ReiserFS, ISOFS. (*) The SCSI layer. As far as I can tell, even SCSI chardevs use the block layer to do scheduling. Some drivers that use SCSI facilities - such as USB storage - end up disabled indirectly from this. (*) Various block-based device drivers, such as IDE and the old CDROM drivers. (*) MTD blockdev handling and FTL. (*) JFFS - which uses set_bdev_super(), something it could avoid doing by taking a leaf out of JFFS2's book. (*) Makes most of the contents of linux/blkdev.h, linux/buffer_head.h and linux/elevator.h contingent on CONFIG_BLOCK being set. sector_div() is, however, still used in places, and so is still available. (*) Also made contingent are the contents of linux/mpage.h, linux/genhd.h and parts of linux/fs.h. (*) Makes a number of files in fs/ contingent on CONFIG_BLOCK. (*) Makes mm/bounce.c (bounce buffering) contingent on CONFIG_BLOCK. (*) set_page_dirty() doesn't call __set_page_dirty_buffers() if CONFIG_BLOCK is not enabled. (*) fs/no-block.c is created to hold out-of-line stubs and things that are required when CONFIG_BLOCK is not set: (*) Default blockdev file operations (to give error ENODEV on opening). (*) Makes some /proc changes: (*) /proc/devices does not list any blockdevs. (*) /proc/diskstats and /proc/partitions are contingent on CONFIG_BLOCK. (*) Makes some compat ioctl handling contingent on CONFIG_BLOCK. (*) If CONFIG_BLOCK is not defined, makes sys_quotactl() return -ENODEV if given command other than Q_SYNC or if a special device is specified. (*) In init/do_mounts.c, no reference is made to the blockdev routines if CONFIG_BLOCK is not defined. This does not prohibit NFS roots or JFFS2. (*) The bdflush, ioprio_set and ioprio_get syscalls can now be absent (return error ENOSYS by way of cond_syscall if so). (*) The seclvl_bd_claim() and seclvl_bd_release() security calls do nothing if CONFIG_BLOCK is not set, since they can't then happen. Signed-Off-By: David Howells <dhowells@redhat.com> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2006-10-01 02:45:40 +08:00
#ifdef CONFIG_BLOCK
#define dev_to_disk(device) container_of((device), struct gendisk, part0.__dev)
#define dev_to_part(device) container_of((device), struct hd_struct, __dev)
#define disk_to_dev(disk) (&(disk)->part0.__dev)
#define part_to_dev(part) (&((part)->__dev))
extern struct device_type part_type;
extern struct kobject *block_depr;
extern struct class block_class;
enum {
/* These three have identical behaviour; use the second one if DOS FDISK gets
confused about extended/logical partitions starting past cylinder 1023. */
DOS_EXTENDED_PARTITION = 5,
LINUX_EXTENDED_PARTITION = 0x85,
WIN98_EXTENDED_PARTITION = 0x0f,
SUN_WHOLE_DISK = DOS_EXTENDED_PARTITION,
LINUX_SWAP_PARTITION = 0x82,
LINUX_DATA_PARTITION = 0x83,
LINUX_LVM_PARTITION = 0x8e,
LINUX_RAID_PARTITION = 0xfd, /* autodetect RAID partition */
SOLARIS_X86_PARTITION = LINUX_SWAP_PARTITION,
NEW_SOLARIS_X86_PARTITION = 0xbf,
DM6_AUX1PARTITION = 0x51, /* no DDO: use xlated geom */
DM6_AUX3PARTITION = 0x53, /* no DDO: use xlated geom */
DM6_PARTITION = 0x54, /* has DDO: use xlated geom & offset */
EZD_PARTITION = 0x55, /* EZ-DRIVE */
FREEBSD_PARTITION = 0xa5, /* FreeBSD Partition ID */
OPENBSD_PARTITION = 0xa6, /* OpenBSD Partition ID */
NETBSD_PARTITION = 0xa9, /* NetBSD Partition ID */
BSDI_PARTITION = 0xb7, /* BSDI Partition ID */
MINIX_PARTITION = 0x81, /* Minix Partition ID */
UNIXWARE_PARTITION = 0x63, /* Same as GNU_HURD and SCO Unix */
};
#define DISK_MAX_PARTS 256
#define DISK_NAME_LEN 32
#include <linux/major.h>
#include <linux/device.h>
#include <linux/smp.h>
#include <linux/string.h>
#include <linux/fs.h>
#include <linux/workqueue.h>
struct partition {
unsigned char boot_ind; /* 0x80 - active */
unsigned char head; /* starting head */
unsigned char sector; /* starting sector */
unsigned char cyl; /* starting cylinder */
unsigned char sys_ind; /* What partition type */
unsigned char end_head; /* end head */
unsigned char end_sector; /* end sector */
unsigned char end_cyl; /* end cylinder */
__le32 start_sect; /* starting sector counting from 0 */
__le32 nr_sects; /* nr of sectors in partition */
} __attribute__((packed));
struct disk_stats {
unsigned long sectors[2]; /* READs and WRITEs */
unsigned long ios[2];
unsigned long merges[2];
unsigned long ticks[2];
unsigned long io_ticks;
unsigned long time_in_queue;
};
#define PARTITION_META_INFO_VOLNAMELTH 64
/*
* Enough for the string representation of any kind of UUID plus NULL.
* EFI UUID is 36 characters. MSDOS UUID is 11 characters.
*/
#define PARTITION_META_INFO_UUIDLTH 37
struct partition_meta_info {
char uuid[PARTITION_META_INFO_UUIDLTH];
u8 volname[PARTITION_META_INFO_VOLNAMELTH];
};
struct hd_struct {
sector_t start_sect;
/*
* nr_sects is protected by sequence counter. One might extend a
* partition while IO is happening to it and update of nr_sects
* can be non-atomic on 32bit machines with 64bit sector_t.
*/
sector_t nr_sects;
seqcount_t nr_sects_seq;
sector_t alignment_offset;
unsigned int discard_alignment;
struct device __dev;
struct kobject *holder_dir;
int policy, partno;
struct partition_meta_info *info;
#ifdef CONFIG_FAIL_MAKE_REQUEST
int make_it_fail;
#endif
unsigned long stamp;
atomic_t in_flight[2];
#ifdef CONFIG_SMP
struct disk_stats __percpu *dkstats;
#else
struct disk_stats dkstats;
#endif
struct percpu_ref ref;
struct rcu_head rcu_head;
};
#define GENHD_FL_REMOVABLE 1
/* 2 is unused */
#define GENHD_FL_MEDIA_CHANGE_NOTIFY 4
#define GENHD_FL_CD 8
#define GENHD_FL_UP 16
#define GENHD_FL_SUPPRESS_PARTITION_INFO 32
#define GENHD_FL_EXT_DEVT 64 /* allow extended devt */
#define GENHD_FL_NATIVE_CAPACITY 128
#define GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE 256
#define GENHD_FL_NO_PART_SCAN 512
implement in-kernel gendisk events handling Currently, media presence polling for removeable block devices is done from userland. There are several issues with this. * Polling is done by periodically opening the device. For SCSI devices, the command sequence generated by such action involves a few different commands including TEST_UNIT_READY. This behavior, while perfectly legal, is different from Windows which only issues single command, GET_EVENT_STATUS_NOTIFICATION. Unfortunately, some ATAPI devices lock up after being periodically queried such command sequences. * There is no reliable and unintrusive way for a userland program to tell whether the target device is safe for media presence polling. For example, polling for media presence during an on-going burning session can make it fail. The polling program can avoid this by opening the device with O_EXCL but then it risks making a valid exclusive user of the device fail w/ -EBUSY. * Userland polling is unnecessarily heavy and in-kernel implementation is lighter and better coordinated (workqueue, timer slack). This patch implements framework for in-kernel disk event handling, which includes media presence polling. * bdops->check_events() is added, which supercedes ->media_changed(). It should check whether there's any pending event and return if so. Currently, two events are defined - DISK_EVENT_MEDIA_CHANGE and DISK_EVENT_EJECT_REQUEST. ->check_events() is guaranteed not to be called parallelly. * gendisk->events and ->async_events are added. These should be initialized by block driver before passing the device to add_disk(). The former contains the mask of all supported events and the latter the mask of all events which the device can report without polling. /sys/block/*/events[_async] export these to userland. * Kernel parameter block.events_dfl_poll_msecs controls the system polling interval (default is 0 which means disable) and /sys/block/*/events_poll_msecs control polling intervals for individual devices (default is -1 meaning use system setting). Note that if a device can report all supported events asynchronously and its polling interval isn't explicitly set, the device won't be polled regardless of the system polling interval. * If a device is opened exclusively with write access, event checking is automatically disabled until all write exclusive accesses are released. * There are event 'clearing' events. For example, both of currently defined events are cleared after the device has been successfully opened. This information is passed to ->check_events() callback using @clearing argument as a hint. * Event checking is always performed from system_nrt_wq and timer slack is set to 25% for polling. * Nothing changes for drivers which implement ->media_changed() but not ->check_events(). Going forward, all drivers will be converted to ->check_events() and ->media_change() will be dropped. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Kay Sievers <kay.sievers@vrfy.org> Cc: Jan Kara <jack@suse.cz> Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
2010-12-09 03:57:37 +08:00
enum {
DISK_EVENT_MEDIA_CHANGE = 1 << 0, /* media changed */
DISK_EVENT_EJECT_REQUEST = 1 << 1, /* eject requested */
};
#define BLK_SCSI_MAX_CMDS (256)
#define BLK_SCSI_CMD_PER_LONG (BLK_SCSI_MAX_CMDS / (sizeof(long) * 8))
struct blk_scsi_cmd_filter {
unsigned long read_ok[BLK_SCSI_CMD_PER_LONG];
unsigned long write_ok[BLK_SCSI_CMD_PER_LONG];
struct kobject kobj;
};
struct disk_part_tbl {
struct rcu_head rcu_head;
int len;
struct hd_struct __rcu *last_lookup;
struct hd_struct __rcu *part[];
};
implement in-kernel gendisk events handling Currently, media presence polling for removeable block devices is done from userland. There are several issues with this. * Polling is done by periodically opening the device. For SCSI devices, the command sequence generated by such action involves a few different commands including TEST_UNIT_READY. This behavior, while perfectly legal, is different from Windows which only issues single command, GET_EVENT_STATUS_NOTIFICATION. Unfortunately, some ATAPI devices lock up after being periodically queried such command sequences. * There is no reliable and unintrusive way for a userland program to tell whether the target device is safe for media presence polling. For example, polling for media presence during an on-going burning session can make it fail. The polling program can avoid this by opening the device with O_EXCL but then it risks making a valid exclusive user of the device fail w/ -EBUSY. * Userland polling is unnecessarily heavy and in-kernel implementation is lighter and better coordinated (workqueue, timer slack). This patch implements framework for in-kernel disk event handling, which includes media presence polling. * bdops->check_events() is added, which supercedes ->media_changed(). It should check whether there's any pending event and return if so. Currently, two events are defined - DISK_EVENT_MEDIA_CHANGE and DISK_EVENT_EJECT_REQUEST. ->check_events() is guaranteed not to be called parallelly. * gendisk->events and ->async_events are added. These should be initialized by block driver before passing the device to add_disk(). The former contains the mask of all supported events and the latter the mask of all events which the device can report without polling. /sys/block/*/events[_async] export these to userland. * Kernel parameter block.events_dfl_poll_msecs controls the system polling interval (default is 0 which means disable) and /sys/block/*/events_poll_msecs control polling intervals for individual devices (default is -1 meaning use system setting). Note that if a device can report all supported events asynchronously and its polling interval isn't explicitly set, the device won't be polled regardless of the system polling interval. * If a device is opened exclusively with write access, event checking is automatically disabled until all write exclusive accesses are released. * There are event 'clearing' events. For example, both of currently defined events are cleared after the device has been successfully opened. This information is passed to ->check_events() callback using @clearing argument as a hint. * Event checking is always performed from system_nrt_wq and timer slack is set to 25% for polling. * Nothing changes for drivers which implement ->media_changed() but not ->check_events(). Going forward, all drivers will be converted to ->check_events() and ->media_change() will be dropped. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Kay Sievers <kay.sievers@vrfy.org> Cc: Jan Kara <jack@suse.cz> Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
2010-12-09 03:57:37 +08:00
struct disk_events;
struct badblocks;
implement in-kernel gendisk events handling Currently, media presence polling for removeable block devices is done from userland. There are several issues with this. * Polling is done by periodically opening the device. For SCSI devices, the command sequence generated by such action involves a few different commands including TEST_UNIT_READY. This behavior, while perfectly legal, is different from Windows which only issues single command, GET_EVENT_STATUS_NOTIFICATION. Unfortunately, some ATAPI devices lock up after being periodically queried such command sequences. * There is no reliable and unintrusive way for a userland program to tell whether the target device is safe for media presence polling. For example, polling for media presence during an on-going burning session can make it fail. The polling program can avoid this by opening the device with O_EXCL but then it risks making a valid exclusive user of the device fail w/ -EBUSY. * Userland polling is unnecessarily heavy and in-kernel implementation is lighter and better coordinated (workqueue, timer slack). This patch implements framework for in-kernel disk event handling, which includes media presence polling. * bdops->check_events() is added, which supercedes ->media_changed(). It should check whether there's any pending event and return if so. Currently, two events are defined - DISK_EVENT_MEDIA_CHANGE and DISK_EVENT_EJECT_REQUEST. ->check_events() is guaranteed not to be called parallelly. * gendisk->events and ->async_events are added. These should be initialized by block driver before passing the device to add_disk(). The former contains the mask of all supported events and the latter the mask of all events which the device can report without polling. /sys/block/*/events[_async] export these to userland. * Kernel parameter block.events_dfl_poll_msecs controls the system polling interval (default is 0 which means disable) and /sys/block/*/events_poll_msecs control polling intervals for individual devices (default is -1 meaning use system setting). Note that if a device can report all supported events asynchronously and its polling interval isn't explicitly set, the device won't be polled regardless of the system polling interval. * If a device is opened exclusively with write access, event checking is automatically disabled until all write exclusive accesses are released. * There are event 'clearing' events. For example, both of currently defined events are cleared after the device has been successfully opened. This information is passed to ->check_events() callback using @clearing argument as a hint. * Event checking is always performed from system_nrt_wq and timer slack is set to 25% for polling. * Nothing changes for drivers which implement ->media_changed() but not ->check_events(). Going forward, all drivers will be converted to ->check_events() and ->media_change() will be dropped. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Kay Sievers <kay.sievers@vrfy.org> Cc: Jan Kara <jack@suse.cz> Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
2010-12-09 03:57:37 +08:00
#if defined(CONFIG_BLK_DEV_INTEGRITY)
struct blk_integrity {
struct blk_integrity_profile *profile;
unsigned char flags;
unsigned char tuple_size;
unsigned char interval_exp;
unsigned char tag_size;
};
#endif /* CONFIG_BLK_DEV_INTEGRITY */
struct gendisk {
/* major, first_minor and minors are input parameters only,
* don't use directly. Use disk_devt() and disk_max_parts().
*/
int major; /* major number of driver */
int first_minor;
int minors; /* maximum number of minors, =1 for
* disks that can't be partitioned. */
char disk_name[DISK_NAME_LEN]; /* name of major driver */
char *(*devnode)(struct gendisk *gd, umode_t *mode);
implement in-kernel gendisk events handling Currently, media presence polling for removeable block devices is done from userland. There are several issues with this. * Polling is done by periodically opening the device. For SCSI devices, the command sequence generated by such action involves a few different commands including TEST_UNIT_READY. This behavior, while perfectly legal, is different from Windows which only issues single command, GET_EVENT_STATUS_NOTIFICATION. Unfortunately, some ATAPI devices lock up after being periodically queried such command sequences. * There is no reliable and unintrusive way for a userland program to tell whether the target device is safe for media presence polling. For example, polling for media presence during an on-going burning session can make it fail. The polling program can avoid this by opening the device with O_EXCL but then it risks making a valid exclusive user of the device fail w/ -EBUSY. * Userland polling is unnecessarily heavy and in-kernel implementation is lighter and better coordinated (workqueue, timer slack). This patch implements framework for in-kernel disk event handling, which includes media presence polling. * bdops->check_events() is added, which supercedes ->media_changed(). It should check whether there's any pending event and return if so. Currently, two events are defined - DISK_EVENT_MEDIA_CHANGE and DISK_EVENT_EJECT_REQUEST. ->check_events() is guaranteed not to be called parallelly. * gendisk->events and ->async_events are added. These should be initialized by block driver before passing the device to add_disk(). The former contains the mask of all supported events and the latter the mask of all events which the device can report without polling. /sys/block/*/events[_async] export these to userland. * Kernel parameter block.events_dfl_poll_msecs controls the system polling interval (default is 0 which means disable) and /sys/block/*/events_poll_msecs control polling intervals for individual devices (default is -1 meaning use system setting). Note that if a device can report all supported events asynchronously and its polling interval isn't explicitly set, the device won't be polled regardless of the system polling interval. * If a device is opened exclusively with write access, event checking is automatically disabled until all write exclusive accesses are released. * There are event 'clearing' events. For example, both of currently defined events are cleared after the device has been successfully opened. This information is passed to ->check_events() callback using @clearing argument as a hint. * Event checking is always performed from system_nrt_wq and timer slack is set to 25% for polling. * Nothing changes for drivers which implement ->media_changed() but not ->check_events(). Going forward, all drivers will be converted to ->check_events() and ->media_change() will be dropped. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Kay Sievers <kay.sievers@vrfy.org> Cc: Jan Kara <jack@suse.cz> Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
2010-12-09 03:57:37 +08:00
unsigned int events; /* supported events */
unsigned int async_events; /* async events, subset of all */
/* Array of pointers to partitions indexed by partno.
* Protected with matching bdev lock but stat and other
* non-critical accesses use RCU. Always access through
* helpers.
*/
struct disk_part_tbl __rcu *part_tbl;
struct hd_struct part0;
const struct block_device_operations *fops;
struct request_queue *queue;
void *private_data;
int flags;
struct device *driverfs_dev; // FIXME: remove
struct kobject *slave_dir;
struct timer_rand_state *random;
atomic_t sync_io; /* RAID */
implement in-kernel gendisk events handling Currently, media presence polling for removeable block devices is done from userland. There are several issues with this. * Polling is done by periodically opening the device. For SCSI devices, the command sequence generated by such action involves a few different commands including TEST_UNIT_READY. This behavior, while perfectly legal, is different from Windows which only issues single command, GET_EVENT_STATUS_NOTIFICATION. Unfortunately, some ATAPI devices lock up after being periodically queried such command sequences. * There is no reliable and unintrusive way for a userland program to tell whether the target device is safe for media presence polling. For example, polling for media presence during an on-going burning session can make it fail. The polling program can avoid this by opening the device with O_EXCL but then it risks making a valid exclusive user of the device fail w/ -EBUSY. * Userland polling is unnecessarily heavy and in-kernel implementation is lighter and better coordinated (workqueue, timer slack). This patch implements framework for in-kernel disk event handling, which includes media presence polling. * bdops->check_events() is added, which supercedes ->media_changed(). It should check whether there's any pending event and return if so. Currently, two events are defined - DISK_EVENT_MEDIA_CHANGE and DISK_EVENT_EJECT_REQUEST. ->check_events() is guaranteed not to be called parallelly. * gendisk->events and ->async_events are added. These should be initialized by block driver before passing the device to add_disk(). The former contains the mask of all supported events and the latter the mask of all events which the device can report without polling. /sys/block/*/events[_async] export these to userland. * Kernel parameter block.events_dfl_poll_msecs controls the system polling interval (default is 0 which means disable) and /sys/block/*/events_poll_msecs control polling intervals for individual devices (default is -1 meaning use system setting). Note that if a device can report all supported events asynchronously and its polling interval isn't explicitly set, the device won't be polled regardless of the system polling interval. * If a device is opened exclusively with write access, event checking is automatically disabled until all write exclusive accesses are released. * There are event 'clearing' events. For example, both of currently defined events are cleared after the device has been successfully opened. This information is passed to ->check_events() callback using @clearing argument as a hint. * Event checking is always performed from system_nrt_wq and timer slack is set to 25% for polling. * Nothing changes for drivers which implement ->media_changed() but not ->check_events(). Going forward, all drivers will be converted to ->check_events() and ->media_change() will be dropped. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Kay Sievers <kay.sievers@vrfy.org> Cc: Jan Kara <jack@suse.cz> Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
2010-12-09 03:57:37 +08:00
struct disk_events *ev;
#ifdef CONFIG_BLK_DEV_INTEGRITY
struct kobject integrity_kobj;
#endif /* CONFIG_BLK_DEV_INTEGRITY */
int node_id;
struct badblocks *bb;
};
static inline struct gendisk *part_to_disk(struct hd_struct *part)
{
if (likely(part)) {
if (part->partno)
return dev_to_disk(part_to_dev(part)->parent);
else
return dev_to_disk(part_to_dev(part));
}
return NULL;
}
static inline void part_pack_uuid(const u8 *uuid_str, u8 *to)
{
int i;
for (i = 0; i < 16; ++i) {
*to++ = (hex_to_bin(*uuid_str) << 4) |
(hex_to_bin(*(uuid_str + 1)));
uuid_str += 2;
switch (i) {
case 3:
case 5:
case 7:
case 9:
uuid_str++;
continue;
}
}
}
static inline int blk_part_pack_uuid(const u8 *uuid_str, u8 *to)
{
part_pack_uuid(uuid_str, to);
return 0;
}
static inline int disk_max_parts(struct gendisk *disk)
{
if (disk->flags & GENHD_FL_EXT_DEVT)
return DISK_MAX_PARTS;
return disk->minors;
}
static inline bool disk_part_scan_enabled(struct gendisk *disk)
{
return disk_max_parts(disk) > 1 &&
!(disk->flags & GENHD_FL_NO_PART_SCAN);
}
static inline dev_t disk_devt(struct gendisk *disk)
{
return disk_to_dev(disk)->devt;
}
static inline dev_t part_devt(struct hd_struct *part)
{
return part_to_dev(part)->devt;
}
extern struct hd_struct *disk_get_part(struct gendisk *disk, int partno);
static inline void disk_put_part(struct hd_struct *part)
{
if (likely(part))
put_device(part_to_dev(part));
}
/*
* Smarter partition iterator without context limits.
*/
#define DISK_PITER_REVERSE (1 << 0) /* iterate in the reverse direction */
#define DISK_PITER_INCL_EMPTY (1 << 1) /* include 0-sized parts */
#define DISK_PITER_INCL_PART0 (1 << 2) /* include partition 0 */
#define DISK_PITER_INCL_EMPTY_PART0 (1 << 3) /* include empty partition 0 */
struct disk_part_iter {
struct gendisk *disk;
struct hd_struct *part;
int idx;
unsigned int flags;
};
extern void disk_part_iter_init(struct disk_part_iter *piter,
struct gendisk *disk, unsigned int flags);
extern struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter);
extern void disk_part_iter_exit(struct disk_part_iter *piter);
extern struct hd_struct *disk_map_sector_rcu(struct gendisk *disk,
sector_t sector);
/*
* Macros to operate on percpu disk statistics:
*
* {disk|part|all}_stat_{add|sub|inc|dec}() modify the stat counters
* and should be called between disk_stat_lock() and
* disk_stat_unlock().
*
* part_stat_read() can be called at any time.
*
* part_stat_{add|set_all}() and {init|free}_part_stats are for
* internal use only.
*/
#ifdef CONFIG_SMP
#define part_stat_lock() ({ rcu_read_lock(); get_cpu(); })
#define part_stat_unlock() do { put_cpu(); rcu_read_unlock(); } while (0)
#define __part_stat_add(cpu, part, field, addnd) \
(per_cpu_ptr((part)->dkstats, (cpu))->field += (addnd))
#define part_stat_read(part, field) \
({ \
typeof((part)->dkstats->field) res = 0; \
unsigned int _cpu; \
for_each_possible_cpu(_cpu) \
res += per_cpu_ptr((part)->dkstats, _cpu)->field; \
res; \
})
static inline void part_stat_set_all(struct hd_struct *part, int value)
{
int i;
for_each_possible_cpu(i)
memset(per_cpu_ptr(part->dkstats, i), value,
sizeof(struct disk_stats));
}
static inline int init_part_stats(struct hd_struct *part)
{
part->dkstats = alloc_percpu(struct disk_stats);
if (!part->dkstats)
return 0;
return 1;
}
static inline void free_part_stats(struct hd_struct *part)
{
free_percpu(part->dkstats);
}
#else /* !CONFIG_SMP */
#define part_stat_lock() ({ rcu_read_lock(); 0; })
#define part_stat_unlock() rcu_read_unlock()
#define __part_stat_add(cpu, part, field, addnd) \
((part)->dkstats.field += addnd)
#define part_stat_read(part, field) ((part)->dkstats.field)
static inline void part_stat_set_all(struct hd_struct *part, int value)
{
memset(&part->dkstats, value, sizeof(struct disk_stats));
}
static inline int init_part_stats(struct hd_struct *part)
{
return 1;
}
static inline void free_part_stats(struct hd_struct *part)
{
}
#endif /* CONFIG_SMP */
#define part_stat_add(cpu, part, field, addnd) do { \
__part_stat_add((cpu), (part), field, addnd); \
if ((part)->partno) \
__part_stat_add((cpu), &part_to_disk((part))->part0, \
field, addnd); \
} while (0)
#define part_stat_dec(cpu, gendiskp, field) \
part_stat_add(cpu, gendiskp, field, -1)
#define part_stat_inc(cpu, gendiskp, field) \
part_stat_add(cpu, gendiskp, field, 1)
#define part_stat_sub(cpu, gendiskp, field, subnd) \
part_stat_add(cpu, gendiskp, field, -subnd)
static inline void part_inc_in_flight(struct hd_struct *part, int rw)
{
atomic_inc(&part->in_flight[rw]);
if (part->partno)
atomic_inc(&part_to_disk(part)->part0.in_flight[rw]);
}
static inline void part_dec_in_flight(struct hd_struct *part, int rw)
{
atomic_dec(&part->in_flight[rw]);
if (part->partno)
atomic_dec(&part_to_disk(part)->part0.in_flight[rw]);
}
static inline int part_in_flight(struct hd_struct *part)
{
return atomic_read(&part->in_flight[0]) + atomic_read(&part->in_flight[1]);
}
static inline struct partition_meta_info *alloc_part_info(struct gendisk *disk)
{
if (disk)
return kzalloc_node(sizeof(struct partition_meta_info),
GFP_KERNEL, disk->node_id);
return kzalloc(sizeof(struct partition_meta_info), GFP_KERNEL);
}
static inline void free_part_info(struct hd_struct *part)
{
kfree(part->info);
}
/* block/blk-core.c */
extern void part_round_stats(int cpu, struct hd_struct *part);
/* block/genhd.c */
extern void add_disk(struct gendisk *disk);
extern void del_gendisk(struct gendisk *gp);
extern struct gendisk *get_gendisk(dev_t dev, int *partno);
extern struct block_device *bdget_disk(struct gendisk *disk, int partno);
extern void set_device_ro(struct block_device *bdev, int flag);
extern void set_disk_ro(struct gendisk *disk, int flag);
static inline int get_disk_ro(struct gendisk *disk)
{
return disk->part0.policy;
}
implement in-kernel gendisk events handling Currently, media presence polling for removeable block devices is done from userland. There are several issues with this. * Polling is done by periodically opening the device. For SCSI devices, the command sequence generated by such action involves a few different commands including TEST_UNIT_READY. This behavior, while perfectly legal, is different from Windows which only issues single command, GET_EVENT_STATUS_NOTIFICATION. Unfortunately, some ATAPI devices lock up after being periodically queried such command sequences. * There is no reliable and unintrusive way for a userland program to tell whether the target device is safe for media presence polling. For example, polling for media presence during an on-going burning session can make it fail. The polling program can avoid this by opening the device with O_EXCL but then it risks making a valid exclusive user of the device fail w/ -EBUSY. * Userland polling is unnecessarily heavy and in-kernel implementation is lighter and better coordinated (workqueue, timer slack). This patch implements framework for in-kernel disk event handling, which includes media presence polling. * bdops->check_events() is added, which supercedes ->media_changed(). It should check whether there's any pending event and return if so. Currently, two events are defined - DISK_EVENT_MEDIA_CHANGE and DISK_EVENT_EJECT_REQUEST. ->check_events() is guaranteed not to be called parallelly. * gendisk->events and ->async_events are added. These should be initialized by block driver before passing the device to add_disk(). The former contains the mask of all supported events and the latter the mask of all events which the device can report without polling. /sys/block/*/events[_async] export these to userland. * Kernel parameter block.events_dfl_poll_msecs controls the system polling interval (default is 0 which means disable) and /sys/block/*/events_poll_msecs control polling intervals for individual devices (default is -1 meaning use system setting). Note that if a device can report all supported events asynchronously and its polling interval isn't explicitly set, the device won't be polled regardless of the system polling interval. * If a device is opened exclusively with write access, event checking is automatically disabled until all write exclusive accesses are released. * There are event 'clearing' events. For example, both of currently defined events are cleared after the device has been successfully opened. This information is passed to ->check_events() callback using @clearing argument as a hint. * Event checking is always performed from system_nrt_wq and timer slack is set to 25% for polling. * Nothing changes for drivers which implement ->media_changed() but not ->check_events(). Going forward, all drivers will be converted to ->check_events() and ->media_change() will be dropped. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Kay Sievers <kay.sievers@vrfy.org> Cc: Jan Kara <jack@suse.cz> Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
2010-12-09 03:57:37 +08:00
extern void disk_block_events(struct gendisk *disk);
extern void disk_unblock_events(struct gendisk *disk);
2011-07-01 22:17:47 +08:00
extern void disk_flush_events(struct gendisk *disk, unsigned int mask);
implement in-kernel gendisk events handling Currently, media presence polling for removeable block devices is done from userland. There are several issues with this. * Polling is done by periodically opening the device. For SCSI devices, the command sequence generated by such action involves a few different commands including TEST_UNIT_READY. This behavior, while perfectly legal, is different from Windows which only issues single command, GET_EVENT_STATUS_NOTIFICATION. Unfortunately, some ATAPI devices lock up after being periodically queried such command sequences. * There is no reliable and unintrusive way for a userland program to tell whether the target device is safe for media presence polling. For example, polling for media presence during an on-going burning session can make it fail. The polling program can avoid this by opening the device with O_EXCL but then it risks making a valid exclusive user of the device fail w/ -EBUSY. * Userland polling is unnecessarily heavy and in-kernel implementation is lighter and better coordinated (workqueue, timer slack). This patch implements framework for in-kernel disk event handling, which includes media presence polling. * bdops->check_events() is added, which supercedes ->media_changed(). It should check whether there's any pending event and return if so. Currently, two events are defined - DISK_EVENT_MEDIA_CHANGE and DISK_EVENT_EJECT_REQUEST. ->check_events() is guaranteed not to be called parallelly. * gendisk->events and ->async_events are added. These should be initialized by block driver before passing the device to add_disk(). The former contains the mask of all supported events and the latter the mask of all events which the device can report without polling. /sys/block/*/events[_async] export these to userland. * Kernel parameter block.events_dfl_poll_msecs controls the system polling interval (default is 0 which means disable) and /sys/block/*/events_poll_msecs control polling intervals for individual devices (default is -1 meaning use system setting). Note that if a device can report all supported events asynchronously and its polling interval isn't explicitly set, the device won't be polled regardless of the system polling interval. * If a device is opened exclusively with write access, event checking is automatically disabled until all write exclusive accesses are released. * There are event 'clearing' events. For example, both of currently defined events are cleared after the device has been successfully opened. This information is passed to ->check_events() callback using @clearing argument as a hint. * Event checking is always performed from system_nrt_wq and timer slack is set to 25% for polling. * Nothing changes for drivers which implement ->media_changed() but not ->check_events(). Going forward, all drivers will be converted to ->check_events() and ->media_change() will be dropped. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Kay Sievers <kay.sievers@vrfy.org> Cc: Jan Kara <jack@suse.cz> Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
2010-12-09 03:57:37 +08:00
extern unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask);
/* drivers/char/random.c */
extern void add_disk_randomness(struct gendisk *disk);
extern void rand_initialize_disk(struct gendisk *disk);
static inline sector_t get_start_sect(struct block_device *bdev)
{
return bdev->bd_part->start_sect;
}
static inline sector_t get_capacity(struct gendisk *disk)
{
return disk->part0.nr_sects;
}
static inline void set_capacity(struct gendisk *disk, sector_t size)
{
disk->part0.nr_sects = size;
}
#ifdef CONFIG_SOLARIS_X86_PARTITION
#define SOLARIS_X86_NUMSLICE 16
#define SOLARIS_X86_VTOC_SANE (0x600DDEEEUL)
struct solaris_x86_slice {
__le16 s_tag; /* ID tag of partition */
__le16 s_flag; /* permission flags */
__le32 s_start; /* start sector no of partition */
__le32 s_size; /* # of blocks in partition */
};
struct solaris_x86_vtoc {
unsigned int v_bootinfo[3]; /* info needed by mboot (unsupported) */
__le32 v_sanity; /* to verify vtoc sanity */
__le32 v_version; /* layout version */
char v_volume[8]; /* volume name */
__le16 v_sectorsz; /* sector size in bytes */
__le16 v_nparts; /* number of partitions */
unsigned int v_reserved[10]; /* free space */
struct solaris_x86_slice
v_slice[SOLARIS_X86_NUMSLICE]; /* slice headers */
unsigned int timestamp[SOLARIS_X86_NUMSLICE]; /* timestamp (unsupported) */
char v_asciilabel[128]; /* for compatibility */
};
#endif /* CONFIG_SOLARIS_X86_PARTITION */
#ifdef CONFIG_BSD_DISKLABEL
/*
* BSD disklabel support by Yossi Gottlieb <yogo@math.tau.ac.il>
* updated by Marc Espie <Marc.Espie@openbsd.org>
*/
/* check against BSD src/sys/sys/disklabel.h for consistency */
#define BSD_DISKMAGIC (0x82564557UL) /* The disk magic number */
#define BSD_MAXPARTITIONS 16
#define OPENBSD_MAXPARTITIONS 16
#define BSD_FS_UNUSED 0 /* disklabel unused partition entry ID */
struct bsd_disklabel {
__le32 d_magic; /* the magic number */
__s16 d_type; /* drive type */
__s16 d_subtype; /* controller/d_type specific */
char d_typename[16]; /* type name, e.g. "eagle" */
char d_packname[16]; /* pack identifier */
__u32 d_secsize; /* # of bytes per sector */
__u32 d_nsectors; /* # of data sectors per track */
__u32 d_ntracks; /* # of tracks per cylinder */
__u32 d_ncylinders; /* # of data cylinders per unit */
__u32 d_secpercyl; /* # of data sectors per cylinder */
__u32 d_secperunit; /* # of data sectors per unit */
__u16 d_sparespertrack; /* # of spare sectors per track */
__u16 d_sparespercyl; /* # of spare sectors per cylinder */
__u32 d_acylinders; /* # of alt. cylinders per unit */
__u16 d_rpm; /* rotational speed */
__u16 d_interleave; /* hardware sector interleave */
__u16 d_trackskew; /* sector 0 skew, per track */
__u16 d_cylskew; /* sector 0 skew, per cylinder */
__u32 d_headswitch; /* head switch time, usec */
__u32 d_trkseek; /* track-to-track seek, usec */
__u32 d_flags; /* generic flags */
#define NDDATA 5
__u32 d_drivedata[NDDATA]; /* drive-type specific information */
#define NSPARE 5
__u32 d_spare[NSPARE]; /* reserved for future use */
__le32 d_magic2; /* the magic number (again) */
__le16 d_checksum; /* xor of data incl. partitions */
/* filesystem and partition information: */
__le16 d_npartitions; /* number of partitions in following */
__le32 d_bbsize; /* size of boot area at sn0, bytes */
__le32 d_sbsize; /* max size of fs superblock, bytes */
struct bsd_partition { /* the partition table */
__le32 p_size; /* number of sectors in partition */
__le32 p_offset; /* starting sector */
__le32 p_fsize; /* filesystem basic fragment size */
__u8 p_fstype; /* filesystem type, see below */
__u8 p_frag; /* filesystem fragments per block */
__le16 p_cpg; /* filesystem cylinders per group */
} d_partitions[BSD_MAXPARTITIONS]; /* actually may be more */
};
#endif /* CONFIG_BSD_DISKLABEL */
#ifdef CONFIG_UNIXWARE_DISKLABEL
/*
* Unixware slices support by Andrzej Krzysztofowicz <ankry@mif.pg.gda.pl>
* and Krzysztof G. Baranowski <kgb@knm.org.pl>
*/
#define UNIXWARE_DISKMAGIC (0xCA5E600DUL) /* The disk magic number */
#define UNIXWARE_DISKMAGIC2 (0x600DDEEEUL) /* The slice table magic nr */
#define UNIXWARE_NUMSLICE 16
#define UNIXWARE_FS_UNUSED 0 /* Unused slice entry ID */
struct unixware_slice {
__le16 s_label; /* label */
__le16 s_flags; /* permission flags */
__le32 start_sect; /* starting sector */
__le32 nr_sects; /* number of sectors in slice */
};
struct unixware_disklabel {
__le32 d_type; /* drive type */
__le32 d_magic; /* the magic number */
__le32 d_version; /* version number */
char d_serial[12]; /* serial number of the device */
__le32 d_ncylinders; /* # of data cylinders per device */
__le32 d_ntracks; /* # of tracks per cylinder */
__le32 d_nsectors; /* # of data sectors per track */
__le32 d_secsize; /* # of bytes per sector */
__le32 d_part_start; /* # of first sector of this partition */
__le32 d_unknown1[12]; /* ? */
__le32 d_alt_tbl; /* byte offset of alternate table */
__le32 d_alt_len; /* byte length of alternate table */
__le32 d_phys_cyl; /* # of physical cylinders per device */
__le32 d_phys_trk; /* # of physical tracks per cylinder */
__le32 d_phys_sec; /* # of physical sectors per track */
__le32 d_phys_bytes; /* # of physical bytes per sector */
__le32 d_unknown2; /* ? */
__le32 d_unknown3; /* ? */
__le32 d_pad[8]; /* pad */
struct unixware_vtoc {
__le32 v_magic; /* the magic number */
__le32 v_version; /* version number */
char v_name[8]; /* volume name */
__le16 v_nslices; /* # of slices */
__le16 v_unknown1; /* ? */
__le32 v_reserved[10]; /* reserved */
struct unixware_slice
v_slice[UNIXWARE_NUMSLICE]; /* slice headers */
} vtoc;
}; /* 408 */
#endif /* CONFIG_UNIXWARE_DISKLABEL */
#ifdef CONFIG_MINIX_SUBPARTITION
# define MINIX_NR_SUBPARTITIONS 4
#endif /* CONFIG_MINIX_SUBPARTITION */
#define ADDPART_FLAG_NONE 0
#define ADDPART_FLAG_RAID 1
#define ADDPART_FLAG_WHOLEDISK 2
extern int blk_alloc_devt(struct hd_struct *part, dev_t *devt);
extern void blk_free_devt(dev_t devt);
extern dev_t blk_lookup_devt(const char *name, int partno);
extern char *disk_name (struct gendisk *hd, int partno, char *buf);
extern int disk_expand_part_tbl(struct gendisk *disk, int target);
extern int rescan_partitions(struct gendisk *disk, struct block_device *bdev);
extern int invalidate_partitions(struct gendisk *disk, struct block_device *bdev);
extern struct hd_struct * __must_check add_partition(struct gendisk *disk,
int partno, sector_t start,
sector_t len, int flags,
struct partition_meta_info
*info);
extern void __delete_partition(struct percpu_ref *);
extern void delete_partition(struct gendisk *, int);
extern void printk_all_partitions(void);
extern struct gendisk *alloc_disk_node(int minors, int node_id);
extern struct gendisk *alloc_disk(int minors);
extern struct kobject *get_disk(struct gendisk *disk);
extern void put_disk(struct gendisk *disk);
extern void blk_register_region(dev_t devt, unsigned long range,
struct module *module,
struct kobject *(*probe)(dev_t, int *, void *),
int (*lock)(dev_t, void *),
void *data);
extern void blk_unregister_region(dev_t devt, unsigned long range);
extern ssize_t part_size_show(struct device *dev,
struct device_attribute *attr, char *buf);
extern ssize_t part_stat_show(struct device *dev,
struct device_attribute *attr, char *buf);
extern ssize_t part_inflight_show(struct device *dev,
struct device_attribute *attr, char *buf);
#ifdef CONFIG_FAIL_MAKE_REQUEST
extern ssize_t part_fail_show(struct device *dev,
struct device_attribute *attr, char *buf);
extern ssize_t part_fail_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count);
#endif /* CONFIG_FAIL_MAKE_REQUEST */
static inline int hd_ref_init(struct hd_struct *part)
{
if (percpu_ref_init(&part->ref, __delete_partition, 0,
GFP_KERNEL))
return -ENOMEM;
return 0;
}
static inline void hd_struct_get(struct hd_struct *part)
{
percpu_ref_get(&part->ref);
}
static inline int hd_struct_try_get(struct hd_struct *part)
{
return percpu_ref_tryget_live(&part->ref);
}
static inline void hd_struct_put(struct hd_struct *part)
{
percpu_ref_put(&part->ref);
}
static inline void hd_struct_kill(struct hd_struct *part)
{
percpu_ref_kill(&part->ref);
}
static inline void hd_free_part(struct hd_struct *part)
{
free_part_stats(part);
free_part_info(part);
percpu_ref_exit(&part->ref);
}
/*
* Any access of part->nr_sects which is not protected by partition
* bd_mutex or gendisk bdev bd_mutex, should be done using this
* accessor function.
*
* Code written along the lines of i_size_read() and i_size_write().
* CONFIG_PREEMPT case optimizes the case of UP kernel with preemption
* on.
*/
static inline sector_t part_nr_sects_read(struct hd_struct *part)
{
#if BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_SMP)
sector_t nr_sects;
unsigned seq;
do {
seq = read_seqcount_begin(&part->nr_sects_seq);
nr_sects = part->nr_sects;
} while (read_seqcount_retry(&part->nr_sects_seq, seq));
return nr_sects;
#elif BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_PREEMPT)
sector_t nr_sects;
preempt_disable();
nr_sects = part->nr_sects;
preempt_enable();
return nr_sects;
#else
return part->nr_sects;
#endif
}
/*
* Should be called with mutex lock held (typically bd_mutex) of partition
* to provide mutual exlusion among writers otherwise seqcount might be
* left in wrong state leaving the readers spinning infinitely.
*/
static inline void part_nr_sects_write(struct hd_struct *part, sector_t size)
{
#if BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_SMP)
write_seqcount_begin(&part->nr_sects_seq);
part->nr_sects = size;
write_seqcount_end(&part->nr_sects_seq);
#elif BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_PREEMPT)
preempt_disable();
part->nr_sects = size;
preempt_enable();
#else
part->nr_sects = size;
#endif
}
#if defined(CONFIG_BLK_DEV_INTEGRITY)
extern void blk_integrity_add(struct gendisk *);
extern void blk_integrity_del(struct gendisk *);
extern void blk_integrity_revalidate(struct gendisk *);
#else /* CONFIG_BLK_DEV_INTEGRITY */
static inline void blk_integrity_add(struct gendisk *disk) { }
static inline void blk_integrity_del(struct gendisk *disk) { }
static inline void blk_integrity_revalidate(struct gendisk *disk) { }
#endif /* CONFIG_BLK_DEV_INTEGRITY */
#else /* CONFIG_BLOCK */
static inline void printk_all_partitions(void) { }
static inline dev_t blk_lookup_devt(const char *name, int partno)
{
dev_t devt = MKDEV(0, 0);
return devt;
}
static inline int blk_part_pack_uuid(const u8 *uuid_str, u8 *to)
{
return -EINVAL;
}
#endif /* CONFIG_BLOCK */
[PATCH] BLOCK: Make it possible to disable the block layer [try #6] Make it possible to disable the block layer. Not all embedded devices require it, some can make do with just JFFS2, NFS, ramfs, etc - none of which require the block layer to be present. This patch does the following: (*) Introduces CONFIG_BLOCK to disable the block layer, buffering and blockdev support. (*) Adds dependencies on CONFIG_BLOCK to any configuration item that controls an item that uses the block layer. This includes: (*) Block I/O tracing. (*) Disk partition code. (*) All filesystems that are block based, eg: Ext3, ReiserFS, ISOFS. (*) The SCSI layer. As far as I can tell, even SCSI chardevs use the block layer to do scheduling. Some drivers that use SCSI facilities - such as USB storage - end up disabled indirectly from this. (*) Various block-based device drivers, such as IDE and the old CDROM drivers. (*) MTD blockdev handling and FTL. (*) JFFS - which uses set_bdev_super(), something it could avoid doing by taking a leaf out of JFFS2's book. (*) Makes most of the contents of linux/blkdev.h, linux/buffer_head.h and linux/elevator.h contingent on CONFIG_BLOCK being set. sector_div() is, however, still used in places, and so is still available. (*) Also made contingent are the contents of linux/mpage.h, linux/genhd.h and parts of linux/fs.h. (*) Makes a number of files in fs/ contingent on CONFIG_BLOCK. (*) Makes mm/bounce.c (bounce buffering) contingent on CONFIG_BLOCK. (*) set_page_dirty() doesn't call __set_page_dirty_buffers() if CONFIG_BLOCK is not enabled. (*) fs/no-block.c is created to hold out-of-line stubs and things that are required when CONFIG_BLOCK is not set: (*) Default blockdev file operations (to give error ENODEV on opening). (*) Makes some /proc changes: (*) /proc/devices does not list any blockdevs. (*) /proc/diskstats and /proc/partitions are contingent on CONFIG_BLOCK. (*) Makes some compat ioctl handling contingent on CONFIG_BLOCK. (*) If CONFIG_BLOCK is not defined, makes sys_quotactl() return -ENODEV if given command other than Q_SYNC or if a special device is specified. (*) In init/do_mounts.c, no reference is made to the blockdev routines if CONFIG_BLOCK is not defined. This does not prohibit NFS roots or JFFS2. (*) The bdflush, ioprio_set and ioprio_get syscalls can now be absent (return error ENOSYS by way of cond_syscall if so). (*) The seclvl_bd_claim() and seclvl_bd_release() security calls do nothing if CONFIG_BLOCK is not set, since they can't then happen. Signed-Off-By: David Howells <dhowells@redhat.com> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2006-10-01 02:45:40 +08:00
#endif /* _LINUX_GENHD_H */