Merge git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6 into for-linus
This commit is contained in:
commit
c58310bf49
|
@ -14,6 +14,7 @@ Following translations are available on the WWW:
|
|||
- this file.
|
||||
ABI/
|
||||
- info on kernel <-> userspace ABI and relative interface stability.
|
||||
|
||||
BUG-HUNTING
|
||||
- brute force method of doing binary search of patches to find bug.
|
||||
Changes
|
||||
|
@ -66,6 +67,8 @@ VGA-softcursor.txt
|
|||
- how to change your VGA cursor from a blinking underscore.
|
||||
accounting/
|
||||
- documentation on accounting and taskstats.
|
||||
acpi/
|
||||
- info on ACPI-specific hooks in the kernel.
|
||||
aoe/
|
||||
- description of AoE (ATA over Ethernet) along with config examples.
|
||||
applying-patches.txt
|
||||
|
@ -106,6 +109,8 @@ cpu-hotplug.txt
|
|||
- document describing CPU hotplug support in the Linux kernel.
|
||||
cpu-load.txt
|
||||
- document describing how CPU load statistics are collected.
|
||||
cpuidle/
|
||||
- info on CPU_IDLE, CPU idle state management subsystem.
|
||||
cpusets.txt
|
||||
- documents the cpusets feature; assign CPUs and Mem to a set of tasks.
|
||||
cputopology.txt
|
||||
|
@ -126,18 +131,16 @@ devices.txt
|
|||
- plain ASCII listing of all the nodes in /dev/ with major minor #'s.
|
||||
digiepca.txt
|
||||
- info on Digi Intl. {PC,PCI,EISA}Xx and Xem series cards.
|
||||
dnotify.txt
|
||||
- info about directory notification in Linux.
|
||||
dontdiff
|
||||
- file containing a list of files that should never be diff'ed.
|
||||
driver-model/
|
||||
- directory with info about Linux driver model.
|
||||
drivers/
|
||||
- directory with driver documentation (currently only EDAC).
|
||||
dvb/
|
||||
- info on Linux Digital Video Broadcast (DVB) subsystem.
|
||||
early-userspace/
|
||||
- info about initramfs, klibc, and userspace early during boot.
|
||||
edac.txt
|
||||
- information on EDAC - Error Detection And Correction
|
||||
eisa.txt
|
||||
- info on EISA bus support.
|
||||
exception.txt
|
||||
|
@ -226,6 +229,8 @@ kref.txt
|
|||
- docs on adding reference counters (krefs) to kernel objects.
|
||||
laptop-mode.txt
|
||||
- how to conserve battery power using laptop-mode.
|
||||
laptops/
|
||||
- directory with laptop related info and laptop driver documentation.
|
||||
ldm.txt
|
||||
- a brief description of LDM (Windows Dynamic Disks).
|
||||
leds-class.txt
|
||||
|
@ -334,20 +339,8 @@ rtc.txt
|
|||
- notes on how to use the Real Time Clock (aka CMOS clock) driver.
|
||||
s390/
|
||||
- directory with info on using Linux on the IBM S390.
|
||||
sched-arch.txt
|
||||
- CPU Scheduler implementation hints for architecture specific code.
|
||||
sched-coding.txt
|
||||
- reference for various scheduler-related methods in the O(1) scheduler.
|
||||
sched-design.txt
|
||||
- goals, design and implementation of the Linux O(1) scheduler.
|
||||
sched-design-CFS.txt
|
||||
- goals, design and implementation of the Complete Fair Scheduler.
|
||||
sched-domains.txt
|
||||
- information on scheduling domains.
|
||||
sched-nice-design.txt
|
||||
- How and why the scheduler's nice levels are implemented.
|
||||
sched-stats.txt
|
||||
- information on schedstats (Linux Scheduler Statistics).
|
||||
scheduler/
|
||||
- directory with info on the scheduler.
|
||||
scsi/
|
||||
- directory with info on Linux scsi support.
|
||||
serial/
|
||||
|
@ -360,14 +353,8 @@ sgi-visws.txt
|
|||
- short blurb on the SGI Visual Workstations.
|
||||
sh/
|
||||
- directory with info on porting Linux to a new architecture.
|
||||
sharedsubtree.txt
|
||||
- a description of shared subtrees for namespaces.
|
||||
smart-config.txt
|
||||
- description of the Smart Config makefile feature.
|
||||
sony-laptop.txt
|
||||
- Sony Notebook Control Driver (SNC) Readme.
|
||||
sonypi.txt
|
||||
- info on Linux Sony Programmable I/O Device support.
|
||||
sound/
|
||||
- directory with info on sound card support.
|
||||
sparc/
|
||||
|
@ -398,8 +385,6 @@ sysrq.txt
|
|||
- info on the magic SysRq key.
|
||||
telephony/
|
||||
- directory with info on telephony (e.g. voice over IP) support.
|
||||
thinkpad-acpi.txt
|
||||
- information on the (IBM and Lenovo) ThinkPad ACPI Extras driver.
|
||||
time_interpolators.txt
|
||||
- info on time interpolators.
|
||||
tipar.txt
|
||||
|
|
|
@ -0,0 +1,22 @@
|
|||
What: /proc/diskstats
|
||||
Date: February 2008
|
||||
Contact: Jerome Marchand <jmarchan@redhat.com>
|
||||
Description:
|
||||
The /proc/diskstats file displays the I/O statistics
|
||||
of block devices. Each line contains the following 14
|
||||
fields:
|
||||
1 - major number
|
||||
2 - minor number
|
||||
3 - device name
|
||||
4 - reads completed successfully
|
||||
5 - reads merged
|
||||
6 - sectors read
|
||||
7 - time spent reading (ms)
|
||||
8 - writes completed
|
||||
9 - writes merged
|
||||
10 - sectors written
|
||||
11 - time spent writing (ms)
|
||||
12 - I/Os currently in progress
|
||||
13 - time spent doing I/Os (ms)
|
||||
14 - weighted time spent doing I/Os (ms)
|
||||
For more details refer to Documentation/iostats.txt
|
|
@ -0,0 +1,28 @@
|
|||
What: /sys/block/<disk>/stat
|
||||
Date: February 2008
|
||||
Contact: Jerome Marchand <jmarchan@redhat.com>
|
||||
Description:
|
||||
The /sys/block/<disk>/stat files display the I/O
|
||||
statistics of disk <disk>. They contain 11 fields:
|
||||
1 - reads completed successfully
|
||||
2 - reads merged
|
||||
3 - sectors read
|
||||
4 - time spent reading (ms)
|
||||
5 - writes completed
|
||||
6 - writes merged
|
||||
7 - sectors written
|
||||
8 - time spent writing (ms)
|
||||
9 - I/Os currently in progress
|
||||
10 - time spent doing I/Os (ms)
|
||||
11 - weighted time spent doing I/Os (ms)
|
||||
For more details refer to Documentation/iostats.txt
|
||||
|
||||
|
||||
What: /sys/block/<disk>/<part>/stat
|
||||
Date: February 2008
|
||||
Contact: Jerome Marchand <jmarchan@redhat.com>
|
||||
Description:
|
||||
The /sys/block/<disk>/<part>/stat files display the
|
||||
I/O statistics of partition <part>. The format is the
|
||||
same as the above-written /sys/block/<disk>/stat
|
||||
format.
|
|
@ -0,0 +1,99 @@
|
|||
What: /sys/firmware/acpi/interrupts/
|
||||
Date: February 2008
|
||||
Contact: Len Brown <lenb@kernel.org>
|
||||
Description:
|
||||
All ACPI interrupts are handled via a single IRQ,
|
||||
the System Control Interrupt (SCI), which appears
|
||||
as "acpi" in /proc/interrupts.
|
||||
|
||||
However, one of the main functions of ACPI is to make
|
||||
the platform understand random hardware without
|
||||
special driver support. So while the SCI handles a few
|
||||
well known (fixed feature) interrupt sources, such
|
||||
as the power button, it can also handle a variable
|
||||
number of "General Purpose Events" (GPEs).
|
||||
|
||||
A GPE vectors to a specified handler in AML, which
|
||||
can do anything the BIOS writer wants from
|
||||
OS context. GPE 0x12, for example, would vector
|
||||
to a level or edge handler called _L12 or _E12.
|
||||
The handler may do its business and return.
|
||||
Or the handler may send a Notify event
|
||||
to a Linux device driver registered on an ACPI device,
|
||||
such as a battery, or a processor.
|
||||
|
||||
To figure out where all the SCI's are coming from,
|
||||
/sys/firmware/acpi/interrupts contains a file listing
|
||||
every possible source, and the count of how many
|
||||
times it has triggered.
|
||||
|
||||
$ cd /sys/firmware/acpi/interrupts
|
||||
$ grep . *
|
||||
error:0
|
||||
ff_gbl_lock:0
|
||||
ff_pmtimer:0
|
||||
ff_pwr_btn:0
|
||||
ff_rt_clk:0
|
||||
ff_slp_btn:0
|
||||
gpe00:0
|
||||
gpe01:0
|
||||
gpe02:0
|
||||
gpe03:0
|
||||
gpe04:0
|
||||
gpe05:0
|
||||
gpe06:0
|
||||
gpe07:0
|
||||
gpe08:0
|
||||
gpe09:174
|
||||
gpe0A:0
|
||||
gpe0B:0
|
||||
gpe0C:0
|
||||
gpe0D:0
|
||||
gpe0E:0
|
||||
gpe0F:0
|
||||
gpe10:0
|
||||
gpe11:60
|
||||
gpe12:0
|
||||
gpe13:0
|
||||
gpe14:0
|
||||
gpe15:0
|
||||
gpe16:0
|
||||
gpe17:0
|
||||
gpe18:0
|
||||
gpe19:7
|
||||
gpe1A:0
|
||||
gpe1B:0
|
||||
gpe1C:0
|
||||
gpe1D:0
|
||||
gpe1E:0
|
||||
gpe1F:0
|
||||
gpe_all:241
|
||||
sci:241
|
||||
|
||||
sci - The total number of times the ACPI SCI
|
||||
has claimed an interrupt.
|
||||
|
||||
gpe_all - count of SCI caused by GPEs.
|
||||
|
||||
gpeXX - count for individual GPE source
|
||||
|
||||
ff_gbl_lock - Global Lock
|
||||
|
||||
ff_pmtimer - PM Timer
|
||||
|
||||
ff_pwr_btn - Power Button
|
||||
|
||||
ff_rt_clk - Real Time Clock
|
||||
|
||||
ff_slp_btn - Sleep Button
|
||||
|
||||
error - an interrupt that can't be accounted for above.
|
||||
|
||||
Root has permission to clear any of these counters. E.g.:
|
||||
# echo 0 > gpe11
|
||||
|
||||
All counters can be cleared by clearing the total "sci":
|
||||
# echo 0 > sci
|
||||
|
||||
None of these counters has an effect on the function
|
||||
of the system, they are simply statistics.
|
|
@ -11,4 +11,4 @@ Description:
|
|||
example would be, if User A has shares = 1024 and user
|
||||
B has shares = 2048, User B will get twice the CPU
|
||||
bandwidth user A will. For more details refer to
|
||||
Documentation/sched-design-CFS.txt
|
||||
Documentation/scheduler/sched-design-CFS.txt
|
||||
|
|
|
@ -214,6 +214,23 @@ And recompile the kernel with CONFIG_DEBUG_INFO enabled:
|
|||
gdb vmlinux
|
||||
(gdb) p vt_ioctl
|
||||
(gdb) l *(0x<address of vt_ioctl> + 0xda8)
|
||||
or, as one command
|
||||
(gdb) l *(vt_ioctl + 0xda8)
|
||||
|
||||
If you have a call trace, such as :-
|
||||
>Call Trace:
|
||||
> [<ffffffff8802c8e9>] :jbd:log_wait_commit+0xa3/0xf5
|
||||
> [<ffffffff810482d9>] autoremove_wake_function+0x0/0x2e
|
||||
> [<ffffffff8802770b>] :jbd:journal_stop+0x1be/0x1ee
|
||||
> ...
|
||||
this shows the problem in the :jbd: module. You can load that module in gdb
|
||||
and list the relevant code.
|
||||
gdb fs/jbd/jbd.ko
|
||||
(gdb) p log_wait_commit
|
||||
(gdb) l *(0x<address> + 0xa3)
|
||||
or
|
||||
(gdb) l *(log_wait_commit + 0xa3)
|
||||
|
||||
|
||||
Another very useful option of the Kernel Hacking section in menuconfig is
|
||||
Debug memory allocations. This will help you see whether data has been
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
|
||||
DOCBOOKS := wanbook.xml z8530book.xml mcabook.xml videobook.xml \
|
||||
kernel-hacking.xml kernel-locking.xml deviceiobook.xml \
|
||||
procfs-guide.xml writing_usb_driver.xml \
|
||||
procfs-guide.xml writing_usb_driver.xml networking.xml \
|
||||
kernel-api.xml filesystems.xml lsm.xml usb.xml \
|
||||
gadget.xml libata.xml mtdnand.xml librs.xml rapidio.xml \
|
||||
genericirq.xml s390-drivers.xml uio-howto.xml scsi.xml
|
||||
|
|
|
@ -398,4 +398,24 @@ an example.
|
|||
|
||||
</chapter>
|
||||
|
||||
<chapter id="splice">
|
||||
<title>splice API</title>
|
||||
<para>
|
||||
splice is a method for moving blocks of data around inside the
|
||||
kernel, without continually transferring them between the kernel
|
||||
and user space.
|
||||
</para>
|
||||
!Ffs/splice.c
|
||||
</chapter>
|
||||
|
||||
<chapter id="pipes">
|
||||
<title>pipes API</title>
|
||||
<para>
|
||||
Pipe interfaces are all for in-kernel (builtin image) use.
|
||||
They are not exported for use by modules.
|
||||
</para>
|
||||
!Iinclude/linux/pipe_fs_i.h
|
||||
!Ffs/pipe.c
|
||||
</chapter>
|
||||
|
||||
</book>
|
||||
|
|
|
@ -172,7 +172,7 @@
|
|||
<listitem><para>Chiplevel hardware encapsulation</para></listitem>
|
||||
</orderedlist>
|
||||
</para>
|
||||
<sect1>
|
||||
<sect1 id="Interrupt_control_flow">
|
||||
<title>Interrupt control flow</title>
|
||||
<para>
|
||||
Each interrupt is described by an interrupt descriptor structure
|
||||
|
@ -190,7 +190,7 @@
|
|||
referenced by the assigned chip descriptor structure.
|
||||
</para>
|
||||
</sect1>
|
||||
<sect1>
|
||||
<sect1 id="Highlevel_Driver_API">
|
||||
<title>Highlevel Driver API</title>
|
||||
<para>
|
||||
The highlevel Driver API consists of following functions:
|
||||
|
@ -210,7 +210,7 @@
|
|||
See the autogenerated function documentation for details.
|
||||
</para>
|
||||
</sect1>
|
||||
<sect1>
|
||||
<sect1 id="Highlevel_IRQ_flow_handlers">
|
||||
<title>Highlevel IRQ flow handlers</title>
|
||||
<para>
|
||||
The generic layer provides a set of pre-defined irq-flow methods:
|
||||
|
@ -224,9 +224,9 @@
|
|||
specific) are assigned to specific interrupts by the architecture
|
||||
either during bootup or during device initialization.
|
||||
</para>
|
||||
<sect2>
|
||||
<sect2 id="Default_flow_implementations">
|
||||
<title>Default flow implementations</title>
|
||||
<sect3>
|
||||
<sect3 id="Helper_functions">
|
||||
<title>Helper functions</title>
|
||||
<para>
|
||||
The helper functions call the chip primitives and
|
||||
|
@ -267,9 +267,9 @@ noop(irq)
|
|||
</para>
|
||||
</sect3>
|
||||
</sect2>
|
||||
<sect2>
|
||||
<sect2 id="Default_flow_handler_implementations">
|
||||
<title>Default flow handler implementations</title>
|
||||
<sect3>
|
||||
<sect3 id="Default_Level_IRQ_flow_handler">
|
||||
<title>Default Level IRQ flow handler</title>
|
||||
<para>
|
||||
handle_level_irq provides a generic implementation
|
||||
|
@ -284,7 +284,7 @@ desc->chip->end();
|
|||
</programlisting>
|
||||
</para>
|
||||
</sect3>
|
||||
<sect3>
|
||||
<sect3 id="Default_Edge_IRQ_flow_handler">
|
||||
<title>Default Edge IRQ flow handler</title>
|
||||
<para>
|
||||
handle_edge_irq provides a generic implementation
|
||||
|
@ -311,7 +311,7 @@ desc->chip->end();
|
|||
</programlisting>
|
||||
</para>
|
||||
</sect3>
|
||||
<sect3>
|
||||
<sect3 id="Default_simple_IRQ_flow_handler">
|
||||
<title>Default simple IRQ flow handler</title>
|
||||
<para>
|
||||
handle_simple_irq provides a generic implementation
|
||||
|
@ -328,7 +328,7 @@ handle_IRQ_event(desc->action);
|
|||
</programlisting>
|
||||
</para>
|
||||
</sect3>
|
||||
<sect3>
|
||||
<sect3 id="Default_per_CPU_flow_handler">
|
||||
<title>Default per CPU flow handler</title>
|
||||
<para>
|
||||
handle_percpu_irq provides a generic implementation
|
||||
|
@ -349,7 +349,7 @@ desc->chip->end();
|
|||
</para>
|
||||
</sect3>
|
||||
</sect2>
|
||||
<sect2>
|
||||
<sect2 id="Quirks_and_optimizations">
|
||||
<title>Quirks and optimizations</title>
|
||||
<para>
|
||||
The generic functions are intended for 'clean' architectures and chips,
|
||||
|
@ -358,7 +358,7 @@ desc->chip->end();
|
|||
overriding the highlevel irq-flow handler.
|
||||
</para>
|
||||
</sect2>
|
||||
<sect2>
|
||||
<sect2 id="Delayed_interrupt_disable">
|
||||
<title>Delayed interrupt disable</title>
|
||||
<para>
|
||||
This per interrupt selectable feature, which was introduced by Russell
|
||||
|
@ -380,7 +380,7 @@ desc->chip->end();
|
|||
</para>
|
||||
</sect2>
|
||||
</sect1>
|
||||
<sect1>
|
||||
<sect1 id="Chiplevel_hardware_encapsulation">
|
||||
<title>Chiplevel hardware encapsulation</title>
|
||||
<para>
|
||||
The chip level hardware descriptor structure irq_chip
|
||||
|
|
|
@ -165,6 +165,7 @@ X!Ilib/string.c
|
|||
!Emm/vmalloc.c
|
||||
!Imm/page_alloc.c
|
||||
!Emm/mempool.c
|
||||
!Emm/dmapool.c
|
||||
!Emm/page-writeback.c
|
||||
!Emm/truncate.c
|
||||
</sect1>
|
||||
|
@ -203,65 +204,6 @@ X!Ilib/string.c
|
|||
</sect1>
|
||||
</chapter>
|
||||
|
||||
<chapter id="netcore">
|
||||
<title>Linux Networking</title>
|
||||
<sect1><title>Networking Base Types</title>
|
||||
!Iinclude/linux/net.h
|
||||
</sect1>
|
||||
<sect1><title>Socket Buffer Functions</title>
|
||||
!Iinclude/linux/skbuff.h
|
||||
!Iinclude/net/sock.h
|
||||
!Enet/socket.c
|
||||
!Enet/core/skbuff.c
|
||||
!Enet/core/sock.c
|
||||
!Enet/core/datagram.c
|
||||
!Enet/core/stream.c
|
||||
</sect1>
|
||||
<sect1><title>Socket Filter</title>
|
||||
!Enet/core/filter.c
|
||||
</sect1>
|
||||
<sect1><title>Generic Network Statistics</title>
|
||||
!Iinclude/linux/gen_stats.h
|
||||
!Enet/core/gen_stats.c
|
||||
!Enet/core/gen_estimator.c
|
||||
</sect1>
|
||||
<sect1><title>SUN RPC subsystem</title>
|
||||
<!-- The !D functionality is not perfect, garbage has to be protected by comments
|
||||
!Dnet/sunrpc/sunrpc_syms.c
|
||||
-->
|
||||
!Enet/sunrpc/xdr.c
|
||||
!Enet/sunrpc/svcsock.c
|
||||
!Enet/sunrpc/sched.c
|
||||
</sect1>
|
||||
</chapter>
|
||||
|
||||
<chapter id="netdev">
|
||||
<title>Network device support</title>
|
||||
<sect1><title>Driver Support</title>
|
||||
!Enet/core/dev.c
|
||||
!Enet/ethernet/eth.c
|
||||
!Enet/sched/sch_generic.c
|
||||
!Iinclude/linux/etherdevice.h
|
||||
!Iinclude/linux/netdevice.h
|
||||
</sect1>
|
||||
<sect1><title>PHY Support</title>
|
||||
!Edrivers/net/phy/phy.c
|
||||
!Idrivers/net/phy/phy.c
|
||||
!Edrivers/net/phy/phy_device.c
|
||||
!Idrivers/net/phy/phy_device.c
|
||||
!Edrivers/net/phy/mdio_bus.c
|
||||
!Idrivers/net/phy/mdio_bus.c
|
||||
</sect1>
|
||||
<!-- FIXME: Removed for now since no structured comments in source
|
||||
<sect1><title>Wireless</title>
|
||||
X!Enet/core/wireless.c
|
||||
</sect1>
|
||||
-->
|
||||
<sect1><title>Synchronous PPP</title>
|
||||
!Edrivers/net/wan/syncppp.c
|
||||
</sect1>
|
||||
</chapter>
|
||||
|
||||
<chapter id="modload">
|
||||
<title>Module Support</title>
|
||||
<sect1><title>Module Loading</title>
|
||||
|
@ -371,7 +313,6 @@ X!Iinclude/linux/device.h
|
|||
!Edrivers/base/class.c
|
||||
!Edrivers/base/firmware_class.c
|
||||
!Edrivers/base/transport_class.c
|
||||
!Edrivers/base/dmapool.c
|
||||
<!-- Cannot be included, because
|
||||
attribute_container_add_class_device_adapter
|
||||
and attribute_container_classdev_to_container
|
||||
|
@ -508,11 +449,6 @@ X!Isound/sound_firmware.c
|
|||
!Edrivers/serial/8250.c
|
||||
</chapter>
|
||||
|
||||
<chapter id="z85230">
|
||||
<title>Z85230 Support Library</title>
|
||||
!Edrivers/net/wan/z85230.c
|
||||
</chapter>
|
||||
|
||||
<chapter id="fbdev">
|
||||
<title>Frame Buffer Library</title>
|
||||
|
||||
|
@ -712,24 +648,4 @@ X!Idrivers/video/console/fonts.c
|
|||
!Edrivers/i2c/i2c-core.c
|
||||
</chapter>
|
||||
|
||||
<chapter id="splice">
|
||||
<title>splice API</title>
|
||||
<para>
|
||||
splice is a method for moving blocks of data around inside the
|
||||
kernel, without continually transferring them between the kernel
|
||||
and user space.
|
||||
</para>
|
||||
!Ffs/splice.c
|
||||
</chapter>
|
||||
|
||||
<chapter id="pipes">
|
||||
<title>pipes API</title>
|
||||
<para>
|
||||
Pipe interfaces are all for in-kernel (builtin image) use.
|
||||
They are not exported for use by modules.
|
||||
</para>
|
||||
!Iinclude/linux/pipe_fs_i.h
|
||||
!Ffs/pipe.c
|
||||
</chapter>
|
||||
|
||||
</book>
|
||||
|
|
|
@ -717,7 +717,7 @@ used, and when it gets full, throws out the least used one.
|
|||
<para>
|
||||
For our first example, we assume that all operations are in user
|
||||
context (ie. from system calls), so we can sleep. This means we can
|
||||
use a semaphore to protect the cache and all the objects within
|
||||
use a mutex to protect the cache and all the objects within
|
||||
it. Here's the code:
|
||||
</para>
|
||||
|
||||
|
@ -725,7 +725,7 @@ it. Here's the code:
|
|||
#include <linux/list.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/string.h>
|
||||
#include <asm/semaphore.h>
|
||||
#include <linux/mutex.h>
|
||||
#include <asm/errno.h>
|
||||
|
||||
struct object
|
||||
|
@ -737,7 +737,7 @@ struct object
|
|||
};
|
||||
|
||||
/* Protects the cache, cache_num, and the objects within it */
|
||||
static DECLARE_MUTEX(cache_lock);
|
||||
static DEFINE_MUTEX(cache_lock);
|
||||
static LIST_HEAD(cache);
|
||||
static unsigned int cache_num = 0;
|
||||
#define MAX_CACHE_SIZE 10
|
||||
|
@ -789,17 +789,17 @@ int cache_add(int id, const char *name)
|
|||
obj->id = id;
|
||||
obj->popularity = 0;
|
||||
|
||||
down(&cache_lock);
|
||||
mutex_lock(&cache_lock);
|
||||
__cache_add(obj);
|
||||
up(&cache_lock);
|
||||
mutex_unlock(&cache_lock);
|
||||
return 0;
|
||||
}
|
||||
|
||||
void cache_delete(int id)
|
||||
{
|
||||
down(&cache_lock);
|
||||
mutex_lock(&cache_lock);
|
||||
__cache_delete(__cache_find(id));
|
||||
up(&cache_lock);
|
||||
mutex_unlock(&cache_lock);
|
||||
}
|
||||
|
||||
int cache_find(int id, char *name)
|
||||
|
@ -807,13 +807,13 @@ int cache_find(int id, char *name)
|
|||
struct object *obj;
|
||||
int ret = -ENOENT;
|
||||
|
||||
down(&cache_lock);
|
||||
mutex_lock(&cache_lock);
|
||||
obj = __cache_find(id);
|
||||
if (obj) {
|
||||
ret = 0;
|
||||
strcpy(name, obj->name);
|
||||
}
|
||||
up(&cache_lock);
|
||||
mutex_unlock(&cache_lock);
|
||||
return ret;
|
||||
}
|
||||
</programlisting>
|
||||
|
@ -853,7 +853,7 @@ The change is shown below, in standard patch format: the
|
|||
int popularity;
|
||||
};
|
||||
|
||||
-static DECLARE_MUTEX(cache_lock);
|
||||
-static DEFINE_MUTEX(cache_lock);
|
||||
+static spinlock_t cache_lock = SPIN_LOCK_UNLOCKED;
|
||||
static LIST_HEAD(cache);
|
||||
static unsigned int cache_num = 0;
|
||||
|
@ -870,22 +870,22 @@ The change is shown below, in standard patch format: the
|
|||
obj->id = id;
|
||||
obj->popularity = 0;
|
||||
|
||||
- down(&cache_lock);
|
||||
- mutex_lock(&cache_lock);
|
||||
+ spin_lock_irqsave(&cache_lock, flags);
|
||||
__cache_add(obj);
|
||||
- up(&cache_lock);
|
||||
- mutex_unlock(&cache_lock);
|
||||
+ spin_unlock_irqrestore(&cache_lock, flags);
|
||||
return 0;
|
||||
}
|
||||
|
||||
void cache_delete(int id)
|
||||
{
|
||||
- down(&cache_lock);
|
||||
- mutex_lock(&cache_lock);
|
||||
+ unsigned long flags;
|
||||
+
|
||||
+ spin_lock_irqsave(&cache_lock, flags);
|
||||
__cache_delete(__cache_find(id));
|
||||
- up(&cache_lock);
|
||||
- mutex_unlock(&cache_lock);
|
||||
+ spin_unlock_irqrestore(&cache_lock, flags);
|
||||
}
|
||||
|
||||
|
@ -895,14 +895,14 @@ The change is shown below, in standard patch format: the
|
|||
int ret = -ENOENT;
|
||||
+ unsigned long flags;
|
||||
|
||||
- down(&cache_lock);
|
||||
- mutex_lock(&cache_lock);
|
||||
+ spin_lock_irqsave(&cache_lock, flags);
|
||||
obj = __cache_find(id);
|
||||
if (obj) {
|
||||
ret = 0;
|
||||
strcpy(name, obj->name);
|
||||
}
|
||||
- up(&cache_lock);
|
||||
- mutex_unlock(&cache_lock);
|
||||
+ spin_unlock_irqrestore(&cache_lock, flags);
|
||||
return ret;
|
||||
}
|
||||
|
|
|
@ -33,7 +33,7 @@
|
|||
</authorgroup>
|
||||
</articleinfo>
|
||||
|
||||
<sect1><title>Introduction</title>
|
||||
<sect1 id="Introduction"><title>Introduction</title>
|
||||
|
||||
<para>
|
||||
In March 2001, the National Security Agency (NSA) gave a presentation
|
||||
|
|
|
@ -80,7 +80,7 @@
|
|||
struct member has a short description which is marked with an [XXX] identifier.
|
||||
The following chapters explain the meaning of those identifiers.
|
||||
</para>
|
||||
<sect1>
|
||||
<sect1 id="Function_identifiers_XXX">
|
||||
<title>Function identifiers [XXX]</title>
|
||||
<para>
|
||||
The functions are marked with [XXX] identifiers in the short
|
||||
|
@ -115,7 +115,7 @@
|
|||
</para></listitem>
|
||||
</itemizedlist>
|
||||
</sect1>
|
||||
<sect1>
|
||||
<sect1 id="Struct_member_identifiers_XXX">
|
||||
<title>Struct member identifiers [XXX]</title>
|
||||
<para>
|
||||
The struct members are marked with [XXX] identifiers in the
|
||||
|
@ -159,7 +159,7 @@
|
|||
basic functions and fill out some really board dependent
|
||||
members in the nand chip description structure.
|
||||
</para>
|
||||
<sect1>
|
||||
<sect1 id="Basic_defines">
|
||||
<title>Basic defines</title>
|
||||
<para>
|
||||
At least you have to provide a mtd structure and
|
||||
|
@ -185,7 +185,7 @@ static struct nand_chip board_chip;
|
|||
static unsigned long baseaddr;
|
||||
</programlisting>
|
||||
</sect1>
|
||||
<sect1>
|
||||
<sect1 id="Partition_defines">
|
||||
<title>Partition defines</title>
|
||||
<para>
|
||||
If you want to divide your device into partitions, then
|
||||
|
@ -204,7 +204,7 @@ static struct mtd_partition partition_info[] = {
|
|||
};
|
||||
</programlisting>
|
||||
</sect1>
|
||||
<sect1>
|
||||
<sect1 id="Hardware_control_functions">
|
||||
<title>Hardware control function</title>
|
||||
<para>
|
||||
The hardware control function provides access to the
|
||||
|
@ -246,7 +246,7 @@ static void board_hwcontrol(struct mtd_info *mtd, int cmd)
|
|||
}
|
||||
</programlisting>
|
||||
</sect1>
|
||||
<sect1>
|
||||
<sect1 id="Device_ready_function">
|
||||
<title>Device ready function</title>
|
||||
<para>
|
||||
If the hardware interface has the ready busy pin of the NAND chip connected to a
|
||||
|
@ -257,7 +257,7 @@ static void board_hwcontrol(struct mtd_info *mtd, int cmd)
|
|||
the function must not be defined and the function pointer this->dev_ready is set to NULL.
|
||||
</para>
|
||||
</sect1>
|
||||
<sect1>
|
||||
<sect1 id="Init_function">
|
||||
<title>Init function</title>
|
||||
<para>
|
||||
The init function allocates memory and sets up all the board
|
||||
|
@ -325,7 +325,7 @@ out:
|
|||
module_init(board_init);
|
||||
</programlisting>
|
||||
</sect1>
|
||||
<sect1>
|
||||
<sect1 id="Exit_function">
|
||||
<title>Exit function</title>
|
||||
<para>
|
||||
The exit function is only necessary if the driver is
|
||||
|
@ -359,7 +359,7 @@ module_exit(board_cleanup);
|
|||
driver. For a list of functions which can be overridden by the board
|
||||
driver see the documentation of the nand_chip structure.
|
||||
</para>
|
||||
<sect1>
|
||||
<sect1 id="Multiple_chip_control">
|
||||
<title>Multiple chip control</title>
|
||||
<para>
|
||||
The nand driver can control chip arrays. Therefore the
|
||||
|
@ -419,9 +419,9 @@ static void board_select_chip (struct mtd_info *mtd, int chip)
|
|||
}
|
||||
</programlisting>
|
||||
</sect1>
|
||||
<sect1>
|
||||
<sect1 id="Hardware_ECC_support">
|
||||
<title>Hardware ECC support</title>
|
||||
<sect2>
|
||||
<sect2 id="Functions_and_constants">
|
||||
<title>Functions and constants</title>
|
||||
<para>
|
||||
The nand driver supports three different types of
|
||||
|
@ -475,7 +475,7 @@ static void board_select_chip (struct mtd_info *mtd, int chip)
|
|||
</itemizedlist>
|
||||
</para>
|
||||
</sect2>
|
||||
<sect2>
|
||||
<sect2 id="Hardware_ECC_with_syndrome_calculation">
|
||||
<title>Hardware ECC with syndrome calculation</title>
|
||||
<para>
|
||||
Many hardware ECC implementations provide Reed-Solomon
|
||||
|
@ -500,7 +500,7 @@ static void board_select_chip (struct mtd_info *mtd, int chip)
|
|||
</para>
|
||||
</sect2>
|
||||
</sect1>
|
||||
<sect1>
|
||||
<sect1 id="Bad_Block_table_support">
|
||||
<title>Bad block table support</title>
|
||||
<para>
|
||||
Most NAND chips mark the bad blocks at a defined
|
||||
|
@ -552,7 +552,7 @@ static void board_select_chip (struct mtd_info *mtd, int chip)
|
|||
allows faster access than always checking the
|
||||
bad block information on the flash chip itself.
|
||||
</para>
|
||||
<sect2>
|
||||
<sect2 id="Flash_based_tables">
|
||||
<title>Flash based tables</title>
|
||||
<para>
|
||||
It may be desired or necessary to keep a bad block table in FLASH.
|
||||
|
@ -587,7 +587,7 @@ static void board_select_chip (struct mtd_info *mtd, int chip)
|
|||
</itemizedlist>
|
||||
</para>
|
||||
</sect2>
|
||||
<sect2>
|
||||
<sect2 id="User_defined_tables">
|
||||
<title>User defined tables</title>
|
||||
<para>
|
||||
User defined tables are created by filling out a
|
||||
|
@ -676,7 +676,7 @@ static void board_select_chip (struct mtd_info *mtd, int chip)
|
|||
</para>
|
||||
</sect2>
|
||||
</sect1>
|
||||
<sect1>
|
||||
<sect1 id="Spare_area_placement">
|
||||
<title>Spare area (auto)placement</title>
|
||||
<para>
|
||||
The nand driver implements different possibilities for
|
||||
|
@ -730,7 +730,7 @@ struct nand_oobinfo {
|
|||
</para></listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
<sect2>
|
||||
<sect2 id="Placement_defined_by_fs_driver">
|
||||
<title>Placement defined by fs driver</title>
|
||||
<para>
|
||||
The calling function provides a pointer to a nand_oobinfo
|
||||
|
@ -760,7 +760,7 @@ struct nand_oobinfo {
|
|||
done according to the given scheme in the nand_oobinfo structure.
|
||||
</para>
|
||||
</sect2>
|
||||
<sect2>
|
||||
<sect2 id="Automatic_placement">
|
||||
<title>Automatic placement</title>
|
||||
<para>
|
||||
Automatic placement uses the built in defaults to place the
|
||||
|
@ -774,7 +774,7 @@ struct nand_oobinfo {
|
|||
done according to the default builtin scheme.
|
||||
</para>
|
||||
</sect2>
|
||||
<sect2>
|
||||
<sect2 id="User_space_placement_selection">
|
||||
<title>User space placement selection</title>
|
||||
<para>
|
||||
All non ecc functions like mtd->read and mtd->write use an internal
|
||||
|
@ -789,9 +789,9 @@ struct nand_oobinfo {
|
|||
</para>
|
||||
</sect2>
|
||||
</sect1>
|
||||
<sect1>
|
||||
<sect1 id="Spare_area_autoplacement_default">
|
||||
<title>Spare area autoplacement default schemes</title>
|
||||
<sect2>
|
||||
<sect2 id="pagesize_256">
|
||||
<title>256 byte pagesize</title>
|
||||
<informaltable><tgroup cols="3"><tbody>
|
||||
<row>
|
||||
|
@ -843,7 +843,7 @@ pages this byte is reserved</entry>
|
|||
</row>
|
||||
</tbody></tgroup></informaltable>
|
||||
</sect2>
|
||||
<sect2>
|
||||
<sect2 id="pagesize_512">
|
||||
<title>512 byte pagesize</title>
|
||||
<informaltable><tgroup cols="3"><tbody>
|
||||
<row>
|
||||
|
@ -906,7 +906,7 @@ in this page</entry>
|
|||
</row>
|
||||
</tbody></tgroup></informaltable>
|
||||
</sect2>
|
||||
<sect2>
|
||||
<sect2 id="pagesize_2048">
|
||||
<title>2048 byte pagesize</title>
|
||||
<informaltable><tgroup cols="3"><tbody>
|
||||
<row>
|
||||
|
@ -1126,9 +1126,9 @@ in this page</entry>
|
|||
<para>
|
||||
This chapter describes the constants which might be relevant for a driver developer.
|
||||
</para>
|
||||
<sect1>
|
||||
<sect1 id="Chip_option_constants">
|
||||
<title>Chip option constants</title>
|
||||
<sect2>
|
||||
<sect2 id="Constants_for_chip_id_table">
|
||||
<title>Constants for chip id table</title>
|
||||
<para>
|
||||
These constants are defined in nand.h. They are ored together to describe
|
||||
|
@ -1153,7 +1153,7 @@ in this page</entry>
|
|||
</programlisting>
|
||||
</para>
|
||||
</sect2>
|
||||
<sect2>
|
||||
<sect2 id="Constants_for_runtime_options">
|
||||
<title>Constants for runtime options</title>
|
||||
<para>
|
||||
These constants are defined in nand.h. They are ored together to describe
|
||||
|
@ -1171,7 +1171,7 @@ in this page</entry>
|
|||
</sect2>
|
||||
</sect1>
|
||||
|
||||
<sect1>
|
||||
<sect1 id="EEC_selection_constants">
|
||||
<title>ECC selection constants</title>
|
||||
<para>
|
||||
Use these constants to select the ECC algorithm.
|
||||
|
@ -1192,7 +1192,7 @@ in this page</entry>
|
|||
</para>
|
||||
</sect1>
|
||||
|
||||
<sect1>
|
||||
<sect1 id="Hardware_control_related_constants">
|
||||
<title>Hardware control related constants</title>
|
||||
<para>
|
||||
These constants describe the requested hardware access function when
|
||||
|
@ -1218,7 +1218,7 @@ in this page</entry>
|
|||
</para>
|
||||
</sect1>
|
||||
|
||||
<sect1>
|
||||
<sect1 id="Bad_block_table_constants">
|
||||
<title>Bad block table related constants</title>
|
||||
<para>
|
||||
These constants describe the options used for bad block
|
||||
|
|
|
@ -0,0 +1,106 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
|
||||
"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
|
||||
|
||||
<book id="LinuxNetworking">
|
||||
<bookinfo>
|
||||
<title>Linux Networking and Network Devices APIs</title>
|
||||
|
||||
<legalnotice>
|
||||
<para>
|
||||
This documentation is free software; you can redistribute
|
||||
it and/or modify it under the terms of the GNU General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2 of the License, or (at your option) any later
|
||||
version.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
This program is distributed in the hope that it will be
|
||||
useful, but WITHOUT ANY WARRANTY; without even the implied
|
||||
warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
See the GNU General Public License for more details.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
You should have received a copy of the GNU General Public
|
||||
License along with this program; if not, write to the Free
|
||||
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
|
||||
MA 02111-1307 USA
|
||||
</para>
|
||||
|
||||
<para>
|
||||
For more details see the file COPYING in the source
|
||||
distribution of Linux.
|
||||
</para>
|
||||
</legalnotice>
|
||||
</bookinfo>
|
||||
|
||||
<toc></toc>
|
||||
|
||||
<chapter id="netcore">
|
||||
<title>Linux Networking</title>
|
||||
<sect1><title>Networking Base Types</title>
|
||||
!Iinclude/linux/net.h
|
||||
</sect1>
|
||||
<sect1><title>Socket Buffer Functions</title>
|
||||
!Iinclude/linux/skbuff.h
|
||||
!Iinclude/net/sock.h
|
||||
!Enet/socket.c
|
||||
!Enet/core/skbuff.c
|
||||
!Enet/core/sock.c
|
||||
!Enet/core/datagram.c
|
||||
!Enet/core/stream.c
|
||||
</sect1>
|
||||
<sect1><title>Socket Filter</title>
|
||||
!Enet/core/filter.c
|
||||
</sect1>
|
||||
<sect1><title>Generic Network Statistics</title>
|
||||
!Iinclude/linux/gen_stats.h
|
||||
!Enet/core/gen_stats.c
|
||||
!Enet/core/gen_estimator.c
|
||||
</sect1>
|
||||
<sect1><title>SUN RPC subsystem</title>
|
||||
<!-- The !D functionality is not perfect, garbage has to be protected by comments
|
||||
!Dnet/sunrpc/sunrpc_syms.c
|
||||
-->
|
||||
!Enet/sunrpc/xdr.c
|
||||
!Enet/sunrpc/svc_xprt.c
|
||||
!Enet/sunrpc/xprt.c
|
||||
!Enet/sunrpc/sched.c
|
||||
!Enet/sunrpc/socklib.c
|
||||
!Enet/sunrpc/stats.c
|
||||
!Enet/sunrpc/rpc_pipe.c
|
||||
!Enet/sunrpc/rpcb_clnt.c
|
||||
!Enet/sunrpc/clnt.c
|
||||
</sect1>
|
||||
</chapter>
|
||||
|
||||
<chapter id="netdev">
|
||||
<title>Network device support</title>
|
||||
<sect1><title>Driver Support</title>
|
||||
!Enet/core/dev.c
|
||||
!Enet/ethernet/eth.c
|
||||
!Enet/sched/sch_generic.c
|
||||
!Iinclude/linux/etherdevice.h
|
||||
!Iinclude/linux/netdevice.h
|
||||
</sect1>
|
||||
<sect1><title>PHY Support</title>
|
||||
!Edrivers/net/phy/phy.c
|
||||
!Idrivers/net/phy/phy.c
|
||||
!Edrivers/net/phy/phy_device.c
|
||||
!Idrivers/net/phy/phy_device.c
|
||||
!Edrivers/net/phy/mdio_bus.c
|
||||
!Idrivers/net/phy/mdio_bus.c
|
||||
</sect1>
|
||||
<!-- FIXME: Removed for now since no structured comments in source
|
||||
<sect1><title>Wireless</title>
|
||||
X!Enet/core/wireless.c
|
||||
</sect1>
|
||||
-->
|
||||
<sect1><title>Synchronous PPP</title>
|
||||
!Edrivers/net/wan/syncppp.c
|
||||
</sect1>
|
||||
</chapter>
|
||||
|
||||
</book>
|
|
@ -85,7 +85,7 @@
|
|||
|
||||
|
||||
|
||||
<preface>
|
||||
<preface id="Preface">
|
||||
<title>Preface</title>
|
||||
|
||||
<para>
|
||||
|
@ -230,7 +230,7 @@
|
|||
|
||||
|
||||
|
||||
<sect1>
|
||||
<sect1 id="Creating_a_symlink">
|
||||
<title>Creating a symlink</title>
|
||||
|
||||
<funcsynopsis>
|
||||
|
@ -254,7 +254,7 @@
|
|||
</para>
|
||||
</sect1>
|
||||
|
||||
<sect1>
|
||||
<sect1 id="Creating_a_directory">
|
||||
<title>Creating a directory</title>
|
||||
|
||||
<funcsynopsis>
|
||||
|
@ -274,7 +274,7 @@
|
|||
|
||||
|
||||
|
||||
<sect1>
|
||||
<sect1 id="Removing_an_entry">
|
||||
<title>Removing an entry</title>
|
||||
|
||||
<funcsynopsis>
|
||||
|
@ -340,7 +340,7 @@ entry->write_proc = write_proc_foo;
|
|||
|
||||
|
||||
|
||||
<sect1>
|
||||
<sect1 id="Reading_data">
|
||||
<title>Reading data</title>
|
||||
|
||||
<para>
|
||||
|
@ -448,7 +448,7 @@ entry->write_proc = write_proc_foo;
|
|||
|
||||
|
||||
|
||||
<sect1>
|
||||
<sect1 id="Writing_data">
|
||||
<title>Writing data</title>
|
||||
|
||||
<para>
|
||||
|
@ -579,7 +579,7 @@ int foo_read_func(char *page, char **start, off_t off,
|
|||
|
||||
|
||||
|
||||
<sect1>
|
||||
<sect1 id="Modules">
|
||||
<title>Modules</title>
|
||||
|
||||
<para>
|
||||
|
@ -599,7 +599,7 @@ entry->owner = THIS_MODULE;
|
|||
|
||||
|
||||
|
||||
<sect1>
|
||||
<sect1 id="Mode_and_ownership">
|
||||
<title>Mode and ownership</title>
|
||||
|
||||
<para>
|
||||
|
|
|
@ -77,11 +77,11 @@
|
|||
<chapter id="bugs">
|
||||
<title>Known Bugs and Limitations</title>
|
||||
|
||||
<sect1>
|
||||
<sect1 id="known_bugs">
|
||||
<title>Bugs</title>
|
||||
<para>None. ;)</para>
|
||||
</sect1>
|
||||
<sect1>
|
||||
<sect1 id="Limitations">
|
||||
<title>Limitations</title>
|
||||
<para>
|
||||
<orderedlist>
|
||||
|
@ -100,7 +100,7 @@
|
|||
on devices, request/map memory region resources,
|
||||
and manage mailboxes/doorbells.
|
||||
</para>
|
||||
<sect1>
|
||||
<sect1 id="Functions">
|
||||
<title>Functions</title>
|
||||
!Iinclude/linux/rio_drv.h
|
||||
!Edrivers/rapidio/rio-driver.c
|
||||
|
@ -116,23 +116,23 @@
|
|||
subsystem.
|
||||
</para>
|
||||
|
||||
<sect1><title>Structures</title>
|
||||
<sect1 id="Structures"><title>Structures</title>
|
||||
!Iinclude/linux/rio.h
|
||||
</sect1>
|
||||
<sect1><title>Enumeration and Discovery</title>
|
||||
<sect1 id="Enumeration_and_Discovery"><title>Enumeration and Discovery</title>
|
||||
!Idrivers/rapidio/rio-scan.c
|
||||
</sect1>
|
||||
<sect1><title>Driver functionality</title>
|
||||
<sect1 id="Driver_functionality"><title>Driver functionality</title>
|
||||
!Idrivers/rapidio/rio.c
|
||||
!Idrivers/rapidio/rio-access.c
|
||||
</sect1>
|
||||
<sect1><title>Device model support</title>
|
||||
<sect1 id="Device_model_support"><title>Device model support</title>
|
||||
!Idrivers/rapidio/rio-driver.c
|
||||
</sect1>
|
||||
<sect1><title>Sysfs support</title>
|
||||
<sect1 id="Sysfs_support"><title>Sysfs support</title>
|
||||
!Idrivers/rapidio/rio-sysfs.c
|
||||
</sect1>
|
||||
<sect1><title>PPC32 support</title>
|
||||
<sect1 id="PPC32_support"><title>PPC32 support</title>
|
||||
!Iarch/powerpc/kernel/rio.c
|
||||
!Earch/powerpc/sysdev/fsl_rio.c
|
||||
!Iarch/powerpc/sysdev/fsl_rio.c
|
||||
|
|
|
@ -59,7 +59,7 @@
|
|||
<title>Introduction</title>
|
||||
<para>
|
||||
This document describes the interfaces available for device drivers that
|
||||
drive s390 based channel attached devices. This includes interfaces for
|
||||
drive s390 based channel attached I/O devices. This includes interfaces for
|
||||
interaction with the hardware and interfaces for interacting with the
|
||||
common driver core. Those interfaces are provided by the s390 common I/O
|
||||
layer.
|
||||
|
@ -86,9 +86,10 @@
|
|||
The ccw bus typically contains the majority of devices available to
|
||||
a s390 system. Named after the channel command word (ccw), the basic
|
||||
command structure used to address its devices, the ccw bus contains
|
||||
so-called channel attached devices. They are addressed via subchannels,
|
||||
visible on the css bus. A device driver, however, will never interact
|
||||
with the subchannel directly, but only via the device on the ccw bus,
|
||||
so-called channel attached devices. They are addressed via I/O
|
||||
subchannels, visible on the css bus. A device driver for
|
||||
channel-attached devices, however, will never interact with the
|
||||
subchannel directly, but only via the I/O device on the ccw bus,
|
||||
the ccw device.
|
||||
</para>
|
||||
<sect1 id="channelIO">
|
||||
|
@ -116,7 +117,6 @@
|
|||
!Iinclude/asm-s390/ccwdev.h
|
||||
!Edrivers/s390/cio/device.c
|
||||
!Edrivers/s390/cio/device_ops.c
|
||||
!Edrivers/s390/cio/airq.c
|
||||
</sect1>
|
||||
<sect1 id="cmf">
|
||||
<title>The channel-measurement facility</title>
|
||||
|
@ -147,4 +147,15 @@
|
|||
</sect1>
|
||||
</chapter>
|
||||
|
||||
<chapter id="genericinterfaces">
|
||||
<title>Generic interfaces</title>
|
||||
<para>
|
||||
Some interfaces are available to other drivers that do not necessarily
|
||||
have anything to do with the busses described above, but still are
|
||||
indirectly using basic infrastructure in the common I/O layer.
|
||||
One example is the support for adapter interrupts.
|
||||
</para>
|
||||
!Edrivers/s390/cio/airq.c
|
||||
</chapter>
|
||||
|
||||
</book>
|
||||
|
|
|
@ -12,7 +12,7 @@
|
|||
<surname>Bottomley</surname>
|
||||
<affiliation>
|
||||
<address>
|
||||
<email>James.Bottomley@steeleye.com</email>
|
||||
<email>James.Bottomley@hansenpartnership.com</email>
|
||||
</address>
|
||||
</affiliation>
|
||||
</author>
|
||||
|
|
|
@ -170,7 +170,7 @@ int __init myradio_init(struct video_init *v)
|
|||
<para>
|
||||
The types available are
|
||||
</para>
|
||||
<table frame="all"><title>Device Types</title>
|
||||
<table frame="all" id="Device_Types"><title>Device Types</title>
|
||||
<tgroup cols="3" align="left">
|
||||
<tbody>
|
||||
<row>
|
||||
|
@ -291,7 +291,7 @@ static int radio_ioctl(struct video_device *dev, unsigned int cmd, void *arg)
|
|||
allows the applications to find out what sort of a card they have found and
|
||||
to figure out what they want to do about it. The fields in the structure are
|
||||
</para>
|
||||
<table frame="all"><title>struct video_capability fields</title>
|
||||
<table frame="all" id="video_capability_fields"><title>struct video_capability fields</title>
|
||||
<tgroup cols="2" align="left">
|
||||
<tbody>
|
||||
<row>
|
||||
|
@ -365,7 +365,7 @@ static int radio_ioctl(struct video_device *dev, unsigned int cmd, void *arg)
|
|||
<para>
|
||||
The video_tuner structure has the following fields
|
||||
</para>
|
||||
<table frame="all"><title>struct video_tuner fields</title>
|
||||
<table frame="all" id="video_tuner_fields"><title>struct video_tuner fields</title>
|
||||
<tgroup cols="2" align="left">
|
||||
<tbody>
|
||||
<row>
|
||||
|
@ -398,7 +398,7 @@ static int radio_ioctl(struct video_device *dev, unsigned int cmd, void *arg)
|
|||
</tgroup>
|
||||
</table>
|
||||
|
||||
<table frame="all"><title>struct video_tuner flags</title>
|
||||
<table frame="all" id="video_tuner_flags"><title>struct video_tuner flags</title>
|
||||
<tgroup cols="2" align="left">
|
||||
<tbody>
|
||||
<row>
|
||||
|
@ -421,7 +421,7 @@ static int radio_ioctl(struct video_device *dev, unsigned int cmd, void *arg)
|
|||
</tgroup>
|
||||
</table>
|
||||
|
||||
<table frame="all"><title>struct video_tuner modes</title>
|
||||
<table frame="all" id="video_tuner_modes"><title>struct video_tuner modes</title>
|
||||
<tgroup cols="2" align="left">
|
||||
<tbody>
|
||||
<row>
|
||||
|
@ -572,7 +572,7 @@ static int current_volume=0;
|
|||
<para>
|
||||
Then we fill in the video_audio structure. This has the following format
|
||||
</para>
|
||||
<table frame="all"><title>struct video_audio fields</title>
|
||||
<table frame="all" id="video_audio_fields"><title>struct video_audio fields</title>
|
||||
<tgroup cols="2" align="left">
|
||||
<tbody>
|
||||
<row>
|
||||
|
@ -607,7 +607,7 @@ static int current_volume=0;
|
|||
</tgroup>
|
||||
</table>
|
||||
|
||||
<table frame="all"><title>struct video_audio flags</title>
|
||||
<table frame="all" id="video_audio_flags"><title>struct video_audio flags</title>
|
||||
<tgroup cols="2" align="left">
|
||||
<tbody>
|
||||
<row>
|
||||
|
@ -625,7 +625,7 @@ static int current_volume=0;
|
|||
</tgroup>
|
||||
</table>
|
||||
|
||||
<table frame="all"><title>struct video_audio modes</title>
|
||||
<table frame="all" id="video_audio_modes"><title>struct video_audio modes</title>
|
||||
<tgroup cols="2" align="left">
|
||||
<tbody>
|
||||
<row>
|
||||
|
@ -775,7 +775,7 @@ module_exit(cleanup);
|
|||
</para>
|
||||
</sect1>
|
||||
</chapter>
|
||||
<chapter>
|
||||
<chapter id="Video_Capture_Devices">
|
||||
<title>Video Capture Devices</title>
|
||||
<sect1 id="introvid">
|
||||
<title>Video Capture Device Types</title>
|
||||
|
@ -855,7 +855,7 @@ static struct video_device my_camera
|
|||
We use the extra video capability flags that did not apply to the
|
||||
radio interface. The video related flags are
|
||||
</para>
|
||||
<table frame="all"><title>Capture Capabilities</title>
|
||||
<table frame="all" id="Capture_Capabilities"><title>Capture Capabilities</title>
|
||||
<tgroup cols="2" align="left">
|
||||
<tbody>
|
||||
<row>
|
||||
|
@ -1195,7 +1195,7 @@ static int camera_ioctl(struct video_device *dev, unsigned int cmd, void *arg)
|
|||
inputs to the video card). Our example card has a single camera input. The
|
||||
fields in the structure are
|
||||
</para>
|
||||
<table frame="all"><title>struct video_channel fields</title>
|
||||
<table frame="all" id="video_channel_fields"><title>struct video_channel fields</title>
|
||||
<tgroup cols="2" align="left">
|
||||
<tbody>
|
||||
<row>
|
||||
|
@ -1218,7 +1218,7 @@ static int camera_ioctl(struct video_device *dev, unsigned int cmd, void *arg)
|
|||
</tbody>
|
||||
</tgroup>
|
||||
</table>
|
||||
<table frame="all"><title>struct video_channel flags</title>
|
||||
<table frame="all" id="video_channel_flags"><title>struct video_channel flags</title>
|
||||
<tgroup cols="2" align="left">
|
||||
<tbody>
|
||||
<row>
|
||||
|
@ -1229,7 +1229,7 @@ static int camera_ioctl(struct video_device *dev, unsigned int cmd, void *arg)
|
|||
</tbody>
|
||||
</tgroup>
|
||||
</table>
|
||||
<table frame="all"><title>struct video_channel types</title>
|
||||
<table frame="all" id="video_channel_types"><title>struct video_channel types</title>
|
||||
<tgroup cols="2" align="left">
|
||||
<tbody>
|
||||
<row>
|
||||
|
@ -1242,7 +1242,7 @@ static int camera_ioctl(struct video_device *dev, unsigned int cmd, void *arg)
|
|||
</tbody>
|
||||
</tgroup>
|
||||
</table>
|
||||
<table frame="all"><title>struct video_channel norms</title>
|
||||
<table frame="all" id="video_channel_norms"><title>struct video_channel norms</title>
|
||||
<tgroup cols="2" align="left">
|
||||
<tbody>
|
||||
<row>
|
||||
|
@ -1328,7 +1328,7 @@ static int camera_ioctl(struct video_device *dev, unsigned int cmd, void *arg)
|
|||
for every other pixel in the image. The other common formats the interface
|
||||
defines are
|
||||
</para>
|
||||
<table frame="all"><title>Framebuffer Encodings</title>
|
||||
<table frame="all" id="Framebuffer_Encodings"><title>Framebuffer Encodings</title>
|
||||
<tgroup cols="2" align="left">
|
||||
<tbody>
|
||||
<row>
|
||||
|
@ -1466,7 +1466,7 @@ static struct video_buffer capture_fb;
|
|||
display. The video_window structure is used to describe the way the image
|
||||
should be displayed.
|
||||
</para>
|
||||
<table frame="all"><title>struct video_window fields</title>
|
||||
<table frame="all" id="video_window_fields"><title>struct video_window fields</title>
|
||||
<tgroup cols="2" align="left">
|
||||
<tbody>
|
||||
<row>
|
||||
|
@ -1503,7 +1503,7 @@ static struct video_buffer capture_fb;
|
|||
<para>
|
||||
Each clip is a struct video_clip which has the following fields
|
||||
</para>
|
||||
<table frame="all"><title>video_clip fields</title>
|
||||
<table frame="all" id="video_clip_fields"><title>video_clip fields</title>
|
||||
<tgroup cols="2" align="left">
|
||||
<tbody>
|
||||
<row>
|
||||
|
|
|
@ -77,7 +77,7 @@
|
|||
</para>
|
||||
</chapter>
|
||||
|
||||
<chapter>
|
||||
<chapter id="Driver_Modes">
|
||||
<title>Driver Modes</title>
|
||||
<para>
|
||||
The Z85230 driver layer can drive Z8530, Z85C30 and Z85230 devices
|
||||
|
@ -108,7 +108,7 @@
|
|||
</para>
|
||||
</chapter>
|
||||
|
||||
<chapter>
|
||||
<chapter id="Using_the_Z85230_driver">
|
||||
<title>Using the Z85230 driver</title>
|
||||
<para>
|
||||
The Z85230 driver provides the back end interface to your board. To
|
||||
|
@ -174,7 +174,7 @@
|
|||
</para>
|
||||
</chapter>
|
||||
|
||||
<chapter>
|
||||
<chapter id="Attaching_Network_Interfaces">
|
||||
<title>Attaching Network Interfaces</title>
|
||||
<para>
|
||||
If you wish to use the network interface facilities of the driver,
|
||||
|
@ -216,7 +216,7 @@
|
|||
</para>
|
||||
</chapter>
|
||||
|
||||
<chapter>
|
||||
<chapter id="Configuring_And_Activating_The_Port">
|
||||
<title>Configuring And Activating The Port</title>
|
||||
<para>
|
||||
The Z85230 driver provides helper functions and tables to load the
|
||||
|
@ -300,7 +300,7 @@
|
|||
</para>
|
||||
</chapter>
|
||||
|
||||
<chapter>
|
||||
<chapter id="Network_Layer_Functions">
|
||||
<title>Network Layer Functions</title>
|
||||
<para>
|
||||
The Z8530 layer provides functions to queue packets for
|
||||
|
@ -327,7 +327,7 @@
|
|||
</para>
|
||||
</chapter>
|
||||
|
||||
<chapter>
|
||||
<chapter id="Porting_The_Z8530_Driver">
|
||||
<title>Porting The Z8530 Driver</title>
|
||||
<para>
|
||||
The Z8530 driver is written to be portable. In DMA mode it makes
|
||||
|
|
|
@ -25,7 +25,7 @@ the NMI handler to take the default machine-specific action.
|
|||
This nmi_callback variable is a global function pointer to the current
|
||||
NMI handler.
|
||||
|
||||
fastcall void do_nmi(struct pt_regs * regs, long error_code)
|
||||
void do_nmi(struct pt_regs * regs, long error_code)
|
||||
{
|
||||
int cpu;
|
||||
|
||||
|
|
|
@ -0,0 +1,493 @@
|
|||
|
||||
|
||||
"Good for you, you've decided to clean the elevator!"
|
||||
- The Elevator, from Dark Star
|
||||
|
||||
Smack is the Simplified Mandatory Access Control Kernel.
|
||||
Smack is a kernel based implementation of mandatory access
|
||||
control that includes simplicity in its primary design goals.
|
||||
|
||||
Smack is not the only Mandatory Access Control scheme
|
||||
available for Linux. Those new to Mandatory Access Control
|
||||
are encouraged to compare Smack with the other mechanisms
|
||||
available to determine which is best suited to the problem
|
||||
at hand.
|
||||
|
||||
Smack consists of three major components:
|
||||
- The kernel
|
||||
- A start-up script and a few modified applications
|
||||
- Configuration data
|
||||
|
||||
The kernel component of Smack is implemented as a Linux
|
||||
Security Modules (LSM) module. It requires netlabel and
|
||||
works best with file systems that support extended attributes,
|
||||
although xattr support is not strictly required.
|
||||
It is safe to run a Smack kernel under a "vanilla" distribution.
|
||||
Smack kernels use the CIPSO IP option. Some network
|
||||
configurations are intolerant of IP options and can impede
|
||||
access to systems that use them as Smack does.
|
||||
|
||||
The startup script etc-init.d-smack should be installed
|
||||
in /etc/init.d/smack and should be invoked early in the
|
||||
start-up process. On Fedora rc5.d/S02smack is recommended.
|
||||
This script ensures that certain devices have the correct
|
||||
Smack attributes and loads the Smack configuration if
|
||||
any is defined. This script invokes two programs that
|
||||
ensure configuration data is properly formatted. These
|
||||
programs are /usr/sbin/smackload and /usr/sbin/smackcipso.
|
||||
The system will run just fine without these programs,
|
||||
but it will be difficult to set access rules properly.
|
||||
|
||||
A version of "ls" that provides a "-M" option to display
|
||||
Smack labels on long listing is available.
|
||||
|
||||
A hacked version of sshd that allows network logins by users
|
||||
with specific Smack labels is available. This version does
|
||||
not work for scp. You must set the /etc/ssh/sshd_config
|
||||
line:
|
||||
UsePrivilegeSeparation no
|
||||
|
||||
The format of /etc/smack/usr is:
|
||||
|
||||
username smack
|
||||
|
||||
In keeping with the intent of Smack, configuration data is
|
||||
minimal and not strictly required. The most important
|
||||
configuration step is mounting the smackfs pseudo filesystem.
|
||||
|
||||
Add this line to /etc/fstab:
|
||||
|
||||
smackfs /smack smackfs smackfsdef=* 0 0
|
||||
|
||||
and create the /smack directory for mounting.
|
||||
|
||||
Smack uses extended attributes (xattrs) to store file labels.
|
||||
The command to set a Smack label on a file is:
|
||||
|
||||
# attr -S -s SMACK64 -V "value" path
|
||||
|
||||
NOTE: Smack labels are limited to 23 characters. The attr command
|
||||
does not enforce this restriction and can be used to set
|
||||
invalid Smack labels on files.
|
||||
|
||||
If you don't do anything special all users will get the floor ("_")
|
||||
label when they log in. If you do want to log in via the hacked ssh
|
||||
at other labels use the attr command to set the smack value on the
|
||||
home directory and its contents.
|
||||
|
||||
You can add access rules in /etc/smack/accesses. They take the form:
|
||||
|
||||
subjectlabel objectlabel access
|
||||
|
||||
access is a combination of the letters rwxa which specify the
|
||||
kind of access permitted a subject with subjectlabel on an
|
||||
object with objectlabel. If there is no rule no access is allowed.
|
||||
|
||||
A process can see the smack label it is running with by
|
||||
reading /proc/self/attr/current. A privileged process can
|
||||
set the process smack by writing there.
|
||||
|
||||
Look for additional programs on http://schaufler-ca.com
|
||||
|
||||
From the Smack Whitepaper:
|
||||
|
||||
The Simplified Mandatory Access Control Kernel
|
||||
|
||||
Casey Schaufler
|
||||
casey@schaufler-ca.com
|
||||
|
||||
Mandatory Access Control
|
||||
|
||||
Computer systems employ a variety of schemes to constrain how information is
|
||||
shared among the people and services using the machine. Some of these schemes
|
||||
allow the program or user to decide what other programs or users are allowed
|
||||
access to pieces of data. These schemes are called discretionary access
|
||||
control mechanisms because the access control is specified at the discretion
|
||||
of the user. Other schemes do not leave the decision regarding what a user or
|
||||
program can access up to users or programs. These schemes are called mandatory
|
||||
access control mechanisms because you don't have a choice regarding the users
|
||||
or programs that have access to pieces of data.
|
||||
|
||||
Bell & LaPadula
|
||||
|
||||
From the middle of the 1980's until the turn of the century Mandatory Access
|
||||
Control (MAC) was very closely associated with the Bell & LaPadula security
|
||||
model, a mathematical description of the United States Department of Defense
|
||||
policy for marking paper documents. MAC in this form enjoyed a following
|
||||
within the Capital Beltway and Scandinavian supercomputer centers but was
|
||||
often cited as failing to address general needs.
|
||||
|
||||
Domain Type Enforcement
|
||||
|
||||
Around the turn of the century Domain Type Enforcement (DTE) became popular.
|
||||
This scheme organizes users, programs, and data into domains that are
|
||||
protected from each other. This scheme has been widely deployed as a component
|
||||
of popular Linux distributions. The administrative overhead required to
|
||||
maintain this scheme and the detailed understanding of the whole system
|
||||
necessary to provide a secure domain mapping leads to the scheme being
|
||||
disabled or used in limited ways in the majority of cases.
|
||||
|
||||
Smack
|
||||
|
||||
Smack is a Mandatory Access Control mechanism designed to provide useful MAC
|
||||
while avoiding the pitfalls of its predecessors. The limitations of Bell &
|
||||
LaPadula are addressed by providing a scheme whereby access can be controlled
|
||||
according to the requirements of the system and its purpose rather than those
|
||||
imposed by an arcane government policy. The complexity of Domain Type
|
||||
Enforcement is avoided by defining access controls in terms of the access
|
||||
modes already in use.
|
||||
|
||||
Smack Terminology
|
||||
|
||||
The jargon used to talk about Smack will be familiar to those who have dealt
|
||||
with other MAC systems and shouldn't be too difficult for the uninitiated to
|
||||
pick up. There are four terms that are used in a specific way and that are
|
||||
especially important:
|
||||
|
||||
Subject: A subject is an active entity on the computer system.
|
||||
On Smack a subject is a task, which is in turn the basic unit
|
||||
of execution.
|
||||
|
||||
Object: An object is a passive entity on the computer system.
|
||||
On Smack files of all types, IPC, and tasks can be objects.
|
||||
|
||||
Access: Any attempt by a subject to put information into or get
|
||||
information from an object is an access.
|
||||
|
||||
Label: Data that identifies the Mandatory Access Control
|
||||
characteristics of a subject or an object.
|
||||
|
||||
These definitions are consistent with the traditional use in the security
|
||||
community. There are also some terms from Linux that are likely to crop up:
|
||||
|
||||
Capability: A task that possesses a capability has permission to
|
||||
violate an aspect of the system security policy, as identified by
|
||||
the specific capability. A task that possesses one or more
|
||||
capabilities is a privileged task, whereas a task with no
|
||||
capabilities is an unprivileged task.
|
||||
|
||||
Privilege: A task that is allowed to violate the system security
|
||||
policy is said to have privilege. As of this writing a task can
|
||||
have privilege either by possessing capabilities or by having an
|
||||
effective user of root.
|
||||
|
||||
Smack Basics
|
||||
|
||||
Smack is an extension to a Linux system. It enforces additional restrictions
|
||||
on what subjects can access which objects, based on the labels attached to
|
||||
each of the subject and the object.
|
||||
|
||||
Labels
|
||||
|
||||
Smack labels are ASCII character strings, one to twenty-three characters in
|
||||
length. Single character labels using special characters, that being anything
|
||||
other than a letter or digit, are reserved for use by the Smack development
|
||||
team. Smack labels are unstructured, case sensitive, and the only operation
|
||||
ever performed on them is comparison for equality. Smack labels cannot
|
||||
contain unprintable characters or the "/" (slash) character.
|
||||
|
||||
There are some predefined labels:
|
||||
|
||||
_ Pronounced "floor", a single underscore character.
|
||||
^ Pronounced "hat", a single circumflex character.
|
||||
* Pronounced "star", a single asterisk character.
|
||||
? Pronounced "huh", a single question mark character.
|
||||
|
||||
Every task on a Smack system is assigned a label. System tasks, such as
|
||||
init(8) and system daemons, are run with the floor ("_") label. User tasks
|
||||
are assigned labels according to the specification found in the
|
||||
/etc/smack/user configuration file.
|
||||
|
||||
Access Rules
|
||||
|
||||
Smack uses the traditional access modes of Linux. These modes are read,
|
||||
execute, write, and occasionally append. There are a few cases where the
|
||||
access mode may not be obvious. These include:
|
||||
|
||||
Signals: A signal is a write operation from the subject task to
|
||||
the object task.
|
||||
Internet Domain IPC: Transmission of a packet is considered a
|
||||
write operation from the source task to the destination task.
|
||||
|
||||
Smack restricts access based on the label attached to a subject and the label
|
||||
attached to the object it is trying to access. The rules enforced are, in
|
||||
order:
|
||||
|
||||
1. Any access requested by a task labeled "*" is denied.
|
||||
2. A read or execute access requested by a task labeled "^"
|
||||
is permitted.
|
||||
3. A read or execute access requested on an object labeled "_"
|
||||
is permitted.
|
||||
4. Any access requested on an object labeled "*" is permitted.
|
||||
5. Any access requested by a task on an object with the same
|
||||
label is permitted.
|
||||
6. Any access requested that is explicitly defined in the loaded
|
||||
rule set is permitted.
|
||||
7. Any other access is denied.
|
||||
|
||||
Smack Access Rules
|
||||
|
||||
With the isolation provided by Smack, access separation is simple. There are
|
||||
many interesting cases where limited access by subjects to objects with
|
||||
different labels is desired. One example is the familiar spy model of
|
||||
sensitivity, where a scientist working on a highly classified project would be
|
||||
able to read documents of lower classifications and anything she writes will
|
||||
be "born" highly classified. To accommodate such schemes Smack includes a
|
||||
mechanism for specifying rules allowing access between labels.
|
||||
|
||||
Access Rule Format
|
||||
|
||||
The format of an access rule is:
|
||||
|
||||
subject-label object-label access
|
||||
|
||||
Where subject-label is the Smack label of the task, object-label is the Smack
|
||||
label of the thing being accessed, and access is a string specifying the sort
|
||||
of access allowed. The Smack labels are limited to 23 characters. The access
|
||||
specification is searched for letters that describe access modes:
|
||||
|
||||
a: indicates that append access should be granted.
|
||||
r: indicates that read access should be granted.
|
||||
w: indicates that write access should be granted.
|
||||
x: indicates that execute access should be granted.
|
||||
|
||||
Uppercase values for the specification letters are allowed as well.
|
||||
Access mode specifications can be in any order. Examples of acceptable rules
|
||||
are:
|
||||
|
||||
TopSecret Secret rx
|
||||
Secret Unclass R
|
||||
Manager Game x
|
||||
User HR w
|
||||
New Old rRrRr
|
||||
Closed Off -
|
||||
|
||||
Examples of unacceptable rules are:
|
||||
|
||||
Top Secret Secret rx
|
||||
Ace Ace r
|
||||
Odd spells waxbeans
|
||||
|
||||
Spaces are not allowed in labels. Since a subject always has access to files
|
||||
with the same label specifying a rule for that case is pointless. Only
|
||||
valid letters (rwxaRWXA) and the dash ('-') character are allowed in
|
||||
access specifications. The dash is a placeholder, so "a-r" is the same
|
||||
as "ar". A lone dash is used to specify that no access should be allowed.
|
||||
|
||||
Applying Access Rules
|
||||
|
||||
The developers of Linux rarely define new sorts of things, usually importing
|
||||
schemes and concepts from other systems. Most often, the other systems are
|
||||
variants of Unix. Unix has many endearing properties, but consistency of
|
||||
access control models is not one of them. Smack strives to treat accesses as
|
||||
uniformly as is sensible while keeping with the spirit of the underlying
|
||||
mechanism.
|
||||
|
||||
File system objects including files, directories, named pipes, symbolic links,
|
||||
and devices require access permissions that closely match those used by mode
|
||||
bit access. To open a file for reading read access is required on the file. To
|
||||
search a directory requires execute access. Creating a file with write access
|
||||
requires both read and write access on the containing directory. Deleting a
|
||||
file requires read and write access to the file and to the containing
|
||||
directory. It is possible that a user may be able to see that a file exists
|
||||
but not any of its attributes by the circumstance of having read access to the
|
||||
containing directory but not to the differently labeled file. This is an
|
||||
artifact of the file name being data in the directory, not a part of the file.
|
||||
|
||||
IPC objects, message queues, semaphore sets, and memory segments exist in flat
|
||||
namespaces and access requests are only required to match the object in
|
||||
question.
|
||||
|
||||
Process objects reflect tasks on the system and the Smack label used to access
|
||||
them is the same Smack label that the task would use for its own access
|
||||
attempts. Sending a signal via the kill() system call is a write operation
|
||||
from the signaler to the recipient. Debugging a process requires both reading
|
||||
and writing. Creating a new task is an internal operation that results in two
|
||||
tasks with identical Smack labels and requires no access checks.
|
||||
|
||||
Sockets are data structures attached to processes and sending a packet from
|
||||
one process to another requires that the sender have write access to the
|
||||
receiver. The receiver is not required to have read access to the sender.
|
||||
|
||||
Setting Access Rules
|
||||
|
||||
The configuration file /etc/smack/accesses contains the rules to be set at
|
||||
system startup. The contents are written to the special file /smack/load.
|
||||
Rules can be written to /smack/load at any time and take effect immediately.
|
||||
For any pair of subject and object labels there can be only one rule, with the
|
||||
most recently specified overriding any earlier specification.
|
||||
|
||||
The program smackload is provided to ensure data is formatted
|
||||
properly when written to /smack/load. This program reads lines
|
||||
of the form
|
||||
|
||||
subjectlabel objectlabel mode.
|
||||
|
||||
Task Attribute
|
||||
|
||||
The Smack label of a process can be read from /proc/<pid>/attr/current. A
|
||||
process can read its own Smack label from /proc/self/attr/current. A
|
||||
privileged process can change its own Smack label by writing to
|
||||
/proc/self/attr/current but not the label of another process.
|
||||
|
||||
File Attribute
|
||||
|
||||
The Smack label of a filesystem object is stored as an extended attribute
|
||||
named SMACK64 on the file. This attribute is in the security namespace. It can
|
||||
only be changed by a process with privilege.
|
||||
|
||||
Privilege
|
||||
|
||||
A process with CAP_MAC_OVERRIDE is privileged.
|
||||
|
||||
Smack Networking
|
||||
|
||||
As mentioned before, Smack enforces access control on network protocol
|
||||
transmissions. Every packet sent by a Smack process is tagged with its Smack
|
||||
label. This is done by adding a CIPSO tag to the header of the IP packet. Each
|
||||
packet received is expected to have a CIPSO tag that identifies the label and
|
||||
if it lacks such a tag the network ambient label is assumed. Before the packet
|
||||
is delivered a check is made to determine that a subject with the label on the
|
||||
packet has write access to the receiving process and if that is not the case
|
||||
the packet is dropped.
|
||||
|
||||
CIPSO Configuration
|
||||
|
||||
It is normally unnecessary to specify the CIPSO configuration. The default
|
||||
values used by the system handle all internal cases. Smack will compose CIPSO
|
||||
label values to match the Smack labels being used without administrative
|
||||
intervention. Unlabeled packets that come into the system will be given the
|
||||
ambient label.
|
||||
|
||||
Smack requires configuration in the case where packets from a system that is
|
||||
not smack that speaks CIPSO may be encountered. Usually this will be a Trusted
|
||||
Solaris system, but there are other, less widely deployed systems out there.
|
||||
CIPSO provides 3 important values, a Domain Of Interpretation (DOI), a level,
|
||||
and a category set with each packet. The DOI is intended to identify a group
|
||||
of systems that use compatible labeling schemes, and the DOI specified on the
|
||||
smack system must match that of the remote system or packets will be
|
||||
discarded. The DOI is 3 by default. The value can be read from /smack/doi and
|
||||
can be changed by writing to /smack/doi.
|
||||
|
||||
The level and category set are mapped to a Smack label as defined in
|
||||
/etc/smack/cipso.
|
||||
|
||||
A Smack/CIPSO mapping has the form:
|
||||
|
||||
smack level [category [category]*]
|
||||
|
||||
Smack does not expect the level or category sets to be related in any
|
||||
particular way and does not assume or assign accesses based on them. Some
|
||||
examples of mappings:
|
||||
|
||||
TopSecret 7
|
||||
TS:A,B 7 1 2
|
||||
SecBDE 5 2 4 6
|
||||
RAFTERS 7 12 26
|
||||
|
||||
The ":" and "," characters are permitted in a Smack label but have no special
|
||||
meaning.
|
||||
|
||||
The mapping of Smack labels to CIPSO values is defined by writing to
|
||||
/smack/cipso. Again, the format of data written to this special file
|
||||
is highly restrictive, so the program smackcipso is provided to
|
||||
ensure the writes are done properly. This program takes mappings
|
||||
on the standard input and sends them to /smack/cipso properly.
|
||||
|
||||
In addition to explicit mappings Smack supports direct CIPSO mappings. One
|
||||
CIPSO level is used to indicate that the category set passed in the packet is
|
||||
in fact an encoding of the Smack label. The level used is 250 by default. The
|
||||
value can be read from /smack/direct and changed by writing to /smack/direct.
|
||||
|
||||
Socket Attributes
|
||||
|
||||
There are two attributes that are associated with sockets. These attributes
|
||||
can only be set by privileged tasks, but any task can read them for their own
|
||||
sockets.
|
||||
|
||||
SMACK64IPIN: The Smack label of the task object. A privileged
|
||||
program that will enforce policy may set this to the star label.
|
||||
|
||||
SMACK64IPOUT: The Smack label transmitted with outgoing packets.
|
||||
A privileged program may set this to match the label of another
|
||||
task with which it hopes to communicate.
|
||||
|
||||
Writing Applications for Smack
|
||||
|
||||
There are three sorts of applications that will run on a Smack system. How an
|
||||
application interacts with Smack will determine what it will have to do to
|
||||
work properly under Smack.
|
||||
|
||||
Smack Ignorant Applications
|
||||
|
||||
By far the majority of applications have no reason whatever to care about the
|
||||
unique properties of Smack. Since invoking a program has no impact on the
|
||||
Smack label associated with the process the only concern likely to arise is
|
||||
whether the process has execute access to the program.
|
||||
|
||||
Smack Relevant Applications
|
||||
|
||||
Some programs can be improved by teaching them about Smack, but do not make
|
||||
any security decisions themselves. The utility ls(1) is one example of such a
|
||||
program.
|
||||
|
||||
Smack Enforcing Applications
|
||||
|
||||
These are special programs that not only know about Smack, but participate in
|
||||
the enforcement of system policy. In most cases these are the programs that
|
||||
set up user sessions. There are also network services that provide information
|
||||
to processes running with various labels.
|
||||
|
||||
File System Interfaces
|
||||
|
||||
Smack maintains labels on file system objects using extended attributes. The
|
||||
Smack label of a file, directory, or other file system object can be obtained
|
||||
using getxattr(2).
|
||||
|
||||
len = getxattr("/", "security.SMACK64", value, sizeof (value));
|
||||
|
||||
will put the Smack label of the root directory into value. A privileged
|
||||
process can set the Smack label of a file system object with setxattr(2).
|
||||
|
||||
len = strlen("Rubble");
|
||||
rc = setxattr("/foo", "security.SMACK64", "Rubble", len, 0);
|
||||
|
||||
will set the Smack label of /foo to "Rubble" if the program has appropriate
|
||||
privilege.
|
||||
|
||||
Socket Interfaces
|
||||
|
||||
The socket attributes can be read using fgetxattr(2).
|
||||
|
||||
A privileged process can set the Smack label of outgoing packets with
|
||||
fsetxattr(2).
|
||||
|
||||
len = strlen("Rubble");
|
||||
rc = fsetxattr(fd, "security.SMACK64IPOUT", "Rubble", len, 0);
|
||||
|
||||
will set the Smack label "Rubble" on packets going out from the socket if the
|
||||
program has appropriate privilege.
|
||||
|
||||
rc = fsetxattr(fd, "security.SMACK64IPIN", "*", strlen("*"), 0);
|
||||
|
||||
will set the Smack label "*" as the object label against which incoming
|
||||
packets will be checked if the program has appropriate privilege.
|
||||
|
||||
Administration
|
||||
|
||||
Smack supports some mount options:
|
||||
|
||||
smackfsdef=label: specifies the label to give files that lack
|
||||
the Smack label extended attribute.
|
||||
|
||||
smackfsroot=label: specifies the label to assign the root of the
|
||||
file system if it lacks the Smack extended attribute.
|
||||
|
||||
smackfshat=label: specifies a label that must have read access to
|
||||
all labels set on the filesystem. Not yet enforced.
|
||||
|
||||
smackfsfloor=label: specifies a label to which all labels set on the
|
||||
filesystem must have read access. Not yet enforced.
|
||||
|
||||
These mount options apply to all file system types.
|
||||
|
|
@ -20,7 +20,11 @@ kernel patches.
|
|||
4: ppc64 is a good architecture for cross-compilation checking because it
|
||||
tends to use `unsigned long' for 64-bit quantities.
|
||||
|
||||
5: Matches kernel coding style(!)
|
||||
5: Check your patch for general style as detailed in
|
||||
Documentation/CodingStyle. Check for trivial violations with the
|
||||
patch style checker prior to submission (scripts/checkpatch.pl).
|
||||
You should be able to justify all violations that remain in
|
||||
your patch.
|
||||
|
||||
6: Any new or modified CONFIG options don't muck up the config menu.
|
||||
|
||||
|
@ -79,13 +83,3 @@ kernel patches.
|
|||
23: Tested after it has been merged into the -mm patchset to make sure
|
||||
that it still works with all of the other queued patches and various
|
||||
changes in the VM, VFS, and other subsystems.
|
||||
|
||||
24: Avoid whitespace damage such as indenting with spaces or whitespace
|
||||
at the end of lines. You can test this by feeding the patch to
|
||||
"git apply --check --whitespace=error-all"
|
||||
|
||||
25: Check your patch for general style as detailed in
|
||||
Documentation/CodingStyle. Check for trivial violations with the
|
||||
patch style checker prior to submission (scripts/checkpatch.pl).
|
||||
You should be able to justify all violations that remain in
|
||||
your patch.
|
||||
|
|
|
@ -168,7 +168,7 @@ int get_family_id(int sd)
|
|||
char buf[256];
|
||||
} ans;
|
||||
|
||||
int id, rc;
|
||||
int id = 0, rc;
|
||||
struct nlattr *na;
|
||||
int rep_len;
|
||||
|
||||
|
@ -209,7 +209,7 @@ void print_delayacct(struct taskstats *t)
|
|||
void task_context_switch_counts(struct taskstats *t)
|
||||
{
|
||||
printf("\n\nTask %15s%15s\n"
|
||||
" %15lu%15lu\n",
|
||||
" %15llu%15llu\n",
|
||||
"voluntary", "nonvoluntary",
|
||||
t->nvcsw, t->nivcsw);
|
||||
}
|
||||
|
@ -399,7 +399,7 @@ int main(int argc, char *argv[])
|
|||
goto done;
|
||||
}
|
||||
|
||||
PRINTF("nlmsghdr size=%d, nlmsg_len=%d, rep_len=%d\n",
|
||||
PRINTF("nlmsghdr size=%zu, nlmsg_len=%d, rep_len=%d\n",
|
||||
sizeof(struct nlmsghdr), msg.n.nlmsg_len, rep_len);
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,15 @@
|
|||
Linux supports two methods of overriding the BIOS DSDT:
|
||||
|
||||
CONFIG_ACPI_CUSTOM_DSDT builds the image into the kernel.
|
||||
|
||||
CONFIG_ACPI_CUSTOM_DSDT_INITRD adds the image to the initrd.
|
||||
|
||||
When to use these methods is described in detail on the
|
||||
Linux/ACPI home page:
|
||||
http://www.lesswatts.org/projects/acpi/overridingDSDT.php
|
||||
|
||||
Note that if both options are used, the DSDT supplied
|
||||
by the INITRD method takes precedence.
|
||||
|
||||
Documentation/initramfs-add-dsdt.sh is provided for convenience
|
||||
for use with the CONFIG_ACPI_CUSTOM_DSDT_INITRD method.
|
|
@ -0,0 +1,43 @@
|
|||
#!/bin/bash
|
||||
# Adds a DSDT file to the initrd (if it's an initramfs)
|
||||
# first argument is the name of archive
|
||||
# second argument is the name of the file to add
|
||||
# The file will be copied as /DSDT.aml
|
||||
|
||||
# 20060126: fix "Premature end of file" with some old cpio (Roland Robic)
|
||||
# 20060205: this time it should really work
|
||||
|
||||
# check the arguments
|
||||
if [ $# -ne 2 ]; then
|
||||
program_name=$(basename $0)
|
||||
echo "\
|
||||
$program_name: too few arguments
|
||||
Usage: $program_name initrd-name.img DSDT-to-add.aml
|
||||
Adds a DSDT file to an initrd (in initramfs format)
|
||||
|
||||
initrd-name.img: filename of the initrd in initramfs format
|
||||
DSDT-to-add.aml: filename of the DSDT file to add
|
||||
" 1>&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# we should check it's an initramfs
|
||||
|
||||
tempcpio=$(mktemp -d)
|
||||
# cleanup on exit, hangup, interrupt, quit, termination
|
||||
trap 'rm -rf $tempcpio' 0 1 2 3 15
|
||||
|
||||
# extract the archive
|
||||
gunzip -c "$1" > "$tempcpio"/initramfs.cpio || exit 1
|
||||
|
||||
# copy the DSDT file at the root of the directory so that we can call it "/DSDT.aml"
|
||||
cp -f "$2" "$tempcpio"/DSDT.aml
|
||||
|
||||
# add the file
|
||||
cd "$tempcpio"
|
||||
(echo DSDT.aml | cpio --quiet -H newc -o -A -O "$tempcpio"/initramfs.cpio) || exit 1
|
||||
cd "$OLDPWD"
|
||||
|
||||
# re-compress the archive
|
||||
gzip -c "$tempcpio"/initramfs.cpio > "$1"
|
||||
|
|
@ -0,0 +1,26 @@
|
|||
/sys/module/acpi/parameters/:
|
||||
|
||||
trace_method_name
|
||||
The AML method name that the user wants to trace
|
||||
|
||||
trace_debug_layer
|
||||
The temporary debug_layer used when tracing the method.
|
||||
Using 0xffffffff by default if it is 0.
|
||||
|
||||
trace_debug_level
|
||||
The temporary debug_level used when tracing the method.
|
||||
Using 0x00ffffff by default if it is 0.
|
||||
|
||||
trace_state
|
||||
The status of the tracing feature.
|
||||
|
||||
"enabled" means this feature is enabled
|
||||
and the AML method is traced every time it's executed.
|
||||
|
||||
"1" means this feature is enabled and the AML method
|
||||
will only be traced during the next execution.
|
||||
|
||||
"disabled" means this feature is disabled.
|
||||
Users can enable/disable this debug tracing feature by
|
||||
"echo string > /sys/module/acpi/parameters/trace_state".
|
||||
"string" should be one of "enable", "disable" and "1".
|
|
@ -29,6 +29,8 @@ rm -f $dir/interfaces
|
|||
mknod -m 0200 $dir/interfaces c $MAJOR 4
|
||||
rm -f $dir/revalidate
|
||||
mknod -m 0200 $dir/revalidate c $MAJOR 5
|
||||
rm -f $dir/flush
|
||||
mknod -m 0200 $dir/flush c $MAJOR 6
|
||||
|
||||
export n_partitions
|
||||
mkshelf=`echo $0 | sed 's!mkdevs!mkshelf!'`
|
||||
|
|
|
@ -23,7 +23,10 @@ fi
|
|||
# /etc/udev/rules.d
|
||||
#
|
||||
rules_d="`sed -n '/^udev_rules=/{ s!udev_rules=!!; s!\"!!g; p; }' $conf`"
|
||||
if test -z "$rules_d" || test ! -d "$rules_d"; then
|
||||
if test -z "$rules_d" ; then
|
||||
rules_d=/etc/udev/rules.d
|
||||
fi
|
||||
if test ! -d "$rules_d"; then
|
||||
echo "$me Error: cannot find udev rules directory" 1>&2
|
||||
exit 1
|
||||
fi
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
# These rules tell udev what device nodes to create for aoe support.
|
||||
# They may be installed along the following lines (adjusted to what
|
||||
# you see on your system).
|
||||
# They may be installed along the following lines. Check the section
|
||||
# 8 udev manpage to see whether your udev supports SUBSYSTEM, and
|
||||
# whether it uses one or two equal signs for SUBSYSTEM and KERNEL.
|
||||
#
|
||||
# ecashin@makki ~$ su
|
||||
# Password:
|
||||
|
@ -15,10 +16,11 @@
|
|||
#
|
||||
|
||||
# aoe char devices
|
||||
SUBSYSTEM="aoe", KERNEL="discover", NAME="etherd/%k", GROUP="disk", MODE="0220"
|
||||
SUBSYSTEM="aoe", KERNEL="err", NAME="etherd/%k", GROUP="disk", MODE="0440"
|
||||
SUBSYSTEM="aoe", KERNEL="interfaces", NAME="etherd/%k", GROUP="disk", MODE="0220"
|
||||
SUBSYSTEM="aoe", KERNEL="revalidate", NAME="etherd/%k", GROUP="disk", MODE="0220"
|
||||
SUBSYSTEM=="aoe", KERNEL=="discover", NAME="etherd/%k", GROUP="disk", MODE="0220"
|
||||
SUBSYSTEM=="aoe", KERNEL=="err", NAME="etherd/%k", GROUP="disk", MODE="0440"
|
||||
SUBSYSTEM=="aoe", KERNEL=="interfaces", NAME="etherd/%k", GROUP="disk", MODE="0220"
|
||||
SUBSYSTEM=="aoe", KERNEL=="revalidate", NAME="etherd/%k", GROUP="disk", MODE="0220"
|
||||
SUBSYSTEM=="aoe", KERNEL=="flush", NAME="etherd/%k", GROUP="disk", MODE="0220"
|
||||
|
||||
# aoe block devices
|
||||
KERNEL="etherd*", NAME="%k", GROUP="disk"
|
||||
KERNEL=="etherd*", NAME="%k", GROUP="disk"
|
||||
|
|
|
@ -456,7 +456,7 @@ methods are create/destroy. Any others that are null are presumed to
|
|||
be successful no-ops.
|
||||
|
||||
struct cgroup_subsys_state *create(struct cgroup *cont)
|
||||
LL=cgroup_mutex
|
||||
(cgroup_mutex held by caller)
|
||||
|
||||
Called to create a subsystem state object for a cgroup. The
|
||||
subsystem should allocate its subsystem state object for the passed
|
||||
|
@ -471,14 +471,19 @@ it's the root of the hierarchy) and may be an appropriate place for
|
|||
initialization code.
|
||||
|
||||
void destroy(struct cgroup *cont)
|
||||
LL=cgroup_mutex
|
||||
(cgroup_mutex held by caller)
|
||||
|
||||
The cgroup system is about to destroy the passed cgroup; the
|
||||
subsystem should do any necessary cleanup
|
||||
The cgroup system is about to destroy the passed cgroup; the subsystem
|
||||
should do any necessary cleanup and free its subsystem state
|
||||
object. By the time this method is called, the cgroup has already been
|
||||
unlinked from the file system and from the child list of its parent;
|
||||
cgroup->parent is still valid. (Note - can also be called for a
|
||||
newly-created cgroup if an error occurs after this subsystem's
|
||||
create() method has been called for the new cgroup).
|
||||
|
||||
int can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
|
||||
struct task_struct *task)
|
||||
LL=cgroup_mutex
|
||||
(cgroup_mutex held by caller)
|
||||
|
||||
Called prior to moving a task into a cgroup; if the subsystem
|
||||
returns an error, this will abort the attach operation. If a NULL
|
||||
|
@ -489,25 +494,20 @@ remain valid while the caller holds cgroup_mutex.
|
|||
|
||||
void attach(struct cgroup_subsys *ss, struct cgroup *cont,
|
||||
struct cgroup *old_cont, struct task_struct *task)
|
||||
LL=cgroup_mutex
|
||||
|
||||
|
||||
Called after the task has been attached to the cgroup, to allow any
|
||||
post-attachment activity that requires memory allocations or blocking.
|
||||
|
||||
void fork(struct cgroup_subsys *ss, struct task_struct *task)
|
||||
LL=callback_mutex, maybe read_lock(tasklist_lock)
|
||||
|
||||
Called when a task is forked into a cgroup. Also called during
|
||||
registration for all existing tasks.
|
||||
|
||||
void exit(struct cgroup_subsys *ss, struct task_struct *task)
|
||||
LL=callback_mutex
|
||||
|
||||
Called during task exit
|
||||
|
||||
int populate(struct cgroup_subsys *ss, struct cgroup *cont)
|
||||
LL=none
|
||||
|
||||
Called after creation of a cgroup to allow a subsystem to populate
|
||||
the cgroup directory with file entries. The subsystem should make
|
||||
|
@ -524,7 +524,7 @@ example in cpusets, no task may attach before 'cpus' and 'mems' are set
|
|||
up.
|
||||
|
||||
void bind(struct cgroup_subsys *ss, struct cgroup *root)
|
||||
LL=callback_mutex
|
||||
(cgroup_mutex held by caller)
|
||||
|
||||
Called when a cgroup subsystem is rebound to a different hierarchy
|
||||
and root cgroup. Currently this will only involve movement between
|
||||
|
|
|
@ -0,0 +1,279 @@
|
|||
Memory Controller
|
||||
|
||||
Salient features
|
||||
|
||||
a. Enable control of both RSS (mapped) and Page Cache (unmapped) pages
|
||||
b. The infrastructure allows easy addition of other types of memory to control
|
||||
c. Provides *zero overhead* for non memory controller users
|
||||
d. Provides a double LRU: global memory pressure causes reclaim from the
|
||||
global LRU; a cgroup on hitting a limit, reclaims from the per
|
||||
cgroup LRU
|
||||
|
||||
NOTE: Swap Cache (unmapped) is not accounted now.
|
||||
|
||||
Benefits and Purpose of the memory controller
|
||||
|
||||
The memory controller isolates the memory behaviour of a group of tasks
|
||||
from the rest of the system. The article on LWN [12] mentions some probable
|
||||
uses of the memory controller. The memory controller can be used to
|
||||
|
||||
a. Isolate an application or a group of applications
|
||||
Memory hungry applications can be isolated and limited to a smaller
|
||||
amount of memory.
|
||||
b. Create a cgroup with a limited amount of memory; this can be used
|
||||
as a good alternative to booting with mem=XXXX.
|
||||
c. Virtualization solutions can control the amount of memory they want
|
||||
to assign to a virtual machine instance.
|
||||
d. A CD/DVD burner could control the amount of memory used by the
|
||||
rest of the system to ensure that burning does not fail due to lack
|
||||
of available memory.
|
||||
e. There are several other use cases, find one or use the controller just
|
||||
for fun (to learn and hack on the VM subsystem).
|
||||
|
||||
1. History
|
||||
|
||||
The memory controller has a long history. A request for comments for the memory
|
||||
controller was posted by Balbir Singh [1]. At the time the RFC was posted
|
||||
there were several implementations for memory control. The goal of the
|
||||
RFC was to build consensus and agreement for the minimal features required
|
||||
for memory control. The first RSS controller was posted by Balbir Singh[2]
|
||||
in Feb 2007. Pavel Emelianov [3][4][5] has since posted three versions of the
|
||||
RSS controller. At OLS, at the resource management BoF, everyone suggested
|
||||
that we handle both page cache and RSS together. Another request was raised
|
||||
to allow user space handling of OOM. The current memory controller is
|
||||
at version 6; it combines both mapped (RSS) and unmapped Page
|
||||
Cache Control [11].
|
||||
|
||||
2. Memory Control
|
||||
|
||||
Memory is a unique resource in the sense that it is present in a limited
|
||||
amount. If a task requires a lot of CPU processing, the task can spread
|
||||
its processing over a period of hours, days, months or years, but with
|
||||
memory, the same physical memory needs to be reused to accomplish the task.
|
||||
|
||||
The memory controller implementation has been divided into phases. These
|
||||
are:
|
||||
|
||||
1. Memory controller
|
||||
2. mlock(2) controller
|
||||
3. Kernel user memory accounting and slab control
|
||||
4. user mappings length controller
|
||||
|
||||
The memory controller is the first controller developed.
|
||||
|
||||
2.1. Design
|
||||
|
||||
The core of the design is a counter called the res_counter. The res_counter
|
||||
tracks the current memory usage and limit of the group of processes associated
|
||||
with the controller. Each cgroup has a memory controller specific data
|
||||
structure (mem_cgroup) associated with it.
|
||||
|
||||
2.2. Accounting
|
||||
|
||||
+--------------------+
|
||||
| mem_cgroup |
|
||||
| (res_counter) |
|
||||
+--------------------+
|
||||
/ ^ \
|
||||
/ | \
|
||||
+---------------+ | +---------------+
|
||||
| mm_struct | |.... | mm_struct |
|
||||
| | | | |
|
||||
+---------------+ | +---------------+
|
||||
|
|
||||
+ --------------+
|
||||
|
|
||||
+---------------+ +------+--------+
|
||||
| page +----------> page_cgroup|
|
||||
| | | |
|
||||
+---------------+ +---------------+
|
||||
|
||||
(Figure 1: Hierarchy of Accounting)
|
||||
|
||||
|
||||
Figure 1 shows the important aspects of the controller
|
||||
|
||||
1. Accounting happens per cgroup
|
||||
2. Each mm_struct knows about which cgroup it belongs to
|
||||
3. Each page has a pointer to the page_cgroup, which in turn knows the
|
||||
cgroup it belongs to
|
||||
|
||||
The accounting is done as follows: mem_cgroup_charge() is invoked to set up
|
||||
the necessary data structures and check if the cgroup that is being charged
|
||||
is over its limit. If it is then reclaim is invoked on the cgroup.
|
||||
More details can be found in the reclaim section of this document.
|
||||
If everything goes well, a page meta-data-structure called page_cgroup is
|
||||
allocated and associated with the page. This routine also adds the page to
|
||||
the per cgroup LRU.
|
||||
|
||||
2.2.1 Accounting details
|
||||
|
||||
All mapped pages (RSS) and unmapped user pages (Page Cache) are accounted.
|
||||
RSS pages are accounted at the time of page_add_*_rmap() unless they've already
|
||||
been accounted for earlier. A file page will be accounted for as Page Cache;
|
||||
when it's mapped into the page tables of a process, duplicate accounting is carefully
|
||||
avoided. Page Cache pages are accounted at the time of add_to_page_cache().
|
||||
The corresponding routines that remove a page from the page tables or remove
|
||||
a page from Page Cache are used to decrement the accounting counters of the
|
||||
cgroup.
|
||||
|
||||
2.3 Shared Page Accounting
|
||||
|
||||
Shared pages are accounted on the basis of the first touch approach. The
|
||||
cgroup that first touches a page is accounted for the page. The principle
|
||||
behind this approach is that a cgroup that aggressively uses a shared
|
||||
page will eventually get charged for it (once it is uncharged from
|
||||
the cgroup that brought it in -- this will happen on memory pressure).
|
||||
|
||||
2.4 Reclaim
|
||||
|
||||
Each cgroup maintains a per cgroup LRU that consists of an active
|
||||
and inactive list. When a cgroup goes over its limit, we first try
|
||||
to reclaim memory from the cgroup so as to make space for the new
|
||||
pages that the cgroup has touched. If the reclaim is unsuccessful,
|
||||
an OOM routine is invoked to select and kill the bulkiest task in the
|
||||
cgroup.
|
||||
|
||||
The reclaim algorithm has not been modified for cgroups, except that
|
||||
pages that are selected for reclaiming come from the per cgroup LRU
|
||||
list.
|
||||
|
||||
2.5 Locking
|
||||
|
||||
The memory controller uses the following hierarchy
|
||||
|
||||
1. zone->lru_lock is used for selecting pages to be isolated
|
||||
2. mem->per_zone->lru_lock protects the per cgroup LRU (per zone)
|
||||
3. lock_page_cgroup() is used to protect page->page_cgroup
|
||||
|
||||
3. User Interface
|
||||
|
||||
0. Configuration
|
||||
|
||||
a. Enable CONFIG_CGROUPS
|
||||
b. Enable CONFIG_RESOURCE_COUNTERS
|
||||
c. Enable CONFIG_CGROUP_MEM_CONT
|
||||
|
||||
1. Prepare the cgroups
|
||||
# mkdir -p /cgroups
|
||||
# mount -t cgroup none /cgroups -o memory
|
||||
|
||||
2. Make the new group and move bash into it
|
||||
# mkdir /cgroups/0
|
||||
# echo $$ > /cgroups/0/tasks
|
||||
|
||||
Since now we're in the 0 cgroup,
|
||||
We can alter the memory limit:
|
||||
# echo -n 4M > /cgroups/0/memory.limit_in_bytes
|
||||
|
||||
NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo,
|
||||
mega or gigabytes.
|
||||
|
||||
# cat /cgroups/0/memory.limit_in_bytes
|
||||
4194304 Bytes
|
||||
|
||||
NOTE: The interface has now changed to display the usage in bytes
|
||||
instead of pages
|
||||
|
||||
We can check the usage:
|
||||
# cat /cgroups/0/memory.usage_in_bytes
|
||||
1216512 Bytes
|
||||
|
||||
A successful write to this file does not guarantee a successful set of
|
||||
this limit to the value written into the file. This can be due to a
|
||||
number of factors, such as rounding up to page boundaries or the total
|
||||
availability of memory on the system. The user is required to re-read
|
||||
this file after a write to guarantee the value committed by the kernel.
|
||||
|
||||
# echo -n 1 > memory.limit_in_bytes
|
||||
# cat memory.limit_in_bytes
|
||||
4096 Bytes
|
||||
|
||||
The memory.failcnt field gives the number of times that the cgroup limit was
|
||||
exceeded.
|
||||
|
||||
The memory.stat file gives accounting information. Now, the number of
|
||||
caches, RSS and Active pages/Inactive pages are shown.
|
||||
|
||||
The memory.force_empty gives an interface to drop *all* charges by force.
|
||||
|
||||
# echo -n 1 > memory.force_empty
|
||||
|
||||
will drop all charges in the cgroup. Currently, this is maintained for testing.
|
||||
|
||||
4. Testing
|
||||
|
||||
Balbir posted lmbench, AIM9, LTP and vmmstress results [10] and [11].
|
||||
Apart from that v6 has been tested with several applications and regular
|
||||
daily use. The controller has also been tested on the PPC64, x86_64 and
|
||||
UML platforms.
|
||||
|
||||
4.1 Troubleshooting
|
||||
|
||||
Sometimes a user might find that the application under a cgroup is
|
||||
terminated. There are several causes for this:
|
||||
|
||||
1. The cgroup limit is too low (just too low to do anything useful)
|
||||
2. The user is using anonymous memory and swap is turned off or too low
|
||||
|
||||
A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of
|
||||
some of the pages cached in the cgroup (page cache pages).
|
||||
|
||||
4.2 Task migration
|
||||
|
||||
When a task migrates from one cgroup to another, its charge is not
|
||||
carried forward. The pages allocated from the original cgroup still
|
||||
remain charged to it; the charge is dropped when the page is freed or
|
||||
reclaimed.
|
||||
|
||||
4.3 Removing a cgroup
|
||||
|
||||
A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a
|
||||
cgroup might have some charge associated with it, even though all
|
||||
tasks have migrated away from it. Such charges are automatically dropped at
|
||||
rmdir() if there are no tasks.
|
||||
|
||||
4.4 Choosing what to account -- Page Cache (unmapped) vs RSS (mapped)?
|
||||
|
||||
The type of memory accounted by the cgroup can be limited to just
|
||||
mapped pages by writing "1" to memory.control_type field
|
||||
|
||||
echo -n 1 > memory.control_type
|
||||
|
||||
5. TODO
|
||||
|
||||
1. Add support for accounting huge pages (as a separate controller)
|
||||
2. Make per-cgroup scanner reclaim not-shared pages first
|
||||
3. Teach controller to account for shared-pages
|
||||
4. Start reclamation when the limit is lowered
|
||||
5. Start reclamation in the background when the limit is
|
||||
not yet hit but the usage is getting closer
|
||||
|
||||
Summary
|
||||
|
||||
Overall, the memory controller has been a stable controller and has been
|
||||
commented and discussed quite extensively in the community.
|
||||
|
||||
References
|
||||
|
||||
1. Singh, Balbir. RFC: Memory Controller, http://lwn.net/Articles/206697/
|
||||
2. Singh, Balbir. Memory Controller (RSS Control),
|
||||
http://lwn.net/Articles/222762/
|
||||
3. Emelianov, Pavel. Resource controllers based on process cgroups
|
||||
http://lkml.org/lkml/2007/3/6/198
|
||||
4. Emelianov, Pavel. RSS controller based on process cgroups (v2)
|
||||
http://lkml.org/lkml/2007/4/9/74
|
||||
5. Emelianov, Pavel. RSS controller based on process cgroups (v3)
|
||||
http://lkml.org/lkml/2007/5/30/244
|
||||
6. Menage, Paul. Control Groups v10, http://lwn.net/Articles/236032/
|
||||
7. Vaidyanathan, Srinivasan, Control Groups: Pagecache accounting and control
|
||||
subsystem (v3), http://lwn.net/Articles/235534/
|
||||
8. Singh, Balbir. RSS controller V2 test results (lmbench),
|
||||
http://lkml.org/lkml/2007/5/17/232
|
||||
9. Singh, Balbir. RSS controller V2 AIM9 results
|
||||
http://lkml.org/lkml/2007/5/18/1
|
||||
10. Singh, Balbir. Memory controller v6 results,
|
||||
http://lkml.org/lkml/2007/8/19/36
|
||||
11. Singh, Balbir. Memory controller v6, http://lkml.org/lkml/2007/8/17/69
|
||||
12. Corbet, Jonathan, Controlling memory use in cgroups,
|
||||
http://lwn.net/Articles/243795/
|
|
@ -0,0 +1,23 @@
|
|||
|
||||
Supporting multiple CPU idle levels in kernel
|
||||
|
||||
cpuidle
|
||||
|
||||
General Information:
|
||||
|
||||
Various CPUs today support multiple idle levels that are differentiated
|
||||
by varying exit latencies and power consumption during idle.
|
||||
cpuidle is a generic in-kernel infrastructure that separates
|
||||
idle policy (governor) from idle mechanism (driver) and provides a
|
||||
standardized infrastructure to support independent development of
|
||||
governors and drivers.
|
||||
|
||||
cpuidle resides under drivers/cpuidle.
|
||||
|
||||
Boot options:
|
||||
"cpuidle_sysfs_switch"
|
||||
enables current_governor interface in /sys/devices/system/cpu/cpuidle/,
|
||||
which can be used to switch governors at run time. This boot option
|
||||
is meant for developer testing only. In normal usage, kernel picks the
|
||||
best governor based on governor ratings.
|
||||
SEE ALSO: sysfs.txt in this directory.
|
|
@ -0,0 +1,31 @@
|
|||
|
||||
|
||||
Supporting multiple CPU idle levels in kernel
|
||||
|
||||
cpuidle drivers
|
||||
|
||||
|
||||
|
||||
|
||||
cpuidle driver hooks into the cpuidle infrastructure and handles the
|
||||
architecture/platform dependent part of CPU idle states. Driver
|
||||
provides the platform idle state detection capability and also
|
||||
has mechanisms in place to support actual entry-exit into CPU idle states.
|
||||
|
||||
cpuidle driver initializes the cpuidle_device structure for each CPU device
|
||||
and registers with cpuidle using cpuidle_register_device.
|
||||
|
||||
It can also support the dynamic changes (like battery <-> AC), by using
|
||||
cpuidle_pause_and_lock, cpuidle_disable_device and cpuidle_enable_device,
|
||||
cpuidle_resume_and_unlock.
|
||||
|
||||
Interfaces:
|
||||
extern int cpuidle_register_driver(struct cpuidle_driver *drv);
|
||||
extern void cpuidle_unregister_driver(struct cpuidle_driver *drv);
|
||||
extern int cpuidle_register_device(struct cpuidle_device *dev);
|
||||
extern void cpuidle_unregister_device(struct cpuidle_device *dev);
|
||||
|
||||
extern void cpuidle_pause_and_lock(void);
|
||||
extern void cpuidle_resume_and_unlock(void);
|
||||
extern int cpuidle_enable_device(struct cpuidle_device *dev);
|
||||
extern void cpuidle_disable_device(struct cpuidle_device *dev);
|
|
@ -0,0 +1,29 @@
|
|||
|
||||
|
||||
|
||||
Supporting multiple CPU idle levels in kernel
|
||||
|
||||
cpuidle governors
|
||||
|
||||
|
||||
|
||||
|
||||
cpuidle governor is a policy routine that decides what idle state to enter at
|
||||
any given time. cpuidle core uses different callbacks to the governor.
|
||||
|
||||
* enable() to enable governor for a particular device
|
||||
* disable() to disable governor for a particular device
|
||||
* select() to select an idle state to enter
|
||||
* reflect() called after returning from the idle state, which can be used
|
||||
by the governor for some record keeping.
|
||||
|
||||
More than one governor can be registered at the same time and
|
||||
users can switch between governors using the sysfs interface (when enabled).
|
||||
Multiple governors are supported so that developers can easily experiment
|
||||
with different governors. By default, the most optimal governor based on your
|
||||
kernel configuration and platform will be selected by cpuidle.
|
||||
|
||||
Interfaces:
|
||||
extern int cpuidle_register_governor(struct cpuidle_governor *gov);
|
||||
extern void cpuidle_unregister_governor(struct cpuidle_governor *gov);
|
||||
struct cpuidle_governor
|
|
@ -0,0 +1,79 @@
|
|||
|
||||
|
||||
Supporting multiple CPU idle levels in kernel
|
||||
|
||||
cpuidle sysfs
|
||||
|
||||
System global cpuidle related information and tunables are under
|
||||
/sys/devices/system/cpu/cpuidle
|
||||
|
||||
The current interfaces in this directory have self-explanatory names:
|
||||
* current_driver
|
||||
* current_governor_ro
|
||||
|
||||
With cpuidle_sysfs_switch boot option (meant for developer testing)
|
||||
following objects are visible instead.
|
||||
* current_driver
|
||||
* available_governors
|
||||
* current_governor
|
||||
In this case users can switch the governor at run time by writing
|
||||
to current_governor.
|
||||
|
||||
|
||||
Per logical CPU specific cpuidle information is under
|
||||
/sys/devices/system/cpu/cpuX/cpuidle
|
||||
for each online cpu X
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
# ls -lR /sys/devices/system/cpu/cpu0/cpuidle/
|
||||
/sys/devices/system/cpu/cpu0/cpuidle/:
|
||||
total 0
|
||||
drwxr-xr-x 2 root root 0 Feb 8 10:42 state0
|
||||
drwxr-xr-x 2 root root 0 Feb 8 10:42 state1
|
||||
drwxr-xr-x 2 root root 0 Feb 8 10:42 state2
|
||||
drwxr-xr-x 2 root root 0 Feb 8 10:42 state3
|
||||
|
||||
/sys/devices/system/cpu/cpu0/cpuidle/state0:
|
||||
total 0
|
||||
-r--r--r-- 1 root root 4096 Feb 8 10:42 desc
|
||||
-r--r--r-- 1 root root 4096 Feb 8 10:42 latency
|
||||
-r--r--r-- 1 root root 4096 Feb 8 10:42 name
|
||||
-r--r--r-- 1 root root 4096 Feb 8 10:42 power
|
||||
-r--r--r-- 1 root root 4096 Feb 8 10:42 time
|
||||
-r--r--r-- 1 root root 4096 Feb 8 10:42 usage
|
||||
|
||||
/sys/devices/system/cpu/cpu0/cpuidle/state1:
|
||||
total 0
|
||||
-r--r--r-- 1 root root 4096 Feb 8 10:42 desc
|
||||
-r--r--r-- 1 root root 4096 Feb 8 10:42 latency
|
||||
-r--r--r-- 1 root root 4096 Feb 8 10:42 name
|
||||
-r--r--r-- 1 root root 4096 Feb 8 10:42 power
|
||||
-r--r--r-- 1 root root 4096 Feb 8 10:42 time
|
||||
-r--r--r-- 1 root root 4096 Feb 8 10:42 usage
|
||||
|
||||
/sys/devices/system/cpu/cpu0/cpuidle/state2:
|
||||
total 0
|
||||
-r--r--r-- 1 root root 4096 Feb 8 10:42 desc
|
||||
-r--r--r-- 1 root root 4096 Feb 8 10:42 latency
|
||||
-r--r--r-- 1 root root 4096 Feb 8 10:42 name
|
||||
-r--r--r-- 1 root root 4096 Feb 8 10:42 power
|
||||
-r--r--r-- 1 root root 4096 Feb 8 10:42 time
|
||||
-r--r--r-- 1 root root 4096 Feb 8 10:42 usage
|
||||
|
||||
/sys/devices/system/cpu/cpu0/cpuidle/state3:
|
||||
total 0
|
||||
-r--r--r-- 1 root root 4096 Feb 8 10:42 desc
|
||||
-r--r--r-- 1 root root 4096 Feb 8 10:42 latency
|
||||
-r--r--r-- 1 root root 4096 Feb 8 10:42 name
|
||||
-r--r--r-- 1 root root 4096 Feb 8 10:42 power
|
||||
-r--r--r-- 1 root root 4096 Feb 8 10:42 time
|
||||
-r--r--r-- 1 root root 4096 Feb 8 10:42 usage
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
|
||||
* desc : Small description about the idle state (string)
|
||||
* latency : Latency to exit out of this idle state (in microseconds)
|
||||
* name : Name of the idle state (string)
|
||||
* power : Power consumed while in this idle state (in milliwatts)
|
||||
* time : Total time spent in this idle state (in microseconds)
|
||||
* usage : Number of times this state was entered (count)
|
|
@ -523,21 +523,14 @@ from one cpuset to another, then the kernel will adjust the tasks
|
|||
memory placement, as above, the next time that the kernel attempts
|
||||
to allocate a page of memory for that task.
|
||||
|
||||
If a cpuset has its CPUs modified, then each task using that
|
||||
cpuset does _not_ change its behavior automatically. In order to
|
||||
minimize the impact on the critical scheduling code in the kernel,
|
||||
tasks will continue to use their prior CPU placement until they
|
||||
are rebound to their cpuset, by rewriting their pid to the 'tasks'
|
||||
file of their cpuset. If a task had been bound to some subset of its
|
||||
cpuset using the sched_setaffinity() call, and if any of that subset
|
||||
is still allowed in its new cpuset settings, then the task will be
|
||||
restricted to the intersection of the CPUs it was allowed on before,
|
||||
and its new cpuset CPU placement. If, on the other hand, there is
|
||||
no overlap between a tasks prior placement and its new cpuset CPU
|
||||
placement, then the task will be allowed to run on any CPU allowed
|
||||
in its new cpuset. If a task is moved from one cpuset to another,
|
||||
its CPU placement is updated in the same way as if the tasks pid is
|
||||
rewritten to the 'tasks' file of its current cpuset.
|
||||
If a cpuset has its 'cpus' modified, then each task in that cpuset
|
||||
will have its allowed CPU placement changed immediately. Similarly,
|
||||
if a task's pid is written to a cpuset's 'tasks' file, in either its
|
||||
current cpuset or another cpuset, then its allowed CPU placement is
|
||||
changed immediately. If such a task had been bound to some subset
|
||||
of its cpuset using the sched_setaffinity() call, the task will be
|
||||
allowed to run on any CPU allowed in its new cpuset, negating the
|
||||
effect of the prior sched_setaffinity() call.
|
||||
|
||||
In summary, the memory placement of a task whose cpuset is changed is
|
||||
updated by the kernel, on the next allocation of a page for that task,
|
||||
|
|
|
@ -170,7 +170,6 @@ Sylpheed (GUI)
|
|||
|
||||
- Works well for inlining text (or using attachments).
|
||||
- Allows use of an external editor.
|
||||
- Not good for IMAP.
|
||||
- Is slow on large folders.
|
||||
- Won't do TLS SMTP auth over a non-SSL connection.
|
||||
- Has a helpful ruler bar in the compose window.
|
||||
|
|
|
@ -7,10 +7,10 @@ IO. The following example may be a useful explanation of how one such setup
|
|||
works:
|
||||
|
||||
- userspace app like Xfbdev mmaps framebuffer
|
||||
- deferred IO and driver sets up nopage and page_mkwrite handlers
|
||||
- deferred IO and driver sets up fault and page_mkwrite handlers
|
||||
- userspace app tries to write to mmaped vaddress
|
||||
- we get pagefault and reach nopage handler
|
||||
- nopage handler finds and returns physical page
|
||||
- we get pagefault and reach fault handler
|
||||
- fault handler finds and returns physical page
|
||||
- we get page_mkwrite where we add this page to a list
|
||||
- schedule a workqueue task to be run after a delay
|
||||
- app continues writing to that page with no additional cost. this is
|
||||
|
|
|
@ -6,14 +6,6 @@ be removed from this file.
|
|||
|
||||
---------------------------
|
||||
|
||||
What: MXSER
|
||||
When: December 2007
|
||||
Why: Old mxser driver is obsoleted by the mxser_new. Give it some time yet
|
||||
and remove it.
|
||||
Who: Jiri Slaby <jirislaby@gmail.com>
|
||||
|
||||
---------------------------
|
||||
|
||||
What: dev->power.power_state
|
||||
When: July 2007
|
||||
Why: Broken design for runtime control over driver power states, confusing
|
||||
|
@ -107,17 +99,6 @@ Who: Eric Biederman <ebiederm@xmission.com>
|
|||
|
||||
---------------------------
|
||||
|
||||
What: a.out interpreter support for ELF executables
|
||||
When: 2.6.25
|
||||
Files: fs/binfmt_elf.c
|
||||
Why: Using a.out interpreters for ELF executables was a feature for
|
||||
transition from a.out to ELF. But now it is unlikely to be still
|
||||
needed anymore and removing it would simplify the hairy ELF
|
||||
loader code.
|
||||
Who: Andi Kleen <ak@suse.de>
|
||||
|
||||
---------------------------
|
||||
|
||||
What: remove EXPORT_SYMBOL(kernel_thread)
|
||||
When: August 2006
|
||||
Files: arch/*/kernel/*_ksyms.c
|
||||
|
@ -130,15 +111,6 @@ Who: Christoph Hellwig <hch@lst.de>
|
|||
|
||||
---------------------------
|
||||
|
||||
What: CONFIG_FORCED_INLINING
|
||||
When: June 2006
|
||||
Why: Config option is there to see if gcc is good enough. (in january
|
||||
2006). If it is, the behavior should just be the default. If it's not,
|
||||
the option should just go away entirely.
|
||||
Who: Arjan van de Ven
|
||||
|
||||
---------------------------
|
||||
|
||||
What: eepro100 network driver
|
||||
When: January 2007
|
||||
Why: replaced by the e100 driver
|
||||
|
@ -200,21 +172,6 @@ Who: Len Brown <len.brown@intel.com>
|
|||
|
||||
---------------------------
|
||||
|
||||
What: 'time' kernel boot parameter
|
||||
When: January 2008
|
||||
Why: replaced by 'printk.time=<value>' so that printk timestamps can be
|
||||
enabled or disabled as needed
|
||||
Who: Randy Dunlap <randy.dunlap@oracle.com>
|
||||
|
||||
---------------------------
|
||||
|
||||
What: drivers depending on OSS_OBSOLETE
|
||||
When: options in 2.6.23, code in 2.6.25
|
||||
Why: obsolete OSS drivers
|
||||
Who: Adrian Bunk <bunk@stusta.de>
|
||||
|
||||
---------------------------
|
||||
|
||||
What: libata spindown skipping and warning
|
||||
When: Dec 2008
|
||||
Why: Some halt(8) implementations synchronize caches for and spin
|
||||
|
@ -338,3 +295,14 @@ Why: The support code for the old firmware hurts code readability/maintainabilit
|
|||
and slightly hurts runtime performance. Bugfixes for the old firmware
|
||||
are not provided by Broadcom anymore.
|
||||
Who: Michael Buesch <mb@bu3sch.de>
|
||||
|
||||
---------------------------
|
||||
|
||||
What: Solaris/SunOS syscall and binary support on Sparc
|
||||
When: 2.6.26
|
||||
Why: Largely unmaintained and almost entirely unused. File system
|
||||
layering used to divert library and dynamic linker searches to
|
||||
/usr/gnemul is extremely buggy and unfixable. Making it work
|
||||
is largely pointless as without a lot of work only the most
|
||||
trivial of Solaris binaries can work with the emulation code.
|
||||
Who: David S. Miller <davem@davemloft.net>
|
||||
|
|
|
@ -32,6 +32,8 @@ directory-locking
|
|||
- info about the locking scheme used for directory operations.
|
||||
dlmfs.txt
|
||||
- info on the userspace interface to the OCFS2 DLM.
|
||||
dnotify.txt
|
||||
- info about directory notification in Linux.
|
||||
ecryptfs.txt
|
||||
- docs on eCryptfs: stacked cryptographic filesystem for Linux.
|
||||
ext2.txt
|
||||
|
@ -80,6 +82,8 @@ relay.txt
|
|||
- info on relay, for efficient streaming from kernel to user space.
|
||||
romfs.txt
|
||||
- description of the ROMFS filesystem.
|
||||
sharedsubtree.txt
|
||||
- a description of shared subtrees for namespaces.
|
||||
smbfs.txt
|
||||
- info on using filesystems with the SMB protocol (Win 3.11 and NT).
|
||||
spufs.txt
|
||||
|
|
|
@ -90,7 +90,6 @@ of the locking scheme for directory operations.
|
|||
prototypes:
|
||||
struct inode *(*alloc_inode)(struct super_block *sb);
|
||||
void (*destroy_inode)(struct inode *);
|
||||
void (*read_inode) (struct inode *);
|
||||
void (*dirty_inode) (struct inode *);
|
||||
int (*write_inode) (struct inode *, int);
|
||||
void (*put_inode) (struct inode *);
|
||||
|
@ -114,7 +113,6 @@ locking rules:
|
|||
BKL s_lock s_umount
|
||||
alloc_inode: no no no
|
||||
destroy_inode: no
|
||||
read_inode: no (see below)
|
||||
dirty_inode: no (must not sleep)
|
||||
write_inode: no
|
||||
put_inode: no
|
||||
|
@ -133,7 +131,6 @@ show_options: no (vfsmount->sem)
|
|||
quota_read: no no no (see below)
|
||||
quota_write: no no no (see below)
|
||||
|
||||
->read_inode() is not a method - it's a callback used in iget().
|
||||
->remount_fs() will have the s_umount lock if it's already mounted.
|
||||
When called from get_sb_single, it does NOT have the s_umount lock.
|
||||
->quota_read() and ->quota_write() functions are both guaranteed to
|
||||
|
|
|
@ -69,24 +69,24 @@ Example
|
|||
#include <signal.h>
|
||||
#include <stdio.h>
|
||||
#include <unistd.h>
|
||||
|
||||
|
||||
static volatile int event_fd;
|
||||
|
||||
|
||||
static void handler(int sig, siginfo_t *si, void *data)
|
||||
{
|
||||
event_fd = si->si_fd;
|
||||
}
|
||||
|
||||
|
||||
int main(void)
|
||||
{
|
||||
struct sigaction act;
|
||||
int fd;
|
||||
|
||||
|
||||
act.sa_sigaction = handler;
|
||||
sigemptyset(&act.sa_mask);
|
||||
act.sa_flags = SA_SIGINFO;
|
||||
sigaction(SIGRTMIN + 1, &act, NULL);
|
||||
|
||||
|
||||
fd = open(".", O_RDONLY);
|
||||
fcntl(fd, F_SETSIG, SIGRTMIN + 1);
|
||||
fcntl(fd, F_NOTIFY, DN_MODIFY|DN_CREATE|DN_MULTISHOT);
|
|
@ -24,6 +24,7 @@ Mount options unique to the isofs filesystem.
|
|||
map=normal Map non-Rock Ridge filenames to lower case
|
||||
map=acorn As map=normal but also apply Acorn extensions if present
|
||||
mode=xxx Sets the permissions on files to xxx
|
||||
dmode=xxx Sets the permissions on directories to xxx
|
||||
nojoliet Ignore Joliet extensions if they are present.
|
||||
norock Ignore Rock Ridge extensions if they are present.
|
||||
hide Completely strip hidden files from the file system.
|
||||
|
|
|
@ -34,8 +34,8 @@ FOO_I(inode) (see in-tree filesystems for examples).
|
|||
|
||||
Make them ->alloc_inode and ->destroy_inode in your super_operations.
|
||||
|
||||
Keep in mind that now you need explicit initialization of private data -
|
||||
typically in ->read_inode() and after getting an inode from new_inode().
|
||||
Keep in mind that now you need explicit initialization of private data
|
||||
typically between calling iget_locked() and unlocking the inode.
|
||||
|
||||
At some point that will become mandatory.
|
||||
|
||||
|
@ -173,10 +173,10 @@ should be a non-blocking function that initializes those parts of a
|
|||
newly created inode to allow the test function to succeed. 'data' is
|
||||
passed as an opaque value to both test and set functions.
|
||||
|
||||
When the inode has been created by iget5_locked(), it will be returned with
|
||||
the I_NEW flag set and will still be locked. read_inode has not been
|
||||
called so the file system still has to finalize the initialization. Once
|
||||
the inode is initialized it must be unlocked by calling unlock_new_inode().
|
||||
When the inode has been created by iget5_locked(), it will be returned with the
|
||||
I_NEW flag set and will still be locked. The filesystem then needs to finalize
|
||||
the initialization. Once the inode is initialized it must be unlocked by
|
||||
calling unlock_new_inode().
|
||||
|
||||
The filesystem is responsible for setting (and possibly testing) i_ino
|
||||
when appropriate. There is also a simpler iget_locked function that
|
||||
|
@ -184,11 +184,19 @@ just takes the superblock and inode number as arguments and does the
|
|||
test and set for you.
|
||||
|
||||
e.g.
|
||||
inode = iget_locked(sb, ino);
|
||||
if (inode->i_state & I_NEW) {
|
||||
read_inode_from_disk(inode);
|
||||
unlock_new_inode(inode);
|
||||
}
|
||||
inode = iget_locked(sb, ino);
|
||||
if (inode->i_state & I_NEW) {
|
||||
err = read_inode_from_disk(inode);
|
||||
if (err < 0) {
|
||||
iget_failed(inode);
|
||||
return err;
|
||||
}
|
||||
unlock_new_inode(inode);
|
||||
}
|
||||
|
||||
Note that if the process of setting up a new inode fails, then iget_failed()
|
||||
should be called on the inode to render it dead, and an appropriate error
|
||||
should be passed back to the caller.
|
||||
|
||||
---
|
||||
[recommended]
|
||||
|
|
|
@ -1029,6 +1029,14 @@ nr_inodes
|
|||
Denotes the number of inodes the system has allocated. This number will
|
||||
grow and shrink dynamically.
|
||||
|
||||
nr_open
|
||||
-------
|
||||
|
||||
Denotes the maximum number of file-handles a process can
|
||||
allocate. Default value is 1024*1024 (1048576) which should be
|
||||
enough for most machines. Actual limit depends on RLIMIT_NOFILE
|
||||
resource limit.
|
||||
|
||||
nr_free_inodes
|
||||
--------------
|
||||
|
||||
|
@ -1315,13 +1323,28 @@ for writeout by the pdflush daemons. It is expressed in 100'ths of a second.
|
|||
Data which has been dirty in-memory for longer than this interval will be
|
||||
written out next time a pdflush daemon wakes up.
|
||||
|
||||
highmem_is_dirtyable
|
||||
--------------------
|
||||
|
||||
Only present if CONFIG_HIGHMEM is set.
|
||||
|
||||
This defaults to 0 (false), meaning that the ratios set above are calculated
|
||||
as a percentage of lowmem only. This protects against excessive scanning
|
||||
in page reclaim, swapping and general VM distress.
|
||||
|
||||
Setting this to 1 can be useful on 32 bit machines where you want to make
|
||||
random changes within an MMAPed file that is larger than your available
|
||||
lowmem without causing large quantities of random IO. It is safe if the
|
||||
behavior of all programs running on the machine is known and memory will
|
||||
not be otherwise stressed.
|
||||
|
||||
legacy_va_layout
|
||||
----------------
|
||||
|
||||
If non-zero, this sysctl disables the new 32-bit mmap layout - the kernel
|
||||
will use the legacy (2.4) layout for all processes.
|
||||
|
||||
lower_zone_protection
|
||||
lowmem_reserve_ratio
|
||||
---------------------
|
||||
|
||||
For some specialised workloads on highmem machines it is dangerous for
|
||||
|
@ -1341,25 +1364,71 @@ captured into pinned user memory.
|
|||
mechanism will also defend that region from allocations which could use
|
||||
highmem or lowmem).
|
||||
|
||||
The `lower_zone_protection' tunable determines how aggressive the kernel is
|
||||
in defending these lower zones. The default value is zero - no
|
||||
protection at all.
|
||||
The `lowmem_reserve_ratio' tunable determines how aggressive the kernel is
|
||||
in defending these lower zones.
|
||||
|
||||
If you have a machine which uses highmem or ISA DMA and your
|
||||
applications are using mlock(), or if you are running with no swap then
|
||||
you probably should increase the lower_zone_protection setting.
|
||||
you probably should change the lowmem_reserve_ratio setting.
|
||||
|
||||
The units of this tunable are fairly vague. It is approximately equal
|
||||
to "megabytes," so setting lower_zone_protection=100 will protect around 100
|
||||
megabytes of the lowmem zone from user allocations. It will also make
|
||||
those 100 megabytes unavailable for use by applications and by
|
||||
pagecache, so there is a cost.
|
||||
The lowmem_reserve_ratio is an array. You can see them by reading this file.
|
||||
-
|
||||
% cat /proc/sys/vm/lowmem_reserve_ratio
|
||||
256 256 32
|
||||
-
|
||||
Note: the number of elements is one fewer than the number of zones, because the highest
|
||||
zone's value is not necessary for following calculation.
|
||||
|
||||
The effects of this tunable may be observed by monitoring
|
||||
/proc/meminfo:LowFree. Write a single huge file and observe the point
|
||||
at which LowFree ceases to fall.
|
||||
But, these values are not used directly. The kernel calculates # of protection
|
||||
pages for each zone from them. These are shown as an array of protection pages
|
||||
in /proc/zoneinfo as follows. (This is an example from an x86-64 box.)
|
||||
Each zone has an array of protection pages like this.
|
||||
|
||||
A reasonable value for lower_zone_protection is 100.
|
||||
-
|
||||
Node 0, zone DMA
|
||||
pages free 1355
|
||||
min 3
|
||||
low 3
|
||||
high 4
|
||||
:
|
||||
:
|
||||
numa_other 0
|
||||
protection: (0, 2004, 2004, 2004)
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
pagesets
|
||||
cpu: 0 pcp: 0
|
||||
:
|
||||
-
|
||||
These protections are added to score to judge whether this zone should be used
|
||||
for page allocation or should be reclaimed.
|
||||
|
||||
In this example, if normal pages (index=2) are required to this DMA zone and
|
||||
pages_high is used for watermark, the kernel judges this zone should not be
|
||||
used because pages_free(1355) is smaller than watermark + protection[2]
|
||||
(4 + 2004 = 2008). If this protection value is 0, this zone would be used for
|
||||
normal page requirement. If requirement is DMA zone(index=0), protection[0]
|
||||
(=0) is used.
|
||||
|
||||
zone[i]'s protection[j] is calculated by the following expression.
|
||||
|
||||
(i < j):
|
||||
zone[i]->protection[j]
|
||||
= (total sums of present_pages from zone[i+1] to zone[j] on the node)
|
||||
/ lowmem_reserve_ratio[i];
|
||||
(i = j):
|
||||
(should not be protected. = 0)
|
||||
(i > j):
|
||||
(not necessary, but looks 0)
|
||||
|
||||
The default values of lowmem_reserve_ratio[i] are
|
||||
256 (if zone[i] means DMA or DMA32 zone)
|
||||
32 (others).
|
||||
As the above expression shows, they are the reciprocals of the ratios.
|
||||
256 means 1/256. # of protection pages becomes about "0.39%" of total present
|
||||
pages of higher zones on the node.
|
||||
|
||||
If you would like to protect more pages, smaller values are effective.
|
||||
The minimum value is 1 (1/1 -> 100%).
|
||||
|
||||
page-cluster
|
||||
------------
|
||||
|
|
|
@ -151,7 +151,7 @@ The get_sb() method has the following arguments:
|
|||
const char *dev_name: the device name we are mounting.
|
||||
|
||||
void *data: arbitrary mount options, usually comes as an ASCII
|
||||
string
|
||||
string (see "Mount Options" section)
|
||||
|
||||
struct vfsmount *mnt: a vfs-internal representation of a mount point
|
||||
|
||||
|
@ -182,7 +182,7 @@ A fill_super() method implementation has the following arguments:
|
|||
must initialize this properly.
|
||||
|
||||
void *data: arbitrary mount options, usually comes as an ASCII
|
||||
string
|
||||
string (see "Mount Options" section)
|
||||
|
||||
int silent: whether or not to be silent on error
|
||||
|
||||
|
@ -203,8 +203,6 @@ struct super_operations {
|
|||
struct inode *(*alloc_inode)(struct super_block *sb);
|
||||
void (*destroy_inode)(struct inode *);
|
||||
|
||||
void (*read_inode) (struct inode *);
|
||||
|
||||
void (*dirty_inode) (struct inode *);
|
||||
int (*write_inode) (struct inode *, int);
|
||||
void (*put_inode) (struct inode *);
|
||||
|
@ -242,15 +240,6 @@ or bottom half).
|
|||
->alloc_inode was defined and simply undoes anything done by
|
||||
->alloc_inode.
|
||||
|
||||
read_inode: this method is called to read a specific inode from the
|
||||
mounted filesystem. The i_ino member in the struct inode is
|
||||
initialized by the VFS to indicate which inode to read. Other
|
||||
members are filled in by this method.
|
||||
|
||||
You can set this to NULL and use iget5_locked() instead of iget()
|
||||
to read inodes. This is necessary for filesystems for which the
|
||||
inode number is not sufficient to identify an inode.
|
||||
|
||||
dirty_inode: this method is called by the VFS to mark an inode dirty.
|
||||
|
||||
write_inode: this method is called when the VFS needs to write an
|
||||
|
@ -302,15 +291,16 @@ or bottom half).
|
|||
|
||||
umount_begin: called when the VFS is unmounting a filesystem.
|
||||
|
||||
show_options: called by the VFS to show mount options for /proc/<pid>/mounts.
|
||||
show_options: called by the VFS to show mount options for
|
||||
/proc/<pid>/mounts. (see "Mount Options" section)
|
||||
|
||||
quota_read: called by the VFS to read from filesystem quota file.
|
||||
|
||||
quota_write: called by the VFS to write to filesystem quota file.
|
||||
|
||||
The read_inode() method is responsible for filling in the "i_op"
|
||||
field. This is a pointer to a "struct inode_operations" which
|
||||
describes the methods that can be performed on individual inodes.
|
||||
Whoever sets up the inode is responsible for filling in the "i_op" field. This
|
||||
is a pointer to a "struct inode_operations" which describes the methods that
|
||||
can be performed on individual inodes.
|
||||
|
||||
|
||||
The Inode Object
|
||||
|
@ -980,6 +970,49 @@ manipulate dentries:
|
|||
For further information on dentry locking, please refer to the document
|
||||
Documentation/filesystems/dentry-locking.txt.
|
||||
|
||||
Mount Options
|
||||
=============
|
||||
|
||||
Parsing options
|
||||
---------------
|
||||
|
||||
On mount and remount the filesystem is passed a string containing a
|
||||
comma separated list of mount options. The options can have either of
|
||||
these forms:
|
||||
|
||||
option
|
||||
option=value
|
||||
|
||||
The <linux/parser.h> header defines an API that helps parse these
|
||||
options. There are plenty of examples on how to use it in existing
|
||||
filesystems.
|
||||
|
||||
Showing options
|
||||
---------------
|
||||
|
||||
If a filesystem accepts mount options, it must define show_options()
|
||||
to show all the currently active options. The rules are:
|
||||
|
||||
- options MUST be shown which are not default or their values differ
|
||||
from the default
|
||||
|
||||
- options MAY be shown which are enabled by default or have their
|
||||
default value
|
||||
|
||||
Options used only internally between a mount helper and the kernel
|
||||
(such as file descriptors), or which only have an effect during the
|
||||
mounting (such as ones controlling the creation of a journal) are exempt
|
||||
from the above rules.
|
||||
|
||||
The underlying reason for the above rules is to make sure, that a
|
||||
mount can be accurately replicated (e.g. umounting and mounting again)
|
||||
based on the information found in /proc/mounts.
|
||||
|
||||
A simple method of saving options at mount/remount time and showing
|
||||
them is provided with the save_mount_options() and
|
||||
generic_show_options() helper functions. Please note, that using
|
||||
these may have drawbacks. For more info see header comments for these
|
||||
functions in fs/namespace.c.
|
||||
|
||||
Resources
|
||||
=========
|
||||
|
|
|
@ -32,7 +32,7 @@ The exact capabilities of GPIOs vary between systems. Common options:
|
|||
- Input values are likewise readable (1, 0). Some chips support readback
|
||||
of pins configured as "output", which is very useful in such "wire-OR"
|
||||
cases (to support bidirectional signaling). GPIO controllers may have
|
||||
input de-glitch logic, sometimes with software controls.
|
||||
input de-glitch/debounce logic, sometimes with software controls.
|
||||
|
||||
- Inputs can often be used as IRQ signals, often edge triggered but
|
||||
sometimes level triggered. Such IRQs may be configurable as system
|
||||
|
@ -60,10 +60,13 @@ used on a board that's wired differently. Only least-common-denominator
|
|||
functionality can be very portable. Other features are platform-specific,
|
||||
and that can be critical for glue logic.
|
||||
|
||||
Plus, this doesn't define an implementation framework, just an interface.
|
||||
Plus, this doesn't require any implementation framework, just an interface.
|
||||
One platform might implement it as simple inline functions accessing chip
|
||||
registers; another might implement it by delegating through abstractions
|
||||
used for several very different kinds of GPIO controller.
|
||||
used for several very different kinds of GPIO controller. (There is some
|
||||
optional code supporting such an implementation strategy, described later
|
||||
in this document, but drivers acting as clients to the GPIO interface must
|
||||
not care how it's implemented.)
|
||||
|
||||
That said, if the convention is supported on their platform, drivers should
|
||||
use it when possible. Platforms should declare GENERIC_GPIO support in
|
||||
|
@ -121,6 +124,11 @@ before tasking is enabled, as part of early board setup.
|
|||
For output GPIOs, the value provided becomes the initial output value.
|
||||
This helps avoid signal glitching during system startup.
|
||||
|
||||
For compatibility with legacy interfaces to GPIOs, setting the direction
|
||||
of a GPIO implicitly requests that GPIO (see below) if it has not been
|
||||
requested already. That compatibility may be removed in the future;
|
||||
explicitly requesting GPIOs is strongly preferred.
|
||||
|
||||
Setting the direction can fail if the GPIO number is invalid, or when
|
||||
that particular GPIO can't be used in that mode. It's generally a bad
|
||||
idea to rely on boot firmware to have set the direction correctly, since
|
||||
|
@ -133,6 +141,7 @@ Spinlock-Safe GPIO access
|
|||
-------------------------
|
||||
Most GPIO controllers can be accessed with memory read/write instructions.
|
||||
That doesn't need to sleep, and can safely be done from inside IRQ handlers.
|
||||
(That includes hardirq contexts on RT kernels.)
|
||||
|
||||
Use these calls to access such GPIOs:
|
||||
|
||||
|
@ -145,7 +154,7 @@ Use these calls to access such GPIOs:
|
|||
The values are boolean, zero for low, nonzero for high. When reading the
|
||||
value of an output pin, the value returned should be what's seen on the
|
||||
pin ... that won't always match the specified output value, because of
|
||||
issues including wire-OR and output latencies.
|
||||
issues including open-drain signaling and output latencies.
|
||||
|
||||
The get/set calls have no error returns because "invalid GPIO" should have
|
||||
been reported earlier from gpio_direction_*(). However, note that not all
|
||||
|
@ -170,7 +179,8 @@ get to the head of a queue to transmit a command and get its response.
|
|||
This requires sleeping, which can't be done from inside IRQ handlers.
|
||||
|
||||
Platforms that support this type of GPIO distinguish them from other GPIOs
|
||||
by returning nonzero from this call:
|
||||
by returning nonzero from this call (which requires a valid GPIO number,
|
||||
either explicitly or implicitly requested):
|
||||
|
||||
int gpio_cansleep(unsigned gpio);
|
||||
|
||||
|
@ -209,8 +219,11 @@ before tasking is enabled, as part of early board setup.
|
|||
These calls serve two basic purposes. One is marking the signals which
|
||||
are actually in use as GPIOs, for better diagnostics; systems may have
|
||||
several hundred potential GPIOs, but often only a dozen are used on any
|
||||
given board. Another is to catch conflicts between drivers, reporting
|
||||
errors when drivers wrongly think they have exclusive use of that signal.
|
||||
given board. Another is to catch conflicts, identifying errors when
|
||||
(a) two or more drivers wrongly think they have exclusive use of that
|
||||
signal, or (b) something wrongly believes it's safe to remove drivers
|
||||
needed to manage a signal that's in active use. That is, requesting a
|
||||
GPIO can serve as a kind of lock.
|
||||
|
||||
These two calls are optional because not all current Linux platforms
|
||||
offer such functionality in their GPIO support; a valid implementation
|
||||
|
@ -223,6 +236,9 @@ Note that requesting a GPIO does NOT cause it to be configured in any
|
|||
way; it just marks that GPIO as in use. Separate code must handle any
|
||||
pin setup (e.g. controlling which pin the GPIO uses, pullup/pulldown).
|
||||
|
||||
Also note that it's your responsibility to have stopped using a GPIO
|
||||
before you free it.
|
||||
|
||||
|
||||
GPIOs mapped to IRQs
|
||||
--------------------
|
||||
|
@ -238,7 +254,7 @@ map between them using calls like:
|
|||
|
||||
Those return either the corresponding number in the other namespace, or
|
||||
else a negative errno code if the mapping can't be done. (For example,
|
||||
some GPIOs can't used as IRQs.) It is an unchecked error to use a GPIO
|
||||
some GPIOs can't be used as IRQs.) It is an unchecked error to use a GPIO
|
||||
number that wasn't set up as an input using gpio_direction_input(), or
|
||||
to use an IRQ number that didn't originally come from gpio_to_irq().
|
||||
|
||||
|
@ -299,17 +315,110 @@ Related to multiplexing is configuration and enabling of the pullups or
|
|||
pulldowns integrated on some platforms. Not all platforms support them,
|
||||
or support them in the same way; and any given board might use external
|
||||
pullups (or pulldowns) so that the on-chip ones should not be used.
|
||||
(When a circuit needs 5 kOhm, on-chip 100 kOhm resistors won't do.)
|
||||
|
||||
There are other system-specific mechanisms that are not specified here,
|
||||
like the aforementioned options for input de-glitching and wire-OR output.
|
||||
Hardware may support reading or writing GPIOs in gangs, but that's usually
|
||||
configuration dependent: for GPIOs sharing the same bank. (GPIOs are
|
||||
commonly grouped in banks of 16 or 32, with a given SOC having several such
|
||||
banks.) Some systems can trigger IRQs from output GPIOs. Code relying on
|
||||
such mechanisms will necessarily be nonportable.
|
||||
banks.) Some systems can trigger IRQs from output GPIOs, or read values
|
||||
from pins not managed as GPIOs. Code relying on such mechanisms will
|
||||
necessarily be nonportable.
|
||||
|
||||
Dynamic definition of GPIOs is not currently supported; for example, as
|
||||
Dynamic definition of GPIOs is not currently standard; for example, as
|
||||
a side effect of configuring an add-on board with some GPIO expanders.
|
||||
|
||||
These calls are purely for kernel space, but a userspace API could be built
|
||||
on top of it.
|
||||
on top of them.
|
||||
|
||||
|
||||
GPIO implementor's framework (OPTIONAL)
|
||||
=======================================
|
||||
As noted earlier, there is an optional implementation framework making it
|
||||
easier for platforms to support different kinds of GPIO controller using
|
||||
the same programming interface.
|
||||
|
||||
As a debugging aid, if debugfs is available a /sys/kernel/debug/gpio file
|
||||
will be found there. That will list all the controllers registered through
|
||||
this framework, and the state of the GPIOs currently in use.
|
||||
|
||||
|
||||
Controller Drivers: gpio_chip
|
||||
-----------------------------
|
||||
In this framework each GPIO controller is packaged as a "struct gpio_chip"
|
||||
with information common to each controller of that type:
|
||||
|
||||
- methods to establish GPIO direction
|
||||
- methods used to access GPIO values
|
||||
- flag saying whether calls to its methods may sleep
|
||||
- optional debugfs dump method (showing extra state like pullup config)
|
||||
- label for diagnostics
|
||||
|
||||
There is also per-instance data, which may come from device.platform_data:
|
||||
the number of its first GPIO, and how many GPIOs it exposes.
|
||||
|
||||
The code implementing a gpio_chip should support multiple instances of the
|
||||
controller, possibly using the driver model. That code will configure each
|
||||
gpio_chip and issue gpiochip_add(). Removing a GPIO controller should be
|
||||
rare; use gpiochip_remove() when it is unavoidable.
|
||||
|
||||
Most often a gpio_chip is part of an instance-specific structure with state
|
||||
not exposed by the GPIO interfaces, such as addressing, power management,
|
||||
and more. Chips such as codecs will have complex non-GPIO state.
|
||||
|
||||
Any debugfs dump method should normally ignore signals which haven't been
|
||||
requested as GPIOs. They can use gpiochip_is_requested(), which returns
|
||||
either NULL or the label associated with that GPIO when it was requested.
|
||||
|
||||
|
||||
Platform Support
|
||||
----------------
|
||||
To support this framework, a platform's Kconfig will "select HAVE_GPIO_LIB"
|
||||
and arrange that its <asm/gpio.h> includes <asm-generic/gpio.h> and defines
|
||||
three functions: gpio_get_value(), gpio_set_value(), and gpio_cansleep().
|
||||
They may also want to provide a custom value for ARCH_NR_GPIOS.
|
||||
|
||||
Trivial implementations of those functions can directly use framework
|
||||
code, which always dispatches through the gpio_chip:
|
||||
|
||||
#define gpio_get_value __gpio_get_value
|
||||
#define gpio_set_value __gpio_set_value
|
||||
#define gpio_cansleep __gpio_cansleep
|
||||
|
||||
Fancier implementations could instead define those as inline functions with
|
||||
logic optimizing access to specific SOC-based GPIOs. For example, if the
|
||||
referenced GPIO is the constant "12", getting or setting its value could
|
||||
cost as little as two or three instructions, never sleeping. When such an
|
||||
optimization is not possible those calls must delegate to the framework
|
||||
code, costing at least a few dozen instructions. For bitbanged I/O, such
|
||||
instruction savings can be significant.
|
||||
|
||||
For SOCs, platform-specific code defines and registers gpio_chip instances
|
||||
for each bank of on-chip GPIOs. Those GPIOs should be numbered/labeled to
|
||||
match chip vendor documentation, and directly match board schematics. They
|
||||
may well start at zero and go up to a platform-specific limit. Such GPIOs
|
||||
are normally integrated into platform initialization to make them always be
|
||||
available, from arch_initcall() or earlier; they can often serve as IRQs.
|
||||
|
||||
|
||||
Board Support
|
||||
-------------
|
||||
For external GPIO controllers -- such as I2C or SPI expanders, ASICs, multi
|
||||
function devices, FPGAs or CPLDs -- most often board-specific code handles
|
||||
registering controller devices and ensures that their drivers know what GPIO
|
||||
numbers to use with gpiochip_add(). Their numbers often start right after
|
||||
platform-specific GPIOs.
|
||||
|
||||
For example, board setup code could create structures identifying the range
|
||||
of GPIOs that chip will expose, and pass them to each GPIO expander chip
|
||||
using platform_data. Then the chip driver's probe() routine could pass that
|
||||
data to gpiochip_add().
|
||||
|
||||
Initialization order can be important. For example, when a device relies on
|
||||
an I2C-based GPIO, its probe() routine should only be called after that GPIO
|
||||
becomes available. That may mean the device should not be registered until
|
||||
calls for that GPIO can work. One way to address such dependencies is for
|
||||
such gpio_chip controllers to provide setup() and teardown() callbacks to
|
||||
board specific code; those board specific callbacks would register devices
|
||||
once all the necessary resources are available.
|
||||
|
|
|
@ -0,0 +1,36 @@
|
|||
Kernel driver ads7828
|
||||
=====================
|
||||
|
||||
Supported chips:
|
||||
* Texas Instruments/Burr-Brown ADS7828
|
||||
Prefix: 'ads7828'
|
||||
Addresses scanned: I2C 0x48, 0x49, 0x4a, 0x4b
|
||||
Datasheet: Publicly available at the Texas Instruments website :
|
||||
http://focus.ti.com/lit/ds/symlink/ads7828.pdf
|
||||
|
||||
Authors:
|
||||
Steve Hardy <steve@linuxrealtime.co.uk>
|
||||
|
||||
Module Parameters
|
||||
-----------------
|
||||
|
||||
* se_input: bool (default Y)
|
||||
Single ended operation - set to N for differential mode
|
||||
* int_vref: bool (default Y)
|
||||
Operate with the internal 2.5V reference - set to N for external reference
|
||||
* vref_mv: int (default 2500)
|
||||
If using an external reference, set this to the reference voltage in mV
|
||||
|
||||
Description
|
||||
-----------
|
||||
|
||||
This driver implements support for the Texas Instruments ADS7828.
|
||||
|
||||
This device is a 12-bit 8-channel A-D converter.
|
||||
|
||||
It can operate in single ended mode (8 +ve inputs) or in differential mode,
|
||||
where 4 differential pairs can be measured.
|
||||
|
||||
The chip also has the facility to use an external voltage reference. This
|
||||
may be required if your hardware supplies the ADS7828 from a 5V supply, see
|
||||
the datasheet for more details.
|
|
@ -30,7 +30,7 @@ Supported chips:
|
|||
Datasheet: No longer available
|
||||
|
||||
Authors:
|
||||
Christophe Gauthron <chrisg@0-in.com>
|
||||
Christophe Gauthron
|
||||
Jean Delvare <khali@linux-fr.org>
|
||||
|
||||
|
||||
|
|
|
@ -4,12 +4,12 @@ Kernel driver lm78
|
|||
Supported chips:
|
||||
* National Semiconductor LM78 / LM78-J
|
||||
Prefix: 'lm78'
|
||||
Addresses scanned: I2C 0x20 - 0x2f, ISA 0x290 (8 I/O ports)
|
||||
Addresses scanned: I2C 0x28 - 0x2f, ISA 0x290 (8 I/O ports)
|
||||
Datasheet: Publicly available at the National Semiconductor website
|
||||
http://www.national.com/
|
||||
* National Semiconductor LM79
|
||||
Prefix: 'lm79'
|
||||
Addresses scanned: I2C 0x20 - 0x2f, ISA 0x290 (8 I/O ports)
|
||||
Addresses scanned: I2C 0x28 - 0x2f, ISA 0x290 (8 I/O ports)
|
||||
Datasheet: Publicly available at the National Semiconductor website
|
||||
http://www.national.com/
|
||||
|
||||
|
|
|
@ -4,8 +4,12 @@ Kernel driver lm87
|
|||
Supported chips:
|
||||
* National Semiconductor LM87
|
||||
Prefix: 'lm87'
|
||||
Addresses scanned: I2C 0x2c - 0x2f
|
||||
Addresses scanned: I2C 0x2c - 0x2e
|
||||
Datasheet: http://www.national.com/pf/LM/LM87.html
|
||||
* Analog Devices ADM1024
|
||||
Prefix: 'adm1024'
|
||||
Addresses scanned: I2C 0x2c - 0x2e
|
||||
Datasheet: http://www.analog.com/en/prod/0,2877,ADM1024,00.html
|
||||
|
||||
Authors:
|
||||
Frodo Looijaard <frodol@dds.nl>,
|
||||
|
@ -19,11 +23,12 @@ Authors:
|
|||
Description
|
||||
-----------
|
||||
|
||||
This driver implements support for the National Semiconductor LM87.
|
||||
This driver implements support for the National Semiconductor LM87
|
||||
and the Analog Devices ADM1024.
|
||||
|
||||
The LM87 implements up to three temperature sensors, up to two fan
|
||||
rotation speed sensors, up to seven voltage sensors, alarms, and some
|
||||
miscellaneous stuff.
|
||||
miscellaneous stuff. The ADM1024 is fully compatible.
|
||||
|
||||
Temperatures are measured in degrees Celsius. Each input has a high
|
||||
and a low alarm setting. A high limit produces an alarm when the value
|
||||
|
|
|
@ -14,7 +14,7 @@ Lm-sensors
|
|||
|
||||
Core set of utilities that will allow you to obtain health information,
|
||||
setup monitoring limits etc. You can get them on their homepage
|
||||
http://www.lm-sensors.nu/ or as a package from your Linux distribution.
|
||||
http://www.lm-sensors.org/ or as a package from your Linux distribution.
|
||||
|
||||
If from website:
|
||||
Get lm-sensors from project web site. Please note, you need only userspace
|
||||
|
|
|
@ -23,8 +23,9 @@ W83627DHG super I/O chips. We will refer to them collectively as Winbond chips.
|
|||
|
||||
The chips implement three temperature sensors, five fan rotation
|
||||
speed sensors, ten analog voltage sensors (only nine for the 627DHG), one
|
||||
VID (6 pins), alarms with beep warnings (control unimplemented), and
|
||||
some automatic fan regulation strategies (plus manual fan control mode).
|
||||
VID (6 pins for the 627EHF/EHG, 8 pins for the 627DHG), alarms with beep
|
||||
warnings (control unimplemented), and some automatic fan regulation
|
||||
strategies (plus manual fan control mode).
|
||||
|
||||
Temperatures are measured in degrees Celsius and measurement resolution is 1
|
||||
degC for temp1 and 0.5 degC for temp2 and temp3. An alarm is triggered when
|
||||
|
|
|
@ -73,5 +73,4 @@ doesn't help, you may just ignore the bogus VID reading with no harm done.
|
|||
|
||||
For further information on this driver see the w83781d driver documentation.
|
||||
|
||||
[1] http://www2.lm-sensors.nu/~lm78/cvs/browse.cgi/lm_sensors2/doc/vid
|
||||
|
||||
[1] http://www.lm-sensors.org/browser/lm-sensors/trunk/doc/vid
|
||||
|
|
|
@ -4,20 +4,16 @@ Kernel driver w83781d
|
|||
Supported chips:
|
||||
* Winbond W83781D
|
||||
Prefix: 'w83781d'
|
||||
Addresses scanned: I2C 0x20 - 0x2f, ISA 0x290 (8 I/O ports)
|
||||
Addresses scanned: I2C 0x28 - 0x2f, ISA 0x290 (8 I/O ports)
|
||||
Datasheet: http://www.winbond-usa.com/products/winbond_products/pdfs/PCIC/w83781d.pdf
|
||||
* Winbond W83782D
|
||||
Prefix: 'w83782d'
|
||||
Addresses scanned: I2C 0x20 - 0x2f, ISA 0x290 (8 I/O ports)
|
||||
Addresses scanned: I2C 0x28 - 0x2f, ISA 0x290 (8 I/O ports)
|
||||
Datasheet: http://www.winbond.com/PDF/sheet/w83782d.pdf
|
||||
* Winbond W83783S
|
||||
Prefix: 'w83783s'
|
||||
Addresses scanned: I2C 0x2d
|
||||
Datasheet: http://www.winbond-usa.com/products/winbond_products/pdfs/PCIC/w83783s.pdf
|
||||
* Winbond W83627HF
|
||||
Prefix: 'w83627hf'
|
||||
Addresses scanned: I2C 0x20 - 0x2f, ISA 0x290 (8 I/O ports)
|
||||
Datasheet: http://www.winbond.com/PDF/sheet/w83627hf.pdf
|
||||
* Asus AS99127F
|
||||
Prefix: 'as99127f'
|
||||
Addresses scanned: I2C 0x28 - 0x2f
|
||||
|
@ -50,20 +46,18 @@ force_subclients=bus,caddr,saddr,saddr
|
|||
Description
|
||||
-----------
|
||||
|
||||
This driver implements support for the Winbond W83781D, W83782D, W83783S,
|
||||
W83627HF chips, and the Asus AS99127F chips. We will refer to them
|
||||
collectively as W8378* chips.
|
||||
This driver implements support for the Winbond W83781D, W83782D, W83783S
|
||||
chips, and the Asus AS99127F chips. We will refer to them collectively as
|
||||
W8378* chips.
|
||||
|
||||
There is quite some difference between these chips, but they are similar
|
||||
enough that it was sensible to put them together in one driver.
|
||||
The W83627HF chip is assumed to be identical to the ISA W83782D.
|
||||
The Asus chips are similar to an I2C-only W83782D.
|
||||
|
||||
Chip #vin #fanin #pwm #temp wchipid vendid i2c ISA
|
||||
as99127f 7 3 0 3 0x31 0x12c3 yes no
|
||||
as99127f rev.2 (type_name = as99127f) 0x31 0x5ca3 yes no
|
||||
w83781d 7 3 0 3 0x10-1 0x5ca3 yes yes
|
||||
w83627hf 9 3 2 3 0x21 0x5ca3 yes yes(LPC)
|
||||
w83782d 9 3 2-4 3 0x30 0x5ca3 yes yes
|
||||
w83783s 5-6 3 2 1-2 0x40 0x5ca3 yes no
|
||||
|
||||
|
@ -143,9 +137,9 @@ Individual alarm and beep bits:
|
|||
0x000400: in6
|
||||
0x000800: fan3
|
||||
0x001000: chassis
|
||||
0x002000: temp3 (W83782D and W83627HF only)
|
||||
0x010000: in7 (W83782D and W83627HF only)
|
||||
0x020000: in8 (W83782D and W83627HF only)
|
||||
0x002000: temp3 (W83782D only)
|
||||
0x010000: in7 (W83782D only)
|
||||
0x020000: in8 (W83782D only)
|
||||
|
||||
If an alarm triggers, it will remain triggered until the hardware register
|
||||
is read at least once. This means that the cause for the alarm may
|
||||
|
|
|
@ -0,0 +1,54 @@
|
|||
Kernel driver w83l786ng
|
||||
=======================
|
||||
|
||||
Supported chips:
|
||||
* Winbond W83L786NG/W83L786NR
|
||||
Prefix: 'w83l786ng'
|
||||
Addresses scanned: I2C 0x2e - 0x2f
|
||||
Datasheet: http://www.winbond-usa.com/products/winbond_products/pdfs/PCIC/W83L786NRNG09.pdf
|
||||
|
||||
Author: Kevin Lo <kevlo@kevlo.org>
|
||||
|
||||
|
||||
Module Parameters
|
||||
-----------------
|
||||
|
||||
* reset boolean
|
||||
(default 0)
|
||||
Use 'reset=1' to reset the chip (via index 0x40, bit 7). The default
|
||||
behavior is no chip reset to preserve BIOS settings
|
||||
|
||||
|
||||
Description
|
||||
-----------
|
||||
|
||||
This driver implements support for Winbond W83L786NG/W83L786NR chips.
|
||||
|
||||
The driver implements two temperature sensors, two fan rotation speed
|
||||
sensors, and three voltage sensors.
|
||||
|
||||
Temperatures are measured in degrees Celsius and measurement resolution is 1
|
||||
degC for temp1 and temp2.
|
||||
|
||||
Fan rotation speeds are reported in RPM (rotations per minute). Fan readings
|
||||
can be divided by a programmable divider (1, 2, 4, 8, 16, 32, 64
|
||||
or 128 for fan 1/2) to give the readings more range or accuracy.
|
||||
|
||||
Voltage sensors (also known as IN sensors) report their values in millivolts.
|
||||
An alarm is triggered if the voltage has crossed a programmable minimum
|
||||
or maximum limit.
|
||||
|
||||
/sys files
|
||||
----------
|
||||
|
||||
pwm[1-2] - this file stores PWM duty cycle or DC value (fan speed) in range:
|
||||
0 (stop) to 255 (full)
|
||||
pwm[1-2]_enable - this file controls mode of fan/temperature control:
|
||||
* 0 Manual Mode
|
||||
* 1 Thermal Cruise
|
||||
* 2 Smart Fan II
|
||||
* 4 FAN_SET
|
||||
pwm[1-2]_mode - Select PWM or DC mode
|
||||
* 0 DC
|
||||
* 1 PWM
|
||||
tolerance[1-2] - Value in degrees of Celsius (degC) for +- T
|
|
@ -95,4 +95,4 @@ of all affected systems, so the only safe solution was to prevent access to
|
|||
the SMBus on all IBM systems (detected using DMI data.)
|
||||
|
||||
For additional information, read:
|
||||
http://www2.lm-sensors.nu/~lm78/cvs/lm_sensors2/README.thinkpad
|
||||
http://www.lm-sensors.org/browser/lm-sensors/trunk/README.thinkpad
|
||||
|
|
|
@ -1,6 +1,9 @@
|
|||
Kernel driver pca9539
|
||||
=====================
|
||||
|
||||
NOTE: this driver is deprecated and will be dropped soon, use
|
||||
drivers/gpio/pca9539.c instead.
|
||||
|
||||
Supported chips:
|
||||
* Philips PCA9539
|
||||
Prefix: 'pca9539'
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
#include <fcntl.h>
|
||||
#include <fnmatch.h>
|
||||
#include <string.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <sys/mman.h>
|
||||
#include <sys/stat.h>
|
||||
#include <unistd.h>
|
||||
|
@ -65,7 +66,7 @@ int scan_tree(char *path, char *file, off_t offset, size_t length, int touch)
|
|||
{
|
||||
struct dirent **namelist;
|
||||
char *name, *path2;
|
||||
int i, n, r, rc, result = 0;
|
||||
int i, n, r, rc = 0, result = 0;
|
||||
struct stat buf;
|
||||
|
||||
n = scandir(path, &namelist, 0, alphasort);
|
||||
|
@ -113,7 +114,7 @@ skip:
|
|||
free(namelist[i]);
|
||||
}
|
||||
free(namelist);
|
||||
return rc;
|
||||
return result;
|
||||
}
|
||||
|
||||
char buf[1024];
|
||||
|
@ -149,7 +150,7 @@ int scan_rom(char *path, char *file)
|
|||
{
|
||||
struct dirent **namelist;
|
||||
char *name, *path2;
|
||||
int i, n, r, rc, result = 0;
|
||||
int i, n, r, rc = 0, result = 0;
|
||||
struct stat buf;
|
||||
|
||||
n = scandir(path, &namelist, 0, alphasort);
|
||||
|
@ -180,7 +181,7 @@ int scan_rom(char *path, char *file)
|
|||
* important thing is that no MCA happened.
|
||||
*/
|
||||
if (rc > 0)
|
||||
fprintf(stderr, "PASS: %s read %ld bytes\n", path2, rc);
|
||||
fprintf(stderr, "PASS: %s read %d bytes\n", path2, rc);
|
||||
else {
|
||||
fprintf(stderr, "PASS: %s not readable\n", path2);
|
||||
return rc;
|
||||
|
@ -201,10 +202,10 @@ skip:
|
|||
free(namelist[i]);
|
||||
}
|
||||
free(namelist);
|
||||
return rc;
|
||||
return result;
|
||||
}
|
||||
|
||||
int main()
|
||||
int main(void)
|
||||
{
|
||||
int rc;
|
||||
|
||||
|
@ -256,4 +257,6 @@ int main()
|
|||
scan_tree("/proc/bus/pci", "??.?", 0xA0000, 0x20000, 0);
|
||||
scan_tree("/proc/bus/pci", "??.?", 0xC0000, 0x40000, 1);
|
||||
scan_tree("/proc/bus/pci", "??.?", 0, 1024*1024, 0);
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
|
|
@ -22,7 +22,7 @@ static struct input_dev *button_dev;
|
|||
|
||||
static void button_interrupt(int irq, void *dummy, struct pt_regs *fp)
|
||||
{
|
||||
input_report_key(button_dev, BTN_1, inb(BUTTON_PORT) & 1);
|
||||
input_report_key(button_dev, BTN_0, inb(BUTTON_PORT) & 1);
|
||||
input_sync(button_dev);
|
||||
}
|
||||
|
||||
|
|
|
@ -58,7 +58,7 @@ they should not wrap twice before you notice them.
|
|||
Each set of stats only applies to the indicated device; if you want
|
||||
system-wide stats you'll have to find all the devices and sum them all up.
|
||||
|
||||
Field 1 -- # of reads issued
|
||||
Field 1 -- # of reads completed
|
||||
This is the total number of reads completed successfully.
|
||||
Field 2 -- # of reads merged, field 6 -- # of writes merged
|
||||
Reads and writes which are adjacent to each other may be merged for
|
||||
|
@ -132,6 +132,19 @@ words, the number of reads for partitions is counted slightly before time
|
|||
of queuing for partitions, and at completion for whole disks. This is
|
||||
a subtle distinction that is probably uninteresting for most cases.
|
||||
|
||||
More significant is the error induced by counting the numbers of
|
||||
reads/writes before merges for partitions and after for disks. Since a
|
||||
typical workload usually contains a lot of successive and adjacent requests,
|
||||
the number of reads/writes issued can be several times higher than the
|
||||
number of reads/writes completed.
|
||||
|
||||
In 2.6.25, the full statistic set is again available for partitions and
|
||||
disk and partition statistics are consistent again. Since we still don't
|
||||
keep record of the partition-relative address, an operation is attributed to
|
||||
the partition which contains the first sector of the request after the
|
||||
eventual merges. As requests can be merged across partitions, this could lead
|
||||
to some (probably insignificant) inaccuracy.
|
||||
|
||||
Additional notes
|
||||
----------------
|
||||
|
||||
|
|
|
@ -147,8 +147,10 @@ and is between 256 and 4096 characters. It is defined in the file
|
|||
default: 0
|
||||
|
||||
acpi_sleep= [HW,ACPI] Sleep options
|
||||
Format: { s3_bios, s3_mode }
|
||||
See Documentation/power/video.txt
|
||||
Format: { s3_bios, s3_mode, s3_beep }
|
||||
See Documentation/power/video.txt for s3_bios and s3_mode.
|
||||
s3_beep is for debugging; it makes the PC's speaker beep
|
||||
as soon as the kernel's real-mode entry point is called.
|
||||
|
||||
acpi_sci= [HW,ACPI] ACPI System Control Interrupt trigger mode
|
||||
Format: { level | edge | high | low }
|
||||
|
@ -175,6 +177,9 @@ and is between 256 and 4096 characters. It is defined in the file
|
|||
|
||||
acpi_no_auto_ssdt [HW,ACPI] Disable automatic loading of SSDT
|
||||
|
||||
acpi_no_initrd_override [KNL,ACPI]
|
||||
Disable loading custom ACPI tables from the initramfs
|
||||
|
||||
acpi_os_name= [HW,ACPI] Tell ACPI BIOS the name of the OS
|
||||
Format: To spoof as Windows 98: ="Microsoft Windows"
|
||||
|
||||
|
@ -780,6 +785,9 @@ and is between 256 and 4096 characters. It is defined in the file
|
|||
loop use the MONITOR/MWAIT idle loop anyways. Performance should be the same
|
||||
as idle=poll.
|
||||
|
||||
ide-pci-generic.all-generic-ide [HW] (E)IDE subsystem
|
||||
Claim all unknown PCI IDE storage controllers.
|
||||
|
||||
ignore_loglevel [KNL]
|
||||
Ignore loglevel setting - this will print /all/
|
||||
kernel messages to the console. Useful for debugging.
|
||||
|
@ -1965,9 +1973,6 @@ and is between 256 and 4096 characters. It is defined in the file
|
|||
<deci-seconds>: poll at this frequency
|
||||
0: no polling (default)
|
||||
|
||||
time Show timing data prefixed to each printk message line
|
||||
[deprecated, see 'printk.time']
|
||||
|
||||
tipar.timeout= [HW,PPT]
|
||||
Set communications timeout in tenths of a second
|
||||
(default 15).
|
||||
|
|
|
@ -92,11 +92,12 @@ handler has run. Up to MAX_STACK_SIZE bytes are copied -- e.g.,
|
|||
64 bytes on i386.
|
||||
|
||||
Note that the probed function's args may be passed on the stack
|
||||
or in registers (e.g., for x86_64 or for an i386 fastcall function).
|
||||
The jprobe will work in either case, so long as the handler's
|
||||
prototype matches that of the probed function.
|
||||
or in registers. The jprobe will work in either case, so long as the
|
||||
handler's prototype matches that of the probed function.
|
||||
|
||||
1.3 How Does a Return Probe Work?
|
||||
1.3 Return Probes
|
||||
|
||||
1.3.1 How Does a Return Probe Work?
|
||||
|
||||
When you call register_kretprobe(), Kprobes establishes a kprobe at
|
||||
the entry to the function. When the probed function is called and this
|
||||
|
@ -107,9 +108,9 @@ At boot time, Kprobes registers a kprobe at the trampoline.
|
|||
|
||||
When the probed function executes its return instruction, control
|
||||
passes to the trampoline and that probe is hit. Kprobes' trampoline
|
||||
handler calls the user-specified handler associated with the kretprobe,
|
||||
then sets the saved instruction pointer to the saved return address,
|
||||
and that's where execution resumes upon return from the trap.
|
||||
handler calls the user-specified return handler associated with the
|
||||
kretprobe, then sets the saved instruction pointer to the saved return
|
||||
address, and that's where execution resumes upon return from the trap.
|
||||
|
||||
While the probed function is executing, its return address is
|
||||
stored in an object of type kretprobe_instance. Before calling
|
||||
|
@ -131,6 +132,30 @@ zero when the return probe is registered, and is incremented every
|
|||
time the probed function is entered but there is no kretprobe_instance
|
||||
object available for establishing the return probe.
|
||||
|
||||
1.3.2 Kretprobe entry-handler
|
||||
|
||||
Kretprobes also provides an optional user-specified handler which runs
|
||||
on function entry. This handler is specified by setting the entry_handler
|
||||
field of the kretprobe struct. Whenever the kprobe placed by kretprobe at the
|
||||
function entry is hit, the user-defined entry_handler, if any, is invoked.
|
||||
If the entry_handler returns 0 (success) then a corresponding return handler
|
||||
is guaranteed to be called upon function return. If the entry_handler
|
||||
returns a non-zero error then Kprobes leaves the return address as is, and
|
||||
the kretprobe has no further effect for that particular function instance.
|
||||
|
||||
Multiple entry and return handler invocations are matched using the unique
|
||||
kretprobe_instance object associated with them. Additionally, a user
|
||||
may also specify per return-instance private data to be part of each
|
||||
kretprobe_instance object. This is especially useful when sharing private
|
||||
data between corresponding user entry and return handlers. The size of each
|
||||
private data object can be specified at kretprobe registration time by
|
||||
setting the data_size field of the kretprobe struct. This data can be
|
||||
accessed through the data field of each kretprobe_instance object.
|
||||
|
||||
In case the probed function is entered but there is no kretprobe_instance
|
||||
object available, then in addition to incrementing the nmissed count,
|
||||
the user entry_handler invocation is also skipped.
|
||||
|
||||
2. Architectures Supported
|
||||
|
||||
Kprobes, jprobes, and return probes are implemented on the following
|
||||
|
@ -244,9 +269,9 @@ Kprobes runs the handler whose address is jp->entry.
|
|||
The handler should have the same arg list and return type as the probed
|
||||
function; and just before it returns, it must call jprobe_return().
|
||||
(The handler never actually returns, since jprobe_return() returns
|
||||
control to Kprobes.) If the probed function is declared asmlinkage,
|
||||
fastcall, or anything else that affects how args are passed, the
|
||||
handler's declaration must match.
|
||||
control to Kprobes.) If the probed function is declared asmlinkage
|
||||
or anything else that affects how args are passed, the handler's
|
||||
declaration must match.
|
||||
|
||||
register_jprobe() returns 0 on success, or a negative errno otherwise.
|
||||
|
||||
|
@ -274,6 +299,8 @@ of interest:
|
|||
- ret_addr: the return address
|
||||
- rp: points to the corresponding kretprobe object
|
||||
- task: points to the corresponding task struct
|
||||
- data: points to per return-instance private data; see "Kretprobe
|
||||
entry-handler" for details.
|
||||
|
||||
The regs_return_value(regs) macro provides a simple abstraction to
|
||||
extract the return value from the appropriate register as defined by
|
||||
|
@ -556,23 +583,52 @@ report failed calls to sys_open().
|
|||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/kprobes.h>
|
||||
#include <linux/ktime.h>
|
||||
|
||||
/* per-instance private data */
|
||||
struct my_data {
|
||||
ktime_t entry_stamp;
|
||||
};
|
||||
|
||||
static const char *probed_func = "sys_open";
|
||||
|
||||
/* Return-probe handler: If the probed function fails, log the return value. */
|
||||
static int ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
|
||||
/* Timestamp function entry. */
|
||||
static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
|
||||
{
|
||||
struct my_data *data;
|
||||
|
||||
if(!current->mm)
|
||||
return 1; /* skip kernel threads */
|
||||
|
||||
data = (struct my_data *)ri->data;
|
||||
data->entry_stamp = ktime_get();
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* If the probed function failed, log the return value and duration.
|
||||
* Duration may turn out to be zero consistently, depending upon the
|
||||
* granularity of time accounting on the platform. */
|
||||
static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
|
||||
{
|
||||
int retval = regs_return_value(regs);
|
||||
struct my_data *data = (struct my_data *)ri->data;
|
||||
s64 delta;
|
||||
ktime_t now;
|
||||
|
||||
if (retval < 0) {
|
||||
printk("%s returns %d\n", probed_func, retval);
|
||||
now = ktime_get();
|
||||
delta = ktime_to_ns(ktime_sub(now, data->entry_stamp));
|
||||
printk("%s: return val = %d (duration = %lld ns)\n",
|
||||
probed_func, retval, delta);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct kretprobe my_kretprobe = {
|
||||
.handler = ret_handler,
|
||||
/* Probe up to 20 instances concurrently. */
|
||||
.maxactive = 20
|
||||
.handler = return_handler,
|
||||
.entry_handler = entry_handler,
|
||||
.data_size = sizeof(struct my_data),
|
||||
.maxactive = 20, /* probe up to 20 instances concurrently */
|
||||
};
|
||||
|
||||
static int __init kretprobe_init(void)
|
||||
|
@ -584,7 +640,7 @@ static int __init kretprobe_init(void)
|
|||
printk("register_kretprobe failed, returned %d\n", ret);
|
||||
return -1;
|
||||
}
|
||||
printk("Planted return probe at %p\n", my_kretprobe.kp.addr);
|
||||
printk("Kretprobe active on %s\n", my_kretprobe.kp.symbol_name);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -594,7 +650,7 @@ static void __exit kretprobe_exit(void)
|
|||
printk("kretprobe unregistered\n");
|
||||
/* nmissed > 0 suggests that maxactive was set too low. */
|
||||
printk("Missed probing %d instances of %s\n",
|
||||
my_kretprobe.nmissed, probed_func);
|
||||
my_kretprobe.nmissed, probed_func);
|
||||
}
|
||||
|
||||
module_init(kretprobe_init)
|
||||
|
|
|
@ -141,10 +141,10 @@ The last rule (rule 3) is the nastiest one to handle. Say, for
|
|||
instance, you have a list of items that are each kref-ed, and you wish
|
||||
to get the first one. You can't just pull the first item off the list
|
||||
and kref_get() it. That violates rule 3 because you are not already
|
||||
holding a valid pointer. You must add locks or semaphores. For
|
||||
instance:
|
||||
holding a valid pointer. You must add a mutex (or some other lock).
|
||||
For instance:
|
||||
|
||||
static DECLARE_MUTEX(sem);
|
||||
static DEFINE_MUTEX(mutex);
|
||||
static LIST_HEAD(q);
|
||||
struct my_data
|
||||
{
|
||||
|
@ -155,12 +155,12 @@ struct my_data
|
|||
static struct my_data *get_entry()
|
||||
{
|
||||
struct my_data *entry = NULL;
|
||||
down(&sem);
|
||||
mutex_lock(&mutex);
|
||||
if (!list_empty(&q)) {
|
||||
entry = container_of(q.next, struct my_q_entry, link);
|
||||
kref_get(&entry->refcount);
|
||||
}
|
||||
up(&sem);
|
||||
mutex_unlock(&mutex);
|
||||
return entry;
|
||||
}
|
||||
|
||||
|
@ -174,9 +174,9 @@ static void release_entry(struct kref *ref)
|
|||
|
||||
static void put_entry(struct my_data *entry)
|
||||
{
|
||||
down(&sem);
|
||||
mutex_lock(&mutex);
|
||||
kref_put(&entry->refcount, release_entry);
|
||||
up(&sem);
|
||||
mutex_unlock(&mutex);
|
||||
}
|
||||
|
||||
The kref_put() return value is useful if you do not want to hold the
|
||||
|
@ -191,13 +191,13 @@ static void release_entry(struct kref *ref)
|
|||
|
||||
static void put_entry(struct my_data *entry)
|
||||
{
|
||||
down(&sem);
|
||||
mutex_lock(&mutex);
|
||||
if (kref_put(&entry->refcount, release_entry)) {
|
||||
list_del(&entry->link);
|
||||
up(&sem);
|
||||
mutex_unlock(&mutex);
|
||||
kfree(entry);
|
||||
} else
|
||||
up(&sem);
|
||||
mutex_unlock(&mutex);
|
||||
}
|
||||
|
||||
This is really more useful if you have to call other routines as part
|
||||
|
|
|
@ -0,0 +1,10 @@
|
|||
00-INDEX
|
||||
- This file
|
||||
acer-wmi.txt
|
||||
- information on the Acer Laptop WMI Extras driver.
|
||||
sony-laptop.txt
|
||||
- Sony Notebook Control Driver (SNC) Readme.
|
||||
sonypi.txt
|
||||
- info on Linux Sony Programmable I/O Device support.
|
||||
thinkpad-acpi.txt
|
||||
- information on the (IBM and Lenovo) ThinkPad ACPI Extras driver.
|
|
@ -0,0 +1,202 @@
|
|||
Acer Laptop WMI Extras Driver
|
||||
http://code.google.com/p/aceracpi
|
||||
Version 0.1
|
||||
9th February 2008
|
||||
|
||||
Copyright 2007-2008 Carlos Corbacho <carlos@strangeworlds.co.uk>
|
||||
|
||||
acer-wmi is a driver to allow you to control various parts of your Acer laptop
|
||||
hardware under Linux which are exposed via ACPI-WMI.
|
||||
|
||||
This driver completely replaces the old out-of-tree acer_acpi, which I am
|
||||
currently maintaining for bug fixes only on pre-2.6.25 kernels. All development
|
||||
work is now focused solely on acer-wmi.
|
||||
|
||||
Disclaimer
|
||||
**********
|
||||
|
||||
Acer and Wistron have provided nothing towards the development of acer_acpi or
|
||||
acer-wmi. All information we have has been through the efforts of the developers
|
||||
and the users to discover as much as possible about the hardware.
|
||||
|
||||
As such, I do warn that this could break your hardware - this is extremely
|
||||
unlikely of course, but please bear this in mind.
|
||||
|
||||
Background
|
||||
**********
|
||||
|
||||
acer-wmi is derived from acer_acpi, originally developed by Mark
|
||||
Smith in 2005, then taken over by Carlos Corbacho in 2007, in order to activate
|
||||
the wireless LAN card under a 64-bit version of Linux, as acerhk[1] (the
|
||||
previous solution to the problem) relied on making 32 bit BIOS calls which are
|
||||
not possible in kernel space from a 64 bit OS.
|
||||
|
||||
[1] acerhk: http://www.cakey.de/acerhk/
|
||||
|
||||
Supported Hardware
|
||||
******************
|
||||
|
||||
Please see the website for the current list of known working hardware:
|
||||
|
||||
http://code.google.com/p/aceracpi/wiki/SupportedHardware
|
||||
|
||||
If your laptop is not listed, or listed as unknown, and works with acer-wmi,
|
||||
please contact me with a copy of the DSDT.
|
||||
|
||||
If your Acer laptop doesn't work with acer-wmi, I would also like to see the
|
||||
DSDT.
|
||||
|
||||
To send me the DSDT, as root/sudo:
|
||||
|
||||
cat /sys/firmware/acpi/DSDT > dsdt
|
||||
|
||||
And send me the resulting 'dsdt' file.
|
||||
|
||||
Usage
|
||||
*****
|
||||
|
||||
On Acer laptops, acer-wmi should already be autoloaded based on DMI matching.
|
||||
For non-Acer laptops, until WMI based autoloading support is added, you will
|
||||
need to manually load acer-wmi.
|
||||
|
||||
acer-wmi creates /sys/devices/platform/acer-wmi, and fills it with various
|
||||
files whose usage is detailed below, which enables you to control some of the
|
||||
following (varies between models):
|
||||
|
||||
* the wireless LAN card radio
|
||||
* inbuilt Bluetooth adapter
|
||||
* inbuilt 3G card
|
||||
* mail LED of your laptop
|
||||
* brightness of the LCD panel
|
||||
|
||||
Wireless
|
||||
********
|
||||
|
||||
With regards to wireless, all acer-wmi does is enable the radio on the card. It
|
||||
is not responsible for the wireless LED - once the radio is enabled, this is
|
||||
down to the wireless driver for your card. So the behaviour of the wireless LED,
|
||||
once you enable the radio, will depend on your hardware and driver combination.
|
||||
|
||||
e.g. With the BCM4318 on the Acer Aspire 5020 series:
|
||||
|
||||
ndiswrapper: Light blinks on when transmitting
|
||||
bcm43xx/b43: Solid light, blinks off when transmitting
|
||||
|
||||
Wireless radio control is unconditionally enabled - all Acer laptops that support
|
||||
acer-wmi come with built-in wireless. However, should you feel so inclined to
|
||||
ever wish to remove the card, or swap it out at some point, please get in touch
|
||||
with me, as we may well be able to gain some data on wireless card detection.
|
||||
|
||||
To read the status of the wireless radio (0=off, 1=on):
|
||||
cat /sys/devices/platform/acer-wmi/wireless
|
||||
|
||||
To enable the wireless radio:
|
||||
echo 1 > /sys/devices/platform/acer-wmi/wireless
|
||||
|
||||
To disable the wireless radio:
|
||||
echo 0 > /sys/devices/platform/acer-wmi/wireless
|
||||
|
||||
To set the state of the wireless radio when loading acer-wmi, pass:
|
||||
wireless=X (where X is 0 or 1)
|
||||
|
||||
Bluetooth
|
||||
*********
|
||||
|
||||
For bluetooth, this is an internal USB dongle, so once enabled, you will get
|
||||
a USB device connection event, and a new USB device appears. When you disable
|
||||
bluetooth, you get the reverse - a USB device disconnect event, followed by the
|
||||
device disappearing again.
|
||||
|
||||
Bluetooth is autodetected by acer-wmi, so if you do not have a bluetooth module
|
||||
installed in your laptop, this file won't exist (please be aware that it is
|
||||
quite common for Acer not to fit bluetooth to their laptops - so just because
|
||||
you have a bluetooth button on the laptop, doesn't mean that bluetooth is
|
||||
installed).
|
||||
|
||||
For the adventurously minded - if you want to buy an internal bluetooth
|
||||
module off the internet that is compatible with your laptop and fit it, then
|
||||
it will work just fine with acer-wmi.
|
||||
|
||||
To read the status of the bluetooth module (0=off, 1=on):
|
||||
cat /sys/devices/platform/acer-wmi/bluetooth
|
||||
|
||||
To enable the bluetooth module:
|
||||
echo 1 > /sys/devices/platform/acer-wmi/bluetooth
|
||||
|
||||
To disable the bluetooth module:
|
||||
echo 0 > /sys/devices/platform/acer-wmi/bluetooth
|
||||
|
||||
To set the state of the bluetooth module when loading acer-wmi, pass:
|
||||
bluetooth=X (where X is 0 or 1)
|
||||
|
||||
3G
|
||||
**
|
||||
|
||||
3G is currently not autodetected, so the 'threeg' file is always created under
|
||||
sysfs. So far, no-one in possession of an Acer laptop with 3G built-in appears to
|
||||
have tried Linux, or reported back, so we don't have any information on this.
|
||||
|
||||
If you have an Acer laptop that does have a 3G card in, please contact me so we
|
||||
can properly detect these, and find out a bit more about them.
|
||||
|
||||
To read the status of the 3G card (0=off, 1=on):
|
||||
cat /sys/devices/platform/acer-wmi/threeg
|
||||
|
||||
To enable the 3G card:
|
||||
echo 1 > /sys/devices/platform/acer-wmi/threeg
|
||||
|
||||
To disable the 3G card:
|
||||
echo 0 > /sys/devices/platform/acer-wmi/threeg
|
||||
|
||||
To set the state of the 3G card when loading acer-wmi, pass:
|
||||
threeg=X (where X is 0 or 1)
|
||||
|
||||
Mail LED
|
||||
********
|
||||
|
||||
This can be found in most older Acer laptops supported by acer-wmi, and many
|
||||
newer ones - it is built into the 'mail' button, and blinks when active.
|
||||
|
||||
On newer (WMID) laptops though, we have no way of detecting the mail LED. If
|
||||
your laptop identifies itself in dmesg as a WMID model, then please try loading
|
||||
acer-wmi with:
|
||||
|
||||
force_series=2490
|
||||
|
||||
This will use a known alternative method of reading/writing the mail LED. If
|
||||
it works, please report back to me with the DMI data from your laptop so this
|
||||
can be added to acer-wmi.
|
||||
|
||||
The LED is exposed through the LED subsystem, and can be found in:
|
||||
|
||||
/sys/devices/platform/acer-wmi/leds/acer-mail:green/
|
||||
|
||||
The mail LED is autodetected, so if you don't have one, the LED device won't
|
||||
be registered.
|
||||
|
||||
If you have a mail LED that is not green, please report this to me.
|
||||
|
||||
Backlight
|
||||
*********
|
||||
|
||||
The backlight brightness control is available on all acer-wmi supported
|
||||
hardware. The maximum brightness level is usually 15, but on some newer laptops
|
||||
it's 10 (this is again autodetected).
|
||||
|
||||
The backlight is exposed through the backlight subsystem, and can be found in:
|
||||
|
||||
/sys/devices/platform/acer-wmi/backlight/acer-wmi/
|
||||
|
||||
Credits
|
||||
*******
|
||||
|
||||
Olaf Tauber, who did the real hard work when he developed acerhk
|
||||
http://www.informatik.hu-berlin.de/~tauber/acerhk
|
||||
All the authors of laptop ACPI modules in the kernel, whose work
|
||||
was an inspiration in the early days of acer_acpi
|
||||
Mathieu Segaud, who solved the problem with having to modprobe the driver
|
||||
twice in acer_acpi 0.2.
|
||||
Jim Ramsay, who added support for the WMID interface
|
||||
Mark Smith, who started the original acer_acpi
|
||||
|
||||
And the many people who have used both acer_acpi and acer-wmi.
|
|
@ -114,4 +114,3 @@ Bugs/Limitations:
|
|||
sonypi driver (through /dev/sonypi) does not try to use the
|
||||
sony-laptop driver. In the future, spicctrl could try sonypi first,
|
||||
and if it isn't present, try sony-laptop instead.
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
ThinkPad ACPI Extras Driver
|
||||
|
||||
Version 0.17
|
||||
October 04th, 2007
|
||||
Version 0.19
|
||||
January 06th, 2008
|
||||
|
||||
Borislav Deianov <borislav@users.sf.net>
|
||||
Henrique de Moraes Holschuh <hmh@hmh.eng.br>
|
||||
|
@ -215,6 +215,11 @@ The following commands can be written to the /proc/acpi/ibm/hotkey file:
|
|||
... any other 8-hex-digit mask ...
|
||||
echo reset > /proc/acpi/ibm/hotkey -- restore the original mask
|
||||
|
||||
The procfs interface does not support NVRAM polling control. So as to
|
||||
maintain maximum bug-to-bug compatibility, it does not report any masks,
|
||||
nor does it allow one to manipulate the hot key mask when the firmware
|
||||
does not support masks at all, even if NVRAM polling is in use.
|
||||
|
||||
sysfs notes:
|
||||
|
||||
hotkey_bios_enabled:
|
||||
|
@ -231,17 +236,26 @@ sysfs notes:
|
|||
to this value.
|
||||
|
||||
hotkey_enable:
|
||||
Enables/disables the hot keys feature, and reports
|
||||
current status of the hot keys feature.
|
||||
Enables/disables the hot keys feature in the ACPI
|
||||
firmware, and reports current status of the hot keys
|
||||
feature. Has no effect on the NVRAM hot key polling
|
||||
functionality.
|
||||
|
||||
0: disables the hot keys feature / feature disabled
|
||||
1: enables the hot keys feature / feature enabled
|
||||
|
||||
hotkey_mask:
|
||||
bit mask to enable driver-handling and ACPI event
|
||||
generation for each hot key (see above). Returns the
|
||||
current status of the hot keys mask, and allows one to
|
||||
modify it.
|
||||
bit mask to enable driver-handling (and depending on
|
||||
the firmware, ACPI event generation) for each hot key
|
||||
(see above). Returns the current status of the hot keys
|
||||
mask, and allows one to modify it.
|
||||
|
||||
Note: when NVRAM polling is active, the firmware mask
|
||||
will be different from the value returned by
|
||||
hotkey_mask. The driver will retain enabled bits for
|
||||
hotkeys that are under NVRAM polling even if the
|
||||
firmware refuses them, and will not set these bits on
|
||||
the firmware hot key mask.
|
||||
|
||||
hotkey_all_mask:
|
||||
bit mask that should enable event reporting for all
|
||||
|
@ -257,12 +271,48 @@ sysfs notes:
|
|||
handled by the firmware anyway. Echo it to
|
||||
hotkey_mask above, to use.
|
||||
|
||||
hotkey_source_mask:
|
||||
bit mask that selects which hot keys will the driver
|
||||
poll the NVRAM for. This is auto-detected by the driver
|
||||
based on the capabilities reported by the ACPI firmware,
|
||||
but it can be overridden at runtime.
|
||||
|
||||
Hot keys whose bits are set in both hotkey_source_mask
|
||||
and also on hotkey_mask are polled for in NVRAM. Only a
|
||||
few hot keys are available through CMOS NVRAM polling.
|
||||
|
||||
Warning: when in NVRAM mode, the volume up/down/mute
|
||||
keys are synthesized according to changes in the mixer,
|
||||
so you have to use volume up or volume down to unmute,
|
||||
as per the ThinkPad volume mixer user interface. When
|
||||
in ACPI event mode, volume up/down/mute are reported as
|
||||
separate events, but this behaviour may be corrected in
|
||||
future releases of this driver, in which case the
|
||||
ThinkPad volume mixer user interface semantics will be
|
||||
enforced.
|
||||
|
||||
hotkey_poll_freq:
|
||||
frequency in Hz for hot key polling. It must be between
|
||||
0 and 25 Hz. Polling is only carried out when strictly
|
||||
needed.
|
||||
|
||||
Setting hotkey_poll_freq to zero disables polling, and
|
||||
will cause hot key presses that require NVRAM polling
|
||||
to never be reported.
|
||||
|
||||
Setting hotkey_poll_freq too low will cause repeated
|
||||
pressings of the same hot key to be misreported as a
|
||||
single key press, or to not even be detected at all.
|
||||
The recommended polling frequency is 10Hz.
|
||||
|
||||
hotkey_radio_sw:
|
||||
if the ThinkPad has a hardware radio switch, this
|
||||
attribute will read 0 if the switch is in the "radios
|
||||
disabled" position, and 1 if the switch is in the
|
||||
"radios enabled" position.
|
||||
|
||||
This attribute has poll()/select() support.
|
||||
|
||||
hotkey_report_mode:
|
||||
Returns the state of the procfs ACPI event report mode
|
||||
filter for hot keys. If it is set to 1 (the default),
|
||||
|
@ -277,6 +327,25 @@ sysfs notes:
|
|||
May return -EPERM (write access locked out by module
|
||||
parameter) or -EACCES (read-only).
|
||||
|
||||
wakeup_reason:
|
||||
Set to 1 if the system is waking up because the user
|
||||
requested a bay ejection. Set to 2 if the system is
|
||||
waking up because the user requested the system to
|
||||
undock. Set to zero for normal wake-ups or wake-ups
|
||||
due to unknown reasons.
|
||||
|
||||
This attribute has poll()/select() support.
|
||||
|
||||
wakeup_hotunplug_complete:
|
||||
Set to 1 if the system was woken up because of an
|
||||
undock or bay ejection request, and that request
|
||||
was successfully completed. At this point, it might
|
||||
be useful to send the system back to sleep, at the
|
||||
user's choice. Refer to HKEY events 0x4003 and
|
||||
0x3003, below.
|
||||
|
||||
This attribute has poll()/select() support.
|
||||
|
||||
input layer notes:
|
||||
|
||||
A Hot key is mapped to a single input layer EV_KEY event, possibly
|
||||
|
@ -427,6 +496,23 @@ Non hot-key ACPI HKEY event map:
|
|||
The above events are not propagated by the driver, except for legacy
|
||||
compatibility purposes when hotkey_report_mode is set to 1.
|
||||
|
||||
0x2304 System is waking up from suspend to undock
|
||||
0x2305 System is waking up from suspend to eject bay
|
||||
0x2404 System is waking up from hibernation to undock
|
||||
0x2405 System is waking up from hibernation to eject bay
|
||||
|
||||
The above events are never propagated by the driver.
|
||||
|
||||
0x3003 Bay ejection (see 0x2x05) complete, can sleep again
|
||||
0x4003 Undocked (see 0x2x04), can sleep again
|
||||
0x5009 Tablet swivel: switched to tablet mode
|
||||
0x500A Tablet swivel: switched to normal mode
|
||||
0x500B Tablet pen inserted into its storage bay
|
||||
0x500C Tablet pen removed from its storage bay
|
||||
0x5010 Brightness level changed (newer Lenovo BIOSes)
|
||||
|
||||
The above events are propagated by the driver.
|
||||
|
||||
Compatibility notes:
|
||||
|
||||
ibm-acpi and thinkpad-acpi 0.15 (mainline kernels before 2.6.23) never
|
||||
|
@ -1263,3 +1349,17 @@ Sysfs interface changelog:
|
|||
and the hwmon class for libsensors4 (lm-sensors 3)
|
||||
compatibility. Moved all hwmon attributes to this
|
||||
new platform device.
|
||||
|
||||
0x020100: Marker for thinkpad-acpi with hot key NVRAM polling
|
||||
support. If you must, use it to know you should not
|
||||
start a userspace NVRAM poller (allows to detect when
|
||||
NVRAM is compiled out by the user because it is
|
||||
unneeded/undesired in the first place).
|
||||
0x020101: Marker for thinkpad-acpi with hot key NVRAM polling
|
||||
and proper hotkey_mask semantics (version 8 of the
|
||||
NVRAM polling patch). Some development snapshots of
|
||||
0.18 had an earlier version that did strange things
|
||||
to hotkey_mask.
|
||||
|
||||
0x020200: Add poll()/select() support to the following attributes:
|
||||
hotkey_radio_sw, wakeup_hotunplug_complete, wakeup_reason
|
|
@ -39,12 +39,33 @@ LED Device Naming
|
|||
|
||||
Is currently of the form:
|
||||
|
||||
"devicename:colour"
|
||||
"devicename:colour:function"
|
||||
|
||||
There have been calls for LED properties such as colour to be exported as
|
||||
individual led class attributes. As a solution which doesn't incur as much
|
||||
overhead, I suggest these become part of the device name. The naming scheme
|
||||
above leaves scope for further attributes should they be needed.
|
||||
above leaves scope for further attributes should they be needed. If sections
|
||||
of the name don't apply, just leave that section blank.
|
||||
|
||||
|
||||
Hardware accelerated blink of LEDs
|
||||
==================================
|
||||
|
||||
Some LEDs can be programmed to blink without any CPU interaction. To
|
||||
support this feature, a LED driver can optionally implement the
|
||||
blink_set() function (see <linux/leds.h>). If implemented, triggers can
|
||||
attempt to use it before falling back to software timers. The blink_set()
|
||||
function should return 0 if the blink setting is supported, or -EINVAL
|
||||
otherwise, which means that LED blinking will be handled by software.
|
||||
|
||||
The blink_set() function should choose a user friendly blinking
|
||||
value if it is called with *delay_on==0 && *delay_off==0 parameters. In
|
||||
this case the driver should give back the chosen value through delay_on
|
||||
and delay_off parameters to the leds subsystem.
|
||||
|
||||
Any call to the brightness_set() callback function should cancel the
|
||||
previously programmed hardware blinking function so setting the brightness
|
||||
to 0 can also cancel the blinking of the LED.
|
||||
|
||||
|
||||
Known Issues
|
||||
|
@ -55,10 +76,6 @@ would cause nightmare dependency issues. I see this as a minor issue
|
|||
compared to the benefits the simple trigger functionality brings. The
|
||||
rest of the LED subsystem can be modular.
|
||||
|
||||
Some leds can be programmed to flash in hardware. As this isn't a generic
|
||||
LED device property, this should be exported as a device specific sysfs
|
||||
attribute rather than part of the class if this functionality is required.
|
||||
|
||||
|
||||
Future Development
|
||||
==================
|
||||
|
|
|
@ -416,6 +416,16 @@ also have
|
|||
sectors in total that could need to be processed. The two
|
||||
numbers are separated by a '/' thus effectively showing one
|
||||
value, a fraction of the process that is complete.
|
||||
A 'select' on this attribute will return when resync completes,
|
||||
when it reaches the current sync_max (below) and possibly at
|
||||
other times.
|
||||
|
||||
sync_max
|
||||
This is a number of sectors at which point a resync/recovery
|
||||
process will pause. When a resync is active, the value can
|
||||
only ever be increased, never decreased. The value of 'max'
|
||||
effectively disables the limit.
|
||||
|
||||
|
||||
sync_speed
|
||||
This shows the current actual speed, in K/sec, of the current
|
||||
|
|
|
@ -0,0 +1,149 @@
|
|||
=========================
|
||||
MN10300 FUNCTION CALL ABI
|
||||
=========================
|
||||
|
||||
=======
|
||||
GENERAL
|
||||
=======
|
||||
|
||||
The MN10300/AM33 kernel runs in little-endian mode; big-endian mode is not
|
||||
supported.
|
||||
|
||||
The stack grows downwards, and should always be 32-bit aligned. There are
|
||||
separate stack pointer registers for userspace and the kernel.
|
||||
|
||||
|
||||
================
|
||||
ARGUMENT PASSING
|
||||
================
|
||||
|
||||
The first two arguments (assuming up to 32-bits per argument) to a function are
|
||||
passed in the D0 and D1 registers respectively; all other arguments are passed
|
||||
on the stack.
|
||||
|
||||
If 64-bit arguments are being passed, then they are never split between
|
||||
registers and the stack. If the first argument is a 64-bit value, it will be
|
||||
passed in D0:D1. If the first argument is not a 64-bit value, but the second
|
||||
is, the second will be passed entirely on the stack and D1 will be unused.
|
||||
|
||||
Arguments smaller than 32-bits are not coalesced within a register or a stack
|
||||
word. For example, two byte-sized arguments will always be passed in separate
|
||||
registers or word-sized stack slots.
|
||||
|
||||
|
||||
=================
|
||||
CALLING FUNCTIONS
|
||||
=================
|
||||
|
||||
The caller must allocate twelve bytes on the stack for the callee's use before
|
||||
it inserts a CALL instruction. The CALL instruction will write into the TOS
|
||||
word, but won't actually modify the stack pointer; similarly, the RET
|
||||
instruction reads from the TOS word of the stack, but doesn't move the stack
|
||||
pointer beyond it.
|
||||
|
||||
|
||||
Stack:
|
||||
| |
|
||||
| |
|
||||
|---------------| SP+20
|
||||
| 4th Arg |
|
||||
|---------------| SP+16
|
||||
| 3rd Arg |
|
||||
|---------------| SP+12
|
||||
| D1 Save Slot |
|
||||
|---------------| SP+8
|
||||
| D0 Save Slot |
|
||||
|---------------| SP+4
|
||||
| Return Addr |
|
||||
|---------------| SP
|
||||
| |
|
||||
| |
|
||||
|
||||
|
||||
The caller must leave space on the stack (hence an allocation of twelve bytes)
|
||||
in which the callee may store the first two arguments.
|
||||
|
||||
|
||||
============
|
||||
RETURN VALUE
|
||||
============
|
||||
|
||||
The return value is passed in D0 for an integer (or D0:D1 for a 64-bit value),
|
||||
or A0 for a pointer.
|
||||
|
||||
If the return value is a value larger than 64-bits, or is a structure or an
|
||||
array, then a hidden first argument will be passed to the callee by the caller:
|
||||
this will point to a piece of memory large enough to hold the result of the
|
||||
function. In this case, the callee will return the value in that piece of
|
||||
memory, and no value will be returned in D0 or A0.
|
||||
|
||||
|
||||
===================
|
||||
REGISTER CLOBBERING
|
||||
===================
|
||||
|
||||
The values in certain registers may be clobbered by the callee, and other
|
||||
values must be saved:
|
||||
|
||||
Clobber: D0-D1, A0-A1, E0-E3
|
||||
Save: D2-D3, A2-A3, E4-E7, SP
|
||||
|
||||
All other non-supervisor-only registers are clobberable (such as MDR, MCRL,
|
||||
MCRH).
|
||||
|
||||
|
||||
=================
|
||||
SPECIAL REGISTERS
|
||||
=================
|
||||
|
||||
Certain ordinary registers may carry special usage for the compiler:
|
||||
|
||||
A3: Frame pointer
|
||||
E2: TLS pointer
|
||||
|
||||
|
||||
==========
|
||||
KERNEL ABI
|
||||
==========
|
||||
|
||||
The kernel may use a slightly different ABI internally.
|
||||
|
||||
(*) E2
|
||||
|
||||
If CONFIG_MN10300_CURRENT_IN_E2 is defined, then the current task pointer
|
||||
will be kept in the E2 register, and that register will be marked
|
||||
unavailable for the compiler to use as a scratch register.
|
||||
|
||||
Normally the kernel uses something like:
|
||||
|
||||
MOV SP,An
|
||||
AND 0xFFFFE000,An
|
||||
MOV (An),Rm // Rm holds current
|
||||
MOV (yyy,Rm) // Access current->yyy
|
||||
|
||||
To find the address of current; but since this option permits current to
|
||||
be carried globally in a register, it can use:
|
||||
|
||||
MOV (yyy,E2) // Access current->yyy
|
||||
|
||||
instead.
|
||||
|
||||
|
||||
===============
|
||||
SYSTEM CALL ABI
|
||||
===============
|
||||
|
||||
System calls are called with the following convention:
|
||||
|
||||
REGISTER ENTRY EXIT
|
||||
=============== ======================= =======================
|
||||
D0 Syscall number Return value
|
||||
A0 1st syscall argument Saved
|
||||
D1 2nd syscall argument Saved
|
||||
A3 3rd syscall argument Saved
|
||||
A2 4th syscall argument Saved
|
||||
D3 5th syscall argument Saved
|
||||
D2 6th syscall argument Saved
|
||||
|
||||
All other registers are saved. The layout is a consequence of the way the MOVM
|
||||
instruction stores registers onto the stack.
|
|
@ -0,0 +1,60 @@
|
|||
=========================================
|
||||
PART-SPECIFIC SOURCE COMPARTMENTALISATION
|
||||
=========================================
|
||||
|
||||
The sources for various parts are compartmentalised at two different levels:
|
||||
|
||||
(1) Processor level
|
||||
|
||||
The "processor level" is a CPU core plus the other on-silicon
|
||||
peripherals.
|
||||
|
||||
Processor-specific header files are divided among directories in a similar
|
||||
way to the CPU level:
|
||||
|
||||
(*) include/asm-mn10300/proc-mn103e010/
|
||||
|
||||
Support for the AM33v2 CPU core.
|
||||
|
||||
The appropriate processor is selected by a CONFIG_MN10300_PROC_YYYY option
|
||||
from the "Processor support" choice menu in the arch/mn10300/Kconfig file.
|
||||
|
||||
|
||||
(2) Unit level
|
||||
|
||||
The "unit level" is a processor plus all the external peripherals
|
||||
controlled by that processor.
|
||||
|
||||
Unit-specific header files are divided among directories in a similar way
|
||||
to the CPU level; not only that, but specific sources may also be
|
||||
segregated into separate directories under the arch directory:
|
||||
|
||||
(*) include/asm-mn10300/unit-asb2303/
|
||||
(*) arch/mn10300/unit-asb2303/
|
||||
|
||||
Support for the ASB2303 board with an ASB2308 daughter board.
|
||||
|
||||
(*) include/asm-mn10300/unit-asb2305/
|
||||
(*) arch/mn10300/unit-asb2305/
|
||||
|
||||
Support for the ASB2305 board.
|
||||
|
||||
The appropriate unit is selected by a CONFIG_MN10300_UNIT_ZZZZ option
|
||||
from the "Unit type" choice menu in the arch/mn10300/Kconfig file.
|
||||
|
||||
|
||||
============
|
||||
COMPILE TIME
|
||||
============
|
||||
|
||||
When the kernel is compiled, symbolic links will be made in the asm header file
|
||||
directory for this arch:
|
||||
|
||||
include/asm-mn10300/proc => include/asm-mn10300/proc-YYYY/
|
||||
include/asm-mn10300/unit => include/asm-mn10300/unit-ZZZZ/
|
||||
|
||||
So that the header files contained in those directories can be accessed without
|
||||
lots of #ifdef-age.
|
||||
|
||||
The appropriate arch/mn10300/unit-ZZZZ directory will also be entered by the
|
||||
compilation process; all other unit-specific directories will be ignored.
|
|
@ -33,8 +33,8 @@ This file details changes in 2.6 which affect PCMCIA card driver authors:
|
|||
and can be used (e.g. for SET_NETDEV_DEV) by using
|
||||
handle_to_dev(client_handle_t * handle).
|
||||
|
||||
* Convert internal I/O port addresses to unsigned long (as of 2.6.11)
|
||||
ioaddr_t should be replaced by kio_addr_t in PCMCIA card drivers.
|
||||
* Convert internal I/O port addresses to unsigned int (as of 2.6.11)
|
||||
ioaddr_t should be replaced by unsigned int in PCMCIA card drivers.
|
||||
|
||||
* irq_mask and irq_list parameters (as of 2.6.11)
|
||||
The irq_mask and irq_list parameters should no longer be used in
|
||||
|
|
|
@ -0,0 +1,59 @@
|
|||
PM quality of Service interface.
|
||||
|
||||
This interface provides a kernel and user mode interface for registering
|
||||
performance expectations by drivers, subsystems and user space applications on
|
||||
one of the parameters.
|
||||
|
||||
Currently we have {cpu_dma_latency, network_latency, network_throughput} as the
|
||||
initial set of pm_qos parameters.
|
||||
|
||||
The infrastructure exposes multiple misc device nodes one per implemented
|
||||
parameter. The set of parameters implemented is defined by pm_qos_power_init()
|
||||
and pm_qos_params.h. This is done because having the available parameters
|
||||
being runtime configurable or changeable from a driver was seen as too easy to
|
||||
abuse.
|
||||
|
||||
For each parameter a list of performance requirements is maintained along with
|
||||
an aggregated target value. The aggregated target value is updated with
|
||||
changes to the requirement list or elements of the list. Typically the
|
||||
aggregated target value is simply the max or min of the requirement values held
|
||||
in the parameter list elements.
|
||||
|
||||
From kernel mode the use of this interface is simple:
|
||||
pm_qos_add_requirement(param_id, name, target_value):
|
||||
Will insert a named element in the list for that identified PM_QOS parameter
|
||||
with the target value. Upon change to this list the new target is recomputed
|
||||
and any registered notifiers are called only if the target value is now
|
||||
different.
|
||||
|
||||
pm_qos_update_requirement(param_id, name, new_target_value):
|
||||
Will search the list identified by the param_id for the named list element and
|
||||
then update its target value, calling the notification tree if the aggregated
|
||||
target is changed.
|
||||
|
||||
pm_qos_remove_requirement(param_id, name):
|
||||
Will search the identified list for the named element and remove it, after
|
||||
removal it will update the aggregate target and call the notification tree if
|
||||
the target was changed as a result of removing the named requirement.
|
||||
|
||||
|
||||
From user mode:
|
||||
Only processes can register a pm_qos requirement. To provide for automatic
|
||||
cleanup of the process, the interface requires the process to register its
|
||||
parameter requirements in the following way:
|
||||
|
||||
To register the default pm_qos target for the specific parameter, the process
|
||||
must open one of /dev/[cpu_dma_latency, network_latency, network_throughput]
|
||||
|
||||
As long as the device node is held open that process has a registered
|
||||
requirement on the parameter. The name of the requirement is "process_<PID>"
|
||||
derived from the current->pid from within the open system call.
|
||||
|
||||
To change the requested target value the process needs to write a s32 value to
|
||||
the open device node. This translates to a pm_qos_update_requirement call.
|
||||
|
||||
To remove the user mode request for a target value simply close the device
|
||||
node.
|
||||
|
||||
|
||||
|
|
@ -386,6 +386,11 @@ before suspending; then remount them after resuming.
|
|||
There is a work-around for this problem. For more information, see
|
||||
Documentation/usb/persist.txt.
|
||||
|
||||
Q: Can I suspend-to-disk using a swap partition under LVM?
|
||||
|
||||
A: No. You can suspend successfully, but you'll not be able to
|
||||
resume. uswsusp should be able to work with LVM. See suspend.sf.net.
|
||||
|
||||
Q: I upgraded the kernel from 2.6.15 to 2.6.16. Both kernels were
|
||||
compiled with the similar configuration files. Anyway I found that
|
||||
suspend to disk (and resume) is much slower on 2.6.16 compared to
|
||||
|
|
|
@ -57,6 +57,7 @@ Table of Contents
|
|||
n) 4xx/Axon EMAC ethernet nodes
|
||||
o) Xilinx IP cores
|
||||
p) Freescale Synchronous Serial Interface
|
||||
q) USB EHCI controllers
|
||||
|
||||
VII - Specifying interrupt information for devices
|
||||
1) interrupts property
|
||||
|
@ -2577,6 +2578,20 @@ platforms are moved over to use the flattened-device-tree model.
|
|||
Required properties:
|
||||
- current-speed : Baud rate of uartlite
|
||||
|
||||
v) Xilinx hwicap
|
||||
|
||||
Xilinx hwicap devices provide access to the configuration logic
|
||||
of the FPGA through the Internal Configuration Access Port
|
||||
(ICAP). The ICAP enables partial reconfiguration of the FPGA,
|
||||
readback of the configuration information, and some control over
|
||||
'warm boots' of the FPGA fabric.
|
||||
|
||||
Required properties:
|
||||
- xlnx,family : The family of the FPGA, necessary since the
|
||||
capabilities of the underlying ICAP hardware
|
||||
differ between different families. May be
|
||||
'virtex2p', 'virtex4', or 'virtex5'.
|
||||
|
||||
p) Freescale Synchronous Serial Interface
|
||||
|
||||
The SSI is a serial device that communicates with audio codecs. It can
|
||||
|
@ -2775,6 +2790,33 @@ platforms are moved over to use the flattened-device-tree model.
|
|||
interrupt-parent = < &ipic >;
|
||||
};
|
||||
|
||||
q) USB EHCI controllers
|
||||
|
||||
Required properties:
|
||||
- compatible : should be "usb-ehci".
|
||||
- reg : should contain at least address and length of the standard EHCI
|
||||
register set for the device. Optional platform-dependent registers
|
||||
(debug-port or other) can be also specified here, but only after
|
||||
definition of standard EHCI registers.
|
||||
- interrupts : one EHCI interrupt should be described here.
|
||||
If device registers are implemented in big endian mode, the device
|
||||
node should have "big-endian-regs" property.
|
||||
If controller implementation operates with big endian descriptors,
|
||||
"big-endian-desc" property should be specified.
|
||||
If both big endian registers and descriptors are used by the controller
|
||||
implementation, "big-endian" property can be specified instead of having
|
||||
both "big-endian-regs" and "big-endian-desc".
|
||||
|
||||
Example (Sequoia 440EPx):
|
||||
ehci@e0000300 {
|
||||
compatible = "ibm,usb-ehci-440epx", "usb-ehci";
|
||||
interrupt-parent = <&UIC0>;
|
||||
interrupts = <1a 4>;
|
||||
reg = <0 e0000300 90 0 e0000390 70>;
|
||||
big-endian;
|
||||
};
|
||||
|
||||
|
||||
More devices will be defined as this spec matures.
|
||||
|
||||
VII - Specifying interrupt information for devices
|
||||
|
|
|
@ -182,8 +182,8 @@ driver returns ENOIOCTLCMD. Some common examples:
|
|||
since the frequency is stored in the irq_freq member of the rtc_device
|
||||
structure. Your driver needs to initialize the irq_freq member during
|
||||
init. Make sure you check the requested frequency is in range of your
|
||||
hardware in the irq_set_freq function. If you cannot actually change
|
||||
the frequency, just return -ENOTTY.
|
||||
hardware in the irq_set_freq function. If it isn't, return -EINVAL. If
|
||||
you cannot actually change the frequency, do not define irq_set_freq.
|
||||
|
||||
If all else fails, check out the rtc-test.c driver!
|
||||
|
||||
|
@ -268,8 +268,8 @@ int main(int argc, char **argv)
|
|||
/* This read will block */
|
||||
retval = read(fd, &data, sizeof(unsigned long));
|
||||
if (retval == -1) {
|
||||
perror("read");
|
||||
exit(errno);
|
||||
perror("read");
|
||||
exit(errno);
|
||||
}
|
||||
fprintf(stderr, " %d",i);
|
||||
fflush(stderr);
|
||||
|
@ -326,11 +326,11 @@ test_READ:
|
|||
rtc_tm.tm_sec %= 60;
|
||||
rtc_tm.tm_min++;
|
||||
}
|
||||
if (rtc_tm.tm_min == 60) {
|
||||
if (rtc_tm.tm_min == 60) {
|
||||
rtc_tm.tm_min = 0;
|
||||
rtc_tm.tm_hour++;
|
||||
}
|
||||
if (rtc_tm.tm_hour == 24)
|
||||
if (rtc_tm.tm_hour == 24)
|
||||
rtc_tm.tm_hour = 0;
|
||||
|
||||
retval = ioctl(fd, RTC_ALM_SET, &rtc_tm);
|
||||
|
@ -407,8 +407,8 @@ test_PIE:
|
|||
"\n...Periodic IRQ rate is fixed\n");
|
||||
goto done;
|
||||
}
|
||||
perror("RTC_IRQP_SET ioctl");
|
||||
exit(errno);
|
||||
perror("RTC_IRQP_SET ioctl");
|
||||
exit(errno);
|
||||
}
|
||||
|
||||
fprintf(stderr, "\n%ldHz:\t", tmp);
|
||||
|
@ -417,27 +417,27 @@ test_PIE:
|
|||
/* Enable periodic interrupts */
|
||||
retval = ioctl(fd, RTC_PIE_ON, 0);
|
||||
if (retval == -1) {
|
||||
perror("RTC_PIE_ON ioctl");
|
||||
exit(errno);
|
||||
perror("RTC_PIE_ON ioctl");
|
||||
exit(errno);
|
||||
}
|
||||
|
||||
for (i=1; i<21; i++) {
|
||||
/* This blocks */
|
||||
retval = read(fd, &data, sizeof(unsigned long));
|
||||
if (retval == -1) {
|
||||
perror("read");
|
||||
exit(errno);
|
||||
}
|
||||
fprintf(stderr, " %d",i);
|
||||
fflush(stderr);
|
||||
irqcount++;
|
||||
/* This blocks */
|
||||
retval = read(fd, &data, sizeof(unsigned long));
|
||||
if (retval == -1) {
|
||||
perror("read");
|
||||
exit(errno);
|
||||
}
|
||||
fprintf(stderr, " %d",i);
|
||||
fflush(stderr);
|
||||
irqcount++;
|
||||
}
|
||||
|
||||
/* Disable periodic interrupts */
|
||||
retval = ioctl(fd, RTC_PIE_OFF, 0);
|
||||
if (retval == -1) {
|
||||
perror("RTC_PIE_OFF ioctl");
|
||||
exit(errno);
|
||||
perror("RTC_PIE_OFF ioctl");
|
||||
exit(errno);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,59 @@
|
|||
|
||||
|
||||
Real-Time group scheduling.
|
||||
|
||||
The problem space:
|
||||
|
||||
In order to schedule multiple groups of realtime tasks each group must
|
||||
be assigned a fixed portion of the CPU time available. Without a minimum
|
||||
guarantee a realtime group can obviously fall short. A fuzzy upper limit
|
||||
is of no use since it cannot be relied upon. Which leaves us with just
|
||||
the single fixed portion.
|
||||
|
||||
CPU time is divided by means of specifying how much time can be spent
|
||||
running in a given period. Say a frame fixed realtime renderer must
|
||||
deliver 25 frames a second, which yields a period of 0.04s. Now say
|
||||
it will also have to play some music and respond to input, leaving it
|
||||
with around 80% for the graphics. We can then give this group a runtime
|
||||
of 0.8 * 0.04s = 0.032s.
|
||||
|
||||
This way the graphics group will have a 0.04s period with a 0.032s runtime
|
||||
limit.
|
||||
|
||||
Now if the audio thread needs to refill the DMA buffer every 0.005s, but
|
||||
needs only about 3% CPU time to do so, it can do with a 0.03 * 0.005s
|
||||
= 0.00015s.
|
||||
|
||||
|
||||
The Interface:
|
||||
|
||||
system wide:
|
||||
|
||||
/proc/sys/kernel/sched_rt_period_us
|
||||
/proc/sys/kernel/sched_rt_runtime_us
|
||||
|
||||
CONFIG_FAIR_USER_SCHED
|
||||
|
||||
/sys/kernel/uids/<uid>/cpu_rt_runtime_us
|
||||
|
||||
or
|
||||
|
||||
CONFIG_FAIR_CGROUP_SCHED
|
||||
|
||||
/cgroup/<cgroup>/cpu.rt_runtime_us
|
||||
|
||||
[ time is specified in us because the interface is s32; this gives an
|
||||
operating range of ~35m to 1us ]
|
||||
|
||||
The period takes values in [ 1, INT_MAX ], runtime in [ -1, INT_MAX - 1 ].
|
||||
|
||||
A runtime of -1 specifies runtime == period, ie. no limit.
|
||||
|
||||
New groups get the period from /proc/sys/kernel/sched_rt_period_us and
|
||||
a runtime of 0.
|
||||
|
||||
Settings are constrained to:
|
||||
|
||||
\Sum_{i} runtime_{i} / global_period <= global_runtime / global_period
|
||||
|
||||
in order to keep the configuration schedulable.
|
|
@ -0,0 +1,16 @@
|
|||
00-INDEX
|
||||
- this file.
|
||||
sched-arch.txt
|
||||
- CPU Scheduler implementation hints for architecture specific code.
|
||||
sched-coding.txt
|
||||
- reference for various scheduler-related methods in the O(1) scheduler.
|
||||
sched-design.txt
|
||||
- goals, design and implementation of the Linux O(1) scheduler.
|
||||
sched-design-CFS.txt
|
||||
- goals, design and implementation of the Completely Fair Scheduler.
|
||||
sched-domains.txt
|
||||
- information on scheduling domains.
|
||||
sched-nice-design.txt
|
||||
- How and why the scheduler's nice levels are implemented.
|
||||
sched-stats.txt
|
||||
- information on schedstats (Linux Scheduler Statistics).
|
|
@ -68,4 +68,45 @@
|
|||
** 2. modify the arcmsr_pci_slot_reset function
|
||||
** 3. modify the arcmsr_pci_ers_disconnect_forepart function
|
||||
** 4. modify the arcmsr_pci_ers_need_reset_forepart function
|
||||
** 1.20.00.15 09/27/2007 Erich Chen & Nick Cheng
|
||||
** 1. add arcmsr_enable_eoi_mode() on adapter Type B
|
||||
** 2. add readl(reg->iop2drv_doorbell_reg) in arcmsr_handle_hbb_isr()
|
||||
** in case of the doorbell interrupt clearance is cached
|
||||
** 1.20.00.15 10/01/2007 Erich Chen & Nick Cheng
|
||||
** 1. modify acb->devstate[i][j]
|
||||
** as ARECA_RAID_GOOD instead of
|
||||
** ARECA_RAID_GONE in arcmsr_alloc_ccb_pool
|
||||
** 1.20.00.15 11/06/2007 Erich Chen & Nick Cheng
|
||||
** 1. add conditional declaration for
|
||||
** arcmsr_pci_error_detected() and
|
||||
** arcmsr_pci_slot_reset
|
||||
** 1.20.00.15 11/23/2007 Erich Chen & Nick Cheng
|
||||
** 1.check if the sg list member number
|
||||
** exceeds arcmsr default limit in arcmsr_build_ccb()
|
||||
** 2.change the returned value type of arcmsr_build_ccb()
|
||||
** from "void" to "int"
|
||||
** 3.add the conditional check if arcmsr_build_ccb()
|
||||
** returns FAILED
|
||||
** 1.20.00.15 12/04/2007 Erich Chen & Nick Cheng
|
||||
** 1. modify arcmsr_drain_donequeue() to ignore unknown
|
||||
** command and let kernel process command timeout.
|
||||
** This could handle IO request violating max. segments
|
||||
** while Linux XFS over DM-CRYPT.
|
||||
** Thanks to Milan Broz's comments <mbroz@redhat.com>
|
||||
** 1.20.00.15 12/24/2007 Erich Chen & Nick Cheng
|
||||
** 1.fix the portability problems
|
||||
** 2.fix type B where we should _not_ iounmap() acb->pmu;
|
||||
** it's not ioremapped.
|
||||
** 3.add return -ENOMEM if ioremap() fails
|
||||
** 4.transfer IS_SG64_ADDR w/ cpu_to_le32()
|
||||
** in arcmsr_build_ccb
|
||||
** 5. modify acb->devstate[i][j] as ARECA_RAID_GONE instead of
|
||||
** ARECA_RAID_GOOD in arcmsr_alloc_ccb_pool()
|
||||
** 6.fix arcmsr_cdb->Context as (unsigned long)arcmsr_cdb
|
||||
** 7.add the checking state of
|
||||
** (outbound_intstatus & ARCMSR_MU_OUTBOUND_HANDLE_INT) == 0
|
||||
** in arcmsr_handle_hba_isr
|
||||
** 8.replace pci_alloc_consistent()/pci_free_consistent() with kmalloc()/kfree() in arcmsr_iop_message_xfer()
|
||||
** 9. fix the release of dma memory for type B in arcmsr_free_ccb_pool()
|
||||
** 10.fix the arcmsr_polling_hbb_ccbdone()
|
||||
**************************************************************************
|
||||
|
|
|
@ -1407,7 +1407,7 @@ Credits
|
|||
=======
|
||||
The following people have contributed to this document:
|
||||
Mike Anderson <andmike at us dot ibm dot com>
|
||||
James Bottomley <James dot Bottomley at steeleye dot com>
|
||||
James Bottomley <James dot Bottomley at hansenpartnership dot com>
|
||||
Patrick Mansfield <patmans at us dot ibm dot com>
|
||||
Christoph Hellwig <hch at infradead dot org>
|
||||
Doug Ledford <dledford at redhat dot com>
|
||||
|
|
|
@ -23,6 +23,7 @@ Currently, these files are in /proc/sys/fs:
|
|||
- inode-max
|
||||
- inode-nr
|
||||
- inode-state
|
||||
- nr_open
|
||||
- overflowuid
|
||||
- overflowgid
|
||||
- suid_dumpable
|
||||
|
@ -91,6 +92,15 @@ usage of file handles and you don't need to increase the maximum.
|
|||
|
||||
==============================================================
|
||||
|
||||
nr_open:
|
||||
|
||||
This denotes the maximum number of file-handles a process can
|
||||
allocate. Default value is 1024*1024 (1048576) which should be
|
||||
enough for most machines. Actual limit depends on RLIMIT_NOFILE
|
||||
resource limit.
|
||||
|
||||
==============================================================
|
||||
|
||||
inode-max, inode-nr & inode-state:
|
||||
|
||||
As with file handles, the kernel allocates the inode structures
|
||||
|
|
|
@ -29,7 +29,7 @@ show up in /proc/sys/kernel:
|
|||
- java-interpreter [ binfmt_java, obsolete ]
|
||||
- kstack_depth_to_print [ X86 only ]
|
||||
- l2cr [ PPC only ]
|
||||
- modprobe ==> Documentation/kmod.txt
|
||||
- modprobe ==> Documentation/debugging-modules.txt
|
||||
- msgmax
|
||||
- msgmnb
|
||||
- msgmni
|
||||
|
@ -41,6 +41,7 @@ show up in /proc/sys/kernel:
|
|||
- pid_max
|
||||
- powersave-nap [ PPC only ]
|
||||
- printk
|
||||
- randomize_va_space
|
||||
- real-root-dev ==> Documentation/initrd.txt
|
||||
- reboot-cmd [ SPARC only ]
|
||||
- rtsig-max
|
||||
|
@ -280,6 +281,34 @@ send before ratelimiting kicks in.
|
|||
|
||||
==============================================================
|
||||
|
||||
randomize_va_space:
|
||||
|
||||
This option can be used to select the type of process address
|
||||
space randomization that is used in the system, for architectures
|
||||
that support this feature.
|
||||
|
||||
0 - Turn the process address space randomization off by default.
|
||||
|
||||
1 - Make the addresses of mmap base, stack and VDSO page randomized.
|
||||
This, among other things, implies that shared libraries will be
|
||||
loaded to random addresses. Also for PIE-linked binaries, the location
|
||||
of code start is randomized.
|
||||
|
||||
With heap randomization, the situation is a little bit more
|
||||
complicated.
|
||||
There are a few legacy applications out there (such as some ancient
|
||||
versions of libc.so.5 from 1996) that assume that brk area starts
|
||||
just after the end of the code+bss. These applications break when
|
||||
start of the brk area is randomized. There are however no known
|
||||
non-legacy applications that would be broken this way, so for most
|
||||
systems it is safe to choose full randomization. However there is
|
||||
a CONFIG_COMPAT_BRK option for systems with ancient and/or broken
|
||||
binaries, that makes heap non-randomized, but keeps all other
|
||||
parts of process address space randomized if randomize_va_space
|
||||
sysctl is turned on.
|
||||
|
||||
==============================================================
|
||||
|
||||
reboot-cmd: (Sparc only)
|
||||
|
||||
??? This seems to be a way to give an argument to the Sparc
|
||||
|
|
|
@ -22,6 +22,7 @@ Currently, these files are in /proc/sys/vm:
|
|||
- dirty_background_ratio
|
||||
- dirty_expire_centisecs
|
||||
- dirty_writeback_centisecs
|
||||
- highmem_is_dirtyable (only if CONFIG_HIGHMEM set)
|
||||
- max_map_count
|
||||
- min_free_kbytes
|
||||
- laptop_mode
|
||||
|
@ -31,6 +32,7 @@ Currently, these files are in /proc/sys/vm:
|
|||
- min_unmapped_ratio
|
||||
- min_slab_ratio
|
||||
- panic_on_oom
|
||||
- oom_dump_tasks
|
||||
- oom_kill_allocating_task
|
||||
- mmap_min_address
|
||||
- numa_zonelist_order
|
||||
|
@ -40,9 +42,9 @@ Currently, these files are in /proc/sys/vm:
|
|||
==============================================================
|
||||
|
||||
dirty_ratio, dirty_background_ratio, dirty_expire_centisecs,
|
||||
dirty_writeback_centisecs, vfs_cache_pressure, laptop_mode,
|
||||
block_dump, swap_token_timeout, drop-caches,
|
||||
hugepages_treat_as_movable:
|
||||
dirty_writeback_centisecs, highmem_is_dirtyable,
|
||||
vfs_cache_pressure, laptop_mode, block_dump, swap_token_timeout,
|
||||
drop-caches, hugepages_treat_as_movable:
|
||||
|
||||
See Documentation/filesystems/proc.txt
|
||||
|
||||
|
@ -231,6 +233,27 @@ according to your policy of failover.
|
|||
|
||||
=============================================================
|
||||
|
||||
oom_dump_tasks
|
||||
|
||||
Enables a system-wide task dump (excluding kernel threads) to be
|
||||
produced when the kernel performs an OOM-killing and includes such
|
||||
information as pid, uid, tgid, vm size, rss, cpu, oom_adj score, and
|
||||
name. This is helpful to determine why the OOM killer was invoked
|
||||
and to identify the rogue task that caused it.
|
||||
|
||||
If this is set to zero, this information is suppressed. On very
|
||||
large systems with thousands of tasks it may not be feasible to dump
|
||||
the memory state information for each one. Such systems should not
|
||||
be forced to incur a performance penalty in OOM conditions when the
|
||||
information may not be desired.
|
||||
|
||||
If this is set to non-zero, this information is shown whenever the
|
||||
OOM killer actually kills a memory-hogging task.
|
||||
|
||||
The default value is 0.
|
||||
|
||||
=============================================================
|
||||
|
||||
oom_kill_allocating_task
|
||||
|
||||
This enables or disables killing the OOM-triggering task in
|
||||
|
|
|
@ -0,0 +1,245 @@
|
|||
Generic Thermal Sysfs driver How To
|
||||
=========================
|
||||
|
||||
Written by Sujith Thomas <sujith.thomas@intel.com>, Zhang Rui <rui.zhang@intel.com>
|
||||
|
||||
Updated: 2 January 2008
|
||||
|
||||
Copyright (c) 2008 Intel Corporation
|
||||
|
||||
|
||||
0. Introduction
|
||||
|
||||
The generic thermal sysfs provides a set of interfaces for thermal zone devices (sensors)
|
||||
and thermal cooling devices (fan, processor...) to register with the thermal management
|
||||
solution and to be a part of it.
|
||||
|
||||
This how-to focuses on enabling new thermal zone and cooling devices to participate
|
||||
in thermal management.
|
||||
This solution is platform independent and any type of thermal zone devices and
|
||||
cooling devices should be able to make use of the infrastructure.
|
||||
|
||||
The main task of the thermal sysfs driver is to expose thermal zone attributes as well
|
||||
as cooling device attributes to the user space.
|
||||
An intelligent thermal management application can make decisions based on inputs
|
||||
from thermal zone attributes (the current temperature and trip point temperature)
|
||||
and throttle appropriate devices.
|
||||
|
||||
[0-*] denotes any positive number starting from 0
|
||||
[1-*] denotes any positive number starting from 1
|
||||
|
||||
1. thermal sysfs driver interface functions
|
||||
|
||||
1.1 thermal zone device interface
|
||||
1.1.1 struct thermal_zone_device *thermal_zone_device_register(char *name, int trips,
|
||||
void *devdata, struct thermal_zone_device_ops *ops)
|
||||
|
||||
This interface function adds a new thermal zone device (sensor) to
|
||||
/sys/class/thermal folder as thermal_zone[0-*].
|
||||
It tries to bind all the thermal cooling devices registered at the same time.
|
||||
|
||||
name: the thermal zone name.
|
||||
trips: the total number of trip points this thermal zone supports.
|
||||
devdata: device private data
|
||||
ops: thermal zone device call-backs.
|
||||
.bind: bind the thermal zone device with a thermal cooling device.
|
||||
.unbind: unbind the thermal zone device with a thermal cooling device.
|
||||
.get_temp: get the current temperature of the thermal zone.
|
||||
.get_mode: get the current mode (user/kernel) of the thermal zone.
|
||||
"kernel" means thermal management is done in kernel.
|
||||
"user" will prevent kernel thermal driver actions upon trip points
|
||||
so that user applications can take charge of thermal management.
|
||||
.set_mode: set the mode (user/kernel) of the thermal zone.
|
||||
.get_trip_type: get the type of certain trip point.
|
||||
.get_trip_temp: get the temperature above which the certain trip point
|
||||
will be fired.
|
||||
|
||||
1.1.2 void thermal_zone_device_unregister(struct thermal_zone_device *tz)
|
||||
|
||||
This interface function removes the thermal zone device.
|
||||
It deletes the corresponding entry from /sys/class/thermal folder and unbinds all
|
||||
the thermal cooling devices it uses.
|
||||
|
||||
1.2 thermal cooling device interface
|
||||
1.2.1 struct thermal_cooling_device *thermal_cooling_device_register(char *name,
|
||||
void *devdata, struct thermal_cooling_device_ops *)
|
||||
|
||||
This interface function adds a new thermal cooling device (fan/processor/...) to
|
||||
/sys/class/thermal/ folder as cooling_device[0-*].
|
||||
It tries to bind itself to all the thermal zone devices registered at the same time.
|
||||
name: the cooling device name.
|
||||
devdata: device private data.
|
||||
ops: thermal cooling devices call-backs.
|
||||
.get_max_state: get the Maximum throttle state of the cooling device.
|
||||
.get_cur_state: get the Current throttle state of the cooling device.
|
||||
.set_cur_state: set the Current throttle state of the cooling device.
|
||||
|
||||
1.2.2 void thermal_cooling_device_unregister(struct thermal_cooling_device *cdev)
|
||||
|
||||
This interface function removes the thermal cooling device.
|
||||
It deletes the corresponding entry from /sys/class/thermal folder and unbinds
|
||||
itself from all the thermal zone devices using it.
|
||||
|
||||
1.3 interface for binding a thermal zone device with a thermal cooling device
|
||||
1.3.1 int thermal_zone_bind_cooling_device(struct thermal_zone_device *tz,
|
||||
int trip, struct thermal_cooling_device *cdev);
|
||||
|
||||
This interface function binds a thermal cooling device to a certain trip point
|
||||
of a thermal zone device.
|
||||
This function is usually called in the thermal zone device .bind callback.
|
||||
tz: the thermal zone device
|
||||
cdev: thermal cooling device
|
||||
trip: indicates which trip point the cooling device is associated with
|
||||
in this thermal zone.
|
||||
|
||||
1.3.2 int thermal_zone_unbind_cooling_device(struct thermal_zone_device *tz,
|
||||
int trip, struct thermal_cooling_device *cdev);
|
||||
|
||||
This interface function unbinds a thermal cooling device from a certain trip point
|
||||
of a thermal zone device.
|
||||
This function is usually called in the thermal zone device .unbind callback.
|
||||
tz: the thermal zone device
|
||||
cdev: thermal cooling device
|
||||
trip: indicates which trip point the cooling device is associated with
|
||||
in this thermal zone.
|
||||
|
||||
2. sysfs attributes structure
|
||||
|
||||
RO read only value
|
||||
RW read/write value
|
||||
|
||||
All thermal sysfs attributes will be represented under /sys/class/thermal
|
||||
|
||||
Thermal zone device sys I/F, created once it's registered:
|
||||
|thermal_zone[0-*]:
|
||||
|-----type: Type of the thermal zone
|
||||
|-----temp: Current temperature
|
||||
|-----mode: Working mode of the thermal zone
|
||||
|-----trip_point_[0-*]_temp: Trip point temperature
|
||||
|-----trip_point_[0-*]_type: Trip point type
|
||||
|
||||
Thermal cooling device sys I/F, created once it's registered:
|
||||
|cooling_device[0-*]:
|
||||
|-----type : Type of the cooling device(processor/fan/...)
|
||||
|-----max_state: Maximum cooling state of the cooling device
|
||||
|-----cur_state: Current cooling state of the cooling device
|
||||
|
||||
|
||||
These two dynamic attributes are created/removed in pairs.
|
||||
They represent the relationship between a thermal zone and its associated cooling device.
|
||||
They are created/removed for each
|
||||
thermal_zone_bind_cooling_device/thermal_zone_unbind_cooling_device successful execution.
|
||||
|
||||
|thermal_zone[0-*]
|
||||
|-----cdev[0-*]: The [0-*]th cooling device in the current thermal zone
|
||||
|-----cdev[0-*]_trip_point: Trip point that cdev[0-*] is associated with
|
||||
|
||||
|
||||
***************************
|
||||
* Thermal zone attributes *
|
||||
***************************
|
||||
|
||||
type Strings which represent the thermal zone type.
|
||||
This is given by thermal zone driver as part of registration.
|
||||
Eg: "ACPI thermal zone" indicates it's an ACPI thermal device
|
||||
RO
|
||||
Optional
|
||||
|
||||
temp Current temperature as reported by thermal zone (sensor)
|
||||
Unit: degree Celsius
|
||||
RO
|
||||
Required
|
||||
|
||||
mode One of the predefined values in [kernel, user]
|
||||
This file gives information about the algorithm
|
||||
that is currently managing the thermal zone.
|
||||
It can be either default kernel based algorithm
|
||||
or user space application.
|
||||
RW
|
||||
Optional
|
||||
kernel = Thermal management in kernel thermal zone driver.
|
||||
user = Preventing kernel thermal zone driver actions upon
|
||||
trip points so that user application can take full
|
||||
charge of the thermal management.
|
||||
|
||||
trip_point_[0-*]_temp The temperature above which trip point will be fired
|
||||
Unit: degree Celsius
|
||||
RO
|
||||
Optional
|
||||
|
||||
trip_point_[0-*]_type Strings which indicate the type of the trip point
|
||||
E.g. it can be one of critical, hot, passive,
|
||||
active[0-*] for ACPI thermal zone.
|
||||
RO
|
||||
Optional
|
||||
|
||||
cdev[0-*] Sysfs link to the thermal cooling device node where the sys I/F
|
||||
for cooling device throttling control represents.
|
||||
RO
|
||||
Optional
|
||||
|
||||
cdev[0-*]_trip_point The trip point with which cdev[0-*] is associated in this thermal zone
|
||||
-1 means the cooling device is not associated with any trip point.
|
||||
RO
|
||||
Optional
|
||||
|
||||
******************************
|
||||
* Cooling device attributes *
|
||||
******************************
|
||||
|
||||
type String which represents the type of device
|
||||
eg: For generic ACPI: this should be "Fan",
|
||||
"Processor" or "LCD"
|
||||
eg. For memory controller device on intel_menlow platform:
|
||||
this should be "Memory controller"
|
||||
RO
|
||||
Optional
|
||||
|
||||
max_state The maximum permissible cooling state of this cooling device.
|
||||
RO
|
||||
Required
|
||||
|
||||
cur_state The current cooling state of this cooling device.
|
||||
The value can be any integer between 0 and max_state,
|
||||
cur_state == 0 means no cooling
|
||||
cur_state == max_state means the maximum cooling.
|
||||
RW
|
||||
Required
|
||||
|
||||
3. A simple implementation
|
||||
|
||||
ACPI thermal zone may support multiple trip points like critical/hot/passive/active.
|
||||
If an ACPI thermal zone supports critical, passive, active[0] and active[1] at the same time,
|
||||
it may register itself as a thermal_zone_device (thermal_zone1) with 4 trip points in all.
|
||||
It has one processor and one fan, which are both registered as thermal_cooling_device.
|
||||
If the processor is listed in _PSL method, and the fan is listed in _AL0 method,
|
||||
the sys I/F structure will be built like this:
|
||||
|
||||
/sys/class/thermal:
|
||||
|
||||
|thermal_zone1:
|
||||
|-----type: ACPI thermal zone
|
||||
|-----temp: 37
|
||||
|-----mode: kernel
|
||||
|-----trip_point_0_temp: 100
|
||||
|-----trip_point_0_type: critical
|
||||
|-----trip_point_1_temp: 80
|
||||
|-----trip_point_1_type: passive
|
||||
|-----trip_point_2_temp: 70
|
||||
|-----trip_point_2_type: active[0]
|
||||
|-----trip_point_3_temp: 60
|
||||
|-----trip_point_3_type: active[1]
|
||||
|-----cdev0: --->/sys/class/thermal/cooling_device0
|
||||
|-----cdev0_trip_point: 1 /* cdev0 can be used for passive */
|
||||
|-----cdev1: --->/sys/class/thermal/cooling_device3
|
||||
|-----cdev1_trip_point: 2 /* cdev1 can be used for active[0]*/
|
||||
|
||||
|cooling_device0:
|
||||
|-----type: Processor
|
||||
|-----max_state: 8
|
||||
|-----cur_state: 0
|
||||
|
||||
|cooling_device3:
|
||||
|-----type: Fan
|
||||
|-----max_state: 2
|
||||
|-----cur_state: 0
|
|
@ -0,0 +1,226 @@
|
|||
UNALIGNED MEMORY ACCESSES
|
||||
=========================
|
||||
|
||||
Linux runs on a wide variety of architectures which have varying behaviour
|
||||
when it comes to memory access. This document presents some details about
|
||||
unaligned accesses, why you need to write code that doesn't cause them,
|
||||
and how to write such code!
|
||||
|
||||
|
||||
The definition of an unaligned access
|
||||
=====================================
|
||||
|
||||
Unaligned memory accesses occur when you try to read N bytes of data starting
|
||||
from an address that is not evenly divisible by N (i.e. addr % N != 0).
|
||||
For example, reading 4 bytes of data from address 0x10004 is fine, but
|
||||
reading 4 bytes of data from address 0x10005 would be an unaligned memory
|
||||
access.
|
||||
|
||||
The above may seem a little vague, as memory access can happen in different
|
||||
ways. The context here is at the machine code level: certain instructions read
|
||||
or write a number of bytes to or from memory (e.g. movb, movw, movl in x86
|
||||
assembly). As will become clear, it is relatively easy to spot C statements
|
||||
which will compile to multiple-byte memory access instructions, namely when
|
||||
dealing with types such as u16, u32 and u64.
|
||||
|
||||
|
||||
Natural alignment
|
||||
=================
|
||||
|
||||
The rule mentioned above forms what we refer to as natural alignment:
|
||||
When accessing N bytes of memory, the base memory address must be evenly
|
||||
divisible by N, i.e. addr % N == 0.
|
||||
|
||||
When writing code, assume the target architecture has natural alignment
|
||||
requirements.
|
||||
|
||||
In reality, only a few architectures require natural alignment on all sizes
|
||||
of memory access. However, we must consider ALL supported architectures;
|
||||
writing code that satisfies natural alignment requirements is the easiest way
|
||||
to achieve full portability.
|
||||
|
||||
|
||||
Why unaligned access is bad
|
||||
===========================
|
||||
|
||||
The effects of performing an unaligned memory access vary from architecture
|
||||
to architecture. It would be easy to write a whole document on the differences
|
||||
here; a summary of the common scenarios is presented below:
|
||||
|
||||
- Some architectures are able to perform unaligned memory accesses
|
||||
transparently, but there is usually a significant performance cost.
|
||||
- Some architectures raise processor exceptions when unaligned accesses
|
||||
happen. The exception handler is able to correct the unaligned access,
|
||||
at significant cost to performance.
|
||||
- Some architectures raise processor exceptions when unaligned accesses
|
||||
happen, but the exceptions do not contain enough information for the
|
||||
unaligned access to be corrected.
|
||||
- Some architectures are not capable of unaligned memory access, but will
|
||||
silently perform a different memory access to the one that was requested,
|
||||
resulting in a subtle code bug that is hard to detect!
|
||||
|
||||
It should be obvious from the above that if your code causes unaligned
|
||||
memory accesses to happen, your code will not work correctly on certain
|
||||
platforms and will cause performance problems on others.
|
||||
|
||||
|
||||
Code that does not cause unaligned access
|
||||
=========================================
|
||||
|
||||
At first, the concepts above may seem a little hard to relate to actual
|
||||
coding practice. After all, you don't have a great deal of control over
|
||||
memory addresses of certain variables, etc.
|
||||
|
||||
Fortunately things are not too complex, as in most cases, the compiler
|
||||
ensures that things will work for you. For example, take the following
|
||||
structure:
|
||||
|
||||
struct foo {
|
||||
u16 field1;
|
||||
u32 field2;
|
||||
u8 field3;
|
||||
};
|
||||
|
||||
Let us assume that an instance of the above structure resides in memory
|
||||
starting at address 0x10000. With a basic level of understanding, it would
|
||||
not be unreasonable to expect that accessing field2 would cause an unaligned
|
||||
access. You'd be expecting field2 to be located at offset 2 bytes into the
|
||||
structure, i.e. address 0x10002, but that address is not evenly divisible
|
||||
by 4 (remember, we're reading a 4 byte value here).
|
||||
|
||||
Fortunately, the compiler understands the alignment constraints, so in the
|
||||
above case it would insert 2 bytes of padding in between field1 and field2.
|
||||
Therefore, for standard structure types you can always rely on the compiler
|
||||
to pad structures so that accesses to fields are suitably aligned (assuming
|
||||
you do not cast the field to a type of different length).
|
||||
|
||||
Similarly, you can also rely on the compiler to align variables and function
|
||||
parameters to a naturally aligned scheme, based on the size of the type of
|
||||
the variable.
|
||||
|
||||
At this point, it should be clear that accessing a single byte (u8 or char)
|
||||
will never cause an unaligned access, because all memory addresses are evenly
|
||||
divisible by one.
|
||||
|
||||
On a related topic, with the above considerations in mind you may observe
|
||||
that you could reorder the fields in the structure in order to place fields
|
||||
where padding would otherwise be inserted, and hence reduce the overall
|
||||
resident memory size of structure instances. The optimal layout of the
|
||||
above example is:
|
||||
|
||||
struct foo {
|
||||
u32 field2;
|
||||
u16 field1;
|
||||
u8 field3;
|
||||
};
|
||||
|
||||
For a natural alignment scheme, the compiler would only have to add a single
|
||||
byte of padding at the end of the structure. This padding is added in order
|
||||
to satisfy alignment constraints for arrays of these structures.
|
||||
|
||||
Another point worth mentioning is the use of __attribute__((packed)) on a
|
||||
structure type. This GCC-specific attribute tells the compiler never to
|
||||
insert any padding within structures, useful when you want to use a C struct
|
||||
to represent some data that comes in a fixed arrangement 'off the wire'.
|
||||
|
||||
You might be inclined to believe that usage of this attribute can easily
|
||||
lead to unaligned accesses when accessing fields that do not satisfy
|
||||
architectural alignment requirements. However, again, the compiler is aware
|
||||
of the alignment constraints and will generate extra instructions to perform
|
||||
the memory access in a way that does not cause unaligned access. Of course,
|
||||
the extra instructions obviously cause a loss in performance compared to the
|
||||
non-packed case, so the packed attribute should only be used when avoiding
|
||||
structure padding is of importance.
|
||||
|
||||
|
||||
Code that causes unaligned access
|
||||
=================================
|
||||
|
||||
With the above in mind, let's move onto a real life example of a function
|
||||
that can cause an unaligned memory access. The following function adapted
|
||||
from include/linux/etherdevice.h is an optimized routine to compare two
|
||||
ethernet MAC addresses for equality.
|
||||
|
||||
unsigned int compare_ether_addr(const u8 *addr1, const u8 *addr2)
|
||||
{
|
||||
const u16 *a = (const u16 *) addr1;
|
||||
const u16 *b = (const u16 *) addr2;
|
||||
return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | (a[2] ^ b[2])) != 0;
|
||||
}
|
||||
|
||||
In the above function, the reference to a[0] causes 2 bytes (16 bits) to
|
||||
be read from memory starting at address addr1. Think about what would happen
|
||||
if addr1 was an odd address such as 0x10003. (Hint: it'd be an unaligned
|
||||
access.)
|
||||
|
||||
Despite the potential unaligned access problems with the above function, it
|
||||
is included in the kernel anyway but is understood to only work on
|
||||
16-bit-aligned addresses. It is up to the caller to ensure this alignment or
|
||||
not use this function at all. This alignment-unsafe function is still useful
|
||||
as it is a decent optimization for the cases when you can ensure alignment,
|
||||
which is true almost all of the time in ethernet networking context.
|
||||
|
||||
|
||||
Here is another example of some code that could cause unaligned accesses:
|
||||
void myfunc(u8 *data, u32 value)
|
||||
{
|
||||
[...]
|
||||
*((u32 *) data) = cpu_to_le32(value);
|
||||
[...]
|
||||
}
|
||||
|
||||
This code will cause unaligned accesses every time the data parameter points
|
||||
to an address that is not evenly divisible by 4.
|
||||
|
||||
In summary, the 2 main scenarios where you may run into unaligned access
|
||||
problems involve:
|
||||
1. Casting variables to types of different lengths
|
||||
2. Pointer arithmetic followed by access to at least 2 bytes of data
|
||||
|
||||
|
||||
Avoiding unaligned accesses
|
||||
===========================
|
||||
|
||||
The easiest way to avoid unaligned access is to use the get_unaligned() and
|
||||
put_unaligned() macros provided by the <asm/unaligned.h> header file.
|
||||
|
||||
Going back to an earlier example of code that potentially causes unaligned
|
||||
access:
|
||||
|
||||
void myfunc(u8 *data, u32 value)
|
||||
{
|
||||
[...]
|
||||
*((u32 *) data) = cpu_to_le32(value);
|
||||
[...]
|
||||
}
|
||||
|
||||
To avoid the unaligned memory access, you would rewrite it as follows:
|
||||
|
||||
void myfunc(u8 *data, u32 value)
|
||||
{
|
||||
[...]
|
||||
value = cpu_to_le32(value);
|
||||
put_unaligned(value, (u32 *) data);
|
||||
[...]
|
||||
}
|
||||
|
||||
The get_unaligned() macro works similarly. Assuming 'data' is a pointer to
|
||||
memory and you wish to avoid unaligned access, its usage is as follows:
|
||||
|
||||
u32 value = get_unaligned((u32 *) data);
|
||||
|
||||
These macros work for memory accesses of any length (not just 32 bits as
|
||||
in the examples above). Be aware that when compared to standard access of
|
||||
aligned memory, using these macros to access unaligned memory can be costly in
|
||||
terms of performance.
|
||||
|
||||
If use of such macros is not convenient, another option is to use memcpy(),
|
||||
where the source or destination (or both) are of type u8* or unsigned char*.
|
||||
Due to the byte-wise nature of this operation, unaligned accesses are avoided.
|
||||
|
||||
--
|
||||
Author: Daniel Drake <dsd@gentoo.org>
|
||||
With help from: Alan Cox, Avuton Olrich, Heikki Orsila, Jan Engelhardt,
|
||||
Johannes Berg, Kyle McMartin, Kyle Moffett, Randy Dunlap, Robert Hancock,
|
||||
Uli Kunitz, Vadim Lobanov
|
||||
|
|
@ -32,6 +32,13 @@ struct slabinfo {
|
|||
int sanity_checks, slab_size, store_user, trace;
|
||||
int order, poison, reclaim_account, red_zone;
|
||||
unsigned long partial, objects, slabs;
|
||||
unsigned long alloc_fastpath, alloc_slowpath;
|
||||
unsigned long free_fastpath, free_slowpath;
|
||||
unsigned long free_frozen, free_add_partial, free_remove_partial;
|
||||
unsigned long alloc_from_partial, alloc_slab, free_slab, alloc_refill;
|
||||
unsigned long cpuslab_flush, deactivate_full, deactivate_empty;
|
||||
unsigned long deactivate_to_head, deactivate_to_tail;
|
||||
unsigned long deactivate_remote_frees;
|
||||
int numa[MAX_NODES];
|
||||
int numa_partial[MAX_NODES];
|
||||
} slabinfo[MAX_SLABS];
|
||||
|
@ -64,8 +71,10 @@ int show_inverted = 0;
|
|||
int show_single_ref = 0;
|
||||
int show_totals = 0;
|
||||
int sort_size = 0;
|
||||
int sort_active = 0;
|
||||
int set_debug = 0;
|
||||
int show_ops = 0;
|
||||
int show_activity = 0;
|
||||
|
||||
/* Debug options */
|
||||
int sanity = 0;
|
||||
|
@ -93,8 +102,10 @@ void usage(void)
|
|||
printf("slabinfo 5/7/2007. (c) 2007 sgi. clameter@sgi.com\n\n"
|
||||
"slabinfo [-ahnpvtsz] [-d debugopts] [slab-regexp]\n"
|
||||
"-a|--aliases Show aliases\n"
|
||||
"-A|--activity Most active slabs first\n"
|
||||
"-d<options>|--debug=<options> Set/Clear Debug options\n"
|
||||
"-e|--empty Show empty slabs\n"
|
||||
"-D|--display-active Switch line format to activity\n"
|
||||
"-e|--empty Show empty slabs\n"
|
||||
"-f|--first-alias Show first alias\n"
|
||||
"-h|--help Show usage information\n"
|
||||
"-i|--inverted Inverted list\n"
|
||||
|
@ -281,8 +292,11 @@ int line = 0;
|
|||
|
||||
void first_line(void)
|
||||
{
|
||||
printf("Name Objects Objsize Space "
|
||||
"Slabs/Part/Cpu O/S O %%Fr %%Ef Flg\n");
|
||||
if (show_activity)
|
||||
printf("Name Objects Alloc Free %%Fast\n");
|
||||
else
|
||||
printf("Name Objects Objsize Space "
|
||||
"Slabs/Part/Cpu O/S O %%Fr %%Ef Flg\n");
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -309,6 +323,12 @@ unsigned long slab_size(struct slabinfo *s)
|
|||
return s->slabs * (page_size << s->order);
|
||||
}
|
||||
|
||||
unsigned long slab_activity(struct slabinfo *s)
|
||||
{
|
||||
return s->alloc_fastpath + s->free_fastpath +
|
||||
s->alloc_slowpath + s->free_slowpath;
|
||||
}
|
||||
|
||||
void slab_numa(struct slabinfo *s, int mode)
|
||||
{
|
||||
int node;
|
||||
|
@ -392,6 +412,71 @@ const char *onoff(int x)
|
|||
return "Off";
|
||||
}
|
||||
|
||||
void slab_stats(struct slabinfo *s)
|
||||
{
|
||||
unsigned long total_alloc;
|
||||
unsigned long total_free;
|
||||
unsigned long total;
|
||||
|
||||
if (!s->alloc_slab)
|
||||
return;
|
||||
|
||||
total_alloc = s->alloc_fastpath + s->alloc_slowpath;
|
||||
total_free = s->free_fastpath + s->free_slowpath;
|
||||
|
||||
if (!total_alloc)
|
||||
return;
|
||||
|
||||
printf("\n");
|
||||
printf("Slab Perf Counter Alloc Free %%Al %%Fr\n");
|
||||
printf("--------------------------------------------------\n");
|
||||
printf("Fastpath %8lu %8lu %3lu %3lu\n",
|
||||
s->alloc_fastpath, s->free_fastpath,
|
||||
s->alloc_fastpath * 100 / total_alloc,
|
||||
s->free_fastpath * 100 / total_free);
|
||||
printf("Slowpath %8lu %8lu %3lu %3lu\n",
|
||||
total_alloc - s->alloc_fastpath, s->free_slowpath,
|
||||
(total_alloc - s->alloc_fastpath) * 100 / total_alloc,
|
||||
s->free_slowpath * 100 / total_free);
|
||||
printf("Page Alloc %8lu %8lu %3lu %3lu\n",
|
||||
s->alloc_slab, s->free_slab,
|
||||
s->alloc_slab * 100 / total_alloc,
|
||||
s->free_slab * 100 / total_free);
|
||||
printf("Add partial %8lu %8lu %3lu %3lu\n",
|
||||
s->deactivate_to_head + s->deactivate_to_tail,
|
||||
s->free_add_partial,
|
||||
(s->deactivate_to_head + s->deactivate_to_tail) * 100 / total_alloc,
|
||||
s->free_add_partial * 100 / total_free);
|
||||
printf("Remove partial %8lu %8lu %3lu %3lu\n",
|
||||
s->alloc_from_partial, s->free_remove_partial,
|
||||
s->alloc_from_partial * 100 / total_alloc,
|
||||
s->free_remove_partial * 100 / total_free);
|
||||
|
||||
printf("RemoteObj/SlabFrozen %8lu %8lu %3lu %3lu\n",
|
||||
s->deactivate_remote_frees, s->free_frozen,
|
||||
s->deactivate_remote_frees * 100 / total_alloc,
|
||||
s->free_frozen * 100 / total_free);
|
||||
|
||||
printf("Total %8lu %8lu\n\n", total_alloc, total_free);
|
||||
|
||||
if (s->cpuslab_flush)
|
||||
printf("Flushes %8lu\n", s->cpuslab_flush);
|
||||
|
||||
if (s->alloc_refill)
|
||||
printf("Refill %8lu\n", s->alloc_refill);
|
||||
|
||||
total = s->deactivate_full + s->deactivate_empty +
|
||||
s->deactivate_to_head + s->deactivate_to_tail;
|
||||
|
||||
if (total)
|
||||
printf("Deactivate Full=%lu(%lu%%) Empty=%lu(%lu%%) "
|
||||
"ToHead=%lu(%lu%%) ToTail=%lu(%lu%%)\n",
|
||||
s->deactivate_full, (s->deactivate_full * 100) / total,
|
||||
s->deactivate_empty, (s->deactivate_empty * 100) / total,
|
||||
s->deactivate_to_head, (s->deactivate_to_head * 100) / total,
|
||||
s->deactivate_to_tail, (s->deactivate_to_tail * 100) / total);
|
||||
}
|
||||
|
||||
void report(struct slabinfo *s)
|
||||
{
|
||||
if (strcmp(s->name, "*") == 0)
|
||||
|
@ -430,6 +515,7 @@ void report(struct slabinfo *s)
|
|||
ops(s);
|
||||
show_tracking(s);
|
||||
slab_numa(s, 1);
|
||||
slab_stats(s);
|
||||
}
|
||||
|
||||
void slabcache(struct slabinfo *s)
|
||||
|
@ -479,13 +565,27 @@ void slabcache(struct slabinfo *s)
|
|||
*p++ = 'T';
|
||||
|
||||
*p = 0;
|
||||
printf("%-21s %8ld %7d %8s %14s %4d %1d %3ld %3ld %s\n",
|
||||
s->name, s->objects, s->object_size, size_str, dist_str,
|
||||
s->objs_per_slab, s->order,
|
||||
s->slabs ? (s->partial * 100) / s->slabs : 100,
|
||||
s->slabs ? (s->objects * s->object_size * 100) /
|
||||
(s->slabs * (page_size << s->order)) : 100,
|
||||
flags);
|
||||
if (show_activity) {
|
||||
unsigned long total_alloc;
|
||||
unsigned long total_free;
|
||||
|
||||
total_alloc = s->alloc_fastpath + s->alloc_slowpath;
|
||||
total_free = s->free_fastpath + s->free_slowpath;
|
||||
|
||||
printf("%-21s %8ld %8ld %8ld %3ld %3ld \n",
|
||||
s->name, s->objects,
|
||||
total_alloc, total_free,
|
||||
total_alloc ? (s->alloc_fastpath * 100 / total_alloc) : 0,
|
||||
total_free ? (s->free_fastpath * 100 / total_free) : 0);
|
||||
}
|
||||
else
|
||||
printf("%-21s %8ld %7d %8s %14s %4d %1d %3ld %3ld %s\n",
|
||||
s->name, s->objects, s->object_size, size_str, dist_str,
|
||||
s->objs_per_slab, s->order,
|
||||
s->slabs ? (s->partial * 100) / s->slabs : 100,
|
||||
s->slabs ? (s->objects * s->object_size * 100) /
|
||||
(s->slabs * (page_size << s->order)) : 100,
|
||||
flags);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -892,6 +992,8 @@ void sort_slabs(void)
|
|||
|
||||
if (sort_size)
|
||||
result = slab_size(s1) < slab_size(s2);
|
||||
else if (sort_active)
|
||||
result = slab_activity(s1) < slab_activity(s2);
|
||||
else
|
||||
result = strcasecmp(s1->name, s2->name);
|
||||
|
||||
|
@ -1074,6 +1176,23 @@ void read_slab_dir(void)
|
|||
free(t);
|
||||
slab->store_user = get_obj("store_user");
|
||||
slab->trace = get_obj("trace");
|
||||
slab->alloc_fastpath = get_obj("alloc_fastpath");
|
||||
slab->alloc_slowpath = get_obj("alloc_slowpath");
|
||||
slab->free_fastpath = get_obj("free_fastpath");
|
||||
slab->free_slowpath = get_obj("free_slowpath");
|
||||
slab->free_frozen= get_obj("free_frozen");
|
||||
slab->free_add_partial = get_obj("free_add_partial");
|
||||
slab->free_remove_partial = get_obj("free_remove_partial");
|
||||
slab->alloc_from_partial = get_obj("alloc_from_partial");
|
||||
slab->alloc_slab = get_obj("alloc_slab");
|
||||
slab->alloc_refill = get_obj("alloc_refill");
|
||||
slab->free_slab = get_obj("free_slab");
|
||||
slab->cpuslab_flush = get_obj("cpuslab_flush");
|
||||
slab->deactivate_full = get_obj("deactivate_full");
|
||||
slab->deactivate_empty = get_obj("deactivate_empty");
|
||||
slab->deactivate_to_head = get_obj("deactivate_to_head");
|
||||
slab->deactivate_to_tail = get_obj("deactivate_to_tail");
|
||||
slab->deactivate_remote_frees = get_obj("deactivate_remote_frees");
|
||||
chdir("..");
|
||||
if (slab->name[0] == ':')
|
||||
alias_targets++;
|
||||
|
@ -1124,7 +1243,9 @@ void output_slabs(void)
|
|||
|
||||
struct option opts[] = {
|
||||
{ "aliases", 0, NULL, 'a' },
|
||||
{ "activity", 0, NULL, 'A' },
|
||||
{ "debug", 2, NULL, 'd' },
|
||||
{ "display-activity", 0, NULL, 'D' },
|
||||
{ "empty", 0, NULL, 'e' },
|
||||
{ "first-alias", 0, NULL, 'f' },
|
||||
{ "help", 0, NULL, 'h' },
|
||||
|
@ -1149,7 +1270,7 @@ int main(int argc, char *argv[])
|
|||
|
||||
page_size = getpagesize();
|
||||
|
||||
while ((c = getopt_long(argc, argv, "ad::efhil1noprstvzTS",
|
||||
while ((c = getopt_long(argc, argv, "aAd::Defhil1noprstvzTS",
|
||||
opts, NULL)) != -1)
|
||||
switch (c) {
|
||||
case '1':
|
||||
|
@ -1158,11 +1279,17 @@ int main(int argc, char *argv[])
|
|||
case 'a':
|
||||
show_alias = 1;
|
||||
break;
|
||||
case 'A':
|
||||
sort_active = 1;
|
||||
break;
|
||||
case 'd':
|
||||
set_debug = 1;
|
||||
if (!debug_opt_scan(optarg))
|
||||
fatal("Invalid debug option '%s'\n", optarg);
|
||||
break;
|
||||
case 'D':
|
||||
show_activity = 1;
|
||||
break;
|
||||
case 'e':
|
||||
show_empty = 1;
|
||||
break;
|
||||
|
|
|
@ -4,3 +4,5 @@ ds2482
|
|||
- The Maxim/Dallas Semiconductor DS2482 provides 1-wire busses.
|
||||
ds2490
|
||||
- The Maxim/Dallas Semiconductor DS2490 builds USB <-> W1 bridges.
|
||||
w1-gpio
|
||||
- GPIO 1-wire bus master driver.
|
||||
|
|
|
@ -0,0 +1,33 @@
|
|||
Kernel driver w1-gpio
|
||||
=====================
|
||||
|
||||
Author: Ville Syrjala <syrjala@sci.fi>
|
||||
|
||||
|
||||
Description
|
||||
-----------
|
||||
|
||||
GPIO 1-wire bus master driver. The driver uses the GPIO API to control the
|
||||
wire and the GPIO pin can be specified using platform data.
|
||||
|
||||
|
||||
Example (mach-at91)
|
||||
-------------------
|
||||
|
||||
#include <linux/w1-gpio.h>
|
||||
|
||||
static struct w1_gpio_platform_data foo_w1_gpio_pdata = {
|
||||
.pin = AT91_PIN_PB20,
|
||||
.is_open_drain = 1,
|
||||
};
|
||||
|
||||
static struct platform_device foo_w1_device = {
|
||||
.name = "w1-gpio",
|
||||
.id = -1,
|
||||
.dev.platform_data = &foo_w1_gpio_pdata,
|
||||
};
|
||||
|
||||
...
|
||||
at91_set_GPIO_periph(foo_w1_gpio_pdata.pin, 1);
|
||||
at91_set_multi_drive(foo_w1_gpio_pdata.pin, 1);
|
||||
platform_device_register(&foo_w1_device);
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue