[PATCH] per-task-delay-accounting: documentation
Some documentation for delay accounting. Signed-off-by: Shailabh Nagar <nagar@watson.ibm.com> Signed-off-by: Balbir Singh <balbir@in.ibm.com> Cc: Jes Sorensen <jes@sgi.com> Cc: Peter Chubb <peterc@gelato.unsw.edu.au> Cc: Erich Focht <efocht@ess.nec.de> Cc: Levent Serinol <lserinol@gmail.com> Cc: Jay Lan <jlan@engr.sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
This commit is contained in:
parent
6f44993fe1
commit
a3baf649ca
|
@ -0,0 +1,115 @@
|
|||
Delay accounting
|
||||
----------------
|
||||
|
||||
Tasks encounter delays in execution when they wait
|
||||
for some kernel resource to become available e.g. a
|
||||
runnable task may wait for a free CPU to run on.
|
||||
|
||||
The per-task delay accounting functionality measures
|
||||
the delays experienced by a task while
|
||||
|
||||
a) waiting for a CPU (while being runnable)
|
||||
b) completion of synchronous block I/O initiated by the task
|
||||
c) swapping in pages
|
||||
|
||||
and makes these statistics available to userspace through
|
||||
the taskstats interface.
|
||||
|
||||
Such delays provide feedback for setting a task's cpu priority,
|
||||
io priority and rss limit values appropriately. Long delays for
|
||||
important tasks could be a trigger for raising its corresponding priority.
|
||||
|
||||
The functionality, through its use of the taskstats interface, also provides
|
||||
delay statistics aggregated for all tasks (or threads) belonging to a
|
||||
thread group (corresponding to a traditional Unix process). This is a commonly
|
||||
needed aggregation that is more efficiently done by the kernel.
|
||||
|
||||
Userspace utilities, particularly resource management applications, can also
|
||||
aggregate delay statistics into arbitrary groups. To enable this, delay
|
||||
statistics of a task are available both during its lifetime as well as on its
|
||||
exit, ensuring continuous and complete monitoring can be done.
|
||||
|
||||
|
||||
Interface
|
||||
---------
|
||||
|
||||
Delay accounting uses the taskstats interface which is described
|
||||
in detail in a separate document in this directory. Taskstats returns a
|
||||
generic data structure to userspace corresponding to per-pid and per-tgid
|
||||
statistics. The delay accounting functionality populates specific fields of
|
||||
this structure. See
|
||||
include/linux/taskstats.h
|
||||
for a description of the fields pertaining to delay accounting.
|
||||
It will generally be in the form of counters returning the cumulative
|
||||
delay seen for cpu, sync block I/O, swapin etc.
|
||||
|
||||
Taking the difference of two successive readings of a given
|
||||
counter (say cpu_delay_total) for a task will give the delay
|
||||
experienced by the task waiting for the corresponding resource
|
||||
in that interval.
|
||||
|
||||
When a task exits, records containing the per-task and per-process statistics
|
||||
are sent to userspace without requiring a command. More details are given in
|
||||
the taskstats interface description.
|
||||
|
||||
The getdelays.c userspace utility in this directory allows simple commands to
|
||||
be run and the corresponding delay statistics to be displayed. It also serves
|
||||
as an example of using the taskstats interface.
|
||||
|
||||
Usage
|
||||
-----
|
||||
|
||||
Compile the kernel with
|
||||
CONFIG_TASK_DELAY_ACCT=y
|
||||
CONFIG_TASKSTATS=y
|
||||
|
||||
Enable the accounting at boot time by adding
|
||||
the following to the kernel boot options
|
||||
delayacct
|
||||
|
||||
and after the system has booted up, use a utility
|
||||
similar to getdelays.c to access the delays
|
||||
seen by a given task or a task group (tgid).
|
||||
The utility also allows a given command to be
|
||||
executed and the corresponding delays to be
|
||||
seen.
|
||||
|
||||
General format of the getdelays command
|
||||
|
||||
getdelays [-t tgid] [-p pid] [-c cmd...]
|
||||
|
||||
|
||||
Get delays, since system boot, for pid 10
|
||||
# ./getdelays -p 10
|
||||
(output similar to next case)
|
||||
|
||||
Get sum of delays, since system boot, for all pids with tgid 5
|
||||
# ./getdelays -t 5
|
||||
|
||||
|
||||
CPU count real total virtual total delay total
|
||||
7876 92005750 100000000 24001500
|
||||
IO count delay total
|
||||
0 0
|
||||
MEM count delay total
|
||||
0 0
|
||||
|
||||
Get delays seen in executing a given simple command
|
||||
# ./getdelays -c ls /
|
||||
|
||||
bin data1 data3 data5 dev home media opt root srv sys usr
|
||||
boot data2 data4 data6 etc lib mnt proc sbin subdomain tmp var
|
||||
|
||||
|
||||
CPU count real total virtual total delay total
|
||||
6 4000250 4000000 0
|
||||
IO count delay total
|
||||
0 0
|
||||
MEM count delay total
|
||||
0 0
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,376 @@
|
|||
/* getdelays.c
|
||||
*
|
||||
* Utility to get per-pid and per-tgid delay accounting statistics
|
||||
* Also illustrates usage of the taskstats interface
|
||||
*
|
||||
* Copyright (C) Shailabh Nagar, IBM Corp. 2005
|
||||
* Copyright (C) Balbir Singh, IBM Corp. 2006
|
||||
*
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <errno.h>
|
||||
#include <unistd.h>
|
||||
#include <poll.h>
|
||||
#include <string.h>
|
||||
#include <fcntl.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/socket.h>
|
||||
#include <sys/types.h>
|
||||
#include <signal.h>
|
||||
|
||||
#include <linux/genetlink.h>
|
||||
#include <linux/taskstats.h>
|
||||
|
||||
/*
|
||||
* Generic macros for dealing with netlink sockets. Might be duplicated
|
||||
* elsewhere. It is recommended that commercial grade applications use
|
||||
* libnl or libnetlink and use the interfaces provided by the library
|
||||
*/
|
||||
#define GENLMSG_DATA(glh) ((void *)(NLMSG_DATA(glh) + GENL_HDRLEN))
|
||||
#define GENLMSG_PAYLOAD(glh) (NLMSG_PAYLOAD(glh, 0) - GENL_HDRLEN)
|
||||
#define NLA_DATA(na) ((void *)((char*)(na) + NLA_HDRLEN))
|
||||
#define NLA_PAYLOAD(len) (len - NLA_HDRLEN)
|
||||
|
||||
#define err(code, fmt, arg...) do { printf(fmt, ##arg); exit(code); } while (0)
|
||||
int done = 0;
|
||||
|
||||
/*
|
||||
* Create a raw netlink socket and bind
|
||||
*/
|
||||
static int create_nl_socket(int protocol, int groups)
|
||||
{
|
||||
socklen_t addr_len;
|
||||
int fd;
|
||||
struct sockaddr_nl local;
|
||||
|
||||
fd = socket(AF_NETLINK, SOCK_RAW, protocol);
|
||||
if (fd < 0)
|
||||
return -1;
|
||||
|
||||
memset(&local, 0, sizeof(local));
|
||||
local.nl_family = AF_NETLINK;
|
||||
local.nl_groups = groups;
|
||||
|
||||
if (bind(fd, (struct sockaddr *) &local, sizeof(local)) < 0)
|
||||
goto error;
|
||||
|
||||
return fd;
|
||||
error:
|
||||
close(fd);
|
||||
return -1;
|
||||
}
|
||||
|
||||
int sendto_fd(int s, const char *buf, int bufLen)
|
||||
{
|
||||
struct sockaddr_nl nladdr;
|
||||
int r;
|
||||
|
||||
memset(&nladdr, 0, sizeof(nladdr));
|
||||
nladdr.nl_family = AF_NETLINK;
|
||||
|
||||
while ((r = sendto(s, buf, bufLen, 0, (struct sockaddr *) &nladdr,
|
||||
sizeof(nladdr))) < bufLen) {
|
||||
if (r > 0) {
|
||||
buf += r;
|
||||
bufLen -= r;
|
||||
} else if (errno != EAGAIN)
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Probe the controller in genetlink to find the family id
|
||||
* for the TASKSTATS family
|
||||
*/
|
||||
int get_family_id(int sd)
|
||||
{
|
||||
struct {
|
||||
struct nlmsghdr n;
|
||||
struct genlmsghdr g;
|
||||
char buf[256];
|
||||
} family_req;
|
||||
struct {
|
||||
struct nlmsghdr n;
|
||||
struct genlmsghdr g;
|
||||
char buf[256];
|
||||
} ans;
|
||||
|
||||
int id;
|
||||
struct nlattr *na;
|
||||
int rep_len;
|
||||
|
||||
/* Get family name */
|
||||
family_req.n.nlmsg_type = GENL_ID_CTRL;
|
||||
family_req.n.nlmsg_flags = NLM_F_REQUEST;
|
||||
family_req.n.nlmsg_seq = 0;
|
||||
family_req.n.nlmsg_pid = getpid();
|
||||
family_req.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
|
||||
family_req.g.cmd = CTRL_CMD_GETFAMILY;
|
||||
family_req.g.version = 0x1;
|
||||
na = (struct nlattr *) GENLMSG_DATA(&family_req);
|
||||
na->nla_type = CTRL_ATTR_FAMILY_NAME;
|
||||
na->nla_len = strlen(TASKSTATS_GENL_NAME) + 1 + NLA_HDRLEN;
|
||||
strcpy(NLA_DATA(na), TASKSTATS_GENL_NAME);
|
||||
family_req.n.nlmsg_len += NLMSG_ALIGN(na->nla_len);
|
||||
|
||||
if (sendto_fd(sd, (char *) &family_req, family_req.n.nlmsg_len) < 0)
|
||||
err(1, "error sending message via Netlink\n");
|
||||
|
||||
rep_len = recv(sd, &ans, sizeof(ans), 0);
|
||||
|
||||
if (rep_len < 0)
|
||||
err(1, "error receiving reply message via Netlink\n");
|
||||
|
||||
|
||||
/* Validate response message */
|
||||
if (!NLMSG_OK((&ans.n), rep_len))
|
||||
err(1, "invalid reply message received via Netlink\n");
|
||||
|
||||
if (ans.n.nlmsg_type == NLMSG_ERROR) { /* error */
|
||||
printf("error received NACK - leaving\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
|
||||
na = (struct nlattr *) GENLMSG_DATA(&ans);
|
||||
na = (struct nlattr *) ((char *) na + NLA_ALIGN(na->nla_len));
|
||||
if (na->nla_type == CTRL_ATTR_FAMILY_ID) {
|
||||
id = *(__u16 *) NLA_DATA(na);
|
||||
}
|
||||
return id;
|
||||
}
|
||||
|
||||
void print_taskstats(struct taskstats *t)
|
||||
{
|
||||
printf("\n\nCPU %15s%15s%15s%15s\n"
|
||||
" %15llu%15llu%15llu%15llu\n"
|
||||
"IO %15s%15s\n"
|
||||
" %15llu%15llu\n"
|
||||
"MEM %15s%15s\n"
|
||||
" %15llu%15llu\n\n",
|
||||
"count", "real total", "virtual total", "delay total",
|
||||
t->cpu_count, t->cpu_run_real_total, t->cpu_run_virtual_total,
|
||||
t->cpu_delay_total,
|
||||
"count", "delay total",
|
||||
t->blkio_count, t->blkio_delay_total,
|
||||
"count", "delay total", t->swapin_count, t->swapin_delay_total);
|
||||
}
|
||||
|
||||
void sigchld(int sig)
|
||||
{
|
||||
done = 1;
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int rc;
|
||||
int sk_nl;
|
||||
struct nlmsghdr *nlh;
|
||||
struct genlmsghdr *genlhdr;
|
||||
char *buf;
|
||||
struct taskstats_cmd_param *param;
|
||||
__u16 id;
|
||||
struct nlattr *na;
|
||||
|
||||
/* For receiving */
|
||||
struct sockaddr_nl kern_nla, from_nla;
|
||||
socklen_t from_nla_len;
|
||||
int recv_len;
|
||||
struct taskstats_reply *reply;
|
||||
|
||||
struct {
|
||||
struct nlmsghdr n;
|
||||
struct genlmsghdr g;
|
||||
char buf[256];
|
||||
} req;
|
||||
|
||||
struct {
|
||||
struct nlmsghdr n;
|
||||
struct genlmsghdr g;
|
||||
char buf[256];
|
||||
} ans;
|
||||
|
||||
int nl_sd = -1;
|
||||
int rep_len;
|
||||
int len = 0;
|
||||
int aggr_len, len2;
|
||||
struct sockaddr_nl nladdr;
|
||||
pid_t tid = 0;
|
||||
pid_t rtid = 0;
|
||||
int cmd_type = TASKSTATS_TYPE_TGID;
|
||||
int c, status;
|
||||
int forking = 0;
|
||||
struct sigaction act = {
|
||||
.sa_handler = SIG_IGN,
|
||||
.sa_mask = SA_NOMASK,
|
||||
};
|
||||
struct sigaction tact ;
|
||||
|
||||
if (argc < 3) {
|
||||
printf("usage %s [-t tgid][-p pid][-c cmd]\n", argv[0]);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
tact.sa_handler = sigchld;
|
||||
sigemptyset(&tact.sa_mask);
|
||||
if (sigaction(SIGCHLD, &tact, NULL) < 0)
|
||||
err(1, "sigaction failed for SIGCHLD\n");
|
||||
|
||||
while (1) {
|
||||
|
||||
c = getopt(argc, argv, "t:p:c:");
|
||||
if (c < 0)
|
||||
break;
|
||||
|
||||
switch (c) {
|
||||
case 't':
|
||||
tid = atoi(optarg);
|
||||
if (!tid)
|
||||
err(1, "Invalid tgid\n");
|
||||
cmd_type = TASKSTATS_CMD_ATTR_TGID;
|
||||
break;
|
||||
case 'p':
|
||||
tid = atoi(optarg);
|
||||
if (!tid)
|
||||
err(1, "Invalid pid\n");
|
||||
cmd_type = TASKSTATS_CMD_ATTR_TGID;
|
||||
break;
|
||||
case 'c':
|
||||
opterr = 0;
|
||||
tid = fork();
|
||||
if (tid < 0)
|
||||
err(1, "fork failed\n");
|
||||
|
||||
if (tid == 0) { /* child process */
|
||||
if (execvp(argv[optind - 1], &argv[optind - 1]) < 0) {
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
forking = 1;
|
||||
break;
|
||||
default:
|
||||
printf("usage %s [-t tgid][-p pid][-c cmd]\n", argv[0]);
|
||||
exit(-1);
|
||||
break;
|
||||
}
|
||||
if (c == 'c')
|
||||
break;
|
||||
}
|
||||
|
||||
/* Construct Netlink request message */
|
||||
|
||||
/* Send Netlink request message & get reply */
|
||||
|
||||
if ((nl_sd =
|
||||
create_nl_socket(NETLINK_GENERIC, TASKSTATS_LISTEN_GROUP)) < 0)
|
||||
err(1, "error creating Netlink socket\n");
|
||||
|
||||
|
||||
id = get_family_id(nl_sd);
|
||||
|
||||
/* Send command needed */
|
||||
req.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
|
||||
req.n.nlmsg_type = id;
|
||||
req.n.nlmsg_flags = NLM_F_REQUEST;
|
||||
req.n.nlmsg_seq = 0;
|
||||
req.n.nlmsg_pid = tid;
|
||||
req.g.cmd = TASKSTATS_CMD_GET;
|
||||
na = (struct nlattr *) GENLMSG_DATA(&req);
|
||||
na->nla_type = cmd_type;
|
||||
na->nla_len = sizeof(unsigned int) + NLA_HDRLEN;
|
||||
*(__u32 *) NLA_DATA(na) = tid;
|
||||
req.n.nlmsg_len += NLMSG_ALIGN(na->nla_len);
|
||||
|
||||
|
||||
if (!forking && sendto_fd(nl_sd, (char *) &req, req.n.nlmsg_len) < 0)
|
||||
err(1, "error sending message via Netlink\n");
|
||||
|
||||
act.sa_handler = SIG_IGN;
|
||||
sigemptyset(&act.sa_mask);
|
||||
if (sigaction(SIGINT, &act, NULL) < 0)
|
||||
err(1, "sigaction failed for SIGINT\n");
|
||||
|
||||
do {
|
||||
int i;
|
||||
struct pollfd pfd;
|
||||
int pollres;
|
||||
|
||||
pfd.events = 0xffff & ~POLLOUT;
|
||||
pfd.fd = nl_sd;
|
||||
pollres = poll(&pfd, 1, 5000);
|
||||
if (pollres < 0 || done) {
|
||||
break;
|
||||
}
|
||||
|
||||
rep_len = recv(nl_sd, &ans, sizeof(ans), 0);
|
||||
nladdr.nl_family = AF_NETLINK;
|
||||
nladdr.nl_groups = TASKSTATS_LISTEN_GROUP;
|
||||
|
||||
if (ans.n.nlmsg_type == NLMSG_ERROR) { /* error */
|
||||
printf("error received NACK - leaving\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (rep_len < 0) {
|
||||
err(1, "error receiving reply message via Netlink\n");
|
||||
break;
|
||||
}
|
||||
|
||||
/* Validate response message */
|
||||
if (!NLMSG_OK((&ans.n), rep_len))
|
||||
err(1, "invalid reply message received via Netlink\n");
|
||||
|
||||
rep_len = GENLMSG_PAYLOAD(&ans.n);
|
||||
|
||||
na = (struct nlattr *) GENLMSG_DATA(&ans);
|
||||
len = 0;
|
||||
i = 0;
|
||||
while (len < rep_len) {
|
||||
len += NLA_ALIGN(na->nla_len);
|
||||
switch (na->nla_type) {
|
||||
case TASKSTATS_TYPE_AGGR_PID:
|
||||
/* Fall through */
|
||||
case TASKSTATS_TYPE_AGGR_TGID:
|
||||
aggr_len = NLA_PAYLOAD(na->nla_len);
|
||||
len2 = 0;
|
||||
/* For nested attributes, na follows */
|
||||
na = (struct nlattr *) NLA_DATA(na);
|
||||
done = 0;
|
||||
while (len2 < aggr_len) {
|
||||
switch (na->nla_type) {
|
||||
case TASKSTATS_TYPE_PID:
|
||||
rtid = *(int *) NLA_DATA(na);
|
||||
break;
|
||||
case TASKSTATS_TYPE_TGID:
|
||||
rtid = *(int *) NLA_DATA(na);
|
||||
break;
|
||||
case TASKSTATS_TYPE_STATS:
|
||||
if (rtid == tid) {
|
||||
print_taskstats((struct taskstats *)
|
||||
NLA_DATA(na));
|
||||
done = 1;
|
||||
}
|
||||
break;
|
||||
}
|
||||
len2 += NLA_ALIGN(na->nla_len);
|
||||
na = (struct nlattr *) ((char *) na + len2);
|
||||
if (done)
|
||||
break;
|
||||
}
|
||||
}
|
||||
na = (struct nlattr *) (GENLMSG_DATA(&ans) + len);
|
||||
if (done)
|
||||
break;
|
||||
}
|
||||
if (done)
|
||||
break;
|
||||
}
|
||||
while (1);
|
||||
|
||||
close(nl_sd);
|
||||
return 0;
|
||||
}
|
|
@ -39,6 +39,8 @@ belongs (the task does not need to be the thread group leader). The need for
|
|||
per-tgid stats to be sent for each exiting task is explained in the per-tgid
|
||||
stats section below.
|
||||
|
||||
getdelays.c is a simple utility demonstrating usage of the taskstats interface
|
||||
for reporting delay accounting statistics.
|
||||
|
||||
Interface
|
||||
---------
|
||||
|
|
Loading…
Reference in New Issue