linux-sg2042/include/linux/fscache-cache.h

555 lines
18 KiB
C
Raw Normal View History

/* General filesystem caching backing cache interface
*
* Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* NOTE!!! See:
*
* Documentation/filesystems/caching/backend-api.txt
*
* for a description of the cache backend interface declared here.
*/
#ifndef _LINUX_FSCACHE_CACHE_H
#define _LINUX_FSCACHE_CACHE_H
#include <linux/fscache.h>
#include <linux/sched.h>
#include <linux/workqueue.h>
#define NR_MAXCACHES BITS_PER_LONG
struct fscache_cache;
struct fscache_cache_ops;
struct fscache_object;
struct fscache_operation;
/*
* cache tag definition
*/
struct fscache_cache_tag {
struct list_head link;
struct fscache_cache *cache; /* cache referred to by this tag */
unsigned long flags;
#define FSCACHE_TAG_RESERVED 0 /* T if tag is reserved for a cache */
atomic_t usage;
char name[0]; /* tag name */
};
/*
* cache definition
*/
struct fscache_cache {
const struct fscache_cache_ops *ops;
struct fscache_cache_tag *tag; /* tag representing this cache */
struct kobject *kobj; /* system representation of this cache */
struct list_head link; /* link in list of caches */
size_t max_index_size; /* maximum size of index data */
char identifier[36]; /* cache label */
/* node management */
struct work_struct op_gc; /* operation garbage collector */
struct list_head object_list; /* list of data/index objects */
struct list_head op_gc_list; /* list of ops to be deleted */
spinlock_t object_list_lock;
spinlock_t op_gc_list_lock;
atomic_t object_count; /* no. of live objects in this cache */
struct fscache_object *fsdef; /* object for the fsdef index */
unsigned long flags;
#define FSCACHE_IOERROR 0 /* cache stopped on I/O error */
#define FSCACHE_CACHE_WITHDRAWN 1 /* cache has been withdrawn */
};
extern wait_queue_head_t fscache_cache_cleared_wq;
/*
* operation to be applied to a cache object
* - retrieval initiation operations are done in the context of the process
* that issued them, and not in an async thread pool
*/
typedef void (*fscache_operation_release_t)(struct fscache_operation *op);
typedef void (*fscache_operation_processor_t)(struct fscache_operation *op);
FS-Cache: Fix operation state management and accounting Fix the state management of internal fscache operations and the accounting of what operations are in what states. This is done by: (1) Give struct fscache_operation a enum variable that directly represents the state it's currently in, rather than spreading this knowledge over a bunch of flags, who's processing the operation at the moment and whether it is queued or not. This makes it easier to write assertions to check the state at various points and to prevent invalid state transitions. (2) Add an 'operation complete' state and supply a function to indicate the completion of an operation (fscache_op_complete()) and make things call it. The final call to fscache_put_operation() can then check that an op in the appropriate state (complete or cancelled). (3) Adjust the use of object->n_ops, ->n_in_progress, ->n_exclusive to better govern the state of an object: (a) The ->n_ops is now the number of extant operations on the object and is now decremented by fscache_put_operation() only. (b) The ->n_in_progress is simply the number of objects that have been taken off of the object's pending queue for the purposes of being run. This is decremented by fscache_op_complete() only. (c) The ->n_exclusive is the number of exclusive ops that have been submitted and queued or are in progress. It is decremented by fscache_op_complete() and by fscache_cancel_op(). fscache_put_operation() and fscache_operation_gc() now no longer try to clean up ->n_exclusive and ->n_in_progress. That was leading to double decrements against fscache_cancel_op(). fscache_cancel_op() now no longer decrements ->n_ops. That was leading to double decrements against fscache_put_operation(). fscache_submit_exclusive_op() now decides whether it has to queue an op based on ->n_in_progress being > 0 rather than ->n_ops > 0 as the latter will persist in being true even after all preceding operations have been cancelled or completed. Furthermore, if an object is active and there are runnable ops against it, there must be at least one op running. (4) Add a remaining-pages counter (n_pages) to struct fscache_retrieval and provide a function to record completion of the pages as they complete. When n_pages reaches 0, the operation is deemed to be complete and fscache_op_complete() is called. Add calls to fscache_retrieval_complete() anywhere we've finished with a page we've been given to read or allocate for. This includes places where we just return pages to the netfs for reading from the server and where accessing the cache fails and we discard the proposed netfs page. The bugs in the unfixed state management manifest themselves as oopses like the following where the operation completion gets out of sync with return of the cookie by the netfs. This is possible because the cache unlocks and returns all the netfs pages before recording its completion - which means that there's nothing to stop the netfs discarding them and returning the cookie. FS-Cache: Cookie 'NFS.fh' still has outstanding reads ------------[ cut here ]------------ kernel BUG at fs/fscache/cookie.c:519! invalid opcode: 0000 [#1] SMP CPU 1 Modules linked in: cachefiles nfs fscache auth_rpcgss nfs_acl lockd sunrpc Pid: 400, comm: kswapd0 Not tainted 3.1.0-rc7-fsdevel+ #1090 /DG965RY RIP: 0010:[<ffffffffa007050a>] [<ffffffffa007050a>] __fscache_relinquish_cookie+0x170/0x343 [fscache] RSP: 0018:ffff8800368cfb00 EFLAGS: 00010282 RAX: 000000000000003c RBX: ffff880023cc8790 RCX: 0000000000000000 RDX: 0000000000002f2e RSI: 0000000000000001 RDI: ffffffff813ab86c RBP: ffff8800368cfb50 R08: 0000000000000002 R09: 0000000000000000 R10: ffff88003a1b7890 R11: ffff88001df6e488 R12: ffff880023d8ed98 R13: ffff880023cc8798 R14: 0000000000000004 R15: ffff88003b8bf370 FS: 0000000000000000(0000) GS:ffff88003bd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00000000008ba008 CR3: 0000000023d93000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process kswapd0 (pid: 400, threadinfo ffff8800368ce000, task ffff88003b8bf040) Stack: ffff88003b8bf040 ffff88001df6e528 ffff88001df6e528 ffffffffa00b46b0 ffff88003b8bf040 ffff88001df6e488 ffff88001df6e620 ffffffffa00b46b0 ffff88001ebd04c8 0000000000000004 ffff8800368cfb70 ffffffffa00b2c91 Call Trace: [<ffffffffa00b2c91>] nfs_fscache_release_inode_cookie+0x3b/0x47 [nfs] [<ffffffffa008f25f>] nfs_clear_inode+0x3c/0x41 [nfs] [<ffffffffa0090df1>] nfs4_evict_inode+0x2f/0x33 [nfs] [<ffffffff810d8d47>] evict+0xa1/0x15c [<ffffffff810d8e2e>] dispose_list+0x2c/0x38 [<ffffffff810d9ebd>] prune_icache_sb+0x28c/0x29b [<ffffffff810c56b7>] prune_super+0xd5/0x140 [<ffffffff8109b615>] shrink_slab+0x102/0x1ab [<ffffffff8109d690>] balance_pgdat+0x2f2/0x595 [<ffffffff8103e009>] ? process_timeout+0xb/0xb [<ffffffff8109dba3>] kswapd+0x270/0x289 [<ffffffff8104c5ea>] ? __init_waitqueue_head+0x46/0x46 [<ffffffff8109d933>] ? balance_pgdat+0x595/0x595 [<ffffffff8104bf7a>] kthread+0x7f/0x87 [<ffffffff813ad6b4>] kernel_thread_helper+0x4/0x10 [<ffffffff81026b98>] ? finish_task_switch+0x45/0xc0 [<ffffffff813abcdd>] ? retint_restore_args+0xe/0xe [<ffffffff8104befb>] ? __init_kthread_worker+0x53/0x53 [<ffffffff813ad6b0>] ? gs_change+0xb/0xb Signed-off-by: David Howells <dhowells@redhat.com>
2012-12-21 05:52:35 +08:00
enum fscache_operation_state {
FSCACHE_OP_ST_BLANK, /* Op is not yet submitted */
FSCACHE_OP_ST_INITIALISED, /* Op is initialised */
FSCACHE_OP_ST_PENDING, /* Op is blocked from running */
FSCACHE_OP_ST_IN_PROGRESS, /* Op is in progress */
FSCACHE_OP_ST_COMPLETE, /* Op is complete */
FSCACHE_OP_ST_CANCELLED, /* Op has been cancelled */
FSCACHE_OP_ST_DEAD /* Op is now dead */
};
struct fscache_operation {
struct work_struct work; /* record for async ops */
struct list_head pend_link; /* link in object->pending_ops */
struct fscache_object *object; /* object to be operated upon */
unsigned long flags;
#define FSCACHE_OP_TYPE 0x000f /* operation type */
#define FSCACHE_OP_ASYNC 0x0001 /* - async op, processor may sleep for disk */
#define FSCACHE_OP_MYTHREAD 0x0002 /* - processing is done be issuing thread, not pool */
#define FSCACHE_OP_WAITING 4 /* cleared when op is woken */
#define FSCACHE_OP_EXCLUSIVE 5 /* exclusive op, other ops must wait */
FS-Cache: Fix operation state management and accounting Fix the state management of internal fscache operations and the accounting of what operations are in what states. This is done by: (1) Give struct fscache_operation a enum variable that directly represents the state it's currently in, rather than spreading this knowledge over a bunch of flags, who's processing the operation at the moment and whether it is queued or not. This makes it easier to write assertions to check the state at various points and to prevent invalid state transitions. (2) Add an 'operation complete' state and supply a function to indicate the completion of an operation (fscache_op_complete()) and make things call it. The final call to fscache_put_operation() can then check that an op in the appropriate state (complete or cancelled). (3) Adjust the use of object->n_ops, ->n_in_progress, ->n_exclusive to better govern the state of an object: (a) The ->n_ops is now the number of extant operations on the object and is now decremented by fscache_put_operation() only. (b) The ->n_in_progress is simply the number of objects that have been taken off of the object's pending queue for the purposes of being run. This is decremented by fscache_op_complete() only. (c) The ->n_exclusive is the number of exclusive ops that have been submitted and queued or are in progress. It is decremented by fscache_op_complete() and by fscache_cancel_op(). fscache_put_operation() and fscache_operation_gc() now no longer try to clean up ->n_exclusive and ->n_in_progress. That was leading to double decrements against fscache_cancel_op(). fscache_cancel_op() now no longer decrements ->n_ops. That was leading to double decrements against fscache_put_operation(). fscache_submit_exclusive_op() now decides whether it has to queue an op based on ->n_in_progress being > 0 rather than ->n_ops > 0 as the latter will persist in being true even after all preceding operations have been cancelled or completed. Furthermore, if an object is active and there are runnable ops against it, there must be at least one op running. (4) Add a remaining-pages counter (n_pages) to struct fscache_retrieval and provide a function to record completion of the pages as they complete. When n_pages reaches 0, the operation is deemed to be complete and fscache_op_complete() is called. Add calls to fscache_retrieval_complete() anywhere we've finished with a page we've been given to read or allocate for. This includes places where we just return pages to the netfs for reading from the server and where accessing the cache fails and we discard the proposed netfs page. The bugs in the unfixed state management manifest themselves as oopses like the following where the operation completion gets out of sync with return of the cookie by the netfs. This is possible because the cache unlocks and returns all the netfs pages before recording its completion - which means that there's nothing to stop the netfs discarding them and returning the cookie. FS-Cache: Cookie 'NFS.fh' still has outstanding reads ------------[ cut here ]------------ kernel BUG at fs/fscache/cookie.c:519! invalid opcode: 0000 [#1] SMP CPU 1 Modules linked in: cachefiles nfs fscache auth_rpcgss nfs_acl lockd sunrpc Pid: 400, comm: kswapd0 Not tainted 3.1.0-rc7-fsdevel+ #1090 /DG965RY RIP: 0010:[<ffffffffa007050a>] [<ffffffffa007050a>] __fscache_relinquish_cookie+0x170/0x343 [fscache] RSP: 0018:ffff8800368cfb00 EFLAGS: 00010282 RAX: 000000000000003c RBX: ffff880023cc8790 RCX: 0000000000000000 RDX: 0000000000002f2e RSI: 0000000000000001 RDI: ffffffff813ab86c RBP: ffff8800368cfb50 R08: 0000000000000002 R09: 0000000000000000 R10: ffff88003a1b7890 R11: ffff88001df6e488 R12: ffff880023d8ed98 R13: ffff880023cc8798 R14: 0000000000000004 R15: ffff88003b8bf370 FS: 0000000000000000(0000) GS:ffff88003bd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00000000008ba008 CR3: 0000000023d93000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process kswapd0 (pid: 400, threadinfo ffff8800368ce000, task ffff88003b8bf040) Stack: ffff88003b8bf040 ffff88001df6e528 ffff88001df6e528 ffffffffa00b46b0 ffff88003b8bf040 ffff88001df6e488 ffff88001df6e620 ffffffffa00b46b0 ffff88001ebd04c8 0000000000000004 ffff8800368cfb70 ffffffffa00b2c91 Call Trace: [<ffffffffa00b2c91>] nfs_fscache_release_inode_cookie+0x3b/0x47 [nfs] [<ffffffffa008f25f>] nfs_clear_inode+0x3c/0x41 [nfs] [<ffffffffa0090df1>] nfs4_evict_inode+0x2f/0x33 [nfs] [<ffffffff810d8d47>] evict+0xa1/0x15c [<ffffffff810d8e2e>] dispose_list+0x2c/0x38 [<ffffffff810d9ebd>] prune_icache_sb+0x28c/0x29b [<ffffffff810c56b7>] prune_super+0xd5/0x140 [<ffffffff8109b615>] shrink_slab+0x102/0x1ab [<ffffffff8109d690>] balance_pgdat+0x2f2/0x595 [<ffffffff8103e009>] ? process_timeout+0xb/0xb [<ffffffff8109dba3>] kswapd+0x270/0x289 [<ffffffff8104c5ea>] ? __init_waitqueue_head+0x46/0x46 [<ffffffff8109d933>] ? balance_pgdat+0x595/0x595 [<ffffffff8104bf7a>] kthread+0x7f/0x87 [<ffffffff813ad6b4>] kernel_thread_helper+0x4/0x10 [<ffffffff81026b98>] ? finish_task_switch+0x45/0xc0 [<ffffffff813abcdd>] ? retint_restore_args+0xe/0xe [<ffffffff8104befb>] ? __init_kthread_worker+0x53/0x53 [<ffffffff813ad6b0>] ? gs_change+0xb/0xb Signed-off-by: David Howells <dhowells@redhat.com>
2012-12-21 05:52:35 +08:00
#define FSCACHE_OP_DEC_READ_CNT 6 /* decrement object->n_reads on destruction */
FS-Cache: Simplify cookie retention for fscache_objects, fixing oops Simplify the way fscache cache objects retain their cookie. The way I implemented the cookie storage handling made synchronisation a pain (ie. the object state machine can't rely on the cookie actually still being there). Instead of the the object being detached from the cookie and the cookie being freed in __fscache_relinquish_cookie(), we defer both operations: (*) The detachment of the object from the list in the cookie now takes place in fscache_drop_object() and is thus governed by the object state machine (fscache_detach_from_cookie() has been removed). (*) The release of the cookie is now in fscache_object_destroy() - which is called by the cache backend just before it frees the object. This means that the fscache_cookie struct is now available to the cache all the way through from ->alloc_object() to ->drop_object() and ->put_object() - meaning that it's no longer necessary to take object->lock to guarantee access. However, __fscache_relinquish_cookie() doesn't wait for the object to go all the way through to destruction before letting the netfs proceed. That would massively slow down the netfs. Since __fscache_relinquish_cookie() leaves the cookie around, in must therefore break all attachments to the netfs - which includes ->def, ->netfs_data and any outstanding page read/writes. To handle this, struct fscache_cookie now has an n_active counter: (1) This starts off initialised to 1. (2) Any time the cache needs to get at the netfs data, it calls fscache_use_cookie() to increment it - if it is not zero. If it was zero, then access is not permitted. (3) When the cache has finished with the data, it calls fscache_unuse_cookie() to decrement it. This does a wake-up on it if it reaches 0. (4) __fscache_relinquish_cookie() decrements n_active and then waits for it to reach 0. The initialisation to 1 in step (1) ensures that we only get wake ups when we're trying to get rid of the cookie. This leaves __fscache_relinquish_cookie() a lot simpler. *** This fixes a problem in the current code whereby if fscache_invalidate() is followed sufficiently quickly by fscache_relinquish_cookie() then it is possible for __fscache_relinquish_cookie() to have detached the cookie from the object and cleared the pointer before a thread is dispatched to process the invalidation state in the object state machine. Since the pending write clearance was deferred to the invalidation state to make it asynchronous, we need to either wait in relinquishment for the stores tree to be cleared in the invalidation state or we need to handle the clearance in relinquishment. Further, if the relinquishment code does clear the tree, then the invalidation state need to make the clearance contingent on still having the cookie to hand (since that's where the tree is rooted) and we have to prevent the cookie from disappearing for the duration. This can lead to an oops like the following: BUG: unable to handle kernel NULL pointer dereference at 000000000000000c ... RIP: 0010:[<ffffffff8151023e>] _spin_lock+0xe/0x30 ... CR2: 000000000000000c ... ... Process kslowd002 (...) .... Call Trace: [<ffffffffa01c3278>] fscache_invalidate_writes+0x38/0xd0 [fscache] [<ffffffff810096f0>] ? __switch_to+0xd0/0x320 [<ffffffff8105e759>] ? find_busiest_queue+0x69/0x150 [<ffffffff8110ddd4>] ? slow_work_enqueue+0x104/0x180 [<ffffffffa01c1303>] fscache_object_slow_work_execute+0x5e3/0x9d0 [fscache] [<ffffffff81096b67>] ? bit_waitqueue+0x17/0xd0 [<ffffffff8110e233>] slow_work_execute+0x233/0x310 [<ffffffff8110e515>] slow_work_thread+0x205/0x360 [<ffffffff81096ca0>] ? autoremove_wake_function+0x0/0x40 [<ffffffff8110e310>] ? slow_work_thread+0x0/0x360 [<ffffffff81096936>] kthread+0x96/0xa0 [<ffffffff8100c0ca>] child_rip+0xa/0x20 [<ffffffff810968a0>] ? kthread+0x0/0xa0 [<ffffffff8100c0c0>] ? child_rip+0x0/0x20 The parameter to fscache_invalidate_writes() was object->cookie which is NULL. Signed-off-by: David Howells <dhowells@redhat.com> Tested-By: Milosz Tanski <milosz@adfin.com> Acked-by: Jeff Layton <jlayton@redhat.com>
2013-05-11 02:50:26 +08:00
#define FSCACHE_OP_UNUSE_COOKIE 7 /* call fscache_unuse_cookie() on completion */
#define FSCACHE_OP_KEEP_FLAGS 0x00f0 /* flags to keep when repurposing an op */
FS-Cache: Fix operation state management and accounting Fix the state management of internal fscache operations and the accounting of what operations are in what states. This is done by: (1) Give struct fscache_operation a enum variable that directly represents the state it's currently in, rather than spreading this knowledge over a bunch of flags, who's processing the operation at the moment and whether it is queued or not. This makes it easier to write assertions to check the state at various points and to prevent invalid state transitions. (2) Add an 'operation complete' state and supply a function to indicate the completion of an operation (fscache_op_complete()) and make things call it. The final call to fscache_put_operation() can then check that an op in the appropriate state (complete or cancelled). (3) Adjust the use of object->n_ops, ->n_in_progress, ->n_exclusive to better govern the state of an object: (a) The ->n_ops is now the number of extant operations on the object and is now decremented by fscache_put_operation() only. (b) The ->n_in_progress is simply the number of objects that have been taken off of the object's pending queue for the purposes of being run. This is decremented by fscache_op_complete() only. (c) The ->n_exclusive is the number of exclusive ops that have been submitted and queued or are in progress. It is decremented by fscache_op_complete() and by fscache_cancel_op(). fscache_put_operation() and fscache_operation_gc() now no longer try to clean up ->n_exclusive and ->n_in_progress. That was leading to double decrements against fscache_cancel_op(). fscache_cancel_op() now no longer decrements ->n_ops. That was leading to double decrements against fscache_put_operation(). fscache_submit_exclusive_op() now decides whether it has to queue an op based on ->n_in_progress being > 0 rather than ->n_ops > 0 as the latter will persist in being true even after all preceding operations have been cancelled or completed. Furthermore, if an object is active and there are runnable ops against it, there must be at least one op running. (4) Add a remaining-pages counter (n_pages) to struct fscache_retrieval and provide a function to record completion of the pages as they complete. When n_pages reaches 0, the operation is deemed to be complete and fscache_op_complete() is called. Add calls to fscache_retrieval_complete() anywhere we've finished with a page we've been given to read or allocate for. This includes places where we just return pages to the netfs for reading from the server and where accessing the cache fails and we discard the proposed netfs page. The bugs in the unfixed state management manifest themselves as oopses like the following where the operation completion gets out of sync with return of the cookie by the netfs. This is possible because the cache unlocks and returns all the netfs pages before recording its completion - which means that there's nothing to stop the netfs discarding them and returning the cookie. FS-Cache: Cookie 'NFS.fh' still has outstanding reads ------------[ cut here ]------------ kernel BUG at fs/fscache/cookie.c:519! invalid opcode: 0000 [#1] SMP CPU 1 Modules linked in: cachefiles nfs fscache auth_rpcgss nfs_acl lockd sunrpc Pid: 400, comm: kswapd0 Not tainted 3.1.0-rc7-fsdevel+ #1090 /DG965RY RIP: 0010:[<ffffffffa007050a>] [<ffffffffa007050a>] __fscache_relinquish_cookie+0x170/0x343 [fscache] RSP: 0018:ffff8800368cfb00 EFLAGS: 00010282 RAX: 000000000000003c RBX: ffff880023cc8790 RCX: 0000000000000000 RDX: 0000000000002f2e RSI: 0000000000000001 RDI: ffffffff813ab86c RBP: ffff8800368cfb50 R08: 0000000000000002 R09: 0000000000000000 R10: ffff88003a1b7890 R11: ffff88001df6e488 R12: ffff880023d8ed98 R13: ffff880023cc8798 R14: 0000000000000004 R15: ffff88003b8bf370 FS: 0000000000000000(0000) GS:ffff88003bd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00000000008ba008 CR3: 0000000023d93000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process kswapd0 (pid: 400, threadinfo ffff8800368ce000, task ffff88003b8bf040) Stack: ffff88003b8bf040 ffff88001df6e528 ffff88001df6e528 ffffffffa00b46b0 ffff88003b8bf040 ffff88001df6e488 ffff88001df6e620 ffffffffa00b46b0 ffff88001ebd04c8 0000000000000004 ffff8800368cfb70 ffffffffa00b2c91 Call Trace: [<ffffffffa00b2c91>] nfs_fscache_release_inode_cookie+0x3b/0x47 [nfs] [<ffffffffa008f25f>] nfs_clear_inode+0x3c/0x41 [nfs] [<ffffffffa0090df1>] nfs4_evict_inode+0x2f/0x33 [nfs] [<ffffffff810d8d47>] evict+0xa1/0x15c [<ffffffff810d8e2e>] dispose_list+0x2c/0x38 [<ffffffff810d9ebd>] prune_icache_sb+0x28c/0x29b [<ffffffff810c56b7>] prune_super+0xd5/0x140 [<ffffffff8109b615>] shrink_slab+0x102/0x1ab [<ffffffff8109d690>] balance_pgdat+0x2f2/0x595 [<ffffffff8103e009>] ? process_timeout+0xb/0xb [<ffffffff8109dba3>] kswapd+0x270/0x289 [<ffffffff8104c5ea>] ? __init_waitqueue_head+0x46/0x46 [<ffffffff8109d933>] ? balance_pgdat+0x595/0x595 [<ffffffff8104bf7a>] kthread+0x7f/0x87 [<ffffffff813ad6b4>] kernel_thread_helper+0x4/0x10 [<ffffffff81026b98>] ? finish_task_switch+0x45/0xc0 [<ffffffff813abcdd>] ? retint_restore_args+0xe/0xe [<ffffffff8104befb>] ? __init_kthread_worker+0x53/0x53 [<ffffffff813ad6b0>] ? gs_change+0xb/0xb Signed-off-by: David Howells <dhowells@redhat.com>
2012-12-21 05:52:35 +08:00
enum fscache_operation_state state;
atomic_t usage;
unsigned debug_id; /* debugging ID */
/* operation processor callback
* - can be NULL if FSCACHE_OP_WAITING is going to be used to perform
* the op in a non-pool thread */
fscache_operation_processor_t processor;
/* operation releaser */
fscache_operation_release_t release;
};
extern atomic_t fscache_op_debug_id;
extern void fscache_op_work_func(struct work_struct *work);
extern void fscache_enqueue_operation(struct fscache_operation *);
extern void fscache_op_complete(struct fscache_operation *, bool);
extern void fscache_put_operation(struct fscache_operation *);
/**
* fscache_operation_init - Do basic initialisation of an operation
* @op: The operation to initialise
* @release: The release function to assign
*
* Do basic initialisation of an operation. The caller must still set flags,
* object and processor if needed.
*/
static inline void fscache_operation_init(struct fscache_operation *op,
fscache_operation_processor_t processor,
fscache_operation_release_t release)
{
INIT_WORK(&op->work, fscache_op_work_func);
atomic_set(&op->usage, 1);
FS-Cache: Fix operation state management and accounting Fix the state management of internal fscache operations and the accounting of what operations are in what states. This is done by: (1) Give struct fscache_operation a enum variable that directly represents the state it's currently in, rather than spreading this knowledge over a bunch of flags, who's processing the operation at the moment and whether it is queued or not. This makes it easier to write assertions to check the state at various points and to prevent invalid state transitions. (2) Add an 'operation complete' state and supply a function to indicate the completion of an operation (fscache_op_complete()) and make things call it. The final call to fscache_put_operation() can then check that an op in the appropriate state (complete or cancelled). (3) Adjust the use of object->n_ops, ->n_in_progress, ->n_exclusive to better govern the state of an object: (a) The ->n_ops is now the number of extant operations on the object and is now decremented by fscache_put_operation() only. (b) The ->n_in_progress is simply the number of objects that have been taken off of the object's pending queue for the purposes of being run. This is decremented by fscache_op_complete() only. (c) The ->n_exclusive is the number of exclusive ops that have been submitted and queued or are in progress. It is decremented by fscache_op_complete() and by fscache_cancel_op(). fscache_put_operation() and fscache_operation_gc() now no longer try to clean up ->n_exclusive and ->n_in_progress. That was leading to double decrements against fscache_cancel_op(). fscache_cancel_op() now no longer decrements ->n_ops. That was leading to double decrements against fscache_put_operation(). fscache_submit_exclusive_op() now decides whether it has to queue an op based on ->n_in_progress being > 0 rather than ->n_ops > 0 as the latter will persist in being true even after all preceding operations have been cancelled or completed. Furthermore, if an object is active and there are runnable ops against it, there must be at least one op running. (4) Add a remaining-pages counter (n_pages) to struct fscache_retrieval and provide a function to record completion of the pages as they complete. When n_pages reaches 0, the operation is deemed to be complete and fscache_op_complete() is called. Add calls to fscache_retrieval_complete() anywhere we've finished with a page we've been given to read or allocate for. This includes places where we just return pages to the netfs for reading from the server and where accessing the cache fails and we discard the proposed netfs page. The bugs in the unfixed state management manifest themselves as oopses like the following where the operation completion gets out of sync with return of the cookie by the netfs. This is possible because the cache unlocks and returns all the netfs pages before recording its completion - which means that there's nothing to stop the netfs discarding them and returning the cookie. FS-Cache: Cookie 'NFS.fh' still has outstanding reads ------------[ cut here ]------------ kernel BUG at fs/fscache/cookie.c:519! invalid opcode: 0000 [#1] SMP CPU 1 Modules linked in: cachefiles nfs fscache auth_rpcgss nfs_acl lockd sunrpc Pid: 400, comm: kswapd0 Not tainted 3.1.0-rc7-fsdevel+ #1090 /DG965RY RIP: 0010:[<ffffffffa007050a>] [<ffffffffa007050a>] __fscache_relinquish_cookie+0x170/0x343 [fscache] RSP: 0018:ffff8800368cfb00 EFLAGS: 00010282 RAX: 000000000000003c RBX: ffff880023cc8790 RCX: 0000000000000000 RDX: 0000000000002f2e RSI: 0000000000000001 RDI: ffffffff813ab86c RBP: ffff8800368cfb50 R08: 0000000000000002 R09: 0000000000000000 R10: ffff88003a1b7890 R11: ffff88001df6e488 R12: ffff880023d8ed98 R13: ffff880023cc8798 R14: 0000000000000004 R15: ffff88003b8bf370 FS: 0000000000000000(0000) GS:ffff88003bd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00000000008ba008 CR3: 0000000023d93000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process kswapd0 (pid: 400, threadinfo ffff8800368ce000, task ffff88003b8bf040) Stack: ffff88003b8bf040 ffff88001df6e528 ffff88001df6e528 ffffffffa00b46b0 ffff88003b8bf040 ffff88001df6e488 ffff88001df6e620 ffffffffa00b46b0 ffff88001ebd04c8 0000000000000004 ffff8800368cfb70 ffffffffa00b2c91 Call Trace: [<ffffffffa00b2c91>] nfs_fscache_release_inode_cookie+0x3b/0x47 [nfs] [<ffffffffa008f25f>] nfs_clear_inode+0x3c/0x41 [nfs] [<ffffffffa0090df1>] nfs4_evict_inode+0x2f/0x33 [nfs] [<ffffffff810d8d47>] evict+0xa1/0x15c [<ffffffff810d8e2e>] dispose_list+0x2c/0x38 [<ffffffff810d9ebd>] prune_icache_sb+0x28c/0x29b [<ffffffff810c56b7>] prune_super+0xd5/0x140 [<ffffffff8109b615>] shrink_slab+0x102/0x1ab [<ffffffff8109d690>] balance_pgdat+0x2f2/0x595 [<ffffffff8103e009>] ? process_timeout+0xb/0xb [<ffffffff8109dba3>] kswapd+0x270/0x289 [<ffffffff8104c5ea>] ? __init_waitqueue_head+0x46/0x46 [<ffffffff8109d933>] ? balance_pgdat+0x595/0x595 [<ffffffff8104bf7a>] kthread+0x7f/0x87 [<ffffffff813ad6b4>] kernel_thread_helper+0x4/0x10 [<ffffffff81026b98>] ? finish_task_switch+0x45/0xc0 [<ffffffff813abcdd>] ? retint_restore_args+0xe/0xe [<ffffffff8104befb>] ? __init_kthread_worker+0x53/0x53 [<ffffffff813ad6b0>] ? gs_change+0xb/0xb Signed-off-by: David Howells <dhowells@redhat.com>
2012-12-21 05:52:35 +08:00
op->state = FSCACHE_OP_ST_INITIALISED;
op->debug_id = atomic_inc_return(&fscache_op_debug_id);
op->processor = processor;
op->release = release;
INIT_LIST_HEAD(&op->pend_link);
}
/*
* data read operation
*/
struct fscache_retrieval {
struct fscache_operation op;
struct address_space *mapping; /* netfs pages */
fscache_rw_complete_t end_io_func; /* function to call on I/O completion */
void *context; /* netfs read context (pinned) */
struct list_head to_do; /* list of things to be done by the backend */
unsigned long start_time; /* time at which retrieval started */
FS-Cache: The retrieval remaining-pages counter needs to be atomic_t struct fscache_retrieval contains a count of the number of pages that still need some processing (n_pages). This is decremented as the pages are processed. However, this needs to be atomic as fscache_retrieval_complete() (I think) just occasionally may be called from cachefiles_read_backing_file() and cachefiles_read_copier() simultaneously. This happens when an fscache_read_or_alloc_pages() request containing a lot of pages (say a couple of hundred) is being processed. The read on each backing page is dispatched individually because we need to insert a monitor into the waitqueue to catch when the read completes. However, under low-memory conditions, we might be forced to wait in the allocator - and this gives the I/O on the backing page a chance to complete first. When the I/O completes, fscache_enqueue_retrieval() chucks the retrieval onto the workqueue without waiting for the operation to finish the initial I/O dispatch (we want to release any pages we can as soon as we can), thus both can end up running simultaneously and potentially attempting to partially complete the retrieval simultaneously (ENOMEM may occur, backing pages may already be in the page cache). This was demonstrated by parallelling the non-atomic counter with an atomic counter and printing both of them when the assertion fails. At this point, the atomic counter has reached zero, but the non-atomic counter has not. To fix this, make the counter an atomic_t. This results in the following bug appearing FS-Cache: Assertion failed 3 == 5 is false ------------[ cut here ]------------ kernel BUG at fs/fscache/operation.c:421! or FS-Cache: Assertion failed 3 == 5 is false ------------[ cut here ]------------ kernel BUG at fs/fscache/operation.c:414! With a backtrace like the following: RIP: 0010:[<ffffffffa0211b1d>] fscache_put_operation+0x1ad/0x240 [fscache] Call Trace: [<ffffffffa0213185>] fscache_retrieval_work+0x55/0x270 [fscache] [<ffffffffa0213130>] ? fscache_retrieval_work+0x0/0x270 [fscache] [<ffffffff81090b10>] worker_thread+0x170/0x2a0 [<ffffffff81096d10>] ? autoremove_wake_function+0x0/0x40 [<ffffffff810909a0>] ? worker_thread+0x0/0x2a0 [<ffffffff81096966>] kthread+0x96/0xa0 [<ffffffff8100c0ca>] child_rip+0xa/0x20 [<ffffffff810968d0>] ? kthread+0x0/0xa0 [<ffffffff8100c0c0>] ? child_rip+0x0/0x20 Signed-off-by: David Howells <dhowells@redhat.com> Reviewed-and-tested-By: Milosz Tanski <milosz@adfin.com> Acked-by: Jeff Layton <jlayton@redhat.com>
2013-05-21 20:44:15 +08:00
atomic_t n_pages; /* number of pages to be retrieved */
};
typedef int (*fscache_page_retrieval_func_t)(struct fscache_retrieval *op,
struct page *page,
gfp_t gfp);
typedef int (*fscache_pages_retrieval_func_t)(struct fscache_retrieval *op,
struct list_head *pages,
unsigned *nr_pages,
gfp_t gfp);
/**
* fscache_get_retrieval - Get an extra reference on a retrieval operation
* @op: The retrieval operation to get a reference on
*
* Get an extra reference on a retrieval operation.
*/
static inline
struct fscache_retrieval *fscache_get_retrieval(struct fscache_retrieval *op)
{
atomic_inc(&op->op.usage);
return op;
}
/**
* fscache_enqueue_retrieval - Enqueue a retrieval operation for processing
* @op: The retrieval operation affected
*
* Enqueue a retrieval operation for processing by the FS-Cache thread pool.
*/
static inline void fscache_enqueue_retrieval(struct fscache_retrieval *op)
{
fscache_enqueue_operation(&op->op);
}
FS-Cache: Fix operation state management and accounting Fix the state management of internal fscache operations and the accounting of what operations are in what states. This is done by: (1) Give struct fscache_operation a enum variable that directly represents the state it's currently in, rather than spreading this knowledge over a bunch of flags, who's processing the operation at the moment and whether it is queued or not. This makes it easier to write assertions to check the state at various points and to prevent invalid state transitions. (2) Add an 'operation complete' state and supply a function to indicate the completion of an operation (fscache_op_complete()) and make things call it. The final call to fscache_put_operation() can then check that an op in the appropriate state (complete or cancelled). (3) Adjust the use of object->n_ops, ->n_in_progress, ->n_exclusive to better govern the state of an object: (a) The ->n_ops is now the number of extant operations on the object and is now decremented by fscache_put_operation() only. (b) The ->n_in_progress is simply the number of objects that have been taken off of the object's pending queue for the purposes of being run. This is decremented by fscache_op_complete() only. (c) The ->n_exclusive is the number of exclusive ops that have been submitted and queued or are in progress. It is decremented by fscache_op_complete() and by fscache_cancel_op(). fscache_put_operation() and fscache_operation_gc() now no longer try to clean up ->n_exclusive and ->n_in_progress. That was leading to double decrements against fscache_cancel_op(). fscache_cancel_op() now no longer decrements ->n_ops. That was leading to double decrements against fscache_put_operation(). fscache_submit_exclusive_op() now decides whether it has to queue an op based on ->n_in_progress being > 0 rather than ->n_ops > 0 as the latter will persist in being true even after all preceding operations have been cancelled or completed. Furthermore, if an object is active and there are runnable ops against it, there must be at least one op running. (4) Add a remaining-pages counter (n_pages) to struct fscache_retrieval and provide a function to record completion of the pages as they complete. When n_pages reaches 0, the operation is deemed to be complete and fscache_op_complete() is called. Add calls to fscache_retrieval_complete() anywhere we've finished with a page we've been given to read or allocate for. This includes places where we just return pages to the netfs for reading from the server and where accessing the cache fails and we discard the proposed netfs page. The bugs in the unfixed state management manifest themselves as oopses like the following where the operation completion gets out of sync with return of the cookie by the netfs. This is possible because the cache unlocks and returns all the netfs pages before recording its completion - which means that there's nothing to stop the netfs discarding them and returning the cookie. FS-Cache: Cookie 'NFS.fh' still has outstanding reads ------------[ cut here ]------------ kernel BUG at fs/fscache/cookie.c:519! invalid opcode: 0000 [#1] SMP CPU 1 Modules linked in: cachefiles nfs fscache auth_rpcgss nfs_acl lockd sunrpc Pid: 400, comm: kswapd0 Not tainted 3.1.0-rc7-fsdevel+ #1090 /DG965RY RIP: 0010:[<ffffffffa007050a>] [<ffffffffa007050a>] __fscache_relinquish_cookie+0x170/0x343 [fscache] RSP: 0018:ffff8800368cfb00 EFLAGS: 00010282 RAX: 000000000000003c RBX: ffff880023cc8790 RCX: 0000000000000000 RDX: 0000000000002f2e RSI: 0000000000000001 RDI: ffffffff813ab86c RBP: ffff8800368cfb50 R08: 0000000000000002 R09: 0000000000000000 R10: ffff88003a1b7890 R11: ffff88001df6e488 R12: ffff880023d8ed98 R13: ffff880023cc8798 R14: 0000000000000004 R15: ffff88003b8bf370 FS: 0000000000000000(0000) GS:ffff88003bd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00000000008ba008 CR3: 0000000023d93000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process kswapd0 (pid: 400, threadinfo ffff8800368ce000, task ffff88003b8bf040) Stack: ffff88003b8bf040 ffff88001df6e528 ffff88001df6e528 ffffffffa00b46b0 ffff88003b8bf040 ffff88001df6e488 ffff88001df6e620 ffffffffa00b46b0 ffff88001ebd04c8 0000000000000004 ffff8800368cfb70 ffffffffa00b2c91 Call Trace: [<ffffffffa00b2c91>] nfs_fscache_release_inode_cookie+0x3b/0x47 [nfs] [<ffffffffa008f25f>] nfs_clear_inode+0x3c/0x41 [nfs] [<ffffffffa0090df1>] nfs4_evict_inode+0x2f/0x33 [nfs] [<ffffffff810d8d47>] evict+0xa1/0x15c [<ffffffff810d8e2e>] dispose_list+0x2c/0x38 [<ffffffff810d9ebd>] prune_icache_sb+0x28c/0x29b [<ffffffff810c56b7>] prune_super+0xd5/0x140 [<ffffffff8109b615>] shrink_slab+0x102/0x1ab [<ffffffff8109d690>] balance_pgdat+0x2f2/0x595 [<ffffffff8103e009>] ? process_timeout+0xb/0xb [<ffffffff8109dba3>] kswapd+0x270/0x289 [<ffffffff8104c5ea>] ? __init_waitqueue_head+0x46/0x46 [<ffffffff8109d933>] ? balance_pgdat+0x595/0x595 [<ffffffff8104bf7a>] kthread+0x7f/0x87 [<ffffffff813ad6b4>] kernel_thread_helper+0x4/0x10 [<ffffffff81026b98>] ? finish_task_switch+0x45/0xc0 [<ffffffff813abcdd>] ? retint_restore_args+0xe/0xe [<ffffffff8104befb>] ? __init_kthread_worker+0x53/0x53 [<ffffffff813ad6b0>] ? gs_change+0xb/0xb Signed-off-by: David Howells <dhowells@redhat.com>
2012-12-21 05:52:35 +08:00
/**
* fscache_retrieval_complete - Record (partial) completion of a retrieval
* @op: The retrieval operation affected
* @n_pages: The number of pages to account for
*/
static inline void fscache_retrieval_complete(struct fscache_retrieval *op,
int n_pages)
{
FS-Cache: The retrieval remaining-pages counter needs to be atomic_t struct fscache_retrieval contains a count of the number of pages that still need some processing (n_pages). This is decremented as the pages are processed. However, this needs to be atomic as fscache_retrieval_complete() (I think) just occasionally may be called from cachefiles_read_backing_file() and cachefiles_read_copier() simultaneously. This happens when an fscache_read_or_alloc_pages() request containing a lot of pages (say a couple of hundred) is being processed. The read on each backing page is dispatched individually because we need to insert a monitor into the waitqueue to catch when the read completes. However, under low-memory conditions, we might be forced to wait in the allocator - and this gives the I/O on the backing page a chance to complete first. When the I/O completes, fscache_enqueue_retrieval() chucks the retrieval onto the workqueue without waiting for the operation to finish the initial I/O dispatch (we want to release any pages we can as soon as we can), thus both can end up running simultaneously and potentially attempting to partially complete the retrieval simultaneously (ENOMEM may occur, backing pages may already be in the page cache). This was demonstrated by parallelling the non-atomic counter with an atomic counter and printing both of them when the assertion fails. At this point, the atomic counter has reached zero, but the non-atomic counter has not. To fix this, make the counter an atomic_t. This results in the following bug appearing FS-Cache: Assertion failed 3 == 5 is false ------------[ cut here ]------------ kernel BUG at fs/fscache/operation.c:421! or FS-Cache: Assertion failed 3 == 5 is false ------------[ cut here ]------------ kernel BUG at fs/fscache/operation.c:414! With a backtrace like the following: RIP: 0010:[<ffffffffa0211b1d>] fscache_put_operation+0x1ad/0x240 [fscache] Call Trace: [<ffffffffa0213185>] fscache_retrieval_work+0x55/0x270 [fscache] [<ffffffffa0213130>] ? fscache_retrieval_work+0x0/0x270 [fscache] [<ffffffff81090b10>] worker_thread+0x170/0x2a0 [<ffffffff81096d10>] ? autoremove_wake_function+0x0/0x40 [<ffffffff810909a0>] ? worker_thread+0x0/0x2a0 [<ffffffff81096966>] kthread+0x96/0xa0 [<ffffffff8100c0ca>] child_rip+0xa/0x20 [<ffffffff810968d0>] ? kthread+0x0/0xa0 [<ffffffff8100c0c0>] ? child_rip+0x0/0x20 Signed-off-by: David Howells <dhowells@redhat.com> Reviewed-and-tested-By: Milosz Tanski <milosz@adfin.com> Acked-by: Jeff Layton <jlayton@redhat.com>
2013-05-21 20:44:15 +08:00
atomic_sub(n_pages, &op->n_pages);
if (atomic_read(&op->n_pages) <= 0)
fscache_op_complete(&op->op, true);
FS-Cache: Fix operation state management and accounting Fix the state management of internal fscache operations and the accounting of what operations are in what states. This is done by: (1) Give struct fscache_operation a enum variable that directly represents the state it's currently in, rather than spreading this knowledge over a bunch of flags, who's processing the operation at the moment and whether it is queued or not. This makes it easier to write assertions to check the state at various points and to prevent invalid state transitions. (2) Add an 'operation complete' state and supply a function to indicate the completion of an operation (fscache_op_complete()) and make things call it. The final call to fscache_put_operation() can then check that an op in the appropriate state (complete or cancelled). (3) Adjust the use of object->n_ops, ->n_in_progress, ->n_exclusive to better govern the state of an object: (a) The ->n_ops is now the number of extant operations on the object and is now decremented by fscache_put_operation() only. (b) The ->n_in_progress is simply the number of objects that have been taken off of the object's pending queue for the purposes of being run. This is decremented by fscache_op_complete() only. (c) The ->n_exclusive is the number of exclusive ops that have been submitted and queued or are in progress. It is decremented by fscache_op_complete() and by fscache_cancel_op(). fscache_put_operation() and fscache_operation_gc() now no longer try to clean up ->n_exclusive and ->n_in_progress. That was leading to double decrements against fscache_cancel_op(). fscache_cancel_op() now no longer decrements ->n_ops. That was leading to double decrements against fscache_put_operation(). fscache_submit_exclusive_op() now decides whether it has to queue an op based on ->n_in_progress being > 0 rather than ->n_ops > 0 as the latter will persist in being true even after all preceding operations have been cancelled or completed. Furthermore, if an object is active and there are runnable ops against it, there must be at least one op running. (4) Add a remaining-pages counter (n_pages) to struct fscache_retrieval and provide a function to record completion of the pages as they complete. When n_pages reaches 0, the operation is deemed to be complete and fscache_op_complete() is called. Add calls to fscache_retrieval_complete() anywhere we've finished with a page we've been given to read or allocate for. This includes places where we just return pages to the netfs for reading from the server and where accessing the cache fails and we discard the proposed netfs page. The bugs in the unfixed state management manifest themselves as oopses like the following where the operation completion gets out of sync with return of the cookie by the netfs. This is possible because the cache unlocks and returns all the netfs pages before recording its completion - which means that there's nothing to stop the netfs discarding them and returning the cookie. FS-Cache: Cookie 'NFS.fh' still has outstanding reads ------------[ cut here ]------------ kernel BUG at fs/fscache/cookie.c:519! invalid opcode: 0000 [#1] SMP CPU 1 Modules linked in: cachefiles nfs fscache auth_rpcgss nfs_acl lockd sunrpc Pid: 400, comm: kswapd0 Not tainted 3.1.0-rc7-fsdevel+ #1090 /DG965RY RIP: 0010:[<ffffffffa007050a>] [<ffffffffa007050a>] __fscache_relinquish_cookie+0x170/0x343 [fscache] RSP: 0018:ffff8800368cfb00 EFLAGS: 00010282 RAX: 000000000000003c RBX: ffff880023cc8790 RCX: 0000000000000000 RDX: 0000000000002f2e RSI: 0000000000000001 RDI: ffffffff813ab86c RBP: ffff8800368cfb50 R08: 0000000000000002 R09: 0000000000000000 R10: ffff88003a1b7890 R11: ffff88001df6e488 R12: ffff880023d8ed98 R13: ffff880023cc8798 R14: 0000000000000004 R15: ffff88003b8bf370 FS: 0000000000000000(0000) GS:ffff88003bd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00000000008ba008 CR3: 0000000023d93000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process kswapd0 (pid: 400, threadinfo ffff8800368ce000, task ffff88003b8bf040) Stack: ffff88003b8bf040 ffff88001df6e528 ffff88001df6e528 ffffffffa00b46b0 ffff88003b8bf040 ffff88001df6e488 ffff88001df6e620 ffffffffa00b46b0 ffff88001ebd04c8 0000000000000004 ffff8800368cfb70 ffffffffa00b2c91 Call Trace: [<ffffffffa00b2c91>] nfs_fscache_release_inode_cookie+0x3b/0x47 [nfs] [<ffffffffa008f25f>] nfs_clear_inode+0x3c/0x41 [nfs] [<ffffffffa0090df1>] nfs4_evict_inode+0x2f/0x33 [nfs] [<ffffffff810d8d47>] evict+0xa1/0x15c [<ffffffff810d8e2e>] dispose_list+0x2c/0x38 [<ffffffff810d9ebd>] prune_icache_sb+0x28c/0x29b [<ffffffff810c56b7>] prune_super+0xd5/0x140 [<ffffffff8109b615>] shrink_slab+0x102/0x1ab [<ffffffff8109d690>] balance_pgdat+0x2f2/0x595 [<ffffffff8103e009>] ? process_timeout+0xb/0xb [<ffffffff8109dba3>] kswapd+0x270/0x289 [<ffffffff8104c5ea>] ? __init_waitqueue_head+0x46/0x46 [<ffffffff8109d933>] ? balance_pgdat+0x595/0x595 [<ffffffff8104bf7a>] kthread+0x7f/0x87 [<ffffffff813ad6b4>] kernel_thread_helper+0x4/0x10 [<ffffffff81026b98>] ? finish_task_switch+0x45/0xc0 [<ffffffff813abcdd>] ? retint_restore_args+0xe/0xe [<ffffffff8104befb>] ? __init_kthread_worker+0x53/0x53 [<ffffffff813ad6b0>] ? gs_change+0xb/0xb Signed-off-by: David Howells <dhowells@redhat.com>
2012-12-21 05:52:35 +08:00
}
/**
* fscache_put_retrieval - Drop a reference to a retrieval operation
* @op: The retrieval operation affected
*
* Drop a reference to a retrieval operation.
*/
static inline void fscache_put_retrieval(struct fscache_retrieval *op)
{
fscache_put_operation(&op->op);
}
/*
* cached page storage work item
* - used to do three things:
* - batch writes to the cache
* - do cache writes asynchronously
* - defer writes until cache object lookup completion
*/
struct fscache_storage {
struct fscache_operation op;
pgoff_t store_limit; /* don't write more than this */
};
/*
* cache operations
*/
struct fscache_cache_ops {
/* name of cache provider */
const char *name;
/* allocate an object record for a cookie */
struct fscache_object *(*alloc_object)(struct fscache_cache *cache,
struct fscache_cookie *cookie);
CacheFiles: Catch an overly long wait for an old active object Catch an overly long wait for an old, dying active object when we want to replace it with a new one. The probability is that all the slow-work threads are hogged, and the delete can't get a look in. What we do instead is: (1) if there's nothing in the slow work queue, we sleep until either the dying object has finished dying or there is something in the slow work queue behind which we can queue our object. (2) if there is something in the slow work queue, we return ETIMEDOUT to fscache_lookup_object(), which then puts us back on the slow work queue, presumably behind the deletion that we're blocked by. We are then deferred for a while until we work our way back through the queue - without blocking a slow-work thread unnecessarily. A backtrace similar to the following may appear in the log without this patch: INFO: task kslowd004:5711 blocked for more than 120 seconds. "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. kslowd004 D 0000000000000000 0 5711 2 0x00000080 ffff88000340bb80 0000000000000046 ffff88002550d000 0000000000000000 ffff88002550d000 0000000000000007 ffff88000340bfd8 ffff88002550d2a8 000000000000ddf0 00000000000118c0 00000000000118c0 ffff88002550d2a8 Call Trace: [<ffffffff81058e21>] ? trace_hardirqs_on+0xd/0xf [<ffffffffa011c4d8>] ? cachefiles_wait_bit+0x0/0xd [cachefiles] [<ffffffffa011c4e1>] cachefiles_wait_bit+0x9/0xd [cachefiles] [<ffffffff81353153>] __wait_on_bit+0x43/0x76 [<ffffffff8111ae39>] ? ext3_xattr_get+0x1ec/0x270 [<ffffffff813531ef>] out_of_line_wait_on_bit+0x69/0x74 [<ffffffffa011c4d8>] ? cachefiles_wait_bit+0x0/0xd [cachefiles] [<ffffffff8104c125>] ? wake_bit_function+0x0/0x2e [<ffffffffa011bc79>] cachefiles_mark_object_active+0x203/0x23b [cachefiles] [<ffffffffa011c209>] cachefiles_walk_to_object+0x558/0x827 [cachefiles] [<ffffffffa011a429>] cachefiles_lookup_object+0xac/0x12a [cachefiles] [<ffffffffa00aa1e9>] fscache_lookup_object+0x1c7/0x214 [fscache] [<ffffffffa00aafc5>] fscache_object_state_machine+0xa5/0x52d [fscache] [<ffffffffa00ab4ac>] fscache_object_slow_work_execute+0x5f/0xa0 [fscache] [<ffffffff81082093>] slow_work_execute+0x18f/0x2d1 [<ffffffff8108239a>] slow_work_thread+0x1c5/0x308 [<ffffffff8104c0f1>] ? autoremove_wake_function+0x0/0x34 [<ffffffff810821d5>] ? slow_work_thread+0x0/0x308 [<ffffffff8104be91>] kthread+0x7a/0x82 [<ffffffff8100beda>] child_rip+0xa/0x20 [<ffffffff8100b87c>] ? restore_args+0x0/0x30 [<ffffffff8104be17>] ? kthread+0x0/0x82 [<ffffffff8100bed0>] ? child_rip+0x0/0x20 1 lock held by kslowd004/5711: #0: (&sb->s_type->i_mutex_key#7/1){+.+.+.}, at: [<ffffffffa011be64>] cachefiles_walk_to_object+0x1b3/0x827 [cachefiles] Signed-off-by: David Howells <dhowells@redhat.com>
2009-11-20 02:12:05 +08:00
/* look up the object for a cookie
* - return -ETIMEDOUT to be requeued
*/
int (*lookup_object)(struct fscache_object *object);
/* finished looking up */
void (*lookup_complete)(struct fscache_object *object);
/* increment the usage count on this object (may fail if unmounting) */
struct fscache_object *(*grab_object)(struct fscache_object *object);
/* pin an object in the cache */
int (*pin_object)(struct fscache_object *object);
/* unpin an object in the cache */
void (*unpin_object)(struct fscache_object *object);
/* check the consistency between the backing cache and the FS-Cache
* cookie */
bool (*check_consistency)(struct fscache_operation *op);
/* store the updated auxiliary data on an object */
void (*update_object)(struct fscache_object *object);
/* Invalidate an object */
void (*invalidate_object)(struct fscache_operation *op);
/* discard the resources pinned by an object and effect retirement if
* necessary */
void (*drop_object)(struct fscache_object *object);
/* dispose of a reference to an object */
void (*put_object)(struct fscache_object *object);
/* sync a cache */
void (*sync_cache)(struct fscache_cache *cache);
/* notification that the attributes of a non-index object (such as
* i_size) have changed */
int (*attr_changed)(struct fscache_object *object);
/* reserve space for an object's data and associated metadata */
int (*reserve_space)(struct fscache_object *object, loff_t i_size);
/* request a backing block for a page be read or allocated in the
* cache */
fscache_page_retrieval_func_t read_or_alloc_page;
/* request backing blocks for a list of pages be read or allocated in
* the cache */
fscache_pages_retrieval_func_t read_or_alloc_pages;
/* request a backing block for a page be allocated in the cache so that
* it can be written directly */
fscache_page_retrieval_func_t allocate_page;
/* request backing blocks for pages be allocated in the cache so that
* they can be written directly */
fscache_pages_retrieval_func_t allocate_pages;
/* write a page to its backing block in the cache */
int (*write_page)(struct fscache_storage *op, struct page *page);
/* detach backing block from a page (optional)
* - must release the cookie lock before returning
* - may sleep
*/
void (*uncache_page)(struct fscache_object *object,
struct page *page);
/* dissociate a cache from all the pages it was backing */
void (*dissociate_pages)(struct fscache_cache *cache);
};
extern struct fscache_cookie fscache_fsdef_index;
/*
* Event list for fscache_object::{event_mask,events}
*/
enum {
FS-Cache: Fix object state machine to have separate work and wait states Fix object state machine to have separate work and wait states as that makes it easier to envision. There are now three kinds of state: (1) Work state. This is an execution state. No event processing is performed by a work state. The function attached to a work state returns a pointer indicating the next state to which the OSM should transition. Returning NO_TRANSIT repeats the current state, but goes back to the scheduler first. (2) Wait state. This is an event processing state. No execution is performed by a wait state. Wait states are just tables of "if event X occurs, clear it and transition to state Y". The dispatcher returns to the scheduler if none of the events in which the wait state has an interest are currently pending. (3) Out-of-band state. This is a special work state. Transitions to normal states can be overridden when an unexpected event occurs (eg. I/O error). Instead the dispatcher disables and clears the OOB event and transits to the specified work state. This then acts as an ordinary work state, though object->state points to the overridden destination. Returning NO_TRANSIT resumes the overridden transition. In addition, the states have names in their definitions, so there's no need for tables of state names. Further, the EV_REQUEUE event is no longer necessary as that is automatic for work states. Since the states are now separate structs rather than values in an enum, it's not possible to use comparisons other than (non-)equality between them, so use some object->flags to indicate what phase an object is in. The EV_RELEASE, EV_RETIRE and EV_WITHDRAW events have been squished into one (EV_KILL). An object flag now carries the information about retirement. Similarly, the RELEASING, RECYCLING and WITHDRAWING states have been merged into an KILL_OBJECT state and additional states have been added for handling waiting dependent objects (JUMPSTART_DEPS and KILL_DEPENDENTS). A state has also been added for synchronising with parent object initialisation (WAIT_FOR_PARENT) and another for initiating look up (PARENT_READY). Signed-off-by: David Howells <dhowells@redhat.com> Tested-By: Milosz Tanski <milosz@adfin.com> Acked-by: Jeff Layton <jlayton@redhat.com>
2013-05-11 02:50:26 +08:00
FSCACHE_OBJECT_EV_NEW_CHILD, /* T if object has a new child */
FSCACHE_OBJECT_EV_PARENT_READY, /* T if object's parent is ready */
FSCACHE_OBJECT_EV_UPDATE, /* T if object should be updated */
FSCACHE_OBJECT_EV_INVALIDATE, /* T if cache requested object invalidation */
FSCACHE_OBJECT_EV_CLEARED, /* T if accessors all gone */
FSCACHE_OBJECT_EV_ERROR, /* T if fatal error occurred during processing */
FS-Cache: Fix object state machine to have separate work and wait states Fix object state machine to have separate work and wait states as that makes it easier to envision. There are now three kinds of state: (1) Work state. This is an execution state. No event processing is performed by a work state. The function attached to a work state returns a pointer indicating the next state to which the OSM should transition. Returning NO_TRANSIT repeats the current state, but goes back to the scheduler first. (2) Wait state. This is an event processing state. No execution is performed by a wait state. Wait states are just tables of "if event X occurs, clear it and transition to state Y". The dispatcher returns to the scheduler if none of the events in which the wait state has an interest are currently pending. (3) Out-of-band state. This is a special work state. Transitions to normal states can be overridden when an unexpected event occurs (eg. I/O error). Instead the dispatcher disables and clears the OOB event and transits to the specified work state. This then acts as an ordinary work state, though object->state points to the overridden destination. Returning NO_TRANSIT resumes the overridden transition. In addition, the states have names in their definitions, so there's no need for tables of state names. Further, the EV_REQUEUE event is no longer necessary as that is automatic for work states. Since the states are now separate structs rather than values in an enum, it's not possible to use comparisons other than (non-)equality between them, so use some object->flags to indicate what phase an object is in. The EV_RELEASE, EV_RETIRE and EV_WITHDRAW events have been squished into one (EV_KILL). An object flag now carries the information about retirement. Similarly, the RELEASING, RECYCLING and WITHDRAWING states have been merged into an KILL_OBJECT state and additional states have been added for handling waiting dependent objects (JUMPSTART_DEPS and KILL_DEPENDENTS). A state has also been added for synchronising with parent object initialisation (WAIT_FOR_PARENT) and another for initiating look up (PARENT_READY). Signed-off-by: David Howells <dhowells@redhat.com> Tested-By: Milosz Tanski <milosz@adfin.com> Acked-by: Jeff Layton <jlayton@redhat.com>
2013-05-11 02:50:26 +08:00
FSCACHE_OBJECT_EV_KILL, /* T if netfs relinquished or cache withdrew object */
NR_FSCACHE_OBJECT_EVENTS
};
#define FSCACHE_OBJECT_EVENTS_MASK ((1UL << NR_FSCACHE_OBJECT_EVENTS) - 1)
FS-Cache: Fix object state machine to have separate work and wait states Fix object state machine to have separate work and wait states as that makes it easier to envision. There are now three kinds of state: (1) Work state. This is an execution state. No event processing is performed by a work state. The function attached to a work state returns a pointer indicating the next state to which the OSM should transition. Returning NO_TRANSIT repeats the current state, but goes back to the scheduler first. (2) Wait state. This is an event processing state. No execution is performed by a wait state. Wait states are just tables of "if event X occurs, clear it and transition to state Y". The dispatcher returns to the scheduler if none of the events in which the wait state has an interest are currently pending. (3) Out-of-band state. This is a special work state. Transitions to normal states can be overridden when an unexpected event occurs (eg. I/O error). Instead the dispatcher disables and clears the OOB event and transits to the specified work state. This then acts as an ordinary work state, though object->state points to the overridden destination. Returning NO_TRANSIT resumes the overridden transition. In addition, the states have names in their definitions, so there's no need for tables of state names. Further, the EV_REQUEUE event is no longer necessary as that is automatic for work states. Since the states are now separate structs rather than values in an enum, it's not possible to use comparisons other than (non-)equality between them, so use some object->flags to indicate what phase an object is in. The EV_RELEASE, EV_RETIRE and EV_WITHDRAW events have been squished into one (EV_KILL). An object flag now carries the information about retirement. Similarly, the RELEASING, RECYCLING and WITHDRAWING states have been merged into an KILL_OBJECT state and additional states have been added for handling waiting dependent objects (JUMPSTART_DEPS and KILL_DEPENDENTS). A state has also been added for synchronising with parent object initialisation (WAIT_FOR_PARENT) and another for initiating look up (PARENT_READY). Signed-off-by: David Howells <dhowells@redhat.com> Tested-By: Milosz Tanski <milosz@adfin.com> Acked-by: Jeff Layton <jlayton@redhat.com>
2013-05-11 02:50:26 +08:00
/*
* States for object state machine.
*/
struct fscache_transition {
unsigned long events;
const struct fscache_state *transit_to;
};
struct fscache_state {
char name[24];
char short_name[8];
const struct fscache_state *(*work)(struct fscache_object *object,
int event);
const struct fscache_transition transitions[];
};
/*
* on-disk cache file or index handle
*/
struct fscache_object {
FS-Cache: Fix object state machine to have separate work and wait states Fix object state machine to have separate work and wait states as that makes it easier to envision. There are now three kinds of state: (1) Work state. This is an execution state. No event processing is performed by a work state. The function attached to a work state returns a pointer indicating the next state to which the OSM should transition. Returning NO_TRANSIT repeats the current state, but goes back to the scheduler first. (2) Wait state. This is an event processing state. No execution is performed by a wait state. Wait states are just tables of "if event X occurs, clear it and transition to state Y". The dispatcher returns to the scheduler if none of the events in which the wait state has an interest are currently pending. (3) Out-of-band state. This is a special work state. Transitions to normal states can be overridden when an unexpected event occurs (eg. I/O error). Instead the dispatcher disables and clears the OOB event and transits to the specified work state. This then acts as an ordinary work state, though object->state points to the overridden destination. Returning NO_TRANSIT resumes the overridden transition. In addition, the states have names in their definitions, so there's no need for tables of state names. Further, the EV_REQUEUE event is no longer necessary as that is automatic for work states. Since the states are now separate structs rather than values in an enum, it's not possible to use comparisons other than (non-)equality between them, so use some object->flags to indicate what phase an object is in. The EV_RELEASE, EV_RETIRE and EV_WITHDRAW events have been squished into one (EV_KILL). An object flag now carries the information about retirement. Similarly, the RELEASING, RECYCLING and WITHDRAWING states have been merged into an KILL_OBJECT state and additional states have been added for handling waiting dependent objects (JUMPSTART_DEPS and KILL_DEPENDENTS). A state has also been added for synchronising with parent object initialisation (WAIT_FOR_PARENT) and another for initiating look up (PARENT_READY). Signed-off-by: David Howells <dhowells@redhat.com> Tested-By: Milosz Tanski <milosz@adfin.com> Acked-by: Jeff Layton <jlayton@redhat.com>
2013-05-11 02:50:26 +08:00
const struct fscache_state *state; /* Object state machine state */
const struct fscache_transition *oob_table; /* OOB state transition table */
int debug_id; /* debugging ID */
int n_children; /* number of child objects */
FS-Cache: Fix operation state management and accounting Fix the state management of internal fscache operations and the accounting of what operations are in what states. This is done by: (1) Give struct fscache_operation a enum variable that directly represents the state it's currently in, rather than spreading this knowledge over a bunch of flags, who's processing the operation at the moment and whether it is queued or not. This makes it easier to write assertions to check the state at various points and to prevent invalid state transitions. (2) Add an 'operation complete' state and supply a function to indicate the completion of an operation (fscache_op_complete()) and make things call it. The final call to fscache_put_operation() can then check that an op in the appropriate state (complete or cancelled). (3) Adjust the use of object->n_ops, ->n_in_progress, ->n_exclusive to better govern the state of an object: (a) The ->n_ops is now the number of extant operations on the object and is now decremented by fscache_put_operation() only. (b) The ->n_in_progress is simply the number of objects that have been taken off of the object's pending queue for the purposes of being run. This is decremented by fscache_op_complete() only. (c) The ->n_exclusive is the number of exclusive ops that have been submitted and queued or are in progress. It is decremented by fscache_op_complete() and by fscache_cancel_op(). fscache_put_operation() and fscache_operation_gc() now no longer try to clean up ->n_exclusive and ->n_in_progress. That was leading to double decrements against fscache_cancel_op(). fscache_cancel_op() now no longer decrements ->n_ops. That was leading to double decrements against fscache_put_operation(). fscache_submit_exclusive_op() now decides whether it has to queue an op based on ->n_in_progress being > 0 rather than ->n_ops > 0 as the latter will persist in being true even after all preceding operations have been cancelled or completed. Furthermore, if an object is active and there are runnable ops against it, there must be at least one op running. (4) Add a remaining-pages counter (n_pages) to struct fscache_retrieval and provide a function to record completion of the pages as they complete. When n_pages reaches 0, the operation is deemed to be complete and fscache_op_complete() is called. Add calls to fscache_retrieval_complete() anywhere we've finished with a page we've been given to read or allocate for. This includes places where we just return pages to the netfs for reading from the server and where accessing the cache fails and we discard the proposed netfs page. The bugs in the unfixed state management manifest themselves as oopses like the following where the operation completion gets out of sync with return of the cookie by the netfs. This is possible because the cache unlocks and returns all the netfs pages before recording its completion - which means that there's nothing to stop the netfs discarding them and returning the cookie. FS-Cache: Cookie 'NFS.fh' still has outstanding reads ------------[ cut here ]------------ kernel BUG at fs/fscache/cookie.c:519! invalid opcode: 0000 [#1] SMP CPU 1 Modules linked in: cachefiles nfs fscache auth_rpcgss nfs_acl lockd sunrpc Pid: 400, comm: kswapd0 Not tainted 3.1.0-rc7-fsdevel+ #1090 /DG965RY RIP: 0010:[<ffffffffa007050a>] [<ffffffffa007050a>] __fscache_relinquish_cookie+0x170/0x343 [fscache] RSP: 0018:ffff8800368cfb00 EFLAGS: 00010282 RAX: 000000000000003c RBX: ffff880023cc8790 RCX: 0000000000000000 RDX: 0000000000002f2e RSI: 0000000000000001 RDI: ffffffff813ab86c RBP: ffff8800368cfb50 R08: 0000000000000002 R09: 0000000000000000 R10: ffff88003a1b7890 R11: ffff88001df6e488 R12: ffff880023d8ed98 R13: ffff880023cc8798 R14: 0000000000000004 R15: ffff88003b8bf370 FS: 0000000000000000(0000) GS:ffff88003bd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00000000008ba008 CR3: 0000000023d93000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process kswapd0 (pid: 400, threadinfo ffff8800368ce000, task ffff88003b8bf040) Stack: ffff88003b8bf040 ffff88001df6e528 ffff88001df6e528 ffffffffa00b46b0 ffff88003b8bf040 ffff88001df6e488 ffff88001df6e620 ffffffffa00b46b0 ffff88001ebd04c8 0000000000000004 ffff8800368cfb70 ffffffffa00b2c91 Call Trace: [<ffffffffa00b2c91>] nfs_fscache_release_inode_cookie+0x3b/0x47 [nfs] [<ffffffffa008f25f>] nfs_clear_inode+0x3c/0x41 [nfs] [<ffffffffa0090df1>] nfs4_evict_inode+0x2f/0x33 [nfs] [<ffffffff810d8d47>] evict+0xa1/0x15c [<ffffffff810d8e2e>] dispose_list+0x2c/0x38 [<ffffffff810d9ebd>] prune_icache_sb+0x28c/0x29b [<ffffffff810c56b7>] prune_super+0xd5/0x140 [<ffffffff8109b615>] shrink_slab+0x102/0x1ab [<ffffffff8109d690>] balance_pgdat+0x2f2/0x595 [<ffffffff8103e009>] ? process_timeout+0xb/0xb [<ffffffff8109dba3>] kswapd+0x270/0x289 [<ffffffff8104c5ea>] ? __init_waitqueue_head+0x46/0x46 [<ffffffff8109d933>] ? balance_pgdat+0x595/0x595 [<ffffffff8104bf7a>] kthread+0x7f/0x87 [<ffffffff813ad6b4>] kernel_thread_helper+0x4/0x10 [<ffffffff81026b98>] ? finish_task_switch+0x45/0xc0 [<ffffffff813abcdd>] ? retint_restore_args+0xe/0xe [<ffffffff8104befb>] ? __init_kthread_worker+0x53/0x53 [<ffffffff813ad6b0>] ? gs_change+0xb/0xb Signed-off-by: David Howells <dhowells@redhat.com>
2012-12-21 05:52:35 +08:00
int n_ops; /* number of extant ops on object */
int n_obj_ops; /* number of object ops outstanding on object */
int n_in_progress; /* number of ops in progress */
FS-Cache: Fix operation state management and accounting Fix the state management of internal fscache operations and the accounting of what operations are in what states. This is done by: (1) Give struct fscache_operation a enum variable that directly represents the state it's currently in, rather than spreading this knowledge over a bunch of flags, who's processing the operation at the moment and whether it is queued or not. This makes it easier to write assertions to check the state at various points and to prevent invalid state transitions. (2) Add an 'operation complete' state and supply a function to indicate the completion of an operation (fscache_op_complete()) and make things call it. The final call to fscache_put_operation() can then check that an op in the appropriate state (complete or cancelled). (3) Adjust the use of object->n_ops, ->n_in_progress, ->n_exclusive to better govern the state of an object: (a) The ->n_ops is now the number of extant operations on the object and is now decremented by fscache_put_operation() only. (b) The ->n_in_progress is simply the number of objects that have been taken off of the object's pending queue for the purposes of being run. This is decremented by fscache_op_complete() only. (c) The ->n_exclusive is the number of exclusive ops that have been submitted and queued or are in progress. It is decremented by fscache_op_complete() and by fscache_cancel_op(). fscache_put_operation() and fscache_operation_gc() now no longer try to clean up ->n_exclusive and ->n_in_progress. That was leading to double decrements against fscache_cancel_op(). fscache_cancel_op() now no longer decrements ->n_ops. That was leading to double decrements against fscache_put_operation(). fscache_submit_exclusive_op() now decides whether it has to queue an op based on ->n_in_progress being > 0 rather than ->n_ops > 0 as the latter will persist in being true even after all preceding operations have been cancelled or completed. Furthermore, if an object is active and there are runnable ops against it, there must be at least one op running. (4) Add a remaining-pages counter (n_pages) to struct fscache_retrieval and provide a function to record completion of the pages as they complete. When n_pages reaches 0, the operation is deemed to be complete and fscache_op_complete() is called. Add calls to fscache_retrieval_complete() anywhere we've finished with a page we've been given to read or allocate for. This includes places where we just return pages to the netfs for reading from the server and where accessing the cache fails and we discard the proposed netfs page. The bugs in the unfixed state management manifest themselves as oopses like the following where the operation completion gets out of sync with return of the cookie by the netfs. This is possible because the cache unlocks and returns all the netfs pages before recording its completion - which means that there's nothing to stop the netfs discarding them and returning the cookie. FS-Cache: Cookie 'NFS.fh' still has outstanding reads ------------[ cut here ]------------ kernel BUG at fs/fscache/cookie.c:519! invalid opcode: 0000 [#1] SMP CPU 1 Modules linked in: cachefiles nfs fscache auth_rpcgss nfs_acl lockd sunrpc Pid: 400, comm: kswapd0 Not tainted 3.1.0-rc7-fsdevel+ #1090 /DG965RY RIP: 0010:[<ffffffffa007050a>] [<ffffffffa007050a>] __fscache_relinquish_cookie+0x170/0x343 [fscache] RSP: 0018:ffff8800368cfb00 EFLAGS: 00010282 RAX: 000000000000003c RBX: ffff880023cc8790 RCX: 0000000000000000 RDX: 0000000000002f2e RSI: 0000000000000001 RDI: ffffffff813ab86c RBP: ffff8800368cfb50 R08: 0000000000000002 R09: 0000000000000000 R10: ffff88003a1b7890 R11: ffff88001df6e488 R12: ffff880023d8ed98 R13: ffff880023cc8798 R14: 0000000000000004 R15: ffff88003b8bf370 FS: 0000000000000000(0000) GS:ffff88003bd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00000000008ba008 CR3: 0000000023d93000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process kswapd0 (pid: 400, threadinfo ffff8800368ce000, task ffff88003b8bf040) Stack: ffff88003b8bf040 ffff88001df6e528 ffff88001df6e528 ffffffffa00b46b0 ffff88003b8bf040 ffff88001df6e488 ffff88001df6e620 ffffffffa00b46b0 ffff88001ebd04c8 0000000000000004 ffff8800368cfb70 ffffffffa00b2c91 Call Trace: [<ffffffffa00b2c91>] nfs_fscache_release_inode_cookie+0x3b/0x47 [nfs] [<ffffffffa008f25f>] nfs_clear_inode+0x3c/0x41 [nfs] [<ffffffffa0090df1>] nfs4_evict_inode+0x2f/0x33 [nfs] [<ffffffff810d8d47>] evict+0xa1/0x15c [<ffffffff810d8e2e>] dispose_list+0x2c/0x38 [<ffffffff810d9ebd>] prune_icache_sb+0x28c/0x29b [<ffffffff810c56b7>] prune_super+0xd5/0x140 [<ffffffff8109b615>] shrink_slab+0x102/0x1ab [<ffffffff8109d690>] balance_pgdat+0x2f2/0x595 [<ffffffff8103e009>] ? process_timeout+0xb/0xb [<ffffffff8109dba3>] kswapd+0x270/0x289 [<ffffffff8104c5ea>] ? __init_waitqueue_head+0x46/0x46 [<ffffffff8109d933>] ? balance_pgdat+0x595/0x595 [<ffffffff8104bf7a>] kthread+0x7f/0x87 [<ffffffff813ad6b4>] kernel_thread_helper+0x4/0x10 [<ffffffff81026b98>] ? finish_task_switch+0x45/0xc0 [<ffffffff813abcdd>] ? retint_restore_args+0xe/0xe [<ffffffff8104befb>] ? __init_kthread_worker+0x53/0x53 [<ffffffff813ad6b0>] ? gs_change+0xb/0xb Signed-off-by: David Howells <dhowells@redhat.com>
2012-12-21 05:52:35 +08:00
int n_exclusive; /* number of exclusive ops queued or in progress */
atomic_t n_reads; /* number of read ops in progress */
spinlock_t lock; /* state and operations lock */
unsigned long lookup_jif; /* time at which lookup started */
FS-Cache: Fix object state machine to have separate work and wait states Fix object state machine to have separate work and wait states as that makes it easier to envision. There are now three kinds of state: (1) Work state. This is an execution state. No event processing is performed by a work state. The function attached to a work state returns a pointer indicating the next state to which the OSM should transition. Returning NO_TRANSIT repeats the current state, but goes back to the scheduler first. (2) Wait state. This is an event processing state. No execution is performed by a wait state. Wait states are just tables of "if event X occurs, clear it and transition to state Y". The dispatcher returns to the scheduler if none of the events in which the wait state has an interest are currently pending. (3) Out-of-band state. This is a special work state. Transitions to normal states can be overridden when an unexpected event occurs (eg. I/O error). Instead the dispatcher disables and clears the OOB event and transits to the specified work state. This then acts as an ordinary work state, though object->state points to the overridden destination. Returning NO_TRANSIT resumes the overridden transition. In addition, the states have names in their definitions, so there's no need for tables of state names. Further, the EV_REQUEUE event is no longer necessary as that is automatic for work states. Since the states are now separate structs rather than values in an enum, it's not possible to use comparisons other than (non-)equality between them, so use some object->flags to indicate what phase an object is in. The EV_RELEASE, EV_RETIRE and EV_WITHDRAW events have been squished into one (EV_KILL). An object flag now carries the information about retirement. Similarly, the RELEASING, RECYCLING and WITHDRAWING states have been merged into an KILL_OBJECT state and additional states have been added for handling waiting dependent objects (JUMPSTART_DEPS and KILL_DEPENDENTS). A state has also been added for synchronising with parent object initialisation (WAIT_FOR_PARENT) and another for initiating look up (PARENT_READY). Signed-off-by: David Howells <dhowells@redhat.com> Tested-By: Milosz Tanski <milosz@adfin.com> Acked-by: Jeff Layton <jlayton@redhat.com>
2013-05-11 02:50:26 +08:00
unsigned long oob_event_mask; /* OOB events this object is interested in */
unsigned long event_mask; /* events this object is interested in */
unsigned long events; /* events to be processed by this object
* (order is important - using fls) */
unsigned long flags;
#define FSCACHE_OBJECT_LOCK 0 /* T if object is busy being processed */
#define FSCACHE_OBJECT_PENDING_WRITE 1 /* T if object has pending write */
#define FSCACHE_OBJECT_WAITING 2 /* T if object is waiting on its parent */
FS-Cache: Simplify cookie retention for fscache_objects, fixing oops Simplify the way fscache cache objects retain their cookie. The way I implemented the cookie storage handling made synchronisation a pain (ie. the object state machine can't rely on the cookie actually still being there). Instead of the the object being detached from the cookie and the cookie being freed in __fscache_relinquish_cookie(), we defer both operations: (*) The detachment of the object from the list in the cookie now takes place in fscache_drop_object() and is thus governed by the object state machine (fscache_detach_from_cookie() has been removed). (*) The release of the cookie is now in fscache_object_destroy() - which is called by the cache backend just before it frees the object. This means that the fscache_cookie struct is now available to the cache all the way through from ->alloc_object() to ->drop_object() and ->put_object() - meaning that it's no longer necessary to take object->lock to guarantee access. However, __fscache_relinquish_cookie() doesn't wait for the object to go all the way through to destruction before letting the netfs proceed. That would massively slow down the netfs. Since __fscache_relinquish_cookie() leaves the cookie around, in must therefore break all attachments to the netfs - which includes ->def, ->netfs_data and any outstanding page read/writes. To handle this, struct fscache_cookie now has an n_active counter: (1) This starts off initialised to 1. (2) Any time the cache needs to get at the netfs data, it calls fscache_use_cookie() to increment it - if it is not zero. If it was zero, then access is not permitted. (3) When the cache has finished with the data, it calls fscache_unuse_cookie() to decrement it. This does a wake-up on it if it reaches 0. (4) __fscache_relinquish_cookie() decrements n_active and then waits for it to reach 0. The initialisation to 1 in step (1) ensures that we only get wake ups when we're trying to get rid of the cookie. This leaves __fscache_relinquish_cookie() a lot simpler. *** This fixes a problem in the current code whereby if fscache_invalidate() is followed sufficiently quickly by fscache_relinquish_cookie() then it is possible for __fscache_relinquish_cookie() to have detached the cookie from the object and cleared the pointer before a thread is dispatched to process the invalidation state in the object state machine. Since the pending write clearance was deferred to the invalidation state to make it asynchronous, we need to either wait in relinquishment for the stores tree to be cleared in the invalidation state or we need to handle the clearance in relinquishment. Further, if the relinquishment code does clear the tree, then the invalidation state need to make the clearance contingent on still having the cookie to hand (since that's where the tree is rooted) and we have to prevent the cookie from disappearing for the duration. This can lead to an oops like the following: BUG: unable to handle kernel NULL pointer dereference at 000000000000000c ... RIP: 0010:[<ffffffff8151023e>] _spin_lock+0xe/0x30 ... CR2: 000000000000000c ... ... Process kslowd002 (...) .... Call Trace: [<ffffffffa01c3278>] fscache_invalidate_writes+0x38/0xd0 [fscache] [<ffffffff810096f0>] ? __switch_to+0xd0/0x320 [<ffffffff8105e759>] ? find_busiest_queue+0x69/0x150 [<ffffffff8110ddd4>] ? slow_work_enqueue+0x104/0x180 [<ffffffffa01c1303>] fscache_object_slow_work_execute+0x5e3/0x9d0 [fscache] [<ffffffff81096b67>] ? bit_waitqueue+0x17/0xd0 [<ffffffff8110e233>] slow_work_execute+0x233/0x310 [<ffffffff8110e515>] slow_work_thread+0x205/0x360 [<ffffffff81096ca0>] ? autoremove_wake_function+0x0/0x40 [<ffffffff8110e310>] ? slow_work_thread+0x0/0x360 [<ffffffff81096936>] kthread+0x96/0xa0 [<ffffffff8100c0ca>] child_rip+0xa/0x20 [<ffffffff810968a0>] ? kthread+0x0/0xa0 [<ffffffff8100c0c0>] ? child_rip+0x0/0x20 The parameter to fscache_invalidate_writes() was object->cookie which is NULL. Signed-off-by: David Howells <dhowells@redhat.com> Tested-By: Milosz Tanski <milosz@adfin.com> Acked-by: Jeff Layton <jlayton@redhat.com>
2013-05-11 02:50:26 +08:00
#define FSCACHE_OBJECT_IS_LIVE 3 /* T if object is not withdrawn or relinquished */
#define FSCACHE_OBJECT_IS_LOOKED_UP 4 /* T if object has been looked up */
#define FSCACHE_OBJECT_IS_AVAILABLE 5 /* T if object has become active */
FS-Cache: Provide the ability to enable/disable cookies Provide the ability to enable and disable fscache cookies. A disabled cookie will reject or ignore further requests to: Acquire a child cookie Invalidate and update backing objects Check the consistency of a backing object Allocate storage for backing page Read backing pages Write to backing pages but still allows: Checks/waits on the completion of already in-progress objects Uncaching of pages Relinquishment of cookies Two new operations are provided: (1) Disable a cookie: void fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate); If the cookie is not already disabled, this locks the cookie against other dis/enablement ops, marks the cookie as being disabled, discards or invalidates any backing objects and waits for cessation of activity on any associated object. This is a wrapper around a chunk split out of fscache_relinquish_cookie(), but it reinitialises the cookie such that it can be reenabled. All possible failures are handled internally. The caller should consider calling fscache_uncache_all_inode_pages() afterwards to make sure all page markings are cleared up. (2) Enable a cookie: void fscache_enable_cookie(struct fscache_cookie *cookie, bool (*can_enable)(void *data), void *data) If the cookie is not already enabled, this locks the cookie against other dis/enablement ops, invokes can_enable() and, if the cookie is not an index cookie, will begin the procedure of acquiring backing objects. The optional can_enable() function is passed the data argument and returns a ruling as to whether or not enablement should actually be permitted to begin. All possible failures are handled internally. The cookie will only be marked as enabled if provisional backing objects are allocated. A later patch will introduce these to NFS. Cookie enablement during nfs_open() is then contingent on i_writecount <= 0. can_enable() checks for a race between open(O_RDONLY) and open(O_WRONLY/O_RDWR). This simplifies NFS's cookie handling and allows us to get rid of open(O_RDONLY) accidentally introducing caching to an inode that's open for writing already. One operation has its API modified: (3) Acquire a cookie. struct fscache_cookie *fscache_acquire_cookie( struct fscache_cookie *parent, const struct fscache_cookie_def *def, void *netfs_data, bool enable); This now has an additional argument that indicates whether the requested cookie should be enabled by default. It doesn't need the can_enable() function because the caller must prevent multiple calls for the same netfs object and it doesn't need to take the enablement lock because no one else can get at the cookie before this returns. Signed-off-by: David Howells <dhowells@redhat.com
2013-09-21 07:09:31 +08:00
#define FSCACHE_OBJECT_RETIRED 6 /* T if object was retired on relinquishment */
struct list_head cache_link; /* link in cache->object_list */
struct hlist_node cookie_link; /* link in cookie->backing_objects */
struct fscache_cache *cache; /* cache that supplied this object */
struct fscache_cookie *cookie; /* netfs's file/index object */
struct fscache_object *parent; /* parent object */
struct work_struct work; /* attention scheduling record */
struct list_head dependents; /* FIFO of dependent objects */
struct list_head dep_link; /* link in parent's dependents list */
struct list_head pending_ops; /* unstarted operations on this object */
#ifdef CONFIG_FSCACHE_OBJECT_LIST
struct rb_node objlist_link; /* link in global object list */
#endif
pgoff_t store_limit; /* current storage limit */
loff_t store_limit_l; /* current storage limit */
};
extern void fscache_object_init(struct fscache_object *, struct fscache_cookie *,
struct fscache_cache *);
FS-Cache: Simplify cookie retention for fscache_objects, fixing oops Simplify the way fscache cache objects retain their cookie. The way I implemented the cookie storage handling made synchronisation a pain (ie. the object state machine can't rely on the cookie actually still being there). Instead of the the object being detached from the cookie and the cookie being freed in __fscache_relinquish_cookie(), we defer both operations: (*) The detachment of the object from the list in the cookie now takes place in fscache_drop_object() and is thus governed by the object state machine (fscache_detach_from_cookie() has been removed). (*) The release of the cookie is now in fscache_object_destroy() - which is called by the cache backend just before it frees the object. This means that the fscache_cookie struct is now available to the cache all the way through from ->alloc_object() to ->drop_object() and ->put_object() - meaning that it's no longer necessary to take object->lock to guarantee access. However, __fscache_relinquish_cookie() doesn't wait for the object to go all the way through to destruction before letting the netfs proceed. That would massively slow down the netfs. Since __fscache_relinquish_cookie() leaves the cookie around, in must therefore break all attachments to the netfs - which includes ->def, ->netfs_data and any outstanding page read/writes. To handle this, struct fscache_cookie now has an n_active counter: (1) This starts off initialised to 1. (2) Any time the cache needs to get at the netfs data, it calls fscache_use_cookie() to increment it - if it is not zero. If it was zero, then access is not permitted. (3) When the cache has finished with the data, it calls fscache_unuse_cookie() to decrement it. This does a wake-up on it if it reaches 0. (4) __fscache_relinquish_cookie() decrements n_active and then waits for it to reach 0. The initialisation to 1 in step (1) ensures that we only get wake ups when we're trying to get rid of the cookie. This leaves __fscache_relinquish_cookie() a lot simpler. *** This fixes a problem in the current code whereby if fscache_invalidate() is followed sufficiently quickly by fscache_relinquish_cookie() then it is possible for __fscache_relinquish_cookie() to have detached the cookie from the object and cleared the pointer before a thread is dispatched to process the invalidation state in the object state machine. Since the pending write clearance was deferred to the invalidation state to make it asynchronous, we need to either wait in relinquishment for the stores tree to be cleared in the invalidation state or we need to handle the clearance in relinquishment. Further, if the relinquishment code does clear the tree, then the invalidation state need to make the clearance contingent on still having the cookie to hand (since that's where the tree is rooted) and we have to prevent the cookie from disappearing for the duration. This can lead to an oops like the following: BUG: unable to handle kernel NULL pointer dereference at 000000000000000c ... RIP: 0010:[<ffffffff8151023e>] _spin_lock+0xe/0x30 ... CR2: 000000000000000c ... ... Process kslowd002 (...) .... Call Trace: [<ffffffffa01c3278>] fscache_invalidate_writes+0x38/0xd0 [fscache] [<ffffffff810096f0>] ? __switch_to+0xd0/0x320 [<ffffffff8105e759>] ? find_busiest_queue+0x69/0x150 [<ffffffff8110ddd4>] ? slow_work_enqueue+0x104/0x180 [<ffffffffa01c1303>] fscache_object_slow_work_execute+0x5e3/0x9d0 [fscache] [<ffffffff81096b67>] ? bit_waitqueue+0x17/0xd0 [<ffffffff8110e233>] slow_work_execute+0x233/0x310 [<ffffffff8110e515>] slow_work_thread+0x205/0x360 [<ffffffff81096ca0>] ? autoremove_wake_function+0x0/0x40 [<ffffffff8110e310>] ? slow_work_thread+0x0/0x360 [<ffffffff81096936>] kthread+0x96/0xa0 [<ffffffff8100c0ca>] child_rip+0xa/0x20 [<ffffffff810968a0>] ? kthread+0x0/0xa0 [<ffffffff8100c0c0>] ? child_rip+0x0/0x20 The parameter to fscache_invalidate_writes() was object->cookie which is NULL. Signed-off-by: David Howells <dhowells@redhat.com> Tested-By: Milosz Tanski <milosz@adfin.com> Acked-by: Jeff Layton <jlayton@redhat.com>
2013-05-11 02:50:26 +08:00
extern void fscache_object_destroy(struct fscache_object *);
extern void fscache_object_lookup_negative(struct fscache_object *object);
extern void fscache_obtained_object(struct fscache_object *object);
static inline bool fscache_object_is_live(struct fscache_object *object)
{
FS-Cache: Fix object state machine to have separate work and wait states Fix object state machine to have separate work and wait states as that makes it easier to envision. There are now three kinds of state: (1) Work state. This is an execution state. No event processing is performed by a work state. The function attached to a work state returns a pointer indicating the next state to which the OSM should transition. Returning NO_TRANSIT repeats the current state, but goes back to the scheduler first. (2) Wait state. This is an event processing state. No execution is performed by a wait state. Wait states are just tables of "if event X occurs, clear it and transition to state Y". The dispatcher returns to the scheduler if none of the events in which the wait state has an interest are currently pending. (3) Out-of-band state. This is a special work state. Transitions to normal states can be overridden when an unexpected event occurs (eg. I/O error). Instead the dispatcher disables and clears the OOB event and transits to the specified work state. This then acts as an ordinary work state, though object->state points to the overridden destination. Returning NO_TRANSIT resumes the overridden transition. In addition, the states have names in their definitions, so there's no need for tables of state names. Further, the EV_REQUEUE event is no longer necessary as that is automatic for work states. Since the states are now separate structs rather than values in an enum, it's not possible to use comparisons other than (non-)equality between them, so use some object->flags to indicate what phase an object is in. The EV_RELEASE, EV_RETIRE and EV_WITHDRAW events have been squished into one (EV_KILL). An object flag now carries the information about retirement. Similarly, the RELEASING, RECYCLING and WITHDRAWING states have been merged into an KILL_OBJECT state and additional states have been added for handling waiting dependent objects (JUMPSTART_DEPS and KILL_DEPENDENTS). A state has also been added for synchronising with parent object initialisation (WAIT_FOR_PARENT) and another for initiating look up (PARENT_READY). Signed-off-by: David Howells <dhowells@redhat.com> Tested-By: Milosz Tanski <milosz@adfin.com> Acked-by: Jeff Layton <jlayton@redhat.com>
2013-05-11 02:50:26 +08:00
return test_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
}
static inline bool fscache_object_is_dying(struct fscache_object *object)
{
return !fscache_object_is_live(object);
}
static inline bool fscache_object_is_available(struct fscache_object *object)
{
FS-Cache: Fix object state machine to have separate work and wait states Fix object state machine to have separate work and wait states as that makes it easier to envision. There are now three kinds of state: (1) Work state. This is an execution state. No event processing is performed by a work state. The function attached to a work state returns a pointer indicating the next state to which the OSM should transition. Returning NO_TRANSIT repeats the current state, but goes back to the scheduler first. (2) Wait state. This is an event processing state. No execution is performed by a wait state. Wait states are just tables of "if event X occurs, clear it and transition to state Y". The dispatcher returns to the scheduler if none of the events in which the wait state has an interest are currently pending. (3) Out-of-band state. This is a special work state. Transitions to normal states can be overridden when an unexpected event occurs (eg. I/O error). Instead the dispatcher disables and clears the OOB event and transits to the specified work state. This then acts as an ordinary work state, though object->state points to the overridden destination. Returning NO_TRANSIT resumes the overridden transition. In addition, the states have names in their definitions, so there's no need for tables of state names. Further, the EV_REQUEUE event is no longer necessary as that is automatic for work states. Since the states are now separate structs rather than values in an enum, it's not possible to use comparisons other than (non-)equality between them, so use some object->flags to indicate what phase an object is in. The EV_RELEASE, EV_RETIRE and EV_WITHDRAW events have been squished into one (EV_KILL). An object flag now carries the information about retirement. Similarly, the RELEASING, RECYCLING and WITHDRAWING states have been merged into an KILL_OBJECT state and additional states have been added for handling waiting dependent objects (JUMPSTART_DEPS and KILL_DEPENDENTS). A state has also been added for synchronising with parent object initialisation (WAIT_FOR_PARENT) and another for initiating look up (PARENT_READY). Signed-off-by: David Howells <dhowells@redhat.com> Tested-By: Milosz Tanski <milosz@adfin.com> Acked-by: Jeff Layton <jlayton@redhat.com>
2013-05-11 02:50:26 +08:00
return test_bit(FSCACHE_OBJECT_IS_AVAILABLE, &object->flags);
}
static inline bool fscache_object_is_active(struct fscache_object *object)
{
return fscache_object_is_available(object) &&
fscache_object_is_live(object) &&
!test_bit(FSCACHE_IOERROR, &object->cache->flags);
}
static inline bool fscache_object_is_dead(struct fscache_object *object)
{
return fscache_object_is_dying(object) &&
test_bit(FSCACHE_IOERROR, &object->cache->flags);
}
/**
* fscache_object_destroyed - Note destruction of an object in a cache
* @cache: The cache from which the object came
*
* Note the destruction and deallocation of an object record in a cache.
*/
static inline void fscache_object_destroyed(struct fscache_cache *cache)
{
if (atomic_dec_and_test(&cache->object_count))
wake_up_all(&fscache_cache_cleared_wq);
}
/**
* fscache_object_lookup_error - Note an object encountered an error
* @object: The object on which the error was encountered
*
* Note that an object encountered a fatal error (usually an I/O error) and
* that it should be withdrawn as soon as possible.
*/
static inline void fscache_object_lookup_error(struct fscache_object *object)
{
set_bit(FSCACHE_OBJECT_EV_ERROR, &object->events);
}
/**
* fscache_set_store_limit - Set the maximum size to be stored in an object
* @object: The object to set the maximum on
* @i_size: The limit to set in bytes
*
* Set the maximum size an object is permitted to reach, implying the highest
* byte that may be written. Intended to be called by the attr_changed() op.
*
* See Documentation/filesystems/caching/backend-api.txt for a complete
* description.
*/
static inline
void fscache_set_store_limit(struct fscache_object *object, loff_t i_size)
{
object->store_limit_l = i_size;
object->store_limit = i_size >> PAGE_SHIFT;
if (i_size & ~PAGE_MASK)
object->store_limit++;
}
/**
* fscache_end_io - End a retrieval operation on a page
* @op: The FS-Cache operation covering the retrieval
* @page: The page that was to be fetched
* @error: The error code (0 if successful)
*
* Note the end of an operation to retrieve a page, as covered by a particular
* operation record.
*/
static inline void fscache_end_io(struct fscache_retrieval *op,
struct page *page, int error)
{
op->end_io_func(page, op->context, error);
}
static inline void __fscache_use_cookie(struct fscache_cookie *cookie)
{
atomic_inc(&cookie->n_active);
}
FS-Cache: Simplify cookie retention for fscache_objects, fixing oops Simplify the way fscache cache objects retain their cookie. The way I implemented the cookie storage handling made synchronisation a pain (ie. the object state machine can't rely on the cookie actually still being there). Instead of the the object being detached from the cookie and the cookie being freed in __fscache_relinquish_cookie(), we defer both operations: (*) The detachment of the object from the list in the cookie now takes place in fscache_drop_object() and is thus governed by the object state machine (fscache_detach_from_cookie() has been removed). (*) The release of the cookie is now in fscache_object_destroy() - which is called by the cache backend just before it frees the object. This means that the fscache_cookie struct is now available to the cache all the way through from ->alloc_object() to ->drop_object() and ->put_object() - meaning that it's no longer necessary to take object->lock to guarantee access. However, __fscache_relinquish_cookie() doesn't wait for the object to go all the way through to destruction before letting the netfs proceed. That would massively slow down the netfs. Since __fscache_relinquish_cookie() leaves the cookie around, in must therefore break all attachments to the netfs - which includes ->def, ->netfs_data and any outstanding page read/writes. To handle this, struct fscache_cookie now has an n_active counter: (1) This starts off initialised to 1. (2) Any time the cache needs to get at the netfs data, it calls fscache_use_cookie() to increment it - if it is not zero. If it was zero, then access is not permitted. (3) When the cache has finished with the data, it calls fscache_unuse_cookie() to decrement it. This does a wake-up on it if it reaches 0. (4) __fscache_relinquish_cookie() decrements n_active and then waits for it to reach 0. The initialisation to 1 in step (1) ensures that we only get wake ups when we're trying to get rid of the cookie. This leaves __fscache_relinquish_cookie() a lot simpler. *** This fixes a problem in the current code whereby if fscache_invalidate() is followed sufficiently quickly by fscache_relinquish_cookie() then it is possible for __fscache_relinquish_cookie() to have detached the cookie from the object and cleared the pointer before a thread is dispatched to process the invalidation state in the object state machine. Since the pending write clearance was deferred to the invalidation state to make it asynchronous, we need to either wait in relinquishment for the stores tree to be cleared in the invalidation state or we need to handle the clearance in relinquishment. Further, if the relinquishment code does clear the tree, then the invalidation state need to make the clearance contingent on still having the cookie to hand (since that's where the tree is rooted) and we have to prevent the cookie from disappearing for the duration. This can lead to an oops like the following: BUG: unable to handle kernel NULL pointer dereference at 000000000000000c ... RIP: 0010:[<ffffffff8151023e>] _spin_lock+0xe/0x30 ... CR2: 000000000000000c ... ... Process kslowd002 (...) .... Call Trace: [<ffffffffa01c3278>] fscache_invalidate_writes+0x38/0xd0 [fscache] [<ffffffff810096f0>] ? __switch_to+0xd0/0x320 [<ffffffff8105e759>] ? find_busiest_queue+0x69/0x150 [<ffffffff8110ddd4>] ? slow_work_enqueue+0x104/0x180 [<ffffffffa01c1303>] fscache_object_slow_work_execute+0x5e3/0x9d0 [fscache] [<ffffffff81096b67>] ? bit_waitqueue+0x17/0xd0 [<ffffffff8110e233>] slow_work_execute+0x233/0x310 [<ffffffff8110e515>] slow_work_thread+0x205/0x360 [<ffffffff81096ca0>] ? autoremove_wake_function+0x0/0x40 [<ffffffff8110e310>] ? slow_work_thread+0x0/0x360 [<ffffffff81096936>] kthread+0x96/0xa0 [<ffffffff8100c0ca>] child_rip+0xa/0x20 [<ffffffff810968a0>] ? kthread+0x0/0xa0 [<ffffffff8100c0c0>] ? child_rip+0x0/0x20 The parameter to fscache_invalidate_writes() was object->cookie which is NULL. Signed-off-by: David Howells <dhowells@redhat.com> Tested-By: Milosz Tanski <milosz@adfin.com> Acked-by: Jeff Layton <jlayton@redhat.com>
2013-05-11 02:50:26 +08:00
/**
* fscache_use_cookie - Request usage of cookie attached to an object
* @object: Object description
*
* Request usage of the cookie attached to an object. NULL is returned if the
* relinquishment had reduced the cookie usage count to 0.
*/
static inline bool fscache_use_cookie(struct fscache_object *object)
{
struct fscache_cookie *cookie = object->cookie;
return atomic_inc_not_zero(&cookie->n_active) != 0;
}
static inline bool __fscache_unuse_cookie(struct fscache_cookie *cookie)
{
return atomic_dec_and_test(&cookie->n_active);
}
static inline void __fscache_wake_unused_cookie(struct fscache_cookie *cookie)
{
wake_up_atomic_t(&cookie->n_active);
}
FS-Cache: Simplify cookie retention for fscache_objects, fixing oops Simplify the way fscache cache objects retain their cookie. The way I implemented the cookie storage handling made synchronisation a pain (ie. the object state machine can't rely on the cookie actually still being there). Instead of the the object being detached from the cookie and the cookie being freed in __fscache_relinquish_cookie(), we defer both operations: (*) The detachment of the object from the list in the cookie now takes place in fscache_drop_object() and is thus governed by the object state machine (fscache_detach_from_cookie() has been removed). (*) The release of the cookie is now in fscache_object_destroy() - which is called by the cache backend just before it frees the object. This means that the fscache_cookie struct is now available to the cache all the way through from ->alloc_object() to ->drop_object() and ->put_object() - meaning that it's no longer necessary to take object->lock to guarantee access. However, __fscache_relinquish_cookie() doesn't wait for the object to go all the way through to destruction before letting the netfs proceed. That would massively slow down the netfs. Since __fscache_relinquish_cookie() leaves the cookie around, in must therefore break all attachments to the netfs - which includes ->def, ->netfs_data and any outstanding page read/writes. To handle this, struct fscache_cookie now has an n_active counter: (1) This starts off initialised to 1. (2) Any time the cache needs to get at the netfs data, it calls fscache_use_cookie() to increment it - if it is not zero. If it was zero, then access is not permitted. (3) When the cache has finished with the data, it calls fscache_unuse_cookie() to decrement it. This does a wake-up on it if it reaches 0. (4) __fscache_relinquish_cookie() decrements n_active and then waits for it to reach 0. The initialisation to 1 in step (1) ensures that we only get wake ups when we're trying to get rid of the cookie. This leaves __fscache_relinquish_cookie() a lot simpler. *** This fixes a problem in the current code whereby if fscache_invalidate() is followed sufficiently quickly by fscache_relinquish_cookie() then it is possible for __fscache_relinquish_cookie() to have detached the cookie from the object and cleared the pointer before a thread is dispatched to process the invalidation state in the object state machine. Since the pending write clearance was deferred to the invalidation state to make it asynchronous, we need to either wait in relinquishment for the stores tree to be cleared in the invalidation state or we need to handle the clearance in relinquishment. Further, if the relinquishment code does clear the tree, then the invalidation state need to make the clearance contingent on still having the cookie to hand (since that's where the tree is rooted) and we have to prevent the cookie from disappearing for the duration. This can lead to an oops like the following: BUG: unable to handle kernel NULL pointer dereference at 000000000000000c ... RIP: 0010:[<ffffffff8151023e>] _spin_lock+0xe/0x30 ... CR2: 000000000000000c ... ... Process kslowd002 (...) .... Call Trace: [<ffffffffa01c3278>] fscache_invalidate_writes+0x38/0xd0 [fscache] [<ffffffff810096f0>] ? __switch_to+0xd0/0x320 [<ffffffff8105e759>] ? find_busiest_queue+0x69/0x150 [<ffffffff8110ddd4>] ? slow_work_enqueue+0x104/0x180 [<ffffffffa01c1303>] fscache_object_slow_work_execute+0x5e3/0x9d0 [fscache] [<ffffffff81096b67>] ? bit_waitqueue+0x17/0xd0 [<ffffffff8110e233>] slow_work_execute+0x233/0x310 [<ffffffff8110e515>] slow_work_thread+0x205/0x360 [<ffffffff81096ca0>] ? autoremove_wake_function+0x0/0x40 [<ffffffff8110e310>] ? slow_work_thread+0x0/0x360 [<ffffffff81096936>] kthread+0x96/0xa0 [<ffffffff8100c0ca>] child_rip+0xa/0x20 [<ffffffff810968a0>] ? kthread+0x0/0xa0 [<ffffffff8100c0c0>] ? child_rip+0x0/0x20 The parameter to fscache_invalidate_writes() was object->cookie which is NULL. Signed-off-by: David Howells <dhowells@redhat.com> Tested-By: Milosz Tanski <milosz@adfin.com> Acked-by: Jeff Layton <jlayton@redhat.com>
2013-05-11 02:50:26 +08:00
/**
* fscache_unuse_cookie - Cease usage of cookie attached to an object
* @object: Object description
*
* Cease usage of the cookie attached to an object. When the users count
* reaches zero then the cookie relinquishment will be permitted to proceed.
*/
static inline void fscache_unuse_cookie(struct fscache_object *object)
{
struct fscache_cookie *cookie = object->cookie;
if (__fscache_unuse_cookie(cookie))
__fscache_wake_unused_cookie(cookie);
FS-Cache: Simplify cookie retention for fscache_objects, fixing oops Simplify the way fscache cache objects retain their cookie. The way I implemented the cookie storage handling made synchronisation a pain (ie. the object state machine can't rely on the cookie actually still being there). Instead of the the object being detached from the cookie and the cookie being freed in __fscache_relinquish_cookie(), we defer both operations: (*) The detachment of the object from the list in the cookie now takes place in fscache_drop_object() and is thus governed by the object state machine (fscache_detach_from_cookie() has been removed). (*) The release of the cookie is now in fscache_object_destroy() - which is called by the cache backend just before it frees the object. This means that the fscache_cookie struct is now available to the cache all the way through from ->alloc_object() to ->drop_object() and ->put_object() - meaning that it's no longer necessary to take object->lock to guarantee access. However, __fscache_relinquish_cookie() doesn't wait for the object to go all the way through to destruction before letting the netfs proceed. That would massively slow down the netfs. Since __fscache_relinquish_cookie() leaves the cookie around, in must therefore break all attachments to the netfs - which includes ->def, ->netfs_data and any outstanding page read/writes. To handle this, struct fscache_cookie now has an n_active counter: (1) This starts off initialised to 1. (2) Any time the cache needs to get at the netfs data, it calls fscache_use_cookie() to increment it - if it is not zero. If it was zero, then access is not permitted. (3) When the cache has finished with the data, it calls fscache_unuse_cookie() to decrement it. This does a wake-up on it if it reaches 0. (4) __fscache_relinquish_cookie() decrements n_active and then waits for it to reach 0. The initialisation to 1 in step (1) ensures that we only get wake ups when we're trying to get rid of the cookie. This leaves __fscache_relinquish_cookie() a lot simpler. *** This fixes a problem in the current code whereby if fscache_invalidate() is followed sufficiently quickly by fscache_relinquish_cookie() then it is possible for __fscache_relinquish_cookie() to have detached the cookie from the object and cleared the pointer before a thread is dispatched to process the invalidation state in the object state machine. Since the pending write clearance was deferred to the invalidation state to make it asynchronous, we need to either wait in relinquishment for the stores tree to be cleared in the invalidation state or we need to handle the clearance in relinquishment. Further, if the relinquishment code does clear the tree, then the invalidation state need to make the clearance contingent on still having the cookie to hand (since that's where the tree is rooted) and we have to prevent the cookie from disappearing for the duration. This can lead to an oops like the following: BUG: unable to handle kernel NULL pointer dereference at 000000000000000c ... RIP: 0010:[<ffffffff8151023e>] _spin_lock+0xe/0x30 ... CR2: 000000000000000c ... ... Process kslowd002 (...) .... Call Trace: [<ffffffffa01c3278>] fscache_invalidate_writes+0x38/0xd0 [fscache] [<ffffffff810096f0>] ? __switch_to+0xd0/0x320 [<ffffffff8105e759>] ? find_busiest_queue+0x69/0x150 [<ffffffff8110ddd4>] ? slow_work_enqueue+0x104/0x180 [<ffffffffa01c1303>] fscache_object_slow_work_execute+0x5e3/0x9d0 [fscache] [<ffffffff81096b67>] ? bit_waitqueue+0x17/0xd0 [<ffffffff8110e233>] slow_work_execute+0x233/0x310 [<ffffffff8110e515>] slow_work_thread+0x205/0x360 [<ffffffff81096ca0>] ? autoremove_wake_function+0x0/0x40 [<ffffffff8110e310>] ? slow_work_thread+0x0/0x360 [<ffffffff81096936>] kthread+0x96/0xa0 [<ffffffff8100c0ca>] child_rip+0xa/0x20 [<ffffffff810968a0>] ? kthread+0x0/0xa0 [<ffffffff8100c0c0>] ? child_rip+0x0/0x20 The parameter to fscache_invalidate_writes() was object->cookie which is NULL. Signed-off-by: David Howells <dhowells@redhat.com> Tested-By: Milosz Tanski <milosz@adfin.com> Acked-by: Jeff Layton <jlayton@redhat.com>
2013-05-11 02:50:26 +08:00
}
/*
* out-of-line cache backend functions
*/
extern __printf(3, 4)
void fscache_init_cache(struct fscache_cache *cache,
const struct fscache_cache_ops *ops,
const char *idfmt, ...);
extern int fscache_add_cache(struct fscache_cache *cache,
struct fscache_object *fsdef,
const char *tagname);
extern void fscache_withdraw_cache(struct fscache_cache *cache);
extern void fscache_io_error(struct fscache_cache *cache);
CacheFiles: Fix the marking of cached pages Under some circumstances CacheFiles defers the marking of pages with PG_fscache so that it can take advantage of pagevecs to reduce the number of calls to fscache_mark_pages_cached() and the netfs's hook to keep track of this. There are, however, two problems with this: (1) It can lead to the PG_fscache mark being applied _after_ the page is set PG_uptodate and unlocked (by the call to fscache_end_io()). (2) CacheFiles's ref on the page is dropped immediately following fscache_end_io() - and so may not still be held when the mark is applied. This can lead to the page being passed back to the allocator before the mark is applied. Fix this by, where appropriate, marking the page before calling fscache_end_io() and releasing the page. This means that we can't take advantage of pagevecs and have to make a separate call for each page to the marking routines. The symptoms of this are Bad Page state errors cropping up under memory pressure, for example: BUG: Bad page state in process tar pfn:002da page:ffffea0000009fb0 count:0 mapcount:0 mapping: (null) index:0x1447 page flags: 0x1000(private_2) Pid: 4574, comm: tar Tainted: G W 3.1.0-rc4-fsdevel+ #1064 Call Trace: [<ffffffff8109583c>] ? dump_page+0xb9/0xbe [<ffffffff81095916>] bad_page+0xd5/0xea [<ffffffff81095d82>] get_page_from_freelist+0x35b/0x46a [<ffffffff810961f3>] __alloc_pages_nodemask+0x362/0x662 [<ffffffff810989da>] __do_page_cache_readahead+0x13a/0x267 [<ffffffff81098942>] ? __do_page_cache_readahead+0xa2/0x267 [<ffffffff81098d7b>] ra_submit+0x1c/0x20 [<ffffffff8109900a>] ondemand_readahead+0x28b/0x29a [<ffffffff81098ee2>] ? ondemand_readahead+0x163/0x29a [<ffffffff810990ce>] page_cache_sync_readahead+0x38/0x3a [<ffffffff81091d8a>] generic_file_aio_read+0x2ab/0x67e [<ffffffffa008cfbe>] nfs_file_read+0xa4/0xc9 [nfs] [<ffffffff810c22c4>] do_sync_read+0xba/0xfa [<ffffffff81177a47>] ? security_file_permission+0x7b/0x84 [<ffffffff810c25dd>] ? rw_verify_area+0xab/0xc8 [<ffffffff810c29a4>] vfs_read+0xaa/0x13a [<ffffffff810c2a79>] sys_read+0x45/0x6c [<ffffffff813ac37b>] system_call_fastpath+0x16/0x1b As can be seen, PG_private_2 (== PG_fscache) is set in the page flags. Instrumenting fscache_mark_pages_cached() to verify whether page->mapping was set appropriately showed that sometimes it wasn't. This led to the discovery that sometimes the page has apparently been reclaimed by the time the marker got to see it. Reported-by: M. Stevens <m@tippett.com> Signed-off-by: David Howells <dhowells@redhat.com> Reviewed-by: Jeff Layton <jlayton@redhat.com>
2012-12-21 05:52:32 +08:00
extern void fscache_mark_page_cached(struct fscache_retrieval *op,
struct page *page);
extern void fscache_mark_pages_cached(struct fscache_retrieval *op,
struct pagevec *pagevec);
extern bool fscache_object_sleep_till_congested(signed long *timeoutp);
extern enum fscache_checkaux fscache_check_aux(struct fscache_object *object,
const void *data,
uint16_t datalen);
#endif /* _LINUX_FSCACHE_CACHE_H */