ocfs2: zero tail of sparse files on truncate
Since we don't zero on extend anymore, truncate needs to be fixed up to zero the part of a file between i_size and the end of its cluster. Otherwise a subsequent extend could expose bad data. This introduces a new helper, which can be used in ocfs2_write(). Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
This commit is contained in:
parent
25baf2da14
commit
60b11392f1
224
fs/ocfs2/alloc.c
224
fs/ocfs2/alloc.c
|
@ -27,6 +27,7 @@
|
|||
#include <linux/types.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/swap.h>
|
||||
|
||||
#define MLOG_MASK_PREFIX ML_DISK_ALLOC
|
||||
#include <cluster/masklog.h>
|
||||
|
@ -34,6 +35,7 @@
|
|||
#include "ocfs2.h"
|
||||
|
||||
#include "alloc.h"
|
||||
#include "aops.h"
|
||||
#include "dlmglue.h"
|
||||
#include "extent_map.h"
|
||||
#include "inode.h"
|
||||
|
@ -3342,6 +3344,228 @@ bail:
|
|||
return status;
|
||||
}
|
||||
|
||||
/*
 * walk_page_buffers() callback for the truncate-tail zeroing path, used
 * when ocfs2_should_order_data() is false for the inode.
 *
 * Flags the freshly zeroed buffer uptodate and then dirties it. The
 * journal handle is accepted only to match the callback signature and is
 * not touched. Always returns 0.
 */
static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh)
{
	set_buffer_uptodate(bh);
	mark_buffer_dirty(bh);

	return 0;
}
|
||||
|
||||
/*
 * walk_page_buffers() callback for the truncate-tail zeroing path, used
 * when ocfs2_should_order_data() is true for the inode.
 *
 * Flags the freshly zeroed buffer uptodate, dirties it, and then passes
 * it to ocfs2_journal_dirty_data() on @handle. The return value is
 * whatever ocfs2_journal_dirty_data() reports.
 */
static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
{
	int status;

	set_buffer_uptodate(bh);
	mark_buffer_dirty(bh);

	status = ocfs2_journal_dirty_data(handle, bh);

	return status;
}
|
||||
|
||||
static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
|
||||
struct page **pages, int numpages,
|
||||
u64 phys, handle_t *handle)
|
||||
{
|
||||
int i, ret, partial = 0;
|
||||
void *kaddr;
|
||||
struct page *page;
|
||||
unsigned int from, to = PAGE_CACHE_SIZE;
|
||||
struct super_block *sb = inode->i_sb;
|
||||
|
||||
BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
|
||||
|
||||
if (numpages == 0)
|
||||
goto out;
|
||||
|
||||
from = isize & (PAGE_CACHE_SIZE - 1); /* 1st page offset */
|
||||
if (PAGE_CACHE_SHIFT > OCFS2_SB(sb)->s_clustersize_bits) {
|
||||
/*
|
||||
* Since 'from' has been capped to a value below page
|
||||
* size, this calculation won't be able to overflow
|
||||
* 'to'
|
||||
*/
|
||||
to = ocfs2_align_bytes_to_clusters(sb, from);
|
||||
|
||||
/*
|
||||
* The truncate tail in this case should never contain
|
||||
* more than one page at maximum. The loop below also
|
||||
* assumes this.
|
||||
*/
|
||||
BUG_ON(numpages != 1);
|
||||
}
|
||||
|
||||
for(i = 0; i < numpages; i++) {
|
||||
page = pages[i];
|
||||
|
||||
BUG_ON(from > PAGE_CACHE_SIZE);
|
||||
BUG_ON(to > PAGE_CACHE_SIZE);
|
||||
|
||||
ret = ocfs2_map_page_blocks(page, &phys, inode, from, to, 0);
|
||||
if (ret)
|
||||
mlog_errno(ret);
|
||||
|
||||
kaddr = kmap_atomic(page, KM_USER0);
|
||||
memset(kaddr + from, 0, to - from);
|
||||
kunmap_atomic(kaddr, KM_USER0);
|
||||
|
||||
/*
|
||||
* Need to set the buffers we zero'd into uptodate
|
||||
* here if they aren't - ocfs2_map_page_blocks()
|
||||
* might've skipped some
|
||||
*/
|
||||
if (ocfs2_should_order_data(inode)) {
|
||||
ret = walk_page_buffers(handle,
|
||||
page_buffers(page),
|
||||
from, to, &partial,
|
||||
ocfs2_ordered_zero_func);
|
||||
if (ret < 0)
|
||||
mlog_errno(ret);
|
||||
} else {
|
||||
ret = walk_page_buffers(handle, page_buffers(page),
|
||||
from, to, &partial,
|
||||
ocfs2_writeback_zero_func);
|
||||
if (ret < 0)
|
||||
mlog_errno(ret);
|
||||
}
|
||||
|
||||
if (!partial)
|
||||
SetPageUptodate(page);
|
||||
|
||||
flush_dcache_page(page);
|
||||
|
||||
/*
|
||||
* Every page after the 1st one should be completely zero'd.
|
||||
*/
|
||||
from = 0;
|
||||
}
|
||||
out:
|
||||
if (pages) {
|
||||
for (i = 0; i < numpages; i++) {
|
||||
page = pages[i];
|
||||
unlock_page(page);
|
||||
mark_page_accessed(page);
|
||||
page_cache_release(page);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Grab the page cache pages that run from @isize to the end of the
 * cluster containing it.
 *
 * @inode: inode being truncated; must live on a sparse-alloc volume
 *         (enforced with BUG_ON).
 * @isize: the i_size the file is being truncated to.
 * @pages: caller-supplied array that receives the grabbed pages.
 * @num:   out: number of pages stored in @pages (0 on error).
 * @phys:  out: physical block reported by ocfs2_extent_map_get_blocks()
 *         for the block containing @isize.
 *
 * Returns 0 with *@num == 0 when @isize sits on a cluster boundary or
 * when the lookup reports physical block 0 (treated as a hole). Returns
 * the lookup's error code if the lookup fails, or -ENOMEM if
 * grab_cache_page() returns NULL. On any error, the pages grabbed so far
 * are unlocked and released here.
 */
static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page **pages,
				int *num, u64 *phys)
{
	int idx, status = 0, grabbed = 0;
	struct super_block *sb = inode->i_sb;
	unsigned int csize = OCFS2_SB(inode->i_sb)->s_clustersize;
	struct address_space *mapping = inode->i_mapping;
	unsigned long index;
	u64 tail_end;

	BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));

	/* On a cluster boundary there is no tail, hence no pages to grab. */
	if ((isize & (csize - 1)) == 0)
		goto finish;

	status = ocfs2_extent_map_get_blocks(inode,
					     isize >> sb->s_blocksize_bits,
					     phys, NULL);
	if (status) {
		mlog_errno(status);
		goto finish;
	}

	/* Physical block 0 means the tail is a hole. */
	if (*phys == 0)
		goto finish;

	tail_end = ocfs2_align_bytes_to_clusters(inode->i_sb, isize);
	index = isize >> PAGE_CACHE_SHIFT;

	/* Always grab the page holding isize, then any that follow it. */
	for (;;) {
		pages[grabbed] = grab_cache_page(mapping, index);
		if (!pages[grabbed]) {
			status = -ENOMEM;
			mlog_errno(status);
			goto finish;
		}

		grabbed++;
		index++;

		if (!(index < (tail_end >> PAGE_CACHE_SHIFT)))
			break;
	}

finish:
	if (status != 0) {
		/* Undo: drop the lock and reference on each grabbed page. */
		for (idx = 0; pages && idx < grabbed; idx++) {
			if (!pages[idx])
				continue;

			unlock_page(pages[idx]);
			page_cache_release(pages[idx]);
		}
		grabbed = 0;
	}

	*num = grabbed;

	return status;
}
|
||||
|
||||
/*
|
||||
* Zero the area past i_size but still within an allocated
|
||||
* cluster. This avoids exposing nonzero data on subsequent file
|
||||
* extends.
|
||||
*
|
||||
* We need to call this before i_size is updated on the inode because
|
||||
* otherwise block_write_full_page() will skip writeout of pages past
|
||||
* i_size. The new_i_size parameter is passed for this reason.
|
||||
*/
|
||||
int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
|
||||
u64 new_i_size)
|
||||
{
|
||||
int ret, numpages;
|
||||
struct page **pages = NULL;
|
||||
u64 phys;
|
||||
|
||||
/*
|
||||
* File systems which don't support sparse files zero on every
|
||||
* extend.
|
||||
*/
|
||||
if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
|
||||
return 0;
|
||||
|
||||
pages = kcalloc(ocfs2_pages_per_cluster(inode->i_sb),
|
||||
sizeof(struct page *), GFP_NOFS);
|
||||
if (pages == NULL) {
|
||||
ret = -ENOMEM;
|
||||
mlog_errno(ret);
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = ocfs2_grab_eof_pages(inode, new_i_size, pages, &numpages, &phys);
|
||||
if (ret) {
|
||||
mlog_errno(ret);
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* Truncate on an i_size boundary - nothing more to do.
|
||||
*/
|
||||
if (numpages == 0)
|
||||
goto out;
|
||||
|
||||
ocfs2_zero_cluster_pages(inode, new_i_size, pages, numpages, phys,
|
||||
handle);
|
||||
|
||||
/*
|
||||
* Initiate writeout of the pages we zero'd here. We don't
|
||||
* wait on them - the truncate_inode_pages() call later will
|
||||
* do that for us.
|
||||
*/
|
||||
ret = filemap_fdatawrite(inode->i_mapping);
|
||||
if (ret)
|
||||
mlog_errno(ret);
|
||||
|
||||
out:
|
||||
if (pages)
|
||||
kfree(pages);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* It is expected, that by the time you call this function,
|
||||
* inode->i_size and fe->i_size have been adjusted.
|
||||
|
|
|
@ -71,6 +71,8 @@ struct ocfs2_truncate_context {
|
|||
struct buffer_head *tc_last_eb_bh;
|
||||
};
|
||||
|
||||
int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
|
||||
u64 new_i_size);
|
||||
int ocfs2_prepare_truncate(struct ocfs2_super *osb,
|
||||
struct inode *inode,
|
||||
struct buffer_head *fe_bh,
|
||||
|
|
|
@ -308,13 +308,13 @@ int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
|
|||
* functionality yet, but IMHO it's better to cut and paste the whole
|
||||
* thing so we can avoid introducing our own bugs (and easily pick up
|
||||
* their fixes when they happen) --Mark */
|
||||
static int walk_page_buffers( handle_t *handle,
|
||||
struct buffer_head *head,
|
||||
unsigned from,
|
||||
unsigned to,
|
||||
int *partial,
|
||||
int (*fn)( handle_t *handle,
|
||||
struct buffer_head *bh))
|
||||
int walk_page_buffers( handle_t *handle,
|
||||
struct buffer_head *head,
|
||||
unsigned from,
|
||||
unsigned to,
|
||||
int *partial,
|
||||
int (*fn)( handle_t *handle,
|
||||
struct buffer_head *bh))
|
||||
{
|
||||
struct buffer_head *bh;
|
||||
unsigned block_start, block_end;
|
||||
|
@ -654,9 +654,9 @@ static void ocfs2_clear_page_regions(struct page *page,
|
|||
*
|
||||
* This will also skip zeroing, which is handled externally.
|
||||
*/
|
||||
static int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
|
||||
struct inode *inode, unsigned int from,
|
||||
unsigned int to, int new)
|
||||
int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
|
||||
struct inode *inode, unsigned int from,
|
||||
unsigned int to, int new)
|
||||
{
|
||||
int ret = 0;
|
||||
struct buffer_head *head, *bh, *wait[2], **wait_bh = wait;
|
||||
|
@ -675,8 +675,7 @@ static int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
|
|||
* Ignore blocks outside of our i/o range -
|
||||
* they may belong to unallocated clusters.
|
||||
*/
|
||||
if (block_start >= to ||
|
||||
(block_start + bsize) <= from) {
|
||||
if (block_start >= to || block_end <= from) {
|
||||
if (PageUptodate(page))
|
||||
set_buffer_uptodate(bh);
|
||||
continue;
|
||||
|
@ -971,7 +970,6 @@ static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle,
|
|||
u64 v_blkno, p_blkno;
|
||||
struct address_space *mapping = file->f_mapping;
|
||||
struct inode *inode = mapping->host;
|
||||
unsigned int cbits = OCFS2_SB(inode->i_sb)->s_clustersize_bits;
|
||||
unsigned long index, start;
|
||||
struct page **cpages;
|
||||
|
||||
|
@ -979,13 +977,11 @@ static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle,
|
|||
|
||||
/*
|
||||
* Figure out how many pages we'll be manipulating here. For
|
||||
* non-allocating write, or any writes where cluster size is
|
||||
* less than page size, we only need one page. Otherwise,
|
||||
* allocating writes of cluster size larger than page size
|
||||
* need cluster size pages.
|
||||
* non allocating write, we just change the one
|
||||
* page. Otherwise, we'll need a whole clusters worth.
|
||||
*/
|
||||
if (new && !wc->w_large_pages)
|
||||
numpages = (1 << cbits) / PAGE_SIZE;
|
||||
if (new)
|
||||
numpages = ocfs2_pages_per_cluster(inode->i_sb);
|
||||
|
||||
cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS);
|
||||
if (!cpages) {
|
||||
|
|
|
@ -30,6 +30,18 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
|
|||
unsigned from,
|
||||
unsigned to);
|
||||
|
||||
int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
|
||||
struct inode *inode, unsigned int from,
|
||||
unsigned int to, int new);
|
||||
|
||||
int walk_page_buffers( handle_t *handle,
|
||||
struct buffer_head *head,
|
||||
unsigned from,
|
||||
unsigned to,
|
||||
int *partial,
|
||||
int (*fn)( handle_t *handle,
|
||||
struct buffer_head *bh));
|
||||
|
||||
struct ocfs2_write_ctxt;
|
||||
typedef int (ocfs2_page_writer)(struct inode *, struct ocfs2_write_ctxt *,
|
||||
u64 *, unsigned int *, unsigned int *);
|
||||
|
|
|
@ -262,6 +262,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
|
|||
{
|
||||
int status;
|
||||
handle_t *handle;
|
||||
struct ocfs2_dinode *di;
|
||||
|
||||
mlog_entry_void();
|
||||
|
||||
|
@ -275,12 +276,39 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
|
|||
goto out;
|
||||
}
|
||||
|
||||
status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size);
|
||||
status = ocfs2_journal_access(handle, inode, fe_bh,
|
||||
OCFS2_JOURNAL_ACCESS_WRITE);
|
||||
if (status < 0) {
|
||||
mlog_errno(status);
|
||||
goto out_commit;
|
||||
}
|
||||
|
||||
/*
|
||||
* Do this before setting i_size.
|
||||
*/
|
||||
status = ocfs2_zero_tail_for_truncate(inode, handle, new_i_size);
|
||||
if (status) {
|
||||
mlog_errno(status);
|
||||
goto out_commit;
|
||||
}
|
||||
|
||||
i_size_write(inode, new_i_size);
|
||||
inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
|
||||
inode->i_ctime = inode->i_mtime = CURRENT_TIME;
|
||||
|
||||
di = (struct ocfs2_dinode *) fe_bh->b_data;
|
||||
di->i_size = cpu_to_le64(new_i_size);
|
||||
di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
|
||||
di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
|
||||
|
||||
status = ocfs2_journal_dirty(handle, fe_bh);
|
||||
if (status < 0)
|
||||
mlog_errno(status);
|
||||
|
||||
out_commit:
|
||||
ocfs2_commit_trans(osb, handle);
|
||||
out:
|
||||
|
||||
mlog_exit(status);
|
||||
return status;
|
||||
}
|
||||
|
@ -343,7 +371,6 @@ static int ocfs2_truncate_file(struct inode *inode,
|
|||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
ocfs2_data_unlock(inode, 1);
|
||||
|
||||
/* alright, we're going to need to do a full blown alloc size
|
||||
* change. Orphan the inode so that recovery can complete the
|
||||
|
@ -352,22 +379,25 @@ static int ocfs2_truncate_file(struct inode *inode,
|
|||
status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
|
||||
if (status < 0) {
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
goto bail_unlock_data;
|
||||
}
|
||||
|
||||
status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
|
||||
if (status < 0) {
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
goto bail_unlock_data;
|
||||
}
|
||||
|
||||
status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
|
||||
if (status < 0) {
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
goto bail_unlock_data;
|
||||
}
|
||||
|
||||
/* TODO: orphan dir cleanup here. */
|
||||
bail_unlock_data:
|
||||
ocfs2_data_unlock(inode, 1);
|
||||
|
||||
bail:
|
||||
|
||||
mlog_exit(status);
|
||||
|
|
|
@ -489,12 +489,38 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
|
|||
int status = 0;
|
||||
struct ocfs2_truncate_context *tc = NULL;
|
||||
struct ocfs2_dinode *fe;
|
||||
handle_t *handle = NULL;
|
||||
|
||||
mlog_entry_void();
|
||||
|
||||
fe = (struct ocfs2_dinode *) fe_bh->b_data;
|
||||
|
||||
if (fe->i_clusters) {
|
||||
handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
|
||||
if (IS_ERR(handle)) {
|
||||
status = PTR_ERR(handle);
|
||||
mlog_errno(status);
|
||||
goto out;
|
||||
}
|
||||
|
||||
status = ocfs2_journal_access(handle, inode, fe_bh,
|
||||
OCFS2_JOURNAL_ACCESS_WRITE);
|
||||
if (status < 0) {
|
||||
mlog_errno(status);
|
||||
goto out;
|
||||
}
|
||||
|
||||
i_size_write(inode, 0);
|
||||
|
||||
status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
|
||||
if (status < 0) {
|
||||
mlog_errno(status);
|
||||
goto out;
|
||||
}
|
||||
|
||||
ocfs2_commit_trans(osb, handle);
|
||||
handle = NULL;
|
||||
|
||||
status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
|
||||
if (status < 0) {
|
||||
mlog_errno(status);
|
||||
|
@ -507,8 +533,10 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
|
|||
goto out;
|
||||
}
|
||||
}
|
||||
out:
|
||||
|
||||
out:
|
||||
if (handle)
|
||||
ocfs2_commit_trans(osb, handle);
|
||||
mlog_exit(status);
|
||||
return status;
|
||||
}
|
||||
|
|
|
@ -495,6 +495,17 @@ static inline unsigned long ocfs2_align_clusters_to_page_index(struct super_bloc
|
|||
return index;
|
||||
}
|
||||
|
||||
static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb)
|
||||
{
|
||||
unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
|
||||
unsigned int pages_per_cluster = 1;
|
||||
|
||||
if (PAGE_CACHE_SHIFT < cbits)
|
||||
pages_per_cluster = 1 << (cbits - PAGE_CACHE_SHIFT);
|
||||
|
||||
return pages_per_cluster;
|
||||
}
|
||||
|
||||
#define ocfs2_set_bit ext2_set_bit
|
||||
#define ocfs2_clear_bit ext2_clear_bit
|
||||
#define ocfs2_test_bit ext2_test_bit
|
||||
|
|
Loading…
Reference in New Issue