Commit 60b11392 authored by Mark Fasheh's avatar Mark Fasheh

ocfs2: zero tail of sparse files on truncate

Since we don't zero on extend anymore, truncate needs to be fixed up to zero
the part of a file between i_size and the end of its cluster. Otherwise a
subsequent extend could expose bad data.

This introduced a new helper, which can be used in ocfs2_write().
Signed-off-by: default avatarMark Fasheh <mark.fasheh@oracle.com>
parent 25baf2da
...@@ -27,6 +27,7 @@ ...@@ -27,6 +27,7 @@
#include <linux/types.h> #include <linux/types.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/highmem.h> #include <linux/highmem.h>
#include <linux/swap.h>
#define MLOG_MASK_PREFIX ML_DISK_ALLOC #define MLOG_MASK_PREFIX ML_DISK_ALLOC
#include <cluster/masklog.h> #include <cluster/masklog.h>
...@@ -34,6 +35,7 @@ ...@@ -34,6 +35,7 @@
#include "ocfs2.h" #include "ocfs2.h"
#include "alloc.h" #include "alloc.h"
#include "aops.h"
#include "dlmglue.h" #include "dlmglue.h"
#include "extent_map.h" #include "extent_map.h"
#include "inode.h" #include "inode.h"
...@@ -3342,6 +3344,228 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, ...@@ -3342,6 +3344,228 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
return status; return status;
} }
/*
 * walk_page_buffers() callback used by ocfs2_zero_cluster_pages() when
 * ocfs2_should_order_data() is false for the inode.
 *
 * Flags the buffer up to date and marks it dirty. 'handle' is accepted
 * only to match the callback signature and is not used. Always returns 0.
 */
static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh)
{
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
return 0;
}
/*
 * walk_page_buffers() callback used by ocfs2_zero_cluster_pages() when
 * ocfs2_should_order_data() is true for the inode.
 *
 * Flags the buffer up to date, marks it dirty and then hands it to
 * ocfs2_journal_dirty_data() under 'handle'. Returns whatever
 * ocfs2_journal_dirty_data() returns.
 */
static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
{
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
return ocfs2_journal_dirty_data(handle, bh);
}
/*
 * Zero the page cache from 'isize' to the end of its cluster and dirty the
 * buffers that back the zeroed range.
 *
 * inode:    inode whose tail is being zeroed; must live on a file system
 *           with sparse allocation enabled (enforced with BUG_ON).
 * isize:    size being truncated to. Only its offset within a page is used,
 *           as the starting offset in the first page.
 * pages:    the 'numpages' pages covering the cluster tail, as filled in by
 *           ocfs2_grab_eof_pages(). Every page is unlocked, marked accessed
 *           and has a reference dropped before this function returns.
 * phys:     block number passed on to ocfs2_map_page_blocks().
 * handle:   journal handle passed to the per-buffer callback.
 *
 * Errors from ocfs2_map_page_blocks() and walk_page_buffers() are logged
 * with mlog_errno() but are not propagated; this function returns nothing.
 */
static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
				     struct page **pages, int numpages,
				     u64 phys, handle_t *handle)
{
	int i, ret, partial;
	void *kaddr;
	struct page *page;
	unsigned int from, to = PAGE_CACHE_SIZE;
	struct super_block *sb = inode->i_sb;
	int (*zero_func)(handle_t *handle, struct buffer_head *bh);

	BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));

	if (numpages == 0)
		goto out;

	from = isize & (PAGE_CACHE_SIZE - 1); /* 1st page offset */
	if (PAGE_CACHE_SHIFT > OCFS2_SB(sb)->s_clustersize_bits) {
		/*
		 * Since 'from' has been capped to a value below page
		 * size, this calculation won't be able to overflow
		 * 'to'
		 */
		to = ocfs2_align_bytes_to_clusters(sb, from);

		/*
		 * The truncate tail in this case should never contain
		 * more than one page at maximum. The loop below also
		 * assumes this.
		 */
		BUG_ON(numpages != 1);
	}

	/* Both callbacks share a signature, so pick one up front. */
	if (ocfs2_should_order_data(inode))
		zero_func = ocfs2_ordered_zero_func;
	else
		zero_func = ocfs2_writeback_zero_func;

	for (i = 0; i < numpages; i++) {
		page = pages[i];

		BUG_ON(from > PAGE_CACHE_SIZE);
		BUG_ON(to > PAGE_CACHE_SIZE);

		ret = ocfs2_map_page_blocks(page, &phys, inode, from, to, 0);
		if (ret)
			mlog_errno(ret);

		kaddr = kmap_atomic(page, KM_USER0);
		memset(kaddr + from, 0, to - from);
		kunmap_atomic(kaddr, KM_USER0);

		/*
		 * 'partial' is a per-page result. Reset it for every page so
		 * that a partially covered first page does not stop the
		 * fully zeroed pages after it from being marked uptodate.
		 */
		partial = 0;

		/*
		 * Need to set the buffers we zero'd into uptodate
		 * here if they aren't - ocfs2_map_page_blocks()
		 * might've skipped some
		 */
		ret = walk_page_buffers(handle, page_buffers(page),
					from, to, &partial, zero_func);
		if (ret < 0)
			mlog_errno(ret);

		if (!partial)
			SetPageUptodate(page);

		flush_dcache_page(page);

		/*
		 * Every page after the 1st one should be completely zero'd.
		 */
		from = 0;
	}
out:
	/*
	 * No NULL check on 'pages' is needed: with numpages == 0 this loop
	 * does not run, and otherwise the array was already dereferenced
	 * above.
	 */
	for (i = 0; i < numpages; i++) {
		page = pages[i];
		unlock_page(page);
		mark_page_accessed(page);
		page_cache_release(page);
	}
}
/*
 * Grab the page cache pages that cover the range from 'isize' to the end
 * of the cluster containing it.
 *
 * inode:   inode being truncated; sparse allocation must be enabled
 *          (enforced with BUG_ON).
 * isize:   the size being truncated to.
 * pages:   caller-supplied array that receives the grabbed pages.
 * num:     set to the number of valid entries in 'pages'. It is 0 when
 *          'isize' is cluster aligned, when the lookup reports physical
 *          block 0 (treated as a hole), or on error.
 * phys:    receives the block number that ocfs2_extent_map_get_blocks()
 *          reports for the block containing 'isize'. It is not written
 *          when 'isize' is cluster aligned.
 *
 * Returns 0 on success, the error from ocfs2_extent_map_get_blocks(), or
 * -ENOMEM if grab_cache_page() fails. On error every page grabbed so far
 * is unlocked and released again.
 */
static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page **pages,
				int *num, u64 *phys)
{
	struct super_block *sb = inode->i_sb;
	struct address_space *mapping = inode->i_mapping;
	unsigned int csize = OCFS2_SB(inode->i_sb)->s_clustersize;
	unsigned long index;
	u64 end_index;
	int grabbed = 0, status = 0, i;

	BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));

	/* Already on a cluster boundary: there is no tail to grab. */
	if (!(isize & (csize - 1)))
		goto done;

	status = ocfs2_extent_map_get_blocks(inode,
					     isize >> sb->s_blocksize_bits,
					     phys, NULL);
	if (status) {
		mlog_errno(status);
		goto done;
	}

	/* Physical block 0 means the tail is a hole. */
	if (!*phys)
		goto done;

	index = isize >> PAGE_CACHE_SHIFT;
	end_index = ocfs2_align_bytes_to_clusters(inode->i_sb, isize)
			>> PAGE_CACHE_SHIFT;

	/*
	 * At least one page is always grabbed, even when the cluster ends
	 * inside the page that holds 'isize'.
	 */
	for (;;) {
		pages[grabbed] = grab_cache_page(mapping, index);
		if (!pages[grabbed]) {
			status = -ENOMEM;
			mlog_errno(status);
			break;
		}

		grabbed++;
		index++;
		if (index >= end_index)
			break;
	}

done:
	if (status != 0) {
		/*
		 * Entries below 'grabbed' are all non-NULL pages obtained
		 * above, so they can be dropped unconditionally.
		 */
		for (i = 0; i < grabbed; i++) {
			unlock_page(pages[i]);
			page_cache_release(pages[i]);
		}
		grabbed = 0;
	}

	*num = grabbed;

	return status;
}
/*
 * Zero everything between the new i_size and the end of the allocated
 * cluster it falls in, so that a later extend cannot expose stale,
 * nonzero data.
 *
 * This has to run before i_size is updated on the inode, because
 * block_write_full_page() skips writeout of pages past i_size. That is
 * why the target size is passed in as 'new_i_size'.
 *
 * Returns 0 on success or when nothing needs zeroing (no sparse
 * allocation support, or no pages to zero), -ENOMEM if the page array
 * cannot be allocated, or the error from ocfs2_grab_eof_pages() or
 * filemap_fdatawrite().
 */
int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
				 u64 new_i_size)
{
	int status, nr_pages;
	u64 phys_blkno;
	struct page **eof_pages;

	/*
	 * Without sparse file support the file system zeroes on every
	 * extend, so truncate has nothing to do here.
	 */
	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
		return 0;

	eof_pages = kcalloc(ocfs2_pages_per_cluster(inode->i_sb),
			    sizeof(struct page *), GFP_NOFS);
	if (eof_pages == NULL) {
		status = -ENOMEM;
		mlog_errno(status);
		return status;
	}

	status = ocfs2_grab_eof_pages(inode, new_i_size, eof_pages,
				      &nr_pages, &phys_blkno);
	if (status) {
		mlog_errno(status);
		goto free_pages;
	}

	/* No pages were grabbed, so there is nothing to zero. */
	if (nr_pages == 0)
		goto free_pages;

	ocfs2_zero_cluster_pages(inode, new_i_size, eof_pages, nr_pages,
				 phys_blkno, handle);

	/*
	 * Start writeout of the freshly zeroed pages but do not wait for
	 * it; the later truncate_inode_pages() call does the waiting.
	 */
	status = filemap_fdatawrite(inode->i_mapping);
	if (status)
		mlog_errno(status);

free_pages:
	kfree(eof_pages);

	return status;
}
/* /*
* It is expected, that by the time you call this function, * It is expected, that by the time you call this function,
* inode->i_size and fe->i_size have been adjusted. * inode->i_size and fe->i_size have been adjusted.
......
...@@ -71,6 +71,8 @@ struct ocfs2_truncate_context { ...@@ -71,6 +71,8 @@ struct ocfs2_truncate_context {
struct buffer_head *tc_last_eb_bh; struct buffer_head *tc_last_eb_bh;
}; };
int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
u64 new_i_size);
int ocfs2_prepare_truncate(struct ocfs2_super *osb, int ocfs2_prepare_truncate(struct ocfs2_super *osb,
struct inode *inode, struct inode *inode,
struct buffer_head *fe_bh, struct buffer_head *fe_bh,
......
...@@ -308,7 +308,7 @@ int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page, ...@@ -308,7 +308,7 @@ int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
* functionality yet, but IMHO it's better to cut and paste the whole * functionality yet, but IMHO it's better to cut and paste the whole
* thing so we can avoid introducing our own bugs (and easily pick up * thing so we can avoid introducing our own bugs (and easily pick up
* their fixes when they happen) --Mark */ * their fixes when they happen) --Mark */
static int walk_page_buffers( handle_t *handle, int walk_page_buffers( handle_t *handle,
struct buffer_head *head, struct buffer_head *head,
unsigned from, unsigned from,
unsigned to, unsigned to,
...@@ -654,7 +654,7 @@ static void ocfs2_clear_page_regions(struct page *page, ...@@ -654,7 +654,7 @@ static void ocfs2_clear_page_regions(struct page *page,
* *
* This will also skip zeroing, which is handled externally. * This will also skip zeroing, which is handled externally.
*/ */
static int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
struct inode *inode, unsigned int from, struct inode *inode, unsigned int from,
unsigned int to, int new) unsigned int to, int new)
{ {
...@@ -675,8 +675,7 @@ static int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, ...@@ -675,8 +675,7 @@ static int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
* Ignore blocks outside of our i/o range - * Ignore blocks outside of our i/o range -
* they may belong to unallocated clusters. * they may belong to unallocated clusters.
*/ */
if (block_start >= to || if (block_start >= to || block_end <= from) {
(block_start + bsize) <= from) {
if (PageUptodate(page)) if (PageUptodate(page))
set_buffer_uptodate(bh); set_buffer_uptodate(bh);
continue; continue;
...@@ -971,7 +970,6 @@ static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle, ...@@ -971,7 +970,6 @@ static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle,
u64 v_blkno, p_blkno; u64 v_blkno, p_blkno;
struct address_space *mapping = file->f_mapping; struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host; struct inode *inode = mapping->host;
unsigned int cbits = OCFS2_SB(inode->i_sb)->s_clustersize_bits;
unsigned long index, start; unsigned long index, start;
struct page **cpages; struct page **cpages;
...@@ -979,13 +977,11 @@ static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle, ...@@ -979,13 +977,11 @@ static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle,
/* /*
* Figure out how many pages we'll be manipulating here. For * Figure out how many pages we'll be manipulating here. For
* non-allocating write, or any writes where cluster size is * non allocating write, we just change the one
* less than page size, we only need one page. Otherwise, * page. Otherwise, we'll need a whole clusters worth.
* allocating writes of cluster size larger than page size
* need cluster size pages.
*/ */
if (new && !wc->w_large_pages) if (new)
numpages = (1 << cbits) / PAGE_SIZE; numpages = ocfs2_pages_per_cluster(inode->i_sb);
cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS); cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS);
if (!cpages) { if (!cpages) {
......
...@@ -30,6 +30,18 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode, ...@@ -30,6 +30,18 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
unsigned from, unsigned from,
unsigned to); unsigned to);
int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
struct inode *inode, unsigned int from,
unsigned int to, int new);
int walk_page_buffers( handle_t *handle,
struct buffer_head *head,
unsigned from,
unsigned to,
int *partial,
int (*fn)( handle_t *handle,
struct buffer_head *bh));
struct ocfs2_write_ctxt; struct ocfs2_write_ctxt;
typedef int (ocfs2_page_writer)(struct inode *, struct ocfs2_write_ctxt *, typedef int (ocfs2_page_writer)(struct inode *, struct ocfs2_write_ctxt *,
u64 *, unsigned int *, unsigned int *); u64 *, unsigned int *, unsigned int *);
......
...@@ -262,6 +262,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, ...@@ -262,6 +262,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
{ {
int status; int status;
handle_t *handle; handle_t *handle;
struct ocfs2_dinode *di;
mlog_entry_void(); mlog_entry_void();
...@@ -275,12 +276,39 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, ...@@ -275,12 +276,39 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
goto out; goto out;
} }
status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size); status = ocfs2_journal_access(handle, inode, fe_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto out_commit;
}
/*
* Do this before setting i_size.
*/
status = ocfs2_zero_tail_for_truncate(inode, handle, new_i_size);
if (status) {
mlog_errno(status);
goto out_commit;
}
i_size_write(inode, new_i_size);
inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
inode->i_ctime = inode->i_mtime = CURRENT_TIME;
di = (struct ocfs2_dinode *) fe_bh->b_data;
di->i_size = cpu_to_le64(new_i_size);
di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
status = ocfs2_journal_dirty(handle, fe_bh);
if (status < 0) if (status < 0)
mlog_errno(status); mlog_errno(status);
out_commit:
ocfs2_commit_trans(osb, handle); ocfs2_commit_trans(osb, handle);
out: out:
mlog_exit(status); mlog_exit(status);
return status; return status;
} }
...@@ -343,7 +371,6 @@ static int ocfs2_truncate_file(struct inode *inode, ...@@ -343,7 +371,6 @@ static int ocfs2_truncate_file(struct inode *inode,
mlog_errno(status); mlog_errno(status);
goto bail; goto bail;
} }
ocfs2_data_unlock(inode, 1);
/* alright, we're going to need to do a full blown alloc size /* alright, we're going to need to do a full blown alloc size
* change. Orphan the inode so that recovery can complete the * change. Orphan the inode so that recovery can complete the
...@@ -352,22 +379,25 @@ static int ocfs2_truncate_file(struct inode *inode, ...@@ -352,22 +379,25 @@ static int ocfs2_truncate_file(struct inode *inode,
status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
if (status < 0) { if (status < 0) {
mlog_errno(status); mlog_errno(status);
goto bail; goto bail_unlock_data;
} }
status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
if (status < 0) { if (status < 0) {
mlog_errno(status); mlog_errno(status);
goto bail; goto bail_unlock_data;
} }
status = ocfs2_commit_truncate(osb, inode, di_bh, tc); status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
if (status < 0) { if (status < 0) {
mlog_errno(status); mlog_errno(status);
goto bail; goto bail_unlock_data;
} }
/* TODO: orphan dir cleanup here. */ /* TODO: orphan dir cleanup here. */
bail_unlock_data:
ocfs2_data_unlock(inode, 1);
bail: bail:
mlog_exit(status); mlog_exit(status);
......
...@@ -489,12 +489,38 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, ...@@ -489,12 +489,38 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
int status = 0; int status = 0;
struct ocfs2_truncate_context *tc = NULL; struct ocfs2_truncate_context *tc = NULL;
struct ocfs2_dinode *fe; struct ocfs2_dinode *fe;
handle_t *handle = NULL;
mlog_entry_void(); mlog_entry_void();
fe = (struct ocfs2_dinode *) fe_bh->b_data; fe = (struct ocfs2_dinode *) fe_bh->b_data;
if (fe->i_clusters) { if (fe->i_clusters) {
handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
mlog_errno(status);
goto out;
}
status = ocfs2_journal_access(handle, inode, fe_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto out;
}
i_size_write(inode, 0);
status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
if (status < 0) {
mlog_errno(status);
goto out;
}
ocfs2_commit_trans(osb, handle);
handle = NULL;
status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc); status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
if (status < 0) { if (status < 0) {
mlog_errno(status); mlog_errno(status);
...@@ -507,8 +533,10 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, ...@@ -507,8 +533,10 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
goto out; goto out;
} }
} }
out:
out:
if (handle)
ocfs2_commit_trans(osb, handle);
mlog_exit(status); mlog_exit(status);
return status; return status;
} }
......
...@@ -495,6 +495,17 @@ static inline unsigned long ocfs2_align_clusters_to_page_index(struct super_bloc ...@@ -495,6 +495,17 @@ static inline unsigned long ocfs2_align_clusters_to_page_index(struct super_bloc
return index; return index;
} }
/*
 * Number of page cache pages needed to cover one cluster.
 *
 * Returns 1 when the cluster size is no larger than the page cache page
 * size, otherwise 2^(s_clustersize_bits - PAGE_CACHE_SHIFT).
 */
static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb)
{
	unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;

	/* A cluster that fits inside one page still occupies that page. */
	if (cbits <= PAGE_CACHE_SHIFT)
		return 1;

	return 1 << (cbits - PAGE_CACHE_SHIFT);
}
#define ocfs2_set_bit ext2_set_bit #define ocfs2_set_bit ext2_set_bit
#define ocfs2_clear_bit ext2_clear_bit #define ocfs2_clear_bit ext2_clear_bit
#define ocfs2_test_bit ext2_test_bit #define ocfs2_test_bit ext2_test_bit
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment