Commit 363041a5 authored by Mark Fasheh

ocfs2: temporarily remove extent map caching

The code in extent_map.c is not prepared to deal with a subtree being
rotated between lookups. This can happen when filling holes in sparse files.
Instead of a lengthy patch to update the code (which would likely lose the
benefit of caching subtree roots), we remove most of the algorithms and
implement a simple path based lookup. A less ambitious extent caching scheme
will be added in a later patch.
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
parent dcd0538f
......@@ -1146,9 +1146,8 @@ static void find_leaf_ins(void *data, struct buffer_head *bh)
*
* This function doesn't handle non btree extent lists.
*/
static int ocfs2_find_leaf(struct inode *inode,
struct ocfs2_extent_list *root_el, u32 cpos,
struct buffer_head **leaf_bh)
int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
u32 cpos, struct buffer_head **leaf_bh)
{
int ret;
struct buffer_head *bh = NULL;
......
......@@ -80,4 +80,7 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
struct buffer_head *fe_bh,
struct ocfs2_truncate_context *tc);
int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
u32 cpos, struct buffer_head **leaf_bh);
#endif /* OCFS2_ALLOC_H */
......@@ -158,8 +158,7 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
if (err)
goto bail;
err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
NULL);
err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, NULL);
if (err) {
mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
"%llu, NULL)\n", err, inode, (unsigned long long)iblock,
......@@ -499,8 +498,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
down_read(&OCFS2_I(inode)->ip_alloc_sem);
}
err = ocfs2_extent_map_get_blocks(inode, block, 1, &p_blkno,
NULL);
err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL);
if (!INODE_JOURNAL(inode)) {
up_read(&OCFS2_I(inode)->ip_alloc_sem);
......@@ -574,7 +572,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
/* This figures out the size of the next contiguous block, and
* our logical offset */
ret = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
&contig_blocks);
if (ret) {
mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
......
......@@ -379,7 +379,7 @@ int ocfs2_do_extend_dir(struct super_block *sb,
status = ocfs2_extent_map_get_blocks(dir, (dir->i_blocks >>
(sb->s_blocksize_bits - 9)),
1, &p_blkno, NULL);
&p_blkno, NULL);
if (status < 0) {
mlog_errno(status);
goto bail;
......
......@@ -1614,10 +1614,6 @@ static int ocfs2_meta_lock_update(struct inode *inode,
* for the inode metadata. */
ocfs2_metadata_cache_purge(inode);
/* will do nothing for inode types that don't use the extent
* map (bitmap files, etc) */
ocfs2_extent_map_trunc(inode, 0);
if (ocfs2_meta_lvb_is_trustable(inode, lockres)) {
mlog(0, "Trusting LVB on inode %llu\n",
(unsigned long long)oi->ip_blkno);
......
......@@ -3,8 +3,7 @@
*
* extent_map.c
*
* In-memory extent map for OCFS2. Man, this code was prettier in
* the library.
* Block/Cluster mapping functions
*
* Copyright (C) 2004 Oracle. All rights reserved.
*
......@@ -26,1016 +25,155 @@
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#define MLOG_MASK_PREFIX ML_EXTENT_MAP
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "alloc.h"
#include "extent_map.h"
#include "inode.h"
#include "super.h"
#include "buffer_head_io.h"
/*
* SUCK SUCK SUCK
* Our headers are so bad that struct ocfs2_extent_map is in ocfs.h
*/
/*
 * One cached extent record. Entries live in the rbtree rooted at
 * ocfs2_extent_map.em_extents and are allocated from ocfs2_em_ent_cachep.
 */
struct ocfs2_extent_map_entry {
/* Linkage into em_extents; the entry is recovered from it via rb_entry(). */
struct rb_node e_node;
/*
 * l_tree_depth of the extent list the record was taken from. Zero marks
 * a leaf record; the lookup path only hands back depth-0 entries directly.
 */
int e_tree_depth;
/* Copy of the extent record; fields are read with le32/le64_to_cpu(). */
struct ocfs2_extent_rec e_rec;
};
/*
 * Scratch state shared between ocfs2_extent_map_insert() and
 * ocfs2_extent_map_try_insert() across -EAGAIN retries. Whatever entries
 * are still referenced here when the insert finishes are freed by the
 * caller.
 */
struct ocfs2_em_insert_context {
/* Set by try_insert when the existing entry starts before the new record. */
int need_left;
/* Set by try_insert when the existing entry ends after the new record. */
int need_right;
/* Entry being inserted; cleared to NULL once it is linked into the tree. */
struct ocfs2_extent_map_entry *new_ent;
/* Overlapping entry that was erased from the tree and awaits freeing. */
struct ocfs2_extent_map_entry *old_ent;
/* Preallocated entry for the part of old_ent left of the new record. */
struct ocfs2_extent_map_entry *left_ent;
/* Preallocated entry for the part of old_ent right of the new record. */
struct ocfs2_extent_map_entry *right_ent;
};
static struct kmem_cache *ocfs2_em_ent_cachep = NULL;
static struct ocfs2_extent_map_entry *
ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
u32 cpos, u32 clusters,
struct rb_node ***ret_p,
struct rb_node **ret_parent);
static int ocfs2_extent_map_insert(struct inode *inode,
struct ocfs2_extent_rec *rec,
int tree_depth);
static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
struct ocfs2_extent_map_entry *ent);
static int ocfs2_extent_map_find_leaf(struct inode *inode,
u32 cpos, u32 clusters,
struct ocfs2_extent_list *el);
static int ocfs2_extent_map_lookup_read(struct inode *inode,
u32 cpos, u32 clusters,
struct ocfs2_extent_map_entry **ret_ent);
static int ocfs2_extent_map_try_insert(struct inode *inode,
struct ocfs2_extent_rec *rec,
int tree_depth,
struct ocfs2_em_insert_context *ctxt);
/*
 * Test whether @rec fully covers the cluster range
 * [@cpos, @cpos + @clusters).
 *
 * Returns 1 only if the record starts at or before @cpos and its end point
 * (e_cpos + e_clusters) is at or past the end point of the requested range;
 * returns 0 otherwise.
 *
 * The check is done on offsets relative to the start of the record instead
 * of on absolute end points. Both "cpos + clusters" and
 * "e_cpos + e_clusters" are u32 sums that can wrap around, and a wrapped
 * end point would make an out-of-range request look contained.
 */
static int ocfs2_extent_rec_contains_clusters(struct ocfs2_extent_rec *rec,
					      u32 cpos, u32 clusters)
{
	u32 rec_cpos = le32_to_cpu(rec->e_cpos);
	u32 rec_clusters = le32_to_cpu(rec->e_clusters);
	u32 offset;

	if (rec_cpos > cpos)
		return 0;

	/* cpos >= rec_cpos here, so the subtraction cannot underflow. */
	offset = cpos - rec_cpos;
	if (offset > rec_clusters)
		return 0;

	/* Same as "cpos + clusters <= record end", minus the u32 wrap. */
	if (clusters > rec_clusters - offset)
		return 0;

	return 1;
}
/*
* Find an entry in the tree that intersects the region passed in.
* Note that this will find straddled intervals, it is up to the
* callers to enforce any boundary conditions.
*
* Callers must hold ip_lock. This lookup is not guaranteed to return
* a tree_depth 0 match, and as such can race inserts if the lock
* were not held.
* Return the index of the extent record which contains cluster #v_cluster.
* -1 is returned if it was not found.
*
* The rb_node garbage lets insertion share the search. Trivial
* callers pass NULL.
* Should work fine on interior and exterior nodes.
*/
static struct ocfs2_extent_map_entry *
ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
u32 cpos, u32 clusters,
struct rb_node ***ret_p,
struct rb_node **ret_parent)
static int ocfs2_search_extent_list(struct ocfs2_extent_list *el,
u32 v_cluster)
{
struct rb_node **p = &em->em_extents.rb_node;
struct rb_node *parent = NULL;
struct ocfs2_extent_map_entry *ent = NULL;
while (*p)
{
parent = *p;
ent = rb_entry(parent, struct ocfs2_extent_map_entry,
e_node);
if ((cpos + clusters) <= le32_to_cpu(ent->e_rec.e_cpos)) {
p = &(*p)->rb_left;
ent = NULL;
} else if (cpos >= (le32_to_cpu(ent->e_rec.e_cpos) +
le32_to_cpu(ent->e_rec.e_clusters))) {
p = &(*p)->rb_right;
ent = NULL;
} else
break;
}
if (ret_p != NULL)
*ret_p = p;
if (ret_parent != NULL)
*ret_parent = parent;
return ent;
}
/*
* Find the leaf containing the interval we want. While we're on our
* way down the tree, fill in every record we see at any depth, because
* we might want it later.
*
* Note that this code is run without ip_lock. That's because it
* sleeps while reading. If someone is also filling the extent list at
* the same time we are, we might have to restart.
*/
static int ocfs2_extent_map_find_leaf(struct inode *inode,
u32 cpos, u32 clusters,
struct ocfs2_extent_list *el)
{
int i, ret;
struct buffer_head *eb_bh = NULL;
u64 blkno;
u32 rec_end;
struct ocfs2_extent_block *eb;
int ret = -1;
int i;
struct ocfs2_extent_rec *rec;
u32 rec_end, rec_start;
/*
* The bh data containing the el cannot change here, because
* we hold alloc_sem. So we can do this without other
* locks.
*/
while (el->l_tree_depth)
{
blkno = 0;
for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
rec = &el->l_recs[i];
rec_end = (le32_to_cpu(rec->e_cpos) +
le32_to_cpu(rec->e_clusters));
ret = -EBADR;
if (rec_end > OCFS2_I(inode)->ip_clusters) {
mlog_errno(ret);
ocfs2_error(inode->i_sb,
"Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n",
i,
(unsigned long long)le64_to_cpu(rec->e_blkno),
(unsigned long long)OCFS2_I(inode)->ip_blkno,
OCFS2_I(inode)->ip_clusters);
goto out_free;
}
if (rec_end <= cpos) {
ret = ocfs2_extent_map_insert(inode, rec,
le16_to_cpu(el->l_tree_depth));
if (ret && (ret != -EEXIST)) {
mlog_errno(ret);
goto out_free;
}
continue;
}
if ((cpos + clusters) <= le32_to_cpu(rec->e_cpos)) {
ret = ocfs2_extent_map_insert(inode, rec,
le16_to_cpu(el->l_tree_depth));
if (ret && (ret != -EEXIST)) {
mlog_errno(ret);
goto out_free;
}
continue;
}
/*
* We've found a record that matches our
* interval. We don't insert it because we're
* about to traverse it.
*/
/* Check to see if we're stradling */
ret = -ESRCH;
if (!ocfs2_extent_rec_contains_clusters(rec,
cpos,
clusters)) {
mlog_errno(ret);
goto out_free;
}
/*
* If we've already found a record, the el has
* two records covering the same interval.
* EEEK!
*/
ret = -EBADR;
if (blkno) {
mlog_errno(ret);
ocfs2_error(inode->i_sb,
"Multiple extents for (cpos = %u, clusters = %u) on inode %llu; e_blkno %llu and rec %d at e_blkno %llu\n",
cpos, clusters,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
(unsigned long long)blkno, i,
(unsigned long long)le64_to_cpu(rec->e_blkno));
goto out_free;
}
blkno = le64_to_cpu(rec->e_blkno);
}
/*
* We don't support holes, and we're still up
* in the branches, so we'd better have found someone
*/
ret = -EBADR;
if (!blkno) {
ocfs2_error(inode->i_sb,
"No record found for (cpos = %u, clusters = %u) on inode %llu\n",
cpos, clusters,
(unsigned long long)OCFS2_I(inode)->ip_blkno);
mlog_errno(ret);
goto out_free;
}
if (eb_bh) {
brelse(eb_bh);
eb_bh = NULL;
}
ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
blkno, &eb_bh, OCFS2_BH_CACHED,
inode);
if (ret) {
mlog_errno(ret);
goto out_free;
}
eb = (struct ocfs2_extent_block *)eb_bh->b_data;
if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
ret = -EIO;
goto out_free;
}
el = &eb->h_list;
}
BUG_ON(el->l_tree_depth);
for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
rec = &el->l_recs[i];
if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) >
OCFS2_I(inode)->ip_clusters) {
ret = -EBADR;
mlog_errno(ret);
ocfs2_error(inode->i_sb,
"Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n",
i,
(unsigned long long)le64_to_cpu(rec->e_blkno),
(unsigned long long)OCFS2_I(inode)->ip_blkno,
OCFS2_I(inode)->ip_clusters);
return ret;
}
rec_start = le32_to_cpu(rec->e_cpos);
rec_end = rec_start + le32_to_cpu(rec->e_clusters);
ret = ocfs2_extent_map_insert(inode, rec,
le16_to_cpu(el->l_tree_depth));
if (ret && (ret != -EEXIST)) {
mlog_errno(ret);
goto out_free;
if (v_cluster >= rec_start && v_cluster < rec_end) {
ret = i;
break;
}
}
ret = 0;
out_free:
if (eb_bh)
brelse(eb_bh);
return ret;
}
/*
* This lookup actually will read from disk. It has one invariant:
* It will never re-traverse blocks. This means that all inserts should
* be new regions or more granular regions (both allowed by insert).
*/
static int ocfs2_extent_map_lookup_read(struct inode *inode,
u32 cpos,
u32 clusters,
struct ocfs2_extent_map_entry **ret_ent)
static int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
u32 *p_cluster, u32 *num_clusters)
{
int ret;
u64 blkno;
struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
struct ocfs2_extent_map_entry *ent;
struct buffer_head *bh = NULL;
struct ocfs2_extent_block *eb;
int ret, i;
struct buffer_head *di_bh = NULL;
struct buffer_head *eb_bh = NULL;
struct ocfs2_dinode *di;
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *el;
struct ocfs2_extent_rec *rec;
u32 coff;
spin_lock(&OCFS2_I(inode)->ip_lock);
ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
if (ent) {
if (!ent->e_tree_depth) {
spin_unlock(&OCFS2_I(inode)->ip_lock);
*ret_ent = ent;
return 0;
}
blkno = le64_to_cpu(ent->e_rec.e_blkno);
spin_unlock(&OCFS2_I(inode)->ip_lock);
ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, &bh,
OCFS2_BH_CACHED, inode);
if (ret) {
mlog_errno(ret);
if (bh)
brelse(bh);
return ret;
}
eb = (struct ocfs2_extent_block *)bh->b_data;
if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
brelse(bh);
return -EIO;
}
el = &eb->h_list;
} else {
spin_unlock(&OCFS2_I(inode)->ip_lock);
ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
OCFS2_I(inode)->ip_blkno, &bh,
OCFS2_BH_CACHED, inode);
if (ret) {
mlog_errno(ret);
if (bh)
brelse(bh);
return ret;
}
di = (struct ocfs2_dinode *)bh->b_data;
if (!OCFS2_IS_VALID_DINODE(di)) {
brelse(bh);
OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, di);
return -EIO;
}
el = &di->id2.i_list;
}
ret = ocfs2_extent_map_find_leaf(inode, cpos, clusters, el);
brelse(bh);
ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno,
&di_bh, OCFS2_BH_CACHED, inode);
if (ret) {
mlog_errno(ret);
return ret;
goto out;
}
ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
if (!ent) {
ret = -ESRCH;
mlog_errno(ret);
return ret;
}
/* FIXME: Make sure this isn't a corruption */
BUG_ON(ent->e_tree_depth);
di = (struct ocfs2_dinode *) di_bh->b_data;
el = &di->id2.i_list;
*ret_ent = ent;
return 0;
}
/*
* Callers must hold ip_lock. This can insert pieces of the tree,
* thus racing lookup if the lock weren't held.
*/
static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
struct ocfs2_extent_map_entry *ent)
{
struct rb_node **p, *parent;
struct ocfs2_extent_map_entry *old_ent;
old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(ent->e_rec.e_cpos),
le32_to_cpu(ent->e_rec.e_clusters),
&p, &parent);
if (old_ent)
return -EEXIST;
rb_link_node(&ent->e_node, parent, p);
rb_insert_color(&ent->e_node, &em->em_extents);
return 0;
}
/*
* Simple rule: on any return code other than -EAGAIN, anything left
* in the insert_context will be freed.
*
* Simple rule #2: A return code of -EEXIST from this function or
* its calls to ocfs2_extent_map_insert_entry() signifies that another
* thread beat us to the insert. It is not an actual error, but it
* tells the caller we have no more work to do.
*/
static int ocfs2_extent_map_try_insert(struct inode *inode,
struct ocfs2_extent_rec *rec,
int tree_depth,
struct ocfs2_em_insert_context *ctxt)
{
int ret;
struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
struct ocfs2_extent_map_entry *old_ent;
ctxt->need_left = 0;
ctxt->need_right = 0;
ctxt->old_ent = NULL;
spin_lock(&OCFS2_I(inode)->ip_lock);
ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
if (!ret) {
ctxt->new_ent = NULL;
goto out_unlock;
}
/* Since insert_entry failed, the map MUST have old_ent */
old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos),
le32_to_cpu(rec->e_clusters),
NULL, NULL);
BUG_ON(!old_ent);
if (el->l_tree_depth) {
ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh);
if (ret) {
mlog_errno(ret);
goto out;
}
if (old_ent->e_tree_depth < tree_depth) {
/* Another thread beat us to the lower tree_depth */
ret = -EEXIST;
goto out_unlock;
eb = (struct ocfs2_extent_block *) eb_bh->b_data;
el = &eb->h_list;
}
if (old_ent->e_tree_depth == tree_depth) {
i = ocfs2_search_extent_list(el, v_cluster);
if (i == -1) {
/*
* Another thread beat us to this tree_depth.
* Let's make sure we agree with that thread (the
* extent_rec should be identical).
* A hole was found. Return some canned values that
* callers can key on.
*/
if (!memcmp(rec, &old_ent->e_rec,
sizeof(struct ocfs2_extent_rec)))
ret = 0;
else
/* FIXME: Should this be ESRCH/EBADR??? */
ret = -EEXIST;
*p_cluster = 0;
if (num_clusters)
*num_clusters = 1;
} else {
rec = &el->l_recs[i];
goto out_unlock;
}
BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
/*
* We do it in this order specifically so that no actual tree
* changes occur until we have all the pieces we need. We
* don't want malloc failures to leave an inconsistent tree.
* Whenever we drop the lock, another process could be
* inserting. Also note that, if another process just beat us
* to an insert, we might not need the same pieces we needed
* the first go round. In the end, the pieces we need will
* be used, and the pieces we don't will be freed.
*/
ctxt->need_left = !!(le32_to_cpu(rec->e_cpos) >
le32_to_cpu(old_ent->e_rec.e_cpos));
ctxt->need_right = !!((le32_to_cpu(old_ent->e_rec.e_cpos) +
le32_to_cpu(old_ent->e_rec.e_clusters)) >
(le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)));
ret = -EAGAIN;
if (ctxt->need_left) {
if (!ctxt->left_ent)
goto out_unlock;
*(ctxt->left_ent) = *old_ent;
ctxt->left_ent->e_rec.e_clusters =
cpu_to_le32(le32_to_cpu(rec->e_cpos) -
le32_to_cpu(ctxt->left_ent->e_rec.e_cpos));
}
if (ctxt->need_right) {
if (!ctxt->right_ent)
goto out_unlock;
*(ctxt->right_ent) = *old_ent;
ctxt->right_ent->e_rec.e_cpos =
cpu_to_le32(le32_to_cpu(rec->e_cpos) +
if (!rec->e_blkno) {
ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
"record (%u, %u, 0)", inode->i_ino,
le32_to_cpu(rec->e_cpos),
le32_to_cpu(rec->e_clusters));
ctxt->right_ent->e_rec.e_clusters =
cpu_to_le32((le32_to_cpu(old_ent->e_rec.e_cpos) +
le32_to_cpu(old_ent->e_rec.e_clusters)) -
le32_to_cpu(ctxt->right_ent->e_rec.e_cpos));
}
rb_erase(&old_ent->e_node, &em->em_extents);
/* Now that he's erased, set him up for deletion */
ctxt->old_ent = old_ent;
if (ctxt->need_left) {
ret = ocfs2_extent_map_insert_entry(em,
ctxt->left_ent);
if (ret)
goto out_unlock;
ctxt->left_ent = NULL;
}
if (ctxt->need_right) {
ret = ocfs2_extent_map_insert_entry(em,
ctxt->right_ent);
if (ret)
goto out_unlock;
ctxt->right_ent = NULL;
}
ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
if (!ret)
ctxt->new_ent = NULL;
out_unlock:
spin_unlock(&OCFS2_I(inode)->ip_lock);
return ret;
}
static int ocfs2_extent_map_insert(struct inode *inode,
struct ocfs2_extent_rec *rec,
int tree_depth)
{
int ret;
struct ocfs2_em_insert_context ctxt = {0, };
if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) >
OCFS2_I(inode)->ip_map.em_clusters) {
ret = -EBADR;
mlog_errno(ret);
return ret;
}
/* Zero e_clusters means a truncated tail record. It better be EOF */
if (!rec->e_clusters) {
if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) !=
OCFS2_I(inode)->ip_map.em_clusters) {
ret = -EBADR;
mlog_errno(ret);
ocfs2_error(inode->i_sb,
"Zero e_clusters on non-tail extent record at e_blkno %llu on inode %llu\n",
(unsigned long long)le64_to_cpu(rec->e_blkno),
(unsigned long long)OCFS2_I(inode)->ip_blkno);
return ret;
}
/* Ignore the truncated tail */
return 0;
}
ret = -ENOMEM;
ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep,
GFP_NOFS);
if (!ctxt.new_ent) {
mlog_errno(ret);
return ret;
}
ctxt.new_ent->e_rec = *rec;
ctxt.new_ent->e_tree_depth = tree_depth;
do {
ret = -ENOMEM;
if (ctxt.need_left && !ctxt.left_ent) {
ctxt.left_ent =
kmem_cache_alloc(ocfs2_em_ent_cachep,
GFP_NOFS);
if (!ctxt.left_ent)
break;
}
if (ctxt.need_right && !ctxt.right_ent) {
ctxt.right_ent =
kmem_cache_alloc(ocfs2_em_ent_cachep,
GFP_NOFS);
if (!ctxt.right_ent)
break;
ret = -EROFS;
goto out;
}
ret = ocfs2_extent_map_try_insert(inode, rec,
tree_depth, &ctxt);
} while (ret == -EAGAIN);
coff = v_cluster - le32_to_cpu(rec->e_cpos);
if ((ret < 0) && (ret != -EEXIST))
mlog_errno(ret);
if (ctxt.left_ent)
kmem_cache_free(ocfs2_em_ent_cachep, ctxt.left_ent);
if (ctxt.right_ent)
kmem_cache_free(ocfs2_em_ent_cachep, ctxt.right_ent);
if (ctxt.old_ent)
kmem_cache_free(ocfs2_em_ent_cachep, ctxt.old_ent);
if (ctxt.new_ent)
kmem_cache_free(ocfs2_em_ent_cachep, ctxt.new_ent);
return ret;
}
*p_cluster = ocfs2_blocks_to_clusters(inode->i_sb,
le64_to_cpu(rec->e_blkno));
*p_cluster = *p_cluster + coff;
/*
* Append this record to the tail of the extent map. It must be
* tree_depth 0. The record might be an extension of an existing
* record, and as such that needs to be handled. eg:
*
* Existing record in the extent map:
*
* cpos = 10, len = 10
* |---------|
*
* New Record:
*
* cpos = 10, len = 20
* |------------------|
*
* The passed record is the new on-disk record. The new_clusters value
* is how many clusters were added to the file. If the append is a
* contiguous append, the new_clusters has been added to
* rec->e_clusters. If the append is an entirely new extent, then
* rec->e_clusters is == new_clusters.
*/
int ocfs2_extent_map_append(struct inode *inode,
struct ocfs2_extent_rec *rec,
u32 new_clusters)
{
int ret;
struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
struct ocfs2_extent_map_entry *ent;
struct ocfs2_extent_rec *old;
BUG_ON(!new_clusters);
BUG_ON(le32_to_cpu(rec->e_clusters) < new_clusters);
if (em->em_clusters < OCFS2_I(inode)->ip_clusters) {
/*
* Size changed underneath us on disk. Drop any
* straddling records and update our idea of
* i_clusters
*/
ocfs2_extent_map_drop(inode, em->em_clusters - 1);
em->em_clusters = OCFS2_I(inode)->ip_clusters;
if (num_clusters)
*num_clusters = le32_to_cpu(rec->e_clusters) - coff;
}
mlog_bug_on_msg((le32_to_cpu(rec->e_cpos) +
le32_to_cpu(rec->e_clusters)) !=
(em->em_clusters + new_clusters),
"Inode %llu:\n"
"rec->e_cpos = %u + rec->e_clusters = %u = %u\n"
"em->em_clusters = %u + new_clusters = %u = %u\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters),
le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters),
em->em_clusters, new_clusters,
em->em_clusters + new_clusters);
em->em_clusters += new_clusters;
ret = -ENOENT;
if (le32_to_cpu(rec->e_clusters) > new_clusters) {
/* This is a contiguous append */
ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), 1,
NULL, NULL);
if (ent) {
old = &ent->e_rec;
BUG_ON((le32_to_cpu(rec->e_cpos) +
le32_to_cpu(rec->e_clusters)) !=
(le32_to_cpu(old->e_cpos) +
le32_to_cpu(old->e_clusters) +
new_clusters));
if (ent->e_tree_depth == 0) {
BUG_ON(le32_to_cpu(old->e_cpos) !=
le32_to_cpu(rec->e_cpos));
BUG_ON(le64_to_cpu(old->e_blkno) !=
le64_to_cpu(rec->e_blkno));
ret = 0;
}
/*
* Let non-leafs fall through as -ENOENT to
* force insertion of the new leaf.
*/
le32_add_cpu(&old->e_clusters, new_clusters);
}
}
if (ret == -ENOENT)
ret = ocfs2_extent_map_insert(inode, rec, 0);
if (ret < 0)
mlog_errno(ret);
out:
brelse(di_bh);
brelse(eb_bh);
return ret;
}
#if 0
/* Code here is included but defined out as it completes the extent
* map api and may be used in the future. */
/*
 * Find the cached record that contains cluster offset @cpos.
 *
 * The record handed back through @rec belongs to the extent map: do not
 * free it, and be aware that any change made to it is a change to the
 * map. For example, if the last extent is (cpos = 10, clusters = 10) and
 * the file is being truncated by 5 clusters, a caller can do:
 *
 *	ret = ocfs2_extent_map_get_rec(inode, orig_size - 5, &rec, NULL);
 *	rec->e_clusters -= 5;
 *
 * This lookup never reads from disk; a region that has not been filled
 * into the map yet is simply not found.
 *
 * alloc_sem is expected to be held, since the allocation must not change
 * while the map is being updated. The returned record is only valid until
 * alloc_sem is dropped -- after that truncate and extend can happen.
 * Caveat Emptor.
 *
 * If @tree_depth is non-NULL it receives the tree depth of the entry.
 *
 * Returns 0 on a hit, -EINVAL if @cpos is at or past ip_clusters, and
 * -ENOENT if no cached entry covers @cpos. *@rec is NULL on any failure.
 */
int ocfs2_extent_map_get_rec(struct inode *inode, u32 cpos,
			     struct ocfs2_extent_rec **rec,
			     int *tree_depth)
{
	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
	struct ocfs2_extent_map_entry *found;

	*rec = NULL;

	if (cpos >= OCFS2_I(inode)->ip_clusters)
		return -EINVAL;

	if (cpos >= em->em_clusters) {
		/*
		 * The on-disk size changed underneath us. Forget any
		 * straddling records and resync our idea of i_clusters.
		 */
		ocfs2_extent_map_drop(inode, em->em_clusters - 1);
		em->em_clusters = OCFS2_I(inode)->ip_clusters;
	}

	found = ocfs2_extent_map_lookup(em, cpos, 1, NULL, NULL);
	if (!found)
		return -ENOENT;

	*rec = &found->e_rec;
	if (tree_depth)
		*tree_depth = found->e_tree_depth;

	return 0;
}
/*
 * Translate the virtual cluster range starting at @v_cpos (@count clusters
 * long) into a physical cluster offset.
 *
 * On success *@p_cpos holds the physical cluster matching @v_cpos and, if
 * @ret_count is non-NULL, *@ret_count holds how many clusters remain in the
 * containing record from that point on. *@p_cpos is zeroed on entry, so it
 * is 0 on every failure path.
 *
 * Returns 0 on success, -EINVAL if the range runs past ip_clusters, -ESRCH
 * if the range straddles the record that was found, -ENOENT if the lookup
 * produced no entry, or the error from ocfs2_extent_map_lookup_read().
 */
int ocfs2_extent_map_get_clusters(struct inode *inode,
				  u32 v_cpos, int count,
				  u32 *p_cpos, int *ret_count)
{
	int status;
	u32 offset;
	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
	struct ocfs2_extent_map_entry *hit = NULL;

	*p_cpos = 0;

	if ((v_cpos + count) > OCFS2_I(inode)->ip_clusters)
		return -EINVAL;

	if ((v_cpos + count) > em->em_clusters) {
		/*
		 * The on-disk size changed underneath us. Forget any
		 * straddling records and resync our idea of i_clusters.
		 */
		ocfs2_extent_map_drop(inode, em->em_clusters - 1);
		em->em_clusters = OCFS2_I(inode)->ip_clusters;
	}

	status = ocfs2_extent_map_lookup_read(inode, v_cpos, count, &hit);
	if (status)
		return status;
	if (!hit)
		return -ENOENT;

	/* A request must never straddle the edge of a record. */
	if (!ocfs2_extent_rec_contains_clusters(&hit->e_rec, v_cpos, count))
		return -ESRCH;

	offset = v_cpos - le32_to_cpu(hit->e_rec.e_cpos);
	*p_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
					   le64_to_cpu(hit->e_rec.e_blkno)) +
		  offset;
	if (ret_count)
		*ret_count = le32_to_cpu(hit->e_rec.e_clusters) - offset;

	return 0;
}
#endif /* 0 */
int ocfs2_extent_map_get_blocks(struct inode *inode,
u64 v_blkno, int count,
u64 *p_blkno, int *ret_count)
int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
int *ret_count)
{
int ret;
u64 boff;
u32 cpos, clusters;
int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
struct ocfs2_extent_map_entry *ent = NULL;
struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
struct ocfs2_extent_rec *rec;
*p_blkno = 0;
u32 cpos, num_clusters, p_cluster;
u64 boff = 0;
cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno);
clusters = ocfs2_blocks_to_clusters(inode->i_sb,
(u64)count + bpc - 1);
if ((cpos + clusters) > OCFS2_I(inode)->ip_clusters) {
ret = -EINVAL;
mlog_errno(ret);
return ret;
}
if ((cpos + clusters) > em->em_clusters) {
/*
* Size changed underneath us on disk. Drop any
* straddling records and update our idea of
* i_clusters
*/
ocfs2_extent_map_drop(inode, em->em_clusters - 1);
em->em_clusters = OCFS2_I(inode)->ip_clusters;
}
ret = ocfs2_extent_map_lookup_read(inode, cpos, clusters, &ent);
ret = ocfs2_get_clusters(inode, cpos, &p_cluster, &num_clusters);
if (ret) {
mlog_errno(ret);
return ret;
goto out;
}
if (ent)
{
rec = &ent->e_rec;
/* We should never find ourselves straddling an interval */
if (!ocfs2_extent_rec_contains_clusters(rec, cpos, clusters)) {
ret = -ESRCH;
mlog_errno(ret);
return ret;
}
boff = ocfs2_clusters_to_blocks(inode->i_sb, cpos -
le32_to_cpu(rec->e_cpos));
/*
* p_cluster == 0 indicates a hole.
*/
if (p_cluster) {
boff = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
boff += (v_blkno & (u64)(bpc - 1));
*p_blkno = le64_to_cpu(rec->e_blkno) + boff;
if (ret_count) {
*ret_count = ocfs2_clusters_to_blocks(inode->i_sb,
le32_to_cpu(rec->e_clusters)) - boff;
}
return 0;
}
return -ENOENT;
}
int ocfs2_extent_map_init(struct inode *inode)
{
struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
*p_blkno = boff;
em->em_extents = RB_ROOT;
em->em_clusters = 0;
return 0;
}
/* Needs the lock */
static void __ocfs2_extent_map_drop(struct inode *inode,
u32 new_clusters,
struct rb_node **free_head,
struct ocfs2_extent_map_entry **tail_ent)
{
struct rb_node *node, *next;
struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
struct ocfs2_extent_map_entry *ent;
*free_head = NULL;
ent = NULL;
node = rb_last(&em->em_extents);
while (node)
{
next = rb_prev(node);
ent = rb_entry(node, struct ocfs2_extent_map_entry,
e_node);
if (le32_to_cpu(ent->e_rec.e_cpos) < new_clusters)
break;
rb_erase(&ent->e_node, &em->em_extents);
node->rb_right = *free_head;
*free_head = node;
ent = NULL;
node = next;
}
/* Do we have an entry straddling new_clusters? */
if (tail_ent) {
if (ent &&
((le32_to_cpu(ent->e_rec.e_cpos) +
le32_to_cpu(ent->e_rec.e_clusters)) > new_clusters))
*tail_ent = ent;
else
*tail_ent = NULL;
if (ret_count) {
*ret_count = ocfs2_clusters_to_blocks(inode->i_sb, num_clusters);
*ret_count -= v_blkno & (u64)(bpc - 1);
}
}
/*
 * Free every entry on a list of nodes already removed from the extent
 * map. The list is singly linked through each node's rb_right pointer and
 * ends at NULL; an empty list (NULL @free_head) is a no-op.
 */
static void __ocfs2_extent_map_drop_cleanup(struct rb_node *free_head)
{
	struct rb_node *next;

	for (; free_head; free_head = next) {
		/* Pick up the link first -- the node dies with its entry. */
		next = free_head->rb_right;

		kmem_cache_free(ocfs2_em_ent_cachep,
				rb_entry(free_head,
					 struct ocfs2_extent_map_entry,
					 e_node));
	}
}
/*
 * Forget every cached entry at or beyond @new_clusters, including an entry
 * that merely straddles @new_clusters. In effect this is a cache forget.
 *
 * Use ocfs2_extent_map_trunc() instead if the straddling extent should be
 * clipped rather than discarded.
 *
 * ip_clusters is neither checked nor modified here. Always returns 0.
 */
int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters)
{
	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
	struct ocfs2_extent_map_entry *straddler;
	struct rb_node *doomed = NULL;

	spin_lock(&OCFS2_I(inode)->ip_lock);

	__ocfs2_extent_map_drop(inode, new_clusters, &doomed, &straddler);
	if (straddler) {
		/* Unhook the straddling entry and push it on the free list. */
		rb_erase(&straddler->e_node, &em->em_extents);
		straddler->e_node.rb_right = doomed;
		doomed = &straddler->e_node;
	}

	spin_unlock(&OCFS2_I(inode)->ip_lock);

	/* The entries are freed only after ip_lock has been released. */
	if (doomed)
		__ocfs2_extent_map_drop_cleanup(doomed);

	return 0;
}
/*
 * Shrink the extent map to @new_clusters: entries that start at or beyond
 * @new_clusters are removed, and an entry straddling @new_clusters, if
 * any, is clipped so that it ends there. em_clusters is set to
 * @new_clusters.
 *
 * ip_clusters is neither checked nor modified here. Always returns 0.
 */
int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters)
{
	struct ocfs2_extent_map_entry *tail = NULL;
	struct rb_node *doomed = NULL;

	spin_lock(&OCFS2_I(inode)->ip_lock);

	__ocfs2_extent_map_drop(inode, new_clusters, &doomed, &tail);
	if (tail) {
		u32 tail_start = le32_to_cpu(tail->e_rec.e_cpos);

		tail->e_rec.e_clusters = cpu_to_le32(new_clusters - tail_start);
	}
	OCFS2_I(inode)->ip_map.em_clusters = new_clusters;

	spin_unlock(&OCFS2_I(inode)->ip_lock);

	/* The entries are freed only after ip_lock has been released. */
	if (doomed)
		__ocfs2_extent_map_drop_cleanup(doomed);

	return 0;
}
/*
 * Create the slab cache that extent map entries are allocated from.
 *
 * Returns 0 on success or -ENOMEM if the cache could not be created.
 */
int __init init_ocfs2_extent_maps(void)
{
	ocfs2_em_ent_cachep = kmem_cache_create("ocfs2_em_ent",
				sizeof(struct ocfs2_extent_map_entry),
				0, SLAB_HWCACHE_ALIGN, NULL, NULL);

	return ocfs2_em_ent_cachep ? 0 : -ENOMEM;
}
void exit_ocfs2_extent_maps(void)
{
kmem_cache_destroy(ocfs2_em_ent_cachep);
out:
return ret;
}
......@@ -25,22 +25,7 @@
#ifndef _EXTENT_MAP_H
#define _EXTENT_MAP_H
int init_ocfs2_extent_maps(void);
void exit_ocfs2_extent_maps(void);
/*
* EVERY CALL here except _init, _trunc, and _drop expects alloc_sem
* to be held. The allocation cannot change at all while the map is
* in the process of being updated.
*/
int ocfs2_extent_map_init(struct inode *inode);
int ocfs2_extent_map_append(struct inode *inode,
struct ocfs2_extent_rec *rec,
u32 new_clusters);
int ocfs2_extent_map_get_blocks(struct inode *inode,
u64 v_blkno, int count,
u64 *p_blkno, int *ret_count);
int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters);
int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters);
int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
int *ret_count);
#endif /* _EXTENT_MAP_H */
......@@ -1003,9 +1003,6 @@ void ocfs2_clear_inode(struct inode *inode)
"Clear inode of %llu, inode has io markers\n",
(unsigned long long)oi->ip_blkno);
ocfs2_extent_map_drop(inode, 0);
ocfs2_extent_map_init(inode);
status = ocfs2_drop_inode_locks(inode);
if (status < 0)
mlog_errno(status);
......@@ -1102,8 +1099,7 @@ struct buffer_head *ocfs2_bread(struct inode *inode,
return NULL;
}
tmperr = ocfs2_extent_map_get_blocks(inode, block, 1,
&p_blkno, NULL);
tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL);
if (tmperr < 0) {
mlog_errno(tmperr);
goto fail;
......
......@@ -43,7 +43,6 @@ struct ocfs2_inode_info
spinlock_t ip_lock;
u32 ip_open_count;
u32 ip_clusters;
struct ocfs2_extent_map ip_map;
struct list_head ip_io_markers;
struct mutex ip_io_mutex;
......
......@@ -670,8 +670,7 @@ static int ocfs2_force_read_journal(struct inode *inode)
(inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))) {
status = ocfs2_extent_map_get_blocks(inode, v_blkno,
1, &p_blkno,
&p_blocks);
&p_blkno, &p_blocks);
if (status < 0) {
mlog_errno(status);
goto bail;
......
......@@ -1511,8 +1511,7 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
goto bail;
}
status = ocfs2_extent_map_get_blocks(inode, 0, 1, &p_blkno,
&p_blocks);
status = ocfs2_extent_map_get_blocks(inode, 0, &p_blkno, &p_blocks);
if (status < 0) {
mlog_errno(status);
goto bail;
......
......@@ -46,11 +46,6 @@
#include "endian.h"
#include "ocfs2_lockid.h"
/*
 * In-memory extent map, embedded in struct ocfs2_inode_info as ip_map.
 */
struct ocfs2_extent_map {
/*
 * Cluster count the map currently describes. Inserts reject records that
 * end past it; it is resynced to ip_clusters when the size is found to
 * have changed on disk, and it grows as extents are appended.
 */
u32 em_clusters;
/* Root of the rbtree of struct ocfs2_extent_map_entry nodes. */
struct rb_root em_extents;
};
/* Most user visible OCFS2 inodes will have very few pieces of
* metadata, but larger files (including bitmaps, etc) must be taken
* into account when designing an access scheme. We allow a small
......
......@@ -197,7 +197,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
goto bail;
}
status = ocfs2_extent_map_get_blocks(inode, 0ULL, 1, &blkno, NULL);
status = ocfs2_extent_map_get_blocks(inode, 0ULL, &blkno, NULL);
if (status < 0) {
mlog_errno(status);
goto bail;
......
......@@ -806,9 +806,6 @@ static int __init ocfs2_init(void)
ocfs2_print_version();
if (init_ocfs2_extent_maps())
return -ENOMEM;
status = init_ocfs2_uptodate_cache();
if (status < 0) {
mlog_errno(status);
......@@ -837,7 +834,6 @@ static int __init ocfs2_init(void)
if (status < 0) {
ocfs2_free_mem_caches();
exit_ocfs2_uptodate_cache();
exit_ocfs2_extent_maps();
}
mlog_exit(status);
......@@ -863,8 +859,6 @@ static void __exit ocfs2_exit(void)
unregister_filesystem(&ocfs2_fs_type);
exit_ocfs2_extent_maps();
exit_ocfs2_uptodate_cache();
mlog_exit_void();
......@@ -948,7 +942,6 @@ static void ocfs2_inode_init_once(void *data,
oi->ip_flags = 0;
oi->ip_open_count = 0;
spin_lock_init(&oi->ip_lock);
ocfs2_extent_map_init(&oi->vfs_inode);
INIT_LIST_HEAD(&oi->ip_io_markers);
oi->ip_created_trans = 0;
oi->ip_last_trans = 0;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment