Commit 363041a5 authored by Mark Fasheh

ocfs2: temporarily remove extent map caching

The code in extent_map.c is not prepared to deal with a subtree being
rotated between lookups. This can happen when filling holes in sparse files.
Instead of a lengthy patch to update the code (which would likely lose the
benefit of caching subtree roots), we remove most of the algorithms and
implement a simple path based lookup. A less ambitious extent caching scheme
will be added in a later patch.
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
parent dcd0538f
...@@ -1146,9 +1146,8 @@ static void find_leaf_ins(void *data, struct buffer_head *bh) ...@@ -1146,9 +1146,8 @@ static void find_leaf_ins(void *data, struct buffer_head *bh)
* *
* This function doesn't handle non btree extent lists. * This function doesn't handle non btree extent lists.
*/ */
static int ocfs2_find_leaf(struct inode *inode, int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
struct ocfs2_extent_list *root_el, u32 cpos, u32 cpos, struct buffer_head **leaf_bh)
struct buffer_head **leaf_bh)
{ {
int ret; int ret;
struct buffer_head *bh = NULL; struct buffer_head *bh = NULL;
......
...@@ -80,4 +80,7 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb, ...@@ -80,4 +80,7 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
struct buffer_head *fe_bh, struct buffer_head *fe_bh,
struct ocfs2_truncate_context *tc); struct ocfs2_truncate_context *tc);
int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
u32 cpos, struct buffer_head **leaf_bh);
#endif /* OCFS2_ALLOC_H */ #endif /* OCFS2_ALLOC_H */
...@@ -158,8 +158,7 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock, ...@@ -158,8 +158,7 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
if (err) if (err)
goto bail; goto bail;
err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno, err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, NULL);
NULL);
if (err) { if (err) {
mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, " mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
"%llu, NULL)\n", err, inode, (unsigned long long)iblock, "%llu, NULL)\n", err, inode, (unsigned long long)iblock,
...@@ -499,8 +498,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) ...@@ -499,8 +498,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
down_read(&OCFS2_I(inode)->ip_alloc_sem); down_read(&OCFS2_I(inode)->ip_alloc_sem);
} }
err = ocfs2_extent_map_get_blocks(inode, block, 1, &p_blkno, err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL);
NULL);
if (!INODE_JOURNAL(inode)) { if (!INODE_JOURNAL(inode)) {
up_read(&OCFS2_I(inode)->ip_alloc_sem); up_read(&OCFS2_I(inode)->ip_alloc_sem);
...@@ -574,7 +572,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, ...@@ -574,7 +572,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
/* This figures out the size of the next contiguous block, and /* This figures out the size of the next contiguous block, and
* our logical offset */ * our logical offset */
ret = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno, ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
&contig_blocks); &contig_blocks);
if (ret) { if (ret) {
mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
......
...@@ -379,7 +379,7 @@ int ocfs2_do_extend_dir(struct super_block *sb, ...@@ -379,7 +379,7 @@ int ocfs2_do_extend_dir(struct super_block *sb,
status = ocfs2_extent_map_get_blocks(dir, (dir->i_blocks >> status = ocfs2_extent_map_get_blocks(dir, (dir->i_blocks >>
(sb->s_blocksize_bits - 9)), (sb->s_blocksize_bits - 9)),
1, &p_blkno, NULL); &p_blkno, NULL);
if (status < 0) { if (status < 0) {
mlog_errno(status); mlog_errno(status);
goto bail; goto bail;
......
...@@ -1614,10 +1614,6 @@ static int ocfs2_meta_lock_update(struct inode *inode, ...@@ -1614,10 +1614,6 @@ static int ocfs2_meta_lock_update(struct inode *inode,
* for the inode metadata. */ * for the inode metadata. */
ocfs2_metadata_cache_purge(inode); ocfs2_metadata_cache_purge(inode);
/* will do nothing for inode types that don't use the extent
* map (bitmap files, etc) */
ocfs2_extent_map_trunc(inode, 0);
if (ocfs2_meta_lvb_is_trustable(inode, lockres)) { if (ocfs2_meta_lvb_is_trustable(inode, lockres)) {
mlog(0, "Trusting LVB on inode %llu\n", mlog(0, "Trusting LVB on inode %llu\n",
(unsigned long long)oi->ip_blkno); (unsigned long long)oi->ip_blkno);
......
...@@ -3,8 +3,7 @@ ...@@ -3,8 +3,7 @@
* *
* extent_map.c * extent_map.c
* *
* In-memory extent map for OCFS2. Man, this code was prettier in * Block/Cluster mapping functions
* the library.
* *
* Copyright (C) 2004 Oracle. All rights reserved. * Copyright (C) 2004 Oracle. All rights reserved.
* *
...@@ -26,1016 +25,155 @@ ...@@ -26,1016 +25,155 @@
#include <linux/fs.h> #include <linux/fs.h>
#include <linux/init.h> #include <linux/init.h>
#include <linux/types.h> #include <linux/types.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#define MLOG_MASK_PREFIX ML_EXTENT_MAP #define MLOG_MASK_PREFIX ML_EXTENT_MAP
#include <cluster/masklog.h> #include <cluster/masklog.h>
#include "ocfs2.h" #include "ocfs2.h"
#include "alloc.h"
#include "extent_map.h" #include "extent_map.h"
#include "inode.h" #include "inode.h"
#include "super.h" #include "super.h"
#include "buffer_head_io.h" #include "buffer_head_io.h"
/*
* SUCK SUCK SUCK
* Our headers are so bad that struct ocfs2_extent_map is in ocfs.h
*/
struct ocfs2_extent_map_entry {
struct rb_node e_node;
int e_tree_depth;
struct ocfs2_extent_rec e_rec;
};
struct ocfs2_em_insert_context {
int need_left;
int need_right;
struct ocfs2_extent_map_entry *new_ent;
struct ocfs2_extent_map_entry *old_ent;
struct ocfs2_extent_map_entry *left_ent;
struct ocfs2_extent_map_entry *right_ent;
};
static struct kmem_cache *ocfs2_em_ent_cachep = NULL;
static struct ocfs2_extent_map_entry *
ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
u32 cpos, u32 clusters,
struct rb_node ***ret_p,
struct rb_node **ret_parent);
static int ocfs2_extent_map_insert(struct inode *inode,
struct ocfs2_extent_rec *rec,
int tree_depth);
static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
struct ocfs2_extent_map_entry *ent);
static int ocfs2_extent_map_find_leaf(struct inode *inode,
u32 cpos, u32 clusters,
struct ocfs2_extent_list *el);
static int ocfs2_extent_map_lookup_read(struct inode *inode,
u32 cpos, u32 clusters,
struct ocfs2_extent_map_entry **ret_ent);
static int ocfs2_extent_map_try_insert(struct inode *inode,
struct ocfs2_extent_rec *rec,
int tree_depth,
struct ocfs2_em_insert_context *ctxt);
/* returns 1 only if the rec contains all the given clusters -- that is that
* rec's cpos is <= the cluster cpos and that the rec endpoint (cpos +
* clusters) is >= the argument's endpoint */
static int ocfs2_extent_rec_contains_clusters(struct ocfs2_extent_rec *rec,
u32 cpos, u32 clusters)
{
if (le32_to_cpu(rec->e_cpos) > cpos)
return 0;
if (cpos + clusters > le32_to_cpu(rec->e_cpos) +
le32_to_cpu(rec->e_clusters))
return 0;
return 1;
}
/* /*
* Find an entry in the tree that intersects the region passed in. * Return the index of the extent record which contains cluster #v_cluster.
* Note that this will find straddled intervals, it is up to the * -1 is returned if it was not found.
* callers to enforce any boundary conditions.
* *
* Callers must hold ip_lock. This lookup is not guaranteed to return * Should work fine on interior and exterior nodes.
* a tree_depth 0 match, and as such can race inserts if the lock
* were not held.
*
* The rb_node garbage lets insertion share the search. Trivial
* callers pass NULL.
*/ */
static struct ocfs2_extent_map_entry * static int ocfs2_search_extent_list(struct ocfs2_extent_list *el,
ocfs2_extent_map_lookup(struct ocfs2_extent_map *em, u32 v_cluster)
u32 cpos, u32 clusters,
struct rb_node ***ret_p,
struct rb_node **ret_parent)
{ {
struct rb_node **p = &em->em_extents.rb_node; int ret = -1;
struct rb_node *parent = NULL; int i;
struct ocfs2_extent_map_entry *ent = NULL;
while (*p)
{
parent = *p;
ent = rb_entry(parent, struct ocfs2_extent_map_entry,
e_node);
if ((cpos + clusters) <= le32_to_cpu(ent->e_rec.e_cpos)) {
p = &(*p)->rb_left;
ent = NULL;
} else if (cpos >= (le32_to_cpu(ent->e_rec.e_cpos) +
le32_to_cpu(ent->e_rec.e_clusters))) {
p = &(*p)->rb_right;
ent = NULL;
} else
break;
}
if (ret_p != NULL)
*ret_p = p;
if (ret_parent != NULL)
*ret_parent = parent;
return ent;
}
/*
* Find the leaf containing the interval we want. While we're on our
* way down the tree, fill in every record we see at any depth, because
* we might want it later.
*
* Note that this code is run without ip_lock. That's because it
* sleeps while reading. If someone is also filling the extent list at
* the same time we are, we might have to restart.
*/
static int ocfs2_extent_map_find_leaf(struct inode *inode,
u32 cpos, u32 clusters,
struct ocfs2_extent_list *el)
{
int i, ret;
struct buffer_head *eb_bh = NULL;
u64 blkno;
u32 rec_end;
struct ocfs2_extent_block *eb;
struct ocfs2_extent_rec *rec; struct ocfs2_extent_rec *rec;
u32 rec_end, rec_start;
/* for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
* The bh data containing the el cannot change here, because
* we hold alloc_sem. So we can do this without other
* locks.
*/
while (el->l_tree_depth)
{
blkno = 0;
for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
rec = &el->l_recs[i];
rec_end = (le32_to_cpu(rec->e_cpos) +
le32_to_cpu(rec->e_clusters));
ret = -EBADR;
if (rec_end > OCFS2_I(inode)->ip_clusters) {
mlog_errno(ret);
ocfs2_error(inode->i_sb,
"Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n",
i,
(unsigned long long)le64_to_cpu(rec->e_blkno),
(unsigned long long)OCFS2_I(inode)->ip_blkno,
OCFS2_I(inode)->ip_clusters);
goto out_free;
}
if (rec_end <= cpos) {
ret = ocfs2_extent_map_insert(inode, rec,
le16_to_cpu(el->l_tree_depth));
if (ret && (ret != -EEXIST)) {
mlog_errno(ret);
goto out_free;
}
continue;
}
if ((cpos + clusters) <= le32_to_cpu(rec->e_cpos)) {
ret = ocfs2_extent_map_insert(inode, rec,
le16_to_cpu(el->l_tree_depth));
if (ret && (ret != -EEXIST)) {
mlog_errno(ret);
goto out_free;
}
continue;
}
/*
* We've found a record that matches our
* interval. We don't insert it because we're
* about to traverse it.
*/
/* Check to see if we're stradling */
ret = -ESRCH;
if (!ocfs2_extent_rec_contains_clusters(rec,
cpos,
clusters)) {
mlog_errno(ret);
goto out_free;
}
/*
* If we've already found a record, the el has
* two records covering the same interval.
* EEEK!
*/
ret = -EBADR;
if (blkno) {
mlog_errno(ret);
ocfs2_error(inode->i_sb,
"Multiple extents for (cpos = %u, clusters = %u) on inode %llu; e_blkno %llu and rec %d at e_blkno %llu\n",
cpos, clusters,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
(unsigned long long)blkno, i,
(unsigned long long)le64_to_cpu(rec->e_blkno));
goto out_free;
}
blkno = le64_to_cpu(rec->e_blkno);
}
/*
* We don't support holes, and we're still up
* in the branches, so we'd better have found someone
*/
ret = -EBADR;
if (!blkno) {
ocfs2_error(inode->i_sb,
"No record found for (cpos = %u, clusters = %u) on inode %llu\n",
cpos, clusters,
(unsigned long long)OCFS2_I(inode)->ip_blkno);
mlog_errno(ret);
goto out_free;
}
if (eb_bh) {
brelse(eb_bh);
eb_bh = NULL;
}
ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
blkno, &eb_bh, OCFS2_BH_CACHED,
inode);
if (ret) {
mlog_errno(ret);
goto out_free;
}
eb = (struct ocfs2_extent_block *)eb_bh->b_data;
if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
ret = -EIO;
goto out_free;
}
el = &eb->h_list;
}
BUG_ON(el->l_tree_depth);
for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
rec = &el->l_recs[i]; rec = &el->l_recs[i];
if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) > rec_start = le32_to_cpu(rec->e_cpos);
OCFS2_I(inode)->ip_clusters) { rec_end = rec_start + le32_to_cpu(rec->e_clusters);
ret = -EBADR;
mlog_errno(ret);
ocfs2_error(inode->i_sb,
"Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n",
i,
(unsigned long long)le64_to_cpu(rec->e_blkno),
(unsigned long long)OCFS2_I(inode)->ip_blkno,
OCFS2_I(inode)->ip_clusters);
return ret;
}
ret = ocfs2_extent_map_insert(inode, rec, if (v_cluster >= rec_start && v_cluster < rec_end) {
le16_to_cpu(el->l_tree_depth)); ret = i;
if (ret && (ret != -EEXIST)) { break;
mlog_errno(ret);
goto out_free;
} }
} }
ret = 0;
out_free:
if (eb_bh)
brelse(eb_bh);
return ret; return ret;
} }
/* static int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
* This lookup actually will read from disk. It has one invariant: u32 *p_cluster, u32 *num_clusters)
* It will never re-traverse blocks. This means that all inserts should
* be new regions or more granular regions (both allowed by insert).
*/
static int ocfs2_extent_map_lookup_read(struct inode *inode,
u32 cpos,
u32 clusters,
struct ocfs2_extent_map_entry **ret_ent)
{ {
int ret; int ret, i;
u64 blkno; struct buffer_head *di_bh = NULL;
struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; struct buffer_head *eb_bh = NULL;
struct ocfs2_extent_map_entry *ent;
struct buffer_head *bh = NULL;
struct ocfs2_extent_block *eb;
struct ocfs2_dinode *di; struct ocfs2_dinode *di;
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *el; struct ocfs2_extent_list *el;
struct ocfs2_extent_rec *rec;
u32 coff;
spin_lock(&OCFS2_I(inode)->ip_lock); ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno,
ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL); &di_bh, OCFS2_BH_CACHED, inode);
if (ent) {
if (!ent->e_tree_depth) {
spin_unlock(&OCFS2_I(inode)->ip_lock);
*ret_ent = ent;
return 0;
}
blkno = le64_to_cpu(ent->e_rec.e_blkno);
spin_unlock(&OCFS2_I(inode)->ip_lock);
ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, &bh,
OCFS2_BH_CACHED, inode);
if (ret) { if (ret) {
mlog_errno(ret); mlog_errno(ret);
if (bh) goto out;
brelse(bh);
return ret;
}
eb = (struct ocfs2_extent_block *)bh->b_data;
if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
brelse(bh);
return -EIO;
} }
el = &eb->h_list;
} else {
spin_unlock(&OCFS2_I(inode)->ip_lock);
ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), di = (struct ocfs2_dinode *) di_bh->b_data;
OCFS2_I(inode)->ip_blkno, &bh,
OCFS2_BH_CACHED, inode);
if (ret) {
mlog_errno(ret);
if (bh)
brelse(bh);
return ret;
}
di = (struct ocfs2_dinode *)bh->b_data;
if (!OCFS2_IS_VALID_DINODE(di)) {
brelse(bh);
OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, di);
return -EIO;
}
el = &di->id2.i_list; el = &di->id2.i_list;
}
ret = ocfs2_extent_map_find_leaf(inode, cpos, clusters, el); if (el->l_tree_depth) {
brelse(bh); ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh);
if (ret) { if (ret) {
mlog_errno(ret); mlog_errno(ret);
return ret; goto out;
}
ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
if (!ent) {
ret = -ESRCH;
mlog_errno(ret);
return ret;
}
/* FIXME: Make sure this isn't a corruption */
BUG_ON(ent->e_tree_depth);
*ret_ent = ent;
return 0;
}
/*
* Callers must hold ip_lock. This can insert pieces of the tree,
* thus racing lookup if the lock weren't held.
*/
static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
struct ocfs2_extent_map_entry *ent)
{
struct rb_node **p, *parent;
struct ocfs2_extent_map_entry *old_ent;
old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(ent->e_rec.e_cpos),
le32_to_cpu(ent->e_rec.e_clusters),
&p, &parent);
if (old_ent)
return -EEXIST;
rb_link_node(&ent->e_node, parent, p);
rb_insert_color(&ent->e_node, &em->em_extents);
return 0;
}
/*
* Simple rule: on any return code other than -EAGAIN, anything left
* in the insert_context will be freed.
*
* Simple rule #2: A return code of -EEXIST from this function or
* its calls to ocfs2_extent_map_insert_entry() signifies that another
* thread beat us to the insert. It is not an actual error, but it
* tells the caller we have no more work to do.
*/
static int ocfs2_extent_map_try_insert(struct inode *inode,
struct ocfs2_extent_rec *rec,
int tree_depth,
struct ocfs2_em_insert_context *ctxt)
{
int ret;
struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
struct ocfs2_extent_map_entry *old_ent;
ctxt->need_left = 0;
ctxt->need_right = 0;
ctxt->old_ent = NULL;
spin_lock(&OCFS2_I(inode)->ip_lock);
ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
if (!ret) {
ctxt->new_ent = NULL;
goto out_unlock;
} }
/* Since insert_entry failed, the map MUST have old_ent */ eb = (struct ocfs2_extent_block *) eb_bh->b_data;
old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), el = &eb->h_list;
le32_to_cpu(rec->e_clusters),
NULL, NULL);
BUG_ON(!old_ent);
if (old_ent->e_tree_depth < tree_depth) {
/* Another thread beat us to the lower tree_depth */
ret = -EEXIST;
goto out_unlock;
} }
if (old_ent->e_tree_depth == tree_depth) { i = ocfs2_search_extent_list(el, v_cluster);
if (i == -1) {
/* /*
* Another thread beat us to this tree_depth. * A hole was found. Return some canned values that
* Let's make sure we agree with that thread (the * callers can key on.
* extent_rec should be identical).
*/ */
if (!memcmp(rec, &old_ent->e_rec, *p_cluster = 0;
sizeof(struct ocfs2_extent_rec))) if (num_clusters)
ret = 0; *num_clusters = 1;
else } else {
/* FIXME: Should this be ESRCH/EBADR??? */ rec = &el->l_recs[i];
ret = -EEXIST;
goto out_unlock; BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
}
/* if (!rec->e_blkno) {
* We do it in this order specifically so that no actual tree ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
* changes occur until we have all the pieces we need. We "record (%u, %u, 0)", inode->i_ino,
* don't want malloc failures to leave an inconsistent tree. le32_to_cpu(rec->e_cpos),
* Whenever we drop the lock, another process could be
* inserting. Also note that, if another process just beat us
* to an insert, we might not need the same pieces we needed
* the first go round. In the end, the pieces we need will
* be used, and the pieces we don't will be freed.
*/
ctxt->need_left = !!(le32_to_cpu(rec->e_cpos) >
le32_to_cpu(old_ent->e_rec.e_cpos));
ctxt->need_right = !!((le32_to_cpu(old_ent->e_rec.e_cpos) +
le32_to_cpu(old_ent->e_rec.e_clusters)) >
(le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)));
ret = -EAGAIN;
if (ctxt->need_left) {
if (!ctxt->left_ent)
goto out_unlock;
*(ctxt->left_ent) = *old_ent;
ctxt->left_ent->e_rec.e_clusters =
cpu_to_le32(le32_to_cpu(rec->e_cpos) -
le32_to_cpu(ctxt->left_ent->e_rec.e_cpos));
}
if (ctxt->need_right) {
if (!ctxt->right_ent)
goto out_unlock;
*(ctxt->right_ent) = *old_ent;
ctxt->right_ent->e_rec.e_cpos =
cpu_to_le32(le32_to_cpu(rec->e_cpos) +
le32_to_cpu(rec->e_clusters)); le32_to_cpu(rec->e_clusters));
ctxt->right_ent->e_rec.e_clusters = ret = -EROFS;
cpu_to_le32((le32_to_cpu(old_ent->e_rec.e_cpos) + goto out;
le32_to_cpu(old_ent->e_rec.e_clusters)) -
le32_to_cpu(ctxt->right_ent->e_rec.e_cpos));
}
rb_erase(&old_ent->e_node, &em->em_extents);
/* Now that he's erased, set him up for deletion */
ctxt->old_ent = old_ent;
if (ctxt->need_left) {
ret = ocfs2_extent_map_insert_entry(em,
ctxt->left_ent);
if (ret)
goto out_unlock;
ctxt->left_ent = NULL;
}
if (ctxt->need_right) {
ret = ocfs2_extent_map_insert_entry(em,
ctxt->right_ent);
if (ret)
goto out_unlock;
ctxt->right_ent = NULL;
}
ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
if (!ret)
ctxt->new_ent = NULL;
out_unlock:
spin_unlock(&OCFS2_I(inode)->ip_lock);
return ret;
}
static int ocfs2_extent_map_insert(struct inode *inode,
struct ocfs2_extent_rec *rec,
int tree_depth)
{
int ret;
struct ocfs2_em_insert_context ctxt = {0, };
if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) >
OCFS2_I(inode)->ip_map.em_clusters) {
ret = -EBADR;
mlog_errno(ret);
return ret;
}
/* Zero e_clusters means a truncated tail record. It better be EOF */
if (!rec->e_clusters) {
if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) !=
OCFS2_I(inode)->ip_map.em_clusters) {
ret = -EBADR;
mlog_errno(ret);
ocfs2_error(inode->i_sb,
"Zero e_clusters on non-tail extent record at e_blkno %llu on inode %llu\n",
(unsigned long long)le64_to_cpu(rec->e_blkno),
(unsigned long long)OCFS2_I(inode)->ip_blkno);
return ret;
}
/* Ignore the truncated tail */
return 0;
}
ret = -ENOMEM;
ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep,
GFP_NOFS);
if (!ctxt.new_ent) {
mlog_errno(ret);
return ret;
}
ctxt.new_ent->e_rec = *rec;
ctxt.new_ent->e_tree_depth = tree_depth;
do {
ret = -ENOMEM;
if (ctxt.need_left && !ctxt.left_ent) {
ctxt.left_ent =
kmem_cache_alloc(ocfs2_em_ent_cachep,
GFP_NOFS);
if (!ctxt.left_ent)
break;
}
if (ctxt.need_right && !ctxt.right_ent) {
ctxt.right_ent =
kmem_cache_alloc(ocfs2_em_ent_cachep,
GFP_NOFS);
if (!ctxt.right_ent)
break;
} }
ret = ocfs2_extent_map_try_insert(inode, rec, coff = v_cluster - le32_to_cpu(rec->e_cpos);
tree_depth, &ctxt);
} while (ret == -EAGAIN);
if ((ret < 0) && (ret != -EEXIST))
mlog_errno(ret);
if (ctxt.left_ent)
kmem_cache_free(ocfs2_em_ent_cachep, ctxt.left_ent);
if (ctxt.right_ent)
kmem_cache_free(ocfs2_em_ent_cachep, ctxt.right_ent);
if (ctxt.old_ent)
kmem_cache_free(ocfs2_em_ent_cachep, ctxt.old_ent);
if (ctxt.new_ent)
kmem_cache_free(ocfs2_em_ent_cachep, ctxt.new_ent);
return ret;
}
/* *p_cluster = ocfs2_blocks_to_clusters(inode->i_sb,
* Append this record to the tail of the extent map. It must be
* tree_depth 0. The record might be an extension of an existing
* record, and as such that needs to be handled. eg:
*
* Existing record in the extent map:
*
* cpos = 10, len = 10
* |---------|
*
* New Record:
*
* cpos = 10, len = 20
* |------------------|
*
* The passed record is the new on-disk record. The new_clusters value
* is how many clusters were added to the file. If the append is a
* contiguous append, the new_clusters has been added to
* rec->e_clusters. If the append is an entirely new extent, then
* rec->e_clusters is == new_clusters.
*/
int ocfs2_extent_map_append(struct inode *inode,
struct ocfs2_extent_rec *rec,
u32 new_clusters)
{
int ret;
struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
struct ocfs2_extent_map_entry *ent;
struct ocfs2_extent_rec *old;
BUG_ON(!new_clusters);
BUG_ON(le32_to_cpu(rec->e_clusters) < new_clusters);
if (em->em_clusters < OCFS2_I(inode)->ip_clusters) {
/*
* Size changed underneath us on disk. Drop any
* straddling records and update our idea of
* i_clusters
*/
ocfs2_extent_map_drop(inode, em->em_clusters - 1);
em->em_clusters = OCFS2_I(inode)->ip_clusters;
}
mlog_bug_on_msg((le32_to_cpu(rec->e_cpos) +
le32_to_cpu(rec->e_clusters)) !=
(em->em_clusters + new_clusters),
"Inode %llu:\n"
"rec->e_cpos = %u + rec->e_clusters = %u = %u\n"
"em->em_clusters = %u + new_clusters = %u = %u\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters),
le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters),
em->em_clusters, new_clusters,
em->em_clusters + new_clusters);
em->em_clusters += new_clusters;
ret = -ENOENT;
if (le32_to_cpu(rec->e_clusters) > new_clusters) {
/* This is a contiguous append */
ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), 1,
NULL, NULL);
if (ent) {
old = &ent->e_rec;
BUG_ON((le32_to_cpu(rec->e_cpos) +
le32_to_cpu(rec->e_clusters)) !=
(le32_to_cpu(old->e_cpos) +
le32_to_cpu(old->e_clusters) +
new_clusters));
if (ent->e_tree_depth == 0) {
BUG_ON(le32_to_cpu(old->e_cpos) !=
le32_to_cpu(rec->e_cpos));
BUG_ON(le64_to_cpu(old->e_blkno) !=
le64_to_cpu(rec->e_blkno)); le64_to_cpu(rec->e_blkno));
ret = 0; *p_cluster = *p_cluster + coff;
}
/*
* Let non-leafs fall through as -ENOENT to
* force insertion of the new leaf.
*/
le32_add_cpu(&old->e_clusters, new_clusters);
}
}
if (ret == -ENOENT) if (num_clusters)
ret = ocfs2_extent_map_insert(inode, rec, 0); *num_clusters = le32_to_cpu(rec->e_clusters) - coff;
if (ret < 0)
mlog_errno(ret);
return ret;
}
#if 0
/* Code here is included but defined out as it completes the extent
* map api and may be used in the future. */
/*
* Look up the record containing this cluster offset. This record is
* part of the extent map. Do not free it. Any changes you make to
* it will reflect in the extent map. So, if your last extent
* is (cpos = 10, clusters = 10) and you truncate the file by 5
* clusters, you can do:
*
* ret = ocfs2_extent_map_get_rec(em, orig_size - 5, &rec);
* rec->e_clusters -= 5;
*
* The lookup does not read from disk. If the map isn't filled in for
* an entry, you won't find it.
*
* Also note that the returned record is valid until alloc_sem is
* dropped. After that, truncate and extend can happen. Caveat Emptor.
*/
int ocfs2_extent_map_get_rec(struct inode *inode, u32 cpos,
struct ocfs2_extent_rec **rec,
int *tree_depth)
{
int ret = -ENOENT;
struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
struct ocfs2_extent_map_entry *ent;
*rec = NULL;
if (cpos >= OCFS2_I(inode)->ip_clusters)
return -EINVAL;
if (cpos >= em->em_clusters) {
/*
* Size changed underneath us on disk. Drop any
* straddling records and update our idea of
* i_clusters
*/
ocfs2_extent_map_drop(inode, em->em_clusters - 1);
em->em_clusters = OCFS2_I(inode)->ip_clusters ;
}
ent = ocfs2_extent_map_lookup(&OCFS2_I(inode)->ip_map, cpos, 1,
NULL, NULL);
if (ent) {
*rec = &ent->e_rec;
if (tree_depth)
*tree_depth = ent->e_tree_depth;
ret = 0;
} }
out:
brelse(di_bh);
brelse(eb_bh);
return ret; return ret;
} }
int ocfs2_extent_map_get_clusters(struct inode *inode, /*
u32 v_cpos, int count, * This expects alloc_sem to be held. The allocation cannot change at
u32 *p_cpos, int *ret_count) * all while the map is in the process of being updated.
{
int ret;
u32 coff, ccount;
struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
struct ocfs2_extent_map_entry *ent = NULL;
*p_cpos = ccount = 0;
if ((v_cpos + count) > OCFS2_I(inode)->ip_clusters)
return -EINVAL;
if ((v_cpos + count) > em->em_clusters) {
/*
* Size changed underneath us on disk. Drop any
* straddling records and update our idea of
* i_clusters
*/ */
ocfs2_extent_map_drop(inode, em->em_clusters - 1); int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
em->em_clusters = OCFS2_I(inode)->ip_clusters; int *ret_count)
}
ret = ocfs2_extent_map_lookup_read(inode, v_cpos, count, &ent);
if (ret)
return ret;
if (ent) {
/* We should never find ourselves straddling an interval */
if (!ocfs2_extent_rec_contains_clusters(&ent->e_rec,
v_cpos,
count))
return -ESRCH;
coff = v_cpos - le32_to_cpu(ent->e_rec.e_cpos);
*p_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
le64_to_cpu(ent->e_rec.e_blkno)) +
coff;
if (ret_count)
*ret_count = le32_to_cpu(ent->e_rec.e_clusters) - coff;
return 0;
}
return -ENOENT;
}
#endif /* 0 */
int ocfs2_extent_map_get_blocks(struct inode *inode,
u64 v_blkno, int count,
u64 *p_blkno, int *ret_count)
{ {
int ret; int ret;
u64 boff;
u32 cpos, clusters;
int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
struct ocfs2_extent_map_entry *ent = NULL; u32 cpos, num_clusters, p_cluster;
struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; u64 boff = 0;
struct ocfs2_extent_rec *rec;
*p_blkno = 0;
cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno); cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno);
clusters = ocfs2_blocks_to_clusters(inode->i_sb,
(u64)count + bpc - 1);
if ((cpos + clusters) > OCFS2_I(inode)->ip_clusters) {
ret = -EINVAL;
mlog_errno(ret);
return ret;
}
if ((cpos + clusters) > em->em_clusters) {
/*
* Size changed underneath us on disk. Drop any
* straddling records and update our idea of
* i_clusters
*/
ocfs2_extent_map_drop(inode, em->em_clusters - 1);
em->em_clusters = OCFS2_I(inode)->ip_clusters;
}
ret = ocfs2_extent_map_lookup_read(inode, cpos, clusters, &ent); ret = ocfs2_get_clusters(inode, cpos, &p_cluster, &num_clusters);
if (ret) { if (ret) {
mlog_errno(ret); mlog_errno(ret);
return ret; goto out;
}
if (ent)
{
rec = &ent->e_rec;
/* We should never find ourselves straddling an interval */
if (!ocfs2_extent_rec_contains_clusters(rec, cpos, clusters)) {
ret = -ESRCH;
mlog_errno(ret);
return ret;
} }
boff = ocfs2_clusters_to_blocks(inode->i_sb, cpos - /*
le32_to_cpu(rec->e_cpos)); * p_cluster == 0 indicates a hole.
*/
if (p_cluster) {
boff = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
boff += (v_blkno & (u64)(bpc - 1)); boff += (v_blkno & (u64)(bpc - 1));
*p_blkno = le64_to_cpu(rec->e_blkno) + boff;
if (ret_count) {
*ret_count = ocfs2_clusters_to_blocks(inode->i_sb,
le32_to_cpu(rec->e_clusters)) - boff;
}
return 0;
}
return -ENOENT;
}
int ocfs2_extent_map_init(struct inode *inode)
{
struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
em->em_extents = RB_ROOT;
em->em_clusters = 0;
return 0;
}
/* Needs the lock */
static void __ocfs2_extent_map_drop(struct inode *inode,
u32 new_clusters,
struct rb_node **free_head,
struct ocfs2_extent_map_entry **tail_ent)
{
struct rb_node *node, *next;
struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
struct ocfs2_extent_map_entry *ent;
*free_head = NULL;
ent = NULL;
node = rb_last(&em->em_extents);
while (node)
{
next = rb_prev(node);
ent = rb_entry(node, struct ocfs2_extent_map_entry,
e_node);
if (le32_to_cpu(ent->e_rec.e_cpos) < new_clusters)
break;
rb_erase(&ent->e_node, &em->em_extents);
node->rb_right = *free_head;
*free_head = node;
ent = NULL;
node = next;
}
/* Do we have an entry straddling new_clusters? */
if (tail_ent) {
if (ent &&
((le32_to_cpu(ent->e_rec.e_cpos) +
le32_to_cpu(ent->e_rec.e_clusters)) > new_clusters))
*tail_ent = ent;
else
*tail_ent = NULL;
}
}
static void __ocfs2_extent_map_drop_cleanup(struct rb_node *free_head)
{
struct rb_node *node;
struct ocfs2_extent_map_entry *ent;
while (free_head) {
node = free_head;
free_head = node->rb_right;
ent = rb_entry(node, struct ocfs2_extent_map_entry,
e_node);
kmem_cache_free(ocfs2_em_ent_cachep, ent);
} }
}
/* *p_blkno = boff;
* Remove all entries past new_clusters, inclusive of an entry that
* contains new_clusters. This is effectively a cache forget.
*
* If you want to also clip the last extent by some number of clusters,
* you need to call ocfs2_extent_map_trunc().
* This code does not check or modify ip_clusters.
*/
int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters)
{
struct rb_node *free_head = NULL;
struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
struct ocfs2_extent_map_entry *ent;
spin_lock(&OCFS2_I(inode)->ip_lock);
__ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent); if (ret_count) {
*ret_count = ocfs2_clusters_to_blocks(inode->i_sb, num_clusters);
if (ent) { *ret_count -= v_blkno & (u64)(bpc - 1);
rb_erase(&ent->e_node, &em->em_extents);
ent->e_node.rb_right = free_head;
free_head = &ent->e_node;
} }
spin_unlock(&OCFS2_I(inode)->ip_lock); out:
return ret;
if (free_head)
__ocfs2_extent_map_drop_cleanup(free_head);
return 0;
}
/*
* Remove all entries past new_clusters and also clip any extent
* straddling new_clusters, if there is one. This does not check
* or modify ip_clusters
*/
int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters)
{
struct rb_node *free_head = NULL;
struct ocfs2_extent_map_entry *ent = NULL;
spin_lock(&OCFS2_I(inode)->ip_lock);
__ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent);
if (ent)
ent->e_rec.e_clusters = cpu_to_le32(new_clusters -
le32_to_cpu(ent->e_rec.e_cpos));
OCFS2_I(inode)->ip_map.em_clusters = new_clusters;
spin_unlock(&OCFS2_I(inode)->ip_lock);
if (free_head)
__ocfs2_extent_map_drop_cleanup(free_head);
return 0;
}
int __init init_ocfs2_extent_maps(void)
{
ocfs2_em_ent_cachep =
kmem_cache_create("ocfs2_em_ent",
sizeof(struct ocfs2_extent_map_entry),
0, SLAB_HWCACHE_ALIGN, NULL, NULL);
if (!ocfs2_em_ent_cachep)
return -ENOMEM;
return 0;
}
void exit_ocfs2_extent_maps(void)
{
kmem_cache_destroy(ocfs2_em_ent_cachep);
} }
...@@ -25,22 +25,7 @@ ...@@ -25,22 +25,7 @@
#ifndef _EXTENT_MAP_H #ifndef _EXTENT_MAP_H
#define _EXTENT_MAP_H #define _EXTENT_MAP_H
int init_ocfs2_extent_maps(void); int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
void exit_ocfs2_extent_maps(void); int *ret_count);
/*
* EVERY CALL here except _init, _trunc, and _drop expects alloc_sem
* to be held. The allocation cannot change at all while the map is
* in the process of being updated.
*/
int ocfs2_extent_map_init(struct inode *inode);
int ocfs2_extent_map_append(struct inode *inode,
struct ocfs2_extent_rec *rec,
u32 new_clusters);
int ocfs2_extent_map_get_blocks(struct inode *inode,
u64 v_blkno, int count,
u64 *p_blkno, int *ret_count);
int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters);
int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters);
#endif /* _EXTENT_MAP_H */ #endif /* _EXTENT_MAP_H */
...@@ -1003,9 +1003,6 @@ void ocfs2_clear_inode(struct inode *inode) ...@@ -1003,9 +1003,6 @@ void ocfs2_clear_inode(struct inode *inode)
"Clear inode of %llu, inode has io markers\n", "Clear inode of %llu, inode has io markers\n",
(unsigned long long)oi->ip_blkno); (unsigned long long)oi->ip_blkno);
ocfs2_extent_map_drop(inode, 0);
ocfs2_extent_map_init(inode);
status = ocfs2_drop_inode_locks(inode); status = ocfs2_drop_inode_locks(inode);
if (status < 0) if (status < 0)
mlog_errno(status); mlog_errno(status);
...@@ -1102,8 +1099,7 @@ struct buffer_head *ocfs2_bread(struct inode *inode, ...@@ -1102,8 +1099,7 @@ struct buffer_head *ocfs2_bread(struct inode *inode,
return NULL; return NULL;
} }
tmperr = ocfs2_extent_map_get_blocks(inode, block, 1, tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL);
&p_blkno, NULL);
if (tmperr < 0) { if (tmperr < 0) {
mlog_errno(tmperr); mlog_errno(tmperr);
goto fail; goto fail;
......
...@@ -43,7 +43,6 @@ struct ocfs2_inode_info ...@@ -43,7 +43,6 @@ struct ocfs2_inode_info
spinlock_t ip_lock; spinlock_t ip_lock;
u32 ip_open_count; u32 ip_open_count;
u32 ip_clusters; u32 ip_clusters;
struct ocfs2_extent_map ip_map;
struct list_head ip_io_markers; struct list_head ip_io_markers;
struct mutex ip_io_mutex; struct mutex ip_io_mutex;
......
...@@ -670,8 +670,7 @@ static int ocfs2_force_read_journal(struct inode *inode) ...@@ -670,8 +670,7 @@ static int ocfs2_force_read_journal(struct inode *inode)
(inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))) { (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))) {
status = ocfs2_extent_map_get_blocks(inode, v_blkno, status = ocfs2_extent_map_get_blocks(inode, v_blkno,
1, &p_blkno, &p_blkno, &p_blocks);
&p_blocks);
if (status < 0) { if (status < 0) {
mlog_errno(status); mlog_errno(status);
goto bail; goto bail;
......
...@@ -1511,8 +1511,7 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb, ...@@ -1511,8 +1511,7 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
goto bail; goto bail;
} }
status = ocfs2_extent_map_get_blocks(inode, 0, 1, &p_blkno, status = ocfs2_extent_map_get_blocks(inode, 0, &p_blkno, &p_blocks);
&p_blocks);
if (status < 0) { if (status < 0) {
mlog_errno(status); mlog_errno(status);
goto bail; goto bail;
......
...@@ -46,11 +46,6 @@ ...@@ -46,11 +46,6 @@
#include "endian.h" #include "endian.h"
#include "ocfs2_lockid.h" #include "ocfs2_lockid.h"
struct ocfs2_extent_map {
u32 em_clusters;
struct rb_root em_extents;
};
/* Most user visible OCFS2 inodes will have very few pieces of /* Most user visible OCFS2 inodes will have very few pieces of
* metadata, but larger files (including bitmaps, etc) must be taken * metadata, but larger files (including bitmaps, etc) must be taken
* into account when designing an access scheme. We allow a small * into account when designing an access scheme. We allow a small
......
...@@ -197,7 +197,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb) ...@@ -197,7 +197,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
goto bail; goto bail;
} }
status = ocfs2_extent_map_get_blocks(inode, 0ULL, 1, &blkno, NULL); status = ocfs2_extent_map_get_blocks(inode, 0ULL, &blkno, NULL);
if (status < 0) { if (status < 0) {
mlog_errno(status); mlog_errno(status);
goto bail; goto bail;
......
...@@ -806,9 +806,6 @@ static int __init ocfs2_init(void) ...@@ -806,9 +806,6 @@ static int __init ocfs2_init(void)
ocfs2_print_version(); ocfs2_print_version();
if (init_ocfs2_extent_maps())
return -ENOMEM;
status = init_ocfs2_uptodate_cache(); status = init_ocfs2_uptodate_cache();
if (status < 0) { if (status < 0) {
mlog_errno(status); mlog_errno(status);
...@@ -837,7 +834,6 @@ static int __init ocfs2_init(void) ...@@ -837,7 +834,6 @@ static int __init ocfs2_init(void)
if (status < 0) { if (status < 0) {
ocfs2_free_mem_caches(); ocfs2_free_mem_caches();
exit_ocfs2_uptodate_cache(); exit_ocfs2_uptodate_cache();
exit_ocfs2_extent_maps();
} }
mlog_exit(status); mlog_exit(status);
...@@ -863,8 +859,6 @@ static void __exit ocfs2_exit(void) ...@@ -863,8 +859,6 @@ static void __exit ocfs2_exit(void)
unregister_filesystem(&ocfs2_fs_type); unregister_filesystem(&ocfs2_fs_type);
exit_ocfs2_extent_maps();
exit_ocfs2_uptodate_cache(); exit_ocfs2_uptodate_cache();
mlog_exit_void(); mlog_exit_void();
...@@ -948,7 +942,6 @@ static void ocfs2_inode_init_once(void *data, ...@@ -948,7 +942,6 @@ static void ocfs2_inode_init_once(void *data,
oi->ip_flags = 0; oi->ip_flags = 0;
oi->ip_open_count = 0; oi->ip_open_count = 0;
spin_lock_init(&oi->ip_lock); spin_lock_init(&oi->ip_lock);
ocfs2_extent_map_init(&oi->vfs_inode);
INIT_LIST_HEAD(&oi->ip_io_markers); INIT_LIST_HEAD(&oi->ip_io_markers);
oi->ip_created_trans = 0; oi->ip_created_trans = 0;
oi->ip_last_trans = 0; oi->ip_last_trans = 0;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment