Commit a74b81b0 authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'linux-next' of git://git.kernel.org/pub/scm/linux/kernel/git/jlbec/ocfs2

* 'linux-next' of git://git.kernel.org/pub/scm/linux/kernel/git/jlbec/ocfs2: (28 commits)
  Ocfs2: Teach local-mounted ocfs2 to handle unwritten_extents correctly.
  ocfs2/dlm: Do not migrate resource to a node that is leaving the domain
  ocfs2/dlm: Add new dlm message DLM_BEGIN_EXIT_DOMAIN_MSG
  Ocfs2/move_extents: Set several trivial constraints for threshold.
  Ocfs2/move_extents: Let defrag handle partial extent moving.
  Ocfs2/move_extents: move/defrag extents within a certain range.
  Ocfs2/move_extents: helper to calculate the defraging length in one run.
  Ocfs2/move_extents: move entire/partial extent.
  Ocfs2/move_extents: helpers to update the group descriptor and global bitmap inode.
  Ocfs2/move_extents: helper to probe a proper region to move in an alloc group.
  Ocfs2/move_extents: helper to validate and adjust moving goal.
  Ocfs2/move_extents: find the victim alloc group, where the given #blk fits.
  Ocfs2/move_extents: defrag a range of extent.
  Ocfs2/move_extents: move a range of extent.
  Ocfs2/move_extents: lock allocators and reserve metadata blocks and data clusters for extents moving.
  Ocfs2/move_extents: Add basic framework and source files for extent moving.
  Ocfs2/move_extents: Adding new ioctl code 'OCFS2_IOC_MOVE_EXT' to ocfs2.
  Ocfs2/refcounttree: Publicize couple of funcs from refcounttree.c
  Ocfs2: Add a new code 'OCFS2_INFO_FREEFRAG' for o2info ioctl.
  Ocfs2: Add a new code 'OCFS2_INFO_FREEINODE' for o2info ioctl.
  ...
parents f8d613e2 ece928df
What: /sys/o2cb symlink
Date: Dec 2005
KernelVersion: 2.6.16
Date: May 2011
KernelVersion: 2.6.40
Contact: ocfs2-devel@oss.oracle.com
Description: This is a symlink: /sys/o2cb to /sys/fs/o2cb. The symlink will
be removed when new versions of ocfs2-tools which know to look
Description: This is a symlink: /sys/o2cb to /sys/fs/o2cb. The symlink is
removed when new versions of ocfs2-tools which know to look
in /sys/fs/o2cb are sufficiently prevalent. Don't code new
software to look here, it should try /sys/fs/o2cb instead.
See Documentation/ABI/stable/o2cb for more information on usage.
Users: ocfs2-tools. It's sufficient to mail proposed changes to
ocfs2-devel@oss.oracle.com.
......@@ -262,16 +262,6 @@ Who: Michael Buesch <mb@bu3sch.de>
---------------------------
What: /sys/o2cb symlink
When: January 2010
Why: /sys/fs/o2cb is the proper location for this information - /sys/o2cb
exists as a symlink for backwards compatibility for old versions of
ocfs2-tools. 2 years should be sufficient time to phase in new versions
which know to look in /sys/fs/o2cb.
Who: ocfs2-devel@oss.oracle.com
---------------------------
What: Ability for non root users to shm_get hugetlb pages based on mlock
resource limits
When: 2.6.31
......
......@@ -46,9 +46,15 @@ errors=panic Panic and halt the machine if an error occurs.
intr (*) Allow signals to interrupt cluster operations.
nointr Do not allow signals to interrupt cluster
operations.
noatime Do not update access time.
relatime(*) Update atime if the previous atime is older than
mtime or ctime
strictatime Always update atime, but the minimum update interval
is specified by atime_quantum.
atime_quantum=60(*) OCFS2 will not update atime unless this number
of seconds has passed since the last update.
Set to zero to always update atime.
Set to zero to always update atime. This option need
work with strictatime.
data=ordered (*) All data are forced directly out to the main file
system prior to its metadata being committed to the
journal.
......
......@@ -30,6 +30,7 @@ ocfs2-objs := \
namei.o \
refcounttree.o \
reservations.o \
move_extents.o \
resize.o \
slot_map.o \
suballoc.o \
......
......@@ -29,6 +29,7 @@
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/quotaops.h>
#include <linux/blkdev.h>
#include <cluster/masklog.h>
......@@ -7184,3 +7185,168 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
out:
return ret;
}
static int ocfs2_trim_extent(struct super_block *sb,
struct ocfs2_group_desc *gd,
u32 start, u32 count)
{
u64 discard, bcount;
bcount = ocfs2_clusters_to_blocks(sb, count);
discard = le64_to_cpu(gd->bg_blkno) +
ocfs2_clusters_to_blocks(sb, start);
trace_ocfs2_trim_extent(sb, (unsigned long long)discard, bcount);
return sb_issue_discard(sb, discard, bcount, GFP_NOFS, 0);
}
static int ocfs2_trim_group(struct super_block *sb,
struct ocfs2_group_desc *gd,
u32 start, u32 max, u32 minbits)
{
int ret = 0, count = 0, next;
void *bitmap = gd->bg_bitmap;
if (le16_to_cpu(gd->bg_free_bits_count) < minbits)
return 0;
trace_ocfs2_trim_group((unsigned long long)le64_to_cpu(gd->bg_blkno),
start, max, minbits);
while (start < max) {
start = ocfs2_find_next_zero_bit(bitmap, max, start);
if (start >= max)
break;
next = ocfs2_find_next_bit(bitmap, max, start);
if ((next - start) >= minbits) {
ret = ocfs2_trim_extent(sb, gd,
start, next - start);
if (ret < 0) {
mlog_errno(ret);
break;
}
count += next - start;
}
start = next + 1;
if (fatal_signal_pending(current)) {
count = -ERESTARTSYS;
break;
}
if ((le16_to_cpu(gd->bg_free_bits_count) - count) < minbits)
break;
}
if (ret < 0)
count = ret;
return count;
}
int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
{
struct ocfs2_super *osb = OCFS2_SB(sb);
u64 start, len, trimmed, first_group, last_group, group;
int ret, cnt;
u32 first_bit, last_bit, minlen;
struct buffer_head *main_bm_bh = NULL;
struct inode *main_bm_inode = NULL;
struct buffer_head *gd_bh = NULL;
struct ocfs2_dinode *main_bm;
struct ocfs2_group_desc *gd = NULL;
start = range->start >> osb->s_clustersize_bits;
len = range->len >> osb->s_clustersize_bits;
minlen = range->minlen >> osb->s_clustersize_bits;
trimmed = 0;
if (!len) {
range->len = 0;
return 0;
}
if (minlen >= osb->bitmap_cpg)
return -EINVAL;
main_bm_inode = ocfs2_get_system_file_inode(osb,
GLOBAL_BITMAP_SYSTEM_INODE,
OCFS2_INVALID_SLOT);
if (!main_bm_inode) {
ret = -EIO;
mlog_errno(ret);
goto out;
}
mutex_lock(&main_bm_inode->i_mutex);
ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0);
if (ret < 0) {
mlog_errno(ret);
goto out_mutex;
}
main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
if (start >= le32_to_cpu(main_bm->i_clusters)) {
ret = -EINVAL;
goto out_unlock;
}
if (start + len > le32_to_cpu(main_bm->i_clusters))
len = le32_to_cpu(main_bm->i_clusters) - start;
trace_ocfs2_trim_fs(start, len, minlen);
/* Determine first and last group to examine based on start and len */
first_group = ocfs2_which_cluster_group(main_bm_inode, start);
if (first_group == osb->first_cluster_group_blkno)
first_bit = start;
else
first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
last_bit = osb->bitmap_cpg;
for (group = first_group; group <= last_group;) {
if (first_bit + len >= osb->bitmap_cpg)
last_bit = osb->bitmap_cpg;
else
last_bit = first_bit + len;
ret = ocfs2_read_group_descriptor(main_bm_inode,
main_bm, group,
&gd_bh);
if (ret < 0) {
mlog_errno(ret);
break;
}
gd = (struct ocfs2_group_desc *)gd_bh->b_data;
cnt = ocfs2_trim_group(sb, gd, first_bit, last_bit, minlen);
brelse(gd_bh);
gd_bh = NULL;
if (cnt < 0) {
ret = cnt;
mlog_errno(ret);
break;
}
trimmed += cnt;
len -= osb->bitmap_cpg - first_bit;
first_bit = 0;
if (group == osb->first_cluster_group_blkno)
group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
else
group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
}
range->len = trimmed * sb->s_blocksize;
out_unlock:
ocfs2_inode_unlock(main_bm_inode, 0);
brelse(main_bm_bh);
out_mutex:
mutex_unlock(&main_bm_inode->i_mutex);
iput(main_bm_inode);
out:
return ret;
}
......@@ -239,6 +239,7 @@ int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
struct buffer_head **leaf_bh);
int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster);
int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range);
/*
* Helper function to look at the # of clusters in an extent record.
*/
......
......@@ -57,7 +57,6 @@ static struct kset *o2cb_kset;
void o2cb_sys_shutdown(void)
{
mlog_sys_shutdown();
sysfs_remove_link(NULL, "o2cb");
kset_unregister(o2cb_kset);
}
......@@ -69,14 +68,6 @@ int o2cb_sys_init(void)
if (!o2cb_kset)
return -ENOMEM;
/*
* Create this symlink for backwards compatibility with old
* versions of ocfs2-tools which look for things in /sys/o2cb.
*/
ret = sysfs_create_link(NULL, &o2cb_kset->kobj, "o2cb");
if (ret)
goto error;
ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group);
if (ret)
goto error;
......
......@@ -144,6 +144,7 @@ struct dlm_ctxt
wait_queue_head_t dlm_join_events;
unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
unsigned long exit_domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
struct dlm_recovery_ctxt reco;
spinlock_t master_lock;
......@@ -401,6 +402,18 @@ static inline int dlm_lvb_is_empty(char *lvb)
return 1;
}
static inline char *dlm_list_in_text(enum dlm_lockres_list idx)
{
if (idx == DLM_GRANTED_LIST)
return "granted";
else if (idx == DLM_CONVERTING_LIST)
return "converting";
else if (idx == DLM_BLOCKED_LIST)
return "blocked";
else
return "unknown";
}
static inline struct list_head *
dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx)
{
......@@ -448,6 +461,7 @@ enum {
DLM_FINALIZE_RECO_MSG = 518,
DLM_QUERY_REGION = 519,
DLM_QUERY_NODEINFO = 520,
DLM_BEGIN_EXIT_DOMAIN_MSG = 521,
};
struct dlm_reco_node_data
......
......@@ -756,6 +756,12 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len)
buf + out, len - out);
out += snprintf(buf + out, len - out, "\n");
/* Exit Domain Map: xx xx xx */
out += snprintf(buf + out, len - out, "Exit Domain Map: ");
out += stringify_nodemap(dlm->exit_domain_map, O2NM_MAX_NODES,
buf + out, len - out);
out += snprintf(buf + out, len - out, "\n");
/* Live Map: xx xx xx */
out += snprintf(buf + out, len - out, "Live Map: ");
out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES,
......
......@@ -132,10 +132,12 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
* New in version 1.1:
* - Message DLM_QUERY_REGION added to support global heartbeat
* - Message DLM_QUERY_NODEINFO added to allow online node removes
* New in version 1.2:
* - Message DLM_BEGIN_EXIT_DOMAIN_MSG added to mark start of exit domain
*/
static const struct dlm_protocol_version dlm_protocol = {
.pv_major = 1,
.pv_minor = 1,
.pv_minor = 2,
};
#define DLM_DOMAIN_BACKOFF_MS 200
......@@ -449,15 +451,19 @@ static int dlm_migrate_all_locks(struct dlm_ctxt *dlm)
dropped = dlm_empty_lockres(dlm, res);
spin_lock(&res->spinlock);
if (dropped)
__dlm_lockres_calc_usage(dlm, res);
else
iter = res->hash_node.next;
spin_unlock(&res->spinlock);
dlm_lockres_put(res);
if (dropped)
if (dropped) {
cond_resched_lock(&dlm->spinlock);
goto redo_bucket;
}
}
cond_resched_lock(&dlm->spinlock);
num += n;
}
......@@ -486,6 +492,28 @@ static int dlm_no_joining_node(struct dlm_ctxt *dlm)
return ret;
}
static int dlm_begin_exit_domain_handler(struct o2net_msg *msg, u32 len,
void *data, void **ret_data)
{
struct dlm_ctxt *dlm = data;
unsigned int node;
struct dlm_exit_domain *exit_msg = (struct dlm_exit_domain *) msg->buf;
if (!dlm_grab(dlm))
return 0;
node = exit_msg->node_idx;
mlog(0, "%s: Node %u sent a begin exit domain message\n", dlm->name, node);
spin_lock(&dlm->spinlock);
set_bit(node, dlm->exit_domain_map);
spin_unlock(&dlm->spinlock);
dlm_put(dlm);
return 0;
}
static void dlm_mark_domain_leaving(struct dlm_ctxt *dlm)
{
/* Yikes, a double spinlock! I need domain_lock for the dlm
......@@ -542,6 +570,7 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
spin_lock(&dlm->spinlock);
clear_bit(node, dlm->domain_map);
clear_bit(node, dlm->exit_domain_map);
__dlm_print_nodes(dlm);
/* notify anything attached to the heartbeat events */
......@@ -554,29 +583,56 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
return 0;
}
static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm,
static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm, u32 msg_type,
unsigned int node)
{
int status;
struct dlm_exit_domain leave_msg;
mlog(0, "Asking node %u if we can leave the domain %s me = %u\n",
node, dlm->name, dlm->node_num);
mlog(0, "%s: Sending domain exit message %u to node %u\n", dlm->name,
msg_type, node);
memset(&leave_msg, 0, sizeof(leave_msg));
leave_msg.node_idx = dlm->node_num;
status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key,
&leave_msg, sizeof(leave_msg), node,
NULL);
status = o2net_send_message(msg_type, dlm->key, &leave_msg,
sizeof(leave_msg), node, NULL);
if (status < 0)
mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
"node %u\n", status, DLM_EXIT_DOMAIN_MSG, dlm->key, node);
mlog(0, "status return %d from o2net_send_message\n", status);
mlog(ML_ERROR, "Error %d sending domain exit message %u "
"to node %u on domain %s\n", status, msg_type, node,
dlm->name);
return status;
}
static void dlm_begin_exit_domain(struct dlm_ctxt *dlm)
{
int node = -1;
/* Support for begin exit domain was added in 1.2 */
if (dlm->dlm_locking_proto.pv_major == 1 &&
dlm->dlm_locking_proto.pv_minor < 2)
return;
/*
* Unlike DLM_EXIT_DOMAIN_MSG, DLM_BEGIN_EXIT_DOMAIN_MSG is purely
* informational. Meaning if a node does not receive the message,
* so be it.
*/
spin_lock(&dlm->spinlock);
while (1) {
node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, node + 1);
if (node >= O2NM_MAX_NODES)
break;
if (node == dlm->node_num)
continue;
spin_unlock(&dlm->spinlock);
dlm_send_one_domain_exit(dlm, DLM_BEGIN_EXIT_DOMAIN_MSG, node);
spin_lock(&dlm->spinlock);
}
spin_unlock(&dlm->spinlock);
}
static void dlm_leave_domain(struct dlm_ctxt *dlm)
{
......@@ -602,7 +658,8 @@ static void dlm_leave_domain(struct dlm_ctxt *dlm)
clear_node = 1;
status = dlm_send_one_domain_exit(dlm, node);
status = dlm_send_one_domain_exit(dlm, DLM_EXIT_DOMAIN_MSG,
node);
if (status < 0 &&
status != -ENOPROTOOPT &&
status != -ENOTCONN) {
......@@ -677,6 +734,7 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
if (leave) {
mlog(0, "shutting down domain %s\n", dlm->name);
dlm_begin_exit_domain(dlm);
/* We changed dlm state, notify the thread */
dlm_kick_thread(dlm, NULL);
......@@ -909,6 +967,7 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
* leftover join state. */
BUG_ON(dlm->joining_node != assert->node_idx);
set_bit(assert->node_idx, dlm->domain_map);
clear_bit(assert->node_idx, dlm->exit_domain_map);
__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n",
......@@ -1793,6 +1852,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
if (status)
goto bail;
status = o2net_register_handler(DLM_BEGIN_EXIT_DOMAIN_MSG, dlm->key,
sizeof(struct dlm_exit_domain),
dlm_begin_exit_domain_handler,
dlm, NULL, &dlm->dlm_domain_handlers);
if (status)
goto bail;
bail:
if (status)
dlm_unregister_domain_handlers(dlm);
......
......@@ -2339,65 +2339,55 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
dlm_lockres_put(res);
}
/* Checks whether the lockres can be migrated. Returns 0 if yes, < 0
* if not. If 0, numlocks is set to the number of locks in the lockres.
/*
* A migrateable resource is one that is :
* 1. locally mastered, and,
* 2. zero local locks, and,
* 3. one or more non-local locks, or, one or more references
* Returns 1 if yes, 0 if not.
*/
static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
int *numlocks,
int *hasrefs)
struct dlm_lock_resource *res)
{
int ret;
int i;
int count = 0;
enum dlm_lockres_list idx;
int nonlocal = 0, node_ref;
struct list_head *queue;
struct dlm_lock *lock;
u64 cookie;
assert_spin_locked(&res->spinlock);
*numlocks = 0;
*hasrefs = 0;
ret = -EINVAL;
if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
mlog(0, "cannot migrate lockres with unknown owner!\n");
goto leave;
}
if (res->owner != dlm->node_num) {
mlog(0, "cannot migrate lockres this node doesn't own!\n");
goto leave;
}
if (res->owner != dlm->node_num)
return 0;
ret = 0;
queue = &res->granted;
for (i = 0; i < 3; i++) {
for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) {
queue = dlm_list_idx_to_ptr(res, idx);
list_for_each_entry(lock, queue, list) {
++count;
if (lock->ml.node == dlm->node_num) {
mlog(0, "found a lock owned by this node still "
"on the %s queue! will not migrate this "
"lockres\n", (i == 0 ? "granted" :
(i == 1 ? "converting" :
"blocked")));
ret = -ENOTEMPTY;
goto leave;
if (lock->ml.node != dlm->node_num) {
nonlocal++;
continue;
}
cookie = be64_to_cpu(lock->ml.cookie);
mlog(0, "%s: Not migrateable res %.*s, lock %u:%llu on "
"%s list\n", dlm->name, res->lockname.len,
res->lockname.name,
dlm_get_lock_cookie_node(cookie),
dlm_get_lock_cookie_seq(cookie),
dlm_list_in_text(idx));
return 0;
}
queue++;
}
*numlocks = count;
count = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
if (count < O2NM_MAX_NODES)
*hasrefs = 1;
if (!nonlocal) {
node_ref = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
if (node_ref >= O2NM_MAX_NODES)
return 0;
}
mlog(0, "%s: res %.*s, Migrateable, locks %d, refs %d\n", dlm->name,
res->lockname.len, res->lockname.name, *numlocks, *hasrefs);
mlog(0, "%s: res %.*s, Migrateable\n", dlm->name, res->lockname.len,
res->lockname.name);
leave:
return ret;
return 1;
}
/*
......@@ -2406,8 +2396,7 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
u8 target)
struct dlm_lock_resource *res, u8 target)
{
struct dlm_master_list_entry *mle = NULL;
struct dlm_master_list_entry *oldmle = NULL;
......@@ -2416,37 +2405,20 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
const char *name;
unsigned int namelen;
int mle_added = 0;
int numlocks, hasrefs;
int wake = 0;
if (!dlm_grab(dlm))
return -EINVAL;
BUG_ON(target == O2NM_MAX_NODES);
name = res->lockname.name;
namelen = res->lockname.len;
mlog(0, "%s: Migrating %.*s to %u\n", dlm->name, namelen, name, target);
/*
* ensure this lockres is a proper candidate for migration
*/
spin_lock(&res->spinlock);
ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs);
if (ret < 0) {
spin_unlock(&res->spinlock);
goto leave;
}
spin_unlock(&res->spinlock);
/* no work to do */
if (numlocks == 0 && !hasrefs)
goto leave;
/*
* preallocate up front
* if this fails, abort
*/
mlog(0, "%s: Migrating %.*s to node %u\n", dlm->name, namelen, name,
target);
/* preallocate up front. if this fails, abort */
ret = -ENOMEM;
mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS);
if (!mres) {
......@@ -2461,36 +2433,11 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
}
ret = 0;
/*
* find a node to migrate the lockres to
*/
spin_lock(&dlm->spinlock);
/* pick a new node */
if (!test_bit(target, dlm->domain_map) ||
target >= O2NM_MAX_NODES) {
target = dlm_pick_migration_target(dlm, res);
}
mlog(0, "%s: res %.*s, Node %u chosen for migration\n", dlm->name,
namelen, name, target);
if (target >= O2NM_MAX_NODES ||
!test_bit(target, dlm->domain_map)) {
/* target chosen is not alive */
ret = -EINVAL;
}
if (ret) {
spin_unlock(&dlm->spinlock);
goto fail;
}
mlog(0, "continuing with target = %u\n", target);
/*
* clear any existing master requests and
* add the migration mle to the list
*/
spin_lock(&dlm->spinlock);
spin_lock(&dlm->master_lock);
ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
namelen, target, dlm->node_num);
......@@ -2531,6 +2478,7 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
dlm_put_mle(mle);
} else if (mle) {
kmem_cache_free(dlm_mle_cache, mle);
mle = NULL;
}
goto leave;
}
......@@ -2652,69 +2600,52 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
if (wake)
wake_up(&res->wq);
/* TODO: cleanup */
if (mres)
free_page((unsigned long)mres);
dlm_put(dlm);
mlog(0, "returning %d\n", ret);
mlog(0, "%s: Migrating %.*s to %u, returns %d\n", dlm->name, namelen,
name, target, ret);
return ret;
}
#define DLM_MIGRATION_RETRY_MS 100
/* Should be called only after beginning the domain leave process.
/*
* Should be called only after beginning the domain leave process.
* There should not be any remaining locks on nonlocal lock resources,
* and there should be no local locks left on locally mastered resources.
*
* Called with the dlm spinlock held, may drop it to do migration, but
* will re-acquire before exit.
*
* Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped */
* Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped
*/
int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
{
int ret;
int lock_dropped = 0;
int numlocks, hasrefs;
u8 target = O2NM_MAX_NODES;
assert_spin_locked(&dlm->spinlock);
spin_lock(&res->spinlock);
if (res->owner != dlm->node_num) {
if (!__dlm_lockres_unused(res)) {
mlog(ML_ERROR, "%s:%.*s: this node is not master, "
"trying to free this but locks remain\n",
dlm->name, res->lockname.len, res->lockname.name);
}
if (dlm_is_lockres_migrateable(dlm, res))
target = dlm_pick_migration_target(dlm, res);
spin_unlock(&res->spinlock);
goto leave;
}
/* No need to migrate a lockres having no locks */
ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs);
if (ret >= 0 && numlocks == 0 && !hasrefs) {
spin_unlock(&res->spinlock);
if (target == O2NM_MAX_NODES)
goto leave;
}
spin_unlock(&res->spinlock);
/* Wheee! Migrate lockres here! Will sleep so drop spinlock. */
spin_unlock(&dlm->spinlock);
lock_dropped = 1;
while (1) {
ret = dlm_migrate_lockres(dlm, res, O2NM_MAX_NODES);
if (ret >= 0)
break;
if (ret == -ENOTEMPTY) {
mlog(ML_ERROR, "lockres %.*s still has local locks!\n",
res->lockname.len, res->lockname.name);
BUG();
}
mlog(0, "lockres %.*s: migrate failed, "
"retrying\n", res->lockname.len,
res->lockname.name);
msleep(DLM_MIGRATION_RETRY_MS);
}
ret = dlm_migrate_lockres(dlm, res, target);
if (ret)
mlog(0, "%s: res %.*s, Migrate to node %u failed with %d\n",
dlm->name, res->lockname.len, res->lockname.name,
target, ret);
spin_lock(&dlm->spinlock);
leave:
return lock_dropped;
......@@ -2898,61 +2829,55 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
}
}
/* for now this is not too intelligent. we will
* need stats to make this do the right thing.
* this just finds the first lock on one of the
* queues and uses that node as the target. */
/*
* Pick a node to migrate the lock resource to. This function selects a
* potential target based first on the locks and then on refmap. It skips
* nodes that are in the process of exiting the domain.
*/
static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
int i;
enum dlm_lockres_list idx;
struct list_head *queue = &res->granted;
struct dlm_lock *lock;
int nodenum;
int noderef;
u8 nodenum = O2NM_MAX_NODES;
assert_spin_locked(&dlm->spinlock);
assert_spin_locked(&res->spinlock);
spin_lock(&res->spinlock);
for (i=0; i<3; i++) {
/* Go through all the locks */
for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) {
queue = dlm_list_idx_to_ptr(res, idx);
list_for_each_entry(lock, queue, list) {
/* up to the caller to make sure this node
* is alive */
if (lock->ml.node != dlm->node_num) {
spin_unlock(&res->spinlock);
return lock->ml.node;
}
}
queue++;
if (lock->ml.node == dlm->node_num)
continue;
if (test_bit(lock->ml.node, dlm->exit_domain_map))
continue;
nodenum = lock->ml.node;
goto bail;
}
nodenum = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
if (nodenum < O2NM_MAX_NODES) {
spin_unlock(&res->spinlock);
return nodenum;
}
spin_unlock(&res->spinlock);
mlog(0, "have not found a suitable target yet! checking domain map\n");
/* ok now we're getting desperate. pick anyone alive. */
nodenum = -1;
/* Go thru the refmap */
noderef = -1;
while (1) {
nodenum = find_next_bit(dlm->domain_map,
O2NM_MAX_NODES, nodenum+1);
mlog(0, "found %d in domain map\n", nodenum);
if (nodenum >= O2NM_MAX_NODES)
noderef = find_next_bit(res->refmap, O2NM_MAX_NODES,
noderef + 1);
if (noderef >= O2NM_MAX_NODES)
break;
if (nodenum != dlm->node_num) {
mlog(0, "picking %d\n", nodenum);
return nodenum;
}
if (noderef == dlm->node_num)
continue;
if (test_bit(noderef, dlm->exit_domain_map))
continue;
nodenum = noderef;
goto bail;
}
mlog(0, "giving up. no master to migrate to\n");
return DLM_LOCK_RES_OWNER_UNKNOWN;
bail:
return nodenum;
}
/* this is called by the new master once all lockres
* data has been received */
static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
......
......@@ -2393,6 +2393,7 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
mlog(0, "node %u being removed from domain map!\n", idx);
clear_bit(idx, dlm->domain_map);
clear_bit(idx, dlm->exit_domain_map);
/* wake up migration waiters if a node goes down.
* perhaps later we can genericize this for other waiters. */
wake_up(&dlm->migration_wq);
......
......@@ -88,7 +88,7 @@ struct workqueue_struct *user_dlm_worker;
* signifies a bast fired on the lock.
*/
#define DLMFS_CAPABILITIES "bast stackglue"
extern int param_set_dlmfs_capabilities(const char *val,
static int param_set_dlmfs_capabilities(const char *val,
struct kernel_param *kp)
{
printk(KERN_ERR "%s: readonly parameter\n", kp->name);
......
......@@ -2670,6 +2670,7 @@ const struct file_operations ocfs2_fops_no_plocks = {
.flock = ocfs2_flock,
.splice_read = ocfs2_file_splice_read,
.splice_write = ocfs2_file_splice_write,
.fallocate = ocfs2_fallocate,
};
const struct file_operations ocfs2_dops_no_plocks = {
......
......@@ -22,6 +22,11 @@
#include "ioctl.h"
#include "resize.h"
#include "refcounttree.h"
#include "sysfile.h"
#include "dir.h"
#include "buffer_head_io.h"
#include "suballoc.h"
#include "move_extents.h"
#include <linux/ext2_fs.h>
......@@ -35,31 +40,27 @@
* be -EFAULT. The error will be returned from the ioctl(2) call. It's
* just a best-effort to tell userspace that this request caused the error.
*/
static inline void __o2info_set_request_error(struct ocfs2_info_request *kreq,
static inline void o2info_set_request_error(struct ocfs2_info_request *kreq,
struct ocfs2_info_request __user *req)
{
kreq->ir_flags |= OCFS2_INFO_FL_ERROR;
(void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags));
}
#define o2info_set_request_error(a, b) \
__o2info_set_request_error((struct ocfs2_info_request *)&(a), b)
static inline void __o2info_set_request_filled(struct ocfs2_info_request *req)
static inline void o2info_set_request_filled(struct ocfs2_info_request *req)
{
req->ir_flags |= OCFS2_INFO_FL_FILLED;
}
#define o2info_set_request_filled(a) \
__o2info_set_request_filled((struct ocfs2_info_request *)&(a))
static inline void __o2info_clear_request_filled(struct ocfs2_info_request *req)
static inline void o2info_clear_request_filled(struct ocfs2_info_request *req)
{
req->ir_flags &= ~OCFS2_INFO_FL_FILLED;
}
#define o2info_clear_request_filled(a) \
__o2info_clear_request_filled((struct ocfs2_info_request *)&(a))
static inline int o2info_coherent(struct ocfs2_info_request *req)
{
return (!(req->ir_flags & OCFS2_INFO_FL_NON_COHERENT));
}
static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
{
......@@ -153,7 +154,7 @@ int ocfs2_info_handle_blocksize(struct inode *inode,
oib.ib_blocksize = inode->i_sb->s_blocksize;
o2info_set_request_filled(oib);
o2info_set_request_filled(&oib.ib_req);
if (o2info_to_user(oib, req))
goto bail;
......@@ -161,7 +162,7 @@ int ocfs2_info_handle_blocksize(struct inode *inode,
status = 0;
bail:
if (status)
o2info_set_request_error(oib, req);
o2info_set_request_error(&oib.ib_req, req);
return status;
}
......@@ -178,7 +179,7 @@ int ocfs2_info_handle_clustersize(struct inode *inode,
oic.ic_clustersize = osb->s_clustersize;
o2info_set_request_filled(oic);
o2info_set_request_filled(&oic.ic_req);
if (o2info_to_user(oic, req))
goto bail;
......@@ -186,7 +187,7 @@ int ocfs2_info_handle_clustersize(struct inode *inode,
status = 0;
bail:
if (status)
o2info_set_request_error(oic, req);
o2info_set_request_error(&oic.ic_req, req);
return status;
}
......@@ -203,7 +204,7 @@ int ocfs2_info_handle_maxslots(struct inode *inode,
oim.im_max_slots = osb->max_slots;
o2info_set_request_filled(oim);
o2info_set_request_filled(&oim.im_req);
if (o2info_to_user(oim, req))
goto bail;
......@@ -211,7 +212,7 @@ int ocfs2_info_handle_maxslots(struct inode *inode,
status = 0;
bail:
if (status)
o2info_set_request_error(oim, req);
o2info_set_request_error(&oim.im_req, req);
return status;
}
......@@ -228,7 +229,7 @@ int ocfs2_info_handle_label(struct inode *inode,
memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN);
o2info_set_request_filled(oil);
o2info_set_request_filled(&oil.il_req);
if (o2info_to_user(oil, req))
goto bail;
......@@ -236,7 +237,7 @@ int ocfs2_info_handle_label(struct inode *inode,
status = 0;
bail:
if (status)
o2info_set_request_error(oil, req);
o2info_set_request_error(&oil.il_req, req);
return status;
}
......@@ -253,7 +254,7 @@ int ocfs2_info_handle_uuid(struct inode *inode,
memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1);
o2info_set_request_filled(oiu);
o2info_set_request_filled(&oiu.iu_req);
if (o2info_to_user(oiu, req))
goto bail;
......@@ -261,7 +262,7 @@ int ocfs2_info_handle_uuid(struct inode *inode,
status = 0;
bail:
if (status)
o2info_set_request_error(oiu, req);
o2info_set_request_error(&oiu.iu_req, req);
return status;
}
......@@ -280,7 +281,7 @@ int ocfs2_info_handle_fs_features(struct inode *inode,
oif.if_incompat_features = osb->s_feature_incompat;
oif.if_ro_compat_features = osb->s_feature_ro_compat;
o2info_set_request_filled(oif);
o2info_set_request_filled(&oif.if_req);
if (o2info_to_user(oif, req))
goto bail;
......@@ -288,7 +289,7 @@ int ocfs2_info_handle_fs_features(struct inode *inode,
status = 0;
bail:
if (status)
o2info_set_request_error(oif, req);
o2info_set_request_error(&oif.if_req, req);
return status;
}
......@@ -305,7 +306,7 @@ int ocfs2_info_handle_journal_size(struct inode *inode,
oij.ij_journal_size = osb->journal->j_inode->i_size;
o2info_set_request_filled(oij);
o2info_set_request_filled(&oij.ij_req);
if (o2info_to_user(oij, req))
goto bail;
......@@ -313,7 +314,408 @@ int ocfs2_info_handle_journal_size(struct inode *inode,
status = 0;
bail:
if (status)
o2info_set_request_error(oij, req);
o2info_set_request_error(&oij.ij_req, req);
return status;
}
int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb,
struct inode *inode_alloc, u64 blkno,
struct ocfs2_info_freeinode *fi, u32 slot)
{
int status = 0, unlock = 0;
struct buffer_head *bh = NULL;
struct ocfs2_dinode *dinode_alloc = NULL;
if (inode_alloc)
mutex_lock(&inode_alloc->i_mutex);
if (o2info_coherent(&fi->ifi_req)) {
status = ocfs2_inode_lock(inode_alloc, &bh, 0);
if (status < 0) {
mlog_errno(status);
goto bail;
}
unlock = 1;
} else {
status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh);
if (status < 0) {
mlog_errno(status);
goto bail;
}
}
dinode_alloc = (struct ocfs2_dinode *)bh->b_data;
fi->ifi_stat[slot].lfi_total =
le32_to_cpu(dinode_alloc->id1.bitmap1.i_total);
fi->ifi_stat[slot].lfi_free =
le32_to_cpu(dinode_alloc->id1.bitmap1.i_total) -
le32_to_cpu(dinode_alloc->id1.bitmap1.i_used);
bail:
if (unlock)
ocfs2_inode_unlock(inode_alloc, 0);
if (inode_alloc)
mutex_unlock(&inode_alloc->i_mutex);
brelse(bh);
return status;
}
int ocfs2_info_handle_freeinode(struct inode *inode,
struct ocfs2_info_request __user *req)
{
u32 i;
u64 blkno = -1;
char namebuf[40];
int status = -EFAULT, type = INODE_ALLOC_SYSTEM_INODE;
struct ocfs2_info_freeinode *oifi = NULL;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct inode *inode_alloc = NULL;
oifi = kzalloc(sizeof(struct ocfs2_info_freeinode), GFP_KERNEL);
if (!oifi) {
status = -ENOMEM;
mlog_errno(status);
goto bail;
}
if (o2info_from_user(*oifi, req))
goto bail;
oifi->ifi_slotnum = osb->max_slots;
for (i = 0; i < oifi->ifi_slotnum; i++) {
if (o2info_coherent(&oifi->ifi_req)) {
inode_alloc = ocfs2_get_system_file_inode(osb, type, i);
if (!inode_alloc) {
mlog(ML_ERROR, "unable to get alloc inode in "
"slot %u\n", i);
status = -EIO;
goto bail;
}
} else {
ocfs2_sprintf_system_inode_name(namebuf,
sizeof(namebuf),
type, i);
status = ocfs2_lookup_ino_from_name(osb->sys_root_inode,
namebuf,
strlen(namebuf),
&blkno);
if (status < 0) {
status = -ENOENT;
goto bail;
}
}
status = ocfs2_info_scan_inode_alloc(osb, inode_alloc, blkno, oifi, i);
if (status < 0)
goto bail;
iput(inode_alloc);
inode_alloc = NULL;
}
o2info_set_request_filled(&oifi->ifi_req);
if (o2info_to_user(*oifi, req))
goto bail;
status = 0;
bail:
if (status)
o2info_set_request_error(&oifi->ifi_req, req);
kfree(oifi);
return status;
}
static void o2ffg_update_histogram(struct ocfs2_info_free_chunk_list *hist,
unsigned int chunksize)
{
int index;
index = __ilog2_u32(chunksize);
if (index >= OCFS2_INFO_MAX_HIST)
index = OCFS2_INFO_MAX_HIST - 1;
hist->fc_chunks[index]++;
hist->fc_clusters[index] += chunksize;
}
static void o2ffg_update_stats(struct ocfs2_info_freefrag_stats *stats,
unsigned int chunksize)
{
if (chunksize > stats->ffs_max)
stats->ffs_max = chunksize;
if (chunksize < stats->ffs_min)
stats->ffs_min = chunksize;
stats->ffs_avg += chunksize;
stats->ffs_free_chunks_real++;
}
void ocfs2_info_update_ffg(struct ocfs2_info_freefrag *ffg,
unsigned int chunksize)
{
o2ffg_update_histogram(&(ffg->iff_ffs.ffs_fc_hist), chunksize);
o2ffg_update_stats(&(ffg->iff_ffs), chunksize);
}
int ocfs2_info_freefrag_scan_chain(struct ocfs2_super *osb,
struct inode *gb_inode,
struct ocfs2_dinode *gb_dinode,
struct ocfs2_chain_rec *rec,
struct ocfs2_info_freefrag *ffg,
u32 chunks_in_group)
{
int status = 0, used;
u64 blkno;
struct buffer_head *bh = NULL;
struct ocfs2_group_desc *bg = NULL;
unsigned int max_bits, num_clusters;
unsigned int offset = 0, cluster, chunk;
unsigned int chunk_free, last_chunksize = 0;
if (!le32_to_cpu(rec->c_free))
goto bail;
do {
if (!bg)
blkno = le64_to_cpu(rec->c_blkno);
else
blkno = le64_to_cpu(bg->bg_next_group);
if (bh) {
brelse(bh);
bh = NULL;
}
if (o2info_coherent(&ffg->iff_req))
status = ocfs2_read_group_descriptor(gb_inode,
gb_dinode,
blkno, &bh);
else
status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh);
if (status < 0) {
mlog(ML_ERROR, "Can't read the group descriptor # "
"%llu from device.", (unsigned long long)blkno);
status = -EIO;
goto bail;
}
bg = (struct ocfs2_group_desc *)bh->b_data;
if (!le16_to_cpu(bg->bg_free_bits_count))
continue;
max_bits = le16_to_cpu(bg->bg_bits);
offset = 0;
for (chunk = 0; chunk < chunks_in_group; chunk++) {
/*
* last chunk may be not an entire one.
*/
if ((offset + ffg->iff_chunksize) > max_bits)
num_clusters = max_bits - offset;
else
num_clusters = ffg->iff_chunksize;
chunk_free = 0;
for (cluster = 0; cluster < num_clusters; cluster++) {
used = ocfs2_test_bit(offset,
(unsigned long *)bg->bg_bitmap);
/*
* - chunk_free counts free clusters in #N chunk.
* - last_chunksize records the size(in) clusters
* for the last real free chunk being counted.
*/
if (!used) {
last_chunksize++;
chunk_free++;
}
if (used && last_chunksize) {
ocfs2_info_update_ffg(ffg,
last_chunksize);
last_chunksize = 0;
}
offset++;
}
if (chunk_free == ffg->iff_chunksize)
ffg->iff_ffs.ffs_free_chunks++;
}
/*
* need to update the info for last free chunk.
*/
if (last_chunksize)
ocfs2_info_update_ffg(ffg, last_chunksize);
} while (le64_to_cpu(bg->bg_next_group));
bail:
brelse(bh);
return status;
}
int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb,
struct inode *gb_inode, u64 blkno,
struct ocfs2_info_freefrag *ffg)
{
u32 chunks_in_group;
int status = 0, unlock = 0, i;
struct buffer_head *bh = NULL;
struct ocfs2_chain_list *cl = NULL;
struct ocfs2_chain_rec *rec = NULL;
struct ocfs2_dinode *gb_dinode = NULL;
if (gb_inode)
mutex_lock(&gb_inode->i_mutex);
if (o2info_coherent(&ffg->iff_req)) {
status = ocfs2_inode_lock(gb_inode, &bh, 0);
if (status < 0) {
mlog_errno(status);
goto bail;
}
unlock = 1;
} else {
status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh);
if (status < 0) {
mlog_errno(status);
goto bail;
}
}
gb_dinode = (struct ocfs2_dinode *)bh->b_data;
cl = &(gb_dinode->id2.i_chain);
/*
* Chunksize(in) clusters from userspace should be
* less than clusters in a group.
*/
if (ffg->iff_chunksize > le16_to_cpu(cl->cl_cpg)) {
status = -EINVAL;
goto bail;
}
memset(&ffg->iff_ffs, 0, sizeof(struct ocfs2_info_freefrag_stats));
ffg->iff_ffs.ffs_min = ~0U;
ffg->iff_ffs.ffs_clusters =
le32_to_cpu(gb_dinode->id1.bitmap1.i_total);
ffg->iff_ffs.ffs_free_clusters = ffg->iff_ffs.ffs_clusters -
le32_to_cpu(gb_dinode->id1.bitmap1.i_used);
chunks_in_group = le16_to_cpu(cl->cl_cpg) / ffg->iff_chunksize + 1;
for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {
rec = &(cl->cl_recs[i]);
status = ocfs2_info_freefrag_scan_chain(osb, gb_inode,
gb_dinode,
rec, ffg,
chunks_in_group);
if (status)
goto bail;
}
if (ffg->iff_ffs.ffs_free_chunks_real)
ffg->iff_ffs.ffs_avg = (ffg->iff_ffs.ffs_avg /
ffg->iff_ffs.ffs_free_chunks_real);
bail:
if (unlock)
ocfs2_inode_unlock(gb_inode, 0);
if (gb_inode)
mutex_unlock(&gb_inode->i_mutex);
if (gb_inode)
iput(gb_inode);
brelse(bh);
return status;
}
int ocfs2_info_handle_freefrag(struct inode *inode,
struct ocfs2_info_request __user *req)
{
u64 blkno = -1;
char namebuf[40];
int status = -EFAULT, type = GLOBAL_BITMAP_SYSTEM_INODE;
struct ocfs2_info_freefrag *oiff;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct inode *gb_inode = NULL;
oiff = kzalloc(sizeof(struct ocfs2_info_freefrag), GFP_KERNEL);
if (!oiff) {
status = -ENOMEM;
mlog_errno(status);
goto bail;
}
if (o2info_from_user(*oiff, req))
goto bail;
/*
* chunksize from userspace should be power of 2.
*/
if ((oiff->iff_chunksize & (oiff->iff_chunksize - 1)) ||
(!oiff->iff_chunksize)) {
status = -EINVAL;
goto bail;
}
if (o2info_coherent(&oiff->iff_req)) {
gb_inode = ocfs2_get_system_file_inode(osb, type,
OCFS2_INVALID_SLOT);
if (!gb_inode) {
mlog(ML_ERROR, "unable to get global_bitmap inode\n");
status = -EIO;
goto bail;
}
} else {
ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type,
OCFS2_INVALID_SLOT);
status = ocfs2_lookup_ino_from_name(osb->sys_root_inode,
namebuf,
strlen(namebuf),
&blkno);
if (status < 0) {
status = -ENOENT;
goto bail;
}
}
status = ocfs2_info_freefrag_scan_bitmap(osb, gb_inode, blkno, oiff);
if (status < 0)
goto bail;
o2info_set_request_filled(&oiff->iff_req);
if (o2info_to_user(*oiff, req))
goto bail;
status = 0;
bail:
if (status)
o2info_set_request_error(&oiff->iff_req, req);
kfree(oiff);
return status;
}
......@@ -327,7 +729,7 @@ int ocfs2_info_handle_unknown(struct inode *inode,
if (o2info_from_user(oir, req))
goto bail;
o2info_clear_request_filled(oir);
o2info_clear_request_filled(&oir);
if (o2info_to_user(oir, req))
goto bail;
......@@ -335,7 +737,7 @@ int ocfs2_info_handle_unknown(struct inode *inode,
status = 0;
bail:
if (status)
o2info_set_request_error(oir, req);
o2info_set_request_error(&oir, req);
return status;
}
......@@ -389,6 +791,14 @@ int ocfs2_info_handle_request(struct inode *inode,
if (oir.ir_size == sizeof(struct ocfs2_info_journal_size))
status = ocfs2_info_handle_journal_size(inode, req);
break;
case OCFS2_INFO_FREEINODE:
if (oir.ir_size == sizeof(struct ocfs2_info_freeinode))
status = ocfs2_info_handle_freeinode(inode, req);
break;
case OCFS2_INFO_FREEFRAG:
if (oir.ir_size == sizeof(struct ocfs2_info_freefrag))
status = ocfs2_info_handle_freefrag(inode, req);
break;
default:
status = ocfs2_info_handle_unknown(inode, req);
break;
......@@ -542,6 +952,31 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return -EFAULT;
return ocfs2_info_handle(inode, &info, 0);
case FITRIM:
{
struct super_block *sb = inode->i_sb;
struct fstrim_range range;
int ret = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (copy_from_user(&range, (struct fstrim_range *)arg,
sizeof(range)))
return -EFAULT;
ret = ocfs2_trim_fs(sb, &range);
if (ret < 0)
return ret;
if (copy_to_user((struct fstrim_range *)arg, &range,
sizeof(range)))
return -EFAULT;
return 0;
}
case OCFS2_IOC_MOVE_EXT:
return ocfs2_ioctl_move_extents(filp, (void __user *)arg);
default:
return -ENOTTY;
}
......@@ -569,6 +1004,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
case OCFS2_IOC_GROUP_EXTEND:
case OCFS2_IOC_GROUP_ADD:
case OCFS2_IOC_GROUP_ADD64:
case FITRIM:
break;
case OCFS2_IOC_REFLINK:
if (copy_from_user(&args, (struct reflink_arguments *)arg,
......@@ -584,6 +1020,8 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
return -EFAULT;
return ocfs2_info_handle(inode, &info, 1);
case OCFS2_IOC_MOVE_EXT:
break;
default:
return -ENOIOCTLCMD;
}
......
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* move_extents.c
*
* Copyright (C) 2011 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License version 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/mount.h>
#include <linux/swap.h>
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "ocfs2_ioctl.h"
#include "alloc.h"
#include "aops.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "inode.h"
#include "journal.h"
#include "suballoc.h"
#include "uptodate.h"
#include "super.h"
#include "dir.h"
#include "buffer_head_io.h"
#include "sysfile.h"
#include "suballoc.h"
#include "refcounttree.h"
#include "move_extents.h"
struct ocfs2_move_extents_context {
struct inode *inode;
struct file *file;
int auto_defrag;
int partial;
int credits;
u32 new_phys_cpos;
u32 clusters_moved;
u64 refcount_loc;
struct ocfs2_move_extents *range;
struct ocfs2_extent_tree et;
struct ocfs2_alloc_context *meta_ac;
struct ocfs2_alloc_context *data_ac;
struct ocfs2_cached_dealloc_ctxt dealloc;
};
static int __ocfs2_move_extent(handle_t *handle,
struct ocfs2_move_extents_context *context,
u32 cpos, u32 len, u32 p_cpos, u32 new_p_cpos,
int ext_flags)
{
int ret = 0, index;
struct inode *inode = context->inode;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_extent_rec *rec, replace_rec;
struct ocfs2_path *path = NULL;
struct ocfs2_extent_list *el;
u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci);
u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos);
ret = ocfs2_duplicate_clusters_by_page(handle, context->file, cpos,
p_cpos, new_p_cpos, len);
if (ret) {
mlog_errno(ret);
goto out;
}
memset(&replace_rec, 0, sizeof(replace_rec));
replace_rec.e_cpos = cpu_to_le32(cpos);
replace_rec.e_leaf_clusters = cpu_to_le16(len);
replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb,
new_p_cpos));
path = ocfs2_new_path_from_et(&context->et);
if (!path) {
ret = -ENOMEM;
mlog_errno(ret);
goto out;
}
ret = ocfs2_find_path(INODE_CACHE(inode), path, cpos);
if (ret) {
mlog_errno(ret);
goto out;
}
el = path_leaf_el(path);
index = ocfs2_search_extent_list(el, cpos);
if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
ocfs2_error(inode->i_sb,
"Inode %llu has an extent at cpos %u which can no "
"longer be found.\n",
(unsigned long long)ino, cpos);
ret = -EROFS;
goto out;
}
rec = &el->l_recs[index];
BUG_ON(ext_flags != rec->e_flags);
/*
* after moving/defraging to new location, the extent is not going
* to be refcounted anymore.
*/
replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED;
ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
context->et.et_root_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out;
}
ret = ocfs2_split_extent(handle, &context->et, path, index,
&replace_rec, context->meta_ac,
&context->dealloc);
if (ret) {
mlog_errno(ret);
goto out;
}
ocfs2_journal_dirty(handle, context->et.et_root_bh);
context->new_phys_cpos = new_p_cpos;
/*
* need I to append truncate log for old clusters?
*/
if (old_blkno) {
if (ext_flags & OCFS2_EXT_REFCOUNTED)
ret = ocfs2_decrease_refcount(inode, handle,
ocfs2_blocks_to_clusters(osb->sb,
old_blkno),
len, context->meta_ac,
&context->dealloc, 1);
else
ret = ocfs2_truncate_log_append(osb, handle,
old_blkno, len);
}
out:
return ret;
}
/*
* lock allocators, and reserving appropriate number of bits for
* meta blocks and data clusters.
*
* in some cases, we don't need to reserve clusters, just let data_ac
* be NULL.
*/
static int ocfs2_lock_allocators_move_extents(struct inode *inode,
struct ocfs2_extent_tree *et,
u32 clusters_to_move,
u32 extents_to_split,
struct ocfs2_alloc_context **meta_ac,
struct ocfs2_alloc_context **data_ac,
int extra_blocks,
int *credits)
{
int ret, num_free_extents;
unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
num_free_extents = ocfs2_num_free_extents(osb, et);
if (num_free_extents < 0) {
ret = num_free_extents;
mlog_errno(ret);
goto out;
}
if (!num_free_extents ||
(ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);
ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, meta_ac);
if (ret) {
mlog_errno(ret);
goto out;
}
if (data_ac) {
ret = ocfs2_reserve_clusters(osb, clusters_to_move, data_ac);
if (ret) {
mlog_errno(ret);
goto out;
}
}
*credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el,
clusters_to_move + 2);
mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n",
extra_blocks, clusters_to_move, *credits);
out:
if (ret) {
if (*meta_ac) {
ocfs2_free_alloc_context(*meta_ac);
*meta_ac = NULL;
}
}
return ret;
}
/*
* Using one journal handle to guarantee the data consistency in case
* crash happens anywhere.
*
* XXX: defrag can end up with finishing partial extent as requested,
* due to not enough contiguous clusters can be found in allocator.
*/
static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
u32 cpos, u32 phys_cpos, u32 *len, int ext_flags)
{
int ret, credits = 0, extra_blocks = 0, partial = context->partial;
handle_t *handle;
struct inode *inode = context->inode;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct inode *tl_inode = osb->osb_tl_inode;
struct ocfs2_refcount_tree *ref_tree = NULL;
u32 new_phys_cpos, new_len;
u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) {
BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
OCFS2_HAS_REFCOUNT_FL));
BUG_ON(!context->refcount_loc);
ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
&ref_tree, NULL);
if (ret) {
mlog_errno(ret);
return ret;
}
ret = ocfs2_prepare_refcount_change_for_del(inode,
context->refcount_loc,
phys_blkno,
*len,
&credits,
&extra_blocks);
if (ret) {
mlog_errno(ret);
goto out;
}
}
ret = ocfs2_lock_allocators_move_extents(inode, &context->et, *len, 1,
&context->meta_ac,
&context->data_ac,
extra_blocks, &credits);
if (ret) {
mlog_errno(ret);
goto out;
}
/*
* should be using allocation reservation strategy there?
*
* if (context->data_ac)
* context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
*/
mutex_lock(&tl_inode->i_mutex);
if (ocfs2_truncate_log_needs_flush(osb)) {
ret = __ocfs2_flush_truncate_log(osb);
if (ret < 0) {
mlog_errno(ret);
goto out_unlock_mutex;
}
}
handle = ocfs2_start_trans(osb, credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
mlog_errno(ret);
goto out_unlock_mutex;
}
ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, *len,
&new_phys_cpos, &new_len);
if (ret) {
mlog_errno(ret);
goto out_commit;
}
/*
* allowing partial extent moving is kind of 'pros and cons', it makes
* whole defragmentation less likely to fail, on the contrary, the bad
* thing is it may make the fs even more fragmented after moving, let
* userspace make a good decision here.
*/
if (new_len != *len) {
mlog(0, "len_claimed: %u, len: %u\n", new_len, *len);
if (!partial) {
context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE;
ret = -ENOSPC;
goto out_commit;
}
}
mlog(0, "cpos: %u, phys_cpos: %u, new_phys_cpos: %u\n", cpos,
phys_cpos, new_phys_cpos);
ret = __ocfs2_move_extent(handle, context, cpos, new_len, phys_cpos,
new_phys_cpos, ext_flags);
if (ret)
mlog_errno(ret);
if (partial && (new_len != *len))
*len = new_len;
/*
* Here we should write the new page out first if we are
* in write-back mode.
*/
ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, *len);
if (ret)
mlog_errno(ret);
out_commit:
ocfs2_commit_trans(osb, handle);
out_unlock_mutex:
mutex_unlock(&tl_inode->i_mutex);
if (context->data_ac) {
ocfs2_free_alloc_context(context->data_ac);
context->data_ac = NULL;
}
if (context->meta_ac) {
ocfs2_free_alloc_context(context->meta_ac);
context->meta_ac = NULL;
}
out:
if (ref_tree)
ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
return ret;
}
/*
* find the victim alloc group, where #blkno fits.
*/
static int ocfs2_find_victim_alloc_group(struct inode *inode,
u64 vict_blkno,
int type, int slot,
int *vict_bit,
struct buffer_head **ret_bh)
{
int ret, i, blocks_per_unit = 1;
u64 blkno;
char namebuf[40];
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct buffer_head *ac_bh = NULL, *gd_bh = NULL;
struct ocfs2_chain_list *cl;
struct ocfs2_chain_rec *rec;
struct ocfs2_dinode *ac_dinode;
struct ocfs2_group_desc *bg;
ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot);
ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf,
strlen(namebuf), &blkno);
if (ret) {
ret = -ENOENT;
goto out;
}
ret = ocfs2_read_blocks_sync(osb, blkno, 1, &ac_bh);
if (ret) {
mlog_errno(ret);
goto out;
}
ac_dinode = (struct ocfs2_dinode *)ac_bh->b_data;
cl = &(ac_dinode->id2.i_chain);
rec = &(cl->cl_recs[0]);
if (type == GLOBAL_BITMAP_SYSTEM_INODE)
blocks_per_unit <<= (osb->s_clustersize_bits -
inode->i_sb->s_blocksize_bits);
/*
* 'vict_blkno' was out of the valid range.
*/
if ((vict_blkno < le64_to_cpu(rec->c_blkno)) ||
(vict_blkno >= (le32_to_cpu(ac_dinode->id1.bitmap1.i_total) *
blocks_per_unit))) {
ret = -EINVAL;
goto out;
}
for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {
rec = &(cl->cl_recs[i]);
if (!rec)
continue;
bg = NULL;
do {
if (!bg)
blkno = le64_to_cpu(rec->c_blkno);
else
blkno = le64_to_cpu(bg->bg_next_group);
if (gd_bh) {
brelse(gd_bh);
gd_bh = NULL;
}
ret = ocfs2_read_blocks_sync(osb, blkno, 1, &gd_bh);
if (ret) {
mlog_errno(ret);
goto out;
}
bg = (struct ocfs2_group_desc *)gd_bh->b_data;
if (vict_blkno < (le64_to_cpu(bg->bg_blkno) +
le16_to_cpu(bg->bg_bits))) {
*ret_bh = gd_bh;
*vict_bit = (vict_blkno - blkno) /
blocks_per_unit;
mlog(0, "find the victim group: #%llu, "
"total_bits: %u, vict_bit: %u\n",
blkno, le16_to_cpu(bg->bg_bits),
*vict_bit);
goto out;
}
} while (le64_to_cpu(bg->bg_next_group));
}
ret = -EINVAL;
out:
brelse(ac_bh);
/*
* caller has to release the gd_bh properly.
*/
return ret;
}
/*
* XXX: helper to validate and adjust moving goal.
*/
static int ocfs2_validate_and_adjust_move_goal(struct inode *inode,
struct ocfs2_move_extents *range)
{
int ret, goal_bit = 0;
struct buffer_head *gd_bh = NULL;
struct ocfs2_group_desc *bg;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
int c_to_b = 1 << (osb->s_clustersize_bits -
inode->i_sb->s_blocksize_bits);
/*
* validate goal sits within global_bitmap, and return the victim
* group desc
*/
ret = ocfs2_find_victim_alloc_group(inode, range->me_goal,
GLOBAL_BITMAP_SYSTEM_INODE,
OCFS2_INVALID_SLOT,
&goal_bit, &gd_bh);
if (ret)
goto out;
bg = (struct ocfs2_group_desc *)gd_bh->b_data;
/*
* make goal become cluster aligned.
*/
if (range->me_goal % c_to_b)
range->me_goal = range->me_goal / c_to_b * c_to_b;
/*
* moving goal is not allowd to start with a group desc blok(#0 blk)
* let's compromise to the latter cluster.
*/
if (range->me_goal == le64_to_cpu(bg->bg_blkno))
range->me_goal += c_to_b;
/*
* movement is not gonna cross two groups.
*/
if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize <
range->me_len) {
ret = -EINVAL;
goto out;
}
/*
* more exact validations/adjustments will be performed later during
* moving operation for each extent range.
*/
mlog(0, "extents get ready to be moved to #%llu block\n",
range->me_goal);
out:
brelse(gd_bh);
return ret;
}
static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh,
int *goal_bit, u32 move_len, u32 max_hop,
u32 *phys_cpos)
{
int i, used, last_free_bits = 0, base_bit = *goal_bit;
struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
u32 base_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
le64_to_cpu(gd->bg_blkno));
for (i = base_bit; i < le16_to_cpu(gd->bg_bits); i++) {
used = ocfs2_test_bit(i, (unsigned long *)gd->bg_bitmap);
if (used) {
/*
* we even tried searching the free chunk by jumping
* a 'max_hop' distance, but still failed.
*/
if ((i - base_bit) > max_hop) {
*phys_cpos = 0;
break;
}
if (last_free_bits)
last_free_bits = 0;
continue;
} else
last_free_bits++;
if (last_free_bits == move_len) {
*goal_bit = i;
*phys_cpos = base_cpos + i;
break;
}
}
mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos);
}
static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
handle_t *handle,
struct buffer_head *di_bh,
u32 num_bits,
u16 chain)
{
int ret;
u32 tmp_used;
struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
struct ocfs2_chain_list *cl =
(struct ocfs2_chain_list *) &di->id2.i_chain;
ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
ocfs2_journal_dirty(handle, di_bh);
out:
return ret;
}
static inline int ocfs2_block_group_set_bits(handle_t *handle,
struct inode *alloc_inode,
struct ocfs2_group_desc *bg,
struct buffer_head *group_bh,
unsigned int bit_off,
unsigned int num_bits)
{
int status;
void *bitmap = bg->bg_bitmap;
int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
/* All callers get the descriptor via
* ocfs2_read_group_descriptor(). Any corruption is a code bug. */
BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
num_bits);
if (ocfs2_is_cluster_bitmap(alloc_inode))
journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
status = ocfs2_journal_access_gd(handle,
INODE_CACHE(alloc_inode),
group_bh,
journal_type);
if (status < 0) {
mlog_errno(status);
goto bail;
}
le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
" count %u but claims %u are freed. num_bits %d",
(unsigned long long)le64_to_cpu(bg->bg_blkno),
le16_to_cpu(bg->bg_bits),
le16_to_cpu(bg->bg_free_bits_count), num_bits);
return -EROFS;
}
while (num_bits--)
ocfs2_set_bit(bit_off++, bitmap);
ocfs2_journal_dirty(handle, group_bh);
bail:
return status;
}
static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
u32 cpos, u32 phys_cpos, u32 *new_phys_cpos,
u32 len, int ext_flags)
{
int ret, credits = 0, extra_blocks = 0, goal_bit = 0;
handle_t *handle;
struct inode *inode = context->inode;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct inode *tl_inode = osb->osb_tl_inode;
struct inode *gb_inode = NULL;
struct buffer_head *gb_bh = NULL;
struct buffer_head *gd_bh = NULL;
struct ocfs2_group_desc *gd;
struct ocfs2_refcount_tree *ref_tree = NULL;
u32 move_max_hop = ocfs2_blocks_to_clusters(inode->i_sb,
context->range->me_threshold);
u64 phys_blkno, new_phys_blkno;
phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) {
BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
OCFS2_HAS_REFCOUNT_FL));
BUG_ON(!context->refcount_loc);
ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
&ref_tree, NULL);
if (ret) {
mlog_errno(ret);
return ret;
}
ret = ocfs2_prepare_refcount_change_for_del(inode,
context->refcount_loc,
phys_blkno,
len,
&credits,
&extra_blocks);
if (ret) {
mlog_errno(ret);
goto out;
}
}
ret = ocfs2_lock_allocators_move_extents(inode, &context->et, len, 1,
&context->meta_ac,
NULL, extra_blocks, &credits);
if (ret) {
mlog_errno(ret);
goto out;
}
/*
* need to count 2 extra credits for global_bitmap inode and
* group descriptor.
*/
credits += OCFS2_INODE_UPDATE_CREDITS + 1;
/*
* ocfs2_move_extent() didn't reserve any clusters in lock_allocators()
* logic, while we still need to lock the global_bitmap.
*/
gb_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE,
OCFS2_INVALID_SLOT);
if (!gb_inode) {
mlog(ML_ERROR, "unable to get global_bitmap inode\n");
ret = -EIO;
goto out;
}
mutex_lock(&gb_inode->i_mutex);
ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1);
if (ret) {
mlog_errno(ret);
goto out_unlock_gb_mutex;
}
mutex_lock(&tl_inode->i_mutex);
handle = ocfs2_start_trans(osb, credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
mlog_errno(ret);
goto out_unlock_tl_inode;
}
new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos);
ret = ocfs2_find_victim_alloc_group(inode, new_phys_blkno,
GLOBAL_BITMAP_SYSTEM_INODE,
OCFS2_INVALID_SLOT,
&goal_bit, &gd_bh);
if (ret) {
mlog_errno(ret);
goto out_commit;
}
/*
* probe the victim cluster group to find a proper
* region to fit wanted movement, it even will perfrom
* a best-effort attempt by compromising to a threshold
* around the goal.
*/
ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop,
new_phys_cpos);
if (!new_phys_cpos) {
ret = -ENOSPC;
goto out_commit;
}
ret = __ocfs2_move_extent(handle, context, cpos, len, phys_cpos,
*new_phys_cpos, ext_flags);
if (ret) {
mlog_errno(ret);
goto out_commit;
}
gd = (struct ocfs2_group_desc *)gd_bh->b_data;
ret = ocfs2_alloc_dinode_update_counts(gb_inode, handle, gb_bh, len,
le16_to_cpu(gd->bg_chain));
if (ret) {
mlog_errno(ret);
goto out_commit;
}
ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh,
goal_bit, len);
if (ret)
mlog_errno(ret);
/*
* Here we should write the new page out first if we are
* in write-back mode.
*/
ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, len);
if (ret)
mlog_errno(ret);
out_commit:
ocfs2_commit_trans(osb, handle);
brelse(gd_bh);
out_unlock_tl_inode:
mutex_unlock(&tl_inode->i_mutex);
ocfs2_inode_unlock(gb_inode, 1);
out_unlock_gb_mutex:
mutex_unlock(&gb_inode->i_mutex);
brelse(gb_bh);
iput(gb_inode);
out:
if (context->meta_ac) {
ocfs2_free_alloc_context(context->meta_ac);
context->meta_ac = NULL;
}
if (ref_tree)
ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
return ret;
}
/*
* Helper to calculate the defraging length in one run according to threshold.
*/
static void ocfs2_calc_extent_defrag_len(u32 *alloc_size, u32 *len_defraged,
u32 threshold, int *skip)
{
if ((*alloc_size + *len_defraged) < threshold) {
/*
* proceed defragmentation until we meet the thresh
*/
*len_defraged += *alloc_size;
} else if (*len_defraged == 0) {
/*
* XXX: skip a large extent.
*/
*skip = 1;
} else {
/*
* split this extent to coalesce with former pieces as
* to reach the threshold.
*
* we're done here with one cycle of defragmentation
* in a size of 'thresh', resetting 'len_defraged'
* forces a new defragmentation.
*/
*alloc_size = threshold - *len_defraged;
*len_defraged = 0;
}
}
static int __ocfs2_move_extents_range(struct buffer_head *di_bh,
struct ocfs2_move_extents_context *context)
{
int ret = 0, flags, do_defrag, skip = 0;
u32 cpos, phys_cpos, move_start, len_to_move, alloc_size;
u32 len_defraged = 0, defrag_thresh = 0, new_phys_cpos = 0;
struct inode *inode = context->inode;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
struct ocfs2_move_extents *range = context->range;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
if ((inode->i_size == 0) || (range->me_len == 0))
return 0;
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
return 0;
context->refcount_loc = le64_to_cpu(di->i_refcount_loc);
ocfs2_init_dinode_extent_tree(&context->et, INODE_CACHE(inode), di_bh);
ocfs2_init_dealloc_ctxt(&context->dealloc);
/*
* TO-DO XXX:
*
* - xattr extents.
*/
do_defrag = context->auto_defrag;
/*
* extents moving happens in unit of clusters, for the sake
* of simplicity, we may ignore two clusters where 'byte_start'
* and 'byte_start + len' were within.
*/
move_start = ocfs2_clusters_for_bytes(osb->sb, range->me_start);
len_to_move = (range->me_start + range->me_len) >>
osb->s_clustersize_bits;
if (len_to_move >= move_start)
len_to_move -= move_start;
else
len_to_move = 0;
if (do_defrag) {
defrag_thresh = range->me_threshold >> osb->s_clustersize_bits;
if (defrag_thresh <= 1)
goto done;
} else
new_phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
range->me_goal);
mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u, "
"thresh: %u\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
(unsigned long long)range->me_start,
(unsigned long long)range->me_len,
move_start, len_to_move, defrag_thresh);
cpos = move_start;
while (len_to_move) {
ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &alloc_size,
&flags);
if (ret) {
mlog_errno(ret);
goto out;
}
if (alloc_size > len_to_move)
alloc_size = len_to_move;
/*
* XXX: how to deal with a hole:
*
* - skip the hole of course
* - force a new defragmentation
*/
if (!phys_cpos) {
if (do_defrag)
len_defraged = 0;
goto next;
}
if (do_defrag) {
ocfs2_calc_extent_defrag_len(&alloc_size, &len_defraged,
defrag_thresh, &skip);
/*
* skip large extents
*/
if (skip) {
skip = 0;
goto next;
}
mlog(0, "#Defrag: cpos: %u, phys_cpos: %u, "
"alloc_size: %u, len_defraged: %u\n",
cpos, phys_cpos, alloc_size, len_defraged);
ret = ocfs2_defrag_extent(context, cpos, phys_cpos,
&alloc_size, flags);
} else {
ret = ocfs2_move_extent(context, cpos, phys_cpos,
&new_phys_cpos, alloc_size,
flags);
new_phys_cpos += alloc_size;
}
if (ret < 0) {
mlog_errno(ret);
goto out;
}
context->clusters_moved += alloc_size;
next:
cpos += alloc_size;
len_to_move -= alloc_size;
}
done:
range->me_flags |= OCFS2_MOVE_EXT_FL_COMPLETE;
out:
range->me_moved_len = ocfs2_clusters_to_bytes(osb->sb,
context->clusters_moved);
range->me_new_offset = ocfs2_clusters_to_bytes(osb->sb,
context->new_phys_cpos);
ocfs2_schedule_truncate_log_flush(osb, 1);
ocfs2_run_deallocs(osb, &context->dealloc);
return ret;
}
static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
{
int status;
handle_t *handle;
struct inode *inode = context->inode;
struct ocfs2_dinode *di;
struct buffer_head *di_bh = NULL;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
if (!inode)
return -ENOENT;
if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
return -EROFS;
mutex_lock(&inode->i_mutex);
/*
* This prevents concurrent writes from other nodes
*/
status = ocfs2_rw_lock(inode, 1);
if (status) {
mlog_errno(status);
goto out;
}
status = ocfs2_inode_lock(inode, &di_bh, 1);
if (status) {
mlog_errno(status);
goto out_rw_unlock;
}
/*
* rememer ip_xattr_sem also needs to be held if necessary
*/
down_write(&OCFS2_I(inode)->ip_alloc_sem);
status = __ocfs2_move_extents_range(di_bh, context);
up_write(&OCFS2_I(inode)->ip_alloc_sem);
if (status) {
mlog_errno(status);
goto out_inode_unlock;
}
/*
* We update ctime for these changes
*/
handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
mlog_errno(status);
goto out_inode_unlock;
}
status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status) {
mlog_errno(status);
goto out_commit;
}
di = (struct ocfs2_dinode *)di_bh->b_data;
inode->i_ctime = CURRENT_TIME;
di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
ocfs2_journal_dirty(handle, di_bh);
out_commit:
ocfs2_commit_trans(osb, handle);
out_inode_unlock:
brelse(di_bh);
ocfs2_inode_unlock(inode, 1);
out_rw_unlock:
ocfs2_rw_unlock(inode, 1);
out:
mutex_unlock(&inode->i_mutex);
return status;
}
int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
{
int status;
struct inode *inode = filp->f_path.dentry->d_inode;
struct ocfs2_move_extents range;
struct ocfs2_move_extents_context *context = NULL;
status = mnt_want_write(filp->f_path.mnt);
if (status)
return status;
if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE))
goto out;
if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
status = -EPERM;
goto out;
}
context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS);
if (!context) {
status = -ENOMEM;
mlog_errno(status);
goto out;
}
context->inode = inode;
context->file = filp;
if (argp) {
if (copy_from_user(&range, (struct ocfs2_move_extents *)argp,
sizeof(range))) {
status = -EFAULT;
goto out;
}
} else {
status = -EINVAL;
goto out;
}
if (range.me_start > i_size_read(inode))
goto out;
if (range.me_start + range.me_len > i_size_read(inode))
range.me_len = i_size_read(inode) - range.me_start;
context->range = &range;
if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) {
context->auto_defrag = 1;
/*
* ok, the default theshold for the defragmentation
* is 1M, since our maximum clustersize was 1M also.
* any thought?
*/
if (!range.me_threshold)
range.me_threshold = 1024 * 1024;
if (range.me_threshold > i_size_read(inode))
range.me_threshold = i_size_read(inode);
if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG)
context->partial = 1;
} else {
/*
* first best-effort attempt to validate and adjust the goal
* (physical address in block), while it can't guarantee later
* operation can succeed all the time since global_bitmap may
* change a bit over time.
*/
status = ocfs2_validate_and_adjust_move_goal(inode, &range);
if (status)
goto out;
}
status = ocfs2_move_extents(context);
if (status)
mlog_errno(status);
out:
/*
* movement/defragmentation may end up being partially completed,
* that's the reason why we need to return userspace the finished
* length and new_offset even if failure happens somewhere.
*/
if (argp) {
if (copy_to_user((struct ocfs2_move_extents *)argp, &range,
sizeof(range)))
status = -EFAULT;
}
kfree(context);
mnt_drop_write(filp->f_path.mnt);
return status;
}
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* move_extents.h
*
* Copyright (C) 2011 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License version 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#ifndef OCFS2_MOVE_EXTENTS_H
#define OCFS2_MOVE_EXTENTS_H
int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp);
#endif /* OCFS2_MOVE_EXTENTS_H */
......@@ -142,6 +142,38 @@ struct ocfs2_info_journal_size {
__u64 ij_journal_size;
};
struct ocfs2_info_freeinode {
struct ocfs2_info_request ifi_req;
struct ocfs2_info_local_freeinode {
__u64 lfi_total;
__u64 lfi_free;
} ifi_stat[OCFS2_MAX_SLOTS];
__u32 ifi_slotnum; /* out */
__u32 ifi_pad;
};
#define OCFS2_INFO_MAX_HIST (32)
struct ocfs2_info_freefrag {
struct ocfs2_info_request iff_req;
struct ocfs2_info_freefrag_stats { /* (out) */
struct ocfs2_info_free_chunk_list {
__u32 fc_chunks[OCFS2_INFO_MAX_HIST];
__u32 fc_clusters[OCFS2_INFO_MAX_HIST];
} ffs_fc_hist;
__u32 ffs_clusters;
__u32 ffs_free_clusters;
__u32 ffs_free_chunks;
__u32 ffs_free_chunks_real;
__u32 ffs_min; /* Minimum free chunksize in clusters */
__u32 ffs_max;
__u32 ffs_avg;
__u32 ffs_pad;
} iff_ffs;
__u32 iff_chunksize; /* chunksize in clusters(in) */
__u32 iff_pad;
};
/* Codes for ocfs2_info_request */
enum ocfs2_info_type {
OCFS2_INFO_CLUSTERSIZE = 1,
......@@ -151,6 +183,8 @@ enum ocfs2_info_type {
OCFS2_INFO_UUID,
OCFS2_INFO_FS_FEATURES,
OCFS2_INFO_JOURNAL_SIZE,
OCFS2_INFO_FREEINODE,
OCFS2_INFO_FREEFRAG,
OCFS2_INFO_NUM_TYPES
};
......@@ -171,4 +205,38 @@ enum ocfs2_info_type {
#define OCFS2_IOC_INFO _IOR('o', 5, struct ocfs2_info)
struct ocfs2_move_extents {
/* All values are in bytes */
/* in */
__u64 me_start; /* Virtual start in the file to move */
__u64 me_len; /* Length of the extents to be moved */
__u64 me_goal; /* Physical offset of the goal,
it's in block unit */
__u64 me_threshold; /* Maximum distance from goal or threshold
for auto defragmentation */
__u64 me_flags; /* Flags for the operation:
* - auto defragmentation.
* - refcount,xattr cases.
*/
/* out */
__u64 me_moved_len; /* Moved/defraged length */
__u64 me_new_offset; /* Resulting physical location */
__u32 me_reserved[2]; /* Reserved for futhure */
};
#define OCFS2_MOVE_EXT_FL_AUTO_DEFRAG (0x00000001) /* Kernel manages to
claim new clusters
as the goal place
for extents moving */
#define OCFS2_MOVE_EXT_FL_PART_DEFRAG (0x00000002) /* Allow partial extent
moving, is to make
movement less likely
to fail, may make fs
even more fragmented */
#define OCFS2_MOVE_EXT_FL_COMPLETE (0x00000004) /* Move or defragmenation
completely gets done.
*/
#define OCFS2_IOC_MOVE_EXT _IOW('o', 6, struct ocfs2_move_extents)
#endif /* OCFS2_IOCTL_H */
......@@ -688,6 +688,31 @@ TRACE_EVENT(ocfs2_cache_block_dealloc,
__entry->blkno, __entry->bit)
);
TRACE_EVENT(ocfs2_trim_extent,
TP_PROTO(struct super_block *sb, unsigned long long blk,
unsigned long long count),
TP_ARGS(sb, blk, count),
TP_STRUCT__entry(
__field(int, dev_major)
__field(int, dev_minor)
__field(unsigned long long, blk)
__field(__u64, count)
),
TP_fast_assign(
__entry->dev_major = MAJOR(sb->s_dev);
__entry->dev_minor = MINOR(sb->s_dev);
__entry->blk = blk;
__entry->count = count;
),
TP_printk("%d %d %llu %llu",
__entry->dev_major, __entry->dev_minor,
__entry->blk, __entry->count)
);
DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group);
DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs);
/* End of trace events for fs/ocfs2/alloc.c. */
/* Trace events for fs/ocfs2/localalloc.c. */
......
......@@ -66,7 +66,7 @@ struct ocfs2_cow_context {
u32 *num_clusters,
unsigned int *extent_flags);
int (*cow_duplicate_clusters)(handle_t *handle,
struct ocfs2_cow_context *context,
struct file *file,
u32 cpos, u32 old_cluster,
u32 new_cluster, u32 new_len);
};
......@@ -2921,20 +2921,21 @@ static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh)
return 0;
}
static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
struct ocfs2_cow_context *context,
int ocfs2_duplicate_clusters_by_page(handle_t *handle,
struct file *file,
u32 cpos, u32 old_cluster,
u32 new_cluster, u32 new_len)
{
int ret = 0, partial;
struct ocfs2_caching_info *ci = context->data_et.et_ci;
struct inode *inode = file->f_path.dentry->d_inode;
struct ocfs2_caching_info *ci = INODE_CACHE(inode);
struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
struct page *page;
pgoff_t page_index;
unsigned int from, to, readahead_pages;
loff_t offset, end, map_end;
struct address_space *mapping = context->inode->i_mapping;
struct address_space *mapping = inode->i_mapping;
trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster,
new_cluster, new_len);
......@@ -2948,8 +2949,8 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
* We only duplicate pages until we reach the page contains i_size - 1.
* So trim 'end' to i_size.
*/
if (end > i_size_read(context->inode))
end = i_size_read(context->inode);
if (end > i_size_read(inode))
end = i_size_read(inode);
while (offset < end) {
page_index = offset >> PAGE_CACHE_SHIFT;
......@@ -2972,10 +2973,9 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize)
BUG_ON(PageDirty(page));
if (PageReadahead(page) && context->file) {
if (PageReadahead(page)) {
page_cache_async_readahead(mapping,
&context->file->f_ra,
context->file,
&file->f_ra, file,
page, page_index,
readahead_pages);
}
......@@ -2999,8 +2999,7 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
}
}
ocfs2_map_and_dirty_page(context->inode,
handle, from, to,
ocfs2_map_and_dirty_page(inode, handle, from, to,
page, 0, &new_block);
mark_page_accessed(page);
unlock:
......@@ -3015,14 +3014,15 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
return ret;
}
static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
struct ocfs2_cow_context *context,
int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
struct file *file,
u32 cpos, u32 old_cluster,
u32 new_cluster, u32 new_len)
{
int ret = 0;
struct super_block *sb = context->inode->i_sb;
struct ocfs2_caching_info *ci = context->data_et.et_ci;
struct inode *inode = file->f_path.dentry->d_inode;
struct super_block *sb = inode->i_sb;
struct ocfs2_caching_info *ci = INODE_CACHE(inode);
int i, blocks = ocfs2_clusters_to_blocks(sb, new_len);
u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster);
u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
......@@ -3145,8 +3145,8 @@ static int ocfs2_replace_clusters(handle_t *handle,
/*If the old clusters is unwritten, no need to duplicate. */
if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) {
ret = context->cow_duplicate_clusters(handle, context, cpos,
old, new, len);
ret = context->cow_duplicate_clusters(handle, context->file,
cpos, old, new, len);
if (ret) {
mlog_errno(ret);
goto out;
......@@ -3162,8 +3162,8 @@ static int ocfs2_replace_clusters(handle_t *handle,
return ret;
}
static int ocfs2_cow_sync_writeback(struct super_block *sb,
struct ocfs2_cow_context *context,
int ocfs2_cow_sync_writeback(struct super_block *sb,
struct inode *inode,
u32 cpos, u32 num_clusters)
{
int ret = 0;
......@@ -3171,13 +3171,13 @@ static int ocfs2_cow_sync_writeback(struct super_block *sb,
pgoff_t page_index;
struct page *page;
if (ocfs2_should_order_data(context->inode))
if (ocfs2_should_order_data(inode))
return 0;
offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits);
ret = filemap_fdatawrite_range(context->inode->i_mapping,
ret = filemap_fdatawrite_range(inode->i_mapping,
offset, end - 1);
if (ret < 0) {
mlog_errno(ret);
......@@ -3190,7 +3190,7 @@ static int ocfs2_cow_sync_writeback(struct super_block *sb,
if (map_end > end)
map_end = end;
page = find_or_create_page(context->inode->i_mapping,
page = find_or_create_page(inode->i_mapping,
page_index, GFP_NOFS);
BUG_ON(!page);
......@@ -3349,7 +3349,7 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
* in write-back mode.
*/
if (context->get_clusters == ocfs2_di_get_clusters) {
ret = ocfs2_cow_sync_writeback(sb, context, cpos,
ret = ocfs2_cow_sync_writeback(sb, context->inode, cpos,
orig_num_clusters);
if (ret)
mlog_errno(ret);
......
......@@ -84,6 +84,17 @@ int ocfs2_refcount_cow_xattr(struct inode *inode,
struct buffer_head *ref_root_bh,
u32 cpos, u32 write_len,
struct ocfs2_post_refcount *post);
int ocfs2_duplicate_clusters_by_page(handle_t *handle,
struct file *file,
u32 cpos, u32 old_cluster,
u32 new_cluster, u32 new_len);
int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
struct file *file,
u32 cpos, u32 old_cluster,
u32 new_cluster, u32 new_len);
int ocfs2_cow_sync_writeback(struct super_block *sb,
struct inode *inode,
u32 cpos, u32 num_clusters);
int ocfs2_add_refcount_flag(struct inode *inode,
struct ocfs2_extent_tree *data_et,
struct ocfs2_caching_info *ref_ci,
......
......@@ -1567,7 +1567,7 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
if (osb->preferred_slot != OCFS2_INVALID_SLOT)
seq_printf(s, ",preferred_slot=%d", osb->preferred_slot);
if (osb->s_atime_quantum != OCFS2_DEFAULT_ATIME_QUANTUM)
if (!(mnt->mnt_flags & MNT_NOATIME) && !(mnt->mnt_flags & MNT_RELATIME))
seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum);
if (osb->osb_commit_interval)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment