Commit a74b81b0 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge branch 'linux-next' of git://git.kernel.org/pub/scm/linux/kernel/git/jlbec/ocfs2

* 'linux-next' of git://git.kernel.org/pub/scm/linux/kernel/git/jlbec/ocfs2: (28 commits)
  Ocfs2: Teach local-mounted ocfs2 to handle unwritten_extents correctly.
  ocfs2/dlm: Do not migrate resource to a node that is leaving the domain
  ocfs2/dlm: Add new dlm message DLM_BEGIN_EXIT_DOMAIN_MSG
  Ocfs2/move_extents: Set several trivial constraints for threshold.
  Ocfs2/move_extents: Let defrag handle partial extent moving.
  Ocfs2/move_extents: move/defrag extents within a certain range.
  Ocfs2/move_extents: helper to calculate the defraging length in one run.
  Ocfs2/move_extents: move entire/partial extent.
  Ocfs2/move_extents: helpers to update the group descriptor and global bitmap inode.
  Ocfs2/move_extents: helper to probe a proper region to move in an alloc group.
  Ocfs2/move_extents: helper to validate and adjust moving goal.
  Ocfs2/move_extents: find the victim alloc group, where the given #blk fits.
  Ocfs2/move_extents: defrag a range of extent.
  Ocfs2/move_extents: move a range of extent.
  Ocfs2/move_extents: lock allocators and reserve metadata blocks and data clusters for extents moving.
  Ocfs2/move_extents: Add basic framework and source files for extent moving.
  Ocfs2/move_extents: Adding new ioctl code 'OCFS2_IOC_MOVE_EXT' to ocfs2.
  Ocfs2/refcounttree: Publicize couple of funcs from refcounttree.c
  Ocfs2: Add a new code 'OCFS2_INFO_FREEFRAG' for o2info ioctl.
  Ocfs2: Add a new code 'OCFS2_INFO_FREEINODE' for o2info ioctl.
  ...
parents f8d613e2 ece928df
What: /sys/o2cb symlink
Date: Dec 2005
KernelVersion: 2.6.16
Date: May 2011
KernelVersion: 2.6.40
Contact: ocfs2-devel@oss.oracle.com
Description: This is a symlink: /sys/o2cb to /sys/fs/o2cb. The symlink will
be removed when new versions of ocfs2-tools which know to look
Description: This is a symlink: /sys/o2cb to /sys/fs/o2cb. The symlink is
removed when new versions of ocfs2-tools which know to look
in /sys/fs/o2cb are sufficiently prevalent. Don't code new
software to look here, it should try /sys/fs/o2cb instead.
See Documentation/ABI/stable/o2cb for more information on usage.
Users: ocfs2-tools. It's sufficient to mail proposed changes to
ocfs2-devel@oss.oracle.com.
......@@ -262,16 +262,6 @@ Who: Michael Buesch <mb@bu3sch.de>
---------------------------
What: /sys/o2cb symlink
When: January 2010
Why: /sys/fs/o2cb is the proper location for this information - /sys/o2cb
exists as a symlink for backwards compatibility for old versions of
ocfs2-tools. 2 years should be sufficient time to phase in new versions
which know to look in /sys/fs/o2cb.
Who: ocfs2-devel@oss.oracle.com
---------------------------
What: Ability for non root users to shm_get hugetlb pages based on mlock
resource limits
When: 2.6.31
......
......@@ -46,9 +46,15 @@ errors=panic Panic and halt the machine if an error occurs.
intr (*) Allow signals to interrupt cluster operations.
nointr Do not allow signals to interrupt cluster
operations.
noatime Do not update access time.
relatime(*) Update atime if the previous atime is older than
mtime or ctime
strictatime Always update atime, but the minimum update interval
is specified by atime_quantum.
atime_quantum=60(*) OCFS2 will not update atime unless this number
of seconds has passed since the last update.
Set to zero to always update atime.
Set to zero to always update atime. This option needs
to work with strictatime.
data=ordered (*) All data are forced directly out to the main file
system prior to its metadata being committed to the
journal.
......
......@@ -30,6 +30,7 @@ ocfs2-objs := \
namei.o \
refcounttree.o \
reservations.o \
move_extents.o \
resize.o \
slot_map.o \
suballoc.o \
......
......@@ -29,6 +29,7 @@
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/quotaops.h>
#include <linux/blkdev.h>
#include <cluster/masklog.h>
......@@ -7184,3 +7185,168 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
out:
return ret;
}
/*
 * Issue a discard for one run of free clusters found in a group bitmap.
 *
 * @sb:    super block of the filesystem being trimmed
 * @gd:    descriptor of the cluster group the run belongs to
 * @start: bit offset of the first free cluster, as used in gd->bg_bitmap
 * @count: length of the run, in clusters
 *
 * Returns the result of sb_issue_discard().
 */
static int ocfs2_trim_extent(struct super_block *sb,
			     struct ocfs2_group_desc *gd,
			     u32 start, u32 count)
{
	struct ocfs2_super *osb = OCFS2_SB(sb);
	u64 group = le64_to_cpu(gd->bg_blkno);
	u64 discard, bcount;

	bcount = ocfs2_clusters_to_blocks(sb, count);
	discard = ocfs2_clusters_to_blocks(sb, start);

	/*
	 * ocfs2_trim_fs() hands us absolute cluster numbers as bit offsets
	 * for the first cluster group (it does not subtract the group's
	 * block number there), so the descriptor's block number must only
	 * be added for the other groups.  Adding it for the first group
	 * would shift the discard forward by bg_blkno blocks and let it
	 * spill into the clusters that follow the free run.
	 */
	if (group != osb->first_cluster_group_blkno)
		discard += group;

	trace_ocfs2_trim_extent(sb, (unsigned long long)discard, bcount);

	return sb_issue_discard(sb, discard, bcount, GFP_NOFS, 0);
}
/*
 * Walk the bitmap of one cluster group between bits @start and @max and
 * discard every run of clear bits that is at least @minbits long.
 *
 * Returns the number of bits (clusters) handed to ocfs2_trim_extent(),
 * a negative error from ocfs2_trim_extent(), or -ERESTARTSYS when a
 * fatal signal is pending.
 */
static int ocfs2_trim_group(struct super_block *sb,
			    struct ocfs2_group_desc *gd,
			    u32 start, u32 max, u32 minbits)
{
	int trimmed = 0;
	void *bits = gd->bg_bitmap;

	/* Not enough free bits in the whole group to satisfy minbits. */
	if (le16_to_cpu(gd->bg_free_bits_count) < minbits)
		return 0;

	trace_ocfs2_trim_group((unsigned long long)le64_to_cpu(gd->bg_blkno),
			       start, max, minbits);

	while (start < max) {
		int end, err;
		u32 run;

		start = ocfs2_find_next_zero_bit(bits, max, start);
		if (start >= max)
			break;

		end = ocfs2_find_next_bit(bits, max, start);
		run = end - start;

		if (run >= minbits) {
			err = ocfs2_trim_extent(sb, gd, start, run);
			if (err < 0) {
				mlog_errno(err);
				return err;
			}
			trimmed += run;
		}

		/* Bit 'end' is set (or is max), so resume just past it. */
		start = end + 1;

		if (fatal_signal_pending(current))
			return -ERESTARTSYS;

		/* What is left free can no longer hold a long-enough run. */
		if ((le16_to_cpu(gd->bg_free_bits_count) - trimmed) < minbits)
			break;
	}

	return trimmed;
}
/*
 * Discard the free clusters of the global bitmap that fall inside @range.
 *
 * @sb:    super block of the filesystem to trim
 * @range: request; start, len and minlen are shifted down by the cluster
 *         size bits to obtain cluster counts.  On return range->len holds
 *         the amount trimmed, in bytes.
 *
 * Returns 0 on success (including an empty request), -EINVAL when minlen
 * is not smaller than a cluster group or start lies beyond the bitmap,
 * -EIO when the global bitmap inode cannot be obtained, or the negative
 * error of the failing helper.
 */
int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
{
	struct ocfs2_super *osb = OCFS2_SB(sb);
	u64 start, len, trimmed, first_group, last_group, group;
	int ret, cnt;
	u32 first_bit, last_bit, minlen;
	struct buffer_head *main_bm_bh = NULL;
	struct inode *main_bm_inode = NULL;
	struct buffer_head *gd_bh = NULL;
	struct ocfs2_dinode *main_bm;
	struct ocfs2_group_desc *gd = NULL;

	start = range->start >> osb->s_clustersize_bits;
	len = range->len >> osb->s_clustersize_bits;
	trimmed = 0;

	if (!len) {
		range->len = 0;
		return 0;
	}

	/*
	 * Compare at full 64-bit width: narrowing into the u32 minlen first
	 * could wrap a huge request into a small, seemingly valid value.
	 */
	if ((range->minlen >> osb->s_clustersize_bits) >= osb->bitmap_cpg)
		return -EINVAL;
	minlen = range->minlen >> osb->s_clustersize_bits;

	main_bm_inode = ocfs2_get_system_file_inode(osb,
						    GLOBAL_BITMAP_SYSTEM_INODE,
						    OCFS2_INVALID_SLOT);
	if (!main_bm_inode) {
		ret = -EIO;
		mlog_errno(ret);
		goto out;
	}

	mutex_lock(&main_bm_inode->i_mutex);

	ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0);
	if (ret < 0) {
		mlog_errno(ret);
		goto out_mutex;
	}
	main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;

	if (start >= le32_to_cpu(main_bm->i_clusters)) {
		ret = -EINVAL;
		goto out_unlock;
	}

	/* Clamp the request to the end of the bitmap. */
	if (start + len > le32_to_cpu(main_bm->i_clusters))
		len = le32_to_cpu(main_bm->i_clusters) - start;

	trace_ocfs2_trim_fs(start, len, minlen);

	/* Determine first and last group to examine based on start and len */
	first_group = ocfs2_which_cluster_group(main_bm_inode, start);
	if (first_group == osb->first_cluster_group_blkno)
		first_bit = start;
	else
		first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
	last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
	last_bit = osb->bitmap_cpg;

	for (group = first_group; group <= last_group;) {
		if (first_bit + len >= osb->bitmap_cpg)
			last_bit = osb->bitmap_cpg;
		else
			last_bit = first_bit + len;

		ret = ocfs2_read_group_descriptor(main_bm_inode,
						  main_bm, group,
						  &gd_bh);
		if (ret < 0) {
			mlog_errno(ret);
			break;
		}

		gd = (struct ocfs2_group_desc *)gd_bh->b_data;
		cnt = ocfs2_trim_group(sb, gd, first_bit, last_bit, minlen);
		brelse(gd_bh);
		gd_bh = NULL;
		if (cnt < 0) {
			ret = cnt;
			mlog_errno(ret);
			break;
		}

		trimmed += cnt;
		len -= osb->bitmap_cpg - first_bit;
		first_bit = 0;
		/*
		 * The first group is identified by its own block number;
		 * every later group starts at a multiple of the group size.
		 */
		if (group == osb->first_cluster_group_blkno)
			group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
		else
			group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
	}

	/*
	 * 'trimmed' counts clusters (one bitmap bit each), so convert with
	 * the cluster size, not the block size, to report bytes.
	 */
	range->len = trimmed << osb->s_clustersize_bits;
out_unlock:
	ocfs2_inode_unlock(main_bm_inode, 0);
	brelse(main_bm_bh);
out_mutex:
	mutex_unlock(&main_bm_inode->i_mutex);
	iput(main_bm_inode);
out:
	return ret;
}
......@@ -239,6 +239,7 @@ int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
struct buffer_head **leaf_bh);
int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster);
int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range);
/*
* Helper function to look at the # of clusters in an extent record.
*/
......
......@@ -57,7 +57,6 @@ static struct kset *o2cb_kset;
/*
 * Shut down the o2cb sysfs state: calls mlog_sys_shutdown(), removes the
 * "o2cb" sysfs link and unregisters o2cb_kset.
 */
void o2cb_sys_shutdown(void)
{
mlog_sys_shutdown();
sysfs_remove_link(NULL, "o2cb");
kset_unregister(o2cb_kset);
}
......@@ -69,14 +68,6 @@ int o2cb_sys_init(void)
if (!o2cb_kset)
return -ENOMEM;
/*
* Create this symlink for backwards compatibility with old
* versions of ocfs2-tools which look for things in /sys/o2cb.
*/
ret = sysfs_create_link(NULL, &o2cb_kset->kobj, "o2cb");
if (ret)
goto error;
ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group);
if (ret)
goto error;
......
......@@ -144,6 +144,7 @@ struct dlm_ctxt
wait_queue_head_t dlm_join_events;
unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
unsigned long exit_domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
struct dlm_recovery_ctxt reco;
spinlock_t master_lock;
......@@ -401,6 +402,18 @@ static inline int dlm_lvb_is_empty(char *lvb)
return 1;
}
/* Map a lock resource list index to a printable name for log messages. */
static inline char *dlm_list_in_text(enum dlm_lockres_list idx)
{
	switch (idx) {
	case DLM_GRANTED_LIST:
		return "granted";
	case DLM_CONVERTING_LIST:
		return "converting";
	case DLM_BLOCKED_LIST:
		return "blocked";
	default:
		return "unknown";
	}
}
static inline struct list_head *
dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx)
{
......@@ -448,6 +461,7 @@ enum {
DLM_FINALIZE_RECO_MSG = 518,
DLM_QUERY_REGION = 519,
DLM_QUERY_NODEINFO = 520,
DLM_BEGIN_EXIT_DOMAIN_MSG = 521,
};
struct dlm_reco_node_data
......
......@@ -756,6 +756,12 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len)
buf + out, len - out);
out += snprintf(buf + out, len - out, "\n");
/* Exit Domain Map: xx xx xx */
out += snprintf(buf + out, len - out, "Exit Domain Map: ");
out += stringify_nodemap(dlm->exit_domain_map, O2NM_MAX_NODES,
buf + out, len - out);
out += snprintf(buf + out, len - out, "\n");
/* Live Map: xx xx xx */
out += snprintf(buf + out, len - out, "Live Map: ");
out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES,
......
......@@ -132,10 +132,12 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
* New in version 1.1:
* - Message DLM_QUERY_REGION added to support global heartbeat
* - Message DLM_QUERY_NODEINFO added to allow online node removes
* New in version 1.2:
* - Message DLM_BEGIN_EXIT_DOMAIN_MSG added to mark start of exit domain
*/
static const struct dlm_protocol_version dlm_protocol = {
.pv_major = 1,
.pv_minor = 1,
.pv_minor = 2,
};
#define DLM_DOMAIN_BACKOFF_MS 200
......@@ -449,14 +451,18 @@ static int dlm_migrate_all_locks(struct dlm_ctxt *dlm)
dropped = dlm_empty_lockres(dlm, res);
spin_lock(&res->spinlock);
__dlm_lockres_calc_usage(dlm, res);
iter = res->hash_node.next;
if (dropped)
__dlm_lockres_calc_usage(dlm, res);
else
iter = res->hash_node.next;
spin_unlock(&res->spinlock);
dlm_lockres_put(res);
if (dropped)
if (dropped) {
cond_resched_lock(&dlm->spinlock);
goto redo_bucket;
}
}
cond_resched_lock(&dlm->spinlock);
num += n;
......@@ -486,6 +492,28 @@ static int dlm_no_joining_node(struct dlm_ctxt *dlm)
return ret;
}
/*
 * Handler for DLM_BEGIN_EXIT_DOMAIN_MSG: record the sending node in
 * dlm->exit_domain_map.  Always returns 0, also when the domain could
 * not be grabbed.
 */
static int dlm_begin_exit_domain_handler(struct o2net_msg *msg, u32 len,
					 void *data, void **ret_data)
{
	struct dlm_ctxt *dlm = data;
	struct dlm_exit_domain *exit_msg;
	unsigned int node;

	if (dlm_grab(dlm)) {
		exit_msg = (struct dlm_exit_domain *) msg->buf;
		node = exit_msg->node_idx;

		mlog(0, "%s: Node %u sent a begin exit domain message\n", dlm->name, node);

		/* The map is updated under dlm->spinlock. */
		spin_lock(&dlm->spinlock);
		set_bit(node, dlm->exit_domain_map);
		spin_unlock(&dlm->spinlock);

		dlm_put(dlm);
	}

	return 0;
}
static void dlm_mark_domain_leaving(struct dlm_ctxt *dlm)
{
/* Yikes, a double spinlock! I need domain_lock for the dlm
......@@ -542,6 +570,7 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
spin_lock(&dlm->spinlock);
clear_bit(node, dlm->domain_map);
clear_bit(node, dlm->exit_domain_map);
__dlm_print_nodes(dlm);
/* notify anything attached to the heartbeat events */
......@@ -554,29 +583,56 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
return 0;
}
static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm,
static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm, u32 msg_type,
unsigned int node)
{
int status;
struct dlm_exit_domain leave_msg;
mlog(0, "Asking node %u if we can leave the domain %s me = %u\n",
node, dlm->name, dlm->node_num);
mlog(0, "%s: Sending domain exit message %u to node %u\n", dlm->name,
msg_type, node);
memset(&leave_msg, 0, sizeof(leave_msg));
leave_msg.node_idx = dlm->node_num;
status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key,
&leave_msg, sizeof(leave_msg), node,
NULL);
status = o2net_send_message(msg_type, dlm->key, &leave_msg,
sizeof(leave_msg), node, NULL);
if (status < 0)
mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
"node %u\n", status, DLM_EXIT_DOMAIN_MSG, dlm->key, node);
mlog(0, "status return %d from o2net_send_message\n", status);
mlog(ML_ERROR, "Error %d sending domain exit message %u "
"to node %u on domain %s\n", status, msg_type, node,
dlm->name);
return status;
}
/*
 * Send DLM_BEGIN_EXIT_DOMAIN_MSG to every other node set in
 * dlm->domain_map.  Send failures are not acted upon here.
 */
static void dlm_begin_exit_domain(struct dlm_ctxt *dlm)
{
	int node;

	/* Support for begin exit domain was added in 1.2 */
	if (dlm->dlm_locking_proto.pv_major == 1 &&
	    dlm->dlm_locking_proto.pv_minor < 2)
		return;

	/*
	 * Unlike DLM_EXIT_DOMAIN_MSG, DLM_BEGIN_EXIT_DOMAIN_MSG is purely
	 * informational. Meaning if a node does not receive the message,
	 * so be it.
	 */
	spin_lock(&dlm->spinlock);
	for (node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 0);
	     node < O2NM_MAX_NODES;
	     node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, node + 1)) {
		if (node == dlm->node_num)
			continue;

		/* The spinlock is released around the send and retaken after. */
		spin_unlock(&dlm->spinlock);
		dlm_send_one_domain_exit(dlm, DLM_BEGIN_EXIT_DOMAIN_MSG, node);
		spin_lock(&dlm->spinlock);
	}
	spin_unlock(&dlm->spinlock);
}
static void dlm_leave_domain(struct dlm_ctxt *dlm)
{
......@@ -602,7 +658,8 @@ static void dlm_leave_domain(struct dlm_ctxt *dlm)
clear_node = 1;
status = dlm_send_one_domain_exit(dlm, node);
status = dlm_send_one_domain_exit(dlm, DLM_EXIT_DOMAIN_MSG,
node);
if (status < 0 &&
status != -ENOPROTOOPT &&
status != -ENOTCONN) {
......@@ -677,6 +734,7 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
if (leave) {
mlog(0, "shutting down domain %s\n", dlm->name);
dlm_begin_exit_domain(dlm);
/* We changed dlm state, notify the thread */
dlm_kick_thread(dlm, NULL);
......@@ -909,6 +967,7 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
* leftover join state. */
BUG_ON(dlm->joining_node != assert->node_idx);
set_bit(assert->node_idx, dlm->domain_map);
clear_bit(assert->node_idx, dlm->exit_domain_map);
__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n",
......@@ -1793,6 +1852,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
if (status)
goto bail;
status = o2net_register_handler(DLM_BEGIN_EXIT_DOMAIN_MSG, dlm->key,
sizeof(struct dlm_exit_domain),
dlm_begin_exit_domain_handler,
dlm, NULL, &dlm->dlm_domain_handlers);
if (status)
goto bail;
bail:
if (status)
dlm_unregister_domain_handlers(dlm);
......
......@@ -2339,65 +2339,55 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
dlm_lockres_put(res);
}
/* Checks whether the lockres can be migrated. Returns 0 if yes, < 0
* if not. If 0, numlocks is set to the number of locks in the lockres.
/*
* A migrateable resource is one that:
* 1. is locally mastered, and,
* 2. has zero local locks, and,
* 3. has one or more non-local locks, or, one or more references
* Returns 1 if yes, 0 if not.
*/
static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
int *numlocks,
int *hasrefs)
struct dlm_lock_resource *res)
{
int ret;
int i;
int count = 0;
enum dlm_lockres_list idx;
int nonlocal = 0, node_ref;
struct list_head *queue;
struct dlm_lock *lock;
u64 cookie;
assert_spin_locked(&res->spinlock);
*numlocks = 0;
*hasrefs = 0;
ret = -EINVAL;
if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
mlog(0, "cannot migrate lockres with unknown owner!\n");
goto leave;
}
if (res->owner != dlm->node_num) {
mlog(0, "cannot migrate lockres this node doesn't own!\n");
goto leave;
}
if (res->owner != dlm->node_num)
return 0;
ret = 0;
queue = &res->granted;
for (i = 0; i < 3; i++) {
for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) {
queue = dlm_list_idx_to_ptr(res, idx);
list_for_each_entry(lock, queue, list) {
++count;
if (lock->ml.node == dlm->node_num) {
mlog(0, "found a lock owned by this node still "
"on the %s queue! will not migrate this "
"lockres\n", (i == 0 ? "granted" :
(i == 1 ? "converting" :
"blocked")));
ret = -ENOTEMPTY;
goto leave;
if (lock->ml.node != dlm->node_num) {
nonlocal++;
continue;
}
cookie = be64_to_cpu(lock->ml.cookie);
mlog(0, "%s: Not migrateable res %.*s, lock %u:%llu on "
"%s list\n", dlm->name, res->lockname.len,
res->lockname.name,
dlm_get_lock_cookie_node(cookie),
dlm_get_lock_cookie_seq(cookie),
dlm_list_in_text(idx));
return 0;
}
queue++;
}
*numlocks = count;
count = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
if (count < O2NM_MAX_NODES)
*hasrefs = 1;
if (!nonlocal) {
node_ref = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
if (node_ref >= O2NM_MAX_NODES)
return 0;
}
mlog(0, "%s: res %.*s, Migrateable, locks %d, refs %d\n", dlm->name,
res->lockname.len, res->lockname.name, *numlocks, *hasrefs);
mlog(0, "%s: res %.*s, Migrateable\n", dlm->name, res->lockname.len,
res->lockname.name);
leave:
return ret;
return 1;
}
/*
......@@ -2406,8 +2396,7 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
u8 target)
struct dlm_lock_resource *res, u8 target)
{
struct dlm_master_list_entry *mle = NULL;
struct dlm_master_list_entry *oldmle = NULL;
......@@ -2416,37 +2405,20 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
const char *name;
unsigned int namelen;
int mle_added = 0;
int numlocks, hasrefs;
int wake = 0;
if (!dlm_grab(dlm))
return -EINVAL;
BUG_ON(target == O2NM_MAX_NODES);
name = res->lockname.name;
namelen = res->lockname.len;
mlog(0, "%s: Migrating %.*s to %u\n", dlm->name, namelen, name, target);