Commit ccd979bd authored by Mark Fasheh's avatar Mark Fasheh Committed by Joel Becker

[PATCH] OCFS2: The Second Oracle Cluster Filesystem

The OCFS2 file system module.
Signed-off-by: default avatarMark Fasheh <mark.fasheh@oracle.com>
Signed-off-by: default avatarKurt Hackel <kurt.hackel@oracle.com>
parent 8df08c89
......@@ -36,6 +36,8 @@ ntfs.txt
- info and mount options for the NTFS filesystem (Windows NT).
proc.txt
- info on Linux's /proc filesystem.
ocfs2.txt
- info and mount options for the OCFS2 clustered filesystem.
romfs.txt
- Description of the ROMFS filesystem.
smbfs.txt
......
OCFS2 filesystem
==================
OCFS2 is a general purpose extent based shared disk cluster file
system with many similarities to ext3. It supports 64 bit inode
numbers, and has automatically extending metadata groups which may
also make it attractive for non-clustered use.
You'll want to install the ocfs2-tools package in order to at least
get "mount.ocfs2" and "ocfs2_hb_ctl".
Project web page: http://oss.oracle.com/projects/ocfs2
Tools web page: http://oss.oracle.com/projects/ocfs2-tools
OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
All code copyright 2005 Oracle except when otherwise noted.
CREDITS:
Lots of code taken from ext3 and other projects.
Authors in alphabetical order:
Joel Becker <joel.becker@oracle.com>
Zach Brown <zach.brown@oracle.com>
Mark Fasheh <mark.fasheh@oracle.com>
Kurt Hackel <kurt.hackel@oracle.com>
Sunil Mushran <sunil.mushran@oracle.com>
Manish Singh <manish.singh@oracle.com>
Caveats
=======
Features which OCFS2 does not support yet:
- sparse files
- extended attributes
- shared writeable mmap
- loopback is supported, but data written will not
be cluster coherent.
- quotas
- cluster aware flock
- Directory change notification (F_NOTIFY)
- Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease)
- POSIX ACLs
- readpages / writepages (not user visible)
Mount options
=============
OCFS2 supports the following mount options:
(*) == default
barrier=1 This enables/disables barriers. barrier=0 disables it,
barrier=1 enables it.
errors=remount-ro(*) Remount the filesystem read-only on an error.
errors=panic Panic and halt the machine if an error occurs.
intr (*) Allow signals to interrupt cluster operations.
nointr Do not allow signals to interrupt cluster
operations.
......@@ -1905,6 +1905,15 @@ M: ajoshi@shell.unixbox.com
L: linux-nvidia@lists.surfsouth.com
S: Maintained
ORACLE CLUSTER FILESYSTEM 2 (OCFS2)
P: Mark Fasheh
M: mark.fasheh@oracle.com
P: Kurt Hackel
M: kurt.hackel@oracle.com
L: ocfs2-devel@oss.oracle.com
W: http://oss.oracle.com/projects/ocfs2/
S: Supported
OLYMPIC NETWORK DRIVER
P: Peter De Shrijver
M: p2@ace.ulyssis.student.kuleuven.ac.be
......
EXTRA_CFLAGS += -Ifs/ocfs2
EXTRA_CFLAGS += -DCATCH_BH_JBD_RACES
obj-$(CONFIG_OCFS2_FS) += ocfs2.o
ocfs2-objs := \
alloc.o \
aops.o \
buffer_head_io.o \
dcache.o \
dir.o \
dlmglue.o \
export.o \
extent_map.o \
file.o \
heartbeat.o \
inode.o \
journal.o \
localalloc.o \
mmap.o \
namei.o \
slot_map.o \
suballoc.o \
super.o \
symlink.o \
sysfile.o \
uptodate.o \
ver.o \
vote.o
obj-$(CONFIG_OCFS2_FS) += cluster/
obj-$(CONFIG_OCFS2_FS) += dlm/
This diff is collapsed.
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* alloc.h
*
* Function prototypes
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_ALLOC_H
#define OCFS2_ALLOC_H
struct ocfs2_alloc_context;
int ocfs2_insert_extent(struct ocfs2_super *osb,
struct ocfs2_journal_handle *handle,
struct inode *inode,
struct buffer_head *fe_bh,
u64 blkno,
u32 new_clusters,
struct ocfs2_alloc_context *meta_ac);
int ocfs2_num_free_extents(struct ocfs2_super *osb,
struct inode *inode,
struct ocfs2_dinode *fe);
/* how many new metadata chunks would an allocation need at maximum? */
static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe)
{
/*
* Rather than do all the work of determining how much we need
* (involves a ton of reads and locks), just ask for the
* maximal limit. That's a tree depth shift. So, one block for
* level of the tree (current l_tree_depth), one block for the
* new tree_depth==0 extent_block, and one block at the new
* top-of-the tree.
*/
return le16_to_cpu(fe->id2.i_list.l_tree_depth) + 2;
}
int ocfs2_truncate_log_init(struct ocfs2_super *osb);
void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb);
void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
int cancel);
int ocfs2_flush_truncate_log(struct ocfs2_super *osb);
int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
int slot_num,
struct ocfs2_dinode **tl_copy);
int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
struct ocfs2_dinode *tl_copy);
struct ocfs2_truncate_context {
struct inode *tc_ext_alloc_inode;
struct buffer_head *tc_ext_alloc_bh;
int tc_ext_alloc_locked; /* is it cluster locked? */
/* these get destroyed once it's passed to ocfs2_commit_truncate. */
struct buffer_head *tc_last_eb_bh;
};
int ocfs2_prepare_truncate(struct ocfs2_super *osb,
struct inode *inode,
struct buffer_head *fe_bh,
struct ocfs2_truncate_context **tc);
int ocfs2_commit_truncate(struct ocfs2_super *osb,
struct inode *inode,
struct buffer_head *fe_bh,
struct ocfs2_truncate_context *tc);
#endif /* OCFS2_ALLOC_H */
This diff is collapsed.
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_AOPS_H
#define OCFS2_AOPS_H
int ocfs2_prepare_write(struct file *file, struct page *page,
unsigned from, unsigned to);
struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode,
struct page *page,
unsigned from,
unsigned to);
/* all ocfs2_dio_end_io()'s fault */
#define ocfs2_iocb_is_rw_locked(iocb) \
test_bit(0, (unsigned long *)&iocb->private)
#define ocfs2_iocb_set_rw_locked(iocb) \
set_bit(0, (unsigned long *)&iocb->private)
#define ocfs2_iocb_clear_rw_locked(iocb) \
clear_bit(0, (unsigned long *)&iocb->private)
#endif /* OCFS2_FILE_H */
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* io.c
*
* Buffer cache handling
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "alloc.h"
#include "inode.h"
#include "journal.h"
#include "uptodate.h"
#include "buffer_head_io.h"
int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
struct inode *inode)
{
int ret = 0;
mlog_entry("(bh->b_blocknr = %llu, inode=%p)\n",
(unsigned long long)bh->b_blocknr, inode);
BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO);
BUG_ON(buffer_jbd(bh));
/* No need to check for a soft readonly file system here. non
* journalled writes are only ever done on system files which
* can get modified during recovery even if read-only. */
if (ocfs2_is_hard_readonly(osb)) {
ret = -EROFS;
goto out;
}
down(&OCFS2_I(inode)->ip_io_sem);
lock_buffer(bh);
set_buffer_uptodate(bh);
/* remove from dirty list before I/O. */
clear_buffer_dirty(bh);
get_bh(bh); /* for end_buffer_write_sync() */
bh->b_end_io = end_buffer_write_sync;
submit_bh(WRITE, bh);
wait_on_buffer(bh);
if (buffer_uptodate(bh)) {
ocfs2_set_buffer_uptodate(inode, bh);
} else {
/* We don't need to remove the clustered uptodate
* information for this bh as it's not marked locally
* uptodate. */
ret = -EIO;
brelse(bh);
}
up(&OCFS2_I(inode)->ip_io_sem);
out:
mlog_exit(ret);
return ret;
}
int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
struct buffer_head *bhs[], int flags,
struct inode *inode)
{
int status = 0;
struct super_block *sb;
int i, ignore_cache = 0;
struct buffer_head *bh;
mlog_entry("(block=(%"MLFu64"), nr=(%d), flags=%d, inode=%p)\n",
block, nr, flags, inode);
if (osb == NULL || osb->sb == NULL || bhs == NULL) {
status = -EINVAL;
mlog_errno(status);
goto bail;
}
if (nr < 0) {
mlog(ML_ERROR, "asked to read %d blocks!\n", nr);
status = -EINVAL;
mlog_errno(status);
goto bail;
}
if (nr == 0) {
mlog(ML_BH_IO, "No buffers will be read!\n");
status = 0;
goto bail;
}
sb = osb->sb;
if (flags & OCFS2_BH_CACHED && !inode)
flags &= ~OCFS2_BH_CACHED;
if (inode)
down(&OCFS2_I(inode)->ip_io_sem);
for (i = 0 ; i < nr ; i++) {
if (bhs[i] == NULL) {
bhs[i] = sb_getblk(sb, block++);
if (bhs[i] == NULL) {
if (inode)
up(&OCFS2_I(inode)->ip_io_sem);
status = -EIO;
mlog_errno(status);
goto bail;
}
}
bh = bhs[i];
ignore_cache = 0;
if (flags & OCFS2_BH_CACHED &&
!ocfs2_buffer_uptodate(inode, bh)) {
mlog(ML_UPTODATE,
"bh (%llu), inode %"MLFu64" not uptodate\n",
(unsigned long long)bh->b_blocknr,
OCFS2_I(inode)->ip_blkno);
ignore_cache = 1;
}
/* XXX: Can we ever get this and *not* have the cached
* flag set? */
if (buffer_jbd(bh)) {
if (!(flags & OCFS2_BH_CACHED) || ignore_cache)
mlog(ML_BH_IO, "trying to sync read a jbd "
"managed bh (blocknr = %llu)\n",
(unsigned long long)bh->b_blocknr);
continue;
}
if (!(flags & OCFS2_BH_CACHED) || ignore_cache) {
if (buffer_dirty(bh)) {
/* This should probably be a BUG, or
* at least return an error. */
mlog(ML_BH_IO, "asking me to sync read a dirty "
"buffer! (blocknr = %llu)\n",
(unsigned long long)bh->b_blocknr);
continue;
}
lock_buffer(bh);
if (buffer_jbd(bh)) {
#ifdef CATCH_BH_JBD_RACES
mlog(ML_ERROR, "block %llu had the JBD bit set "
"while I was in lock_buffer!",
(unsigned long long)bh->b_blocknr);
BUG();
#else
unlock_buffer(bh);
continue;
#endif
}
clear_buffer_uptodate(bh);
get_bh(bh); /* for end_buffer_read_sync() */
bh->b_end_io = end_buffer_read_sync;
if (flags & OCFS2_BH_READAHEAD)
submit_bh(READA, bh);
else
submit_bh(READ, bh);
continue;
}
}
status = 0;
for (i = (nr - 1); i >= 0; i--) {
bh = bhs[i];
/* We know this can't have changed as we hold the
* inode sem. Avoid doing any work on the bh if the
* journal has it. */
if (!buffer_jbd(bh))
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
/* Status won't be cleared from here on out,
* so we can safely record this and loop back
* to cleanup the other buffers. Don't need to
* remove the clustered uptodate information
* for this bh as it's not marked locally
* uptodate. */
status = -EIO;
brelse(bh);
bhs[i] = NULL;
continue;
}
if (inode)
ocfs2_set_buffer_uptodate(inode, bh);
}
if (inode)
up(&OCFS2_I(inode)->ip_io_sem);
mlog(ML_BH_IO, "block=(%"MLFu64"), nr=(%d), cached=%s\n", block, nr,
(!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes");
bail:
mlog_exit(status);
return status;
}
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* ocfs2_buffer_head.h
*
* Buffer cache handling functions defined
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_BUFFER_HEAD_IO_H
#define OCFS2_BUFFER_HEAD_IO_H
#include <linux/buffer_head.h>
void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
int uptodate);
static inline int ocfs2_read_block(struct ocfs2_super *osb,
u64 off,
struct buffer_head **bh,
int flags,
struct inode *inode);
int ocfs2_write_block(struct ocfs2_super *osb,
struct buffer_head *bh,
struct inode *inode);
int ocfs2_read_blocks(struct ocfs2_super *osb,
u64 block,
int nr,
struct buffer_head *bhs[],
int flags,
struct inode *inode);
#define OCFS2_BH_CACHED 1
#define OCFS2_BH_READAHEAD 8 /* use this to pass READA down to submit_bh */
static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off,
struct buffer_head **bh, int flags,
struct inode *inode)
{
int status = 0;
if (bh == NULL) {
printk("ocfs2: bh == NULL\n");
status = -EINVAL;
goto bail;
}
status = ocfs2_read_blocks(osb, off, 1, bh,
flags, inode);
bail:
return status;
}
#endif /* OCFS2_BUFFER_HEAD_IO_H */
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* dcache.c
*
* dentry cache handling code
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/namei.h>
#define MLOG_MASK_PREFIX ML_DCACHE
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "alloc.h"
#include "dcache.h"
#include "file.h"
#include "inode.h"
static int ocfs2_dentry_revalidate(struct dentry *dentry,
struct nameidata *nd)
{
struct inode *inode = dentry->d_inode;
int ret = 0; /* if all else fails, just return false */
struct ocfs2_super *osb;
mlog_entry("(0x%p, '%.*s')\n", dentry,
dentry->d_name.len, dentry->d_name.name);
/* Never trust a negative dentry - force a new lookup. */
if (inode == NULL) {
mlog(0, "negative dentry: %.*s\n", dentry->d_name.len,
dentry->d_name.name);
goto bail;
}
osb = OCFS2_SB(inode->i_sb);
BUG_ON(!osb);
if (inode != osb->root_inode) {
spin_lock(&OCFS2_I(inode)->ip_lock);
/* did we or someone else delete this inode? */
if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
spin_unlock(&OCFS2_I(inode)->ip_lock);
mlog(0, "inode (%"MLFu64") deleted, returning false\n",
OCFS2_I(inode)->ip_blkno);
goto bail;
}
spin_unlock(&OCFS2_I(inode)->ip_lock);
if (!inode->i_nlink) {
mlog(0, "Inode %"MLFu64" orphaned, returning false "
"dir = %d\n", OCFS2_I(inode)->ip_blkno,
S_ISDIR(inode->i_mode));
goto bail;
}
}
ret = 1;
bail:
mlog_exit(ret);
return ret;
}
struct dentry_operations ocfs2_dentry_ops = {
.d_revalidate = ocfs2_dentry_revalidate,
};
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* dcache.h
*
* Function prototypes
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_DCACHE_H
#define OCFS2_DCACHE_H
extern struct dentry_operations ocfs2_dentry_ops;
#endif /* OCFS2_DCACHE_H */
This diff is collapsed.
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* dir.h
*
* Function prototypes
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either