Commit 5c83746a authored by Christoph Hellwig's avatar Christoph Hellwig Committed by Trond Myklebust
Browse files

pnfs/blocklayout: in-kernel GETDEVICEINFO XDR parsing



This patches moves parsing of the GETDEVICEINFO XDR to kernel space, as well
as the management of complex devices.  The reason for that is we might have
multiple outstanding complex devices after a NOTIFY_DEVICEID4_CHANGE, which
device mapper or md can't handle as they claim devices exclusively.

But as is turns out simple striping / concatenation is fairly trivial to
implement anyway, so we make our life simpler by reducing the reliance
on blkmapd.  For now we still use blkmapd by feeding it synthetic SIMPLE
device XDR to translate device signatures to device numbers, but in the
long runs I have plans to eliminate it entirely.
Signed-off-by: default avatarChristoph Hellwig <hch@lst.de>
Signed-off-by: default avatarTrond Myklebust <trond.myklebust@primarydata.com>
parent 871760ce
......@@ -3,4 +3,4 @@
#
obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o
blocklayoutdriver-y += blocklayout.o extent_tree.o rpc_pipefs.o
blocklayoutdriver-y += blocklayout.o dev.o extent_tree.o rpc_pipefs.o
......@@ -114,13 +114,10 @@ bl_submit_bio(int rw, struct bio *bio)
return NULL;
}
static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
struct pnfs_block_extent *be,
void (*end_io)(struct bio *, int err),
struct parallel_io *par)
static struct bio *
bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector,
void (*end_io)(struct bio *, int err), struct parallel_io *par)
{
struct pnfs_block_dev *dev =
container_of(be->be_device, struct pnfs_block_dev, d_node);
struct bio *bio;
npg = min(npg, BIO_MAX_PAGES);
......@@ -131,32 +128,55 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
}
if (bio) {
bio->bi_iter.bi_sector = isect - be->be_f_offset +
be->be_v_offset;
bio->bi_bdev = dev->d_bdev;
bio->bi_iter.bi_sector = disk_sector;
bio->bi_bdev = bdev;
bio->bi_end_io = end_io;
bio->bi_private = par;
}
return bio;
}
static struct bio *do_add_page_to_bio(struct bio *bio, int npg, int rw,
sector_t isect, struct page *page,
struct pnfs_block_extent *be,
void (*end_io)(struct bio *, int err),
struct parallel_io *par,
unsigned int offset, int len)
static struct bio *
do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect,
struct page *page, struct pnfs_block_dev_map *map,
struct pnfs_block_extent *be,
void (*end_io)(struct bio *, int err),
struct parallel_io *par, unsigned int offset, int *len)
{
isect = isect + (offset >> SECTOR_SHIFT);
struct pnfs_block_dev *dev =
container_of(be->be_device, struct pnfs_block_dev, node);
u64 disk_addr, end;
dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__,
npg, rw, (unsigned long long)isect, offset, len);
npg, rw, (unsigned long long)isect, offset, *len);
/* translate to device offset */
isect += be->be_v_offset;
isect -= be->be_f_offset;
/* translate to physical disk offset */
disk_addr = (u64)isect << SECTOR_SHIFT;
if (disk_addr < map->start || disk_addr >= map->start + map->len) {
if (!dev->map(dev, disk_addr, map))
return ERR_PTR(-EIO);
bio = bl_submit_bio(rw, bio);
}
disk_addr += map->disk_offset;
disk_addr -= map->start;
/* limit length to what the device mapping allows */
end = disk_addr + *len;
if (end >= map->start + map->len)
*len = map->start + map->len - disk_addr;
retry:
if (!bio) {
bio = bl_alloc_init_bio(npg, isect, be, end_io, par);
bio = bl_alloc_init_bio(npg, map->bdev,
disk_addr >> SECTOR_SHIFT, end_io, par);
if (!bio)
return ERR_PTR(-ENOMEM);
}
if (bio_add_page(bio, page, len, offset) < len) {
if (bio_add_page(bio, page, *len, offset) < *len) {
bio = bl_submit_bio(rw, bio);
goto retry;
}
......@@ -203,6 +223,7 @@ static enum pnfs_try_status
bl_read_pagelist(struct nfs_pgio_header *header)
{
struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 };
struct bio *bio = NULL;
struct pnfs_block_extent be;
sector_t isect, extent_length = 0;
......@@ -248,28 +269,29 @@ bl_read_pagelist(struct nfs_pgio_header *header)
pg_len = PAGE_CACHE_SIZE - pg_offset;
else
pg_len = bytes_left;
f_offset += pg_len;
bytes_left -= pg_len;
isect += (pg_offset >> SECTOR_SHIFT);
extent_length -= (pg_offset >> SECTOR_SHIFT);
} else {
BUG_ON(pg_offset != 0);
pg_len = PAGE_CACHE_SIZE;
}
isect += (pg_offset >> SECTOR_SHIFT);
extent_length -= (pg_offset >> SECTOR_SHIFT);
if (is_hole(&be)) {
bio = bl_submit_bio(READ, bio);
/* Fill hole w/ zeroes w/o accessing device */
dprintk("%s Zeroing page for hole\n", __func__);
zero_user_segment(pages[i], pg_offset, pg_len);
/* invalidate map */
map.start = NFS4_MAX_UINT64;
} else {
bio = do_add_page_to_bio(bio,
header->page_array.npages - i,
READ,
isect, pages[i], &be,
isect, pages[i], &map, &be,
bl_end_io_read, par,
pg_offset, pg_len);
pg_offset, &pg_len);
if (IS_ERR(bio)) {
header->pnfs_error = PTR_ERR(bio);
bio = NULL;
......@@ -278,6 +300,8 @@ bl_read_pagelist(struct nfs_pgio_header *header)
}
isect += (pg_len >> SECTOR_SHIFT);
extent_length -= (pg_len >> SECTOR_SHIFT);
f_offset += pg_len;
bytes_left -= pg_len;
}
if ((isect << SECTOR_SHIFT) >= header->inode->i_size) {
header->res.eof = 1;
......@@ -346,6 +370,7 @@ static enum pnfs_try_status
bl_write_pagelist(struct nfs_pgio_header *header, int sync)
{
struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 };
struct bio *bio = NULL;
struct pnfs_block_extent be;
sector_t isect, extent_length = 0;
......@@ -354,6 +379,7 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync)
size_t count = header->args.count;
struct page **pages = header->args.pages;
int pg_index = pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
unsigned int pg_len;
struct blk_plug plug;
int i;
......@@ -387,19 +413,21 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync)
extent_length = be.be_length - (isect - be.be_f_offset);
}
pg_len = PAGE_CACHE_SIZE;
bio = do_add_page_to_bio(bio, header->page_array.npages - i,
WRITE, isect, pages[i], &be,
WRITE, isect, pages[i], &map, &be,
bl_end_io_write, par,
0, PAGE_CACHE_SIZE);
0, &pg_len);
if (IS_ERR(bio)) {
header->pnfs_error = PTR_ERR(bio);
bio = NULL;
goto out;
}
offset += PAGE_CACHE_SIZE;
count -= PAGE_CACHE_SIZE;
isect += PAGE_CACHE_SECTORS;
extent_length -= PAGE_CACHE_SECTORS;
offset += pg_len;
count -= pg_len;
isect += (pg_len >> SECTOR_SHIFT);
extent_length -= (pg_len >> SECTOR_SHIFT);
}
header->res.count = header->args.count;
......
......@@ -44,9 +44,77 @@
#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT)
#define SECTOR_SIZE (1 << SECTOR_SHIFT)
struct pnfs_block_dev;
enum pnfs_block_volume_type {
PNFS_BLOCK_VOLUME_SIMPLE = 0,
PNFS_BLOCK_VOLUME_SLICE = 1,
PNFS_BLOCK_VOLUME_CONCAT = 2,
PNFS_BLOCK_VOLUME_STRIPE = 3,
};
#define PNFS_BLOCK_MAX_UUIDS 4
#define PNFS_BLOCK_MAX_DEVICES 64
/*
* Random upper cap for the uuid length to avoid unbounded allocation.
* Not actually limited by the protocol.
*/
#define PNFS_BLOCK_UUID_LEN 128
struct pnfs_block_volume {
enum pnfs_block_volume_type type;
union {
struct {
int len;
int nr_sigs;
struct {
u64 offset;
u32 sig_len;
u8 sig[PNFS_BLOCK_UUID_LEN];
} sigs[PNFS_BLOCK_MAX_UUIDS];
} simple;
struct {
u64 start;
u64 len;
u32 volume;
} slice;
struct {
u32 volumes_count;
u32 volumes[PNFS_BLOCK_MAX_DEVICES];
} concat;
struct {
u64 chunk_size;
u32 volumes_count;
u32 volumes[PNFS_BLOCK_MAX_DEVICES];
} stripe;
};
};
struct pnfs_block_dev_map {
sector_t start;
sector_t len;
sector_t disk_offset;
struct block_device *bdev;
};
struct pnfs_block_dev {
struct nfs4_deviceid_node d_node;
struct block_device *d_bdev;
struct nfs4_deviceid_node node;
u64 start;
u64 len;
u32 nr_children;
struct pnfs_block_dev *children;
u64 chunk_size;
struct block_device *bdev;
u64 disk_offset;
bool (*map)(struct pnfs_block_dev *dev, u64 offset,
struct pnfs_block_dev_map *map);
};
enum exstate4 {
......@@ -110,6 +178,11 @@ struct bl_msg_hdr {
#define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */
#define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */
/* dev.c */
struct nfs4_deviceid_node *bl_alloc_deviceid_node(struct nfs_server *server,
struct pnfs_device *pdev, gfp_t gfp_mask);
void bl_free_deviceid_node(struct nfs4_deviceid_node *d);
/* extent_tree.c */
int ext_tree_insert(struct pnfs_block_layout *bl,
struct pnfs_block_extent *new);
......@@ -123,10 +196,8 @@ int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg);
void ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status);
/* rpc_pipefs.c */
struct nfs4_deviceid_node *bl_alloc_deviceid_node(struct nfs_server *server,
struct pnfs_device *pdev, gfp_t gfp_mask);
void bl_free_deviceid_node(struct nfs4_deviceid_node *d);
dev_t bl_resolve_deviceid(struct nfs_server *server,
struct pnfs_block_volume *b, gfp_t gfp_mask);
int __init bl_init_pipefs(void);
void __exit bl_cleanup_pipefs(void);
......
/*
* Copyright (c) 2014 Christoph Hellwig.
*/
#include <linux/sunrpc/svc.h>
#include <linux/blkdev.h>
#include <linux/nfs4.h>
#include <linux/nfs_fs.h>
#include <linux/nfs_xdr.h>
#include "blocklayout.h"
#define NFSDBG_FACILITY NFSDBG_PNFS_LD
static void
bl_free_device(struct pnfs_block_dev *dev)
{
if (dev->nr_children) {
int i;
for (i = 0; i < dev->nr_children; i++)
bl_free_device(&dev->children[i]);
kfree(dev->children);
} else {
if (dev->bdev)
blkdev_put(dev->bdev, FMODE_READ);
}
}
void
bl_free_deviceid_node(struct nfs4_deviceid_node *d)
{
struct pnfs_block_dev *dev =
container_of(d, struct pnfs_block_dev, node);
bl_free_device(dev);
kfree(dev);
}
static int
nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
{
__be32 *p;
int i;
p = xdr_inline_decode(xdr, 4);
if (!p)
return -EIO;
b->type = be32_to_cpup(p++);
switch (b->type) {
case PNFS_BLOCK_VOLUME_SIMPLE:
p = xdr_inline_decode(xdr, 4);
if (!p)
return -EIO;
b->simple.nr_sigs = be32_to_cpup(p++);
if (!b->simple.nr_sigs) {
dprintk("no signature\n");
return -EIO;
}
b->simple.len = 4 + 4;
for (i = 0; i < b->simple.nr_sigs; i++) {
p = xdr_inline_decode(xdr, 8 + 4);
if (!p)
return -EIO;
p = xdr_decode_hyper(p, &b->simple.sigs[i].offset);
b->simple.sigs[i].sig_len = be32_to_cpup(p++);
p = xdr_inline_decode(xdr, b->simple.sigs[i].sig_len);
if (!p)
return -EIO;
memcpy(&b->simple.sigs[i].sig, p,
b->simple.sigs[i].sig_len);
b->simple.len += 8 + 4 + b->simple.sigs[i].sig_len;
}
break;
case PNFS_BLOCK_VOLUME_SLICE:
p = xdr_inline_decode(xdr, 8 + 8 + 4);
if (!p)
return -EIO;
p = xdr_decode_hyper(p, &b->slice.start);
p = xdr_decode_hyper(p, &b->slice.len);
b->slice.volume = be32_to_cpup(p++);
break;
case PNFS_BLOCK_VOLUME_CONCAT:
p = xdr_inline_decode(xdr, 4);
if (!p)
return -EIO;
b->concat.volumes_count = be32_to_cpup(p++);
p = xdr_inline_decode(xdr, b->concat.volumes_count * 4);
if (!p)
return -EIO;
for (i = 0; i < b->concat.volumes_count; i++)
b->concat.volumes[i] = be32_to_cpup(p++);
break;
case PNFS_BLOCK_VOLUME_STRIPE:
p = xdr_inline_decode(xdr, 8 + 4);
if (!p)
return -EIO;
p = xdr_decode_hyper(p, &b->stripe.chunk_size);
b->stripe.volumes_count = be32_to_cpup(p++);
p = xdr_inline_decode(xdr, b->stripe.volumes_count * 4);
if (!p)
return -EIO;
for (i = 0; i < b->stripe.volumes_count; i++)
b->stripe.volumes[i] = be32_to_cpup(p++);
break;
default:
dprintk("unknown volume type!\n");
return -EIO;
}
return 0;
}
static bool bl_map_simple(struct pnfs_block_dev *dev, u64 offset,
struct pnfs_block_dev_map *map)
{
map->start = dev->start;
map->len = dev->len;
map->disk_offset = dev->disk_offset;
map->bdev = dev->bdev;
return true;
}
static bool bl_map_concat(struct pnfs_block_dev *dev, u64 offset,
struct pnfs_block_dev_map *map)
{
int i;
for (i = 0; i < dev->nr_children; i++) {
struct pnfs_block_dev *child = &dev->children[i];
if (child->start > offset ||
child->start + child->len <= offset)
continue;
child->map(child, offset - child->start, map);
return true;
}
dprintk("%s: ran off loop!\n", __func__);
return false;
}
static bool bl_map_stripe(struct pnfs_block_dev *dev, u64 offset,
struct pnfs_block_dev_map *map)
{
struct pnfs_block_dev *child;
u64 chunk = (offset / dev->chunk_size);
int chunk_idx = chunk % dev->nr_children;
u64 disk_offset;
if (chunk_idx > dev->nr_children) {
dprintk("%s: invalid chunk idx %d (%lld/%lld)\n",
__func__, chunk_idx, offset, dev->chunk_size);
/* error, should not happen */
return false;
}
/* truncate offset to the beginning of the stripe */
offset = chunk * dev->chunk_size;
/* disk offset of the stripe */
disk_offset = offset / dev->nr_children;
child = &dev->children[chunk_idx];
child->map(child, disk_offset, map);
map->start += offset;
map->disk_offset += disk_offset;
map->len = dev->chunk_size;
return true;
}
static int
bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask);
static int
bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
{
struct pnfs_block_volume *v = &volumes[idx];
dev_t dev;
dev = bl_resolve_deviceid(server, v, gfp_mask);
if (!dev)
return -EIO;
d->bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL);
if (IS_ERR(d->bdev)) {
printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n",
MAJOR(dev), MINOR(dev), PTR_ERR(d->bdev));
return PTR_ERR(d->bdev);
}
d->len = i_size_read(d->bdev->bd_inode);
d->map = bl_map_simple;
printk(KERN_INFO "pNFS: using block device %s\n",
d->bdev->bd_disk->disk_name);
return 0;
}
static int
bl_parse_slice(struct nfs_server *server, struct pnfs_block_dev *d,
struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
{
struct pnfs_block_volume *v = &volumes[idx];
int ret;
ret = bl_parse_deviceid(server, d, volumes, v->slice.volume, gfp_mask);
if (ret)
return ret;
d->disk_offset = v->slice.start;
d->len = v->slice.len;
return 0;
}
static int
bl_parse_concat(struct nfs_server *server, struct pnfs_block_dev *d,
struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
{
struct pnfs_block_volume *v = &volumes[idx];
u64 len = 0;
int ret, i;
d->children = kcalloc(v->concat.volumes_count,
sizeof(struct pnfs_block_dev), GFP_KERNEL);
if (!d->children)
return -ENOMEM;
for (i = 0; i < v->concat.volumes_count; i++) {
ret = bl_parse_deviceid(server, &d->children[i],
volumes, v->concat.volumes[i], gfp_mask);
if (ret)
return ret;
d->nr_children++;
d->children[i].start += len;
len += d->children[i].len;
}
d->len = len;
d->map = bl_map_concat;
return 0;
}
static int
bl_parse_stripe(struct nfs_server *server, struct pnfs_block_dev *d,
struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
{
struct pnfs_block_volume *v = &volumes[idx];
u64 len = 0;
int ret, i;
d->children = kcalloc(v->stripe.volumes_count,
sizeof(struct pnfs_block_dev), GFP_KERNEL);
if (!d->children)
return -ENOMEM;
for (i = 0; i < v->stripe.volumes_count; i++) {
ret = bl_parse_deviceid(server, &d->children[i],
volumes, v->stripe.volumes[i], gfp_mask);
if (ret)
return ret;
d->nr_children++;
len += d->children[i].len;
}
d->len = len;
d->chunk_size = v->stripe.chunk_size;
d->map = bl_map_stripe;
return 0;
}
static int
bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
{
switch (volumes[idx].type) {
case PNFS_BLOCK_VOLUME_SIMPLE:
return bl_parse_simple(server, d, volumes, idx, gfp_mask);
case PNFS_BLOCK_VOLUME_SLICE:
return bl_parse_slice(server, d, volumes, idx, gfp_mask);
case PNFS_BLOCK_VOLUME_CONCAT:
return bl_parse_concat(server, d, volumes, idx, gfp_mask);
case PNFS_BLOCK_VOLUME_STRIPE:
return bl_parse_stripe(server, d, volumes, idx, gfp_mask);
default:
dprintk("unsupported volume type: %d\n", volumes[idx].type);
return -EIO;
}
}
struct nfs4_deviceid_node *
bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
gfp_t gfp_mask)
{
struct nfs4_deviceid_node *node = NULL;
struct pnfs_block_volume *volumes;
struct pnfs_block_dev *top;
struct xdr_stream xdr;