Commit 15da849c authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'for-5.5/dm-fixes' of...

Merge tag 'for-5.5/dm-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm

Pull device mapper fixes from Mike Snitzer:

 - Fix DM multipath by restoring full path selector functionality for
   bio-based configurations that don't haave a SCSI device handler.

 - Fix dm-btree removal to ensure non-root btree nodes have at least
   (max_entries / 3) entries. This resolves userspace thin_check
   utility's report of "too few entries in btree_node".

 - Fix both the DM thin-provisioning and dm-clone targets to properly
   flush the data device prior to metadata commit. This resolves the
   potential for inconsistency across a power loss event when the data
   device has a volatile writeback cache.

 - Small documentation fixes to dm-clone and dm-integrity.

* tag 'for-5.5/dm-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm:
  docs: dm-integrity: remove reference to ARC4
  dm thin: Flush data device before committing metadata
  dm thin metadata: Add support for a pre-commit callback
  dm clone: Flush destination device before committing metadata
  dm clone metadata: Use a two phase commit
  dm clone metadata: Track exact changes per transaction
  dm btree: increase rebalance threshold in __rebalance2()
  dm: add dm-clone to the documentation index
  dm mpath: remove harmful bio-based optimization
parents 22ff311a 7fc979f8
......@@ -144,7 +144,7 @@ journal_crypt:algorithm(:key) (the key is optional)
Encrypt the journal using given algorithm to make sure that the
attacker can't read the journal. You can use a block cipher here
(such as "cbc(aes)") or a stream cipher (for example "chacha20",
"salsa20", "ctr(aes)" or "ecb(arc4)").
"salsa20" or "ctr(aes)").
The journal contains history of last writes to the block device,
an attacker reading the journal could see the last sector nubmers
......
......@@ -8,6 +8,7 @@ Device Mapper
cache-policies
cache
delay
dm-clone
dm-crypt
dm-dust
dm-flakey
......
......@@ -67,23 +67,34 @@ struct superblock_disk {
* To save constantly doing look ups on disk we keep an in core copy of the
* on-disk bitmap, the region_map.
*
* To further reduce metadata I/O overhead we use a second bitmap, the dmap
* (dirty bitmap), which tracks the dirty words, i.e. longs, of the region_map.
* In order to track which regions are hydrated during a metadata transaction,
* we use a second set of bitmaps, the dmap (dirty bitmap), which includes two
* bitmaps, namely dirty_regions and dirty_words. The dirty_regions bitmap
* tracks the regions that got hydrated during the current metadata
* transaction. The dirty_words bitmap tracks the dirty words, i.e. longs, of
* the dirty_regions bitmap.
*
* This allows us to precisely track the regions that were hydrated during the
* current metadata transaction and update the metadata accordingly, when we
* commit the current transaction. This is important because dm-clone should
* only commit the metadata of regions that were properly flushed to the
* destination device beforehand. Otherwise, in case of a crash, we could end
* up with a corrupted dm-clone device.
*
* When a region finishes hydrating dm-clone calls
* dm_clone_set_region_hydrated(), or for discard requests
* dm_clone_cond_set_range(), which sets the corresponding bits in region_map
* and dmap.
*
* During a metadata commit we scan the dmap for dirty region_map words (longs)
* and update accordingly the on-disk metadata. Thus, we don't have to flush to
* disk the whole region_map. We can just flush the dirty region_map words.
* During a metadata commit we scan dmap->dirty_words and dmap->dirty_regions
* and update the on-disk metadata accordingly. Thus, we don't have to flush to
* disk the whole region_map. We can just flush the dirty region_map bits.
*
* We use a dirty bitmap, which is smaller than the original region_map, to
* reduce the amount of memory accesses during a metadata commit. As dm-bitset
* accesses the on-disk bitmap in 64-bit word granularity, there is no
* significant benefit in tracking the dirty region_map bits with a smaller
* granularity.
* We use the helper dmap->dirty_words bitmap, which is smaller than the
* original region_map, to reduce the amount of memory accesses during a
* metadata commit. Moreover, as dm-bitset also accesses the on-disk bitmap in
* 64-bit word granularity, the dirty_words bitmap helps us avoid useless disk
* accesses.
*
* We could update directly the on-disk bitmap, when dm-clone calls either
* dm_clone_set_region_hydrated() or dm_clone_cond_set_range(), buts this
......@@ -92,12 +103,13 @@ struct superblock_disk {
* e.g., in a hooked overwrite bio's completion routine, and further reduce the
* I/O completion latency.
*
* We maintain two dirty bitmaps. During a metadata commit we atomically swap
* the currently used dmap with the unused one. This allows the metadata update
* functions to run concurrently with an ongoing commit.
* We maintain two dirty bitmap sets. During a metadata commit we atomically
* swap the currently used dmap with the unused one. This allows the metadata
* update functions to run concurrently with an ongoing commit.
*/
struct dirty_map {
unsigned long *dirty_words;
unsigned long *dirty_regions;
unsigned int changed;
};
......@@ -115,6 +127,9 @@ struct dm_clone_metadata {
struct dirty_map dmap[2];
struct dirty_map *current_dmap;
/* Protected by lock */
struct dirty_map *committing_dmap;
/*
* In core copy of the on-disk bitmap to save constantly doing look ups
* on disk.
......@@ -461,34 +476,53 @@ static size_t bitmap_size(unsigned long nr_bits)
return BITS_TO_LONGS(nr_bits) * sizeof(long);
}
static int dirty_map_init(struct dm_clone_metadata *cmd)
static int __dirty_map_init(struct dirty_map *dmap, unsigned long nr_words,
unsigned long nr_regions)
{
cmd->dmap[0].changed = 0;
cmd->dmap[0].dirty_words = kvzalloc(bitmap_size(cmd->nr_words), GFP_KERNEL);
dmap->changed = 0;
if (!cmd->dmap[0].dirty_words) {
DMERR("Failed to allocate dirty bitmap");
dmap->dirty_words = kvzalloc(bitmap_size(nr_words), GFP_KERNEL);
if (!dmap->dirty_words)
return -ENOMEM;
dmap->dirty_regions = kvzalloc(bitmap_size(nr_regions), GFP_KERNEL);
if (!dmap->dirty_regions) {
kvfree(dmap->dirty_words);
return -ENOMEM;
}
cmd->dmap[1].changed = 0;
cmd->dmap[1].dirty_words = kvzalloc(bitmap_size(cmd->nr_words), GFP_KERNEL);
return 0;
}
static void __dirty_map_exit(struct dirty_map *dmap)
{
kvfree(dmap->dirty_words);
kvfree(dmap->dirty_regions);
}
static int dirty_map_init(struct dm_clone_metadata *cmd)
{
if (__dirty_map_init(&cmd->dmap[0], cmd->nr_words, cmd->nr_regions)) {
DMERR("Failed to allocate dirty bitmap");
return -ENOMEM;
}
if (!cmd->dmap[1].dirty_words) {
if (__dirty_map_init(&cmd->dmap[1], cmd->nr_words, cmd->nr_regions)) {
DMERR("Failed to allocate dirty bitmap");
kvfree(cmd->dmap[0].dirty_words);
__dirty_map_exit(&cmd->dmap[0]);
return -ENOMEM;
}
cmd->current_dmap = &cmd->dmap[0];
cmd->committing_dmap = NULL;
return 0;
}
static void dirty_map_exit(struct dm_clone_metadata *cmd)
{
kvfree(cmd->dmap[0].dirty_words);
kvfree(cmd->dmap[1].dirty_words);
__dirty_map_exit(&cmd->dmap[0]);
__dirty_map_exit(&cmd->dmap[1]);
}
static int __load_bitset_in_core(struct dm_clone_metadata *cmd)
......@@ -633,21 +667,23 @@ unsigned long dm_clone_find_next_unhydrated_region(struct dm_clone_metadata *cmd
return find_next_zero_bit(cmd->region_map, cmd->nr_regions, start);
}
static int __update_metadata_word(struct dm_clone_metadata *cmd, unsigned long word)
static int __update_metadata_word(struct dm_clone_metadata *cmd,
unsigned long *dirty_regions,
unsigned long word)
{
int r;
unsigned long index = word * BITS_PER_LONG;
unsigned long max_index = min(cmd->nr_regions, (word + 1) * BITS_PER_LONG);
while (index < max_index) {
if (test_bit(index, cmd->region_map)) {
if (test_bit(index, dirty_regions)) {
r = dm_bitset_set_bit(&cmd->bitset_info, cmd->bitset_root,
index, &cmd->bitset_root);
if (r) {
DMERR("dm_bitset_set_bit failed");
return r;
}
__clear_bit(index, dirty_regions);
}
index++;
}
......@@ -721,7 +757,7 @@ static int __flush_dmap(struct dm_clone_metadata *cmd, struct dirty_map *dmap)
if (word == cmd->nr_words)
break;
r = __update_metadata_word(cmd, word);
r = __update_metadata_word(cmd, dmap->dirty_regions, word);
if (r)
return r;
......@@ -743,15 +779,17 @@ static int __flush_dmap(struct dm_clone_metadata *cmd, struct dirty_map *dmap)
return 0;
}
int dm_clone_metadata_commit(struct dm_clone_metadata *cmd)
int dm_clone_metadata_pre_commit(struct dm_clone_metadata *cmd)
{
int r = -EPERM;
int r = 0;
struct dirty_map *dmap, *next_dmap;
down_write(&cmd->lock);
if (cmd->fail_io || dm_bm_is_read_only(cmd->bm))
if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) {
r = -EPERM;
goto out;
}
/* Get current dirty bitmap */
dmap = cmd->current_dmap;
......@@ -763,7 +801,7 @@ int dm_clone_metadata_commit(struct dm_clone_metadata *cmd)
* The last commit failed, so we don't have a clean dirty-bitmap to
* use.
*/
if (WARN_ON(next_dmap->changed)) {
if (WARN_ON(next_dmap->changed || cmd->committing_dmap)) {
r = -EINVAL;
goto out;
}
......@@ -773,11 +811,33 @@ int dm_clone_metadata_commit(struct dm_clone_metadata *cmd)
cmd->current_dmap = next_dmap;
spin_unlock_irq(&cmd->bitmap_lock);
/*
* No one is accessing the old dirty bitmap anymore, so we can flush
* it.
*/
r = __flush_dmap(cmd, dmap);
/* Set old dirty bitmap as currently committing */
cmd->committing_dmap = dmap;
out:
up_write(&cmd->lock);
return r;
}
int dm_clone_metadata_commit(struct dm_clone_metadata *cmd)
{
int r = -EPERM;
down_write(&cmd->lock);
if (cmd->fail_io || dm_bm_is_read_only(cmd->bm))
goto out;
if (WARN_ON(!cmd->committing_dmap)) {
r = -EINVAL;
goto out;
}
r = __flush_dmap(cmd, cmd->committing_dmap);
if (!r) {
/* Clear committing dmap */
cmd->committing_dmap = NULL;
}
out:
up_write(&cmd->lock);
......@@ -802,6 +862,7 @@ int dm_clone_set_region_hydrated(struct dm_clone_metadata *cmd, unsigned long re
dmap = cmd->current_dmap;
__set_bit(word, dmap->dirty_words);
__set_bit(region_nr, dmap->dirty_regions);
__set_bit(region_nr, cmd->region_map);
dmap->changed = 1;
......@@ -830,6 +891,7 @@ int dm_clone_cond_set_range(struct dm_clone_metadata *cmd, unsigned long start,
if (!test_bit(region_nr, cmd->region_map)) {
word = region_nr / BITS_PER_LONG;
__set_bit(word, dmap->dirty_words);
__set_bit(region_nr, dmap->dirty_regions);
__set_bit(region_nr, cmd->region_map);
dmap->changed = 1;
}
......
......@@ -75,7 +75,23 @@ void dm_clone_metadata_close(struct dm_clone_metadata *cmd);
/*
* Commit dm-clone metadata to disk.
*
* We use a two phase commit:
*
* 1. dm_clone_metadata_pre_commit(): Prepare the current transaction for
* committing. After this is called, all subsequent metadata updates, done
* through either dm_clone_set_region_hydrated() or
* dm_clone_cond_set_range(), will be part of the **next** transaction.
*
* 2. dm_clone_metadata_commit(): Actually commit the current transaction to
* disk and start a new transaction.
*
* This allows dm-clone to flush the destination device after step (1) to
* ensure that all freshly hydrated regions, for which we are updating the
* metadata, are properly written to non-volatile storage and won't be lost in
* case of a crash.
*/
int dm_clone_metadata_pre_commit(struct dm_clone_metadata *cmd);
int dm_clone_metadata_commit(struct dm_clone_metadata *cmd);
/*
......@@ -112,6 +128,7 @@ int dm_clone_metadata_abort(struct dm_clone_metadata *cmd);
* Switches metadata to a read only mode. Once read-only mode has been entered
* the following functions will return -EPERM:
*
* dm_clone_metadata_pre_commit()
* dm_clone_metadata_commit()
* dm_clone_set_region_hydrated()
* dm_clone_cond_set_range()
......
......@@ -86,6 +86,12 @@ struct clone {
struct dm_clone_metadata *cmd;
/*
* bio used to flush the destination device, before committing the
* metadata.
*/
struct bio flush_bio;
/* Region hydration hash table */
struct hash_table_bucket *ht;
......@@ -1108,10 +1114,13 @@ static bool need_commit_due_to_time(struct clone *clone)
/*
* A non-zero return indicates read-only or fail mode.
*/
static int commit_metadata(struct clone *clone)
static int commit_metadata(struct clone *clone, bool *dest_dev_flushed)
{
int r = 0;
if (dest_dev_flushed)
*dest_dev_flushed = false;
mutex_lock(&clone->commit_lock);
if (!dm_clone_changed_this_transaction(clone->cmd))
......@@ -1122,8 +1131,26 @@ static int commit_metadata(struct clone *clone)
goto out;
}
r = dm_clone_metadata_commit(clone->cmd);
r = dm_clone_metadata_pre_commit(clone->cmd);
if (unlikely(r)) {
__metadata_operation_failed(clone, "dm_clone_metadata_pre_commit", r);
goto out;
}
bio_reset(&clone->flush_bio);
bio_set_dev(&clone->flush_bio, clone->dest_dev->bdev);
clone->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
r = submit_bio_wait(&clone->flush_bio);
if (unlikely(r)) {
__metadata_operation_failed(clone, "flush destination device", r);
goto out;
}
if (dest_dev_flushed)
*dest_dev_flushed = true;
r = dm_clone_metadata_commit(clone->cmd);
if (unlikely(r)) {
__metadata_operation_failed(clone, "dm_clone_metadata_commit", r);
goto out;
......@@ -1194,6 +1221,7 @@ static void process_deferred_bios(struct clone *clone)
static void process_deferred_flush_bios(struct clone *clone)
{
struct bio *bio;
bool dest_dev_flushed;
struct bio_list bios = BIO_EMPTY_LIST;
struct bio_list bio_completions = BIO_EMPTY_LIST;
......@@ -1213,7 +1241,7 @@ static void process_deferred_flush_bios(struct clone *clone)
!(dm_clone_changed_this_transaction(clone->cmd) && need_commit_due_to_time(clone)))
return;
if (commit_metadata(clone)) {
if (commit_metadata(clone, &dest_dev_flushed)) {
bio_list_merge(&bios, &bio_completions);
while ((bio = bio_list_pop(&bios)))
......@@ -1227,8 +1255,17 @@ static void process_deferred_flush_bios(struct clone *clone)
while ((bio = bio_list_pop(&bio_completions)))
bio_endio(bio);
while ((bio = bio_list_pop(&bios)))
generic_make_request(bio);
while ((bio = bio_list_pop(&bios))) {
if ((bio->bi_opf & REQ_PREFLUSH) && dest_dev_flushed) {
/* We just flushed the destination device as part of
* the metadata commit, so there is no reason to send
* another flush.
*/
bio_endio(bio);
} else {
generic_make_request(bio);
}
}
}
static void do_worker(struct work_struct *work)
......@@ -1400,7 +1437,7 @@ static void clone_status(struct dm_target *ti, status_type_t type,
/* Commit to ensure statistics aren't out-of-date */
if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
(void) commit_metadata(clone);
(void) commit_metadata(clone, NULL);
r = dm_clone_get_free_metadata_block_count(clone->cmd, &nr_free_metadata_blocks);
......@@ -1834,6 +1871,7 @@ static int clone_ctr(struct dm_target *ti, unsigned int argc, char **argv)
bio_list_init(&clone->deferred_flush_completions);
clone->hydration_offset = 0;
atomic_set(&clone->hydrations_in_flight, 0);
bio_init(&clone->flush_bio, NULL, 0);
clone->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0);
if (!clone->wq) {
......@@ -1907,6 +1945,7 @@ static void clone_dtr(struct dm_target *ti)
struct clone *clone = ti->private;
mutex_destroy(&clone->commit_lock);
bio_uninit(&clone->flush_bio);
for (i = 0; i < clone->nr_ctr_args; i++)
kfree(clone->ctr_args[i]);
......@@ -1961,7 +2000,7 @@ static void clone_postsuspend(struct dm_target *ti)
wait_event(clone->hydration_stopped, !atomic_read(&clone->hydrations_in_flight));
flush_workqueue(clone->wq);
(void) commit_metadata(clone);
(void) commit_metadata(clone, NULL);
}
static void clone_resume(struct dm_target *ti)
......
......@@ -599,45 +599,10 @@ static struct pgpath *__map_bio(struct multipath *m, struct bio *bio)
return pgpath;
}
static struct pgpath *__map_bio_fast(struct multipath *m, struct bio *bio)
{
struct pgpath *pgpath;
unsigned long flags;
/* Do we need to select a new pgpath? */
/*
* FIXME: currently only switching path if no path (due to failure, etc)
* - which negates the point of using a path selector
*/
pgpath = READ_ONCE(m->current_pgpath);
if (!pgpath)
pgpath = choose_pgpath(m, bio->bi_iter.bi_size);
if (!pgpath) {
if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
/* Queue for the daemon to resubmit */
spin_lock_irqsave(&m->lock, flags);
bio_list_add(&m->queued_bios, bio);
spin_unlock_irqrestore(&m->lock, flags);
queue_work(kmultipathd, &m->process_queued_bios);
return ERR_PTR(-EAGAIN);
}
return NULL;
}
return pgpath;
}
static int __multipath_map_bio(struct multipath *m, struct bio *bio,
struct dm_mpath_io *mpio)
{
struct pgpath *pgpath;
if (!m->hw_handler_name)
pgpath = __map_bio_fast(m, bio);
else
pgpath = __map_bio(m, bio);
struct pgpath *pgpath = __map_bio(m, bio);
if (IS_ERR(pgpath))
return DM_MAPIO_SUBMITTED;
......
......@@ -188,6 +188,15 @@ struct dm_pool_metadata {
unsigned long flags;
sector_t data_block_size;
/*
* Pre-commit callback.
*
* This allows the thin provisioning target to run a callback before
* the metadata are committed.
*/
dm_pool_pre_commit_fn pre_commit_fn;
void *pre_commit_context;
/*
* We reserve a section of the metadata for commit overhead.
* All reported space does *not* include this.
......@@ -826,6 +835,14 @@ static int __commit_transaction(struct dm_pool_metadata *pmd)
if (unlikely(!pmd->in_service))
return 0;
if (pmd->pre_commit_fn) {
r = pmd->pre_commit_fn(pmd->pre_commit_context);
if (r < 0) {
DMERR("pre-commit callback failed");
return r;
}
}
r = __write_changed_details(pmd);
if (r < 0)
return r;
......@@ -892,6 +909,8 @@ struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
pmd->in_service = false;
pmd->bdev = bdev;
pmd->data_block_size = data_block_size;
pmd->pre_commit_fn = NULL;
pmd->pre_commit_context = NULL;
r = __create_persistent_data_objects(pmd, format_device);
if (r) {
......@@ -2044,6 +2063,16 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
return r;
}
void dm_pool_register_pre_commit_callback(struct dm_pool_metadata *pmd,
dm_pool_pre_commit_fn fn,
void *context)
{
pmd_write_lock_in_core(pmd);
pmd->pre_commit_fn = fn;
pmd->pre_commit_context = context;
pmd_write_unlock(pmd);
}
int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd)
{
int r = -EINVAL;
......
......@@ -230,6 +230,13 @@ bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd);
*/
void dm_pool_issue_prefetches(struct dm_pool_metadata *pmd);
/* Pre-commit callback */
typedef int (*dm_pool_pre_commit_fn)(void *context);
void dm_pool_register_pre_commit_callback(struct dm_pool_metadata *pmd,
dm_pool_pre_commit_fn fn,
void *context);
/*----------------------------------------------------------------*/
#endif
......@@ -328,6 +328,7 @@ struct pool_c {
dm_block_t low_water_blocks;
struct pool_features requested_pf; /* Features requested during table load */
struct pool_features adjusted_pf; /* Features used after adjusting for constituent devices */
struct bio flush_bio;
};
/*
......@@ -2383,8 +2384,16 @@ static void process_deferred_bios(struct pool *pool)
while ((bio = bio_list_pop(&bio_completions)))
bio_endio(bio);
while ((bio = bio_list_pop(&bios)))
generic_make_request(bio);
while ((bio = bio_list_pop(&bios))) {
/*
* The data device was flushed as part of metadata commit,
* so complete redundant flushes immediately.
*/
if (bio->bi_opf & REQ_PREFLUSH)
bio_endio(bio);
else
generic_make_request(bio);
}
}
static void do_worker(struct work_struct *ws)
......@@ -3115,6 +3124,7 @@ static void pool_dtr(struct dm_target *ti)
__pool_dec(pt->pool);
dm_put_device(ti, pt->metadata_dev);
dm_put_device(ti, pt->data_dev);
bio_uninit(&pt->flush_bio);
kfree(pt);
mutex_unlock(&dm_thin_pool_table.mutex);
......@@ -3180,6 +3190,29 @@ static void metadata_low_callback(void *context)
dm_table_event(pool->ti->table);
}
/*
* We need to flush the data device **before** committing the metadata.
*
* This ensures that the data blocks of any newly inserted mappings are
* properly written to non-volatile storage and won't be lost in case of a
* crash.
*
* Failure to do so can result in data corruption in the case of internal or
* external snapshots and in the case of newly provisioned blocks, when block
* zeroing is enabled.
*/
static int metadata_pre_commit_callback(void *context)
{
struct pool_c *pt = context;
struct bio *flush_bio = &pt->flush_bio;
bio_reset(flush_bio);
bio_set_dev(flush_bio, pt->data_dev->bdev);
flush_bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
return submit_bio_wait(flush_bio);
}
static sector_t get_dev_size(struct block_device *bdev)
{
return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
......@@ -3348,6 +3381,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
pt->data_dev = data_dev;
pt->low_water_blocks = low_water_blocks;
pt->adjusted_pf = pt->requested_pf = pf;
bio_init(&pt->flush_bio, NULL, 0);
ti->num_flush_bios = 1;
/*
......@@ -3374,6 +3408,10 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
if (r)
goto out_flags_changed;
dm_pool_register_pre_commit_callback(pt->pool->pmd,
metadata_pre_commit_callback,
pt);