From 86a3d5c6889594b814d47a80e366aa4831676199 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Tue, 13 Feb 2018 14:26:41 -0600 Subject: [PATCH 01/38] block: Add .bdrv_co_block_status() callback We are gradually moving away from sector-based interfaces, towards byte-based. Now that the block layer exposes byte-based allocation, it's time to tackle the drivers. Add a new callback that can operate on boundaries as small as a single byte. Subsequent patches will then update individual drivers, then finally remove .bdrv_co_get_block_status(). The new code also passes through the 'want_zero' hint, which will allow subsequent patches to further optimize callers that only care about how much of the image is allocated (want_zero is false), rather than full details about runs of zeroes and which offsets the allocation actually maps to (want_zero is true). As part of this effort, fix another part of the documentation: the claim in commit 4c41cb4 that BDRV_BLOCK_ALLOCATED is short for 'DATA || ZERO' is a lie at the block layer (see commit e88ae2264), even though it is how the bit is computed from the driver layer. After all, there are cases where we intentionally return ZERO but not ALLOCATED at the block layer, when we know that a read sees zero because the backing file is too short. Note that the driver interface is thus slightly different from the public interface with regard to which bits will be set, and what guarantees are provided on input. We also add an assertion that any driver using the new callback will make progress (the only time pnum will be 0 is if the block layer already handled an out-of-bounds request, or if there is an error); the old driver interface did not provide this guarantee, which could lead to some infinite loops in drastic corner-case failures. 
Signed-off-by: Eric Blake Reviewed-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/io.c | 28 +++++++++++++++++++--------- include/block/block.h | 14 +++++++------- include/block/block_int.h | 20 +++++++++++++++----- 3 files changed, 41 insertions(+), 21 deletions(-) diff --git a/block/io.c b/block/io.c index 89d0745e952f..b00c7e2e2c0e 100644 --- a/block/io.c +++ b/block/io.c @@ -1899,10 +1899,10 @@ int64_t coroutine_fn bdrv_co_get_block_status_from_backing(BlockDriverState *bs, * Drivers not implementing the functionality are assumed to not support * backing files, hence all their sectors are reported as allocated. * - * If 'want_zero' is true, the caller is querying for mapping purposes, - * and the result should include BDRV_BLOCK_OFFSET_VALID and - * BDRV_BLOCK_ZERO where possible; otherwise, the result may omit those - * bits particularly if it allows for a larger value in 'pnum'. + * If 'want_zero' is true, the caller is querying for mapping + * purposes, with a focus on valid BDRV_BLOCK_OFFSET_VALID, _DATA, and + * _ZERO where possible; otherwise, the result favors larger 'pnum', + * with a focus on accurate BDRV_BLOCK_ALLOCATED. * * If 'offset' is beyond the end of the disk image the return value is * BDRV_BLOCK_EOF and 'pnum' is set to 0. 
@@ -1959,7 +1959,7 @@ static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs, /* Must be non-NULL or bdrv_getlength() would have failed */ assert(bs->drv); - if (!bs->drv->bdrv_co_get_block_status) { + if (!bs->drv->bdrv_co_get_block_status && !bs->drv->bdrv_co_block_status) { *pnum = bytes; ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; if (offset + bytes == total_size) { @@ -1976,13 +1976,14 @@ static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs, bdrv_inc_in_flight(bs); /* Round out to request_alignment boundaries */ - /* TODO: until we have a byte-based driver callback, we also have to - * round out to sectors, even if that is bigger than request_alignment */ - align = MAX(bs->bl.request_alignment, BDRV_SECTOR_SIZE); + align = bs->bl.request_alignment; + if (bs->drv->bdrv_co_get_block_status && align < BDRV_SECTOR_SIZE) { + align = BDRV_SECTOR_SIZE; + } aligned_offset = QEMU_ALIGN_DOWN(offset, align); aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset; - { + if (bs->drv->bdrv_co_get_block_status) { int count; /* sectors */ int64_t longret; @@ -2007,6 +2008,15 @@ static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs, } ret = longret & ~BDRV_BLOCK_OFFSET_MASK; *pnum = count * BDRV_SECTOR_SIZE; + } else { + ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset, + aligned_bytes, pnum, &local_map, + &local_file); + if (ret < 0) { + *pnum = 0; + goto out; + } + assert(*pnum); /* The block driver must make progress */ } /* diff --git a/include/block/block.h b/include/block/block.h index 19b3ab9cb5ee..947e8876cdd7 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -115,19 +115,19 @@ typedef struct HDGeometry { * BDRV_BLOCK_ZERO: offset reads as zero * BDRV_BLOCK_OFFSET_VALID: an associated offset exists for accessing raw data * BDRV_BLOCK_ALLOCATED: the content of the block is determined by this - * layer (short for DATA || ZERO), set by block layer - * BDRV_BLOCK_EOF: the returned pnum covers 
through end of file for this layer + * layer rather than any backing, set by block layer + * BDRV_BLOCK_EOF: the returned pnum covers through end of file for this + * layer, set by block layer * * Internal flag: * BDRV_BLOCK_RAW: for use by passthrough drivers, such as raw, to request * that the block layer recompute the answer from the returned * BDS; must be accompanied by just BDRV_BLOCK_OFFSET_VALID. * - * If BDRV_BLOCK_OFFSET_VALID is set, bits 9-62 (BDRV_BLOCK_OFFSET_MASK) of - * the return value (old interface) or the entire map parameter (new - * interface) represent the offset in the returned BDS that is allocated for - * the corresponding raw data. However, whether that offset actually - * contains data also depends on BDRV_BLOCK_DATA, as follows: + * If BDRV_BLOCK_OFFSET_VALID is set, the map parameter represents the + * host offset within the returned BDS that is allocated for the + * corresponding raw guest data. However, whether that offset + * actually contains data also depends on BDRV_BLOCK_DATA, as follows: * * DATA ZERO OFFSET_VALID * t t t sectors read as zero, returned file is zero at offset diff --git a/include/block/block_int.h b/include/block/block_int.h index 5ea63f8fa8ad..c93722b43a4a 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -202,15 +202,25 @@ struct BlockDriver { /* * Building block for bdrv_block_status[_above] and * bdrv_is_allocated[_above]. The driver should answer only - * according to the current layer, and should not set - * BDRV_BLOCK_ALLOCATED, but may set BDRV_BLOCK_RAW. See block.h - * for the meaning of _DATA, _ZERO, and _OFFSET_VALID. The block - * layer guarantees input aligned to request_alignment, as well as - * non-NULL pnum and file. + * according to the current layer, and should only need to set + * BDRV_BLOCK_DATA, BDRV_BLOCK_ZERO, BDRV_BLOCK_OFFSET_VALID, + * and/or BDRV_BLOCK_RAW; if the current layer defers to a backing + * layer, the result should be 0 (and not BDRV_BLOCK_ZERO). 
See + * block.h for the overall meaning of the bits. As a hint, the + * flag want_zero is true if the caller cares more about precise + * mappings (favor accurate _OFFSET_VALID/_ZERO) or false for + * overall allocation (favor larger *pnum, perhaps by reporting + * _DATA instead of _ZERO). The block layer guarantees input + * clamped to bdrv_getlength() and aligned to request_alignment, + * as well as non-NULL pnum, map, and file; in turn, the driver + * must return an error or set pnum to an aligned non-zero value. */ int64_t coroutine_fn (*bdrv_co_get_block_status)(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file); + int coroutine_fn (*bdrv_co_block_status)(BlockDriverState *bs, + bool want_zero, int64_t offset, int64_t bytes, int64_t *pnum, + int64_t *map, BlockDriverState **file); /* * Invalidate any cached meta-data. From e3efee828bc76e9780143f246fb0399eedd80c5e Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Tue, 13 Feb 2018 14:26:42 -0600 Subject: [PATCH 02/38] nvme: Drop pointless .bdrv_co_get_block_status() Commit bdd6a90 has a bug: drivers should never directly set BDRV_BLOCK_ALLOCATED, but only io.c should do that (as needed). Instead, a driver should report BDRV_BLOCK_DATA if it knows that data comes from this BDS. But let's look at the bigger picture: semantically, the nvme driver is similar to the nbd, null, and raw drivers (no backing file, all data comes from this BDS). But while two of those other drivers have to supply the callback (null because it can special-case BDRV_BLOCK_ZERO, raw because it can special-case a different offset), in this case the block layer defaults are good enough without the callback at all (similar to nbd). 
So, fix the bug by deletion ;) Signed-off-by: Eric Blake Signed-off-by: Kevin Wolf --- block/nvme.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/block/nvme.c b/block/nvme.c index 75078022f6e3..8bca57aae695 100644 --- a/block/nvme.c +++ b/block/nvme.c @@ -1072,18 +1072,6 @@ static int nvme_reopen_prepare(BDRVReopenState *reopen_state, return 0; } -static int64_t coroutine_fn nvme_co_get_block_status(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, int *pnum, - BlockDriverState **file) -{ - *pnum = nb_sectors; - *file = bs; - - return BDRV_BLOCK_ALLOCATED | BDRV_BLOCK_OFFSET_VALID | - (sector_num << BDRV_SECTOR_BITS); -} - static void nvme_refresh_filename(BlockDriverState *bs, QDict *opts) { QINCREF(opts); @@ -1183,8 +1171,6 @@ static BlockDriver bdrv_nvme = { .bdrv_co_flush_to_disk = nvme_co_flush, .bdrv_reopen_prepare = nvme_reopen_prepare, - .bdrv_co_get_block_status = nvme_co_get_block_status, - .bdrv_refresh_filename = nvme_refresh_filename, .bdrv_refresh_limits = nvme_refresh_limits, From 3e4d0e72b77b70578d5530af588d6f0484e18325 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Tue, 13 Feb 2018 14:26:43 -0600 Subject: [PATCH 03/38] block: Switch passthrough drivers to .bdrv_co_block_status() We are gradually moving away from sector-based interfaces, towards byte-based. Update the generic helpers, and all passthrough clients (blkdebug, commit, mirror, throttle) accordingly. 
Signed-off-by: Eric Blake Reviewed-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/blkdebug.c | 20 +++++++++++--------- block/commit.c | 2 +- block/io.c | 36 ++++++++++++++++++++---------------- block/mirror.c | 2 +- block/throttle.c | 2 +- include/block/block_int.h | 28 ++++++++++++++++------------ 6 files changed, 50 insertions(+), 40 deletions(-) diff --git a/block/blkdebug.c b/block/blkdebug.c index d83f23febd7f..589712475acf 100644 --- a/block/blkdebug.c +++ b/block/blkdebug.c @@ -627,15 +627,17 @@ static int coroutine_fn blkdebug_co_pdiscard(BlockDriverState *bs, return bdrv_co_pdiscard(bs->file->bs, offset, bytes); } -static int64_t coroutine_fn blkdebug_co_get_block_status( - BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum, - BlockDriverState **file) +static int coroutine_fn blkdebug_co_block_status(BlockDriverState *bs, + bool want_zero, + int64_t offset, + int64_t bytes, + int64_t *pnum, + int64_t *map, + BlockDriverState **file) { - assert(QEMU_IS_ALIGNED(sector_num | nb_sectors, - DIV_ROUND_UP(bs->bl.request_alignment, - BDRV_SECTOR_SIZE))); - return bdrv_co_get_block_status_from_file(bs, sector_num, nb_sectors, - pnum, file); + assert(QEMU_IS_ALIGNED(offset | bytes, bs->bl.request_alignment)); + return bdrv_co_block_status_from_file(bs, want_zero, offset, bytes, + pnum, map, file); } static void blkdebug_close(BlockDriverState *bs) @@ -907,7 +909,7 @@ static BlockDriver bdrv_blkdebug = { .bdrv_co_flush_to_disk = blkdebug_co_flush, .bdrv_co_pwrite_zeroes = blkdebug_co_pwrite_zeroes, .bdrv_co_pdiscard = blkdebug_co_pdiscard, - .bdrv_co_get_block_status = blkdebug_co_get_block_status, + .bdrv_co_block_status = blkdebug_co_block_status, .bdrv_debug_event = blkdebug_debug_event, .bdrv_debug_breakpoint = blkdebug_debug_breakpoint, diff --git a/block/commit.c b/block/commit.c index bb6c904704d3..1943c9c3e166 100644 --- a/block/commit.c +++ b/block/commit.c @@ -265,7 +265,7 @@ static void 
bdrv_commit_top_child_perm(BlockDriverState *bs, BdrvChild *c, static BlockDriver bdrv_commit_top = { .format_name = "commit_top", .bdrv_co_preadv = bdrv_commit_top_preadv, - .bdrv_co_get_block_status = bdrv_co_get_block_status_from_backing, + .bdrv_co_block_status = bdrv_co_block_status_from_backing, .bdrv_refresh_filename = bdrv_commit_top_refresh_filename, .bdrv_close = bdrv_commit_top_close, .bdrv_child_perm = bdrv_commit_top_child_perm, diff --git a/block/io.c b/block/io.c index b00c7e2e2c0e..5bae79f282e5 100644 --- a/block/io.c +++ b/block/io.c @@ -1868,30 +1868,34 @@ typedef struct BdrvCoBlockStatusData { bool done; } BdrvCoBlockStatusData; -int64_t coroutine_fn bdrv_co_get_block_status_from_file(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, - int *pnum, - BlockDriverState **file) +int coroutine_fn bdrv_co_block_status_from_file(BlockDriverState *bs, + bool want_zero, + int64_t offset, + int64_t bytes, + int64_t *pnum, + int64_t *map, + BlockDriverState **file) { assert(bs->file && bs->file->bs); - *pnum = nb_sectors; + *pnum = bytes; + *map = offset; *file = bs->file->bs; - return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID | - (sector_num << BDRV_SECTOR_BITS); + return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID; } -int64_t coroutine_fn bdrv_co_get_block_status_from_backing(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, - int *pnum, - BlockDriverState **file) +int coroutine_fn bdrv_co_block_status_from_backing(BlockDriverState *bs, + bool want_zero, + int64_t offset, + int64_t bytes, + int64_t *pnum, + int64_t *map, + BlockDriverState **file) { assert(bs->backing && bs->backing->bs); - *pnum = nb_sectors; + *pnum = bytes; + *map = offset; *file = bs->backing->bs; - return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID | - (sector_num << BDRV_SECTOR_BITS); + return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID; } /* diff --git a/block/mirror.c b/block/mirror.c index c9badc1203b5..f5bf620942f5 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ 
-1094,7 +1094,7 @@ static BlockDriver bdrv_mirror_top = { .bdrv_co_pwrite_zeroes = bdrv_mirror_top_pwrite_zeroes, .bdrv_co_pdiscard = bdrv_mirror_top_pdiscard, .bdrv_co_flush = bdrv_mirror_top_flush, - .bdrv_co_get_block_status = bdrv_co_get_block_status_from_backing, + .bdrv_co_block_status = bdrv_co_block_status_from_backing, .bdrv_refresh_filename = bdrv_mirror_top_refresh_filename, .bdrv_close = bdrv_mirror_top_close, .bdrv_child_perm = bdrv_mirror_top_child_perm, diff --git a/block/throttle.c b/block/throttle.c index 495f88c7521d..5f4d43d0fc57 100644 --- a/block/throttle.c +++ b/block/throttle.c @@ -240,7 +240,7 @@ static BlockDriver bdrv_throttle = { .bdrv_reopen_prepare = throttle_reopen_prepare, .bdrv_reopen_commit = throttle_reopen_commit, .bdrv_reopen_abort = throttle_reopen_abort, - .bdrv_co_get_block_status = bdrv_co_get_block_status_from_file, + .bdrv_co_block_status = bdrv_co_block_status_from_file, .bdrv_co_drain_begin = throttle_co_drain_begin, .bdrv_co_drain_end = throttle_co_drain_end, diff --git a/include/block/block_int.h b/include/block/block_int.h index c93722b43a4a..bf2598856cf1 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -1041,23 +1041,27 @@ void bdrv_format_default_perms(BlockDriverState *bs, BdrvChild *c, uint64_t *nperm, uint64_t *nshared); /* - * Default implementation for drivers to pass bdrv_co_get_block_status() to + * Default implementation for drivers to pass bdrv_co_block_status() to * their file. 
*/ -int64_t coroutine_fn bdrv_co_get_block_status_from_file(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, - int *pnum, - BlockDriverState **file); +int coroutine_fn bdrv_co_block_status_from_file(BlockDriverState *bs, + bool want_zero, + int64_t offset, + int64_t bytes, + int64_t *pnum, + int64_t *map, + BlockDriverState **file); /* - * Default implementation for drivers to pass bdrv_co_get_block_status() to + * Default implementation for drivers to pass bdrv_co_block_status() to * their backing file. */ -int64_t coroutine_fn bdrv_co_get_block_status_from_backing(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, - int *pnum, - BlockDriverState **file); +int coroutine_fn bdrv_co_block_status_from_backing(BlockDriverState *bs, + bool want_zero, + int64_t offset, + int64_t bytes, + int64_t *pnum, + int64_t *map, + BlockDriverState **file); const char *bdrv_get_parent_name(const BlockDriverState *bs); void blk_dev_change_media_cb(BlockBackend *blk, bool load, Error **errp); bool blk_dev_has_removable_media(BlockBackend *blk); From a290f085901b528265787cd27ebda19c970be4ee Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Tue, 13 Feb 2018 14:26:44 -0600 Subject: [PATCH 04/38] file-posix: Switch to .bdrv_co_block_status() We are gradually moving away from sector-based interfaces, towards byte-based. Update the file protocol driver accordingly. In want_zero mode, we continue to report fine-grained hole information (the caller wants as much mapping detail as possible); but when not in that mode, the caller prefers larger *pnum and merely cares about what offsets are allocated at this layer, rather than where the holes live. Since holes still read as zeroes at this layer (rather than deferring to a backing layer), we can take the shortcut of skipping lseek(), and merely state that all bytes are allocated. We can also drop redundant bounds checks that are already guaranteed by the block layer. 
Signed-off-by: Eric Blake Reviewed-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/file-posix.c | 64 ++++++++++++++++++++++------------------------ 1 file changed, 30 insertions(+), 34 deletions(-) diff --git a/block/file-posix.c b/block/file-posix.c index ca49c1a98ae1..f1591c38490c 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -2131,25 +2131,24 @@ static int find_allocation(BlockDriverState *bs, off_t start, } /* - * Returns the allocation status of the specified sectors. + * Returns the allocation status of the specified offset. * - * If 'sector_num' is beyond the end of the disk image the return value is 0 - * and 'pnum' is set to 0. + * The block layer guarantees 'offset' and 'bytes' are within bounds. * - * 'pnum' is set to the number of sectors (including and immediately following - * the specified sector) that are known to be in the same + * 'pnum' is set to the number of bytes (including and immediately following + * the specified offset) that are known to be in the same * allocated/unallocated state. * - * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes - * beyond the end of the disk image it will be clamped. + * 'bytes' is the max value 'pnum' should be set to. 
*/ -static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, int *pnum, - BlockDriverState **file) -{ - off_t start, data = 0, hole = 0; - int64_t total_size; +static int coroutine_fn raw_co_block_status(BlockDriverState *bs, + bool want_zero, + int64_t offset, + int64_t bytes, int64_t *pnum, + int64_t *map, + BlockDriverState **file) +{ + off_t data = 0, hole = 0; int ret; ret = fd_open(bs); @@ -2157,39 +2156,36 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, return ret; } - start = sector_num * BDRV_SECTOR_SIZE; - total_size = bdrv_getlength(bs); - if (total_size < 0) { - return total_size; - } else if (start >= total_size) { - *pnum = 0; - return 0; - } else if (start + nb_sectors * BDRV_SECTOR_SIZE > total_size) { - nb_sectors = DIV_ROUND_UP(total_size - start, BDRV_SECTOR_SIZE); + if (!want_zero) { + *pnum = bytes; + *map = offset; + *file = bs; + return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID; } - ret = find_allocation(bs, start, &data, &hole); + ret = find_allocation(bs, offset, &data, &hole); if (ret == -ENXIO) { /* Trailing hole */ - *pnum = nb_sectors; + *pnum = bytes; ret = BDRV_BLOCK_ZERO; } else if (ret < 0) { /* No info available, so pretend there are no holes */ - *pnum = nb_sectors; + *pnum = bytes; ret = BDRV_BLOCK_DATA; - } else if (data == start) { - /* On a data extent, compute sectors to the end of the extent, + } else if (data == offset) { + /* On a data extent, compute bytes to the end of the extent, * possibly including a partial sector at EOF. */ - *pnum = MIN(nb_sectors, DIV_ROUND_UP(hole - start, BDRV_SECTOR_SIZE)); + *pnum = MIN(bytes, hole - offset); ret = BDRV_BLOCK_DATA; } else { - /* On a hole, compute sectors to the beginning of the next extent. */ - assert(hole == start); - *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE); + /* On a hole, compute bytes to the beginning of the next extent. 
*/ + assert(hole == offset); + *pnum = MIN(bytes, data - offset); ret = BDRV_BLOCK_ZERO; } + *map = offset; *file = bs; - return ret | BDRV_BLOCK_OFFSET_VALID | start; + return ret | BDRV_BLOCK_OFFSET_VALID; } static coroutine_fn BlockAIOCB *raw_aio_pdiscard(BlockDriverState *bs, @@ -2282,7 +2278,7 @@ BlockDriver bdrv_file = { .bdrv_close = raw_close, .bdrv_create = raw_create, .bdrv_has_zero_init = bdrv_has_zero_init_1, - .bdrv_co_get_block_status = raw_co_get_block_status, + .bdrv_co_block_status = raw_co_block_status, .bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes, .bdrv_co_preadv = raw_co_preadv, From 08c9e7735e4340a13596c5b97727655dee1fbb99 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Tue, 13 Feb 2018 14:26:45 -0600 Subject: [PATCH 05/38] gluster: Switch to .bdrv_co_block_status() We are gradually moving away from sector-based interfaces, towards byte-based. Update the gluster driver accordingly. In want_zero mode, we continue to report fine-grained hole information (the caller wants as much mapping detail as possible); but when not in that mode, the caller prefers larger *pnum and merely cares about what offsets are allocated at this layer, rather than where the holes live. Since holes still read as zeroes at this layer (rather than deferring to a backing layer), we can take the shortcut of skipping find_allocation(), and merely state that all bytes are allocated. We can also drop redundant bounds checks that are already guaranteed by the block layer. 
Signed-off-by: Eric Blake Reviewed-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/gluster.c | 70 ++++++++++++++++++++++++------------------------- 1 file changed, 34 insertions(+), 36 deletions(-) diff --git a/block/gluster.c b/block/gluster.c index 3f17b7819d23..1a07d221d173 100644 --- a/block/gluster.c +++ b/block/gluster.c @@ -1362,68 +1362,66 @@ static int find_allocation(BlockDriverState *bs, off_t start, } /* - * Returns the allocation status of the specified sectors. + * Returns the allocation status of the specified offset. * - * If 'sector_num' is beyond the end of the disk image the return value is 0 - * and 'pnum' is set to 0. + * The block layer guarantees 'offset' and 'bytes' are within bounds. * - * 'pnum' is set to the number of sectors (including and immediately following - * the specified sector) that are known to be in the same + * 'pnum' is set to the number of bytes (including and immediately following + * the specified offset) that are known to be in the same * allocated/unallocated state. * - * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes - * beyond the end of the disk image it will be clamped. + * 'bytes' is the max value 'pnum' should be set to. * - * (Based on raw_co_get_block_status() from file-posix.c.) + * (Based on raw_co_block_status() from file-posix.c.) 
*/ -static int64_t coroutine_fn qemu_gluster_co_get_block_status( - BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum, - BlockDriverState **file) +static int coroutine_fn qemu_gluster_co_block_status(BlockDriverState *bs, + bool want_zero, + int64_t offset, + int64_t bytes, + int64_t *pnum, + int64_t *map, + BlockDriverState **file) { BDRVGlusterState *s = bs->opaque; - off_t start, data = 0, hole = 0; - int64_t total_size; + off_t data = 0, hole = 0; int ret = -EINVAL; if (!s->fd) { return ret; } - start = sector_num * BDRV_SECTOR_SIZE; - total_size = bdrv_getlength(bs); - if (total_size < 0) { - return total_size; - } else if (start >= total_size) { - *pnum = 0; - return 0; - } else if (start + nb_sectors * BDRV_SECTOR_SIZE > total_size) { - nb_sectors = DIV_ROUND_UP(total_size - start, BDRV_SECTOR_SIZE); + if (!want_zero) { + *pnum = bytes; + *map = offset; + *file = bs; + return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID; } - ret = find_allocation(bs, start, &data, &hole); + ret = find_allocation(bs, offset, &data, &hole); if (ret == -ENXIO) { /* Trailing hole */ - *pnum = nb_sectors; + *pnum = bytes; ret = BDRV_BLOCK_ZERO; } else if (ret < 0) { /* No info available, so pretend there are no holes */ - *pnum = nb_sectors; + *pnum = bytes; ret = BDRV_BLOCK_DATA; - } else if (data == start) { - /* On a data extent, compute sectors to the end of the extent, + } else if (data == offset) { + /* On a data extent, compute bytes to the end of the extent, * possibly including a partial sector at EOF. */ - *pnum = MIN(nb_sectors, DIV_ROUND_UP(hole - start, BDRV_SECTOR_SIZE)); + *pnum = MIN(bytes, hole - offset); ret = BDRV_BLOCK_DATA; } else { - /* On a hole, compute sectors to the beginning of the next extent. */ - assert(hole == start); - *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE); + /* On a hole, compute bytes to the beginning of the next extent. 
*/ + assert(hole == offset); + *pnum = MIN(bytes, data - offset); ret = BDRV_BLOCK_ZERO; } + *map = offset; *file = bs; - return ret | BDRV_BLOCK_OFFSET_VALID | start; + return ret | BDRV_BLOCK_OFFSET_VALID; } @@ -1451,7 +1449,7 @@ static BlockDriver bdrv_gluster = { #ifdef CONFIG_GLUSTERFS_ZEROFILL .bdrv_co_pwrite_zeroes = qemu_gluster_co_pwrite_zeroes, #endif - .bdrv_co_get_block_status = qemu_gluster_co_get_block_status, + .bdrv_co_block_status = qemu_gluster_co_block_status, .create_opts = &qemu_gluster_create_opts, }; @@ -1479,7 +1477,7 @@ static BlockDriver bdrv_gluster_tcp = { #ifdef CONFIG_GLUSTERFS_ZEROFILL .bdrv_co_pwrite_zeroes = qemu_gluster_co_pwrite_zeroes, #endif - .bdrv_co_get_block_status = qemu_gluster_co_get_block_status, + .bdrv_co_block_status = qemu_gluster_co_block_status, .create_opts = &qemu_gluster_create_opts, }; @@ -1507,7 +1505,7 @@ static BlockDriver bdrv_gluster_unix = { #ifdef CONFIG_GLUSTERFS_ZEROFILL .bdrv_co_pwrite_zeroes = qemu_gluster_co_pwrite_zeroes, #endif - .bdrv_co_get_block_status = qemu_gluster_co_get_block_status, + .bdrv_co_block_status = qemu_gluster_co_block_status, .create_opts = &qemu_gluster_create_opts, }; @@ -1541,7 +1539,7 @@ static BlockDriver bdrv_gluster_rdma = { #ifdef CONFIG_GLUSTERFS_ZEROFILL .bdrv_co_pwrite_zeroes = qemu_gluster_co_pwrite_zeroes, #endif - .bdrv_co_get_block_status = qemu_gluster_co_get_block_status, + .bdrv_co_block_status = qemu_gluster_co_block_status, .create_opts = &qemu_gluster_create_opts, }; From ba059e7b1785fbb7b3a60d4706f519b1cffab1e3 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Tue, 13 Feb 2018 14:26:46 -0600 Subject: [PATCH 06/38] iscsi: Switch cluster_sectors to byte-based We are gradually converting to byte-based interfaces, as they are easier to reason about than sector-based. Convert all uses of the cluster size in sectors, along with adding assertions that we are not dividing by zero. Improve some comment grammar while in the area. 
Signed-off-by: Eric Blake Acked-by: Paolo Bonzini Reviewed-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/iscsi.c | 56 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 35 insertions(+), 21 deletions(-) diff --git a/block/iscsi.c b/block/iscsi.c index 421983dd6ff6..3414c21c7f5a 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -86,7 +86,7 @@ typedef struct IscsiLun { unsigned long *allocmap; unsigned long *allocmap_valid; long allocmap_size; - int cluster_sectors; + int cluster_size; bool use_16_for_rw; bool write_protected; bool lbpme; @@ -430,9 +430,10 @@ static int iscsi_allocmap_init(IscsiLun *iscsilun, int open_flags) { iscsi_allocmap_free(iscsilun); + assert(iscsilun->cluster_size); iscsilun->allocmap_size = - DIV_ROUND_UP(sector_lun2qemu(iscsilun->num_blocks, iscsilun), - iscsilun->cluster_sectors); + DIV_ROUND_UP(iscsilun->num_blocks * iscsilun->block_size, + iscsilun->cluster_size); iscsilun->allocmap = bitmap_try_new(iscsilun->allocmap_size); if (!iscsilun->allocmap) { @@ -440,7 +441,7 @@ static int iscsi_allocmap_init(IscsiLun *iscsilun, int open_flags) } if (open_flags & BDRV_O_NOCACHE) { - /* in case that cache.direct = on all allocmap entries are + /* when cache.direct = on all allocmap entries are * treated as invalid to force a relookup of the block * status on every read request */ return 0; @@ -461,17 +462,19 @@ iscsi_allocmap_update(IscsiLun *iscsilun, int64_t sector_num, int nb_sectors, bool allocated, bool valid) { int64_t cl_num_expanded, nb_cls_expanded, cl_num_shrunk, nb_cls_shrunk; + int cluster_sectors = iscsilun->cluster_size >> BDRV_SECTOR_BITS; if (iscsilun->allocmap == NULL) { return; } /* expand to entirely contain all affected clusters */ - cl_num_expanded = sector_num / iscsilun->cluster_sectors; + assert(cluster_sectors); + cl_num_expanded = sector_num / cluster_sectors; nb_cls_expanded = DIV_ROUND_UP(sector_num + nb_sectors, - iscsilun->cluster_sectors) - cl_num_expanded; + cluster_sectors) - cl_num_expanded; 
/* shrink to touch only completely contained clusters */ - cl_num_shrunk = DIV_ROUND_UP(sector_num, iscsilun->cluster_sectors); - nb_cls_shrunk = (sector_num + nb_sectors) / iscsilun->cluster_sectors + cl_num_shrunk = DIV_ROUND_UP(sector_num, cluster_sectors); + nb_cls_shrunk = (sector_num + nb_sectors) / cluster_sectors - cl_num_shrunk; if (allocated) { bitmap_set(iscsilun->allocmap, cl_num_expanded, nb_cls_expanded); @@ -535,9 +538,12 @@ iscsi_allocmap_is_allocated(IscsiLun *iscsilun, int64_t sector_num, if (iscsilun->allocmap == NULL) { return true; } - size = DIV_ROUND_UP(sector_num + nb_sectors, iscsilun->cluster_sectors); + assert(iscsilun->cluster_size); + size = DIV_ROUND_UP(sector_num + nb_sectors, + iscsilun->cluster_size >> BDRV_SECTOR_BITS); return !(find_next_bit(iscsilun->allocmap, size, - sector_num / iscsilun->cluster_sectors) == size); + sector_num * BDRV_SECTOR_SIZE / + iscsilun->cluster_size) == size); } static inline bool iscsi_allocmap_is_valid(IscsiLun *iscsilun, @@ -547,9 +553,12 @@ static inline bool iscsi_allocmap_is_valid(IscsiLun *iscsilun, if (iscsilun->allocmap_valid == NULL) { return false; } - size = DIV_ROUND_UP(sector_num + nb_sectors, iscsilun->cluster_sectors); + assert(iscsilun->cluster_size); + size = DIV_ROUND_UP(sector_num + nb_sectors, + iscsilun->cluster_size >> BDRV_SECTOR_BITS); return (find_next_zero_bit(iscsilun->allocmap_valid, size, - sector_num / iscsilun->cluster_sectors) == size); + sector_num * BDRV_SECTOR_SIZE / + iscsilun->cluster_size) == size); } static int coroutine_fn @@ -793,16 +802,21 @@ static int coroutine_fn iscsi_co_readv(BlockDriverState *bs, BlockDriverState *file; /* check the block status from the beginning of the cluster * containing the start sector */ - int64_t ret = iscsi_co_get_block_status(bs, - sector_num - sector_num % iscsilun->cluster_sectors, - BDRV_REQUEST_MAX_SECTORS, &pnum, &file); + int cluster_sectors = iscsilun->cluster_size >> BDRV_SECTOR_BITS; + int head; + int64_t ret; + + 
assert(cluster_sectors); + head = sector_num % cluster_sectors; + ret = iscsi_co_get_block_status(bs, sector_num - head, + BDRV_REQUEST_MAX_SECTORS, &pnum, + &file); if (ret < 0) { return ret; } /* if the whole request falls into an unallocated area we can avoid - * to read and directly return zeroes instead */ - if (ret & BDRV_BLOCK_ZERO && - pnum >= nb_sectors + sector_num % iscsilun->cluster_sectors) { + * reading and directly return zeroes instead */ + if (ret & BDRV_BLOCK_ZERO && pnum >= nb_sectors + head) { qemu_iovec_memset(iov, 0, 0x00, iov->size); return 0; } @@ -1953,8 +1967,8 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags, * reasonable size */ if (iscsilun->bl.opt_unmap_gran * iscsilun->block_size >= 4 * 1024 && iscsilun->bl.opt_unmap_gran * iscsilun->block_size <= 16 * 1024 * 1024) { - iscsilun->cluster_sectors = (iscsilun->bl.opt_unmap_gran * - iscsilun->block_size) >> BDRV_SECTOR_BITS; + iscsilun->cluster_size = iscsilun->bl.opt_unmap_gran * + iscsilun->block_size; if (iscsilun->lbprz) { ret = iscsi_allocmap_init(iscsilun, bs->open_flags); } @@ -2163,7 +2177,7 @@ static int iscsi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) { IscsiLun *iscsilun = bs->opaque; bdi->unallocated_blocks_are_zero = iscsilun->lbprz; - bdi->cluster_size = iscsilun->cluster_sectors * BDRV_SECTOR_SIZE; + bdi->cluster_size = iscsilun->cluster_size; return 0; } From 04a408fbffa634bf84cb8f23e4f30c4eb8cb4b05 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Tue, 13 Feb 2018 14:26:47 -0600 Subject: [PATCH 07/38] iscsi: Switch iscsi_allocmap_update() to byte-based We are gradually converting to byte-based interfaces, as they are easier to reason about than sector-based. Convert all uses of the allocmap (no semantic change). Callers that already had bytes available are simpler, and callers that now scale to bytes will be easier to switch to byte-based in the future. 
Signed-off-by: Eric Blake Acked-by: Paolo Bonzini Reviewed-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/iscsi.c | 90 +++++++++++++++++++++++++-------------------------- 1 file changed, 44 insertions(+), 46 deletions(-) diff --git a/block/iscsi.c b/block/iscsi.c index 3414c21c7f5a..d2b0466775c4 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -458,24 +458,22 @@ static int iscsi_allocmap_init(IscsiLun *iscsilun, int open_flags) } static void -iscsi_allocmap_update(IscsiLun *iscsilun, int64_t sector_num, - int nb_sectors, bool allocated, bool valid) +iscsi_allocmap_update(IscsiLun *iscsilun, int64_t offset, + int64_t bytes, bool allocated, bool valid) { int64_t cl_num_expanded, nb_cls_expanded, cl_num_shrunk, nb_cls_shrunk; - int cluster_sectors = iscsilun->cluster_size >> BDRV_SECTOR_BITS; if (iscsilun->allocmap == NULL) { return; } /* expand to entirely contain all affected clusters */ - assert(cluster_sectors); - cl_num_expanded = sector_num / cluster_sectors; - nb_cls_expanded = DIV_ROUND_UP(sector_num + nb_sectors, - cluster_sectors) - cl_num_expanded; + assert(iscsilun->cluster_size); + cl_num_expanded = offset / iscsilun->cluster_size; + nb_cls_expanded = DIV_ROUND_UP(offset + bytes, + iscsilun->cluster_size) - cl_num_expanded; /* shrink to touch only completely contained clusters */ - cl_num_shrunk = DIV_ROUND_UP(sector_num, cluster_sectors); - nb_cls_shrunk = (sector_num + nb_sectors) / cluster_sectors - - cl_num_shrunk; + cl_num_shrunk = DIV_ROUND_UP(offset, iscsilun->cluster_size); + nb_cls_shrunk = (offset + bytes) / iscsilun->cluster_size - cl_num_shrunk; if (allocated) { bitmap_set(iscsilun->allocmap, cl_num_expanded, nb_cls_expanded); } else { @@ -498,26 +496,26 @@ iscsi_allocmap_update(IscsiLun *iscsilun, int64_t sector_num, } static void -iscsi_allocmap_set_allocated(IscsiLun *iscsilun, int64_t sector_num, - int nb_sectors) +iscsi_allocmap_set_allocated(IscsiLun *iscsilun, int64_t offset, + int64_t bytes) { - iscsi_allocmap_update(iscsilun, 
sector_num, nb_sectors, true, true); + iscsi_allocmap_update(iscsilun, offset, bytes, true, true); } static void -iscsi_allocmap_set_unallocated(IscsiLun *iscsilun, int64_t sector_num, - int nb_sectors) +iscsi_allocmap_set_unallocated(IscsiLun *iscsilun, int64_t offset, + int64_t bytes) { /* Note: if cache.direct=on the fifth argument to iscsi_allocmap_update * is ignored, so this will in effect be an iscsi_allocmap_set_invalid. */ - iscsi_allocmap_update(iscsilun, sector_num, nb_sectors, false, true); + iscsi_allocmap_update(iscsilun, offset, bytes, false, true); } -static void iscsi_allocmap_set_invalid(IscsiLun *iscsilun, int64_t sector_num, - int nb_sectors) +static void iscsi_allocmap_set_invalid(IscsiLun *iscsilun, int64_t offset, + int64_t bytes) { - iscsi_allocmap_update(iscsilun, sector_num, nb_sectors, false, false); + iscsi_allocmap_update(iscsilun, offset, bytes, false, false); } static void iscsi_allocmap_invalidate(IscsiLun *iscsilun) @@ -531,34 +529,30 @@ static void iscsi_allocmap_invalidate(IscsiLun *iscsilun) } static inline bool -iscsi_allocmap_is_allocated(IscsiLun *iscsilun, int64_t sector_num, - int nb_sectors) +iscsi_allocmap_is_allocated(IscsiLun *iscsilun, int64_t offset, + int64_t bytes) { unsigned long size; if (iscsilun->allocmap == NULL) { return true; } assert(iscsilun->cluster_size); - size = DIV_ROUND_UP(sector_num + nb_sectors, - iscsilun->cluster_size >> BDRV_SECTOR_BITS); + size = DIV_ROUND_UP(offset + bytes, iscsilun->cluster_size); return !(find_next_bit(iscsilun->allocmap, size, - sector_num * BDRV_SECTOR_SIZE / - iscsilun->cluster_size) == size); + offset / iscsilun->cluster_size) == size); } static inline bool iscsi_allocmap_is_valid(IscsiLun *iscsilun, - int64_t sector_num, int nb_sectors) + int64_t offset, int64_t bytes) { unsigned long size; if (iscsilun->allocmap_valid == NULL) { return false; } assert(iscsilun->cluster_size); - size = DIV_ROUND_UP(sector_num + nb_sectors, - iscsilun->cluster_size >> BDRV_SECTOR_BITS); + 
size = DIV_ROUND_UP(offset + bytes, iscsilun->cluster_size); return (find_next_zero_bit(iscsilun->allocmap_valid, size, - sector_num * BDRV_SECTOR_SIZE / - iscsilun->cluster_size) == size); + offset / iscsilun->cluster_size) == size); } static int coroutine_fn @@ -640,14 +634,16 @@ iscsi_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors, } if (iTask.status != SCSI_STATUS_GOOD) { - iscsi_allocmap_set_invalid(iscsilun, sector_num, nb_sectors); + iscsi_allocmap_set_invalid(iscsilun, sector_num * BDRV_SECTOR_SIZE, + nb_sectors * BDRV_SECTOR_SIZE); error_report("iSCSI WRITE10/16 failed at lba %" PRIu64 ": %s", lba, iTask.err_str); r = iTask.err_code; goto out_unlock; } - iscsi_allocmap_set_allocated(iscsilun, sector_num, nb_sectors); + iscsi_allocmap_set_allocated(iscsilun, sector_num * BDRV_SECTOR_SIZE, + nb_sectors * BDRV_SECTOR_SIZE); out_unlock: qemu_mutex_unlock(&iscsilun->mutex); @@ -747,9 +743,11 @@ static int64_t coroutine_fn iscsi_co_get_block_status(BlockDriverState *bs, } if (ret & BDRV_BLOCK_ZERO) { - iscsi_allocmap_set_unallocated(iscsilun, sector_num, *pnum); + iscsi_allocmap_set_unallocated(iscsilun, sector_num * BDRV_SECTOR_SIZE, + *pnum * BDRV_SECTOR_SIZE); } else { - iscsi_allocmap_set_allocated(iscsilun, sector_num, *pnum); + iscsi_allocmap_set_allocated(iscsilun, sector_num * BDRV_SECTOR_SIZE, + *pnum * BDRV_SECTOR_SIZE); } if (*pnum > nb_sectors) { @@ -789,15 +787,19 @@ static int coroutine_fn iscsi_co_readv(BlockDriverState *bs, /* if cache.direct is off and we have a valid entry in our allocation map * we can skip checking the block status and directly return zeroes if * the request falls within an unallocated area */ - if (iscsi_allocmap_is_valid(iscsilun, sector_num, nb_sectors) && - !iscsi_allocmap_is_allocated(iscsilun, sector_num, nb_sectors)) { + if (iscsi_allocmap_is_valid(iscsilun, sector_num * BDRV_SECTOR_SIZE, + nb_sectors * BDRV_SECTOR_SIZE) && + !iscsi_allocmap_is_allocated(iscsilun, sector_num * 
BDRV_SECTOR_SIZE, + nb_sectors * BDRV_SECTOR_SIZE)) { qemu_iovec_memset(iov, 0, 0x00, iov->size); return 0; } if (nb_sectors >= ISCSI_CHECKALLOC_THRES && - !iscsi_allocmap_is_valid(iscsilun, sector_num, nb_sectors) && - !iscsi_allocmap_is_allocated(iscsilun, sector_num, nb_sectors)) { + !iscsi_allocmap_is_valid(iscsilun, sector_num * BDRV_SECTOR_SIZE, + nb_sectors * BDRV_SECTOR_SIZE) && + !iscsi_allocmap_is_allocated(iscsilun, sector_num * BDRV_SECTOR_SIZE, + nb_sectors * BDRV_SECTOR_SIZE)) { int pnum; BlockDriverState *file; /* check the block status from the beginning of the cluster @@ -1160,8 +1162,7 @@ coroutine_fn iscsi_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes) goto retry; } - iscsi_allocmap_set_invalid(iscsilun, offset >> BDRV_SECTOR_BITS, - bytes >> BDRV_SECTOR_BITS); + iscsi_allocmap_set_invalid(iscsilun, offset, bytes); if (iTask.status == SCSI_STATUS_CHECK_CONDITION) { /* the target might fail with a check condition if it @@ -1274,8 +1275,7 @@ coroutine_fn iscsi_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, } if (iTask.status != SCSI_STATUS_GOOD) { - iscsi_allocmap_set_invalid(iscsilun, offset >> BDRV_SECTOR_BITS, - bytes >> BDRV_SECTOR_BITS); + iscsi_allocmap_set_invalid(iscsilun, offset, bytes); error_report("iSCSI WRITESAME10/16 failed at lba %" PRIu64 ": %s", lba, iTask.err_str); r = iTask.err_code; @@ -1283,11 +1283,9 @@ coroutine_fn iscsi_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, } if (flags & BDRV_REQ_MAY_UNMAP) { - iscsi_allocmap_set_invalid(iscsilun, offset >> BDRV_SECTOR_BITS, - bytes >> BDRV_SECTOR_BITS); + iscsi_allocmap_set_invalid(iscsilun, offset, bytes); } else { - iscsi_allocmap_set_allocated(iscsilun, offset >> BDRV_SECTOR_BITS, - bytes >> BDRV_SECTOR_BITS); + iscsi_allocmap_set_allocated(iscsilun, offset, bytes); } out_unlock: From 92809c36009de25d4f41e88f572bca3cb26b7387 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Tue, 13 Feb 2018 14:26:48 -0600 Subject: [PATCH 08/38] iscsi: Switch to 
.bdrv_co_block_status() We are gradually moving away from sector-based interfaces, towards byte-based. Update the iscsi driver accordingly. In this case, it is handy to teach iscsi_co_block_status() to handle a NULL map and file parameter, even though the block layer passes non-NULL values, because we also call the function directly. For now, there are no optimizations done based on the want_zero flag. We can also make the simplification of asserting that the block layer passed in aligned values. Signed-off-by: Eric Blake Reviewed-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/iscsi.c | 69 ++++++++++++++++++++++++--------------------------- 1 file changed, 33 insertions(+), 36 deletions(-) diff --git a/block/iscsi.c b/block/iscsi.c index d2b0466775c4..c228ca21c8fe 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -653,36 +653,36 @@ iscsi_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors, -static int64_t coroutine_fn iscsi_co_get_block_status(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, int *pnum, - BlockDriverState **file) +static int coroutine_fn iscsi_co_block_status(BlockDriverState *bs, + bool want_zero, int64_t offset, + int64_t bytes, int64_t *pnum, + int64_t *map, + BlockDriverState **file) { IscsiLun *iscsilun = bs->opaque; struct scsi_get_lba_status *lbas = NULL; struct scsi_lba_status_descriptor *lbasd = NULL; struct IscsiTask iTask; uint64_t lba; - int64_t ret; + int ret; iscsi_co_init_iscsitask(iscsilun, &iTask); - if (!is_sector_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { - ret = -EINVAL; - goto out; - } + assert(QEMU_IS_ALIGNED(offset | bytes, iscsilun->block_size)); /* default to all sectors allocated */ - ret = BDRV_BLOCK_DATA; - ret |= (sector_num << BDRV_SECTOR_BITS) | BDRV_BLOCK_OFFSET_VALID; - *pnum = nb_sectors; + ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID; + if (map) { + *map = offset; + } + *pnum = bytes; /* LUN does not support logical block provisioning */ if (!iscsilun->lbpme) { 
goto out; } - lba = sector_qemu2lun(sector_num, iscsilun); + lba = offset / iscsilun->block_size; qemu_mutex_lock(&iscsilun->mutex); retry: @@ -727,12 +727,12 @@ static int64_t coroutine_fn iscsi_co_get_block_status(BlockDriverState *bs, lbasd = &lbas->descriptors[0]; - if (sector_qemu2lun(sector_num, iscsilun) != lbasd->lba) { + if (lba != lbasd->lba) { ret = -EIO; goto out_unlock; } - *pnum = sector_lun2qemu(lbasd->num_blocks, iscsilun); + *pnum = lbasd->num_blocks * iscsilun->block_size; if (lbasd->provisioning == SCSI_PROVISIONING_TYPE_DEALLOCATED || lbasd->provisioning == SCSI_PROVISIONING_TYPE_ANCHORED) { @@ -743,15 +743,13 @@ static int64_t coroutine_fn iscsi_co_get_block_status(BlockDriverState *bs, } if (ret & BDRV_BLOCK_ZERO) { - iscsi_allocmap_set_unallocated(iscsilun, sector_num * BDRV_SECTOR_SIZE, - *pnum * BDRV_SECTOR_SIZE); + iscsi_allocmap_set_unallocated(iscsilun, offset, *pnum); } else { - iscsi_allocmap_set_allocated(iscsilun, sector_num * BDRV_SECTOR_SIZE, - *pnum * BDRV_SECTOR_SIZE); + iscsi_allocmap_set_allocated(iscsilun, offset, *pnum); } - if (*pnum > nb_sectors) { - *pnum = nb_sectors; + if (*pnum > bytes) { + *pnum = bytes; } out_unlock: qemu_mutex_unlock(&iscsilun->mutex); @@ -760,7 +758,7 @@ static int64_t coroutine_fn iscsi_co_get_block_status(BlockDriverState *bs, if (iTask.task != NULL) { scsi_free_scsi_task(iTask.task); } - if (ret > 0 && ret & BDRV_BLOCK_OFFSET_VALID) { + if (ret > 0 && ret & BDRV_BLOCK_OFFSET_VALID && file) { *file = bs; } return ret; @@ -800,25 +798,24 @@ static int coroutine_fn iscsi_co_readv(BlockDriverState *bs, nb_sectors * BDRV_SECTOR_SIZE) && !iscsi_allocmap_is_allocated(iscsilun, sector_num * BDRV_SECTOR_SIZE, nb_sectors * BDRV_SECTOR_SIZE)) { - int pnum; - BlockDriverState *file; + int64_t pnum; /* check the block status from the beginning of the cluster * containing the start sector */ - int cluster_sectors = iscsilun->cluster_size >> BDRV_SECTOR_BITS; - int head; - int64_t ret; - - 
assert(cluster_sectors); - head = sector_num % cluster_sectors; - ret = iscsi_co_get_block_status(bs, sector_num - head, - BDRV_REQUEST_MAX_SECTORS, &pnum, - &file); + int64_t head; + int ret; + + assert(iscsilun->cluster_size); + head = (sector_num * BDRV_SECTOR_SIZE) % iscsilun->cluster_size; + ret = iscsi_co_block_status(bs, true, + sector_num * BDRV_SECTOR_SIZE - head, + BDRV_REQUEST_MAX_BYTES, &pnum, NULL, NULL); if (ret < 0) { return ret; } /* if the whole request falls into an unallocated area we can avoid * reading and directly return zeroes instead */ - if (ret & BDRV_BLOCK_ZERO && pnum >= nb_sectors + head) { + if (ret & BDRV_BLOCK_ZERO && + pnum >= nb_sectors * BDRV_SECTOR_SIZE + head) { qemu_iovec_memset(iov, 0, 0x00, iov->size); return 0; } @@ -2218,7 +2215,7 @@ static BlockDriver bdrv_iscsi = { .bdrv_truncate = iscsi_truncate, .bdrv_refresh_limits = iscsi_refresh_limits, - .bdrv_co_get_block_status = iscsi_co_get_block_status, + .bdrv_co_block_status = iscsi_co_block_status, .bdrv_co_pdiscard = iscsi_co_pdiscard, .bdrv_co_pwrite_zeroes = iscsi_co_pwrite_zeroes, .bdrv_co_readv = iscsi_co_readv, @@ -2253,7 +2250,7 @@ static BlockDriver bdrv_iser = { .bdrv_truncate = iscsi_truncate, .bdrv_refresh_limits = iscsi_refresh_limits, - .bdrv_co_get_block_status = iscsi_co_get_block_status, + .bdrv_co_block_status = iscsi_co_block_status, .bdrv_co_pdiscard = iscsi_co_pdiscard, .bdrv_co_pwrite_zeroes = iscsi_co_pwrite_zeroes, .bdrv_co_readv = iscsi_co_readv, From 05c33f1021f95bc18af153ea5669be57bba527e5 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Tue, 13 Feb 2018 14:26:49 -0600 Subject: [PATCH 09/38] null: Switch to .bdrv_co_block_status() We are gradually moving away from sector-based interfaces, towards byte-based. Update the null driver accordingly. 
Signed-off-by: Eric Blake Reviewed-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/null.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/block/null.c b/block/null.c index 214d394fff42..806a8631e4d2 100644 --- a/block/null.c +++ b/block/null.c @@ -223,22 +223,23 @@ static int null_reopen_prepare(BDRVReopenState *reopen_state, return 0; } -static int64_t coroutine_fn null_co_get_block_status(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, int *pnum, - BlockDriverState **file) +static int coroutine_fn null_co_block_status(BlockDriverState *bs, + bool want_zero, int64_t offset, + int64_t bytes, int64_t *pnum, + int64_t *map, + BlockDriverState **file) { BDRVNullState *s = bs->opaque; - off_t start = sector_num * BDRV_SECTOR_SIZE; + int ret = BDRV_BLOCK_OFFSET_VALID; - *pnum = nb_sectors; + *pnum = bytes; + *map = offset; *file = bs; if (s->read_zeroes) { - return BDRV_BLOCK_OFFSET_VALID | start | BDRV_BLOCK_ZERO; - } else { - return BDRV_BLOCK_OFFSET_VALID | start; + ret |= BDRV_BLOCK_ZERO; } + return ret; } static void null_refresh_filename(BlockDriverState *bs, QDict *opts) @@ -270,7 +271,7 @@ static BlockDriver bdrv_null_co = { .bdrv_co_flush_to_disk = null_co_flush, .bdrv_reopen_prepare = null_reopen_prepare, - .bdrv_co_get_block_status = null_co_get_block_status, + .bdrv_co_block_status = null_co_block_status, .bdrv_refresh_filename = null_refresh_filename, }; @@ -290,7 +291,7 @@ static BlockDriver bdrv_null_aio = { .bdrv_aio_flush = null_aio_flush, .bdrv_reopen_prepare = null_reopen_prepare, - .bdrv_co_get_block_status = null_co_get_block_status, + .bdrv_co_block_status = null_co_block_status, .bdrv_refresh_filename = null_refresh_filename, }; From 8e0cf59d02d2e7b91e785214fbe9a15d8a6ce20b Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Tue, 13 Feb 2018 14:26:50 -0600 Subject: [PATCH 10/38] parallels: Switch to .bdrv_co_block_status() We are gradually moving away 
from sector-based interfaces, towards byte-based. Update the parallels driver accordingly. Note that the internal function block_status() is still sector-based, because it is still in use by other sector-based functions; but that's okay because request_alignment is 512 as a result of those functions. For now, no optimizations are added based on the mapping hint. Signed-off-by: Eric Blake Reviewed-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/parallels.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/block/parallels.c b/block/parallels.c index e1e3d80c8875..3e952a9c147a 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -261,23 +261,31 @@ static coroutine_fn int parallels_co_flush_to_os(BlockDriverState *bs) } -static int64_t coroutine_fn parallels_co_get_block_status(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file) +static int coroutine_fn parallels_co_block_status(BlockDriverState *bs, + bool want_zero, + int64_t offset, + int64_t bytes, + int64_t *pnum, + int64_t *map, + BlockDriverState **file) { BDRVParallelsState *s = bs->opaque; - int64_t offset; + int count; + assert(QEMU_IS_ALIGNED(offset | bytes, BDRV_SECTOR_SIZE)); qemu_co_mutex_lock(&s->lock); - offset = block_status(s, sector_num, nb_sectors, pnum); + offset = block_status(s, offset >> BDRV_SECTOR_BITS, + bytes >> BDRV_SECTOR_BITS, &count); qemu_co_mutex_unlock(&s->lock); + *pnum = count * BDRV_SECTOR_SIZE; if (offset < 0) { return 0; } + *map = offset * BDRV_SECTOR_SIZE; *file = bs->file->bs; - return (offset << BDRV_SECTOR_BITS) | - BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID; + return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID; } static coroutine_fn int parallels_co_writev(BlockDriverState *bs, @@ -782,7 +790,7 @@ static BlockDriver bdrv_parallels = { .bdrv_open = parallels_open, .bdrv_close = parallels_close, .bdrv_child_perm = bdrv_format_default_perms, - 
.bdrv_co_get_block_status = parallels_co_get_block_status, + .bdrv_co_block_status = parallels_co_block_status, .bdrv_has_zero_init = bdrv_has_zero_init_1, .bdrv_co_flush_to_os = parallels_co_flush_to_os, .bdrv_co_readv = parallels_co_readv, From d63b4c93e30915def5afabdb48134f5238ee793d Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Tue, 13 Feb 2018 14:26:51 -0600 Subject: [PATCH 11/38] qcow: Switch to .bdrv_co_block_status() We are gradually moving away from sector-based interfaces, towards byte-based. Update the qcow driver accordingly. There is no intent to optimize based on the want_zero flag for this format. Signed-off-by: Eric Blake Reviewed-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/qcow.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/block/qcow.c b/block/qcow.c index 8631155ac812..dead5029c67d 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -524,23 +524,28 @@ static int get_cluster_offset(BlockDriverState *bs, return 1; } -static int64_t coroutine_fn qcow_co_get_block_status(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file) +static int coroutine_fn qcow_co_block_status(BlockDriverState *bs, + bool want_zero, + int64_t offset, int64_t bytes, + int64_t *pnum, int64_t *map, + BlockDriverState **file) { BDRVQcowState *s = bs->opaque; - int index_in_cluster, n, ret; + int index_in_cluster, ret; + int64_t n; uint64_t cluster_offset; qemu_co_mutex_lock(&s->lock); - ret = get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0, &cluster_offset); + ret = get_cluster_offset(bs, offset, 0, 0, 0, 0, &cluster_offset); qemu_co_mutex_unlock(&s->lock); if (ret < 0) { return ret; } - index_in_cluster = sector_num & (s->cluster_sectors - 1); - n = s->cluster_sectors - index_in_cluster; - if (n > nb_sectors) - n = nb_sectors; + index_in_cluster = offset & (s->cluster_size - 1); + n = s->cluster_size - index_in_cluster; + if (n > bytes) { 
+ n = bytes; + } *pnum = n; if (!cluster_offset) { return 0; @@ -548,9 +553,9 @@ static int64_t coroutine_fn qcow_co_get_block_status(BlockDriverState *bs, if ((cluster_offset & QCOW_OFLAG_COMPRESSED) || s->crypto) { return BDRV_BLOCK_DATA; } - cluster_offset |= (index_in_cluster << BDRV_SECTOR_BITS); + *map = cluster_offset | index_in_cluster; *file = bs->file->bs; - return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | cluster_offset; + return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID; } static int decompress_buffer(uint8_t *out_buf, int out_buf_size, @@ -1128,7 +1133,7 @@ static BlockDriver bdrv_qcow = { .bdrv_co_readv = qcow_co_readv, .bdrv_co_writev = qcow_co_writev, - .bdrv_co_get_block_status = qcow_co_get_block_status, + .bdrv_co_block_status = qcow_co_block_status, .bdrv_make_empty = qcow_make_empty, .bdrv_co_pwritev_compressed = qcow_co_pwritev_compressed, From a320fb04b66a2189b6cbd7924aa7bfaaded79afe Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Tue, 13 Feb 2018 14:26:52 -0600 Subject: [PATCH 12/38] qcow2: Switch to .bdrv_co_block_status() We are gradually moving away from sector-based interfaces, towards byte-based. Update the qcow2 driver accordingly. For now, we are ignoring the 'want_zero' hint. However, it should be relatively straightforward to honor the hint as a way to return larger *pnum values when we have consecutive clusters with the same data/zero status but which differ only in having non-consecutive mappings. 
Signed-off-by: Eric Blake Reviewed-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/qcow2.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index 57a517e2bdd8..288b5299d800 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -1670,32 +1670,34 @@ static void qcow2_join_options(QDict *options, QDict *old_options) } } -static int64_t coroutine_fn qcow2_co_get_block_status(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file) +static int coroutine_fn qcow2_co_block_status(BlockDriverState *bs, + bool want_zero, + int64_t offset, int64_t count, + int64_t *pnum, int64_t *map, + BlockDriverState **file) { BDRVQcow2State *s = bs->opaque; uint64_t cluster_offset; int index_in_cluster, ret; unsigned int bytes; - int64_t status = 0; + int status = 0; - bytes = MIN(INT_MAX, nb_sectors * BDRV_SECTOR_SIZE); + bytes = MIN(INT_MAX, count); qemu_co_mutex_lock(&s->lock); - ret = qcow2_get_cluster_offset(bs, sector_num << BDRV_SECTOR_BITS, &bytes, - &cluster_offset); + ret = qcow2_get_cluster_offset(bs, offset, &bytes, &cluster_offset); qemu_co_mutex_unlock(&s->lock); if (ret < 0) { return ret; } - *pnum = bytes >> BDRV_SECTOR_BITS; + *pnum = bytes; if (cluster_offset != 0 && ret != QCOW2_CLUSTER_COMPRESSED && !s->crypto) { - index_in_cluster = sector_num & (s->cluster_sectors - 1); - cluster_offset |= (index_in_cluster << BDRV_SECTOR_BITS); + index_in_cluster = offset & (s->cluster_size - 1); + *map = cluster_offset | index_in_cluster; *file = bs->file->bs; - status |= BDRV_BLOCK_OFFSET_VALID | cluster_offset; + status |= BDRV_BLOCK_OFFSET_VALID; } if (ret == QCOW2_CLUSTER_ZERO_PLAIN || ret == QCOW2_CLUSTER_ZERO_ALLOC) { status |= BDRV_BLOCK_ZERO; @@ -4352,7 +4354,7 @@ BlockDriver bdrv_qcow2 = { .bdrv_child_perm = bdrv_format_default_perms, .bdrv_create = qcow2_create, .bdrv_has_zero_init = bdrv_has_zero_init_1, - 
.bdrv_co_get_block_status = qcow2_co_get_block_status, + .bdrv_co_block_status = qcow2_co_block_status, .bdrv_co_preadv = qcow2_co_preadv, .bdrv_co_pwritev = qcow2_co_pwritev, From b8d739fd6fb295a7562c506c65f2137199a509f9 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Tue, 13 Feb 2018 14:26:53 -0600 Subject: [PATCH 13/38] qed: Switch to .bdrv_co_block_status() We are gradually moving away from sector-based interfaces, towards byte-based. Update the qed driver accordingly, taking the opportunity to inline qed_is_allocated_cb() into its lone caller (the callback used to be important, until we switched qed to coroutines). There is no intent to optimize based on the want_zero flag for this format. Signed-off-by: Eric Blake Reviewed-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/qed.c | 76 +++++++++++++++++------------------------------------ 1 file changed, 24 insertions(+), 52 deletions(-) diff --git a/block/qed.c b/block/qed.c index c6ff3ab015d2..a5952209261a 100644 --- a/block/qed.c +++ b/block/qed.c @@ -688,74 +688,46 @@ static int bdrv_qed_create(const char *filename, QemuOpts *opts, Error **errp) return ret; } -typedef struct { - BlockDriverState *bs; - Coroutine *co; - uint64_t pos; - int64_t status; - int *pnum; - BlockDriverState **file; -} QEDIsAllocatedCB; - -/* Called with table_lock held. 
*/ -static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t len) +static int coroutine_fn bdrv_qed_co_block_status(BlockDriverState *bs, + bool want_zero, + int64_t pos, int64_t bytes, + int64_t *pnum, int64_t *map, + BlockDriverState **file) { - QEDIsAllocatedCB *cb = opaque; - BDRVQEDState *s = cb->bs->opaque; - *cb->pnum = len / BDRV_SECTOR_SIZE; + BDRVQEDState *s = bs->opaque; + size_t len = MIN(bytes, SIZE_MAX); + int status; + QEDRequest request = { .l2_table = NULL }; + uint64_t offset; + int ret; + + qemu_co_mutex_lock(&s->table_lock); + ret = qed_find_cluster(s, &request, pos, &len, &offset); + + *pnum = len; switch (ret) { case QED_CLUSTER_FOUND: - offset |= qed_offset_into_cluster(s, cb->pos); - cb->status = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset; - *cb->file = cb->bs->file->bs; + *map = offset | qed_offset_into_cluster(s, pos); + status = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID; + *file = bs->file->bs; break; case QED_CLUSTER_ZERO: - cb->status = BDRV_BLOCK_ZERO; + status = BDRV_BLOCK_ZERO; break; case QED_CLUSTER_L2: case QED_CLUSTER_L1: - cb->status = 0; + status = 0; break; default: assert(ret < 0); - cb->status = ret; + status = ret; break; } - if (cb->co) { - aio_co_wake(cb->co); - } -} - -static int64_t coroutine_fn bdrv_qed_co_get_block_status(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, int *pnum, - BlockDriverState **file) -{ - BDRVQEDState *s = bs->opaque; - size_t len = (size_t)nb_sectors * BDRV_SECTOR_SIZE; - QEDIsAllocatedCB cb = { - .bs = bs, - .pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE, - .status = BDRV_BLOCK_OFFSET_MASK, - .pnum = pnum, - .file = file, - }; - QEDRequest request = { .l2_table = NULL }; - uint64_t offset; - int ret; - - qemu_co_mutex_lock(&s->table_lock); - ret = qed_find_cluster(s, &request, cb.pos, &len, &offset); - qed_is_allocated_cb(&cb, ret, offset, len); - - /* The callback was invoked immediately */ - assert(cb.status != BDRV_BLOCK_OFFSET_MASK); - 
qed_unref_l2_cache_entry(request.l2_table); qemu_co_mutex_unlock(&s->table_lock); - return cb.status; + return status; } static BDRVQEDState *acb_to_s(QEDAIOCB *acb) @@ -1594,7 +1566,7 @@ static BlockDriver bdrv_qed = { .bdrv_child_perm = bdrv_format_default_perms, .bdrv_create = bdrv_qed_create, .bdrv_has_zero_init = bdrv_has_zero_init_1, - .bdrv_co_get_block_status = bdrv_qed_co_get_block_status, + .bdrv_co_block_status = bdrv_qed_co_block_status, .bdrv_co_readv = bdrv_qed_co_readv, .bdrv_co_writev = bdrv_qed_co_writev, .bdrv_co_pwrite_zeroes = bdrv_qed_co_pwrite_zeroes, From d41aa7e36f442cba9c3841f87856454fca96288e Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Tue, 13 Feb 2018 14:26:54 -0600 Subject: [PATCH 14/38] raw: Switch to .bdrv_co_block_status() We are gradually moving away from sector-based interfaces, towards byte-based. Update the raw driver accordingly. Signed-off-by: Eric Blake Reviewed-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/raw-format.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/block/raw-format.c b/block/raw-format.c index ab552c095416..830243a8e486 100644 --- a/block/raw-format.c +++ b/block/raw-format.c @@ -250,17 +250,17 @@ static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset, return ret; } -static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, int *pnum, +static int coroutine_fn raw_co_block_status(BlockDriverState *bs, + bool want_zero, int64_t offset, + int64_t bytes, int64_t *pnum, + int64_t *map, BlockDriverState **file) { BDRVRawState *s = bs->opaque; - *pnum = nb_sectors; + *pnum = bytes; *file = bs->file->bs; - sector_num += s->offset / BDRV_SECTOR_SIZE; - return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID | - (sector_num << BDRV_SECTOR_BITS); + *map = offset + s->offset; + return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID; } static int coroutine_fn 
raw_co_pwrite_zeroes(BlockDriverState *bs, @@ -496,7 +496,7 @@ BlockDriver bdrv_raw = { .bdrv_co_pwritev = &raw_co_pwritev, .bdrv_co_pwrite_zeroes = &raw_co_pwrite_zeroes, .bdrv_co_pdiscard = &raw_co_pdiscard, - .bdrv_co_get_block_status = &raw_co_get_block_status, + .bdrv_co_block_status = &raw_co_block_status, .bdrv_truncate = &raw_truncate, .bdrv_getlength = &raw_getlength, .has_variable_length = true, From 47943e986539e7e78ce4010f7d721408235ad058 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Tue, 13 Feb 2018 14:26:55 -0600 Subject: [PATCH 15/38] sheepdog: Switch to .bdrv_co_block_status() We are gradually moving away from sector-based interfaces, towards byte-based. Update the sheepdog driver accordingly. Signed-off-by: Eric Blake Reviewed-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Fam Zheng Reviewed-by: Jeff Cody Signed-off-by: Kevin Wolf --- block/sheepdog.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/block/sheepdog.c b/block/sheepdog.c index ac02b10fe03d..3c3becf94df0 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -3004,19 +3004,19 @@ static coroutine_fn int sd_co_pdiscard(BlockDriverState *bs, int64_t offset, return acb.ret; } -static coroutine_fn int64_t -sd_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors, - int *pnum, BlockDriverState **file) +static coroutine_fn int +sd_co_block_status(BlockDriverState *bs, bool want_zero, int64_t offset, + int64_t bytes, int64_t *pnum, int64_t *map, + BlockDriverState **file) { BDRVSheepdogState *s = bs->opaque; SheepdogInode *inode = &s->inode; uint32_t object_size = (UINT32_C(1) << inode->block_size_shift); - uint64_t offset = sector_num * BDRV_SECTOR_SIZE; unsigned long start = offset / object_size, - end = DIV_ROUND_UP((sector_num + nb_sectors) * - BDRV_SECTOR_SIZE, object_size); + end = DIV_ROUND_UP(offset + bytes, object_size); unsigned long idx; - int64_t ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset; + *map = 
offset; + int ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID; for (idx = start; idx < end; idx++) { if (inode->data_vdi_id[idx] == 0) { @@ -3033,9 +3033,9 @@ sd_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors, } } - *pnum = (idx - start) * object_size / BDRV_SECTOR_SIZE; - if (*pnum > nb_sectors) { - *pnum = nb_sectors; + *pnum = (idx - start) * object_size; + if (*pnum > bytes) { + *pnum = bytes; } if (ret > 0 && ret & BDRV_BLOCK_OFFSET_VALID) { *file = bs; @@ -3113,7 +3113,7 @@ static BlockDriver bdrv_sheepdog = { .bdrv_co_writev = sd_co_writev, .bdrv_co_flush_to_disk = sd_co_flush_to_disk, .bdrv_co_pdiscard = sd_co_pdiscard, - .bdrv_co_get_block_status = sd_co_get_block_status, + .bdrv_co_block_status = sd_co_block_status, .bdrv_snapshot_create = sd_snapshot_create, .bdrv_snapshot_goto = sd_snapshot_goto, @@ -3149,7 +3149,7 @@ static BlockDriver bdrv_sheepdog_tcp = { .bdrv_co_writev = sd_co_writev, .bdrv_co_flush_to_disk = sd_co_flush_to_disk, .bdrv_co_pdiscard = sd_co_pdiscard, - .bdrv_co_get_block_status = sd_co_get_block_status, + .bdrv_co_block_status = sd_co_block_status, .bdrv_snapshot_create = sd_snapshot_create, .bdrv_snapshot_goto = sd_snapshot_goto, @@ -3185,7 +3185,7 @@ static BlockDriver bdrv_sheepdog_unix = { .bdrv_co_writev = sd_co_writev, .bdrv_co_flush_to_disk = sd_co_flush_to_disk, .bdrv_co_pdiscard = sd_co_pdiscard, - .bdrv_co_get_block_status = sd_co_get_block_status, + .bdrv_co_block_status = sd_co_block_status, .bdrv_snapshot_create = sd_snapshot_create, .bdrv_snapshot_goto = sd_snapshot_goto, From b80666bf844bac5ae775cb67029a3cd67ad7c730 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Tue, 13 Feb 2018 14:26:56 -0600 Subject: [PATCH 16/38] vdi: Avoid bitrot of debugging code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rework the debug define so that we always get -Wformat checking, even when debugging is disabled. 
Signed-off-by: Eric Blake Reviewed-by: Stefan Weil Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/vdi.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/block/vdi.c b/block/vdi.c index fc1c614cb122..32b1763cde08 100644 --- a/block/vdi.c +++ b/block/vdi.c @@ -87,12 +87,18 @@ #define DEFAULT_CLUSTER_SIZE (1 * MiB) #if defined(CONFIG_VDI_DEBUG) -#define logout(fmt, ...) \ - fprintf(stderr, "vdi\t%-24s" fmt, __func__, ##__VA_ARGS__) +#define VDI_DEBUG 1 #else -#define logout(fmt, ...) ((void)0) +#define VDI_DEBUG 0 #endif +#define logout(fmt, ...) \ + do { \ + if (VDI_DEBUG) { \ + fprintf(stderr, "vdi\t%-24s" fmt, __func__, ##__VA_ARGS__); \ + } \ + } while (0) + /* Image signature. */ #define VDI_SIGNATURE 0xbeda107f From 67635f6abebac7f49d0067b3e60626c857cefa06 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Tue, 13 Feb 2018 14:26:57 -0600 Subject: [PATCH 17/38] vdi: Switch to .bdrv_co_block_status() We are gradually moving away from sector-based interfaces, towards byte-based. Update the vdi driver accordingly. Note that the TODO is already covered (the block layer guarantees bounds of its requests), and that we can remove the now-unused s->block_sectors. Signed-off-by: Eric Blake Reviewed-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/vdi.c | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/block/vdi.c b/block/vdi.c index 32b1763cde08..0780c82d8291 100644 --- a/block/vdi.c +++ b/block/vdi.c @@ -172,8 +172,6 @@ typedef struct { uint32_t *bmap; /* Size of block (bytes). */ uint32_t block_size; - /* Size of block (sectors). */ - uint32_t block_sectors; /* First sector of block map. */ uint32_t bmap_sector; /* VDI header (converted to host endianness). 
*/ @@ -463,7 +461,6 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags, bs->total_sectors = header.disk_size / SECTOR_SIZE; s->block_size = header.block_size; - s->block_sectors = header.block_size / SECTOR_SIZE; s->bmap_sector = header.offset_bmap / SECTOR_SIZE; s->header = header; @@ -509,33 +506,29 @@ static int vdi_reopen_prepare(BDRVReopenState *state, return 0; } -static int64_t coroutine_fn vdi_co_get_block_status(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file) +static int coroutine_fn vdi_co_block_status(BlockDriverState *bs, + bool want_zero, + int64_t offset, int64_t bytes, + int64_t *pnum, int64_t *map, + BlockDriverState **file) { - /* TODO: Check for too large sector_num (in bdrv_is_allocated or here). */ BDRVVdiState *s = (BDRVVdiState *)bs->opaque; - size_t bmap_index = sector_num / s->block_sectors; - size_t sector_in_block = sector_num % s->block_sectors; - int n_sectors = s->block_sectors - sector_in_block; + size_t bmap_index = offset / s->block_size; + size_t index_in_block = offset % s->block_size; uint32_t bmap_entry = le32_to_cpu(s->bmap[bmap_index]); - uint64_t offset; int result; - logout("%p, %" PRId64 ", %d, %p\n", bs, sector_num, nb_sectors, pnum); - if (n_sectors > nb_sectors) { - n_sectors = nb_sectors; - } - *pnum = n_sectors; + logout("%p, %" PRId64 ", %" PRId64 ", %p\n", bs, offset, bytes, pnum); + *pnum = MIN(s->block_size - index_in_block, bytes); result = VDI_IS_ALLOCATED(bmap_entry); if (!result) { return 0; } - offset = s->header.offset_data + - (uint64_t)bmap_entry * s->block_size + - sector_in_block * SECTOR_SIZE; + *map = s->header.offset_data + (uint64_t)bmap_entry * s->block_size + + index_in_block; *file = bs->file->bs; - return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset; + return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID; } static int coroutine_fn @@ -903,7 +896,7 @@ static BlockDriver bdrv_vdi = { .bdrv_child_perm = bdrv_format_default_perms, 
.bdrv_create = vdi_create, .bdrv_has_zero_init = bdrv_has_zero_init_1, - .bdrv_co_get_block_status = vdi_co_get_block_status, + .bdrv_co_block_status = vdi_co_block_status, .bdrv_make_empty = vdi_make_empty, .bdrv_co_preadv = vdi_co_preadv, From c72080b9b8eab200551c26467511ebf599754f9e Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Tue, 13 Feb 2018 14:26:58 -0600 Subject: [PATCH 18/38] vmdk: Switch to .bdrv_co_block_status() We are gradually moving away from sector-based interfaces, towards byte-based. Update the vmdk driver accordingly. Drop the now-unused vmdk_find_index_in_cluster(). Also, fix a pre-existing bug: if find_extent() fails (unlikely, since the block layer did a bounds check), then we must return a failure, rather than 0. Signed-off-by: Eric Blake Reviewed-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/vmdk.c | 38 ++++++++++++++------------------------ 1 file changed, 14 insertions(+), 24 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index ef15ddbfd3d5..75f84213e6f6 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1304,33 +1304,27 @@ static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent, return extent_relative_offset % cluster_size; } -static inline uint64_t vmdk_find_index_in_cluster(VmdkExtent *extent, - int64_t sector_num) -{ - uint64_t offset; - offset = vmdk_find_offset_in_cluster(extent, sector_num * BDRV_SECTOR_SIZE); - return offset / BDRV_SECTOR_SIZE; -} - -static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file) +static int coroutine_fn vmdk_co_block_status(BlockDriverState *bs, + bool want_zero, + int64_t offset, int64_t bytes, + int64_t *pnum, int64_t *map, + BlockDriverState **file) { BDRVVmdkState *s = bs->opaque; int64_t index_in_cluster, n, ret; - uint64_t offset; + uint64_t cluster_offset; VmdkExtent *extent; - extent = find_extent(s, sector_num, NULL); + extent = 
find_extent(s, offset >> BDRV_SECTOR_BITS, NULL); if (!extent) { - return 0; + return -EIO; } qemu_co_mutex_lock(&s->lock); - ret = get_cluster_offset(bs, extent, NULL, - sector_num * 512, false, &offset, + ret = get_cluster_offset(bs, extent, NULL, offset, false, &cluster_offset, 0, 0); qemu_co_mutex_unlock(&s->lock); - index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num); + index_in_cluster = vmdk_find_offset_in_cluster(extent, offset); switch (ret) { case VMDK_ERROR: ret = -EIO; @@ -1345,18 +1339,14 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs, ret = BDRV_BLOCK_DATA; if (!extent->compressed) { ret |= BDRV_BLOCK_OFFSET_VALID; - ret |= (offset + (index_in_cluster << BDRV_SECTOR_BITS)) - & BDRV_BLOCK_OFFSET_MASK; + *map = cluster_offset + index_in_cluster; } *file = extent->file->bs; break; } - n = extent->cluster_sectors - index_in_cluster; - if (n > nb_sectors) { - n = nb_sectors; - } - *pnum = n; + n = extent->cluster_sectors * BDRV_SECTOR_SIZE - index_in_cluster; + *pnum = MIN(n, bytes); return ret; } @@ -2410,7 +2400,7 @@ static BlockDriver bdrv_vmdk = { .bdrv_close = vmdk_close, .bdrv_create = vmdk_create, .bdrv_co_flush_to_disk = vmdk_co_flush, - .bdrv_co_get_block_status = vmdk_co_get_block_status, + .bdrv_co_block_status = vmdk_co_block_status, .bdrv_get_allocated_file_size = vmdk_get_allocated_file_size, .bdrv_has_zero_init = vmdk_has_zero_init, .bdrv_get_specific_info = vmdk_get_specific_info, From 2f83673b57ba6b408222d5a18b48d34eb35a54b3 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Tue, 13 Feb 2018 14:26:59 -0600 Subject: [PATCH 19/38] vpc: Switch to .bdrv_co_block_status() We are gradually moving away from sector-based interfaces, towards byte-based. Update the vpc driver accordingly. 
Signed-off-by: Eric Blake Reviewed-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/vpc.c | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/block/vpc.c b/block/vpc.c index cfa5144e8678..fba4492fd7b0 100644 --- a/block/vpc.c +++ b/block/vpc.c @@ -706,53 +706,54 @@ vpc_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, return ret; } -static int64_t coroutine_fn vpc_co_get_block_status(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file) +static int coroutine_fn vpc_co_block_status(BlockDriverState *bs, + bool want_zero, + int64_t offset, int64_t bytes, + int64_t *pnum, int64_t *map, + BlockDriverState **file) { BDRVVPCState *s = bs->opaque; VHDFooter *footer = (VHDFooter*) s->footer_buf; - int64_t start, offset; + int64_t image_offset; bool allocated; - int64_t ret; - int n; + int ret; + int64_t n; if (be32_to_cpu(footer->type) == VHD_FIXED) { - *pnum = nb_sectors; + *pnum = bytes; + *map = offset; *file = bs->file->bs; - return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID | - (sector_num << BDRV_SECTOR_BITS); + return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID; } qemu_co_mutex_lock(&s->lock); - offset = get_image_offset(bs, sector_num << BDRV_SECTOR_BITS, false, NULL); - start = offset; - allocated = (offset != -1); + image_offset = get_image_offset(bs, offset, false, NULL); + allocated = (image_offset != -1); *pnum = 0; ret = 0; do { /* All sectors in a block are contiguous (without using the bitmap) */ - n = ROUND_UP(sector_num + 1, s->block_size / BDRV_SECTOR_SIZE) - - sector_num; - n = MIN(n, nb_sectors); + n = ROUND_UP(offset + 1, s->block_size) - offset; + n = MIN(n, bytes); *pnum += n; - sector_num += n; - nb_sectors -= n; + offset += n; + bytes -= n; /* *pnum can't be greater than one block for allocated * sectors since there is always a bitmap in between. 
*/ if (allocated) { *file = bs->file->bs; - ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start; + *map = image_offset; + ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID; break; } - if (nb_sectors == 0) { + if (bytes == 0) { break; } - offset = get_image_offset(bs, sector_num << BDRV_SECTOR_BITS, false, - NULL); - } while (offset == -1); + image_offset = get_image_offset(bs, offset, false, NULL); + } while (image_offset == -1); qemu_co_mutex_unlock(&s->lock); return ret; @@ -1098,7 +1099,7 @@ static BlockDriver bdrv_vpc = { .bdrv_co_preadv = vpc_co_preadv, .bdrv_co_pwritev = vpc_co_pwritev, - .bdrv_co_get_block_status = vpc_co_get_block_status, + .bdrv_co_block_status = vpc_co_block_status, .bdrv_get_info = vpc_get_info, From fba3998dae0b56f30aff3f3ad3a5cbc3502af0c6 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Tue, 13 Feb 2018 14:27:00 -0600 Subject: [PATCH 20/38] vvfat: Switch to .bdrv_co_block_status() We are gradually moving away from sector-based interfaces, towards byte-based. Update the vvfat driver accordingly. Note that we can rely on the block driver having already clamped limits to our block size, and simplify accordingly. 
Signed-off-by: Eric Blake Reviewed-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/vvfat.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/block/vvfat.c b/block/vvfat.c index 7e06ebacf617..4a17a49e1282 100644 --- a/block/vvfat.c +++ b/block/vvfat.c @@ -3088,15 +3088,13 @@ vvfat_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, return ret; } -static int64_t coroutine_fn vvfat_co_get_block_status(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, int *n, BlockDriverState **file) +static int coroutine_fn vvfat_co_block_status(BlockDriverState *bs, + bool want_zero, int64_t offset, + int64_t bytes, int64_t *n, + int64_t *map, + BlockDriverState **file) { - *n = bs->total_sectors - sector_num; - if (*n > nb_sectors) { - *n = nb_sectors; - } else if (*n < 0) { - return 0; - } + *n = bytes; return BDRV_BLOCK_DATA; } @@ -3257,7 +3255,7 @@ static BlockDriver bdrv_vvfat = { .bdrv_co_preadv = vvfat_co_preadv, .bdrv_co_pwritev = vvfat_co_pwritev, - .bdrv_co_get_block_status = vvfat_co_get_block_status, + .bdrv_co_block_status = vvfat_co_block_status, }; static void bdrv_vvfat_init(void) From 636cb5125823a75ef6bbc9d63f8d7890576fdfec Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Tue, 13 Feb 2018 14:27:01 -0600 Subject: [PATCH 21/38] block: Drop unused .bdrv_co_get_block_status() We are gradually moving away from sector-based interfaces, towards byte-based. Now that all drivers have been updated to provide the byte-based .bdrv_co_block_status(), we can delete the sector-based interface. 
Signed-off-by: Eric Blake Reviewed-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/io.c | 50 ++++++++------------------------------- include/block/block_int.h | 3 --- 2 files changed, 10 insertions(+), 43 deletions(-) diff --git a/block/io.c b/block/io.c index 5bae79f282e5..4c3dba09730d 100644 --- a/block/io.c +++ b/block/io.c @@ -1963,7 +1963,7 @@ static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs, /* Must be non-NULL or bdrv_getlength() would have failed */ assert(bs->drv); - if (!bs->drv->bdrv_co_get_block_status && !bs->drv->bdrv_co_block_status) { + if (!bs->drv->bdrv_co_block_status) { *pnum = bytes; ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; if (offset + bytes == total_size) { @@ -1981,53 +1981,23 @@ static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs, /* Round out to request_alignment boundaries */ align = bs->bl.request_alignment; - if (bs->drv->bdrv_co_get_block_status && align < BDRV_SECTOR_SIZE) { - align = BDRV_SECTOR_SIZE; - } aligned_offset = QEMU_ALIGN_DOWN(offset, align); aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset; - if (bs->drv->bdrv_co_get_block_status) { - int count; /* sectors */ - int64_t longret; - - assert(QEMU_IS_ALIGNED(aligned_offset | aligned_bytes, - BDRV_SECTOR_SIZE)); - /* - * The contract allows us to return pnum smaller than bytes, even - * if the next query would see the same status; we truncate the - * request to avoid overflowing the driver's 32-bit interface. 
- */ - longret = bs->drv->bdrv_co_get_block_status( - bs, aligned_offset >> BDRV_SECTOR_BITS, - MIN(INT_MAX, aligned_bytes) >> BDRV_SECTOR_BITS, &count, - &local_file); - if (longret < 0) { - assert(INT_MIN <= longret); - ret = longret; - goto out; - } - if (longret & BDRV_BLOCK_OFFSET_VALID) { - local_map = longret & BDRV_BLOCK_OFFSET_MASK; - } - ret = longret & ~BDRV_BLOCK_OFFSET_MASK; - *pnum = count * BDRV_SECTOR_SIZE; - } else { - ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset, - aligned_bytes, pnum, &local_map, - &local_file); - if (ret < 0) { - *pnum = 0; - goto out; - } - assert(*pnum); /* The block driver must make progress */ + ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset, + aligned_bytes, pnum, &local_map, + &local_file); + if (ret < 0) { + *pnum = 0; + goto out; } /* - * The driver's result must be a multiple of request_alignment. + * The driver's result must be a non-zero multiple of request_alignment. * Clamp pnum and adjust map to original request. */ - assert(QEMU_IS_ALIGNED(*pnum, align) && align > offset - aligned_offset); + assert(*pnum && QEMU_IS_ALIGNED(*pnum, align) && + align > offset - aligned_offset); *pnum -= offset - aligned_offset; if (*pnum > bytes) { *pnum = bytes; diff --git a/include/block/block_int.h b/include/block/block_int.h index bf2598856cf1..5ae7738cf8df 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -215,9 +215,6 @@ struct BlockDriver { * as well as non-NULL pnum, map, and file; in turn, the driver * must return an error or set pnum to an aligned non-zero value. 
*/ - int64_t coroutine_fn (*bdrv_co_get_block_status)(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, int *pnum, - BlockDriverState **file); int coroutine_fn (*bdrv_co_block_status)(BlockDriverState *bs, bool want_zero, int64_t offset, int64_t bytes, int64_t *pnum, int64_t *map, BlockDriverState **file); From 18a59f03c36214105477e9bbaed5a99355718d27 Mon Sep 17 00:00:00 2001 From: Anton Nefedov Date: Wed, 14 Feb 2018 19:09:20 +0300 Subject: [PATCH 22/38] block: fix write with zero flag set and iovector provided The normal bdrv_co_pwritev() use is either - BDRV_REQ_ZERO_WRITE clear and iovector provided - BDRV_REQ_ZERO_WRITE set and iovector == NULL while - the flag clear and iovector == NULL is an assertion failure in bdrv_co_do_zero_pwritev() - the flag set and iovector provided is in fact allowed (the flag prevails and zeroes are written) However the alignment logic does not support the latter case so the padding areas get overwritten with zeroes. Currently, general functions like bdrv_rw_co() do provide iovector regardless of flags. So, keep it supported and use bdrv_co_do_zero_pwritev() alignment for it which also makes the code a bit more obvious anyway. 
Signed-off-by: Anton Nefedov Reviewed-by: Eric Blake Reviewed-by: Alberto Garcia Signed-off-by: Kevin Wolf --- block/io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/io.c b/block/io.c index 4c3dba09730d..4d3d1f640a39 100644 --- a/block/io.c +++ b/block/io.c @@ -1701,7 +1701,7 @@ int coroutine_fn bdrv_co_pwritev(BdrvChild *child, */ tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE); - if (!qiov) { + if (flags & BDRV_REQ_ZERO_WRITE) { ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req); goto out; } From afe35cde6c2f3bdcee6e1f878836a64174fccb17 Mon Sep 17 00:00:00 2001 From: Anton Nefedov Date: Wed, 14 Feb 2018 19:09:19 +0300 Subject: [PATCH 23/38] iotest 033: add misaligned write-zeroes test via truncate This new test case only makes sense for qcow2 while iotest 033 is generic; however it matches the test purpose perfectly and also 033 contains those do_test() tricks to pass the alignment, which won't look nice being duplicated in other tests or moved to the common code. 
Signed-off-by: Anton Nefedov Signed-off-by: Kevin Wolf --- tests/qemu-iotests/033 | 29 +++++++++++++++++++++++++++++ tests/qemu-iotests/033.out | 13 +++++++++++++ 2 files changed, 42 insertions(+) diff --git a/tests/qemu-iotests/033 b/tests/qemu-iotests/033 index 2cdfd1397aff..a1d8357331d9 100755 --- a/tests/qemu-iotests/033 +++ b/tests/qemu-iotests/033 @@ -64,6 +64,9 @@ do_test() } | $QEMU_IO $IO_EXTRA_ARGS } +echo +echo "=== Test aligned and misaligned write zeroes operations ===" + for write_zero_cmd in "write -z" "aio_write -z"; do for align in 512 4k; do echo @@ -102,7 +105,33 @@ for align in 512 4k; do done done + +# Trigger truncate that would shrink qcow2 L1 table, which is done by +# clearing one entry (8 bytes) with bdrv_co_pwrite_zeroes() + +echo +echo "=== Test misaligned write zeroes via truncate ===" +echo + +# any size will do, but the smaller the size the smaller the required image +CLUSTER_SIZE=$((4 * 1024)) +L2_COVERAGE=$(($CLUSTER_SIZE * $CLUSTER_SIZE / 8)) +_make_test_img $(($L2_COVERAGE * 2)) + +do_test 512 "write -P 1 0 0x200" "$TEST_IMG" | _filter_qemu_io +# next L2 table +do_test 512 "write -P 1 $L2_COVERAGE 0x200" "$TEST_IMG" | _filter_qemu_io + +# only interested in qcow2 here; also other formats might respond with +# "not supported" error message +if [ $IMGFMT = "qcow2" ]; then + do_test 512 "truncate $L2_COVERAGE" "$TEST_IMG" | _filter_qemu_io +fi + +do_test 512 "read -P 1 0 0x200" "$TEST_IMG" | _filter_qemu_io + # success, all done +echo echo "*** done" rm -f $seq.full status=0 diff --git a/tests/qemu-iotests/033.out b/tests/qemu-iotests/033.out index 95929eff703f..9683f6b29049 100644 --- a/tests/qemu-iotests/033.out +++ b/tests/qemu-iotests/033.out @@ -1,6 +1,8 @@ QA output created by 033 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 +=== Test aligned and misaligned write zeroes operations === + == preparing image == wrote 1024/1024 bytes at offset 512 1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) @@ -164,4 +166,15 
@@ read 512/512 bytes at offset 512 read 3072/3072 bytes at offset 1024 3 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + +=== Test misaligned write zeroes via truncate === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=4194304 +wrote 512/512 bytes at offset 0 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 512/512 bytes at offset 2097152 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 0 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + *** done From 156b46ded3853dfc6b34c5afae019ff61798491b Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Wed, 21 Feb 2018 16:08:49 +0200 Subject: [PATCH 24/38] specs/qcow2: Fix documentation of the compressed cluster descriptor This patch fixes several mistakes in the documentation of the compressed cluster descriptor: 1) the documentation claims that the cluster descriptor contains the number of sectors used to store the compressed data, but what it actually contains is the number of sectors *minus one* or, in other words, the number of additional sectors after the first one. 2) the width of the fields is incorrectly specified. The number of bits used by each field is x = 62 - (cluster_bits - 8) for the offset field y = (cluster_bits - 8) for the size field So the offset field's location is [0, x-1], not [0, x] as stated. 3) the size field does not contain the size of the compressed data, but rather the number of sectors where that data is stored. The compressed data starts at the exact point specified in the offset field and ends when there's enough data to produce a cluster of decompressed data. Both points can be in the middle of a sector, allowing several compressed clusters to be stored next to one another, sharing sectors if necessary. 
Cc: qemu-stable@nongnu.org Signed-off-by: Alberto Garcia Reviewed-by: Eric Blake Signed-off-by: Kevin Wolf --- docs/interop/qcow2.txt | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/docs/interop/qcow2.txt b/docs/interop/qcow2.txt index d7fdb1fee313..feb711fb6a88 100644 --- a/docs/interop/qcow2.txt +++ b/docs/interop/qcow2.txt @@ -426,10 +426,20 @@ Standard Cluster Descriptor: Compressed Clusters Descriptor (x = 62 - (cluster_bits - 8)): - Bit 0 - x: Host cluster offset. This is usually _not_ aligned to a - cluster boundary! + Bit 0 - x-1: Host cluster offset. This is usually _not_ aligned to a + cluster or sector boundary! - x+1 - 61: Compressed size of the images in sectors of 512 bytes + x - 61: Number of additional 512-byte sectors used for the + compressed data, beyond the sector containing the offset + in the previous field. Some of these sectors may reside + in the next contiguous host cluster. + + Note that the compressed data does not necessarily occupy + all of the bytes in the final sector; rather, decompression + stops when it has produced a cluster of data. + + Another compressed cluster may map to the tail of the final + sector used by this compressed cluster. If a cluster is unallocated, read requests shall read the data from the backing file (except if bit 0 in the Standard Cluster Descriptor is set). If there is From be820971ff6f88c156233eec8a50cdf8c70c0938 Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Mon, 19 Feb 2018 16:54:59 +0200 Subject: [PATCH 25/38] docs: document how to use the l2-cache-entry-size parameter This patch updates docs/qcow2-cache.txt explaining how to use the new l2-cache-entry-size parameter. 
Here's a more detailed technical description of this feature: https://lists.gnu.org/archive/html/qemu-block/2017-09/msg00635.html And here are some performance numbers: https://lists.gnu.org/archive/html/qemu-block/2017-12/msg00507.html Signed-off-by: Alberto Garcia Reviewed-by: Eric Blake Signed-off-by: Kevin Wolf --- docs/qcow2-cache.txt | 46 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 3 deletions(-) diff --git a/docs/qcow2-cache.txt b/docs/qcow2-cache.txt index b0571de4b826..170191a242bf 100644 --- a/docs/qcow2-cache.txt +++ b/docs/qcow2-cache.txt @@ -1,6 +1,6 @@ qcow2 L2/refcount cache configuration ===================================== -Copyright (C) 2015 Igalia, S.L. +Copyright (C) 2015, 2018 Igalia, S.L. Author: Alberto Garcia This work is licensed under the terms of the GNU GPL, version 2 or @@ -118,8 +118,8 @@ There are three options available, and all of them take bytes: There are two things that need to be taken into account: - - Both caches must have a size that is a multiple of the cluster - size. + - Both caches must have a size that is a multiple of the cluster size + (or the cache entry size: see "Using smaller cache entries" below). - If you only set one of the options above, QEMU will automatically adjust the others so that the L2 cache is 4 times bigger than the @@ -143,6 +143,46 @@ much less often than the L2 cache, so it's perfectly reasonable to keep it small. +Using smaller cache entries +--------------------------- +The qcow2 L2 cache stores complete tables by default. This means that +if QEMU needs an entry from an L2 table then the whole table is read +from disk and is kept in the cache. If the cache is full then a +complete table needs to be evicted first. + +This can be inefficient with large cluster sizes since it results in +more disk I/O and wastes more cache memory. + +Since QEMU 2.12 you can change the size of the L2 cache entry and make +it smaller than the cluster size.
This can be configured using the +"l2-cache-entry-size" parameter: + + -drive file=hd.qcow2,l2-cache-size=2097152,l2-cache-entry-size=4096 + +Some things to take into account: + + - The L2 cache entry size has the same restrictions as the cluster + size (power of two, at least 512 bytes). + + - Smaller entry sizes generally improve the cache efficiency and make + disk I/O faster. This is particularly true with solid state drives + so it's a good idea to reduce the entry size in those cases. With + rotating hard drives the situation is a bit more complicated so you + should test it first and stay with the default size if unsure. + + - Try different entry sizes to see which one gives faster performance + in your case. The block size of the host filesystem is generally a + good default (usually 4096 bytes in the case of ext4). + + - Only the L2 cache can be configured this way. The refcount cache + always uses the cluster size as the entry size. + + - If the L2 cache is big enough to hold all of the image's L2 tables + (as explained in the "Choosing the right cache sizes" section + earlier in this document) then none of this is necessary and you + can omit the "l2-cache-entry-size" parameter altogether. + + Reducing the memory usage ------------------------- It is possible to clean unused cache entries in order to reduce the From d2b63ba8dd20c1091b3f1033e6a95ef95b18149d Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Fri, 16 Feb 2018 16:50:11 +0000 Subject: [PATCH 26/38] aio: rename aio_context_in_iothread() to in_aio_context_home_thread() The name aio_context_in_iothread() is misleading because it also returns true when called on the main AioContext from the main loop thread, which is not an IOThread. This patch renames it to in_aio_context_home_thread() and expands the doc comment to make the semantics clearer. 
Signed-off-by: Stefan Hajnoczi Reviewed-by: Eric Blake Signed-off-by: Kevin Wolf --- include/block/aio.h | 7 +++++-- include/block/block.h | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/include/block/aio.h b/include/block/aio.h index e9aeeaec9448..a1d6b9e24939 100644 --- a/include/block/aio.h +++ b/include/block/aio.h @@ -534,11 +534,14 @@ void aio_co_enter(AioContext *ctx, struct Coroutine *co); AioContext *qemu_get_current_aio_context(void); /** + * in_aio_context_home_thread: * @ctx: the aio context * - * Return whether we are running in the I/O thread that manages @ctx. + * Return whether we are running in the thread that normally runs @ctx. Note + * that acquiring/releasing ctx does not affect the outcome, each AioContext + * still only has one home thread that is responsible for running it. */ -static inline bool aio_context_in_iothread(AioContext *ctx) +static inline bool in_aio_context_home_thread(AioContext *ctx) { return ctx == qemu_get_current_aio_context(); } diff --git a/include/block/block.h b/include/block/block.h index 947e8876cdd7..bc41ed253b5d 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -372,7 +372,7 @@ void bdrv_drain_all(void); bool busy_ = true; \ BlockDriverState *bs_ = (bs); \ AioContext *ctx_ = bdrv_get_aio_context(bs_); \ - if (aio_context_in_iothread(ctx_)) { \ + if (in_aio_context_home_thread(ctx_)) { \ while ((cond) || busy_) { \ busy_ = aio_poll(ctx_, (cond)); \ waited_ |= !!(cond) | busy_; \ From 7719f3c968c59e1bcda7e177679dc765b59e578f Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Fri, 16 Feb 2018 16:50:12 +0000 Subject: [PATCH 27/38] block: extract AIO_WAIT_WHILE() from BlockDriverState BlockDriverState has the BDRV_POLL_WHILE() macro to wait on event loop activity while a condition evaluates to true. This is used to implement synchronous operations where it acts as a condvar between the IOThread running the operation and the main loop waiting for the operation. 
It can also be called from the thread that owns the AioContext and in that case it's just a nested event loop. BlockBackend needs this behavior but doesn't always have a BlockDriverState it can use. This patch extracts BDRV_POLL_WHILE() into the AioWait abstraction, which can be used with AioContext and isn't tied to BlockDriverState anymore. This feature could be built directly into AioContext but then all users would kick the event loop even if they signal different conditions. Imagine an AioContext with many BlockDriverStates, each time a request completes any waiter would wake up and re-check their condition. It's nicer to keep a separate AioWait object for each condition instead. Please see "block/aio-wait.h" for details on the API. The name AIO_WAIT_WHILE() avoids the confusion between AIO_POLL_WHILE() and AioContext polling. Signed-off-by: Stefan Hajnoczi Reviewed-by: Eric Blake Signed-off-by: Kevin Wolf --- block.c | 5 ++ block/io.c | 10 +--- include/block/aio-wait.h | 116 ++++++++++++++++++++++++++++++++++++++ include/block/block.h | 40 +++---------- include/block/block_int.h | 7 +-- util/Makefile.objs | 2 +- util/aio-wait.c | 40 +++++++++++++ 7 files changed, 174 insertions(+), 46 deletions(-) create mode 100644 include/block/aio-wait.h create mode 100644 util/aio-wait.c diff --git a/block.c b/block.c index 814e5a02da69..9e4da812138e 100644 --- a/block.c +++ b/block.c @@ -4716,6 +4716,11 @@ AioContext *bdrv_get_aio_context(BlockDriverState *bs) return bs->aio_context; } +AioWait *bdrv_get_aio_wait(BlockDriverState *bs) +{ + return bs ? 
&bs->wait : NULL; +} + void bdrv_coroutine_enter(BlockDriverState *bs, Coroutine *co) { aio_co_enter(bdrv_get_aio_context(bs), co); diff --git a/block/io.c b/block/io.c index 4d3d1f640a39..2b09c656d089 100644 --- a/block/io.c +++ b/block/io.c @@ -25,6 +25,7 @@ #include "qemu/osdep.h" #include "trace.h" #include "sysemu/block-backend.h" +#include "block/aio-wait.h" #include "block/blockjob.h" #include "block/blockjob_int.h" #include "block/block_int.h" @@ -587,16 +588,9 @@ void bdrv_inc_in_flight(BlockDriverState *bs) atomic_inc(&bs->in_flight); } -static void dummy_bh_cb(void *opaque) -{ -} - void bdrv_wakeup(BlockDriverState *bs) { - /* The barrier (or an atomic op) is in the caller. */ - if (atomic_read(&bs->wakeup)) { - aio_bh_schedule_oneshot(qemu_get_aio_context(), dummy_bh_cb, NULL); - } + aio_wait_kick(bdrv_get_aio_wait(bs)); } void bdrv_dec_in_flight(BlockDriverState *bs) diff --git a/include/block/aio-wait.h b/include/block/aio-wait.h new file mode 100644 index 000000000000..a48c744fa874 --- /dev/null +++ b/include/block/aio-wait.h @@ -0,0 +1,116 @@ +/* + * AioContext wait support + * + * Copyright (C) 2018 Red Hat, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef QEMU_AIO_WAIT_H +#define QEMU_AIO_WAIT_H + +#include "block/aio.h" + +/** + * AioWait: + * + * An object that facilitates synchronous waiting on a condition. The main + * loop can wait on an operation running in an IOThread as follows: + * + * AioWait *wait = ...; + * AioContext *ctx = ...; + * MyWork work = { .done = false }; + * schedule_my_work_in_iothread(ctx, &work); + * AIO_WAIT_WHILE(wait, ctx, !work.done); + * + * The IOThread must call aio_wait_kick() to notify the main loop when + * work.done changes: + * + * static void do_work(...) + * { + * ... + * work.done = true; + * aio_wait_kick(wait); + * } + */ +typedef struct { + /* Is the main loop waiting for a kick? Accessed with atomic ops. */ + bool need_kick; +} AioWait; + +/** + * AIO_WAIT_WHILE: + * @wait: the aio wait object + * @ctx: the aio context + * @cond: wait while this conditional expression is true + * + * Wait while a condition is true. Use this to implement synchronous + * operations that require event loop activity. + * + * The caller must be sure that something calls aio_wait_kick() when the value + * of @cond might have changed. + * + * The caller's thread must be the IOThread that owns @ctx or the main loop + * thread (with @ctx acquired exactly once). This function cannot be used to + * wait on conditions between two IOThreads since that could lead to deadlock, + * go via the main loop instead. 
+ */ +#define AIO_WAIT_WHILE(wait, ctx, cond) ({ \ + bool waited_ = false; \ + bool busy_ = true; \ + AioWait *wait_ = (wait); \ + AioContext *ctx_ = (ctx); \ + if (in_aio_context_home_thread(ctx_)) { \ + while ((cond) || busy_) { \ + busy_ = aio_poll(ctx_, (cond)); \ + waited_ |= !!(cond) | busy_; \ + } \ + } else { \ + assert(qemu_get_current_aio_context() == \ + qemu_get_aio_context()); \ + assert(!wait_->need_kick); \ + /* Set wait_->need_kick before evaluating cond. */ \ + atomic_mb_set(&wait_->need_kick, true); \ + while (busy_) { \ + if ((cond)) { \ + waited_ = busy_ = true; \ + aio_context_release(ctx_); \ + aio_poll(qemu_get_aio_context(), true); \ + aio_context_acquire(ctx_); \ + } else { \ + busy_ = aio_poll(ctx_, false); \ + waited_ |= busy_; \ + } \ + } \ + atomic_set(&wait_->need_kick, false); \ + } \ + waited_; }) + +/** + * aio_wait_kick: + * @wait: the aio wait object that should re-evaluate its condition + * + * Wake up the main thread if it is waiting on AIO_WAIT_WHILE(). During + * synchronous operations performed in an IOThread, the main thread lets the + * IOThread's event loop run, waiting for the operation to complete. A + * aio_wait_kick() call will wake up the main thread. 
+ */ +void aio_wait_kick(AioWait *wait); + +#endif /* QEMU_AIO_WAIT */ diff --git a/include/block/block.h b/include/block/block.h index bc41ed253b5d..4a80a2acd488 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -2,6 +2,7 @@ #define BLOCK_H #include "block/aio.h" +#include "block/aio-wait.h" #include "qapi-types.h" #include "qemu/iov.h" #include "qemu/coroutine.h" @@ -367,41 +368,14 @@ void bdrv_drain_all_begin(void); void bdrv_drain_all_end(void); void bdrv_drain_all(void); +/* Returns NULL when bs == NULL */ +AioWait *bdrv_get_aio_wait(BlockDriverState *bs); + #define BDRV_POLL_WHILE(bs, cond) ({ \ - bool waited_ = false; \ - bool busy_ = true; \ BlockDriverState *bs_ = (bs); \ - AioContext *ctx_ = bdrv_get_aio_context(bs_); \ - if (in_aio_context_home_thread(ctx_)) { \ - while ((cond) || busy_) { \ - busy_ = aio_poll(ctx_, (cond)); \ - waited_ |= !!(cond) | busy_; \ - } \ - } else { \ - assert(qemu_get_current_aio_context() == \ - qemu_get_aio_context()); \ - /* Ask bdrv_dec_in_flight to wake up the main \ - * QEMU AioContext. Extra I/O threads never take \ - * other I/O threads' AioContexts (see for example \ - * block_job_defer_to_main_loop for how to do it). \ - */ \ - assert(!bs_->wakeup); \ - /* Set bs->wakeup before evaluating cond. 
*/ \ - atomic_mb_set(&bs_->wakeup, true); \ - while (busy_) { \ - if ((cond)) { \ - waited_ = busy_ = true; \ - aio_context_release(ctx_); \ - aio_poll(qemu_get_aio_context(), true); \ - aio_context_acquire(ctx_); \ - } else { \ - busy_ = aio_poll(ctx_, false); \ - waited_ |= busy_; \ - } \ - } \ - atomic_set(&bs_->wakeup, false); \ - } \ - waited_; }) + AIO_WAIT_WHILE(bdrv_get_aio_wait(bs_), \ + bdrv_get_aio_context(bs_), \ + cond); }) int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int bytes); int bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes); diff --git a/include/block/block_int.h b/include/block/block_int.h index 5ae7738cf8df..aef10296b078 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -26,6 +26,7 @@ #include "block/accounting.h" #include "block/block.h" +#include "block/aio-wait.h" #include "qemu/queue.h" #include "qemu/coroutine.h" #include "qemu/stats64.h" @@ -716,10 +717,8 @@ struct BlockDriverState { unsigned int in_flight; unsigned int serialising_in_flight; - /* Internal to BDRV_POLL_WHILE and bdrv_wakeup. Accessed with atomic - * ops. - */ - bool wakeup; + /* Kicked to signal main loop when a request completes. */ + AioWait wait; /* counter for nested bdrv_io_plug. * Accessed with atomic ops. 
diff --git a/util/Makefile.objs b/util/Makefile.objs index 3fb611631fe2..ae90b9963da8 100644 --- a/util/Makefile.objs +++ b/util/Makefile.objs @@ -1,7 +1,7 @@ util-obj-y = osdep.o cutils.o unicode.o qemu-timer-common.o util-obj-y += bufferiszero.o util-obj-y += lockcnt.o -util-obj-y += aiocb.o async.o thread-pool.o qemu-timer.o +util-obj-y += aiocb.o async.o aio-wait.o thread-pool.o qemu-timer.o util-obj-y += main-loop.o iohandler.o util-obj-$(CONFIG_POSIX) += aio-posix.o util-obj-$(CONFIG_POSIX) += compatfd.o diff --git a/util/aio-wait.c b/util/aio-wait.c new file mode 100644 index 000000000000..a487cdb8523a --- /dev/null +++ b/util/aio-wait.c @@ -0,0 +1,40 @@ +/* + * AioContext wait support + * + * Copyright (C) 2018 Red Hat, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "qemu/osdep.h" +#include "qemu/main-loop.h" +#include "block/aio-wait.h" + +static void dummy_bh_cb(void *opaque) +{ + /* The point is to make AIO_WAIT_WHILE()'s aio_poll() return */ +} + +void aio_wait_kick(AioWait *wait) +{ + /* The barrier (or an atomic op) is in the caller. */ + if (atomic_read(&wait->need_kick)) { + aio_bh_schedule_oneshot(qemu_get_aio_context(), dummy_bh_cb, NULL); + } +} From 33f2a7577787910bda161f428c904ac6a14b2454 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Fri, 16 Feb 2018 16:50:13 +0000 Subject: [PATCH 28/38] block: add BlockBackend->in_flight counter BlockBackend currently relies on BlockDriverState->in_flight to track requests for blk_drain(). There is a corner case where BlockDriverState->in_flight cannot be used though: blk->root can be NULL when there is no medium. This results in a segfault when the NULL pointer is dereferenced. Introduce a BlockBackend->in_flight counter for aio requests so it works even when blk->root == NULL. Based on a patch by Kevin Wolf . Signed-off-by: Kevin Wolf Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block.c | 2 +- block/block-backend.c | 60 ++++++++++++++++++++++++++++++++++++++----- 2 files changed, 54 insertions(+), 8 deletions(-) diff --git a/block.c b/block.c index 9e4da812138e..a83037c2a5b4 100644 --- a/block.c +++ b/block.c @@ -4713,7 +4713,7 @@ void bdrv_img_create(const char *filename, const char *fmt, AioContext *bdrv_get_aio_context(BlockDriverState *bs) { - return bs->aio_context; + return bs ? bs->aio_context : qemu_get_aio_context(); } AioWait *bdrv_get_aio_wait(BlockDriverState *bs) diff --git a/block/block-backend.c b/block/block-backend.c index 0266ac990b3f..a775a3dd2f73 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -73,6 +73,14 @@ struct BlockBackend { int quiesce_counter; VMChangeStateEntry *vmsh; bool force_allow_inactivate; + + /* Number of in-flight aio requests. 
BlockDriverState also counts + * in-flight requests but aio requests can exist even when blk->root is + * NULL, so we cannot rely on its counter for that case. + * Accessed with atomic ops. + */ + unsigned int in_flight; + AioWait wait; }; typedef struct BlockBackendAIOCB { @@ -1225,11 +1233,22 @@ int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags) return bdrv_make_zero(blk->root, flags); } +static void blk_inc_in_flight(BlockBackend *blk) +{ + atomic_inc(&blk->in_flight); +} + +static void blk_dec_in_flight(BlockBackend *blk) +{ + atomic_dec(&blk->in_flight); + aio_wait_kick(&blk->wait); +} + static void error_callback_bh(void *opaque) { struct BlockBackendAIOCB *acb = opaque; - bdrv_dec_in_flight(acb->common.bs); + blk_dec_in_flight(acb->blk); acb->common.cb(acb->common.opaque, acb->ret); qemu_aio_unref(acb); } @@ -1240,7 +1259,7 @@ BlockAIOCB *blk_abort_aio_request(BlockBackend *blk, { struct BlockBackendAIOCB *acb; - bdrv_inc_in_flight(blk_bs(blk)); + blk_inc_in_flight(blk); acb = blk_aio_get(&block_backend_aiocb_info, blk, cb, opaque); acb->blk = blk; acb->ret = ret; @@ -1263,7 +1282,7 @@ static const AIOCBInfo blk_aio_em_aiocb_info = { static void blk_aio_complete(BlkAioEmAIOCB *acb) { if (acb->has_returned) { - bdrv_dec_in_flight(acb->common.bs); + blk_dec_in_flight(acb->rwco.blk); acb->common.cb(acb->common.opaque, acb->rwco.ret); qemu_aio_unref(acb); } @@ -1284,7 +1303,7 @@ static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset, int bytes, BlkAioEmAIOCB *acb; Coroutine *co; - bdrv_inc_in_flight(blk_bs(blk)); + blk_inc_in_flight(blk); acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque); acb->rwco = (BlkRwCo) { .blk = blk, @@ -1521,14 +1540,41 @@ int blk_flush(BlockBackend *blk) void blk_drain(BlockBackend *blk) { - if (blk_bs(blk)) { - bdrv_drain(blk_bs(blk)); + BlockDriverState *bs = blk_bs(blk); + + if (bs) { + bdrv_drained_begin(bs); + } + + /* We may have -ENOMEDIUM completions in flight */ + AIO_WAIT_WHILE(&blk->wait, + 
blk_get_aio_context(blk), + atomic_mb_read(&blk->in_flight) > 0); + + if (bs) { + bdrv_drained_end(bs); } } void blk_drain_all(void) { - bdrv_drain_all(); + BlockBackend *blk = NULL; + + bdrv_drain_all_begin(); + + while ((blk = blk_all_next(blk)) != NULL) { + AioContext *ctx = blk_get_aio_context(blk); + + aio_context_acquire(ctx); + + /* We may have -ENOMEDIUM completions in flight */ + AIO_WAIT_WHILE(&blk->wait, ctx, + atomic_mb_read(&blk->in_flight) > 0); + + aio_context_release(ctx); + } + + bdrv_drain_all_end(); } void blk_set_on_error(BlockBackend *blk, BlockdevOnError on_read_error, From ad0df3e0fdac1d28e99ad29b99540d9f125e7ccf Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 16 Feb 2018 16:50:14 +0000 Subject: [PATCH 29/38] block: test blk_aio_flush() with blk->root == NULL This patch adds test cases for the scenario where blk_aio_flush() is called on a BlockBackend with no root. Calling drain afterwards should complete the requests with -ENOMEDIUM. Signed-off-by: Kevin Wolf Signed-off-by: Stefan Hajnoczi Reviewed-by: Eric Blake Signed-off-by: Kevin Wolf --- tests/Makefile.include | 2 + tests/test-block-backend.c | 82 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+) create mode 100644 tests/test-block-backend.c diff --git a/tests/Makefile.include b/tests/Makefile.include index 937cbd874a03..b5aab848b3ea 100644 --- a/tests/Makefile.include +++ b/tests/Makefile.include @@ -83,6 +83,7 @@ gcov-files-test-hbitmap-y = blockjob.c check-unit-y += tests/test-bdrv-drain$(EXESUF) check-unit-y += tests/test-blockjob$(EXESUF) check-unit-y += tests/test-blockjob-txn$(EXESUF) +check-unit-y += tests/test-block-backend$(EXESUF) check-unit-y += tests/test-x86-cpuid$(EXESUF) # all code tested by test-x86-cpuid is inside topology.h gcov-files-test-x86-cpuid-y = @@ -613,6 +614,7 @@ tests/test-throttle$(EXESUF): tests/test-throttle.o $(test-block-obj-y) tests/test-bdrv-drain$(EXESUF): tests/test-bdrv-drain.o $(test-block-obj-y) $(test-util-obj-y) 
tests/test-blockjob$(EXESUF): tests/test-blockjob.o $(test-block-obj-y) $(test-util-obj-y) tests/test-blockjob-txn$(EXESUF): tests/test-blockjob-txn.o $(test-block-obj-y) $(test-util-obj-y) +tests/test-block-backend$(EXESUF): tests/test-block-backend.o $(test-block-obj-y) $(test-util-obj-y) tests/test-thread-pool$(EXESUF): tests/test-thread-pool.o $(test-block-obj-y) tests/test-iov$(EXESUF): tests/test-iov.o $(test-util-obj-y) tests/test-hbitmap$(EXESUF): tests/test-hbitmap.o $(test-util-obj-y) $(test-crypto-obj-y) diff --git a/tests/test-block-backend.c b/tests/test-block-backend.c new file mode 100644 index 000000000000..fd59f02bd0da --- /dev/null +++ b/tests/test-block-backend.c @@ -0,0 +1,82 @@ +/* + * BlockBackend tests + * + * Copyright (c) 2017 Kevin Wolf + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "qemu/osdep.h" +#include "block/block.h" +#include "sysemu/block-backend.h" +#include "qapi/error.h" + +static void test_drain_aio_error_flush_cb(void *opaque, int ret) +{ + bool *completed = opaque; + + g_assert(ret == -ENOMEDIUM); + *completed = true; +} + +static void test_drain_aio_error(void) +{ + BlockBackend *blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL); + BlockAIOCB *acb; + bool completed = false; + + acb = blk_aio_flush(blk, test_drain_aio_error_flush_cb, &completed); + g_assert(acb != NULL); + g_assert(completed == false); + + blk_drain(blk); + g_assert(completed == true); + + blk_unref(blk); +} + +static void test_drain_all_aio_error(void) +{ + BlockBackend *blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL); + BlockAIOCB *acb; + bool completed = false; + + acb = blk_aio_flush(blk, test_drain_aio_error_flush_cb, &completed); + g_assert(acb != NULL); + g_assert(completed == false); + + blk_drain_all(); + g_assert(completed == true); + + blk_unref(blk); +} + +int main(int argc, char **argv) +{ + bdrv_init(); + qemu_init_main_loop(&error_abort); + + g_test_init(&argc, &argv, NULL); + + g_test_add_func("/block-backend/drain_aio_error", test_drain_aio_error); + g_test_add_func("/block-backend/drain_all_aio_error", + test_drain_all_aio_error); + + return g_test_run(); +} From 13471a40c13bc899c73f1879681da29da292547e Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Fri, 16 Feb 2018 16:50:15 +0000 Subject: [PATCH 30/38] Revert "IDE: Do not flush empty CDROM drives" This reverts commit 4da97120d51a4383aa96d741a2b837f8c4bbcd0b. blk_aio_flush() now handles the blk->root == NULL case, so we no longer need this workaround. 
Cc: John Snow Signed-off-by: Stefan Hajnoczi Reviewed-by: Eric Blake Signed-off-by: Kevin Wolf --- hw/ide/core.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/hw/ide/core.c b/hw/ide/core.c index 257b429381e0..139c84351499 100644 --- a/hw/ide/core.c +++ b/hw/ide/core.c @@ -1087,15 +1087,7 @@ static void ide_flush_cache(IDEState *s) s->status |= BUSY_STAT; ide_set_retry(s); block_acct_start(blk_get_stats(s->blk), &s->acct, 0, BLOCK_ACCT_FLUSH); - - if (blk_bs(s->blk)) { - s->pio_aiocb = blk_aio_flush(s->blk, ide_flush_cb, s); - } else { - /* XXX blk_aio_flush() crashes when blk_bs(blk) is NULL, remove this - * temporary workaround when blk_aio_*() functions handle NULL blk_bs. - */ - ide_flush_cb(s, 0); - } + s->pio_aiocb = blk_aio_flush(s->blk, ide_flush_cb, s); } static void ide_cfata_metadata_inquiry(IDEState *s) From efc75e2a4cf7dfa62c7ccaa9a1016f27e5519003 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 18 Jan 2018 13:43:45 +0100 Subject: [PATCH 31/38] block: rename .bdrv_create() to .bdrv_co_create_opts() BlockDriver->bdrv_create() has been called from coroutine context since commit 5b7e1542cfa41a281af9629d31cef03704d976e6 ("block: make bdrv_create adopt coroutine"). Make this explicit by renaming to .bdrv_co_create_opts() and add the coroutine_fn annotation. This makes it obvious to block driver authors that they may yield, use CoMutex, or other coroutine_fn APIs. bdrv_co_create is reserved for the QAPI-based version that Kevin is working on. 
Signed-off-by: Stefan Hajnoczi Message-Id: <20170705102231.20711-2-stefanha@redhat.com> Signed-off-by: Paolo Bonzini Reviewed-by: Eric Blake Signed-off-by: Kevin Wolf --- block.c | 4 ++-- block/crypto.c | 8 ++++---- block/file-posix.c | 15 ++++++++------- block/file-win32.c | 5 +++-- block/gluster.c | 13 +++++++------ block/iscsi.c | 7 ++++--- block/nfs.c | 5 +++-- block/parallels.c | 6 ++++-- block/qcow.c | 5 +++-- block/qcow2.c | 5 +++-- block/qed.c | 6 ++++-- block/raw-format.c | 5 +++-- block/rbd.c | 6 ++++-- block/sheepdog.c | 10 +++++----- block/ssh.c | 5 +++-- block/vdi.c | 5 +++-- block/vhdx.c | 5 +++-- block/vmdk.c | 5 +++-- block/vpc.c | 5 +++-- include/block/block_int.h | 3 ++- 20 files changed, 74 insertions(+), 54 deletions(-) diff --git a/block.c b/block.c index a83037c2a5b4..86dd809041b3 100644 --- a/block.c +++ b/block.c @@ -420,7 +420,7 @@ static void coroutine_fn bdrv_create_co_entry(void *opaque) CreateCo *cco = opaque; assert(cco->drv); - ret = cco->drv->bdrv_create(cco->filename, cco->opts, &local_err); + ret = cco->drv->bdrv_co_create_opts(cco->filename, cco->opts, &local_err); error_propagate(&cco->err, local_err); cco->ret = ret; } @@ -439,7 +439,7 @@ int bdrv_create(BlockDriver *drv, const char* filename, .err = NULL, }; - if (!drv->bdrv_create) { + if (!drv->bdrv_co_create_opts) { error_setg(errp, "Driver '%s' does not support image creation", drv->format_name); ret = -ENOTSUP; goto out; diff --git a/block/crypto.c b/block/crypto.c index 3df66947c5a4..2ea116e6dbbe 100644 --- a/block/crypto.c +++ b/block/crypto.c @@ -556,9 +556,9 @@ static int block_crypto_open_luks(BlockDriverState *bs, bs, options, flags, errp); } -static int block_crypto_create_luks(const char *filename, - QemuOpts *opts, - Error **errp) +static int coroutine_fn block_crypto_co_create_opts_luks(const char *filename, + QemuOpts *opts, + Error **errp) { return block_crypto_create_generic(Q_CRYPTO_BLOCK_FORMAT_LUKS, filename, opts, errp); @@ -617,7 +617,7 @@ BlockDriver 
bdrv_crypto_luks = { .bdrv_open = block_crypto_open_luks, .bdrv_close = block_crypto_close, .bdrv_child_perm = bdrv_format_default_perms, - .bdrv_create = block_crypto_create_luks, + .bdrv_co_create_opts = block_crypto_co_create_opts_luks, .bdrv_truncate = block_crypto_truncate, .create_opts = &block_crypto_create_opts_luks, diff --git a/block/file-posix.c b/block/file-posix.c index f1591c38490c..7f2cc63c60b8 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -1982,7 +1982,8 @@ static int64_t raw_get_allocated_file_size(BlockDriverState *bs) return (int64_t)st.st_blocks * 512; } -static int raw_create(const char *filename, QemuOpts *opts, Error **errp) +static int coroutine_fn raw_co_create_opts(const char *filename, QemuOpts *opts, + Error **errp) { int fd; int result = 0; @@ -2276,7 +2277,7 @@ BlockDriver bdrv_file = { .bdrv_reopen_commit = raw_reopen_commit, .bdrv_reopen_abort = raw_reopen_abort, .bdrv_close = raw_close, - .bdrv_create = raw_create, + .bdrv_co_create_opts = raw_co_create_opts, .bdrv_has_zero_init = bdrv_has_zero_init_1, .bdrv_co_block_status = raw_co_block_status, .bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes, @@ -2680,8 +2681,8 @@ static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs, return -ENOTSUP; } -static int hdev_create(const char *filename, QemuOpts *opts, - Error **errp) +static int coroutine_fn hdev_co_create_opts(const char *filename, QemuOpts *opts, + Error **errp) { int fd; int ret = 0; @@ -2754,7 +2755,7 @@ static BlockDriver bdrv_host_device = { .bdrv_reopen_prepare = raw_reopen_prepare, .bdrv_reopen_commit = raw_reopen_commit, .bdrv_reopen_abort = raw_reopen_abort, - .bdrv_create = hdev_create, + .bdrv_co_create_opts = hdev_co_create_opts, .create_opts = &raw_create_opts, .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes, @@ -2876,7 +2877,7 @@ static BlockDriver bdrv_host_cdrom = { .bdrv_reopen_prepare = raw_reopen_prepare, .bdrv_reopen_commit = raw_reopen_commit, .bdrv_reopen_abort = raw_reopen_abort, - 
.bdrv_create = hdev_create, + .bdrv_co_create_opts = hdev_co_create_opts, .create_opts = &raw_create_opts, @@ -3007,7 +3008,7 @@ static BlockDriver bdrv_host_cdrom = { .bdrv_reopen_prepare = raw_reopen_prepare, .bdrv_reopen_commit = raw_reopen_commit, .bdrv_reopen_abort = raw_reopen_abort, - .bdrv_create = hdev_create, + .bdrv_co_create_opts = hdev_co_create_opts, .create_opts = &raw_create_opts, .bdrv_co_preadv = raw_co_preadv, diff --git a/block/file-win32.c b/block/file-win32.c index f24c7bb92c64..4a430d45f1d1 100644 --- a/block/file-win32.c +++ b/block/file-win32.c @@ -553,7 +553,8 @@ static int64_t raw_get_allocated_file_size(BlockDriverState *bs) return st.st_size; } -static int raw_create(const char *filename, QemuOpts *opts, Error **errp) +static int coroutine_fn raw_co_create_opts(const char *filename, QemuOpts *opts, + Error **errp) { int fd; int64_t total_size = 0; @@ -599,7 +600,7 @@ BlockDriver bdrv_file = { .bdrv_file_open = raw_open, .bdrv_refresh_limits = raw_probe_alignment, .bdrv_close = raw_close, - .bdrv_create = raw_create, + .bdrv_co_create_opts = raw_co_create_opts, .bdrv_has_zero_init = bdrv_has_zero_init_1, .bdrv_aio_readv = raw_aio_readv, diff --git a/block/gluster.c b/block/gluster.c index 1a07d221d173..79b4cfdf74ed 100644 --- a/block/gluster.c +++ b/block/gluster.c @@ -1021,8 +1021,9 @@ static int qemu_gluster_do_truncate(struct glfs_fd *fd, int64_t offset, return 0; } -static int qemu_gluster_create(const char *filename, - QemuOpts *opts, Error **errp) +static int coroutine_fn qemu_gluster_co_create_opts(const char *filename, + QemuOpts *opts, + Error **errp) { BlockdevOptionsGluster *gconf; struct glfs *glfs; @@ -1435,7 +1436,7 @@ static BlockDriver bdrv_gluster = { .bdrv_reopen_commit = qemu_gluster_reopen_commit, .bdrv_reopen_abort = qemu_gluster_reopen_abort, .bdrv_close = qemu_gluster_close, - .bdrv_create = qemu_gluster_create, + .bdrv_co_create_opts = qemu_gluster_co_create_opts, .bdrv_getlength = qemu_gluster_getlength, 
.bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size, .bdrv_truncate = qemu_gluster_truncate, @@ -1463,7 +1464,7 @@ static BlockDriver bdrv_gluster_tcp = { .bdrv_reopen_commit = qemu_gluster_reopen_commit, .bdrv_reopen_abort = qemu_gluster_reopen_abort, .bdrv_close = qemu_gluster_close, - .bdrv_create = qemu_gluster_create, + .bdrv_co_create_opts = qemu_gluster_co_create_opts, .bdrv_getlength = qemu_gluster_getlength, .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size, .bdrv_truncate = qemu_gluster_truncate, @@ -1491,7 +1492,7 @@ static BlockDriver bdrv_gluster_unix = { .bdrv_reopen_commit = qemu_gluster_reopen_commit, .bdrv_reopen_abort = qemu_gluster_reopen_abort, .bdrv_close = qemu_gluster_close, - .bdrv_create = qemu_gluster_create, + .bdrv_co_create_opts = qemu_gluster_co_create_opts, .bdrv_getlength = qemu_gluster_getlength, .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size, .bdrv_truncate = qemu_gluster_truncate, @@ -1525,7 +1526,7 @@ static BlockDriver bdrv_gluster_rdma = { .bdrv_reopen_commit = qemu_gluster_reopen_commit, .bdrv_reopen_abort = qemu_gluster_reopen_abort, .bdrv_close = qemu_gluster_close, - .bdrv_create = qemu_gluster_create, + .bdrv_co_create_opts = qemu_gluster_co_create_opts, .bdrv_getlength = qemu_gluster_getlength, .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size, .bdrv_truncate = qemu_gluster_truncate, diff --git a/block/iscsi.c b/block/iscsi.c index c228ca21c8fe..07988ce7619c 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -2117,7 +2117,8 @@ static int iscsi_truncate(BlockDriverState *bs, int64_t offset, return 0; } -static int iscsi_create(const char *filename, QemuOpts *opts, Error **errp) +static int coroutine_fn iscsi_co_create_opts(const char *filename, QemuOpts *opts, + Error **errp) { int ret = 0; int64_t total_size = 0; @@ -2204,7 +2205,7 @@ static BlockDriver bdrv_iscsi = { .bdrv_parse_filename = iscsi_parse_filename, .bdrv_file_open = iscsi_open, .bdrv_close = 
iscsi_close, - .bdrv_create = iscsi_create, + .bdrv_co_create_opts = iscsi_co_create_opts, .create_opts = &iscsi_create_opts, .bdrv_reopen_prepare = iscsi_reopen_prepare, .bdrv_reopen_commit = iscsi_reopen_commit, @@ -2239,7 +2240,7 @@ static BlockDriver bdrv_iser = { .bdrv_parse_filename = iscsi_parse_filename, .bdrv_file_open = iscsi_open, .bdrv_close = iscsi_close, - .bdrv_create = iscsi_create, + .bdrv_co_create_opts = iscsi_co_create_opts, .create_opts = &iscsi_create_opts, .bdrv_reopen_prepare = iscsi_reopen_prepare, .bdrv_reopen_commit = iscsi_reopen_commit, diff --git a/block/nfs.c b/block/nfs.c index 6576a73d6eb9..ce6e195a4147 100644 --- a/block/nfs.c +++ b/block/nfs.c @@ -684,7 +684,8 @@ static QemuOptsList nfs_create_opts = { } }; -static int nfs_file_create(const char *url, QemuOpts *opts, Error **errp) +static int coroutine_fn nfs_file_co_create_opts(const char *url, QemuOpts *opts, + Error **errp) { int64_t ret, total_size; NFSClient *client = g_new0(NFSClient, 1); @@ -897,7 +898,7 @@ static BlockDriver bdrv_nfs = { .bdrv_file_open = nfs_file_open, .bdrv_close = nfs_file_close, - .bdrv_create = nfs_file_create, + .bdrv_co_create_opts = nfs_file_co_create_opts, .bdrv_reopen_prepare = nfs_reopen_prepare, .bdrv_co_preadv = nfs_co_preadv, diff --git a/block/parallels.c b/block/parallels.c index 3e952a9c147a..81085795c239 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -475,7 +475,9 @@ static int parallels_check(BlockDriverState *bs, BdrvCheckResult *res, } -static int parallels_create(const char *filename, QemuOpts *opts, Error **errp) +static int coroutine_fn parallels_co_create_opts(const char *filename, + QemuOpts *opts, + Error **errp) { int64_t total_size, cl_size; uint8_t tmp[BDRV_SECTOR_SIZE]; @@ -796,7 +798,7 @@ static BlockDriver bdrv_parallels = { .bdrv_co_readv = parallels_co_readv, .bdrv_co_writev = parallels_co_writev, .supports_backing = true, - .bdrv_create = parallels_create, + .bdrv_co_create_opts = parallels_co_create_opts, 
.bdrv_check = parallels_check, .create_opts = &parallels_create_opts, }; diff --git a/block/qcow.c b/block/qcow.c index dead5029c67d..47a18d9a3a3c 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -810,7 +810,8 @@ static void qcow_close(BlockDriverState *bs) error_free(s->migration_blocker); } -static int qcow_create(const char *filename, QemuOpts *opts, Error **errp) +static int coroutine_fn qcow_co_create_opts(const char *filename, QemuOpts *opts, + Error **errp) { int header_size, backing_filename_len, l1_size, shift, i; QCowHeader header; @@ -1127,7 +1128,7 @@ static BlockDriver bdrv_qcow = { .bdrv_close = qcow_close, .bdrv_child_perm = bdrv_format_default_perms, .bdrv_reopen_prepare = qcow_reopen_prepare, - .bdrv_create = qcow_create, + .bdrv_co_create_opts = qcow_co_create_opts, .bdrv_has_zero_init = bdrv_has_zero_init_1, .supports_backing = true, diff --git a/block/qcow2.c b/block/qcow2.c index 288b5299d800..93fb625dcb82 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -2916,7 +2916,8 @@ static int qcow2_create2(const char *filename, int64_t total_size, return ret; } -static int qcow2_create(const char *filename, QemuOpts *opts, Error **errp) +static int coroutine_fn qcow2_co_create_opts(const char *filename, QemuOpts *opts, + Error **errp) { char *backing_file = NULL; char *backing_fmt = NULL; @@ -4352,7 +4353,7 @@ BlockDriver bdrv_qcow2 = { .bdrv_reopen_abort = qcow2_reopen_abort, .bdrv_join_options = qcow2_join_options, .bdrv_child_perm = bdrv_format_default_perms, - .bdrv_create = qcow2_create, + .bdrv_co_create_opts = qcow2_co_create_opts, .bdrv_has_zero_init = bdrv_has_zero_init_1, .bdrv_co_block_status = qcow2_co_block_status, diff --git a/block/qed.c b/block/qed.c index a5952209261a..72cf2f58ab84 100644 --- a/block/qed.c +++ b/block/qed.c @@ -638,7 +638,9 @@ static int qed_create(const char *filename, uint32_t cluster_size, return ret; } -static int bdrv_qed_create(const char *filename, QemuOpts *opts, Error **errp) +static int coroutine_fn
bdrv_qed_co_create_opts(const char *filename, + QemuOpts *opts, + Error **errp) { uint64_t image_size = 0; uint32_t cluster_size = QED_DEFAULT_CLUSTER_SIZE; @@ -1564,7 +1566,7 @@ static BlockDriver bdrv_qed = { .bdrv_close = bdrv_qed_close, .bdrv_reopen_prepare = bdrv_qed_reopen_prepare, .bdrv_child_perm = bdrv_format_default_perms, - .bdrv_create = bdrv_qed_create, + .bdrv_co_create_opts = bdrv_qed_co_create_opts, .bdrv_has_zero_init = bdrv_has_zero_init_1, .bdrv_co_block_status = bdrv_qed_co_block_status, .bdrv_co_readv = bdrv_qed_co_readv, diff --git a/block/raw-format.c b/block/raw-format.c index 830243a8e486..a378547c9983 100644 --- a/block/raw-format.c +++ b/block/raw-format.c @@ -396,7 +396,8 @@ static int raw_has_zero_init(BlockDriverState *bs) return bdrv_has_zero_init(bs->file->bs); } -static int raw_create(const char *filename, QemuOpts *opts, Error **errp) +static int coroutine_fn raw_co_create_opts(const char *filename, QemuOpts *opts, + Error **errp) { return bdrv_create_file(filename, opts, errp); } @@ -491,7 +492,7 @@ BlockDriver bdrv_raw = { .bdrv_open = &raw_open, .bdrv_close = &raw_close, .bdrv_child_perm = bdrv_filter_default_perms, - .bdrv_create = &raw_create, + .bdrv_co_create_opts = &raw_co_create_opts, .bdrv_co_preadv = &raw_co_preadv, .bdrv_co_pwritev = &raw_co_pwritev, .bdrv_co_pwrite_zeroes = &raw_co_pwrite_zeroes, diff --git a/block/rbd.c b/block/rbd.c index 8474b0ba117c..c7dd32e213b2 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -351,7 +351,9 @@ static QemuOptsList runtime_opts = { }, }; -static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp) +static int coroutine_fn qemu_rbd_co_create_opts(const char *filename, + QemuOpts *opts, + Error **errp) { Error *local_err = NULL; int64_t bytes = 0; @@ -1132,7 +1134,7 @@ static BlockDriver bdrv_rbd = { .bdrv_file_open = qemu_rbd_open, .bdrv_close = qemu_rbd_close, .bdrv_reopen_prepare = qemu_rbd_reopen_prepare, - .bdrv_create = qemu_rbd_create, + .bdrv_co_create_opts = 
qemu_rbd_co_create_opts, .bdrv_has_zero_init = bdrv_has_zero_init_1, .bdrv_get_info = qemu_rbd_getinfo, .create_opts = &qemu_rbd_create_opts, diff --git a/block/sheepdog.c b/block/sheepdog.c index 3c3becf94df0..cef6faec4db0 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -1959,8 +1959,8 @@ static int parse_block_size_shift(BDRVSheepdogState *s, QemuOpts *opt) return 0; } -static int sd_create(const char *filename, QemuOpts *opts, - Error **errp) +static int coroutine_fn sd_co_create_opts(const char *filename, QemuOpts *opts, + Error **errp) { Error *err = NULL; int ret = 0; @@ -3103,7 +3103,7 @@ static BlockDriver bdrv_sheepdog = { .bdrv_reopen_commit = sd_reopen_commit, .bdrv_reopen_abort = sd_reopen_abort, .bdrv_close = sd_close, - .bdrv_create = sd_create, + .bdrv_co_create_opts = sd_co_create_opts, .bdrv_has_zero_init = bdrv_has_zero_init_1, .bdrv_getlength = sd_getlength, .bdrv_get_allocated_file_size = sd_get_allocated_file_size, @@ -3139,7 +3139,7 @@ static BlockDriver bdrv_sheepdog_tcp = { .bdrv_reopen_commit = sd_reopen_commit, .bdrv_reopen_abort = sd_reopen_abort, .bdrv_close = sd_close, - .bdrv_create = sd_create, + .bdrv_co_create_opts = sd_co_create_opts, .bdrv_has_zero_init = bdrv_has_zero_init_1, .bdrv_getlength = sd_getlength, .bdrv_get_allocated_file_size = sd_get_allocated_file_size, @@ -3175,7 +3175,7 @@ static BlockDriver bdrv_sheepdog_unix = { .bdrv_reopen_commit = sd_reopen_commit, .bdrv_reopen_abort = sd_reopen_abort, .bdrv_close = sd_close, - .bdrv_create = sd_create, + .bdrv_co_create_opts = sd_co_create_opts, .bdrv_has_zero_init = bdrv_has_zero_init_1, .bdrv_getlength = sd_getlength, .bdrv_get_allocated_file_size = sd_get_allocated_file_size, diff --git a/block/ssh.c b/block/ssh.c index b63addcf9483..36d5d888d57d 100644 --- a/block/ssh.c +++ b/block/ssh.c @@ -816,7 +816,8 @@ static QemuOptsList ssh_create_opts = { } }; -static int ssh_create(const char *filename, QemuOpts *opts, Error **errp) +static int coroutine_fn 
ssh_co_create_opts(const char *filename, QemuOpts *opts, + Error **errp) { int r, ret; int64_t total_size = 0; @@ -1204,7 +1205,7 @@ static BlockDriver bdrv_ssh = { .instance_size = sizeof(BDRVSSHState), .bdrv_parse_filename = ssh_parse_filename, .bdrv_file_open = ssh_file_open, - .bdrv_create = ssh_create, + .bdrv_co_create_opts = ssh_co_create_opts, .bdrv_close = ssh_close, .bdrv_has_zero_init = ssh_has_zero_init, .bdrv_co_readv = ssh_co_readv, diff --git a/block/vdi.c b/block/vdi.c index 0780c82d8291..68592cc58d59 100644 --- a/block/vdi.c +++ b/block/vdi.c @@ -716,7 +716,8 @@ vdi_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, return ret; } -static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) +static int coroutine_fn vdi_co_create_opts(const char *filename, QemuOpts *opts, + Error **errp) { int ret = 0; uint64_t bytes = 0; @@ -894,7 +895,7 @@ static BlockDriver bdrv_vdi = { .bdrv_close = vdi_close, .bdrv_reopen_prepare = vdi_reopen_prepare, .bdrv_child_perm = bdrv_format_default_perms, - .bdrv_create = vdi_create, + .bdrv_co_create_opts = vdi_co_create_opts, .bdrv_has_zero_init = bdrv_has_zero_init_1, .bdrv_co_block_status = vdi_co_block_status, .bdrv_make_empty = vdi_make_empty, diff --git a/block/vhdx.c b/block/vhdx.c index c449c5dcfd87..3fbff5048b3b 100644 --- a/block/vhdx.c +++ b/block/vhdx.c @@ -1792,7 +1792,8 @@ static int vhdx_create_new_region_table(BlockBackend *blk, * .---- ~ ----------- ~ ------------ ~ ---------------- ~ -----------. 
* 1MB */ -static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp) +static int coroutine_fn vhdx_co_create_opts(const char *filename, QemuOpts *opts, + Error **errp) { int ret = 0; uint64_t image_size = (uint64_t) 2 * GiB; @@ -2003,7 +2004,7 @@ static BlockDriver bdrv_vhdx = { .bdrv_child_perm = bdrv_format_default_perms, .bdrv_co_readv = vhdx_co_readv, .bdrv_co_writev = vhdx_co_writev, - .bdrv_create = vhdx_create, + .bdrv_co_create_opts = vhdx_co_create_opts, .bdrv_get_info = vhdx_get_info, .bdrv_check = vhdx_check, .bdrv_has_zero_init = bdrv_has_zero_init_1, diff --git a/block/vmdk.c b/block/vmdk.c index 75f84213e6f6..67342ed69b60 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1882,7 +1882,8 @@ static int filename_decompose(const char *filename, char *path, char *prefix, return VMDK_OK; } -static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp) +static int coroutine_fn vmdk_co_create_opts(const char *filename, QemuOpts *opts, + Error **errp) { int idx = 0; BlockBackend *new_blk = NULL; @@ -2398,7 +2399,7 @@ static BlockDriver bdrv_vmdk = { .bdrv_co_pwritev_compressed = vmdk_co_pwritev_compressed, .bdrv_co_pwrite_zeroes = vmdk_co_pwrite_zeroes, .bdrv_close = vmdk_close, - .bdrv_create = vmdk_create, + .bdrv_co_create_opts = vmdk_co_create_opts, .bdrv_co_flush_to_disk = vmdk_co_flush, .bdrv_co_block_status = vmdk_co_block_status, .bdrv_get_allocated_file_size = vmdk_get_allocated_file_size, diff --git a/block/vpc.c b/block/vpc.c index fba4492fd7b0..b2e2b9ebd43a 100644 --- a/block/vpc.c +++ b/block/vpc.c @@ -897,7 +897,8 @@ static int create_fixed_disk(BlockBackend *blk, uint8_t *buf, return ret; } -static int vpc_create(const char *filename, QemuOpts *opts, Error **errp) +static int coroutine_fn vpc_co_create_opts(const char *filename, QemuOpts *opts, + Error **errp) { uint8_t buf[1024]; VHDFooter *footer = (VHDFooter *) buf; @@ -1095,7 +1096,7 @@ static BlockDriver bdrv_vpc = { .bdrv_close = vpc_close, .bdrv_reopen_prepare = 
vpc_reopen_prepare, .bdrv_child_perm = bdrv_format_default_perms, - .bdrv_create = vpc_create, + .bdrv_co_create_opts = vpc_co_create_opts, .bdrv_co_preadv = vpc_co_preadv, .bdrv_co_pwritev = vpc_co_pwritev, diff --git a/include/block/block_int.h b/include/block/block_int.h index aef10296b078..64a5700f2b01 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -129,7 +129,8 @@ struct BlockDriver { int (*bdrv_file_open)(BlockDriverState *bs, QDict *options, int flags, Error **errp); void (*bdrv_close)(BlockDriverState *bs); - int (*bdrv_create)(const char *filename, QemuOpts *opts, Error **errp); + int coroutine_fn (*bdrv_co_create_opts)(const char *filename, QemuOpts *opts, + Error **errp); int (*bdrv_make_empty)(BlockDriverState *bs); void (*bdrv_refresh_filename)(BlockDriverState *bs, QDict *options); From c274393a3e69d101203cc0f9bfde6c64bbb9689b Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 18 Jan 2018 13:43:46 +0100 Subject: [PATCH 32/38] qcow2: make qcow2_co_create2() a coroutine_fn MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit qcow2_create2() calls qemu_co_mutex_lock(). Only a coroutine_fn may call another coroutine_fn. In fact, qcow2_create2 is always called from coroutine context. Rename the function to add the "co" moniker and add coroutine_fn. 
Reported-by: Marc-André Lureau Signed-off-by: Stefan Hajnoczi Message-Id: <20170705102231.20711-3-stefanha@redhat.com> Signed-off-by: Paolo Bonzini Reviewed-by: Eric Blake Signed-off-by: Kevin Wolf --- block/qcow2.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index 93fb625dcb82..7cf3c1518a78 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -2725,11 +2725,12 @@ static uint64_t qcow2_opt_get_refcount_bits_del(QemuOpts *opts, int version, return refcount_bits; } -static int qcow2_create2(const char *filename, int64_t total_size, - const char *backing_file, const char *backing_format, - int flags, size_t cluster_size, PreallocMode prealloc, - QemuOpts *opts, int version, int refcount_order, - const char *encryptfmt, Error **errp) +static int coroutine_fn +qcow2_co_create2(const char *filename, int64_t total_size, + const char *backing_file, const char *backing_format, + int flags, size_t cluster_size, PreallocMode prealloc, + QemuOpts *opts, int version, int refcount_order, + const char *encryptfmt, Error **errp) { QDict *options; @@ -2998,9 +2999,9 @@ static int coroutine_fn qcow2_co_create_opts(const char *filename, QemuOpts *opt refcount_order = ctz32(refcount_bits); - ret = qcow2_create2(filename, size, backing_file, backing_fmt, flags, - cluster_size, prealloc, opts, version, refcount_order, - encryptfmt, &local_err); + ret = qcow2_co_create2(filename, size, backing_file, backing_fmt, flags, + cluster_size, prealloc, opts, version, refcount_order, + encryptfmt, &local_err); error_propagate(errp, local_err); finish: From be8fbd476393089f21e6ee9086d6d3dad8cb4f3d Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Mon, 5 Feb 2018 17:27:45 +0100 Subject: [PATCH 33/38] qemu-img: Make resize error message more general The issue: $ qemu-img resize -f qcow2 foo.qcow2 qemu-img: Expecting one image file name Try 'qemu-img --help' for more information So we gave an image file name, but we omitted the length. 
qemu-img thinks the last argument is always the size and removes it immediately from argv (by decrementing argc), and tries to verify that it is a valid size only at a later point. So we do not actually know whether that last argument we called "size" is indeed a size or whether the user instead forgot to specify that size but did give a file name. Therefore, the error message should be more general. Bug: https://bugzilla.redhat.com/show_bug.cgi?id=1523458 Signed-off-by: Max Reitz Message-id: 20180205162745.23650-1-mreitz@redhat.com Reviewed-by: John Snow Reviewed-by: Eric Blake Signed-off-by: Max Reitz --- qemu-img.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qemu-img.c b/qemu-img.c index 56edc152181d..aa99fd32e993 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -3469,7 +3469,7 @@ static int img_resize(int argc, char **argv) } } if (optind != argc - 1) { - error_exit("Expecting one image file name"); + error_exit("Expecting image file name and size"); } filename = argv[optind++]; From 2b12a756ac2a5d3af5aa8116e3a3e62a1501ad61 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Wed, 14 Feb 2018 21:49:13 +0100 Subject: [PATCH 34/38] block/ssh: Pull ssh_grow_file() from ssh_create() If we ever want to offer even rudimentary truncation functionality for ssh, we should put the respective code into a reusable function. Signed-off-by: Max Reitz Message-id: 20180214204915.7980-2-mreitz@redhat.com Reviewed-by: Eric Blake Reviewed-by: Richard W.M. 
Jones Signed-off-by: Max Reitz --- block/ssh.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/block/ssh.c b/block/ssh.c index 36d5d888d57d..d6a68cb880bb 100644 --- a/block/ssh.c +++ b/block/ssh.c @@ -803,6 +803,26 @@ static int ssh_file_open(BlockDriverState *bs, QDict *options, int bdrv_flags, return ret; } +static int ssh_grow_file(BDRVSSHState *s, int64_t offset, Error **errp) +{ + ssize_t ret; + char c[1] = { '\0' }; + + /* offset must be strictly greater than the current size so we do + * not overwrite anything */ + assert(offset > 0 && offset > s->attrs.filesize); + + libssh2_sftp_seek64(s->sftp_handle, offset - 1); + ret = libssh2_sftp_write(s->sftp_handle, c, 1); + if (ret < 0) { + sftp_error_setg(errp, s, "Failed to grow file"); + return -EIO; + } + + s->attrs.filesize = offset; + return 0; +} + static QemuOptsList ssh_create_opts = { .name = "ssh-create-opts", .head = QTAILQ_HEAD_INITIALIZER(ssh_create_opts.head), @@ -823,8 +843,6 @@ static int coroutine_fn ssh_co_create_opts(const char *filename, QemuOpts *opts, int64_t total_size = 0; QDict *uri_options = NULL; BDRVSSHState s; - ssize_t r2; - char c[1] = { '\0' }; ssh_state_init(&s); @@ -850,14 +868,10 @@ static int coroutine_fn ssh_co_create_opts(const char *filename, QemuOpts *opts, } if (total_size > 0) { - libssh2_sftp_seek64(s.sftp_handle, total_size-1); - r2 = libssh2_sftp_write(s.sftp_handle, c, 1); - if (r2 < 0) { - sftp_error_setg(errp, &s, "truncate failed"); - ret = -EINVAL; + ret = ssh_grow_file(&s, total_size, errp); + if (ret < 0) { goto out; } - s.attrs.filesize = total_size; } ret = 0; From bd8e0e32dac0cfb7c4e42b5a2d2b407819df049f Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Wed, 14 Feb 2018 21:49:14 +0100 Subject: [PATCH 35/38] block/ssh: Make ssh_grow_file() blocking At runtime (that is, during a future ssh_truncate()), the SSH session is non-blocking. 
However, ssh_truncate() (or rather, bdrv_truncate() in general) is not a coroutine, so this resize operation needs to block. For ssh_create(), that is fine, too; the session is never set to non-blocking anyway. Signed-off-by: Max Reitz Message-id: 20180214204915.7980-3-mreitz@redhat.com Reviewed-by: Eric Blake Reviewed-by: Richard W.M. Jones Signed-off-by: Max Reitz --- block/ssh.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/block/ssh.c b/block/ssh.c index d6a68cb880bb..4bcf10334f5e 100644 --- a/block/ssh.c +++ b/block/ssh.c @@ -803,17 +803,24 @@ static int ssh_file_open(BlockDriverState *bs, QDict *options, int bdrv_flags, return ret; } +/* Note: This is a blocking operation */ static int ssh_grow_file(BDRVSSHState *s, int64_t offset, Error **errp) { ssize_t ret; char c[1] = { '\0' }; + int was_blocking = libssh2_session_get_blocking(s->session); /* offset must be strictly greater than the current size so we do * not overwrite anything */ assert(offset > 0 && offset > s->attrs.filesize); + libssh2_session_set_blocking(s->session, 1); + libssh2_sftp_seek64(s->sftp_handle, offset - 1); ret = libssh2_sftp_write(s->sftp_handle, c, 1); + + libssh2_session_set_blocking(s->session, was_blocking); + if (ret < 0) { sftp_error_setg(errp, s, "Failed to grow file"); return -EIO; From 624f3006b8a26bcf7a0b2be13265ac99c65fd117 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Wed, 14 Feb 2018 21:49:15 +0100 Subject: [PATCH 36/38] block/ssh: Add basic .bdrv_truncate() libssh2 does not seem to offer real truncation support, so we can only grow files -- but that is better than nothing. Signed-off-by: Max Reitz Message-id: 20180214204915.7980-4-mreitz@redhat.com Reviewed-by: Eric Blake Reviewed-by: Richard W.M. 
Jones Signed-off-by: Max Reitz --- block/ssh.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/block/ssh.c b/block/ssh.c index 4bcf10334f5e..80a8b40dfa73 100644 --- a/block/ssh.c +++ b/block/ssh.c @@ -1220,6 +1220,29 @@ static int64_t ssh_getlength(BlockDriverState *bs) return length; } +static int ssh_truncate(BlockDriverState *bs, int64_t offset, + PreallocMode prealloc, Error **errp) +{ + BDRVSSHState *s = bs->opaque; + + if (prealloc != PREALLOC_MODE_OFF) { + error_setg(errp, "Unsupported preallocation mode '%s'", + PreallocMode_str(prealloc)); + return -ENOTSUP; + } + + if (offset < s->attrs.filesize) { + error_setg(errp, "ssh driver does not support shrinking files"); + return -ENOTSUP; + } + + if (offset == s->attrs.filesize) { + return 0; + } + + return ssh_grow_file(s, offset, errp); +} + static BlockDriver bdrv_ssh = { .format_name = "ssh", .protocol_name = "ssh", @@ -1232,6 +1255,7 @@ static BlockDriver bdrv_ssh = { .bdrv_co_readv = ssh_co_readv, .bdrv_co_writev = ssh_co_writev, .bdrv_getlength = ssh_getlength, + .bdrv_truncate = ssh_truncate, .bdrv_co_flush_to_disk = ssh_co_flush, .create_opts = &ssh_create_opts, }; From 9e029689e1228a218452d5c7b661ef208d22f71a Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Thu, 15 Feb 2018 15:10:08 +0200 Subject: [PATCH 37/38] qcow2: Replace align_offset() with ROUND_UP() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The align_offset() function is equivalent to the ROUND_UP() macro so there's no need to use the former. The ROUND_UP() name is also a bit more explicit. This patch uses ROUND_UP() instead of the slower QEMU_ALIGN_UP() because align_offset() already requires that the second parameter is a power of two. 
Signed-off-by: Alberto Garcia Reviewed-by: Eric Blake Reviewed-by: Philippe Mathieu-Daudé Message-id: 20180215131008.5153-1-berto@igalia.com Signed-off-by: Max Reitz --- block/qcow2-bitmap.c | 4 ++-- block/qcow2-cluster.c | 4 ++-- block/qcow2-refcount.c | 4 ++-- block/qcow2-snapshot.c | 10 +++++----- block/qcow2.c | 14 +++++++------- block/qcow2.h | 6 ------ 6 files changed, 18 insertions(+), 24 deletions(-) diff --git a/block/qcow2-bitmap.c b/block/qcow2-bitmap.c index 4f6fd863eabb..5127276f90b8 100644 --- a/block/qcow2-bitmap.c +++ b/block/qcow2-bitmap.c @@ -413,8 +413,8 @@ static inline void bitmap_dir_entry_to_be(Qcow2BitmapDirEntry *entry) static inline int calc_dir_entry_size(size_t name_size, size_t extra_data_size) { - return align_offset(sizeof(Qcow2BitmapDirEntry) + - name_size + extra_data_size, 8); + int size = sizeof(Qcow2BitmapDirEntry) + name_size + extra_data_size; + return ROUND_UP(size, 8); } static inline int dir_entry_size(Qcow2BitmapDirEntry *entry) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index e406b0f3b9ea..98908c426402 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -126,11 +126,11 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, new_l1_size2 = sizeof(uint64_t) * new_l1_size; new_l1_table = qemu_try_blockalign(bs->file->bs, - align_offset(new_l1_size2, 512)); + ROUND_UP(new_l1_size2, 512)); if (new_l1_table == NULL) { return -ENOMEM; } - memset(new_l1_table, 0, align_offset(new_l1_size2, 512)); + memset(new_l1_table, 0, ROUND_UP(new_l1_size2, 512)); if (s->l1_size) { memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t)); diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index d46b69d7f348..126cca3276c4 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -1204,7 +1204,7 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, * l1_table_offset when it is the current s->l1_table_offset! Be careful * when changing this! 
*/ if (l1_table_offset != s->l1_table_offset) { - l1_table = g_try_malloc0(align_offset(l1_size2, 512)); + l1_table = g_try_malloc0(ROUND_UP(l1_size2, 512)); if (l1_size2 && l1_table == NULL) { ret = -ENOMEM; goto fail; @@ -2553,7 +2553,7 @@ int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset, } /* align range to test to cluster boundaries */ - size = align_offset(offset_into_cluster(s, offset) + size, s->cluster_size); + size = ROUND_UP(offset_into_cluster(s, offset) + size, s->cluster_size); offset = start_of_cluster(s, offset); if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) { diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c index 44243e0e95bf..cee25f582b98 100644 --- a/block/qcow2-snapshot.c +++ b/block/qcow2-snapshot.c @@ -66,7 +66,7 @@ int qcow2_read_snapshots(BlockDriverState *bs) for(i = 0; i < s->nb_snapshots; i++) { /* Read statically sized part of the snapshot header */ - offset = align_offset(offset, 8); + offset = ROUND_UP(offset, 8); ret = bdrv_pread(bs->file, offset, &h, sizeof(h)); if (ret < 0) { goto fail; @@ -155,7 +155,7 @@ static int qcow2_write_snapshots(BlockDriverState *bs) offset = 0; for(i = 0; i < s->nb_snapshots; i++) { sn = s->snapshots + i; - offset = align_offset(offset, 8); + offset = ROUND_UP(offset, 8); offset += sizeof(h); offset += sizeof(extra); offset += strlen(sn->id_str); @@ -215,7 +215,7 @@ static int qcow2_write_snapshots(BlockDriverState *bs) assert(id_str_size <= UINT16_MAX && name_size <= UINT16_MAX); h.id_str_size = cpu_to_be16(id_str_size); h.name_size = cpu_to_be16(name_size); - offset = align_offset(offset, 8); + offset = ROUND_UP(offset, 8); ret = bdrv_pwrite(bs->file, offset, &h, sizeof(h)); if (ret < 0) { @@ -441,7 +441,7 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) /* The VM state isn't needed any more in the active L1 table; in fact, it * hurts by causing expensive COW for the next snapshot. 
*/ qcow2_cluster_discard(bs, qcow2_vm_state_offset(s), - align_offset(sn->vm_state_size, s->cluster_size), + ROUND_UP(sn->vm_state_size, s->cluster_size), QCOW2_DISCARD_NEVER, false); #ifdef DEBUG_ALLOC @@ -710,7 +710,7 @@ int qcow2_snapshot_load_tmp(BlockDriverState *bs, } new_l1_bytes = sn->l1_size * sizeof(uint64_t); new_l1_table = qemu_try_blockalign(bs->file->bs, - align_offset(new_l1_bytes, 512)); + ROUND_UP(new_l1_bytes, 512)); if (new_l1_table == NULL) { return -ENOMEM; } diff --git a/block/qcow2.c b/block/qcow2.c index 7cf3c1518a78..0397506b3908 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -1379,7 +1379,7 @@ static int qcow2_do_open(BlockDriverState *bs, QDict *options, int flags, if (s->l1_size > 0) { s->l1_table = qemu_try_blockalign(bs->file->bs, - align_offset(s->l1_size * sizeof(uint64_t), 512)); + ROUND_UP(s->l1_size * sizeof(uint64_t), 512)); if (s->l1_table == NULL) { error_setg(errp, "Could not allocate L1 table"); ret = -ENOMEM; @@ -2642,19 +2642,19 @@ static int64_t qcow2_calc_prealloc_size(int64_t total_size, { int64_t meta_size = 0; uint64_t nl1e, nl2e; - int64_t aligned_total_size = align_offset(total_size, cluster_size); + int64_t aligned_total_size = ROUND_UP(total_size, cluster_size); /* header: 1 cluster */ meta_size += cluster_size; /* total size of L2 tables */ nl2e = aligned_total_size / cluster_size; - nl2e = align_offset(nl2e, cluster_size / sizeof(uint64_t)); + nl2e = ROUND_UP(nl2e, cluster_size / sizeof(uint64_t)); meta_size += nl2e * sizeof(uint64_t); /* total size of L1 tables */ nl1e = nl2e * sizeof(uint64_t) / cluster_size; - nl1e = align_offset(nl1e, cluster_size / sizeof(uint64_t)); + nl1e = ROUND_UP(nl1e, cluster_size / sizeof(uint64_t)); meta_size += nl1e * sizeof(uint64_t); /* total size of refcount table and blocks */ @@ -3710,8 +3710,8 @@ static BlockMeasureInfo *qcow2_measure(QemuOpts *opts, BlockDriverState *in_bs, has_backing_file = !!optstr; g_free(optstr); - virtual_size = 
align_offset(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), - cluster_size); + virtual_size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0); + virtual_size = ROUND_UP(virtual_size, cluster_size); /* Check that virtual disk size is valid */ l2_tables = DIV_ROUND_UP(virtual_size / cluster_size, @@ -3731,7 +3731,7 @@ static BlockMeasureInfo *qcow2_measure(QemuOpts *opts, BlockDriverState *in_bs, goto err; } - virtual_size = align_offset(ssize, cluster_size); + virtual_size = ROUND_UP(ssize, cluster_size); if (has_backing_file) { /* We don't how much of the backing chain is shared by the input diff --git a/block/qcow2.h b/block/qcow2.h index 883802241fb6..1a84cc77b0b0 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -480,12 +480,6 @@ static inline int offset_to_l2_slice_index(BDRVQcow2State *s, int64_t offset) return (offset >> s->cluster_bits) & (s->l2_slice_size - 1); } -static inline int64_t align_offset(int64_t offset, int n) -{ - offset = (offset + n - 1) & ~(n - 1); - return offset; -} - static inline int64_t qcow2_vm_state_offset(BDRVQcow2State *s) { return (int64_t)s->l1_vm_state_index << (s->cluster_bits + s->l2_bits); From bfe1a14c180ec44c033be12b9151252ffda69292 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Mon, 5 Mar 2018 15:59:35 +0100 Subject: [PATCH 38/38] block: Fix NULL dereference on empty drive error blk_error_action() sends a BLOCK_IO_ERROR QMP event which includes the node name of its root node. If the BlockBackend represents an empty drive, there is no root node, so we should not try to access its node name. Make the field optional in the event and include it only when the BlockBackend isn't empty. 
Signed-off-by: Kevin Wolf Reviewed-by: Eric Blake --- block/block-backend.c | 5 +++-- qapi/block-core.json | 6 ++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/block/block-backend.c b/block/block-backend.c index a775a3dd2f73..a4421252f852 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -1615,10 +1615,11 @@ static void send_qmp_error_event(BlockBackend *blk, bool is_read, int error) { IoOperationType optype; + BlockDriverState *bs = blk_bs(blk); optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE; - qapi_event_send_block_io_error(blk_name(blk), - bdrv_get_node_name(blk_bs(blk)), optype, + qapi_event_send_block_io_error(blk_name(blk), !!bs, + bs ? bdrv_get_node_name(bs) : NULL, optype, action, blk_iostatus_is_enabled(blk), error == ENOSPC, strerror(error), &error_abort); diff --git a/qapi/block-core.json b/qapi/block-core.json index 5c5921bfb707..00475f08d4e0 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -3676,7 +3676,8 @@ # # @node-name: node name. Note that errors may be reported for the root node # that is directly attached to a guest device rather than for the -# node where the error occurred. (Since: 2.8) +# node where the error occurred. The node name is not present if +# the drive is empty. (Since: 2.8) # # @operation: I/O operation # @@ -3707,7 +3708,8 @@ # ## { 'event': 'BLOCK_IO_ERROR', - 'data': { 'device': 'str', 'node-name': 'str', 'operation': 'IoOperationType', + 'data': { 'device': 'str', '*node-name': 'str', + 'operation': 'IoOperationType', 'action': 'BlockErrorAction', '*nospace': 'bool', 'reason': 'str' } }