Skip to content

Commit a5909ac

Browse files
committed
Prime arc to reduce zil_replay & import times
The time it takes to import a zpool is dominated by the time it takes to replay the zfs intent log (zil) when the zil is large. The zil is replayed serially, and some operations require read-modify-write to occur, for example TX_WRITE and TX_LINK entries. This commit reduces zil_replay times by reading the zil and issuing arc_read requests in parallel using a taskq prior to performing the serial zil_replay. Doing so can reduce pool import times from hours to minutes in cases where the zil has many TX_WRITE and TX_LINK entries. The benefit is particularly acute when the primary pool is stored on high-latency devices, which increases the cost of pool read-modify-write in serial zil_replay. Signed-off-by: Mark Roper <[email protected]>
1 parent b8c73ab commit a5909ac

File tree

17 files changed

+265
-20
lines changed

17 files changed

+265
-20
lines changed

cmd/ztest.c

+27-4
Original file line numberDiff line numberDiff line change
@@ -1261,6 +1261,29 @@ ztest_record_enospc(const char *s)
12611261
ztest_shared->zs_enospc_count++;
12621262
}
12631263

1264+
static zfs_replay_prime_arc_func_t *ztest_replay_prime_vector[TX_MAX_TYPE] = {
1265+
NULL, /* 0 no such transaction type */
1266+
NULL, /* TX_CREATE */
1267+
NULL, /* TX_MKDIR */
1268+
NULL, /* TX_MKXATTR */
1269+
NULL, /* TX_SYMLINK */
1270+
NULL, /* TX_REMOVE */
1271+
NULL, /* TX_RMDIR */
1272+
NULL, /* TX_LINK */
1273+
NULL, /* TX_RENAME */
1274+
NULL, /* TX_WRITE */
1275+
NULL, /* TX_TRUNCATE */
1276+
NULL, /* TX_SETATTR */
1277+
NULL, /* TX_ACL */
1278+
NULL, /* TX_CREATE_ACL */
1279+
NULL, /* TX_CREATE_ATTR */
1280+
NULL, /* TX_CREATE_ACL_ATTR */
1281+
NULL, /* TX_MKDIR_ACL */
1282+
NULL, /* TX_MKDIR_ATTR */
1283+
NULL, /* TX_MKDIR_ACL_ATTR */
1284+
NULL, /* TX_WRITE2 */
1285+
};
1286+
12641287
static uint64_t
12651288
ztest_get_ashift(void)
12661289
{
@@ -3010,7 +3033,7 @@ ztest_zil_remount(ztest_ds_t *zd, uint64_t id)
30103033

30113034
/* zfsvfs_setup() */
30123035
VERIFY3P(zil_open(os, ztest_get_data, NULL), ==, zd->zd_zilog);
3013-
zil_replay(os, zd, ztest_replay_vector);
3036+
zil_replay(os, zd, ztest_replay_vector, ztest_replay_prime_vector);
30143037

30153038
(void) pthread_rwlock_unlock(&zd->zd_zilog_lock);
30163039
mutex_exit(&zd->zd_dirobj_lock);
@@ -4715,7 +4738,7 @@ ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id)
47154738
ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE,
47164739
B_TRUE, FTAG, &os) == 0) {
47174740
ztest_zd_init(zdtmp, NULL, os);
4718-
zil_replay(os, zdtmp, ztest_replay_vector);
4741+
zil_replay(os, zdtmp, ztest_replay_vector, ztest_replay_prime_vector);
47194742
ztest_zd_fini(zdtmp);
47204743
dmu_objset_disown(os, B_TRUE, FTAG);
47214744
}
@@ -7836,7 +7859,7 @@ ztest_dataset_open(int d)
78367859

78377860
ztest_dataset_dirobj_verify(zd);
78387861

7839-
zil_replay(os, zd, ztest_replay_vector);
7862+
zil_replay(os, zd, ztest_replay_vector, ztest_replay_prime_vector);
78407863

78417864
ztest_dataset_dirobj_verify(zd);
78427865

@@ -7883,7 +7906,7 @@ ztest_replay_zil_cb(const char *name, void *arg)
78837906
zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL);
78847907

78857908
ztest_zd_init(zdtmp, NULL, os);
7886-
zil_replay(os, zdtmp, ztest_replay_vector);
7909+
zil_replay(os, zdtmp, ztest_replay_vector, ztest_replay_prime_vector);
78877910
ztest_zd_fini(zdtmp);
78887911

78897912
if (dmu_objset_zil(os)->zl_parse_lr_count != 0 &&

include/os/freebsd/spl/sys/uio.h

+20
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,26 @@ zfs_uio_advance(zfs_uio_t *uio, ssize_t size)
9393
zfs_uio_offset(uio) += size;
9494
}
9595

96+
static inline void
97+
zfs_uio_iovec_init(zfs_uio_t *uio, const struct iovec *iov,
98+
unsigned long nr_segs, offset_t offset, zfs_uio_seg_t seg, ssize_t resid,
99+
size_t skip)
100+
{
101+
ASSERT(seg == UIO_SYSSPACE);
102+
103+
uio->uio_iov = iov;
104+
uio->uio_iovcnt = nr_segs;
105+
uio->uio_loffset = offset;
106+
uio->uio_segflg = seg;
107+
uio->uio_fault_disable = B_FALSE;
108+
uio->uio_fmode = 0;
109+
uio->uio_extflg = 0;
110+
uio->uio_resid = resid;
111+
uio->uio_skip = skip;
112+
uio->uio_soffset = uio->uio_loffset;
113+
memset(&uio->uio_dio, 0, sizeof (zfs_uio_dio_t));
114+
}
115+
96116
static __inline void
97117
zfs_uio_init(zfs_uio_t *uio, struct uio *uio_s)
98118
{

include/os/freebsd/zfs/sys/zfs_znode_impl.h

-2
Original file line numberDiff line numberDiff line change
@@ -176,8 +176,6 @@ extern void zfs_tstamp_update_setup_ext(struct znode *,
176176
uint_t, uint64_t [2], uint64_t [2], boolean_t have_tx);
177177
extern void zfs_znode_free(struct znode *);
178178

179-
extern zil_replay_func_t *const zfs_replay_vector[TX_MAX_TYPE];
180-
181179
extern int zfs_znode_parent_and_name(struct znode *zp, struct znode **dzpp,
182180
char *buf, uint64_t buflen);
183181

include/os/linux/zfs/sys/zfs_znode_impl.h

-1
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,6 @@ extern int zfs_inode_alloc(struct super_block *, struct inode **ip);
159159
extern void zfs_inode_destroy(struct inode *);
160160
extern void zfs_mark_inode_dirty(struct inode *);
161161
extern boolean_t zfs_relatime_need_update(const struct inode *);
162-
extern zil_replay_func_t *const zfs_replay_vector[TX_MAX_TYPE];
163162

164163
#ifdef __cplusplus
165164
}

include/sys/dsl_pool.h

+2
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ extern uint_t zfs_dirty_data_max_percent;
6464
extern uint_t zfs_dirty_data_max_max_percent;
6565
extern uint_t zfs_delay_min_dirty_percent;
6666
extern uint64_t zfs_delay_scale;
67+
extern int zfs_zil_replay_prime_arc;
6768

6869
/* These macros are for indexing into the zfs_all_blkstats_t. */
6970
#define DMU_OT_DEFERRED DMU_OT_NONE
@@ -137,6 +138,7 @@ typedef struct dsl_pool {
137138
txg_list_t dp_early_sync_tasks;
138139
taskq_t *dp_sync_taskq;
139140
taskq_t *dp_zil_clean_taskq;
141+
taskq_t *dp_zil_prime_taskq;
140142

141143
/*
142144
* Protects administrative changes (properties, namespace)

include/sys/zil.h

+13-1
Original file line numberDiff line numberDiff line change
@@ -598,9 +598,21 @@ extern zilog_t *zil_open(objset_t *os, zil_get_data_t *get_data,
598598
zil_sums_t *zil_sums);
599599
extern void zil_close(zilog_t *zilog);
600600

601+
extern zil_replay_func_t *const zfs_replay_vector[TX_MAX_TYPE];
602+
typedef void zfs_replay_prime_arc_func_t(void *args);
603+
extern zfs_replay_prime_arc_func_t *const zfs_replay_prime_vector[TX_MAX_TYPE];
604+
typedef struct zil_replay_arg {
605+
zil_replay_func_t *const *zr_replay;
606+
zfs_replay_prime_arc_func_t *const *zr_replay_prime;
607+
void *zr_arg;
608+
boolean_t zr_byteswap;
609+
char *zr_lr;
610+
} zil_replay_arg_t;
601611
extern boolean_t zil_replay(objset_t *os, void *arg,
602-
zil_replay_func_t *const replay_func[TX_MAX_TYPE]);
612+
zil_replay_func_t *const replay_func[TX_MAX_TYPE],
613+
zfs_replay_prime_arc_func_t *const replay_prime_func[TX_MAX_TYPE]);
603614
extern boolean_t zil_replaying(zilog_t *zilog, dmu_tx_t *tx);
615+
604616
extern boolean_t zil_destroy(zilog_t *zilog, boolean_t keep_first);
605617
extern void zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx);
606618

include/sys/zvol_impl.h

+1
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ extern krwlock_t zvol_state_lock;
6565
extern struct hlist_head *zvol_htable;
6666
#define ZVOL_HT_HEAD(hash) (&zvol_htable[(hash) & (ZVOL_HT_SIZE-1)])
6767
extern zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE];
68+
extern zfs_replay_prime_arc_func_t *const zvol_replay_prime_vector[TX_MAX_TYPE];
6869

6970
extern unsigned int zvol_volmode;
7071
extern unsigned int zvol_inhibit_dev;

man/man4/zfs.4

+9
Original file line numberDiff line numberDiff line change
@@ -2397,6 +2397,15 @@ The default value of
23972397
.Sy 100%
23982398
will create a maximum of one thread per cpu.
23992399
.
2400+
.It Sy zfs_zil_replay_prime_arc Ns = Ns Sy 0 Ns | Ns 1 Pq int
2401+
Controls whether zil_replay will read the zil and in parallel issue
2402+
zfs_read to prime the arc cache prior to performing real zil replay,
2403+
which is serial. Priming before replay can reduce zpool_import latency
2404+
by reducing zil_replay time for high latency pools. It does this by
2405+
eliminating serial read-modify-write cycles in zil_replay.
2406+
The value of 1 will perform arc_priming prior to zil_replay.
2407+
The default value of 0 will not perform arc priming prior to zil_replay.
2408+
.
24002409
.It Sy zil_maxblocksize Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq uint
24012410
This sets the maximum block size used by the ZIL.
24022411
On very fragmented pools, lowering this

module/.tmp_23387/tmp

936 Bytes
Binary file not shown.

module/os/freebsd/zfs/zfs_vfsops.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -1123,7 +1123,7 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
11231123
zfsvfs->z_use_namecache = B_FALSE;
11241124
zfsvfs->z_replay = B_TRUE;
11251125
zil_replay(zfsvfs->z_os, zfsvfs,
1126-
zfs_replay_vector);
1126+
zfs_replay_vector, zfs_replay_prime_vector);
11271127
zfsvfs->z_replay = B_FALSE;
11281128
zfsvfs->z_use_namecache = use_nc;
11291129
}

module/os/freebsd/zfs/zvol_os.c

+2-1
Original file line numberDiff line numberDiff line change
@@ -1488,7 +1488,8 @@ zvol_os_create_minor(const char *name)
14881488
if (zil_replay_disable)
14891489
replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE);
14901490
else
1491-
replayed_zil = zil_replay(os, zv, zvol_replay_vector);
1491+
replayed_zil = zil_replay(os, zv, zvol_replay_vector,
1492+
zvol_replay_prime_vector);
14921493
}
14931494
if (replayed_zil)
14941495
zil_close(zv->zv_zilog);

module/os/linux/zfs/zfs_vfsops.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -928,7 +928,7 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
928928
} else {
929929
zfsvfs->z_replay = B_TRUE;
930930
zil_replay(zfsvfs->z_os, zfsvfs,
931-
zfs_replay_vector);
931+
zfs_replay_vector, zfs_replay_prime_vector);
932932
zfsvfs->z_replay = B_FALSE;
933933
}
934934
}

module/os/linux/zfs/zvol_os.c

+2-1
Original file line numberDiff line numberDiff line change
@@ -1700,7 +1700,8 @@ zvol_os_create_minor(const char *name)
17001700
if (zil_replay_disable)
17011701
replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE);
17021702
else
1703-
replayed_zil = zil_replay(os, zv, zvol_replay_vector);
1703+
replayed_zil = zil_replay(os, zv, zvol_replay_vector,
1704+
zvol_replay_prime_vector);
17041705
}
17051706
if (replayed_zil)
17061707
zil_close(zv->zv_zilog);

module/zfs/dsl_pool.c

+5
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,7 @@ uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000;
169169
static int zfs_zil_clean_taskq_nthr_pct = 100;
170170
static int zfs_zil_clean_taskq_minalloc = 1024;
171171
static int zfs_zil_clean_taskq_maxalloc = 1024 * 1024;
172+
int zfs_zil_replay_prime_arc = 0;
172173

173174
int
174175
dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
@@ -217,6 +218,10 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
217218
zfs_zil_clean_taskq_maxalloc,
218219
TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT);
219220

221+
dp->dp_zil_prime_taskq = taskq_create("dp_zil_prime_taskq",
222+
100, minclsyspri, boot_ncpus, boot_ncpus * 2,
223+
TASKQ_THREADS_CPU_PCT);
224+
220225
mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
221226
cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);
222227

module/zfs/zfs_replay.c

+106
Original file line numberDiff line numberDiff line change
@@ -676,6 +676,39 @@ zfs_replay_link(void *arg1, void *arg2, boolean_t byteswap)
676676
return (error);
677677
}
678678

679+
static void
680+
zfs_replay_prime_link(void *args)
681+
{
682+
zil_replay_arg_t *zr = args;
683+
lr_link_t *lr;
684+
zfsvfs_t *zfsvfs;
685+
znode_t *dzp, *zp;
686+
boolean_t byteswap;
687+
int error;
688+
689+
zfsvfs = (zfsvfs_t *)zr->zr_arg;
690+
lr = (lr_link_t *)zr->zr_lr;
691+
byteswap = (boolean_t)zr->zr_byteswap;
692+
if (byteswap)
693+
byteswap_uint64_array(lr, sizeof (*lr));
694+
695+
if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) {
696+
cmn_err(CE_WARN, "Failed to get znode for link dir "
697+
"during replay prime: %d", error);
698+
return;
699+
}
700+
701+
if ((error = zfs_zget(zfsvfs, lr->lr_link_obj, &zp)) != 0) {
702+
cmn_err(CE_WARN, "Failed to get znode for link "
703+
"during replay prime: %d", error);
704+
zrele(dzp);
705+
return;
706+
}
707+
708+
zrele(zp);
709+
zrele(dzp);
710+
}
711+
679712
static int
680713
do_zfs_replay_rename(zfsvfs_t *zfsvfs, _lr_rename_t *lr, char *sname,
681714
char *tname, uint64_t rflags, vattr_t *wo_vap)
@@ -869,6 +902,52 @@ zfs_replay_write(void *arg1, void *arg2, boolean_t byteswap)
869902
return (error);
870903
}
871904

905+
static void
906+
zfs_replay_prime_write(void *args)
907+
{
908+
fstrans_cookie_t cookie;
909+
zil_replay_arg_t *zr = args;
910+
zfsvfs_t *zfsvfs;
911+
lr_write_t *lr;
912+
znode_t *zp;
913+
uint64_t length;
914+
uint64_t offset;
915+
char *data;
916+
struct iovec iov;
917+
zfs_uio_t uio;
918+
boolean_t byteswap;
919+
920+
zfsvfs = (zfsvfs_t *)zr->zr_arg;
921+
lr = (lr_write_t *)zr->zr_lr;
922+
byteswap = (boolean_t)zr->zr_byteswap;
923+
if (byteswap)
924+
byteswap_uint64_array(lr, sizeof (*lr));
925+
926+
length = lr->lr_length % zfsvfs->z_max_blksz;
927+
if (length == 0)
928+
goto read_task_done;
929+
930+
offset = lr->lr_offset + (lr->lr_length - length);
931+
data = (char *)(lr + 1);
932+
iov.iov_base = (void *)data;
933+
iov.iov_len = length;
934+
zfs_uio_iovec_init(&uio, &iov, 1, offset, UIO_SYSSPACE, length, 0);
935+
936+
if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0)
937+
goto read_task_done;
938+
939+
cookie = spl_fstrans_mark();
940+
// Call zfs_read with the provided arguments
941+
zfs_read(zp, &uio, /* ioflags */ 0, kcred);
942+
spl_fstrans_unmark(cookie);
943+
944+
// Free the allocated memory
945+
zrele(zp);
946+
read_task_done:
947+
vmem_free(zr->zr_lr, sizeof (lr_write_t) + lr->lr_length);
948+
kmem_free(zr, sizeof (zil_replay_arg_t));
949+
}
950+
872951
/*
873952
* TX_WRITE2 are only generated when dmu_sync() returns EALREADY
874953
* meaning the pool block is already being synced. So now that we always write
@@ -1262,3 +1341,30 @@ zil_replay_func_t *const zfs_replay_vector[TX_MAX_TYPE] = {
12621341
zfs_replay_rename_whiteout, /* TX_RENAME_WHITEOUT */
12631342
zfs_replay_clone_range, /* TX_CLONE_RANGE */
12641343
};
1344+
1345+
/*
1346+
* Callback vectors for priming the arc for zil records
1347+
*/
1348+
zfs_replay_prime_arc_func_t *const zfs_replay_prime_vector[TX_MAX_TYPE] = {
1349+
NULL, /* no such type */
1350+
NULL, /* TX_CREATE */
1351+
NULL, /* TX_MKDIR */
1352+
NULL, /* TX_MKXATTR */
1353+
NULL, /* TX_SYMLINK */
1354+
NULL, /* TX_REMOVE */
1355+
NULL, /* TX_RMDIR */
1356+
zfs_replay_prime_link, /* TX_LINK */
1357+
NULL, /* TX_RENAME */
1358+
zfs_replay_prime_write, /* TX_WRITE */
1359+
NULL, /* TX_TRUNCATE */
1360+
NULL, /* TX_SETATTR */
1361+
NULL, /* TX_ACL_V0 */
1362+
NULL, /* TX_ACL */
1363+
NULL, /* TX_CREATE_ACL */
1364+
NULL, /* TX_CREATE_ATTR */
1365+
NULL, /* TX_CREATE_ACL_ATTR */
1366+
NULL, /* TX_MKDIR_ACL */
1367+
NULL, /* TX_MKDIR_ATTR */
1368+
NULL, /* TX_MKDIR_ACL_ATTR */
1369+
NULL, /* TX_WRITE2 */
1370+
};

0 commit comments

Comments
 (0)