diff options
Diffstat (limited to 'fs')
116 files changed, 12407 insertions, 837 deletions
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index c1e9f29c924c..f2d7402abe02 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -1209,6 +1209,8 @@ COMPATIBLE_IOCTL(WDIOC_SETOPTIONS) COMPATIBLE_IOCTL(WDIOC_KEEPALIVE) COMPATIBLE_IOCTL(WDIOC_SETTIMEOUT) COMPATIBLE_IOCTL(WDIOC_GETTIMEOUT) +COMPATIBLE_IOCTL(WDIOC_SETPRETIMEOUT) +COMPATIBLE_IOCTL(WDIOC_GETPRETIMEOUT) /* Big R */ COMPATIBLE_IOCTL(RNDGETENTCNT) COMPATIBLE_IOCTL(RNDADDTOENTCNT) diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 207ba8d627ca..a4b531be9168 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -428,10 +428,10 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, if (!nop || !nop->fh_to_dentry) return ERR_PTR(-ESTALE); result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type); - if (!result) - result = ERR_PTR(-ESTALE); - if (IS_ERR(result)) - return result; + if (PTR_ERR(result) == -ENOMEM) + return ERR_CAST(result); + if (IS_ERR_OR_NULL(result)) + return ERR_PTR(-ESTALE); if (d_is_dir(result)) { /* @@ -541,6 +541,8 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, err_result: dput(result); + if (err != -ENOMEM) + err = -ESTALE; return ERR_PTR(err); } EXPORT_SYMBOL_GPL(exportfs_decode_fh); diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c index 5f7b053720ee..6de15709d024 100644 --- a/fs/nfs/cache_lib.c +++ b/fs/nfs/cache_lib.c @@ -76,7 +76,7 @@ static void nfs_dns_cache_revisit(struct cache_deferred_req *d, int toomany) dreq = container_of(d, struct nfs_cache_defer_req, deferred_req); - complete_all(&dreq->completion); + complete(&dreq->completion); nfs_cache_defer_req_put(dreq); } diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 52a28311e2a4..532d8e242d4d 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -31,8 +31,6 @@ struct nfs_callback_data { unsigned int users; struct svc_serv *serv; - struct svc_rqst *rqst; - struct task_struct *task; }; static struct nfs_callback_data 
nfs_callback_info[NFS4_MAX_MINOR_VERSION + 1]; @@ -89,15 +87,6 @@ nfs4_callback_svc(void *vrqstp) return 0; } -/* - * Prepare to bring up the NFSv4 callback service - */ -static struct svc_rqst * -nfs4_callback_up(struct svc_serv *serv) -{ - return svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); -} - #if defined(CONFIG_NFS_V4_1) /* * The callback service for NFSv4.1 callbacks @@ -139,29 +128,6 @@ nfs41_callback_svc(void *vrqstp) return 0; } -/* - * Bring up the NFSv4.1 callback service - */ -static struct svc_rqst * -nfs41_callback_up(struct svc_serv *serv) -{ - struct svc_rqst *rqstp; - - INIT_LIST_HEAD(&serv->sv_cb_list); - spin_lock_init(&serv->sv_cb_lock); - init_waitqueue_head(&serv->sv_cb_waitq); - rqstp = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); - dprintk("--> %s return %d\n", __func__, PTR_ERR_OR_ZERO(rqstp)); - return rqstp; -} - -static void nfs_minorversion_callback_svc_setup(struct svc_serv *serv, - struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp)) -{ - *rqstpp = nfs41_callback_up(serv); - *callback_svc = nfs41_callback_svc; -} - static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, struct svc_serv *serv) { @@ -173,13 +139,6 @@ static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, xprt->bc_serv = serv; } #else -static void nfs_minorversion_callback_svc_setup(struct svc_serv *serv, - struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp)) -{ - *rqstpp = ERR_PTR(-ENOTSUPP); - *callback_svc = ERR_PTR(-ENOTSUPP); -} - static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, struct svc_serv *serv) { @@ -189,45 +148,22 @@ static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt, struct svc_serv *serv) { - struct svc_rqst *rqstp; - int (*callback_svc)(void *vrqstp); - struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; + 
int nrservs = nfs_callback_nr_threads; int ret; nfs_callback_bc_serv(minorversion, xprt, serv); - if (cb_info->task) - return 0; + if (nrservs < NFS4_MIN_NR_CALLBACK_THREADS) + nrservs = NFS4_MIN_NR_CALLBACK_THREADS; - switch (minorversion) { - case 0: - /* v4.0 callback setup */ - rqstp = nfs4_callback_up(serv); - callback_svc = nfs4_callback_svc; - break; - default: - nfs_minorversion_callback_svc_setup(serv, - &rqstp, &callback_svc); - } - - if (IS_ERR(rqstp)) - return PTR_ERR(rqstp); - - svc_sock_update_bufs(serv); + if (serv->sv_nrthreads-1 == nrservs) + return 0; - cb_info->serv = serv; - cb_info->rqst = rqstp; - cb_info->task = kthread_create(callback_svc, cb_info->rqst, - "nfsv4.%u-svc", minorversion); - if (IS_ERR(cb_info->task)) { - ret = PTR_ERR(cb_info->task); - svc_exit_thread(cb_info->rqst); - cb_info->rqst = NULL; - cb_info->task = NULL; + ret = serv->sv_ops->svo_setup(serv, NULL, nrservs); + if (ret) { + serv->sv_ops->svo_setup(serv, NULL, 0); return ret; } - rqstp->rq_task = cb_info->task; - wake_up_process(cb_info->task); dprintk("nfs_callback_up: service started\n"); return 0; } @@ -281,19 +217,41 @@ err_bind: return ret; } -static struct svc_serv_ops nfs_cb_sv_ops = { +static struct svc_serv_ops nfs40_cb_sv_ops = { + .svo_function = nfs4_callback_svc, .svo_enqueue_xprt = svc_xprt_do_enqueue, + .svo_setup = svc_set_num_threads, + .svo_module = THIS_MODULE, +}; +#if defined(CONFIG_NFS_V4_1) +static struct svc_serv_ops nfs41_cb_sv_ops = { + .svo_function = nfs41_callback_svc, + .svo_enqueue_xprt = svc_xprt_do_enqueue, + .svo_setup = svc_set_num_threads, + .svo_module = THIS_MODULE, +}; + +struct svc_serv_ops *nfs4_cb_sv_ops[] = { + [0] = &nfs40_cb_sv_ops, + [1] = &nfs41_cb_sv_ops, +}; +#else +struct svc_serv_ops *nfs4_cb_sv_ops[] = { + [0] = &nfs40_cb_sv_ops, + [1] = NULL, }; +#endif static struct svc_serv *nfs_callback_create_svc(int minorversion) { struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; struct svc_serv *serv; + 
struct svc_serv_ops *sv_ops; /* * Check whether we're already up and running. */ - if (cb_info->task) { + if (cb_info->serv) { /* * Note: increase service usage, because later in case of error * svc_destroy() will be called. @@ -302,6 +260,17 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion) return cb_info->serv; } + switch (minorversion) { + case 0: + sv_ops = nfs4_cb_sv_ops[0]; + break; + default: + sv_ops = nfs4_cb_sv_ops[1]; + } + + if (sv_ops == NULL) + return ERR_PTR(-ENOTSUPP); + /* * Sanity check: if there's no task, * we should be the first user ... @@ -310,11 +279,12 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion) printk(KERN_WARNING "nfs_callback_create_svc: no kthread, %d users??\n", cb_info->users); - serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, &nfs_cb_sv_ops); + serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, sv_ops); if (!serv) { printk(KERN_ERR "nfs_callback_create_svc: create service failed\n"); return ERR_PTR(-ENOMEM); } + cb_info->serv = serv; /* As there is only one thread we need to over-ride the * default maximum of 80 connections */ @@ -357,6 +327,8 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt) * thread exits. 
*/ err_net: + if (!cb_info->users) + cb_info->serv = NULL; svc_destroy(serv); err_create: mutex_unlock(&nfs_callback_mutex); @@ -374,18 +346,18 @@ err_start: void nfs_callback_down(int minorversion, struct net *net) { struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; + struct svc_serv *serv; mutex_lock(&nfs_callback_mutex); - nfs_callback_down_net(minorversion, cb_info->serv, net); + serv = cb_info->serv; + nfs_callback_down_net(minorversion, serv, net); cb_info->users--; - if (cb_info->users == 0 && cb_info->task != NULL) { - kthread_stop(cb_info->task); - dprintk("nfs_callback_down: service stopped\n"); - svc_exit_thread(cb_info->rqst); + if (cb_info->users == 0) { + svc_get(serv); + serv->sv_ops->svo_setup(serv, NULL, 0); + svc_destroy(serv); dprintk("nfs_callback_down: service destroyed\n"); cb_info->serv = NULL; - cb_info->rqst = NULL; - cb_info->task = NULL; } mutex_unlock(&nfs_callback_mutex); } diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index 5fe1cecbf9f0..c701c308fac5 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -179,6 +179,15 @@ extern __be32 nfs4_callback_devicenotify( struct cb_devicenotifyargs *args, void *dummy, struct cb_process_state *cps); +struct cb_notify_lock_args { + struct nfs_fh cbnl_fh; + struct nfs_lowner cbnl_owner; + bool cbnl_valid; +}; + +extern __be32 nfs4_callback_notify_lock(struct cb_notify_lock_args *args, + void *dummy, + struct cb_process_state *cps); #endif /* CONFIG_NFS_V4_1 */ extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *); extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, @@ -198,6 +207,9 @@ extern void nfs_callback_down(int minorversion, struct net *net); #define NFS41_BC_MIN_CALLBACKS 1 #define NFS41_BC_MAX_CALLBACKS 1 +#define NFS4_MIN_NR_CALLBACK_THREADS 1 + extern unsigned int nfs_callback_set_tcpport; +extern unsigned short nfs_callback_nr_threads; #endif /* __LINUX_FS_NFS_CALLBACK_H */ diff --git a/fs/nfs/callback_proc.c 
b/fs/nfs/callback_proc.c index f953ef6b2f2e..e9aa235e9d10 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -628,4 +628,20 @@ out: dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); return status; } + +__be32 nfs4_callback_notify_lock(struct cb_notify_lock_args *args, void *dummy, + struct cb_process_state *cps) +{ + if (!cps->clp) /* set in cb_sequence */ + return htonl(NFS4ERR_OP_NOT_IN_SESSION); + + dprintk_rcu("NFS: CB_NOTIFY_LOCK request from %s\n", + rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + + /* Don't wake anybody if the string looked bogus */ + if (args->cbnl_valid) + __wake_up(&cps->clp->cl_lock_waitq, TASK_NORMAL, 0, args); + + return htonl(NFS4_OK); +} #endif /* CONFIG_NFS_V4_1 */ diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 656f68f7fe53..eb094c6011d8 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -35,6 +35,7 @@ (1 + 3) * 4) // seqid, 3 slotids #define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) #define CB_OP_RECALLSLOT_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) +#define CB_OP_NOTIFY_LOCK_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) #endif /* CONFIG_NFS_V4_1 */ #define NFSDBG_FACILITY NFSDBG_CALLBACK @@ -72,7 +73,7 @@ static int nfs4_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy) return xdr_ressize_check(rqstp, p); } -static __be32 *read_buf(struct xdr_stream *xdr, int nbytes) +static __be32 *read_buf(struct xdr_stream *xdr, size_t nbytes) { __be32 *p; @@ -534,6 +535,49 @@ static __be32 decode_recallslot_args(struct svc_rqst *rqstp, return 0; } +static __be32 decode_lockowner(struct xdr_stream *xdr, struct cb_notify_lock_args *args) +{ + __be32 *p; + unsigned int len; + + p = read_buf(xdr, 12); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_BADXDR); + + p = xdr_decode_hyper(p, &args->cbnl_owner.clientid); + len = be32_to_cpu(*p); + + p = read_buf(xdr, len); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_BADXDR); + + /* Only try to decode if the length is right */ + 
if (len == 20) { + p += 2; /* skip "lock id:" */ + args->cbnl_owner.s_dev = be32_to_cpu(*p++); + xdr_decode_hyper(p, &args->cbnl_owner.id); + args->cbnl_valid = true; + } else { + args->cbnl_owner.s_dev = 0; + args->cbnl_owner.id = 0; + args->cbnl_valid = false; + } + return 0; +} + +static __be32 decode_notify_lock_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_notify_lock_args *args) +{ + __be32 status; + + status = decode_fh(xdr, &args->cbnl_fh); + if (unlikely(status != 0)) + goto out; + status = decode_lockowner(xdr, args); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +} + #endif /* CONFIG_NFS_V4_1 */ static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) @@ -746,6 +790,7 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op) case OP_CB_RECALL_SLOT: case OP_CB_LAYOUTRECALL: case OP_CB_NOTIFY_DEVICEID: + case OP_CB_NOTIFY_LOCK: *op = &callback_ops[op_nr]; break; @@ -753,7 +798,6 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op) case OP_CB_PUSH_DELEG: case OP_CB_RECALLABLE_OBJ_AVAIL: case OP_CB_WANTS_CANCELLED: - case OP_CB_NOTIFY_LOCK: return htonl(NFS4ERR_NOTSUPP); default: @@ -1006,6 +1050,11 @@ static struct callback_op callback_ops[] = { .decode_args = (callback_decode_arg_t)decode_recallslot_args, .res_maxsize = CB_OP_RECALLSLOT_RES_MAXSZ, }, + [OP_CB_NOTIFY_LOCK] = { + .process_op = (callback_process_op_t)nfs4_callback_notify_lock, + .decode_args = (callback_decode_arg_t)decode_notify_lock_args, + .res_maxsize = CB_OP_NOTIFY_LOCK_RES_MAXSZ, + }, #endif /* CONFIG_NFS_V4_1 */ }; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 1e106780a237..7555ba889d1f 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -313,7 +313,10 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat continue; /* Match the full socket address */ if (!rpc_cmp_addr_port(sap, clap)) - continue; + /* Match all 
xprt_switch full socket addresses */ + if (!rpc_clnt_xprt_switch_has_addr(clp->cl_rpcclient, + sap)) + continue; atomic_inc(&clp->cl_count); return clp; @@ -785,7 +788,8 @@ int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs } fsinfo.fattr = fattr; - fsinfo.layouttype = 0; + fsinfo.nlayouttypes = 0; + memset(fsinfo.layouttype, 0, sizeof(fsinfo.layouttype)); error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo); if (error < 0) goto out_error; @@ -1078,7 +1082,7 @@ void nfs_clients_init(struct net *net) idr_init(&nn->cb_ident_idr); #endif spin_lock_init(&nn->nfs_client_lock); - nn->boot_time = CURRENT_TIME; + nn->boot_time = ktime_get_real(); } #ifdef CONFIG_PROC_FS diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 322c2585bc34..dff600ae0d74 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -41,6 +41,17 @@ void nfs_mark_delegation_referenced(struct nfs_delegation *delegation) set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags); } +static bool +nfs4_is_valid_delegation(const struct nfs_delegation *delegation, + fmode_t flags) +{ + if (delegation != NULL && (delegation->type & flags) == flags && + !test_bit(NFS_DELEGATION_REVOKED, &delegation->flags) && + !test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) + return true; + return false; +} + static int nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark) { @@ -50,8 +61,7 @@ nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark) flags &= FMODE_READ|FMODE_WRITE; rcu_read_lock(); delegation = rcu_dereference(NFS_I(inode)->delegation); - if (delegation != NULL && (delegation->type & flags) == flags && - !test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) { + if (nfs4_is_valid_delegation(delegation, flags)) { if (mark) nfs_mark_delegation_referenced(delegation); ret = 1; @@ -185,15 +195,13 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, rcu_read_unlock(); put_rpccred(oldcred); 
trace_nfs4_reclaim_delegation(inode, res->delegation_type); - } else { - /* We appear to have raced with a delegation return. */ - spin_unlock(&delegation->lock); - rcu_read_unlock(); - nfs_inode_set_delegation(inode, cred, res); + return; } - } else { - rcu_read_unlock(); + /* We appear to have raced with a delegation return. */ + spin_unlock(&delegation->lock); } + rcu_read_unlock(); + nfs_inode_set_delegation(inode, cred, res); } static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync) @@ -642,28 +650,49 @@ static void nfs_client_mark_return_unused_delegation_types(struct nfs_client *cl rcu_read_unlock(); } -static void nfs_revoke_delegation(struct inode *inode) +static void nfs_mark_delegation_revoked(struct nfs_server *server, + struct nfs_delegation *delegation) +{ + set_bit(NFS_DELEGATION_REVOKED, &delegation->flags); + delegation->stateid.type = NFS4_INVALID_STATEID_TYPE; + nfs_mark_return_delegation(server, delegation); +} + +static bool nfs_revoke_delegation(struct inode *inode, + const nfs4_stateid *stateid) { struct nfs_delegation *delegation; + nfs4_stateid tmp; + bool ret = false; + rcu_read_lock(); delegation = rcu_dereference(NFS_I(inode)->delegation); - if (delegation != NULL) { - set_bit(NFS_DELEGATION_REVOKED, &delegation->flags); - nfs_mark_return_delegation(NFS_SERVER(inode), delegation); - } + if (delegation == NULL) + goto out; + if (stateid == NULL) { + nfs4_stateid_copy(&tmp, &delegation->stateid); + stateid = &tmp; + } else if (!nfs4_stateid_match(stateid, &delegation->stateid)) + goto out; + nfs_mark_delegation_revoked(NFS_SERVER(inode), delegation); + ret = true; +out: rcu_read_unlock(); + if (ret) + nfs_inode_find_state_and_recover(inode, stateid); + return ret; } -void nfs_remove_bad_delegation(struct inode *inode) +void nfs_remove_bad_delegation(struct inode *inode, + const nfs4_stateid *stateid) { struct nfs_delegation *delegation; - nfs_revoke_delegation(inode); + if 
(!nfs_revoke_delegation(inode, stateid)) + return; delegation = nfs_inode_detach_delegation(inode); - if (delegation) { - nfs_inode_find_state_and_recover(inode, &delegation->stateid); + if (delegation) nfs_free_delegation(delegation); - } } EXPORT_SYMBOL_GPL(nfs_remove_bad_delegation); @@ -786,8 +815,15 @@ static void nfs_delegation_mark_reclaim_server(struct nfs_server *server) { struct nfs_delegation *delegation; - list_for_each_entry_rcu(delegation, &server->delegations, super_list) + list_for_each_entry_rcu(delegation, &server->delegations, super_list) { + /* + * If the delegation may have been admin revoked, then we + * cannot reclaim it. + */ + if (test_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags)) + continue; set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); + } } /** @@ -851,6 +887,141 @@ restart: rcu_read_unlock(); } +static inline bool nfs4_server_rebooted(const struct nfs_client *clp) +{ + return (clp->cl_state & (BIT(NFS4CLNT_CHECK_LEASE) | + BIT(NFS4CLNT_LEASE_EXPIRED) | + BIT(NFS4CLNT_SESSION_RESET))) != 0; +} + +static void nfs_mark_test_expired_delegation(struct nfs_server *server, + struct nfs_delegation *delegation) +{ + if (delegation->stateid.type == NFS4_INVALID_STATEID_TYPE) + return; + clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); + set_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags); + set_bit(NFS4CLNT_DELEGATION_EXPIRED, &server->nfs_client->cl_state); +} + +static void nfs_inode_mark_test_expired_delegation(struct nfs_server *server, + struct inode *inode) +{ + struct nfs_delegation *delegation; + + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + if (delegation) + nfs_mark_test_expired_delegation(server, delegation); + rcu_read_unlock(); + +} + +static void nfs_delegation_mark_test_expired_server(struct nfs_server *server) +{ + struct nfs_delegation *delegation; + + list_for_each_entry_rcu(delegation, &server->delegations, super_list) + 
nfs_mark_test_expired_delegation(server, delegation); +} + +/** + * nfs_mark_test_expired_all_delegations - mark all delegations for testing + * @clp: nfs_client to process + * + * Iterates through all the delegations associated with this server and + * marks them as needing to be checked for validity. + */ +void nfs_mark_test_expired_all_delegations(struct nfs_client *clp) +{ + struct nfs_server *server; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + nfs_delegation_mark_test_expired_server(server); + rcu_read_unlock(); +} + +/** + * nfs_reap_expired_delegations - reap expired delegations + * @clp: nfs_client to process + * + * Iterates through all the delegations associated with this server and + * checks if they have may have been revoked. This function is usually + * expected to be called in cases where the server may have lost its + * lease. + */ +void nfs_reap_expired_delegations(struct nfs_client *clp) +{ + const struct nfs4_minor_version_ops *ops = clp->cl_mvops; + struct nfs_delegation *delegation; + struct nfs_server *server; + struct inode *inode; + struct rpc_cred *cred; + nfs4_stateid stateid; + +restart: + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + list_for_each_entry_rcu(delegation, &server->delegations, + super_list) { + if (test_bit(NFS_DELEGATION_RETURNING, + &delegation->flags)) + continue; + if (test_bit(NFS_DELEGATION_TEST_EXPIRED, + &delegation->flags) == 0) + continue; + if (!nfs_sb_active(server->super)) + continue; + inode = nfs_delegation_grab_inode(delegation); + if (inode == NULL) { + rcu_read_unlock(); + nfs_sb_deactive(server->super); + goto restart; + } + cred = get_rpccred_rcu(delegation->cred); + nfs4_stateid_copy(&stateid, &delegation->stateid); + clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags); + rcu_read_unlock(); + if (cred != NULL && + ops->test_and_free_expired(server, &stateid, cred) < 0) { + nfs_revoke_delegation(inode, 
&stateid); + nfs_inode_find_state_and_recover(inode, &stateid); + } + put_rpccred(cred); + if (nfs4_server_rebooted(clp)) { + nfs_inode_mark_test_expired_delegation(server,inode); + iput(inode); + nfs_sb_deactive(server->super); + return; + } + iput(inode); + nfs_sb_deactive(server->super); + goto restart; + } + } + rcu_read_unlock(); +} + +void nfs_inode_find_delegation_state_and_recover(struct inode *inode, + const nfs4_stateid *stateid) +{ + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + struct nfs_delegation *delegation; + bool found = false; + + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + if (delegation && + nfs4_stateid_match_other(&delegation->stateid, stateid)) { + nfs_mark_test_expired_delegation(NFS_SERVER(inode), delegation); + found = true; + } + rcu_read_unlock(); + if (found) + nfs4_schedule_state_manager(clp); +} + /** * nfs_delegations_present - check for existence of delegations * @clp: client state handle @@ -893,7 +1064,7 @@ bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags, flags &= FMODE_READ|FMODE_WRITE; rcu_read_lock(); delegation = rcu_dereference(nfsi->delegation); - ret = (delegation != NULL && (delegation->type & flags) == flags); + ret = nfs4_is_valid_delegation(delegation, flags); if (ret) { nfs4_stateid_copy(dst, &delegation->stateid); nfs_mark_delegation_referenced(delegation); diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index 64724d252a79..e9d555796873 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -32,6 +32,7 @@ enum { NFS_DELEGATION_REFERENCED, NFS_DELEGATION_RETURNING, NFS_DELEGATION_REVOKED, + NFS_DELEGATION_TEST_EXPIRED, }; int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); @@ -47,11 +48,14 @@ void nfs_expire_unused_delegation_types(struct nfs_client *clp, fmode_t flags); void nfs_expire_unreferenced_delegations(struct nfs_client *clp); int nfs_client_return_marked_delegations(struct nfs_client 
*clp); int nfs_delegations_present(struct nfs_client *clp); -void nfs_remove_bad_delegation(struct inode *inode); +void nfs_remove_bad_delegation(struct inode *inode, const nfs4_stateid *stateid); void nfs_delegation_mark_reclaim(struct nfs_client *clp); void nfs_delegation_reap_unclaimed(struct nfs_client *clp); +void nfs_mark_test_expired_all_delegations(struct nfs_client *clp); +void nfs_reap_expired_delegations(struct nfs_client *clp); + /* NFSv4 delegation-related procedures */ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync); int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid, fmode_t type); @@ -62,6 +66,8 @@ void nfs_mark_delegation_referenced(struct nfs_delegation *delegation); int nfs4_have_delegation(struct inode *inode, fmode_t flags); int nfs4_check_delegation(struct inode *inode, fmode_t flags); bool nfs4_delegation_flush_on_close(const struct inode *inode); +void nfs_inode_find_delegation_state_and_recover(struct inode *inode, + const nfs4_stateid *stateid); #endif diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 06e0bf092ba9..5f1af4cd1a33 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -435,11 +435,11 @@ int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry) return 0; nfsi = NFS_I(inode); - if (entry->fattr->fileid == nfsi->fileid) - return 1; - if (nfs_compare_fh(entry->fh, &nfsi->fh) == 0) - return 1; - return 0; + if (entry->fattr->fileid != nfsi->fileid) + return 0; + if (entry->fh->size && nfs_compare_fh(entry->fh, &nfsi->fh) != 0) + return 0; + return 1; } static @@ -496,6 +496,14 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) return; if (!(entry->fattr->valid & NFS_ATTR_FATTR_FSID)) return; + if (filename.len == 0) + return; + /* Validate that the name doesn't contain any illegal '\0' */ + if (strnlen(filename.name, filename.len) != filename.len) + return; + /* ...or '/' */ + if 
(strnchr(filename.name, filename.len, '/')) + return; if (filename.name[0] == '.') { if (filename.len == 1) return; @@ -517,6 +525,8 @@ again: &entry->fattr->fsid)) goto out; if (nfs_same_file(dentry, entry)) { + if (!entry->fh->size) + goto out; nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); status = nfs_refresh_inode(d_inode(dentry), entry->fattr); if (!status) @@ -529,6 +539,10 @@ again: goto again; } } + if (!entry->fh->size) { + d_lookup_done(dentry); + goto out; + } inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr, entry->label); alias = d_splice_alias(inode, dentry); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 72b7d13ee3c6..bd81bcf3ffcf 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -387,7 +387,7 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq) dreq->iocb->ki_complete(dreq->iocb, res, 0); } - complete_all(&dreq->completion); + complete(&dreq->completion); nfs_direct_req_release(dreq); } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 2efbdde36c3e..9ea85ae23c32 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -520,7 +520,9 @@ const struct address_space_operations nfs_file_aops = { .invalidatepage = nfs_invalidate_page, .releasepage = nfs_release_page, .direct_IO = nfs_direct_IO, +#ifdef CONFIG_MIGRATION .migratepage = nfs_migrate_page, +#endif .launder_page = nfs_launder_page, .is_dirty_writeback = nfs_check_dirty_writeback, .error_remove_page = generic_error_remove_page, @@ -685,11 +687,6 @@ out_noconflict: goto out; } -static int do_vfs_lock(struct file *file, struct file_lock *fl) -{ - return locks_lock_file_wait(file, fl); -} - static int do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) { @@ -722,7 +719,7 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) if (!is_local) status = NFS_PROTO(inode)->lock(filp, cmd, fl); else - status = do_vfs_lock(filp, fl); + status = locks_lock_file_wait(filp, fl); return status; } @@ -747,7 +744,7 @@ do_setlk(struct file 
*filp, int cmd, struct file_lock *fl, int is_local) if (!is_local) status = NFS_PROTO(inode)->lock(filp, cmd, fl); else - status = do_vfs_lock(filp, fl); + status = locks_lock_file_wait(filp, fl); if (status < 0) goto out; diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 51b51369704c..98ace127bf86 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1080,7 +1080,7 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, case -NFS4ERR_BAD_STATEID: if (state == NULL) break; - nfs_remove_bad_delegation(state->inode); + nfs_remove_bad_delegation(state->inode, NULL); case -NFS4ERR_OPENMODE: if (state == NULL) break; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index a6acce663219..80bcc0befb07 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -534,12 +534,9 @@ void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo) } #endif - #ifdef CONFIG_MIGRATION extern int nfs_migrate_page(struct address_space *, struct page *, struct page *, enum migrate_mode); -#else -#define nfs_migrate_page NULL #endif static inline int @@ -562,7 +559,6 @@ void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo, extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq); /* nfs4proc.c */ -extern void __nfs4_read_done_cb(struct nfs_pgio_header *); extern struct nfs_client *nfs4_init_client(struct nfs_client *clp, const struct nfs_client_initdata *); extern int nfs40_walk_client_list(struct nfs_client *clp, @@ -571,6 +567,9 @@ extern int nfs40_walk_client_list(struct nfs_client *clp, extern int nfs41_walk_client_list(struct nfs_client *clp, struct nfs_client **result, struct rpc_cred *cred); +extern int nfs4_test_session_trunk(struct rpc_clnt *, + struct rpc_xprt *, + void *); static inline struct inode *nfs_igrab_and_active(struct inode *inode) { diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h index f0e06e4acbef..fbce0d885d4c 100644 --- 
a/fs/nfs/netns.h +++ b/fs/nfs/netns.h @@ -29,7 +29,7 @@ struct nfs_net { int cb_users[NFS4_MAX_MINOR_VERSION + 1]; #endif spinlock_t nfs_client_lock; - struct timespec boot_time; + ktime_t boot_time; #ifdef CONFIG_PROC_FS struct proc_dir_entry *proc_nfsfs; #endif diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 64b43b4ad9dd..608501971fe0 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -443,6 +443,7 @@ int nfs42_proc_layoutstats_generic(struct nfs_server *server, task = rpc_run_task(&task_setup); if (IS_ERR(task)) return PTR_ERR(task); + rpc_put_task(task); return 0; } diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 9bf64eacba5b..9b3a82abab07 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -39,6 +39,7 @@ enum nfs4_client_state { NFS4CLNT_BIND_CONN_TO_SESSION, NFS4CLNT_MOVED, NFS4CLNT_LEASE_MOVED, + NFS4CLNT_DELEGATION_EXPIRED, }; #define NFS4_RENEW_TIMEOUT 0x01 @@ -57,8 +58,11 @@ struct nfs4_minor_version_ops { struct nfs_fsinfo *); void (*free_lock_state)(struct nfs_server *, struct nfs4_lock_state *); + int (*test_and_free_expired)(struct nfs_server *, + nfs4_stateid *, struct rpc_cred *); struct nfs_seqid * (*alloc_seqid)(struct nfs_seqid_counter *, gfp_t); + int (*session_trunk)(struct rpc_clnt *, struct rpc_xprt *, void *); const struct rpc_call_ops *call_sync_ops; const struct nfs4_state_recovery_ops *reboot_recovery_ops; const struct nfs4_state_recovery_ops *nograce_recovery_ops; @@ -156,6 +160,7 @@ enum { NFS_STATE_RECLAIM_NOGRACE, /* OPEN stateid needs to recover state */ NFS_STATE_POSIX_LOCKS, /* Posix locks are supported */ NFS_STATE_RECOVERY_FAILED, /* OPEN stateid state recovery failed */ + NFS_STATE_MAY_NOTIFY_LOCK, /* server may CB_NOTIFY_LOCK */ }; struct nfs4_state { @@ -203,6 +208,11 @@ struct nfs4_state_recovery_ops { struct rpc_cred *); }; +struct nfs4_add_xprt_data { + struct nfs_client *clp; + struct rpc_cred *cred; +}; + struct nfs4_state_maintenance_ops { int (*sched_state_renewal)(struct nfs_client *, 
struct rpc_cred *, unsigned); struct rpc_cred * (*get_state_renewal_cred_locked)(struct nfs_client *); @@ -278,6 +288,8 @@ extern int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo); extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync); +extern int nfs4_detect_session_trunking(struct nfs_client *clp, + struct nfs41_exchange_id_res *res, struct rpc_xprt *xprt); static inline bool is_ds_only_client(struct nfs_client *clp) @@ -439,7 +451,7 @@ extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp); extern int nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *); extern int nfs4_schedule_migration_recovery(const struct nfs_server *); extern void nfs4_schedule_lease_moved_recovery(struct nfs_client *); -extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags); +extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags, bool); extern void nfs41_handle_server_scope(struct nfs_client *, struct nfs41_server_scope **); extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); @@ -471,6 +483,7 @@ extern struct nfs_subversion nfs_v4; struct dentry *nfs4_try_mount(int, const char *, struct nfs_mount_info *, struct nfs_subversion *); extern bool nfs4_disable_idmapping; extern unsigned short max_session_slots; +extern unsigned short max_session_cb_slots; extern unsigned short send_implementation_id; extern bool recover_lost_locks; diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index cd3b7cfdde16..074ac7131459 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -199,6 +199,9 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init) clp->cl_minorversion = cl_init->minorversion; clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion]; clp->cl_mig_gen = 1; +#if IS_ENABLED(CONFIG_NFS_V4_1) + init_waitqueue_head(&clp->cl_lock_waitq); +#endif return clp; error: @@ -562,15 +565,15 @@ out: /* * Returns 
true if the client IDs match */ -static bool nfs4_match_clientids(struct nfs_client *a, struct nfs_client *b) +static bool nfs4_match_clientids(u64 a, u64 b) { - if (a->cl_clientid != b->cl_clientid) { + if (a != b) { dprintk("NFS: --> %s client ID %llx does not match %llx\n", - __func__, a->cl_clientid, b->cl_clientid); + __func__, a, b); return false; } dprintk("NFS: --> %s client ID %llx matches %llx\n", - __func__, a->cl_clientid, b->cl_clientid); + __func__, a, b); return true; } @@ -578,17 +581,15 @@ static bool nfs4_match_clientids(struct nfs_client *a, struct nfs_client *b) * Returns true if the server major ids match */ static bool -nfs4_check_clientid_trunking(struct nfs_client *a, struct nfs_client *b) +nfs4_check_serverowner_major_id(struct nfs41_server_owner *o1, + struct nfs41_server_owner *o2) { - struct nfs41_server_owner *o1 = a->cl_serverowner; - struct nfs41_server_owner *o2 = b->cl_serverowner; - if (o1->major_id_sz != o2->major_id_sz) goto out_major_mismatch; if (memcmp(o1->major_id, o2->major_id, o1->major_id_sz) != 0) goto out_major_mismatch; - dprintk("NFS: --> %s server owners match\n", __func__); + dprintk("NFS: --> %s server owner major IDs match\n", __func__); return true; out_major_mismatch: @@ -597,6 +598,100 @@ out_major_mismatch: return false; } +/* + * Returns true if server minor ids match + */ +static bool +nfs4_check_serverowner_minor_id(struct nfs41_server_owner *o1, + struct nfs41_server_owner *o2) +{ + /* Check eir_server_owner so_minor_id */ + if (o1->minor_id != o2->minor_id) + goto out_minor_mismatch; + + dprintk("NFS: --> %s server owner minor IDs match\n", __func__); + return true; + +out_minor_mismatch: + dprintk("NFS: --> %s server owner minor IDs do not match\n", __func__); + return false; +} + +/* + * Returns true if the server scopes match + */ +static bool +nfs4_check_server_scope(struct nfs41_server_scope *s1, + struct nfs41_server_scope *s2) +{ + if (s1->server_scope_sz != s2->server_scope_sz) + goto 
out_scope_mismatch; + if (memcmp(s1->server_scope, s2->server_scope, + s1->server_scope_sz) != 0) + goto out_scope_mismatch; + + dprintk("NFS: --> %s server scopes match\n", __func__); + return true; + +out_scope_mismatch: + dprintk("NFS: --> %s server scopes do not match\n", + __func__); + return false; +} + +/** + * nfs4_detect_session_trunking - Checks for session trunking. + * + * Called after a successful EXCHANGE_ID on a multi-addr connection. + * Upon success, add the transport. + * + * @clp: original mount nfs_client + * @res: result structure from an exchange_id using the original mount + * nfs_client with a new multi_addr transport + * + * Returns zero on success, otherwise -EINVAL + * + * Note: since the exchange_id for the new multi_addr transport uses the + * same nfs_client from the original mount, the cl_owner_id is reused, + * so eir_clientowner is the same. + */ +int nfs4_detect_session_trunking(struct nfs_client *clp, + struct nfs41_exchange_id_res *res, + struct rpc_xprt *xprt) +{ + /* Check eir_clientid */ + if (!nfs4_match_clientids(clp->cl_clientid, res->clientid)) + goto out_err; + + /* Check eir_server_owner so_major_id */ + if (!nfs4_check_serverowner_major_id(clp->cl_serverowner, + res->server_owner)) + goto out_err; + + /* Check eir_server_owner so_minor_id */ + if (!nfs4_check_serverowner_minor_id(clp->cl_serverowner, + res->server_owner)) + goto out_err; + + /* Check eir_server_scope */ + if (!nfs4_check_server_scope(clp->cl_serverscope, res->server_scope)) + goto out_err; + + /* Session trunking passed, add the xprt */ + rpc_clnt_xprt_switch_add_xprt(clp->cl_rpcclient, xprt); + + pr_info("NFS: %s: Session trunking succeeded for %s\n", + clp->cl_hostname, + xprt->address_strings[RPC_DISPLAY_ADDR]); + + return 0; +out_err: + pr_info("NFS: %s: Session trunking failed for %s\n", clp->cl_hostname, + xprt->address_strings[RPC_DISPLAY_ADDR]); + + return -EINVAL; +} + /** * nfs41_walk_client_list - Find nfs_client that matches a client/server 
owner * @@ -650,7 +745,7 @@ int nfs41_walk_client_list(struct nfs_client *new, if (pos->cl_cons_state != NFS_CS_READY) continue; - if (!nfs4_match_clientids(pos, new)) + if (!nfs4_match_clientids(pos->cl_clientid, new->cl_clientid)) continue; /* @@ -658,7 +753,8 @@ int nfs41_walk_client_list(struct nfs_client *new, * client id trunking. In either case, we want to fall back * to using the existing nfs_client. */ - if (!nfs4_check_clientid_trunking(pos, new)) + if (!nfs4_check_serverowner_major_id(pos->cl_serverowner, + new->cl_serverowner)) continue; /* Unlike NFSv4.0, we know that NFSv4.1 always uses the diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 0e327528a3ce..ad917bd72b38 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -99,8 +99,8 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, #ifdef CONFIG_NFS_V4_1 static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *, struct rpc_cred *); -static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *, - struct rpc_cred *); +static int nfs41_free_stateid(struct nfs_server *, const nfs4_stateid *, + struct rpc_cred *, bool); #endif #ifdef CONFIG_NFS_V4_SECURITY_LABEL @@ -328,6 +328,33 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent kunmap_atomic(start); } +static void nfs4_test_and_free_stateid(struct nfs_server *server, + nfs4_stateid *stateid, + struct rpc_cred *cred) +{ + const struct nfs4_minor_version_ops *ops = server->nfs_client->cl_mvops; + + ops->test_and_free_expired(server, stateid, cred); +} + +static void __nfs4_free_revoked_stateid(struct nfs_server *server, + nfs4_stateid *stateid, + struct rpc_cred *cred) +{ + stateid->type = NFS4_REVOKED_STATEID_TYPE; + nfs4_test_and_free_stateid(server, stateid, cred); +} + +static void nfs4_free_revoked_stateid(struct nfs_server *server, + const nfs4_stateid *stateid, + struct rpc_cred *cred) +{ + nfs4_stateid tmp; + + nfs4_stateid_copy(&tmp, stateid); + 
__nfs4_free_revoked_stateid(server, &tmp, cred); +} + static long nfs4_update_delay(long *timeout) { long ret; @@ -370,13 +397,23 @@ static int nfs4_do_handle_exception(struct nfs_server *server, exception->delay = 0; exception->recovering = 0; exception->retry = 0; + + if (stateid == NULL && state != NULL) + stateid = &state->stateid; + switch(errorcode) { case 0: return 0; - case -NFS4ERR_OPENMODE: case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_EXPIRED: case -NFS4ERR_BAD_STATEID: + if (inode != NULL && stateid != NULL) { + nfs_inode_find_state_and_recover(inode, + stateid); + goto wait_on_recovery; + } + case -NFS4ERR_OPENMODE: if (inode) { int err; @@ -395,12 +432,6 @@ static int nfs4_do_handle_exception(struct nfs_server *server, if (ret < 0) break; goto wait_on_recovery; - case -NFS4ERR_EXPIRED: - if (state != NULL) { - ret = nfs4_schedule_stateid_recovery(server, state); - if (ret < 0) - break; - } case -NFS4ERR_STALE_STATEID: case -NFS4ERR_STALE_CLIENTID: nfs4_schedule_lease_recovery(clp); @@ -616,6 +647,7 @@ int nfs40_setup_sequence(struct nfs4_slot_table *tbl, } spin_unlock(&tbl->slot_tbl_lock); + slot->privileged = args->sa_privileged ? 
1 : 0; args->sa_slot = slot; res->sr_slot = slot; @@ -723,12 +755,20 @@ static int nfs41_sequence_process(struct rpc_task *task, /* Check the SEQUENCE operation status */ switch (res->sr_status) { case 0: + /* If previous op on slot was interrupted and we reused + * the seq# and got a reply from the cache, then retry + */ + if (task->tk_status == -EREMOTEIO && interrupted) { + ++slot->seq_nr; + goto retry_nowait; + } /* Update the slot's sequence and clientid lease timer */ slot->seq_done = 1; clp = session->clp; do_renew_lease(clp, res->sr_timestamp); /* Check sequence flags */ - nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); + nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags, + !!slot->privileged); nfs41_update_target_slotid(slot->table, slot, res); break; case 1: @@ -875,6 +915,7 @@ int nfs41_setup_sequence(struct nfs4_session *session, } spin_unlock(&tbl->slot_tbl_lock); + slot->privileged = args->sa_privileged ? 1 : 0; args->sa_slot = slot; dprintk("<-- %s slotid=%u seqid=%u\n", __func__, @@ -1353,6 +1394,19 @@ static void update_open_stateflags(struct nfs4_state *state, fmode_t fmode) nfs4_state_set_mode_locked(state, state->state | fmode); } +#ifdef CONFIG_NFS_V4_1 +static bool nfs_open_stateid_recover_openmode(struct nfs4_state *state) +{ + if (state->n_rdonly && !test_bit(NFS_O_RDONLY_STATE, &state->flags)) + return true; + if (state->n_wronly && !test_bit(NFS_O_WRONLY_STATE, &state->flags)) + return true; + if (state->n_rdwr && !test_bit(NFS_O_RDWR_STATE, &state->flags)) + return true; + return false; +} +#endif /* CONFIG_NFS_V4_1 */ + static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state) { struct nfs_client *clp = state->owner->so_server->nfs_client; @@ -1369,11 +1423,12 @@ static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state) } static bool nfs_need_update_open_stateid(struct nfs4_state *state, - nfs4_stateid *stateid) + const nfs4_stateid *stateid, nfs4_stateid *freeme) { if 
(test_and_set_bit(NFS_OPEN_STATE, &state->flags) == 0) return true; if (!nfs4_stateid_match_other(stateid, &state->open_stateid)) { + nfs4_stateid_copy(freeme, &state->open_stateid); nfs_test_and_clear_all_open_stateid(state); return true; } @@ -1437,7 +1492,9 @@ static void nfs_clear_open_stateid(struct nfs4_state *state, nfs4_schedule_state_manager(state->owner->so_server->nfs_client); } -static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) +static void nfs_set_open_stateid_locked(struct nfs4_state *state, + const nfs4_stateid *stateid, fmode_t fmode, + nfs4_stateid *freeme) { switch (fmode) { case FMODE_READ: @@ -1449,14 +1506,18 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid * case FMODE_READ|FMODE_WRITE: set_bit(NFS_O_RDWR_STATE, &state->flags); } - if (!nfs_need_update_open_stateid(state, stateid)) + if (!nfs_need_update_open_stateid(state, stateid, freeme)) return; if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) nfs4_stateid_copy(&state->stateid, stateid); nfs4_stateid_copy(&state->open_stateid, stateid); } -static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, const nfs4_stateid *deleg_stateid, fmode_t fmode) +static void __update_open_stateid(struct nfs4_state *state, + const nfs4_stateid *open_stateid, + const nfs4_stateid *deleg_stateid, + fmode_t fmode, + nfs4_stateid *freeme) { /* * Protect the call to nfs4_state_set_mode_locked and @@ -1469,16 +1530,22 @@ static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_s set_bit(NFS_DELEGATED_STATE, &state->flags); } if (open_stateid != NULL) - nfs_set_open_stateid_locked(state, open_stateid, fmode); + nfs_set_open_stateid_locked(state, open_stateid, fmode, freeme); write_sequnlock(&state->seqlock); update_open_stateflags(state, fmode); spin_unlock(&state->owner->so_lock); } -static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, 
nfs4_stateid *delegation, fmode_t fmode) +static int update_open_stateid(struct nfs4_state *state, + const nfs4_stateid *open_stateid, + const nfs4_stateid *delegation, + fmode_t fmode) { + struct nfs_server *server = NFS_SERVER(state->inode); + struct nfs_client *clp = server->nfs_client; struct nfs_inode *nfsi = NFS_I(state->inode); struct nfs_delegation *deleg_cur; + nfs4_stateid freeme = {0}; int ret = 0; fmode &= (FMODE_READ|FMODE_WRITE); @@ -1500,7 +1567,8 @@ static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stat goto no_delegation_unlock; nfs_mark_delegation_referenced(deleg_cur); - __update_open_stateid(state, open_stateid, &deleg_cur->stateid, fmode); + __update_open_stateid(state, open_stateid, &deleg_cur->stateid, + fmode, &freeme); ret = 1; no_delegation_unlock: spin_unlock(&deleg_cur->lock); @@ -1508,11 +1576,14 @@ no_delegation: rcu_read_unlock(); if (!ret && open_stateid != NULL) { - __update_open_stateid(state, open_stateid, NULL, fmode); + __update_open_stateid(state, open_stateid, NULL, fmode, &freeme); ret = 1; } if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) - nfs4_schedule_state_manager(state->owner->so_server->nfs_client); + nfs4_schedule_state_manager(clp); + if (freeme.type != 0) + nfs4_test_and_free_stateid(server, &freeme, + state->owner->so_cred); return ret; } @@ -1889,7 +1960,6 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_STALE_STATEID: set_bit(NFS_DELEGATED_STATE, &state->flags); - case -NFS4ERR_EXPIRED: /* Don't recall a delegation if it was lost */ nfs4_schedule_lease_recovery(server->nfs_client); return -EAGAIN; @@ -1901,6 +1971,7 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct return -EAGAIN; case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_EXPIRED: case -NFS4ERR_BAD_STATEID: case -NFS4ERR_OPENMODE: nfs_inode_find_state_and_recover(state->inode, @@ -2382,9 
+2453,10 @@ static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *sta return ret; } -static void nfs_finish_clear_delegation_stateid(struct nfs4_state *state) +static void nfs_finish_clear_delegation_stateid(struct nfs4_state *state, + const nfs4_stateid *stateid) { - nfs_remove_bad_delegation(state->inode); + nfs_remove_bad_delegation(state->inode, stateid); write_seqlock(&state->seqlock); nfs4_stateid_copy(&state->stateid, &state->open_stateid); write_sequnlock(&state->seqlock); @@ -2394,7 +2466,7 @@ static void nfs_finish_clear_delegation_stateid(struct nfs4_state *state) static void nfs40_clear_delegation_stateid(struct nfs4_state *state) { if (rcu_access_pointer(NFS_I(state->inode)->delegation) != NULL) - nfs_finish_clear_delegation_stateid(state); + nfs_finish_clear_delegation_stateid(state, NULL); } static int nfs40_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state) @@ -2404,7 +2476,45 @@ static int nfs40_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st return nfs4_open_expired(sp, state); } +static int nfs40_test_and_free_expired_stateid(struct nfs_server *server, + nfs4_stateid *stateid, + struct rpc_cred *cred) +{ + return -NFS4ERR_BAD_STATEID; +} + #if defined(CONFIG_NFS_V4_1) +static int nfs41_test_and_free_expired_stateid(struct nfs_server *server, + nfs4_stateid *stateid, + struct rpc_cred *cred) +{ + int status; + + switch (stateid->type) { + default: + break; + case NFS4_INVALID_STATEID_TYPE: + case NFS4_SPECIAL_STATEID_TYPE: + return -NFS4ERR_BAD_STATEID; + case NFS4_REVOKED_STATEID_TYPE: + goto out_free; + } + + status = nfs41_test_stateid(server, stateid, cred); + switch (status) { + case -NFS4ERR_EXPIRED: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_DELEG_REVOKED: + break; + default: + return status; + } +out_free: + /* Ack the revoked state to the server */ + nfs41_free_stateid(server, stateid, cred, true); + return -NFS4ERR_EXPIRED; +} + static void nfs41_check_delegation_stateid(struct 
nfs4_state *state) { struct nfs_server *server = NFS_SERVER(state->inode); @@ -2422,23 +2532,68 @@ static void nfs41_check_delegation_stateid(struct nfs4_state *state) } nfs4_stateid_copy(&stateid, &delegation->stateid); + if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) { + rcu_read_unlock(); + nfs_finish_clear_delegation_stateid(state, &stateid); + return; + } + + if (!test_and_clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags)) { + rcu_read_unlock(); + return; + } + cred = get_rpccred(delegation->cred); rcu_read_unlock(); - status = nfs41_test_stateid(server, &stateid, cred); + status = nfs41_test_and_free_expired_stateid(server, &stateid, cred); trace_nfs4_test_delegation_stateid(state, NULL, status); - - if (status != NFS_OK) { - /* Free the stateid unless the server explicitly - * informs us the stateid is unrecognized. */ - if (status != -NFS4ERR_BAD_STATEID) - nfs41_free_stateid(server, &stateid, cred); - nfs_finish_clear_delegation_stateid(state); - } + if (status == -NFS4ERR_EXPIRED || status == -NFS4ERR_BAD_STATEID) + nfs_finish_clear_delegation_stateid(state, &stateid); put_rpccred(cred); } /** + * nfs41_check_expired_locks - possibly free a lock stateid + * + * @state: NFSv4 state for an inode + * + * Returns NFS_OK if recovery for this stateid is now finished. + * Otherwise a negative NFS4ERR value is returned. 
+ */ +static int nfs41_check_expired_locks(struct nfs4_state *state) +{ + int status, ret = NFS_OK; + struct nfs4_lock_state *lsp; + struct nfs_server *server = NFS_SERVER(state->inode); + + if (!test_bit(LK_STATE_IN_USE, &state->flags)) + goto out; + list_for_each_entry(lsp, &state->lock_states, ls_locks) { + if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) { + struct rpc_cred *cred = lsp->ls_state->owner->so_cred; + + status = nfs41_test_and_free_expired_stateid(server, + &lsp->ls_stateid, + cred); + trace_nfs4_test_lock_stateid(state, lsp, status); + if (status == -NFS4ERR_EXPIRED || + status == -NFS4ERR_BAD_STATEID) { + clear_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags); + lsp->ls_stateid.type = NFS4_INVALID_STATEID_TYPE; + if (!recover_lost_locks) + set_bit(NFS_LOCK_LOST, &lsp->ls_flags); + } else if (status != NFS_OK) { + ret = status; + break; + } + } + }; +out: + return ret; +} + +/** * nfs41_check_open_stateid - possibly free an open stateid * * @state: NFSv4 state for an inode @@ -2453,26 +2608,28 @@ static int nfs41_check_open_stateid(struct nfs4_state *state) struct rpc_cred *cred = state->owner->so_cred; int status; - /* If a state reset has been done, test_stateid is unneeded */ - if ((test_bit(NFS_O_RDONLY_STATE, &state->flags) == 0) && - (test_bit(NFS_O_WRONLY_STATE, &state->flags) == 0) && - (test_bit(NFS_O_RDWR_STATE, &state->flags) == 0)) + if (test_bit(NFS_OPEN_STATE, &state->flags) == 0) { + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) { + if (nfs4_have_delegation(state->inode, state->state)) + return NFS_OK; + return -NFS4ERR_OPENMODE; + } return -NFS4ERR_BAD_STATEID; - - status = nfs41_test_stateid(server, stateid, cred); + } + status = nfs41_test_and_free_expired_stateid(server, stateid, cred); trace_nfs4_test_open_stateid(state, NULL, status); - if (status != NFS_OK) { - /* Free the stateid unless the server explicitly - * informs us the stateid is unrecognized. 
*/ - if (status != -NFS4ERR_BAD_STATEID) - nfs41_free_stateid(server, stateid, cred); - + if (status == -NFS4ERR_EXPIRED || status == -NFS4ERR_BAD_STATEID) { clear_bit(NFS_O_RDONLY_STATE, &state->flags); clear_bit(NFS_O_WRONLY_STATE, &state->flags); clear_bit(NFS_O_RDWR_STATE, &state->flags); clear_bit(NFS_OPEN_STATE, &state->flags); + stateid->type = NFS4_INVALID_STATEID_TYPE; } - return status; + if (status != NFS_OK) + return status; + if (nfs_open_stateid_recover_openmode(state)) + return -NFS4ERR_OPENMODE; + return NFS_OK; } static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state) @@ -2480,6 +2637,9 @@ static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st int status; nfs41_check_delegation_stateid(state); + status = nfs41_check_expired_locks(state); + if (status != NFS_OK) + return status; status = nfs41_check_open_stateid(state); if (status != NFS_OK) status = nfs4_open_expired(sp, state); @@ -2537,6 +2697,8 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, goto out; if (server->caps & NFS_CAP_POSIX_LOCK) set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); + if (opendata->o_res.rflags & NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK) + set_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags); dentry = opendata->dentry; if (d_really_is_negative(dentry)) { @@ -2899,9 +3061,12 @@ static void nfs4_close_done(struct rpc_task *task, void *data) break; case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_EXPIRED: + nfs4_free_revoked_stateid(server, + &calldata->arg.stateid, + task->tk_msg.rpc_cred); case -NFS4ERR_OLD_STATEID: case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_EXPIRED: if (!nfs4_stateid_match(&calldata->arg.stateid, &state->open_stateid)) { rpc_restart_call_prepare(task); @@ -4312,7 +4477,7 @@ static int nfs4_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, s if (error == 0) { /* block layout checks this! 
*/ server->pnfs_blksize = fsinfo->blksize; - set_pnfs_layoutdriver(server, fhandle, fsinfo->layouttype); + set_pnfs_layoutdriver(server, fhandle, fsinfo); } return error; @@ -4399,24 +4564,25 @@ static bool nfs4_error_stateid_expired(int err) return false; } -void __nfs4_read_done_cb(struct nfs_pgio_header *hdr) -{ - nfs_invalidate_atime(hdr->inode); -} - static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) { struct nfs_server *server = NFS_SERVER(hdr->inode); trace_nfs4_read(hdr, task->tk_status); - if (nfs4_async_handle_error(task, server, - hdr->args.context->state, - NULL) == -EAGAIN) { - rpc_restart_call_prepare(task); - return -EAGAIN; + if (task->tk_status < 0) { + struct nfs4_exception exception = { + .inode = hdr->inode, + .state = hdr->args.context->state, + .stateid = &hdr->args.stateid, + }; + task->tk_status = nfs4_async_handle_exception(task, + server, task->tk_status, &exception); + if (exception.retry) { + rpc_restart_call_prepare(task); + return -EAGAIN; + } } - __nfs4_read_done_cb(hdr); if (task->tk_status > 0) renew_lease(server, hdr->timestamp); return 0; @@ -4445,6 +4611,8 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr) return -EAGAIN; if (nfs4_read_stateid_changed(task, &hdr->args)) return -EAGAIN; + if (task->tk_status > 0) + nfs_invalidate_atime(hdr->inode); return hdr->pgio_done_cb ? 
hdr->pgio_done_cb(task, hdr) : nfs4_read_done_cb(task, hdr); } @@ -4482,11 +4650,19 @@ static int nfs4_write_done_cb(struct rpc_task *task, struct inode *inode = hdr->inode; trace_nfs4_write(hdr, task->tk_status); - if (nfs4_async_handle_error(task, NFS_SERVER(inode), - hdr->args.context->state, - NULL) == -EAGAIN) { - rpc_restart_call_prepare(task); - return -EAGAIN; + if (task->tk_status < 0) { + struct nfs4_exception exception = { + .inode = hdr->inode, + .state = hdr->args.context->state, + .stateid = &hdr->args.stateid, + }; + task->tk_status = nfs4_async_handle_exception(task, + NFS_SERVER(inode), task->tk_status, + &exception); + if (exception.retry) { + rpc_restart_call_prepare(task); + return -EAGAIN; + } } if (task->tk_status >= 0) { renew_lease(NFS_SERVER(inode), hdr->timestamp); @@ -5123,12 +5299,14 @@ static void nfs4_init_boot_verifier(const struct nfs_client *clp, if (test_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state)) { /* An impossible timestamp guarantees this value * will never match a generated boot time. 
*/ - verf[0] = 0; - verf[1] = cpu_to_be32(NSEC_PER_SEC + 1); + verf[0] = cpu_to_be32(U32_MAX); + verf[1] = cpu_to_be32(U32_MAX); } else { struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); - verf[0] = cpu_to_be32(nn->boot_time.tv_sec); - verf[1] = cpu_to_be32(nn->boot_time.tv_nsec); + u64 ns = ktime_to_ns(nn->boot_time); + + verf[0] = cpu_to_be32(ns >> 32); + verf[1] = cpu_to_be32(ns); } memcpy(bootverf->data, verf, sizeof(bootverf->data)); } @@ -5393,10 +5571,13 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) renew_lease(data->res.server, data->timestamp); case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_EXPIRED: + nfs4_free_revoked_stateid(data->res.server, + data->args.stateid, + task->tk_msg.rpc_cred); case -NFS4ERR_BAD_STATEID: case -NFS4ERR_OLD_STATEID: case -NFS4ERR_STALE_STATEID: - case -NFS4ERR_EXPIRED: task->tk_status = 0; if (data->roc) pnfs_roc_set_barrier(data->inode, data->roc_barrier); @@ -5528,22 +5709,6 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4 return err; } -#define NFS4_LOCK_MINTIMEOUT (1 * HZ) -#define NFS4_LOCK_MAXTIMEOUT (30 * HZ) - -/* - * sleep, with exponential backoff, and retry the LOCK operation. 
- */ -static unsigned long -nfs4_set_lock_task_retry(unsigned long timeout) -{ - freezable_schedule_timeout_killable_unsafe(timeout); - timeout <<= 1; - if (timeout > NFS4_LOCK_MAXTIMEOUT) - return NFS4_LOCK_MAXTIMEOUT; - return timeout; -} - static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *request) { struct inode *inode = state->inode; @@ -5600,11 +5765,6 @@ static int nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock * return err; } -static int do_vfs_lock(struct inode *inode, struct file_lock *fl) -{ - return locks_lock_inode_wait(inode, fl); -} - struct nfs4_unlockdata { struct nfs_locku_args arg; struct nfs_locku_res res; @@ -5657,14 +5817,18 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) switch (task->tk_status) { case 0: renew_lease(calldata->server, calldata->timestamp); - do_vfs_lock(calldata->lsp->ls_state->inode, &calldata->fl); + locks_lock_inode_wait(calldata->lsp->ls_state->inode, &calldata->fl); if (nfs4_update_lock_stateid(calldata->lsp, &calldata->res.stateid)) break; + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_EXPIRED: + nfs4_free_revoked_stateid(calldata->server, + &calldata->arg.stateid, + task->tk_msg.rpc_cred); case -NFS4ERR_BAD_STATEID: case -NFS4ERR_OLD_STATEID: case -NFS4ERR_STALE_STATEID: - case -NFS4ERR_EXPIRED: if (!nfs4_stateid_match(&calldata->arg.stateid, &calldata->lsp->ls_stateid)) rpc_restart_call_prepare(task); @@ -5765,7 +5929,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * mutex_lock(&sp->so_delegreturn_mutex); /* Exclude nfs4_reclaim_open_stateid() - note nesting! 
*/ down_read(&nfsi->rwsem); - if (do_vfs_lock(inode, request) == -ENOENT) { + if (locks_lock_inode_wait(inode, request) == -ENOENT) { up_read(&nfsi->rwsem); mutex_unlock(&sp->so_delegreturn_mutex); goto out; @@ -5906,7 +6070,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) data->timestamp); if (data->arg.new_lock) { data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS); - if (do_vfs_lock(lsp->ls_state->inode, &data->fl) < 0) { + if (locks_lock_inode_wait(lsp->ls_state->inode, &data->fl) < 0) { rpc_restart_call_prepare(task); break; } @@ -5965,6 +6129,7 @@ static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_ { switch (error) { case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_EXPIRED: case -NFS4ERR_BAD_STATEID: lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED; if (new_lock_owner != 0 || @@ -5973,7 +6138,6 @@ static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_ break; case -NFS4ERR_STALE_STATEID: lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED; - case -NFS4ERR_EXPIRED: nfs4_schedule_lease_recovery(server->nfs_client); }; } @@ -6083,52 +6247,19 @@ out: } #if defined(CONFIG_NFS_V4_1) -/** - * nfs41_check_expired_locks - possibly free a lock stateid - * - * @state: NFSv4 state for an inode - * - * Returns NFS_OK if recovery for this stateid is now finished. - * Otherwise a negative NFS4ERR value is returned. - */ -static int nfs41_check_expired_locks(struct nfs4_state *state) -{ - int status, ret = -NFS4ERR_BAD_STATEID; - struct nfs4_lock_state *lsp; - struct nfs_server *server = NFS_SERVER(state->inode); - - list_for_each_entry(lsp, &state->lock_states, ls_locks) { - if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) { - struct rpc_cred *cred = lsp->ls_state->owner->so_cred; - - status = nfs41_test_stateid(server, - &lsp->ls_stateid, - cred); - trace_nfs4_test_lock_stateid(state, lsp, status); - if (status != NFS_OK) { - /* Free the stateid unless the server - * informs us the stateid is unrecognized. 
*/ - if (status != -NFS4ERR_BAD_STATEID) - nfs41_free_stateid(server, - &lsp->ls_stateid, - cred); - clear_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags); - ret = status; - } - } - }; - - return ret; -} - static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *request) { - int status = NFS_OK; + struct nfs4_lock_state *lsp; + int status; - if (test_bit(LK_STATE_IN_USE, &state->flags)) - status = nfs41_check_expired_locks(state); - if (status != NFS_OK) - status = nfs4_lock_expired(state, request); + status = nfs4_set_lock_state(state, request); + if (status != 0) + return status; + lsp = request->fl_u.nfs4_fl.owner; + if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) || + test_bit(NFS_LOCK_LOST, &lsp->ls_flags)) + return 0; + status = nfs4_lock_expired(state, request); return status; } #endif @@ -6138,17 +6269,10 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock struct nfs_inode *nfsi = NFS_I(state->inode); struct nfs4_state_owner *sp = state->owner; unsigned char fl_flags = request->fl_flags; - int status = -ENOLCK; + int status; - if ((fl_flags & FL_POSIX) && - !test_bit(NFS_STATE_POSIX_LOCKS, &state->flags)) - goto out; - /* Is this a delegated open? */ - status = nfs4_set_lock_state(state, request); - if (status != 0) - goto out; request->fl_flags |= FL_ACCESS; - status = do_vfs_lock(state->inode, request); + status = locks_lock_inode_wait(state->inode, request); if (status < 0) goto out; mutex_lock(&sp->so_delegreturn_mutex); @@ -6157,7 +6281,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock /* Yes: cache locks! */ /* ...but avoid races with delegation recall... 
*/ request->fl_flags = fl_flags & ~FL_SLEEP; - status = do_vfs_lock(state->inode, request); + status = locks_lock_inode_wait(state->inode, request); up_read(&nfsi->rwsem); mutex_unlock(&sp->so_delegreturn_mutex); goto out; @@ -6188,12 +6312,124 @@ static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock * return err; } +#define NFS4_LOCK_MINTIMEOUT (1 * HZ) +#define NFS4_LOCK_MAXTIMEOUT (30 * HZ) + +static int +nfs4_retry_setlk_simple(struct nfs4_state *state, int cmd, + struct file_lock *request) +{ + int status = -ERESTARTSYS; + unsigned long timeout = NFS4_LOCK_MINTIMEOUT; + + while(!signalled()) { + status = nfs4_proc_setlk(state, cmd, request); + if ((status != -EAGAIN) || IS_SETLK(cmd)) + break; + freezable_schedule_timeout_interruptible(timeout); + timeout *= 2; + timeout = min_t(unsigned long, NFS4_LOCK_MAXTIMEOUT, timeout); + status = -ERESTARTSYS; + } + return status; +} + +#ifdef CONFIG_NFS_V4_1 +struct nfs4_lock_waiter { + struct task_struct *task; + struct inode *inode; + struct nfs_lowner *owner; + bool notified; +}; + +static int +nfs4_wake_lock_waiter(wait_queue_t *wait, unsigned int mode, int flags, void *key) +{ + int ret; + struct cb_notify_lock_args *cbnl = key; + struct nfs4_lock_waiter *waiter = wait->private; + struct nfs_lowner *lowner = &cbnl->cbnl_owner, + *wowner = waiter->owner; + + /* Only wake if the callback was for the same owner */ + if (lowner->clientid != wowner->clientid || + lowner->id != wowner->id || + lowner->s_dev != wowner->s_dev) + return 0; + + /* Make sure it's for the right inode */ + if (nfs_compare_fh(NFS_FH(waiter->inode), &cbnl->cbnl_fh)) + return 0; + + waiter->notified = true; + + /* override "private" so we can use default_wake_function */ + wait->private = waiter->task; + ret = autoremove_wake_function(wait, mode, flags, key); + wait->private = waiter; + return ret; +} + +static int +nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) +{ + int status = 
-ERESTARTSYS; + unsigned long flags; + struct nfs4_lock_state *lsp = request->fl_u.nfs4_fl.owner; + struct nfs_server *server = NFS_SERVER(state->inode); + struct nfs_client *clp = server->nfs_client; + wait_queue_head_t *q = &clp->cl_lock_waitq; + struct nfs_lowner owner = { .clientid = clp->cl_clientid, + .id = lsp->ls_seqid.owner_id, + .s_dev = server->s_dev }; + struct nfs4_lock_waiter waiter = { .task = current, + .inode = state->inode, + .owner = &owner, + .notified = false }; + wait_queue_t wait; + + /* Don't bother with waitqueue if we don't expect a callback */ + if (!test_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags)) + return nfs4_retry_setlk_simple(state, cmd, request); + + init_wait(&wait); + wait.private = &waiter; + wait.func = nfs4_wake_lock_waiter; + add_wait_queue(q, &wait); + + while(!signalled()) { + status = nfs4_proc_setlk(state, cmd, request); + if ((status != -EAGAIN) || IS_SETLK(cmd)) + break; + + status = -ERESTARTSYS; + spin_lock_irqsave(&q->lock, flags); + if (waiter.notified) { + spin_unlock_irqrestore(&q->lock, flags); + continue; + } + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irqrestore(&q->lock, flags); + + freezable_schedule_timeout_interruptible(NFS4_LOCK_MAXTIMEOUT); + } + + finish_wait(q, &wait); + return status; +} +#else /* !CONFIG_NFS_V4_1 */ +static inline int +nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) +{ + return nfs4_retry_setlk_simple(state, cmd, request); +} +#endif + static int nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) { struct nfs_open_context *ctx; struct nfs4_state *state; - unsigned long timeout = NFS4_LOCK_MINTIMEOUT; int status; /* verify open state */ @@ -6220,6 +6456,11 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) if (state == NULL) return -ENOLCK; + + if ((request->fl_flags & FL_POSIX) && + !test_bit(NFS_STATE_POSIX_LOCKS, &state->flags)) + return -ENOLCK; + /* * Don't rely on the VFS having checked the file 
open mode, * since it won't do this for flock() locks. @@ -6234,16 +6475,11 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) return -EBADF; } - do { - status = nfs4_proc_setlk(state, cmd, request); - if ((status != -EAGAIN) || IS_SETLK(cmd)) - break; - timeout = nfs4_set_lock_task_retry(timeout); - status = -ERESTARTSYS; - if (signalled()) - break; - } while(status < 0); - return status; + status = nfs4_set_lock_state(state, request); + if (status != 0) + return status; + + return nfs4_retry_setlk(state, cmd, request); } int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid) @@ -7104,75 +7340,161 @@ static int nfs4_sp4_select_mode(struct nfs_client *clp, return 0; } +struct nfs41_exchange_id_data { + struct nfs41_exchange_id_res res; + struct nfs41_exchange_id_args args; + struct rpc_xprt *xprt; + int rpc_status; +}; + +static void nfs4_exchange_id_done(struct rpc_task *task, void *data) +{ + struct nfs41_exchange_id_data *cdata = + (struct nfs41_exchange_id_data *)data; + struct nfs_client *clp = cdata->args.client; + int status = task->tk_status; + + trace_nfs4_exchange_id(clp, status); + + if (status == 0) + status = nfs4_check_cl_exchange_flags(cdata->res.flags); + + if (cdata->xprt && status == 0) { + status = nfs4_detect_session_trunking(clp, &cdata->res, + cdata->xprt); + goto out; + } + + if (status == 0) + status = nfs4_sp4_select_mode(clp, &cdata->res.state_protect); + + if (status == 0) { + clp->cl_clientid = cdata->res.clientid; + clp->cl_exchange_flags = cdata->res.flags; + /* Client ID is not confirmed */ + if (!(cdata->res.flags & EXCHGID4_FLAG_CONFIRMED_R)) { + clear_bit(NFS4_SESSION_ESTABLISHED, + &clp->cl_session->session_state); + clp->cl_seqid = cdata->res.seqid; + } + + kfree(clp->cl_serverowner); + clp->cl_serverowner = cdata->res.server_owner; + cdata->res.server_owner = NULL; + + /* use the most recent implementation id */ + kfree(clp->cl_implid); + clp->cl_implid 
= cdata->res.impl_id; + cdata->res.impl_id = NULL; + + if (clp->cl_serverscope != NULL && + !nfs41_same_server_scope(clp->cl_serverscope, + cdata->res.server_scope)) { + dprintk("%s: server_scope mismatch detected\n", + __func__); + set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state); + kfree(clp->cl_serverscope); + clp->cl_serverscope = NULL; + } + + if (clp->cl_serverscope == NULL) { + clp->cl_serverscope = cdata->res.server_scope; + cdata->res.server_scope = NULL; + } + /* Save the EXCHANGE_ID verifier session trunk tests */ + memcpy(clp->cl_confirm.data, cdata->args.verifier->data, + sizeof(clp->cl_confirm.data)); + } +out: + cdata->rpc_status = status; + return; +} + +static void nfs4_exchange_id_release(void *data) +{ + struct nfs41_exchange_id_data *cdata = + (struct nfs41_exchange_id_data *)data; + + nfs_put_client(cdata->args.client); + if (cdata->xprt) { + xprt_put(cdata->xprt); + rpc_clnt_xprt_switch_put(cdata->args.client->cl_rpcclient); + } + kfree(cdata->res.impl_id); + kfree(cdata->res.server_scope); + kfree(cdata->res.server_owner); + kfree(cdata); +} + +static const struct rpc_call_ops nfs4_exchange_id_call_ops = { + .rpc_call_done = nfs4_exchange_id_done, + .rpc_release = nfs4_exchange_id_release, +}; + /* * _nfs4_proc_exchange_id() * * Wrapper for EXCHANGE_ID operation. 
*/ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, - u32 sp4_how) + u32 sp4_how, struct rpc_xprt *xprt) { nfs4_verifier verifier; - struct nfs41_exchange_id_args args = { - .verifier = &verifier, - .client = clp, -#ifdef CONFIG_NFS_V4_1_MIGRATION - .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER | - EXCHGID4_FLAG_BIND_PRINC_STATEID | - EXCHGID4_FLAG_SUPP_MOVED_MIGR, -#else - .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER | - EXCHGID4_FLAG_BIND_PRINC_STATEID, -#endif - }; - struct nfs41_exchange_id_res res = { - 0 - }; - int status; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_EXCHANGE_ID], - .rpc_argp = &args, - .rpc_resp = &res, .rpc_cred = cred, }; + struct rpc_task_setup task_setup_data = { + .rpc_client = clp->cl_rpcclient, + .callback_ops = &nfs4_exchange_id_call_ops, + .rpc_message = &msg, + .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT, + }; + struct nfs41_exchange_id_data *calldata; + struct rpc_task *task; + int status = -EIO; + + if (!atomic_inc_not_zero(&clp->cl_count)) + goto out; + + status = -ENOMEM; + calldata = kzalloc(sizeof(*calldata), GFP_NOFS); + if (!calldata) + goto out; - nfs4_init_boot_verifier(clp, &verifier); + if (!xprt) + nfs4_init_boot_verifier(clp, &verifier); status = nfs4_init_uniform_client_string(clp); if (status) - goto out; + goto out_calldata; dprintk("NFS call exchange_id auth=%s, '%s'\n", clp->cl_rpcclient->cl_auth->au_ops->au_name, clp->cl_owner_id); - res.server_owner = kzalloc(sizeof(struct nfs41_server_owner), - GFP_NOFS); - if (unlikely(res.server_owner == NULL)) { - status = -ENOMEM; - goto out; - } + calldata->res.server_owner = kzalloc(sizeof(struct nfs41_server_owner), + GFP_NOFS); + status = -ENOMEM; + if (unlikely(calldata->res.server_owner == NULL)) + goto out_calldata; - res.server_scope = kzalloc(sizeof(struct nfs41_server_scope), + calldata->res.server_scope = kzalloc(sizeof(struct nfs41_server_scope), GFP_NOFS); - if (unlikely(res.server_scope == NULL)) { - status = 
-ENOMEM; + if (unlikely(calldata->res.server_scope == NULL)) goto out_server_owner; - } - res.impl_id = kzalloc(sizeof(struct nfs41_impl_id), GFP_NOFS); - if (unlikely(res.impl_id == NULL)) { - status = -ENOMEM; + calldata->res.impl_id = kzalloc(sizeof(struct nfs41_impl_id), GFP_NOFS); + if (unlikely(calldata->res.impl_id == NULL)) goto out_server_scope; - } switch (sp4_how) { case SP4_NONE: - args.state_protect.how = SP4_NONE; + calldata->args.state_protect.how = SP4_NONE; break; case SP4_MACH_CRED: - args.state_protect = nfs4_sp4_mach_cred_request; + calldata->args.state_protect = nfs4_sp4_mach_cred_request; break; default: @@ -7181,56 +7503,42 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, status = -EINVAL; goto out_impl_id; } + if (xprt) { + calldata->xprt = xprt; + task_setup_data.rpc_xprt = xprt; + task_setup_data.flags = + RPC_TASK_SOFT|RPC_TASK_SOFTCONN|RPC_TASK_ASYNC; + calldata->args.verifier = &clp->cl_confirm; + } else { + calldata->args.verifier = &verifier; + } + calldata->args.client = clp; +#ifdef CONFIG_NFS_V4_1_MIGRATION + calldata->args.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER | + EXCHGID4_FLAG_BIND_PRINC_STATEID | + EXCHGID4_FLAG_SUPP_MOVED_MIGR, +#else + calldata->args.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER | + EXCHGID4_FLAG_BIND_PRINC_STATEID, +#endif + msg.rpc_argp = &calldata->args; + msg.rpc_resp = &calldata->res; + task_setup_data.callback_data = calldata; - status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); - trace_nfs4_exchange_id(clp, status); - if (status == 0) - status = nfs4_check_cl_exchange_flags(res.flags); - - if (status == 0) - status = nfs4_sp4_select_mode(clp, &res.state_protect); - - if (status == 0) { - clp->cl_clientid = res.clientid; - clp->cl_exchange_flags = res.flags; - /* Client ID is not confirmed */ - if (!(res.flags & EXCHGID4_FLAG_CONFIRMED_R)) { - clear_bit(NFS4_SESSION_ESTABLISHED, - &clp->cl_session->session_state); - clp->cl_seqid = res.seqid; - } - - 
kfree(clp->cl_serverowner); - clp->cl_serverowner = res.server_owner; - res.server_owner = NULL; - - /* use the most recent implementation id */ - kfree(clp->cl_implid); - clp->cl_implid = res.impl_id; - res.impl_id = NULL; - - if (clp->cl_serverscope != NULL && - !nfs41_same_server_scope(clp->cl_serverscope, - res.server_scope)) { - dprintk("%s: server_scope mismatch detected\n", - __func__); - set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state); - kfree(clp->cl_serverscope); - clp->cl_serverscope = NULL; - } - - if (clp->cl_serverscope == NULL) { - clp->cl_serverscope = res.server_scope; - res.server_scope = NULL; - } + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) { + status = PTR_ERR(task); + goto out_impl_id; } -out_impl_id: - kfree(res.impl_id); -out_server_scope: - kfree(res.server_scope); -out_server_owner: - kfree(res.server_owner); + if (!xprt) { + status = rpc_wait_for_completion_task(task); + if (!status) + status = calldata->rpc_status; + } else /* session trunking test */ + status = calldata->rpc_status; + + rpc_put_task(task); out: if (clp->cl_implid != NULL) dprintk("NFS reply exchange_id: Server Implementation ID: " @@ -7240,6 +7548,16 @@ out: clp->cl_implid->date.nseconds); dprintk("NFS reply exchange_id: %d\n", status); return status; + +out_impl_id: + kfree(calldata->res.impl_id); +out_server_scope: + kfree(calldata->res.server_scope); +out_server_owner: + kfree(calldata->res.server_owner); +out_calldata: + kfree(calldata); + goto out; } /* @@ -7262,14 +7580,45 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) /* try SP4_MACH_CRED if krb5i/p */ if (authflavor == RPC_AUTH_GSS_KRB5I || authflavor == RPC_AUTH_GSS_KRB5P) { - status = _nfs4_proc_exchange_id(clp, cred, SP4_MACH_CRED); + status = _nfs4_proc_exchange_id(clp, cred, SP4_MACH_CRED, NULL); if (!status) return 0; } /* try SP4_NONE */ - return _nfs4_proc_exchange_id(clp, cred, SP4_NONE); + return _nfs4_proc_exchange_id(clp, cred, SP4_NONE, NULL); +} 
+ +/** + * nfs4_test_session_trunk + * + * This is an add_xprt_test() test function called from + * rpc_clnt_setup_test_and_add_xprt. + * + * The rpc_xprt_switch is referrenced by rpc_clnt_setup_test_and_add_xprt + * and is dereferrenced in nfs4_exchange_id_release + * + * Upon success, add the new transport to the rpc_clnt + * + * @clnt: struct rpc_clnt to get new transport + * @xprt: the rpc_xprt to test + * @data: call data for _nfs4_proc_exchange_id. + */ +int nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt, + void *data) +{ + struct nfs4_add_xprt_data *adata = (struct nfs4_add_xprt_data *)data; + u32 sp4_how; + + dprintk("--> %s try %s\n", __func__, + xprt->address_strings[RPC_DISPLAY_ADDR]); + + sp4_how = (adata->clp->cl_sp4_flags == 0 ? SP4_NONE : SP4_MACH_CRED); + + /* Test connection for session trunking. Async exchange_id call */ + return _nfs4_proc_exchange_id(adata->clp, adata->cred, sp4_how, xprt); } +EXPORT_SYMBOL_GPL(nfs4_test_session_trunk); static int _nfs4_proc_destroy_clientid(struct nfs_client *clp, struct rpc_cred *cred) @@ -7463,7 +7812,7 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args, args->bc_attrs.max_resp_sz = max_bc_payload; args->bc_attrs.max_resp_sz_cached = 0; args->bc_attrs.max_ops = NFS4_MAX_BACK_CHANNEL_OPS; - args->bc_attrs.max_reqs = NFS41_BC_MAX_CALLBACKS; + args->bc_attrs.max_reqs = min_t(unsigned short, max_session_cb_slots, 1); dprintk("%s: Back Channel : max_rqst_sz=%u max_resp_sz=%u " "max_resp_sz_cached=%u max_ops=%u max_reqs=%u\n", @@ -7510,10 +7859,9 @@ static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args return -EINVAL; if (rcvd->max_resp_sz_cached > sent->max_resp_sz_cached) return -EINVAL; - /* These would render the backchannel useless: */ - if (rcvd->max_ops != sent->max_ops) + if (rcvd->max_ops > sent->max_ops) return -EINVAL; - if (rcvd->max_reqs != sent->max_reqs) + if (rcvd->max_reqs > sent->max_reqs) return -EINVAL; out: return 0; @@ 
-7982,6 +8330,8 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, case -NFS4ERR_RECALLCONFLICT: status = -ERECALLCONFLICT; break; + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_EXPIRED: case -NFS4ERR_BAD_STATEID: exception->timeout = 0; @@ -7993,6 +8343,7 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, &lgp->args.ctx->state->stateid)) { spin_unlock(&inode->i_lock); exception->state = lgp->args.ctx->state; + exception->stateid = &lgp->args.stateid; break; } @@ -8591,6 +8942,24 @@ static int _nfs41_test_stateid(struct nfs_server *server, return -res.status; } +static void nfs4_handle_delay_or_session_error(struct nfs_server *server, + int err, struct nfs4_exception *exception) +{ + exception->retry = 0; + switch(err) { + case -NFS4ERR_DELAY: + case -NFS4ERR_RETRY_UNCACHED_REP: + nfs4_handle_exception(server, err, exception); + break; + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_DEADSESSION: + nfs4_do_handle_exception(server, err, exception); + } +} + /** * nfs41_test_stateid - perform a TEST_STATEID operation * @@ -8610,9 +8979,7 @@ static int nfs41_test_stateid(struct nfs_server *server, int err; do { err = _nfs41_test_stateid(server, stateid, cred); - if (err != -NFS4ERR_DELAY) - break; - nfs4_handle_exception(server, err, &exception); + nfs4_handle_delay_or_session_error(server, err, &exception); } while (exception.retry); return err; } @@ -8657,7 +9024,7 @@ static const struct rpc_call_ops nfs41_free_stateid_ops = { }; static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server, - nfs4_stateid *stateid, + const nfs4_stateid *stateid, struct rpc_cred *cred, bool privileged) { @@ -8687,7 +9054,7 @@ static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server, msg.rpc_argp = &data->args; msg.rpc_resp = &data->res; - nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); + 
nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); if (privileged) nfs4_set_sequence_privileged(&data->args.seq_args); @@ -8700,38 +9067,31 @@ static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server, * @server: server / transport on which to perform the operation * @stateid: state ID to release * @cred: credential + * @is_recovery: set to true if this call needs to be privileged * - * Returns NFS_OK if the server freed "stateid". Otherwise a - * negative NFS4ERR value is returned. + * Note: this function is always asynchronous. */ static int nfs41_free_stateid(struct nfs_server *server, - nfs4_stateid *stateid, - struct rpc_cred *cred) + const nfs4_stateid *stateid, + struct rpc_cred *cred, + bool is_recovery) { struct rpc_task *task; - int ret; - task = _nfs41_free_stateid(server, stateid, cred, true); + task = _nfs41_free_stateid(server, stateid, cred, is_recovery); if (IS_ERR(task)) return PTR_ERR(task); - ret = rpc_wait_for_completion_task(task); - if (!ret) - ret = task->tk_status; rpc_put_task(task); - return ret; + return 0; } static void nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp) { - struct rpc_task *task; struct rpc_cred *cred = lsp->ls_state->owner->so_cred; - task = _nfs41_free_stateid(server, &lsp->ls_stateid, cred, false); + nfs41_free_stateid(server, &lsp->ls_stateid, cred, false); nfs4_free_lock_state(server, lsp); - if (IS_ERR(task)) - return; - rpc_put_task(task); } static bool nfs41_match_stateid(const nfs4_stateid *s1, @@ -8835,6 +9195,7 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { .match_stateid = nfs4_match_stateid, .find_root_sec = nfs4_find_root_sec, .free_lock_state = nfs4_release_lockowner, + .test_and_free_expired = nfs40_test_and_free_expired_stateid, .alloc_seqid = nfs_alloc_seqid, .call_sync_ops = &nfs40_call_sync_ops, .reboot_recovery_ops = &nfs40_reboot_recovery_ops, @@ -8862,7 +9223,9 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops 
= { .match_stateid = nfs41_match_stateid, .find_root_sec = nfs41_find_root_sec, .free_lock_state = nfs41_free_lock_state, + .test_and_free_expired = nfs41_test_and_free_expired_stateid, .alloc_seqid = nfs_alloc_no_seqid, + .session_trunk = nfs4_test_session_trunk, .call_sync_ops = &nfs41_call_sync_ops, .reboot_recovery_ops = &nfs41_reboot_recovery_ops, .nograce_recovery_ops = &nfs41_nograce_recovery_ops, @@ -8891,7 +9254,9 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { .find_root_sec = nfs41_find_root_sec, .free_lock_state = nfs41_free_lock_state, .call_sync_ops = &nfs41_call_sync_ops, + .test_and_free_expired = nfs41_test_and_free_expired_stateid, .alloc_seqid = nfs_alloc_no_seqid, + .session_trunk = nfs4_test_session_trunk, .reboot_recovery_ops = &nfs41_reboot_recovery_ops, .nograce_recovery_ops = &nfs41_nograce_recovery_ops, .state_renewal_ops = &nfs41_state_renewal_ops, diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h index f703b755351b..dae385500005 100644 --- a/fs/nfs/nfs4session.h +++ b/fs/nfs/nfs4session.h @@ -9,6 +9,7 @@ /* maximum number of slots to use */ #define NFS4_DEF_SLOT_TABLE_SIZE (64U) +#define NFS4_DEF_CB_SLOT_TABLE_SIZE (1U) #define NFS4_MAX_SLOT_TABLE (1024U) #define NFS4_NO_SLOT ((u32)-1) @@ -22,6 +23,7 @@ struct nfs4_slot { u32 slot_nr; u32 seq_nr; unsigned int interrupted : 1, + privileged : 1, seq_done : 1; }; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index cada00aa5096..5f4281ec5f72 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -991,6 +991,8 @@ int nfs4_select_rw_stateid(struct nfs4_state *state, { int ret; + if (!nfs4_valid_open_stateid(state)) + return -EIO; if (cred != NULL) *cred = NULL; ret = nfs4_copy_lock_stateid(dst, state, lockowner); @@ -1303,6 +1305,8 @@ void nfs4_schedule_path_down_recovery(struct nfs_client *clp) static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state) { + if (!nfs4_valid_open_stateid(state)) + return 0; 
set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); /* Don't recover state that expired before the reboot */ if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) { @@ -1316,6 +1320,8 @@ static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_st int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state) { + if (!nfs4_valid_open_stateid(state)) + return 0; set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags); clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); set_bit(NFS_OWNER_RECLAIM_NOGRACE, &state->owner->so_flags); @@ -1327,9 +1333,8 @@ int nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_ { struct nfs_client *clp = server->nfs_client; - if (!nfs4_valid_open_stateid(state)) + if (!nfs4_state_mark_reclaim_nograce(clp, state)) return -EBADF; - nfs4_state_mark_reclaim_nograce(clp, state); dprintk("%s: scheduling stateid recovery for server %s\n", __func__, clp->cl_hostname); nfs4_schedule_state_manager(clp); @@ -1337,6 +1342,35 @@ int nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_ } EXPORT_SYMBOL_GPL(nfs4_schedule_stateid_recovery); +static struct nfs4_lock_state * +nfs_state_find_lock_state_by_stateid(struct nfs4_state *state, + const nfs4_stateid *stateid) +{ + struct nfs4_lock_state *pos; + + list_for_each_entry(pos, &state->lock_states, ls_locks) { + if (!test_bit(NFS_LOCK_INITIALIZED, &pos->ls_flags)) + continue; + if (nfs4_stateid_match_other(&pos->ls_stateid, stateid)) + return pos; + } + return NULL; +} + +static bool nfs_state_lock_state_matches_stateid(struct nfs4_state *state, + const nfs4_stateid *stateid) +{ + bool found = false; + + if (test_bit(LK_STATE_IN_USE, &state->flags)) { + spin_lock(&state->state_lock); + if (nfs_state_find_lock_state_by_stateid(state, stateid)) + found = true; + spin_unlock(&state->state_lock); + } + return found; +} + void nfs_inode_find_state_and_recover(struct inode *inode, const nfs4_stateid *stateid) { @@ -1351,14 
+1385,18 @@ void nfs_inode_find_state_and_recover(struct inode *inode, state = ctx->state; if (state == NULL) continue; - if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) + if (nfs4_stateid_match_other(&state->stateid, stateid) && + nfs4_state_mark_reclaim_nograce(clp, state)) { + found = true; continue; - if (!nfs4_stateid_match(&state->stateid, stateid)) - continue; - nfs4_state_mark_reclaim_nograce(clp, state); - found = true; + } + if (nfs_state_lock_state_matches_stateid(state, stateid) && + nfs4_state_mark_reclaim_nograce(clp, state)) + found = true; } spin_unlock(&inode->i_lock); + + nfs_inode_find_delegation_state_and_recover(inode, stateid); if (found) nfs4_schedule_state_manager(clp); } @@ -1498,6 +1536,9 @@ restart: __func__, status); case -ENOENT: case -ENOMEM: + case -EACCES: + case -EROFS: + case -EIO: case -ESTALE: /* Open state on this file cannot be recovered */ nfs4_state_mark_recovery_failed(state, status); @@ -1656,15 +1697,9 @@ static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp) put_rpccred(cred); } -static void nfs_delegation_clear_all(struct nfs_client *clp) -{ - nfs_delegation_mark_reclaim(clp); - nfs_delegation_reap_unclaimed(clp); -} - static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp) { - nfs_delegation_clear_all(clp); + nfs_mark_test_expired_all_delegations(clp); nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce); } @@ -2195,7 +2230,7 @@ static void nfs41_handle_all_state_revoked(struct nfs_client *clp) static void nfs41_handle_some_state_revoked(struct nfs_client *clp) { - nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce); + nfs4_state_start_reclaim_nograce(clp); nfs4_schedule_state_manager(clp); dprintk("%s: state revoked on server %s\n", __func__, clp->cl_hostname); @@ -2227,13 +2262,22 @@ static void nfs41_handle_cb_path_down(struct nfs_client *clp) nfs4_schedule_state_manager(clp); } -void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags) +void 
nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags, + bool recovery) { if (!flags) return; dprintk("%s: \"%s\" (client ID %llx) flags=0x%08x\n", __func__, clp->cl_hostname, clp->cl_clientid, flags); + /* + * If we're called from the state manager thread, then assume we're + * already handling the RECLAIM_NEEDED and/or STATE_REVOKED. + * Those flags are expected to remain set until we're done + * recovering (see RFC5661, section 18.46.3). + */ + if (recovery) + goto out_recovery; if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED) nfs41_handle_server_reboot(clp); @@ -2246,6 +2290,7 @@ void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags) nfs4_schedule_lease_moved_recovery(clp); if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED) nfs41_handle_recallable_state_revoked(clp); +out_recovery: if (flags & SEQ4_STATUS_BACKCHANNEL_FAULT) nfs41_handle_backchannel_fault(clp); else if (flags & (SEQ4_STATUS_CB_PATH_DOWN | @@ -2410,6 +2455,13 @@ static void nfs4_state_manager(struct nfs_client *clp) nfs4_state_end_reclaim_reboot(clp); } + /* Detect expired delegations... */ + if (test_and_clear_bit(NFS4CLNT_DELEGATION_EXPIRED, &clp->cl_state)) { + section = "detect expired delegations"; + nfs_reap_expired_delegations(clp); + continue; + } + /* Now recover expired state... 
*/ if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) { section = "reclaim nograce"; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 7bd3a5c09d31..fc89e5ed07ee 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1850,7 +1850,7 @@ static void encode_create_session(struct xdr_stream *xdr, *p++ = cpu_to_be32(RPC_AUTH_UNIX); /* auth_sys */ /* authsys_parms rfc1831 */ - *p++ = cpu_to_be32(nn->boot_time.tv_nsec); /* stamp */ + *p++ = cpu_to_be32(ktime_to_ns(nn->boot_time)); /* stamp */ p = xdr_encode_array(p, clnt->cl_nodename, clnt->cl_nodelen); *p++ = cpu_to_be32(0); /* UID */ *p++ = cpu_to_be32(0); /* GID */ @@ -4725,34 +4725,37 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, } /* - * Decode potentially multiple layout types. Currently we only support - * one layout driver per file system. + * Decode potentially multiple layout types. */ -static int decode_first_pnfs_layout_type(struct xdr_stream *xdr, - uint32_t *layouttype) +static int decode_pnfs_layout_types(struct xdr_stream *xdr, + struct nfs_fsinfo *fsinfo) { __be32 *p; - int num; + uint32_t i; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; - num = be32_to_cpup(p); + fsinfo->nlayouttypes = be32_to_cpup(p); /* pNFS is not supported by the underlying file system */ - if (num == 0) { - *layouttype = 0; + if (fsinfo->nlayouttypes == 0) return 0; - } - if (num > 1) - printk(KERN_INFO "NFS: %s: Warning: Multiple pNFS layout " - "drivers per filesystem not supported\n", __func__); /* Decode and set first layout type, move xdr->p past unused types */ - p = xdr_inline_decode(xdr, num * 4); + p = xdr_inline_decode(xdr, fsinfo->nlayouttypes * 4); if (unlikely(!p)) goto out_overflow; - *layouttype = be32_to_cpup(p); + + /* If we get too many, then just cap it at the max */ + if (fsinfo->nlayouttypes > NFS_MAX_LAYOUT_TYPES) { + printk(KERN_INFO "NFS: %s: Warning: Too many (%u) pNFS layout types\n", + __func__, fsinfo->nlayouttypes); + 
fsinfo->nlayouttypes = NFS_MAX_LAYOUT_TYPES; + } + + for(i = 0; i < fsinfo->nlayouttypes; ++i) + fsinfo->layouttype[i] = be32_to_cpup(p++); return 0; out_overflow: print_overflow_msg(__func__, xdr); @@ -4764,7 +4767,7 @@ out_overflow: * Note we must ensure that layouttype is set in any non-error case. */ static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap, - uint32_t *layouttype) + struct nfs_fsinfo *fsinfo) { int status = 0; @@ -4772,10 +4775,9 @@ static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap, if (unlikely(bitmap[1] & (FATTR4_WORD1_FS_LAYOUT_TYPES - 1U))) return -EIO; if (bitmap[1] & FATTR4_WORD1_FS_LAYOUT_TYPES) { - status = decode_first_pnfs_layout_type(xdr, layouttype); + status = decode_pnfs_layout_types(xdr, fsinfo); bitmap[1] &= ~FATTR4_WORD1_FS_LAYOUT_TYPES; - } else - *layouttype = 0; + } return status; } @@ -4856,7 +4858,7 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) status = decode_attr_time_delta(xdr, bitmap, &fsinfo->time_delta); if (status != 0) goto xdr_error; - status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype); + status = decode_attr_pnfstype(xdr, bitmap, fsinfo); if (status != 0) goto xdr_error; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 2c93a85eda51..56b2d96f9103 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -30,6 +30,7 @@ #include <linux/nfs_fs.h> #include <linux/nfs_page.h> #include <linux/module.h> +#include <linux/sort.h> #include "internal.h" #include "pnfs.h" #include "iostat.h" @@ -99,35 +100,79 @@ unset_pnfs_layoutdriver(struct nfs_server *nfss) } /* + * When the server sends a list of layout types, we choose one in the order + * given in the list below. + * + * FIXME: should this list be configurable in some fashion? module param? + * mount option? something else? 
+ */ +static const u32 ld_prefs[] = { + LAYOUT_SCSI, + LAYOUT_BLOCK_VOLUME, + LAYOUT_OSD2_OBJECTS, + LAYOUT_FLEX_FILES, + LAYOUT_NFSV4_1_FILES, + 0 +}; + +static int +ld_cmp(const void *e1, const void *e2) +{ + u32 ld1 = *((u32 *)e1); + u32 ld2 = *((u32 *)e2); + int i; + + for (i = 0; ld_prefs[i] != 0; i++) { + if (ld1 == ld_prefs[i]) + return -1; + + if (ld2 == ld_prefs[i]) + return 1; + } + return 0; +} + +/* * Try to set the server's pnfs module to the pnfs layout type specified by id. * Currently only one pNFS layout driver per filesystem is supported. * - * @id layout type. Zero (illegal layout type) indicates pNFS not in use. + * @ids array of layout types supported by MDS. */ void set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh, - u32 id) + struct nfs_fsinfo *fsinfo) { struct pnfs_layoutdriver_type *ld_type = NULL; + u32 id; + int i; - if (id == 0) - goto out_no_driver; if (!(server->nfs_client->cl_exchange_flags & (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) { - printk(KERN_ERR "NFS: %s: id %u cl_exchange_flags 0x%x\n", - __func__, id, server->nfs_client->cl_exchange_flags); + printk(KERN_ERR "NFS: %s: cl_exchange_flags 0x%x\n", + __func__, server->nfs_client->cl_exchange_flags); goto out_no_driver; } - ld_type = find_pnfs_driver(id); - if (!ld_type) { - request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id); + + sort(fsinfo->layouttype, fsinfo->nlayouttypes, + sizeof(*fsinfo->layouttype), ld_cmp, NULL); + + for (i = 0; i < fsinfo->nlayouttypes; i++) { + id = fsinfo->layouttype[i]; ld_type = find_pnfs_driver(id); if (!ld_type) { - dprintk("%s: No pNFS module found for %u.\n", - __func__, id); - goto out_no_driver; + request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, + id); + ld_type = find_pnfs_driver(id); } + if (ld_type) + break; + } + + if (!ld_type) { + dprintk("%s: No pNFS module found!\n", __func__); + goto out_no_driver; } + server->pnfs_curr_ld = ld_type; if (ld_type->set_layoutdriver && 
ld_type->set_layoutdriver(server, mntfh)) { @@ -2185,10 +2230,8 @@ static void pnfs_ld_handle_read_error(struct nfs_pgio_header *hdr) */ void pnfs_ld_read_done(struct nfs_pgio_header *hdr) { - if (likely(!hdr->pnfs_error)) { - __nfs4_read_done_cb(hdr); + if (likely(!hdr->pnfs_error)) hdr->mds_ops->rpc_call_done(&hdr->task, hdr); - } trace_nfs4_pnfs_read(hdr, hdr->pnfs_error); if (unlikely(hdr->pnfs_error)) pnfs_ld_handle_read_error(hdr); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 31d99b2927b0..5c295512c967 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -236,7 +236,7 @@ void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo); void pnfs_put_lseg(struct pnfs_layout_segment *lseg); void pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg); -void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32); +void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, struct nfs_fsinfo *); void unset_pnfs_layoutdriver(struct nfs_server *); void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *); int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); @@ -657,7 +657,8 @@ pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task) } static inline void set_pnfs_layoutdriver(struct nfs_server *s, - const struct nfs_fh *mntfh, u32 id) + const struct nfs_fh *mntfh, + struct nfs_fsinfo *fsinfo) { } diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index f3468b57a32a..53b4705abcc7 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -690,13 +690,50 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, dprintk("%s: DS %s: trying address %s\n", __func__, ds->ds_remotestr, da->da_remotestr); - clp = nfs4_set_ds_client(mds_srv, - (struct sockaddr *)&da->da_addr, - da->da_addrlen, IPPROTO_TCP, - timeo, retrans, minor_version, - au_flavor); - if (!IS_ERR(clp)) - break; + if (!IS_ERR(clp) && clp->cl_mvops->session_trunk) { + struct xprt_create xprt_args = { + .ident = XPRT_TRANSPORT_TCP, + .net 
= clp->cl_net, + .dstaddr = (struct sockaddr *)&da->da_addr, + .addrlen = da->da_addrlen, + .servername = clp->cl_hostname, + }; + struct nfs4_add_xprt_data xprtdata = { + .clp = clp, + .cred = nfs4_get_clid_cred(clp), + }; + struct rpc_add_xprt_test rpcdata = { + .add_xprt_test = clp->cl_mvops->session_trunk, + .data = &xprtdata, + }; + + /** + * Test this address for session trunking and + * add as an alias + */ + rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args, + rpc_clnt_setup_test_and_add_xprt, + &rpcdata); + if (xprtdata.cred) + put_rpccred(xprtdata.cred); + } else { + clp = nfs4_set_ds_client(mds_srv, + (struct sockaddr *)&da->da_addr, + da->da_addrlen, IPPROTO_TCP, + timeo, retrans, minor_version, + au_flavor); + if (IS_ERR(clp)) + continue; + + status = nfs4_init_ds_session(clp, + mds_srv->nfs_client->cl_lease_time); + if (status) { + nfs_put_client(clp); + clp = ERR_PTR(-EIO); + continue; + } + + } } if (IS_ERR(clp)) { @@ -704,18 +741,11 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, goto out; } - status = nfs4_init_ds_session(clp, mds_srv->nfs_client->cl_lease_time); - if (status) - goto out_put; - smp_wmb(); ds->ds_clp = clp; dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr); out: return status; -out_put: - nfs_put_client(clp); - goto out; } /* diff --git a/fs/nfs/super.c b/fs/nfs/super.c index d39601381adf..001796bcd6c8 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2848,19 +2848,23 @@ out_invalid_transport_udp: * NFS client for backwards compatibility */ unsigned int nfs_callback_set_tcpport; +unsigned short nfs_callback_nr_threads; /* Default cache timeout is 10 minutes */ unsigned int nfs_idmap_cache_timeout = 600; /* Turn off NFSv4 uid/gid mapping when using AUTH_SYS */ bool nfs4_disable_idmapping = true; unsigned short max_session_slots = NFS4_DEF_SLOT_TABLE_SIZE; +unsigned short max_session_cb_slots = NFS4_DEF_CB_SLOT_TABLE_SIZE; unsigned short send_implementation_id = 1; char 
nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN] = ""; bool recover_lost_locks = false; +EXPORT_SYMBOL_GPL(nfs_callback_nr_threads); EXPORT_SYMBOL_GPL(nfs_callback_set_tcpport); EXPORT_SYMBOL_GPL(nfs_idmap_cache_timeout); EXPORT_SYMBOL_GPL(nfs4_disable_idmapping); EXPORT_SYMBOL_GPL(max_session_slots); +EXPORT_SYMBOL_GPL(max_session_cb_slots); EXPORT_SYMBOL_GPL(send_implementation_id); EXPORT_SYMBOL_GPL(nfs4_client_id_uniquifier); EXPORT_SYMBOL_GPL(recover_lost_locks); @@ -2887,6 +2891,9 @@ static const struct kernel_param_ops param_ops_portnr = { #define param_check_portnr(name, p) __param_check(name, p, unsigned int); module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644); +module_param_named(callback_nr_threads, nfs_callback_nr_threads, ushort, 0644); +MODULE_PARM_DESC(callback_nr_threads, "Number of threads that will be " + "assigned to the NFSv4 callback channels."); module_param(nfs_idmap_cache_timeout, int, 0644); module_param(nfs4_disable_idmapping, bool, 0644); module_param_string(nfs4_unique_id, nfs4_client_id_uniquifier, @@ -2896,6 +2903,9 @@ MODULE_PARM_DESC(nfs4_disable_idmapping, module_param(max_session_slots, ushort, 0644); MODULE_PARM_DESC(max_session_slots, "Maximum number of outstanding NFSv4.1 " "requests the client will negotiate"); +module_param(max_session_cb_slots, ushort, 0644); +MODULE_PARM_DESC(max_session_cb_slots, "Maximum number of parallel NFSv4.1 " + "callbacks the client will process for a given server"); module_param(send_implementation_id, ushort, 0644); MODULE_PARM_DESC(send_implementation_id, "Send implementation ID with NFSv4.1 exchange_id"); diff --git a/fs/nfsd/flexfilelayout.c b/fs/nfsd/flexfilelayout.c index df880e9fa71f..b67287383010 100644 --- a/fs/nfsd/flexfilelayout.c +++ b/fs/nfsd/flexfilelayout.c @@ -126,6 +126,7 @@ nfsd4_ff_proc_getdeviceinfo(struct super_block *sb, struct svc_rqst *rqstp, const struct nfsd4_layout_ops ff_layout_ops = { .notify_types = NOTIFY_DEVICEID4_DELETE | 
NOTIFY_DEVICEID4_CHANGE, + .disable_recalls = true, .proc_getdeviceinfo = nfsd4_ff_proc_getdeviceinfo, .encode_getdeviceinfo = nfsd4_ff_encode_getdeviceinfo, .proc_layoutget = nfsd4_ff_proc_layoutget, diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 5fbf3bbd00d0..b10d557f9c9e 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -84,6 +84,7 @@ struct nfsd_net { struct list_head client_lru; struct list_head close_lru; struct list_head del_recall_lru; + struct list_head blocked_locks_lru; struct delayed_work laundromat_work; diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 04c68d900324..211dc2aed8e1 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -448,7 +448,7 @@ static int decode_cb_sequence4res(struct xdr_stream *xdr, { int status; - if (cb->cb_minorversion == 0) + if (cb->cb_clp->cl_minorversion == 0) return 0; status = decode_cb_op_status(xdr, OP_CB_SEQUENCE, &cb->cb_seq_status); @@ -485,7 +485,7 @@ static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr, const struct nfs4_delegation *dp = cb_to_delegation(cb); struct nfs4_cb_compound_hdr hdr = { .ident = cb->cb_clp->cl_cb_ident, - .minorversion = cb->cb_minorversion, + .minorversion = cb->cb_clp->cl_minorversion, }; encode_cb_compound4args(xdr, &hdr); @@ -594,7 +594,7 @@ static void nfs4_xdr_enc_cb_layout(struct rpc_rqst *req, container_of(cb, struct nfs4_layout_stateid, ls_recall); struct nfs4_cb_compound_hdr hdr = { .ident = 0, - .minorversion = cb->cb_minorversion, + .minorversion = cb->cb_clp->cl_minorversion, }; encode_cb_compound4args(xdr, &hdr); @@ -623,6 +623,62 @@ static int nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp, } #endif /* CONFIG_NFSD_PNFS */ +static void encode_stateowner(struct xdr_stream *xdr, struct nfs4_stateowner *so) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 8 + 4 + so->so_owner.len); + p = xdr_encode_opaque_fixed(p, &so->so_client->cl_clientid, 8); + xdr_encode_opaque(p, so->so_owner.data, so->so_owner.len); +} + 
+static void nfs4_xdr_enc_cb_notify_lock(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfsd4_callback *cb) +{ + const struct nfsd4_blocked_lock *nbl = + container_of(cb, struct nfsd4_blocked_lock, nbl_cb); + struct nfs4_lockowner *lo = (struct nfs4_lockowner *)nbl->nbl_lock.fl_owner; + struct nfs4_cb_compound_hdr hdr = { + .ident = 0, + .minorversion = cb->cb_clp->cl_minorversion, + }; + + __be32 *p; + + BUG_ON(hdr.minorversion == 0); + + encode_cb_compound4args(xdr, &hdr); + encode_cb_sequence4args(xdr, cb, &hdr); + + p = xdr_reserve_space(xdr, 4); + *p = cpu_to_be32(OP_CB_NOTIFY_LOCK); + encode_nfs_fh4(xdr, &nbl->nbl_fh); + encode_stateowner(xdr, &lo->lo_owner); + hdr.nops++; + + encode_cb_nops(&hdr); +} + +static int nfs4_xdr_dec_cb_notify_lock(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfsd4_callback *cb) +{ + struct nfs4_cb_compound_hdr hdr; + int status; + + status = decode_cb_compound4res(xdr, &hdr); + if (unlikely(status)) + return status; + + if (cb) { + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status || cb->cb_seq_status)) + return status; + } + return decode_cb_op_status(xdr, OP_CB_NOTIFY_LOCK, &cb->cb_status); +} + /* * RPC procedure tables */ @@ -643,6 +699,7 @@ static struct rpc_procinfo nfs4_cb_procedures[] = { #ifdef CONFIG_NFSD_PNFS PROC(CB_LAYOUT, COMPOUND, cb_layout, cb_layout), #endif + PROC(CB_NOTIFY_LOCK, COMPOUND, cb_notify_lock, cb_notify_lock), }; static struct rpc_version nfs_cb_version4 = { @@ -862,7 +919,6 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) struct nfs4_client *clp = cb->cb_clp; u32 minorversion = clp->cl_minorversion; - cb->cb_minorversion = minorversion; /* * cb_seq_status is only set in decode_cb_sequence4res, * and so will remain 1 if an rpc level failure occurs. 
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index 2be9602b0221..42aace4fc4c8 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -174,7 +174,8 @@ nfsd4_free_layout_stateid(struct nfs4_stid *stid) list_del_init(&ls->ls_perfile); spin_unlock(&fp->fi_lock); - vfs_setlease(ls->ls_file, F_UNLCK, NULL, (void **)&ls); + if (!nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls) + vfs_setlease(ls->ls_file, F_UNLCK, NULL, (void **)&ls); fput(ls->ls_file); if (ls->ls_recalled) @@ -189,6 +190,9 @@ nfsd4_layout_setlease(struct nfs4_layout_stateid *ls) struct file_lock *fl; int status; + if (nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls) + return 0; + fl = locks_alloc_lock(); if (!fl) return -ENOMEM; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 1fb222752b2b..abb09b580389 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1010,47 +1010,97 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } static __be32 -nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, - struct nfsd4_clone *clone) +nfsd4_verify_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + stateid_t *src_stateid, struct file **src, + stateid_t *dst_stateid, struct file **dst) { - struct file *src, *dst; __be32 status; status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->save_fh, - &clone->cl_src_stateid, RD_STATE, - &src, NULL); + src_stateid, RD_STATE, src, NULL); if (status) { dprintk("NFSD: %s: couldn't process src stateid!\n", __func__); goto out; } status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, - &clone->cl_dst_stateid, WR_STATE, - &dst, NULL); + dst_stateid, WR_STATE, dst, NULL); if (status) { dprintk("NFSD: %s: couldn't process dst stateid!\n", __func__); goto out_put_src; } /* fix up for NFS-specific error code */ - if (!S_ISREG(file_inode(src)->i_mode) || - !S_ISREG(file_inode(dst)->i_mode)) { + if (!S_ISREG(file_inode(*src)->i_mode) || + 
!S_ISREG(file_inode(*dst)->i_mode)) { status = nfserr_wrong_type; goto out_put_dst; } +out: + return status; +out_put_dst: + fput(*dst); +out_put_src: + fput(*src); + goto out; +} + +static __be32 +nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_clone *clone) +{ + struct file *src, *dst; + __be32 status; + + status = nfsd4_verify_copy(rqstp, cstate, &clone->cl_src_stateid, &src, + &clone->cl_dst_stateid, &dst); + if (status) + goto out; + status = nfsd4_clone_file_range(src, clone->cl_src_pos, dst, clone->cl_dst_pos, clone->cl_count); -out_put_dst: fput(dst); -out_put_src: fput(src); out: return status; } static __be32 +nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_copy *copy) +{ + struct file *src, *dst; + __be32 status; + ssize_t bytes; + + status = nfsd4_verify_copy(rqstp, cstate, &copy->cp_src_stateid, &src, + &copy->cp_dst_stateid, &dst); + if (status) + goto out; + + bytes = nfsd_copy_file_range(src, copy->cp_src_pos, + dst, copy->cp_dst_pos, copy->cp_count); + + if (bytes < 0) + status = nfserrno(bytes); + else { + copy->cp_res.wr_bytes_written = bytes; + copy->cp_res.wr_stable_how = NFS_UNSTABLE; + copy->cp_consecutive = 1; + copy->cp_synchronous = 1; + gen_boot_verifier(&copy->cp_res.wr_verifier, SVC_NET(rqstp)); + status = nfs_ok; + } + + fput(src); + fput(dst); +out: + return status; +} + +static __be32 nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_fallocate *fallocate, int flags) { @@ -1966,6 +2016,18 @@ static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd op_encode_channel_attrs_maxsz) * sizeof(__be32); } +static inline u32 nfsd4_copy_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + + 1 /* wr_callback */ + + op_encode_stateid_maxsz /* wr_callback */ + + 2 /* wr_count */ + + 1 /* wr_committed */ + + op_encode_verifier_maxsz + + 1 /* cr_consecutive */ + + 1 /* cr_synchronous */) * 
sizeof(__be32); +} + #ifdef CONFIG_NFSD_PNFS /* * At this stage we don't really know what layout driver will handle the request, @@ -2328,6 +2390,12 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_name = "OP_CLONE", .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, + [OP_COPY] = { + .op_func = (nfsd4op_func)nfsd4_copy, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, + .op_name = "OP_COPY", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_copy_rsize, + }, [OP_SEEK] = { .op_func = (nfsd4op_func)nfsd4_seek, .op_name = "OP_SEEK", diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 39bfaba9c99c..9752beb78659 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -99,6 +99,7 @@ static struct kmem_cache *odstate_slab; static void free_session(struct nfsd4_session *); static const struct nfsd4_callback_ops nfsd4_cb_recall_ops; +static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops; static bool is_session_dead(struct nfsd4_session *ses) { @@ -210,6 +211,85 @@ static void nfsd4_put_session(struct nfsd4_session *ses) spin_unlock(&nn->client_lock); } +static struct nfsd4_blocked_lock * +find_blocked_lock(struct nfs4_lockowner *lo, struct knfsd_fh *fh, + struct nfsd_net *nn) +{ + struct nfsd4_blocked_lock *cur, *found = NULL; + + spin_lock(&nn->client_lock); + list_for_each_entry(cur, &lo->lo_blocked, nbl_list) { + if (fh_match(fh, &cur->nbl_fh)) { + list_del_init(&cur->nbl_list); + list_del_init(&cur->nbl_lru); + found = cur; + break; + } + } + spin_unlock(&nn->client_lock); + if (found) + posix_unblock_lock(&found->nbl_lock); + return found; +} + +static struct nfsd4_blocked_lock * +find_or_allocate_block(struct nfs4_lockowner *lo, struct knfsd_fh *fh, + struct nfsd_net *nn) +{ + struct nfsd4_blocked_lock *nbl; + + nbl = find_blocked_lock(lo, fh, nn); + if (!nbl) { + nbl= kmalloc(sizeof(*nbl), GFP_KERNEL); + if (nbl) { + fh_copy_shallow(&nbl->nbl_fh, fh); + locks_init_lock(&nbl->nbl_lock); + nfsd4_init_cb(&nbl->nbl_cb, lo->lo_owner.so_client, + 
&nfsd4_cb_notify_lock_ops, + NFSPROC4_CLNT_CB_NOTIFY_LOCK); + } + } + return nbl; +} + +static void +free_blocked_lock(struct nfsd4_blocked_lock *nbl) +{ + locks_release_private(&nbl->nbl_lock); + kfree(nbl); +} + +static int +nfsd4_cb_notify_lock_done(struct nfsd4_callback *cb, struct rpc_task *task) +{ + /* + * Since this is just an optimization, we don't try very hard if it + * turns out not to succeed. We'll requeue it on NFS4ERR_DELAY, and + * just quit trying on anything else. + */ + switch (task->tk_status) { + case -NFS4ERR_DELAY: + rpc_delay(task, 1 * HZ); + return 0; + default: + return 1; + } +} + +static void +nfsd4_cb_notify_lock_release(struct nfsd4_callback *cb) +{ + struct nfsd4_blocked_lock *nbl = container_of(cb, + struct nfsd4_blocked_lock, nbl_cb); + + free_blocked_lock(nbl); +} + +static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops = { + .done = nfsd4_cb_notify_lock_done, + .release = nfsd4_cb_notify_lock_release, +}; + static inline struct nfs4_stateowner * nfs4_get_stateowner(struct nfs4_stateowner *sop) { @@ -3224,9 +3304,10 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, goto out; /* cases below refer to rfc 3530 section 14.2.34: */ if (!unconf || !same_verf(&confirm, &unconf->cl_confirm)) { - if (conf && !unconf) /* case 2: probable retransmit */ + if (conf && same_verf(&confirm, &conf->cl_confirm)) { + /* case 2: probable retransmit */ status = nfs_ok; - else /* case 4: client hasn't noticed we rebooted yet? */ + } else /* case 4: client hasn't noticed we rebooted yet? */ status = nfserr_stale_clientid; goto out; } @@ -4410,9 +4491,11 @@ out: * To finish the open response, we just need to set the rflags. 
*/ open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX; - if (!(open->op_openowner->oo_flags & NFS4_OO_CONFIRMED) && - !nfsd4_has_session(&resp->cstate)) + if (nfsd4_has_session(&resp->cstate)) + open->op_rflags |= NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK; + else if (!(open->op_openowner->oo_flags & NFS4_OO_CONFIRMED)) open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM; + if (dp) nfs4_put_stid(&dp->dl_stid); if (stp) @@ -4501,6 +4584,7 @@ nfs4_laundromat(struct nfsd_net *nn) struct nfs4_openowner *oo; struct nfs4_delegation *dp; struct nfs4_ol_stateid *stp; + struct nfsd4_blocked_lock *nbl; struct list_head *pos, *next, reaplist; time_t cutoff = get_seconds() - nn->nfsd4_lease; time_t t, new_timeo = nn->nfsd4_lease; @@ -4569,6 +4653,41 @@ nfs4_laundromat(struct nfsd_net *nn) } spin_unlock(&nn->client_lock); + /* + * It's possible for a client to try and acquire an already held lock + * that is being held for a long time, and then lose interest in it. + * So, we clean out any un-revisited request after a lease period + * under the assumption that the client is no longer interested. + * + * RFC5661, sec. 9.6 states that the client must not rely on getting + * notifications and must continue to poll for locks, even when the + * server supports them. Thus this shouldn't lead to clients blocking + * indefinitely once the lock does become free. 
+ */ + BUG_ON(!list_empty(&reaplist)); + spin_lock(&nn->client_lock); + while (!list_empty(&nn->blocked_locks_lru)) { + nbl = list_first_entry(&nn->blocked_locks_lru, + struct nfsd4_blocked_lock, nbl_lru); + if (time_after((unsigned long)nbl->nbl_time, + (unsigned long)cutoff)) { + t = nbl->nbl_time - cutoff; + new_timeo = min(new_timeo, t); + break; + } + list_move(&nbl->nbl_lru, &reaplist); + list_del_init(&nbl->nbl_list); + } + spin_unlock(&nn->client_lock); + + while (!list_empty(&reaplist)) { + nbl = list_first_entry(&reaplist, + struct nfsd4_blocked_lock, nbl_lru); + list_del_init(&nbl->nbl_lru); + posix_unblock_lock(&nbl->nbl_lock); + free_blocked_lock(nbl); + } + new_timeo = max_t(time_t, new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT); return new_timeo; } @@ -5309,7 +5428,31 @@ nfsd4_fl_put_owner(fl_owner_t owner) nfs4_put_stateowner(&lo->lo_owner); } +static void +nfsd4_lm_notify(struct file_lock *fl) +{ + struct nfs4_lockowner *lo = (struct nfs4_lockowner *)fl->fl_owner; + struct net *net = lo->lo_owner.so_client->net; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct nfsd4_blocked_lock *nbl = container_of(fl, + struct nfsd4_blocked_lock, nbl_lock); + bool queue = false; + + /* An empty list means that something else is going to be using it */ + spin_lock(&nn->client_lock); + if (!list_empty(&nbl->nbl_list)) { + list_del_init(&nbl->nbl_list); + list_del_init(&nbl->nbl_lru); + queue = true; + } + spin_unlock(&nn->client_lock); + + if (queue) + nfsd4_run_cb(&nbl->nbl_cb); +} + static const struct lock_manager_operations nfsd_posix_mng_ops = { + .lm_notify = nfsd4_lm_notify, .lm_get_owner = nfsd4_fl_get_owner, .lm_put_owner = nfsd4_fl_put_owner, }; @@ -5407,6 +5550,7 @@ alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, lo = alloc_stateowner(lockowner_slab, &lock->lk_new_owner, clp); if (!lo) return NULL; + INIT_LIST_HEAD(&lo->lo_blocked); INIT_LIST_HEAD(&lo->lo_owner.so_stateids); lo->lo_owner.so_is_open_owner = 
0; lo->lo_owner.so_seqid = lock->lk_new_lock_seqid; @@ -5588,12 +5732,15 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfs4_ol_stateid *open_stp = NULL; struct nfs4_file *fp; struct file *filp = NULL; + struct nfsd4_blocked_lock *nbl = NULL; struct file_lock *file_lock = NULL; struct file_lock *conflock = NULL; __be32 status = 0; int lkflg; int err; bool new = false; + unsigned char fl_type; + unsigned int fl_flags = FL_POSIX; struct net *net = SVC_NET(rqstp); struct nfsd_net *nn = net_generic(net, nfsd_net_id); @@ -5658,46 +5805,55 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (!locks_in_grace(net) && lock->lk_reclaim) goto out; - file_lock = locks_alloc_lock(); - if (!file_lock) { - dprintk("NFSD: %s: unable to allocate lock!\n", __func__); - status = nfserr_jukebox; - goto out; - } - fp = lock_stp->st_stid.sc_file; switch (lock->lk_type) { - case NFS4_READ_LT: case NFS4_READW_LT: + if (nfsd4_has_session(cstate)) + fl_flags |= FL_SLEEP; + /* Fallthrough */ + case NFS4_READ_LT: spin_lock(&fp->fi_lock); filp = find_readable_file_locked(fp); if (filp) get_lock_access(lock_stp, NFS4_SHARE_ACCESS_READ); spin_unlock(&fp->fi_lock); - file_lock->fl_type = F_RDLCK; + fl_type = F_RDLCK; break; - case NFS4_WRITE_LT: case NFS4_WRITEW_LT: + if (nfsd4_has_session(cstate)) + fl_flags |= FL_SLEEP; + /* Fallthrough */ + case NFS4_WRITE_LT: spin_lock(&fp->fi_lock); filp = find_writeable_file_locked(fp); if (filp) get_lock_access(lock_stp, NFS4_SHARE_ACCESS_WRITE); spin_unlock(&fp->fi_lock); - file_lock->fl_type = F_WRLCK; + fl_type = F_WRLCK; break; default: status = nfserr_inval; goto out; } + if (!filp) { status = nfserr_openmode; goto out; } + nbl = find_or_allocate_block(lock_sop, &fp->fi_fhandle, nn); + if (!nbl) { + dprintk("NFSD: %s: unable to allocate block!\n", __func__); + status = nfserr_jukebox; + goto out; + } + + file_lock = &nbl->nbl_lock; + file_lock->fl_type = fl_type; file_lock->fl_owner = 
(fl_owner_t)lockowner(nfs4_get_stateowner(&lock_sop->lo_owner)); file_lock->fl_pid = current->tgid; file_lock->fl_file = filp; - file_lock->fl_flags = FL_POSIX; + file_lock->fl_flags = fl_flags; file_lock->fl_lmops = &nfsd_posix_mng_ops; file_lock->fl_start = lock->lk_offset; file_lock->fl_end = last_byte_offset(lock->lk_offset, lock->lk_length); @@ -5710,18 +5866,29 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } + if (fl_flags & FL_SLEEP) { + nbl->nbl_time = get_seconds(); + spin_lock(&nn->client_lock); + list_add_tail(&nbl->nbl_list, &lock_sop->lo_blocked); + list_add_tail(&nbl->nbl_lru, &nn->blocked_locks_lru); + spin_unlock(&nn->client_lock); + } + err = vfs_lock_file(filp, F_SETLK, file_lock, conflock); - switch (-err) { + switch (err) { case 0: /* success! */ nfs4_inc_and_copy_stateid(&lock->lk_resp_stateid, &lock_stp->st_stid); status = 0; break; - case (EAGAIN): /* conflock holds conflicting lock */ + case FILE_LOCK_DEFERRED: + nbl = NULL; + /* Fallthrough */ + case -EAGAIN: /* conflock holds conflicting lock */ status = nfserr_denied; dprintk("NFSD: nfsd4_lock: conflicting lock found!\n"); nfs4_set_lock_denied(conflock, &lock->lk_denied); break; - case (EDEADLK): + case -EDEADLK: status = nfserr_deadlock; break; default: @@ -5730,6 +5897,16 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, break; } out: + if (nbl) { + /* dequeue it if we queued it before */ + if (fl_flags & FL_SLEEP) { + spin_lock(&nn->client_lock); + list_del_init(&nbl->nbl_list); + list_del_init(&nbl->nbl_lru); + spin_unlock(&nn->client_lock); + } + free_blocked_lock(nbl); + } if (filp) fput(filp); if (lock_stp) { @@ -5753,8 +5930,6 @@ out: if (open_stp) nfs4_put_stid(&open_stp->st_stid); nfsd4_bump_seqid(cstate, status); - if (file_lock) - locks_free_lock(file_lock); if (conflock) locks_free_lock(conflock); return status; @@ -6768,6 +6943,7 @@ static int nfs4_state_create_net(struct net *net) INIT_LIST_HEAD(&nn->client_lru); 
INIT_LIST_HEAD(&nn->close_lru); INIT_LIST_HEAD(&nn->del_recall_lru); + INIT_LIST_HEAD(&nn->blocked_locks_lru); spin_lock_init(&nn->client_lock); INIT_DELAYED_WORK(&nn->laundromat_work, laundromat_main); @@ -6865,6 +7041,7 @@ nfs4_state_shutdown_net(struct net *net) struct nfs4_delegation *dp = NULL; struct list_head *pos, *next, reaplist; struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct nfsd4_blocked_lock *nbl; cancel_delayed_work_sync(&nn->laundromat_work); locks_end_grace(&nn->nfsd4_manager); @@ -6885,6 +7062,24 @@ nfs4_state_shutdown_net(struct net *net) nfs4_put_stid(&dp->dl_stid); } + BUG_ON(!list_empty(&reaplist)); + spin_lock(&nn->client_lock); + while (!list_empty(&nn->blocked_locks_lru)) { + nbl = list_first_entry(&nn->blocked_locks_lru, + struct nfsd4_blocked_lock, nbl_lru); + list_move(&nbl->nbl_lru, &reaplist); + list_del_init(&nbl->nbl_list); + } + spin_unlock(&nn->client_lock); + + while (!list_empty(&reaplist)) { + nbl = list_first_entry(&reaplist, + struct nfsd4_blocked_lock, nbl_lru); + list_del_init(&nbl->nbl_lru); + posix_unblock_lock(&nbl->nbl_lock); + free_blocked_lock(nbl); + } + nfsd4_client_tracking_exit(net); nfs4_state_destroy_net(net); } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 0aa0236a1429..c2d2895a1ec1 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1694,6 +1694,30 @@ nfsd4_decode_clone(struct nfsd4_compoundargs *argp, struct nfsd4_clone *clone) } static __be32 +nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy) +{ + DECODE_HEAD; + unsigned int tmp; + + status = nfsd4_decode_stateid(argp, &copy->cp_src_stateid); + if (status) + return status; + status = nfsd4_decode_stateid(argp, &copy->cp_dst_stateid); + if (status) + return status; + + READ_BUF(8 + 8 + 8 + 4 + 4 + 4); + p = xdr_decode_hyper(p, &copy->cp_src_pos); + p = xdr_decode_hyper(p, &copy->cp_dst_pos); + p = xdr_decode_hyper(p, &copy->cp_count); + copy->cp_consecutive = be32_to_cpup(p++); + copy->cp_synchronous = 
be32_to_cpup(p++); + tmp = be32_to_cpup(p); /* Source server list not supported */ + + DECODE_TAIL; +} + +static __be32 nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek) { DECODE_HEAD; @@ -1793,7 +1817,7 @@ static nfsd4_dec nfsd4_dec_ops[] = { /* new operations for NFSv4.2 */ [OP_ALLOCATE] = (nfsd4_dec)nfsd4_decode_fallocate, - [OP_COPY] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_COPY] = (nfsd4_dec)nfsd4_decode_copy, [OP_COPY_NOTIFY] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_DEALLOCATE] = (nfsd4_dec)nfsd4_decode_fallocate, [OP_IO_ADVISE] = (nfsd4_dec)nfsd4_decode_notsupp, @@ -4062,7 +4086,7 @@ nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr, u32 starting_len = xdr->buf->len, needed_len; __be32 *p; - dprintk("%s: err %d\n", __func__, nfserr); + dprintk("%s: err %d\n", __func__, be32_to_cpu(nfserr)); if (nfserr) goto out; @@ -4202,6 +4226,41 @@ nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr, #endif /* CONFIG_NFSD_PNFS */ static __be32 +nfsd42_encode_write_res(struct nfsd4_compoundres *resp, struct nfsd42_write_res *write) +{ + __be32 *p; + + p = xdr_reserve_space(&resp->xdr, 4 + 8 + 4 + NFS4_VERIFIER_SIZE); + if (!p) + return nfserr_resource; + + *p++ = cpu_to_be32(0); + p = xdr_encode_hyper(p, write->wr_bytes_written); + *p++ = cpu_to_be32(write->wr_stable_how); + p = xdr_encode_opaque_fixed(p, write->wr_verifier.data, + NFS4_VERIFIER_SIZE); + return nfs_ok; +} + +static __be32 +nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_copy *copy) +{ + __be32 *p; + + if (!nfserr) { + nfserr = nfsd42_encode_write_res(resp, &copy->cp_res); + if (nfserr) + return nfserr; + + p = xdr_reserve_space(&resp->xdr, 4 + 4); + *p++ = cpu_to_be32(copy->cp_consecutive); + *p++ = cpu_to_be32(copy->cp_synchronous); + } + return nfserr; +} + +static __be32 nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_seek *seek) { @@ -4300,7 +4359,7 @@ static nfsd4_enc 
nfsd4_enc_ops[] = { /* NFSv4.2 operations */ [OP_ALLOCATE] = (nfsd4_enc)nfsd4_encode_noop, - [OP_COPY] = (nfsd4_enc)nfsd4_encode_noop, + [OP_COPY] = (nfsd4_enc)nfsd4_encode_copy, [OP_COPY_NOTIFY] = (nfsd4_enc)nfsd4_encode_noop, [OP_DEALLOCATE] = (nfsd4_enc)nfsd4_encode_noop, [OP_IO_ADVISE] = (nfsd4_enc)nfsd4_encode_noop, diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 65ad0165a94f..36b2af931e06 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1216,6 +1216,8 @@ static __net_init int nfsd_init_net(struct net *net) goto out_idmap_error; nn->nfsd4_lease = 90; /* default lease time */ nn->nfsd4_grace = 90; + nn->clverifier_counter = prandom_u32(); + nn->clientid_counter = prandom_u32(); return 0; out_idmap_error: diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 08188743db53..010aff5c5a79 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -789,6 +789,7 @@ nfserrno (int errno) { nfserr_toosmall, -ETOOSMALL }, { nfserr_serverfault, -ESERVERFAULT }, { nfserr_serverfault, -ENFILE }, + { nfserr_io, -EUCLEAN }, }; int i; @@ -796,7 +797,7 @@ nfserrno (int errno) if (nfs_errtbl[i].syserr == errno) return nfs_errtbl[i].nfserr; } - WARN(1, "nfsd: non-standard errno: %d\n", errno); + WARN_ONCE(1, "nfsd: non-standard errno: %d\n", errno); return nfserr_io; } diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 45007acaf364..a2b65fc56dd6 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -366,14 +366,21 @@ static struct notifier_block nfsd_inet6addr_notifier = { }; #endif +/* Only used under nfsd_mutex, so this atomic may be overkill: */ +static atomic_t nfsd_notifier_refcount = ATOMIC_INIT(0); + static void nfsd_last_thread(struct svc_serv *serv, struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); - unregister_inetaddr_notifier(&nfsd_inetaddr_notifier); + /* check if the notifier still has clients */ + if (atomic_dec_return(&nfsd_notifier_refcount) == 0) { + unregister_inetaddr_notifier(&nfsd_inetaddr_notifier); #if 
IS_ENABLED(CONFIG_IPV6) - unregister_inet6addr_notifier(&nfsd_inet6addr_notifier); + unregister_inet6addr_notifier(&nfsd_inet6addr_notifier); #endif + } + /* * write_ports can create the server without actually starting * any threads--if we get shut down before any threads are @@ -488,10 +495,13 @@ int nfsd_create_serv(struct net *net) } set_max_drc(); - register_inetaddr_notifier(&nfsd_inetaddr_notifier); + /* check if the notifier is already set */ + if (atomic_inc_return(&nfsd_notifier_refcount) == 1) { + register_inetaddr_notifier(&nfsd_inetaddr_notifier); #if IS_ENABLED(CONFIG_IPV6) - register_inet6addr_notifier(&nfsd_inet6addr_notifier); + register_inet6addr_notifier(&nfsd_inet6addr_notifier); #endif + } do_gettimeofday(&nn->nfssvc_boot); /* record boot time */ return 0; } diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h index 0c2a716e8741..d27a5aa60022 100644 --- a/fs/nfsd/pnfs.h +++ b/fs/nfsd/pnfs.h @@ -19,6 +19,7 @@ struct nfsd4_deviceid_map { struct nfsd4_layout_ops { u32 notify_types; + bool disable_recalls; __be32 (*proc_getdeviceinfo)(struct super_block *sb, struct svc_rqst *rqstp, diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index b95adf9a1595..c9399366f9df 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -63,7 +63,6 @@ typedef struct { struct nfsd4_callback { struct nfs4_client *cb_clp; - u32 cb_minorversion; struct rpc_message cb_msg; const struct nfsd4_callback_ops *cb_ops; struct work_struct cb_work; @@ -441,11 +440,11 @@ struct nfs4_openowner { /* * Represents a generic "lockowner". Similar to an openowner. References to it * are held by the lock stateids that are created on its behalf. This object is - * a superset of the nfs4_stateowner struct (or would be if it needed any extra - * fields). + * a superset of the nfs4_stateowner struct. 
*/ struct nfs4_lockowner { - struct nfs4_stateowner lo_owner; /* must be first element */ + struct nfs4_stateowner lo_owner; /* must be first element */ + struct list_head lo_blocked; /* blocked file_locks */ }; static inline struct nfs4_openowner * openowner(struct nfs4_stateowner *so) @@ -572,6 +571,7 @@ enum nfsd4_cb_op { NFSPROC4_CLNT_CB_RECALL, NFSPROC4_CLNT_CB_LAYOUT, NFSPROC4_CLNT_CB_SEQUENCE, + NFSPROC4_CLNT_CB_NOTIFY_LOCK, }; /* Returns true iff a is later than b: */ @@ -580,6 +580,20 @@ static inline bool nfsd4_stateid_generation_after(stateid_t *a, stateid_t *b) return (s32)(a->si_generation - b->si_generation) > 0; } +/* + * When a client tries to get a lock on a file, we set one of these objects + * on the blocking lock. When the lock becomes free, we can then issue a + * CB_NOTIFY_LOCK to the client. + */ +struct nfsd4_blocked_lock { + struct list_head nbl_list; + struct list_head nbl_lru; + unsigned long nbl_time; + struct file_lock nbl_lock; + struct knfsd_fh nbl_fh; + struct nfsd4_callback nbl_cb; +}; + struct nfsd4_compound_state; struct nfsd_net; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index ff476e654b8f..8ca642fe9b21 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -513,6 +513,22 @@ __be32 nfsd4_clone_file_range(struct file *src, u64 src_pos, struct file *dst, count)); } +ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst, + u64 dst_pos, u64 count) +{ + + /* + * Limit copy to 4MB to prevent indefinitely blocking an nfsd + * thread and client rpc slot. The choice of 4MB is somewhat + * arbitrary. We might instead base this on r/wsize, or make it + * tunable, or use a time instead of a byte limit, or implement + * asynchronous copy. In theory a client could also recognize a + * limit like this and pipeline multiple COPY requests. 
+ */ + count = min_t(u64, count, 1 << 22); + return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0); +} + __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, loff_t len, int flags) diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 3cbb1b33777b..0bf9e7bf5800 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -96,6 +96,8 @@ __be32 nfsd_symlink(struct svc_rqst *, struct svc_fh *, struct svc_fh *res); __be32 nfsd_link(struct svc_rqst *, struct svc_fh *, char *, int, struct svc_fh *); +ssize_t nfsd_copy_file_range(struct file *, u64, + struct file *, u64, u64); __be32 nfsd_rename(struct svc_rqst *, struct svc_fh *, char *, int, struct svc_fh *, char *, int); diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index beea0c5edc51..8fda4abdf3b1 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -503,6 +503,28 @@ struct nfsd4_clone { u64 cl_count; }; +struct nfsd42_write_res { + u64 wr_bytes_written; + u32 wr_stable_how; + nfs4_verifier wr_verifier; +}; + +struct nfsd4_copy { + /* request */ + stateid_t cp_src_stateid; + stateid_t cp_dst_stateid; + u64 cp_src_pos; + u64 cp_dst_pos; + u64 cp_count; + + /* both */ + bool cp_consecutive; + bool cp_synchronous; + + /* response */ + struct nfsd42_write_res cp_res; +}; + struct nfsd4_seek { /* request */ stateid_t seek_stateid; @@ -568,6 +590,7 @@ struct nfsd4_op { struct nfsd4_fallocate allocate; struct nfsd4_fallocate deallocate; struct nfsd4_clone clone; + struct nfsd4_copy copy; struct nfsd4_seek seek; } u; struct nfs4_replay * replay; diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h index c47f6fdb111a..49b719dfef95 100644 --- a/fs/nfsd/xdr4cb.h +++ b/fs/nfsd/xdr4cb.h @@ -28,3 +28,12 @@ #define NFS4_dec_cb_layout_sz (cb_compound_dec_hdr_sz + \ cb_sequence_dec_sz + \ op_dec_sz) + +#define NFS4_enc_cb_notify_lock_sz (cb_compound_enc_hdr_sz + \ + cb_sequence_enc_sz + \ + 2 + 1 + \ + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \ + enc_nfs4_fh_sz) +#define NFS4_dec_cb_notify_lock_sz 
(cb_compound_dec_hdr_sz + \ + cb_sequence_dec_sz + \ + op_dec_sz) diff --git a/fs/open.c b/fs/open.c index a7719cfb7257..d3ed8171e8e0 100644 --- a/fs/open.c +++ b/fs/open.c @@ -267,6 +267,11 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) (mode & ~FALLOC_FL_INSERT_RANGE)) return -EINVAL; + /* Unshare range should only be used with allocate mode. */ + if ((mode & FALLOC_FL_UNSHARE_RANGE) && + (mode & ~(FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_KEEP_SIZE))) + return -EINVAL; + if (!(file->f_mode & FMODE_WRITE)) return -EBADF; diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 584e87e11cb6..26ef1958b65b 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -55,6 +55,8 @@ xfs-y += $(addprefix libxfs/, \ xfs_ag_resv.o \ xfs_rmap.o \ xfs_rmap_btree.o \ + xfs_refcount.o \ + xfs_refcount_btree.o \ xfs_sb.o \ xfs_symlink_remote.o \ xfs_trans_resv.o \ @@ -88,6 +90,7 @@ xfs-y += xfs_aops.o \ xfs_message.o \ xfs_mount.o \ xfs_mru_cache.o \ + xfs_reflink.o \ xfs_stats.o \ xfs_super.o \ xfs_symlink.o \ @@ -100,16 +103,20 @@ xfs-y += xfs_aops.o \ # low-level transaction/log code xfs-y += xfs_log.o \ xfs_log_cil.o \ + xfs_bmap_item.o \ xfs_buf_item.o \ xfs_extfree_item.o \ xfs_icreate_item.o \ xfs_inode_item.o \ + xfs_refcount_item.o \ xfs_rmap_item.o \ xfs_log_recover.o \ xfs_trans_ail.o \ + xfs_trans_bmap.o \ xfs_trans_buf.o \ xfs_trans_extfree.o \ xfs_trans_inode.o \ + xfs_trans_refcount.o \ xfs_trans_rmap.o \ # optional features diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index e3ae0f2b4294..e5ebc3770460 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -38,6 +38,7 @@ #include "xfs_trans_space.h" #include "xfs_rmap_btree.h" #include "xfs_btree.h" +#include "xfs_refcount_btree.h" /* * Per-AG Block Reservations @@ -108,7 +109,9 @@ xfs_ag_resv_critical( trace_xfs_ag_resv_critical(pag, type, avail); /* Critically low if less than 10% or max btree height remains. 
*/ - return avail < orig / 10 || avail < XFS_BTREE_MAXLEVELS; + return XFS_TEST_ERROR(avail < orig / 10 || avail < XFS_BTREE_MAXLEVELS, + pag->pag_mount, XFS_ERRTAG_AG_RESV_CRITICAL, + XFS_RANDOM_AG_RESV_CRITICAL); } /* @@ -228,6 +231,11 @@ xfs_ag_resv_init( if (pag->pag_meta_resv.ar_asked == 0) { ask = used = 0; + error = xfs_refcountbt_calc_reserves(pag->pag_mount, + pag->pag_agno, &ask, &used); + if (error) + goto out; + error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA, ask, used); if (error) @@ -238,6 +246,11 @@ xfs_ag_resv_init( if (pag->pag_agfl_resv.ar_asked == 0) { ask = used = 0; + error = xfs_rmapbt_calc_reserves(pag->pag_mount, pag->pag_agno, + &ask, &used); + if (error) + goto out; + error = __xfs_ag_resv_init(pag, XFS_AG_RESV_AGFL, ask, used); if (error) goto out; diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index ca75dc90ebe0..effb64cf714f 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -52,10 +52,23 @@ STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *); STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *, xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *); +unsigned int +xfs_refc_block( + struct xfs_mount *mp) +{ + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + return XFS_RMAP_BLOCK(mp) + 1; + if (xfs_sb_version_hasfinobt(&mp->m_sb)) + return XFS_FIBT_BLOCK(mp) + 1; + return XFS_IBT_BLOCK(mp) + 1; +} + xfs_extlen_t xfs_prealloc_blocks( struct xfs_mount *mp) { + if (xfs_sb_version_hasreflink(&mp->m_sb)) + return xfs_refc_block(mp) + 1; if (xfs_sb_version_hasrmapbt(&mp->m_sb)) return XFS_RMAP_BLOCK(mp) + 1; if (xfs_sb_version_hasfinobt(&mp->m_sb)) @@ -115,6 +128,8 @@ xfs_alloc_ag_max_usable( blocks++; /* finobt root block */ if (xfs_sb_version_hasrmapbt(&mp->m_sb)) blocks++; /* rmap root block */ + if (xfs_sb_version_hasreflink(&mp->m_sb)) + blocks++; /* refcount root block */ return mp->m_sb.sb_agblocks - blocks; } @@ -2321,6 +2336,9 @@ xfs_alloc_log_agf( offsetof(xfs_agf_t, 
agf_btreeblks), offsetof(xfs_agf_t, agf_uuid), offsetof(xfs_agf_t, agf_rmap_blocks), + offsetof(xfs_agf_t, agf_refcount_blocks), + offsetof(xfs_agf_t, agf_refcount_root), + offsetof(xfs_agf_t, agf_refcount_level), /* needed so that we don't log the whole rest of the structure: */ offsetof(xfs_agf_t, agf_spare64), sizeof(xfs_agf_t) @@ -2458,6 +2476,10 @@ xfs_agf_verify( be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length)) return false; + if (xfs_sb_version_hasreflink(&mp->m_sb) && + be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS) + return false; + return true;; } @@ -2578,6 +2600,7 @@ xfs_alloc_read_agf( be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); pag->pagf_levels[XFS_BTNUM_RMAPi] = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]); + pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level); spin_lock_init(&pag->pagb_lock); pag->pagb_count = 0; pag->pagb_tree = RB_ROOT; diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 9d7f61d36645..c27344cf38e1 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -48,6 +48,7 @@ #include "xfs_filestream.h" #include "xfs_rmap.h" #include "xfs_ag_resv.h" +#include "xfs_refcount.h" kmem_zone_t *xfs_bmap_free_item_zone; @@ -140,7 +141,8 @@ xfs_bmbt_lookup_ge( */ static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork) { - return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && + return whichfork != XFS_COW_FORK && + XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && XFS_IFORK_NEXTENTS(ip, whichfork) > XFS_IFORK_MAXEXT(ip, whichfork); } @@ -150,7 +152,8 @@ static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork) */ static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork) { - return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && + return whichfork != XFS_COW_FORK && + XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && XFS_IFORK_NEXTENTS(ip, whichfork) <= 
XFS_IFORK_MAXEXT(ip, whichfork); } @@ -640,6 +643,7 @@ xfs_bmap_btree_to_extents( mp = ip->i_mount; ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(whichfork != XFS_COW_FORK); ASSERT(ifp->if_flags & XFS_IFEXTENTS); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); rblock = ifp->if_broot; @@ -706,6 +710,7 @@ xfs_bmap_extents_to_btree( xfs_bmbt_ptr_t *pp; /* root block address pointer */ mp = ip->i_mount; + ASSERT(whichfork != XFS_COW_FORK); ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS); @@ -748,6 +753,7 @@ xfs_bmap_extents_to_btree( args.type = XFS_ALLOCTYPE_START_BNO; args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino); } else if (dfops->dop_low) { +try_another_ag: args.type = XFS_ALLOCTYPE_START_BNO; args.fsbno = *firstblock; } else { @@ -762,6 +768,21 @@ xfs_bmap_extents_to_btree( xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); return error; } + + /* + * During a CoW operation, the allocation and bmbt updates occur in + * different transactions. The mapping code tries to put new bmbt + * blocks near extents being mapped, but the only way to guarantee this + * is if the alloc and the mapping happen in a single transaction that + * has a block reservation. That isn't the case here, so if we run out + * of space we'll try again with another AG. + */ + if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) && + args.fsbno == NULLFSBLOCK && + args.type == XFS_ALLOCTYPE_NEAR_BNO) { + dfops->dop_low = true; + goto try_another_ag; + } /* * Allocation can't fail, the space was reserved. */ @@ -837,6 +858,7 @@ xfs_bmap_local_to_extents_empty( { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(whichfork != XFS_COW_FORK); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); ASSERT(ifp->if_bytes == 0); ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0); @@ -896,6 +918,7 @@ xfs_bmap_local_to_extents( * file currently fits in an inode. 
*/ if (*firstblock == NULLFSBLOCK) { +try_another_ag: args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino); args.type = XFS_ALLOCTYPE_START_BNO; } else { @@ -908,6 +931,19 @@ xfs_bmap_local_to_extents( if (error) goto done; + /* + * During a CoW operation, the allocation and bmbt updates occur in + * different transactions. The mapping code tries to put new bmbt + * blocks near extents being mapped, but the only way to guarantee this + * is if the alloc and the mapping happen in a single transaction that + * has a block reservation. That isn't the case here, so if we run out + * of space we'll try again with another AG. + */ + if (xfs_sb_version_hasreflink(&ip->i_mount->m_sb) && + args.fsbno == NULLFSBLOCK && + args.type == XFS_ALLOCTYPE_NEAR_BNO) { + goto try_another_ag; + } /* Can't fail, the space was reserved. */ ASSERT(args.fsbno != NULLFSBLOCK); ASSERT(args.len == 1); @@ -1670,7 +1706,8 @@ xfs_bmap_one_block( */ STATIC int /* error */ xfs_bmap_add_extent_delay_real( - struct xfs_bmalloca *bma) + struct xfs_bmalloca *bma, + int whichfork) { struct xfs_bmbt_irec *new = &bma->got; int diff; /* temp value */ @@ -1688,11 +1725,14 @@ xfs_bmap_add_extent_delay_real( xfs_filblks_t temp=0; /* value for da_new calculations */ xfs_filblks_t temp2=0;/* value for da_new calculations */ int tmp_rval; /* partial logging flags */ - int whichfork = XFS_DATA_FORK; struct xfs_mount *mp; + xfs_extnum_t *nextents; mp = bma->ip->i_mount; ifp = XFS_IFORK_PTR(bma->ip, whichfork); + ASSERT(whichfork != XFS_ATTR_FORK); + nextents = (whichfork == XFS_COW_FORK ? &bma->ip->i_cnextents : + &bma->ip->i_d.di_nextents); ASSERT(bma->idx >= 0); ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); @@ -1706,6 +1746,9 @@ xfs_bmap_add_extent_delay_real( #define RIGHT r[1] #define PREV r[2] + if (whichfork == XFS_COW_FORK) + state |= BMAP_COWFORK; + /* * Set up a bunch of variables to make the tests simpler. 
*/ @@ -1792,7 +1835,7 @@ xfs_bmap_add_extent_delay_real( trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); xfs_iext_remove(bma->ip, bma->idx + 1, 2, state); - bma->ip->i_d.di_nextents--; + (*nextents)--; if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -1894,7 +1937,7 @@ xfs_bmap_add_extent_delay_real( xfs_bmbt_set_startblock(ep, new->br_startblock); trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - bma->ip->i_d.di_nextents++; + (*nextents)++; if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -1964,7 +2007,7 @@ xfs_bmap_add_extent_delay_real( temp = PREV.br_blockcount - new->br_blockcount; xfs_bmbt_set_blockcount(ep, temp); xfs_iext_insert(bma->ip, bma->idx, 1, new, state); - bma->ip->i_d.di_nextents++; + (*nextents)++; if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2048,7 +2091,7 @@ xfs_bmap_add_extent_delay_real( trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); xfs_iext_insert(bma->ip, bma->idx + 1, 1, new, state); - bma->ip->i_d.di_nextents++; + (*nextents)++; if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2117,7 +2160,7 @@ xfs_bmap_add_extent_delay_real( RIGHT.br_blockcount = temp2; /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */ xfs_iext_insert(bma->ip, bma->idx + 1, 2, &LEFT, state); - bma->ip->i_d.di_nextents++; + (*nextents)++; if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2215,7 +2258,8 @@ xfs_bmap_add_extent_delay_real( xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork); done: - bma->logflags |= rval; + if (whichfork != XFS_COW_FORK) + bma->logflags |= rval; return error; #undef LEFT #undef RIGHT @@ -2759,6 +2803,7 @@ done: STATIC void xfs_bmap_add_extent_hole_delay( xfs_inode_t *ip, /* incore inode pointer */ + int whichfork, xfs_extnum_t *idx, /* extent number to update/insert */ xfs_bmbt_irec_t *new) /* new data to add to file extents */ { @@ 
-2770,8 +2815,10 @@ xfs_bmap_add_extent_hole_delay( int state; /* state bits, accessed thru macros */ xfs_filblks_t temp=0; /* temp for indirect calculations */ - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + ifp = XFS_IFORK_PTR(ip, whichfork); state = 0; + if (whichfork == XFS_COW_FORK) + state |= BMAP_COWFORK; ASSERT(isnullstartblock(new->br_startblock)); /* @@ -2789,7 +2836,7 @@ xfs_bmap_add_extent_hole_delay( * Check and set flags if the current (right) segment exists. * If it doesn't exist, we're converting the hole at end-of-file. */ - if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { + if (*idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { state |= BMAP_RIGHT_VALID; xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right); @@ -2923,6 +2970,7 @@ xfs_bmap_add_extent_hole_real( ASSERT(!isnullstartblock(new->br_startblock)); ASSERT(!bma->cur || !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); + ASSERT(whichfork != XFS_COW_FORK); XFS_STATS_INC(mp, xs_add_exlist); @@ -3648,7 +3696,9 @@ xfs_bmap_btalloc( else if (mp->m_dalign) stripe_align = mp->m_dalign; - if (xfs_alloc_is_userdata(ap->datatype)) + if (ap->flags & XFS_BMAPI_COWFORK) + align = xfs_get_cowextsz_hint(ap->ip); + else if (xfs_alloc_is_userdata(ap->datatype)) align = xfs_get_extsz_hint(ap->ip); if (unlikely(align)) { error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, @@ -3856,7 +3906,8 @@ xfs_bmap_btalloc( ASSERT(nullfb || fb_agno == args.agno || (ap->dfops->dop_low && fb_agno < args.agno)); ap->length = args.len; - ap->ip->i_d.di_nblocks += args.len; + if (!(ap->flags & XFS_BMAPI_COWFORK)) + ap->ip->i_d.di_nblocks += args.len; xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); if (ap->wasdel) ap->ip->i_delayed_blks -= args.len; @@ -3876,6 +3927,63 @@ xfs_bmap_btalloc( } /* + * For a remap operation, just "allocate" an extent at the address that the + * caller passed in, and ensure that the AGFL is the right size. 
The caller + * will then map the "allocated" extent into the file somewhere. + */ +STATIC int +xfs_bmap_remap_alloc( + struct xfs_bmalloca *ap) +{ + struct xfs_trans *tp = ap->tp; + struct xfs_mount *mp = tp->t_mountp; + xfs_agblock_t bno; + struct xfs_alloc_arg args; + int error; + + /* + * validate that the block number is legal - the enables us to detect + * and handle a silent filesystem corruption rather than crashing. + */ + memset(&args, 0, sizeof(struct xfs_alloc_arg)); + args.tp = ap->tp; + args.mp = ap->tp->t_mountp; + bno = *ap->firstblock; + args.agno = XFS_FSB_TO_AGNO(mp, bno); + args.agbno = XFS_FSB_TO_AGBNO(mp, bno); + if (args.agno >= mp->m_sb.sb_agcount || + args.agbno >= mp->m_sb.sb_agblocks) + return -EFSCORRUPTED; + + /* "Allocate" the extent from the range we passed in. */ + trace_xfs_bmap_remap_alloc(ap->ip, *ap->firstblock, ap->length); + ap->blkno = bno; + ap->ip->i_d.di_nblocks += ap->length; + xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); + + /* Fix the freelist, like a real allocator does. */ + args.datatype = ap->datatype; + args.pag = xfs_perag_get(args.mp, args.agno); + ASSERT(args.pag); + + /* + * The freelist fixing code will decline the allocation if + * the size and shape of the free space doesn't allow for + * allocating the extent and updating all the metadata that + * happens during an allocation. We're remapping, not + * allocating, so skip that check by pretending to be freeing. + */ + error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING); + if (error) + goto error0; +error0: + xfs_perag_put(args.pag); + if (error) + trace_xfs_bmap_remap_alloc_error(ap->ip, error, _RET_IP_); + return error; +} + +/* * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file. * It figures out where to ask the underlying allocator to put the new extent. 
*/ @@ -3883,6 +3991,8 @@ STATIC int xfs_bmap_alloc( struct xfs_bmalloca *ap) /* bmap alloc argument struct */ { + if (ap->flags & XFS_BMAPI_REMAP) + return xfs_bmap_remap_alloc(ap); if (XFS_IS_REALTIME_INODE(ap->ip) && xfs_alloc_is_userdata(ap->datatype)) return xfs_bmap_rtalloc(ap); @@ -4012,12 +4122,11 @@ xfs_bmapi_read( int error; int eof; int n = 0; - int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? - XFS_ATTR_FORK : XFS_DATA_FORK; + int whichfork = xfs_bmapi_whichfork(flags); ASSERT(*nmap >= 1); ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE| - XFS_BMAPI_IGSTATE))); + XFS_BMAPI_IGSTATE|XFS_BMAPI_COWFORK))); ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)); if (unlikely(XFS_TEST_ERROR( @@ -4035,6 +4144,16 @@ xfs_bmapi_read( ifp = XFS_IFORK_PTR(ip, whichfork); + /* No CoW fork? Return a hole. */ + if (whichfork == XFS_COW_FORK && !ifp) { + mval->br_startoff = bno; + mval->br_startblock = HOLESTARTBLOCK; + mval->br_blockcount = len; + mval->br_state = XFS_EXT_NORM; + *nmap = 1; + return 0; + } + if (!(ifp->if_flags & XFS_IFEXTENTS)) { error = xfs_iread_extents(NULL, ip, whichfork); if (error) @@ -4084,6 +4203,7 @@ xfs_bmapi_read( int xfs_bmapi_reserve_delalloc( struct xfs_inode *ip, + int whichfork, xfs_fileoff_t aoff, xfs_filblks_t len, struct xfs_bmbt_irec *got, @@ -4092,7 +4212,7 @@ xfs_bmapi_reserve_delalloc( int eof) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); xfs_extlen_t alen; xfs_extlen_t indlen; char rt = XFS_IS_REALTIME_INODE(ip); @@ -4104,7 +4224,10 @@ xfs_bmapi_reserve_delalloc( alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff); /* Figure out the extent size, adjust alen */ - extsz = xfs_get_extsz_hint(ip); + if (whichfork == XFS_COW_FORK) + extsz = xfs_get_cowextsz_hint(ip); + else + extsz = xfs_get_extsz_hint(ip); if (extsz) { error = xfs_bmap_extsize_align(mp, got, prev, extsz, rt, eof, 1, 0, &aoff, &alen); @@ -4151,7 
+4274,7 @@ xfs_bmapi_reserve_delalloc( got->br_startblock = nullstartblock(indlen); got->br_blockcount = alen; got->br_state = XFS_EXT_NORM; - xfs_bmap_add_extent_hole_delay(ip, lastx, got); + xfs_bmap_add_extent_hole_delay(ip, whichfork, lastx, got); /* * Update our extent pointer, given that xfs_bmap_add_extent_hole_delay @@ -4182,8 +4305,7 @@ xfs_bmapi_allocate( struct xfs_bmalloca *bma) { struct xfs_mount *mp = bma->ip->i_mount; - int whichfork = (bma->flags & XFS_BMAPI_ATTRFORK) ? - XFS_ATTR_FORK : XFS_DATA_FORK; + int whichfork = xfs_bmapi_whichfork(bma->flags); struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); int tmp_logflags = 0; int error; @@ -4278,7 +4400,7 @@ xfs_bmapi_allocate( bma->got.br_state = XFS_EXT_UNWRITTEN; if (bma->wasdel) - error = xfs_bmap_add_extent_delay_real(bma); + error = xfs_bmap_add_extent_delay_real(bma, whichfork); else error = xfs_bmap_add_extent_hole_real(bma, whichfork); @@ -4308,8 +4430,7 @@ xfs_bmapi_convert_unwritten( xfs_filblks_t len, int flags) { - int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? - XFS_ATTR_FORK : XFS_DATA_FORK; + int whichfork = xfs_bmapi_whichfork(flags); struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); int tmp_logflags = 0; int error; @@ -4325,6 +4446,8 @@ xfs_bmapi_convert_unwritten( (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT)) return 0; + ASSERT(whichfork != XFS_COW_FORK); + /* * Modify (by adding) the state flag, if writing. */ @@ -4431,8 +4554,7 @@ xfs_bmapi_write( orig_mval = mval; orig_nmap = *nmap; #endif - whichfork = (flags & XFS_BMAPI_ATTRFORK) ? 
- XFS_ATTR_FORK : XFS_DATA_FORK; + whichfork = xfs_bmapi_whichfork(flags); ASSERT(*nmap >= 1); ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); @@ -4441,6 +4563,11 @@ xfs_bmapi_write( ASSERT(len > 0); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(!(flags & XFS_BMAPI_REMAP) || whichfork == XFS_DATA_FORK); + ASSERT(!(flags & XFS_BMAPI_PREALLOC) || !(flags & XFS_BMAPI_REMAP)); + ASSERT(!(flags & XFS_BMAPI_CONVERT) || !(flags & XFS_BMAPI_REMAP)); + ASSERT(!(flags & XFS_BMAPI_PREALLOC) || whichfork != XFS_COW_FORK); + ASSERT(!(flags & XFS_BMAPI_CONVERT) || whichfork != XFS_COW_FORK); /* zeroing is for currently only for data extents, not metadata */ ASSERT((flags & (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO)) != @@ -4502,6 +4629,14 @@ xfs_bmapi_write( wasdelay = !inhole && isnullstartblock(bma.got.br_startblock); /* + * Make sure we only reflink into a hole. + */ + if (flags & XFS_BMAPI_REMAP) + ASSERT(inhole); + if (flags & XFS_BMAPI_COWFORK) + ASSERT(!inhole); + + /* * First, deal with the hole before the allocated space * that we found, if any. */ @@ -4531,6 +4666,17 @@ xfs_bmapi_write( goto error0; if (bma.blkno == NULLFSBLOCK) break; + + /* + * If this is a CoW allocation, record the data in + * the refcount btree for orphan recovery. + */ + if (whichfork == XFS_COW_FORK) { + error = xfs_refcount_alloc_cow_extent(mp, dfops, + bma.blkno, bma.length); + if (error) + goto error0; + } } /* Deal with the allocated space we found. 
*/ @@ -4696,7 +4842,8 @@ xfs_bmap_del_extent( xfs_btree_cur_t *cur, /* if null, not a btree */ xfs_bmbt_irec_t *del, /* data to remove from extents */ int *logflagsp, /* inode logging flags */ - int whichfork) /* data or attr fork */ + int whichfork, /* data or attr fork */ + int bflags) /* bmapi flags */ { xfs_filblks_t da_new; /* new delay-alloc indirect blocks */ xfs_filblks_t da_old; /* old delay-alloc indirect blocks */ @@ -4725,6 +4872,8 @@ xfs_bmap_del_extent( if (whichfork == XFS_ATTR_FORK) state |= BMAP_ATTRFORK; + else if (whichfork == XFS_COW_FORK) + state |= BMAP_COWFORK; ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT((*idx >= 0) && (*idx < ifp->if_bytes / @@ -4805,6 +4954,7 @@ xfs_bmap_del_extent( /* * Matches the whole extent. Delete the entry. */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); xfs_iext_remove(ip, *idx, 1, whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0); --*idx; @@ -4988,9 +5138,16 @@ xfs_bmap_del_extent( /* * If we need to, add to list of extents to delete. */ - if (do_fx) - xfs_bmap_add_free(mp, dfops, del->br_startblock, - del->br_blockcount, NULL); + if (do_fx && !(bflags & XFS_BMAPI_REMAP)) { + if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) { + error = xfs_refcount_decrease_extent(mp, dfops, del); + if (error) + goto done; + } else + xfs_bmap_add_free(mp, dfops, del->br_startblock, + del->br_blockcount, NULL); + } + /* * Adjust inode # blocks in the file. */ @@ -4999,7 +5156,7 @@ xfs_bmap_del_extent( /* * Adjust quota data. */ - if (qfield) + if (qfield && !(bflags & XFS_BMAPI_REMAP)) xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks); /* @@ -5014,6 +5171,175 @@ done: return error; } +/* Remove an extent from the CoW fork. Similar to xfs_bmap_del_extent. 
*/ +int +xfs_bunmapi_cow( + struct xfs_inode *ip, + struct xfs_bmbt_irec *del) +{ + xfs_filblks_t da_new; + xfs_filblks_t da_old; + xfs_fsblock_t del_endblock = 0; + xfs_fileoff_t del_endoff; + int delay; + struct xfs_bmbt_rec_host *ep; + int error; + struct xfs_bmbt_irec got; + xfs_fileoff_t got_endoff; + struct xfs_ifork *ifp; + struct xfs_mount *mp; + xfs_filblks_t nblks; + struct xfs_bmbt_irec new; + /* REFERENCED */ + uint qfield; + xfs_filblks_t temp; + xfs_filblks_t temp2; + int state = BMAP_COWFORK; + int eof; + xfs_extnum_t eidx; + + mp = ip->i_mount; + XFS_STATS_INC(mp, xs_del_exlist); + + ep = xfs_bmap_search_extents(ip, del->br_startoff, XFS_COW_FORK, &eof, + &eidx, &got, &new); + + ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); ifp = ifp; + ASSERT((eidx >= 0) && (eidx < ifp->if_bytes / + (uint)sizeof(xfs_bmbt_rec_t))); + ASSERT(del->br_blockcount > 0); + ASSERT(got.br_startoff <= del->br_startoff); + del_endoff = del->br_startoff + del->br_blockcount; + got_endoff = got.br_startoff + got.br_blockcount; + ASSERT(got_endoff >= del_endoff); + delay = isnullstartblock(got.br_startblock); + ASSERT(isnullstartblock(del->br_startblock) == delay); + qfield = 0; + error = 0; + /* + * If deleting a real allocation, must free up the disk space. + */ + if (!delay) { + nblks = del->br_blockcount; + qfield = XFS_TRANS_DQ_BCOUNT; + /* + * Set up del_endblock and cur for later. + */ + del_endblock = del->br_startblock + del->br_blockcount; + da_old = da_new = 0; + } else { + da_old = startblockval(got.br_startblock); + da_new = 0; + nblks = 0; + } + qfield = qfield; + nblks = nblks; + + /* + * Set flag value to use in switch statement. + * Left-contig is 2, right-contig is 1. + */ + switch (((got.br_startoff == del->br_startoff) << 1) | + (got_endoff == del_endoff)) { + case 3: + /* + * Matches the whole extent. Delete the entry. + */ + xfs_iext_remove(ip, eidx, 1, BMAP_COWFORK); + --eidx; + break; + + case 2: + /* + * Deleting the first part of the extent. 
+ */ + trace_xfs_bmap_pre_update(ip, eidx, state, _THIS_IP_); + xfs_bmbt_set_startoff(ep, del_endoff); + temp = got.br_blockcount - del->br_blockcount; + xfs_bmbt_set_blockcount(ep, temp); + if (delay) { + temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_old); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_); + da_new = temp; + break; + } + xfs_bmbt_set_startblock(ep, del_endblock); + trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_); + break; + + case 1: + /* + * Deleting the last part of the extent. + */ + temp = got.br_blockcount - del->br_blockcount; + trace_xfs_bmap_pre_update(ip, eidx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, temp); + if (delay) { + temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_old); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_); + da_new = temp; + break; + } + trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_); + break; + + case 0: + /* + * Deleting the middle of the extent. 
+ */ + temp = del->br_startoff - got.br_startoff; + trace_xfs_bmap_pre_update(ip, eidx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, temp); + new.br_startoff = del_endoff; + temp2 = got_endoff - del_endoff; + new.br_blockcount = temp2; + new.br_state = got.br_state; + if (!delay) { + new.br_startblock = del_endblock; + } else { + temp = xfs_bmap_worst_indlen(ip, temp); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + temp2 = xfs_bmap_worst_indlen(ip, temp2); + new.br_startblock = nullstartblock((int)temp2); + da_new = temp + temp2; + while (da_new > da_old) { + if (temp) { + temp--; + da_new--; + xfs_bmbt_set_startblock(ep, + nullstartblock((int)temp)); + } + if (da_new == da_old) + break; + if (temp2) { + temp2--; + da_new--; + new.br_startblock = + nullstartblock((int)temp2); + } + } + } + trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_); + xfs_iext_insert(ip, eidx + 1, 1, &new, state); + ++eidx; + break; + } + + /* + * Account for change in delayed indirect blocks. + * Nothing to do for disk quota accounting here. + */ + ASSERT(da_old >= da_new); + if (da_old > da_new) + xfs_mod_fdblocks(mp, (int64_t)(da_old - da_new), false); + + return error; +} + /* * Unmap (remove) blocks from a file. * If nexts is nonzero then the number of extents to remove is limited to @@ -5021,17 +5347,16 @@ done: * *done is set. */ int /* error */ -xfs_bunmapi( +__xfs_bunmapi( xfs_trans_t *tp, /* transaction pointer */ struct xfs_inode *ip, /* incore inode */ xfs_fileoff_t bno, /* starting offset to unmap */ - xfs_filblks_t len, /* length to unmap in file */ + xfs_filblks_t *rlen, /* i/o: amount remaining */ int flags, /* misc flags */ xfs_extnum_t nexts, /* number of extents max */ xfs_fsblock_t *firstblock, /* first allocated block controls a.g. 
for allocs */ - struct xfs_defer_ops *dfops, /* i/o: list extents to free */ - int *done) /* set if not done yet */ + struct xfs_defer_ops *dfops) /* i/o: deferred updates */ { xfs_btree_cur_t *cur; /* bmap btree cursor */ xfs_bmbt_irec_t del; /* extent being deleted */ @@ -5053,11 +5378,12 @@ xfs_bunmapi( int wasdel; /* was a delayed alloc extent */ int whichfork; /* data or attribute fork */ xfs_fsblock_t sum; + xfs_filblks_t len = *rlen; /* length to unmap in file */ trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_); - whichfork = (flags & XFS_BMAPI_ATTRFORK) ? - XFS_ATTR_FORK : XFS_DATA_FORK; + whichfork = xfs_bmapi_whichfork(flags); + ASSERT(whichfork != XFS_COW_FORK); ifp = XFS_IFORK_PTR(ip, whichfork); if (unlikely( XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && @@ -5079,7 +5405,7 @@ xfs_bunmapi( return error; nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); if (nextents == 0) { - *done = 1; + *rlen = 0; return 0; } XFS_STATS_INC(mp, xs_blk_unmap); @@ -5324,7 +5650,7 @@ xfs_bunmapi( cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL; error = xfs_bmap_del_extent(ip, tp, &lastx, dfops, cur, &del, - &tmp_logflags, whichfork); + &tmp_logflags, whichfork, flags); logflags |= tmp_logflags; if (error) goto error0; @@ -5350,7 +5676,10 @@ nodelete: extno++; } } - *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0; + if (bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0) + *rlen = 0; + else + *rlen = bno - start + 1; /* * Convert to a btree if necessary. @@ -5406,6 +5735,27 @@ error0: return error; } +/* Unmap a range of a file. 
*/ +int +xfs_bunmapi( + xfs_trans_t *tp, + struct xfs_inode *ip, + xfs_fileoff_t bno, + xfs_filblks_t len, + int flags, + xfs_extnum_t nexts, + xfs_fsblock_t *firstblock, + struct xfs_defer_ops *dfops, + int *done) +{ + int error; + + error = __xfs_bunmapi(tp, ip, bno, &len, flags, nexts, firstblock, + dfops); + *done = (len == 0); + return error; +} + /* * Determine whether an extent shift can be accomplished by a merge with the * extent that precedes the target hole of the shift. @@ -5985,3 +6335,146 @@ out: xfs_trans_cancel(tp); return error; } + +/* Deferred mapping is only for real extents in the data fork. */ +static bool +xfs_bmap_is_update_needed( + struct xfs_bmbt_irec *bmap) +{ + return bmap->br_startblock != HOLESTARTBLOCK && + bmap->br_startblock != DELAYSTARTBLOCK; +} + +/* Record a bmap intent. */ +static int +__xfs_bmap_add( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + enum xfs_bmap_intent_type type, + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *bmap) +{ + int error; + struct xfs_bmap_intent *bi; + + trace_xfs_bmap_defer(mp, + XFS_FSB_TO_AGNO(mp, bmap->br_startblock), + type, + XFS_FSB_TO_AGBNO(mp, bmap->br_startblock), + ip->i_ino, whichfork, + bmap->br_startoff, + bmap->br_blockcount, + bmap->br_state); + + bi = kmem_alloc(sizeof(struct xfs_bmap_intent), KM_SLEEP | KM_NOFS); + INIT_LIST_HEAD(&bi->bi_list); + bi->bi_type = type; + bi->bi_owner = ip; + bi->bi_whichfork = whichfork; + bi->bi_bmap = *bmap; + + error = xfs_defer_join(dfops, bi->bi_owner); + if (error) { + kmem_free(bi); + return error; + } + + xfs_defer_add(dfops, XFS_DEFER_OPS_TYPE_BMAP, &bi->bi_list); + return 0; +} + +/* Map an extent into a file. 
*/ +int +xfs_bmap_map_extent( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + struct xfs_inode *ip, + struct xfs_bmbt_irec *PREV) +{ + if (!xfs_bmap_is_update_needed(PREV)) + return 0; + + return __xfs_bmap_add(mp, dfops, XFS_BMAP_MAP, ip, + XFS_DATA_FORK, PREV); +} + +/* Unmap an extent out of a file. */ +int +xfs_bmap_unmap_extent( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + struct xfs_inode *ip, + struct xfs_bmbt_irec *PREV) +{ + if (!xfs_bmap_is_update_needed(PREV)) + return 0; + + return __xfs_bmap_add(mp, dfops, XFS_BMAP_UNMAP, ip, + XFS_DATA_FORK, PREV); +} + +/* + * Process one of the deferred bmap operations. We pass back the + * btree cursor to maintain our lock on the bmapbt between calls. + */ +int +xfs_bmap_finish_one( + struct xfs_trans *tp, + struct xfs_defer_ops *dfops, + struct xfs_inode *ip, + enum xfs_bmap_intent_type type, + int whichfork, + xfs_fileoff_t startoff, + xfs_fsblock_t startblock, + xfs_filblks_t blockcount, + xfs_exntst_t state) +{ + struct xfs_bmbt_irec bmap; + int nimaps = 1; + xfs_fsblock_t firstfsb; + int flags = XFS_BMAPI_REMAP; + int done; + int error = 0; + + bmap.br_startblock = startblock; + bmap.br_startoff = startoff; + bmap.br_blockcount = blockcount; + bmap.br_state = state; + + trace_xfs_bmap_deferred(tp->t_mountp, + XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type, + XFS_FSB_TO_AGBNO(tp->t_mountp, startblock), + ip->i_ino, whichfork, startoff, blockcount, state); + + if (whichfork != XFS_DATA_FORK && whichfork != XFS_ATTR_FORK) + return -EFSCORRUPTED; + if (whichfork == XFS_ATTR_FORK) + flags |= XFS_BMAPI_ATTRFORK; + + if (XFS_TEST_ERROR(false, tp->t_mountp, + XFS_ERRTAG_BMAP_FINISH_ONE, + XFS_RANDOM_BMAP_FINISH_ONE)) + return -EIO; + + switch (type) { + case XFS_BMAP_MAP: + firstfsb = bmap.br_startblock; + error = xfs_bmapi_write(tp, ip, bmap.br_startoff, + bmap.br_blockcount, flags, &firstfsb, + bmap.br_blockcount, &bmap, &nimaps, + dfops); + break; + case XFS_BMAP_UNMAP: + error = 
xfs_bunmapi(tp, ip, bmap.br_startoff, + bmap.br_blockcount, flags, 1, &firstfsb, + dfops, &done); + ASSERT(done); + break; + default: + ASSERT(0); + error = -EFSCORRUPTED; + } + + return error; +} diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 8395f6e8cf7d..f97db7132564 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -97,6 +97,19 @@ struct xfs_extent_free_item */ #define XFS_BMAPI_ZERO 0x080 +/* + * Map the inode offset to the block given in ap->firstblock. Primarily + * used for reflink. The range must be in a hole, and this flag cannot be + * turned on with PREALLOC or CONVERT, and cannot be used on the attr fork. + * + * For bunmapi, this flag unmaps the range without adjusting quota, reducing + * refcount, or freeing the blocks. + */ +#define XFS_BMAPI_REMAP 0x100 + +/* Map something in the CoW fork. */ +#define XFS_BMAPI_COWFORK 0x200 + #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ { XFS_BMAPI_METADATA, "METADATA" }, \ @@ -105,12 +118,24 @@ struct xfs_extent_free_item { XFS_BMAPI_IGSTATE, "IGSTATE" }, \ { XFS_BMAPI_CONTIG, "CONTIG" }, \ { XFS_BMAPI_CONVERT, "CONVERT" }, \ - { XFS_BMAPI_ZERO, "ZERO" } + { XFS_BMAPI_ZERO, "ZERO" }, \ + { XFS_BMAPI_REMAP, "REMAP" }, \ + { XFS_BMAPI_COWFORK, "COWFORK" } static inline int xfs_bmapi_aflag(int w) { - return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : 0); + return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : + (w == XFS_COW_FORK ? 
XFS_BMAPI_COWFORK : 0)); +} + +static inline int xfs_bmapi_whichfork(int bmapi_flags) +{ + if (bmapi_flags & XFS_BMAPI_COWFORK) + return XFS_COW_FORK; + else if (bmapi_flags & XFS_BMAPI_ATTRFORK) + return XFS_ATTR_FORK; + return XFS_DATA_FORK; } /* @@ -131,13 +156,15 @@ static inline int xfs_bmapi_aflag(int w) #define BMAP_LEFT_VALID (1 << 6) #define BMAP_RIGHT_VALID (1 << 7) #define BMAP_ATTRFORK (1 << 8) +#define BMAP_COWFORK (1 << 9) #define XFS_BMAP_EXT_FLAGS \ { BMAP_LEFT_CONTIG, "LC" }, \ { BMAP_RIGHT_CONTIG, "RC" }, \ { BMAP_LEFT_FILLING, "LF" }, \ { BMAP_RIGHT_FILLING, "RF" }, \ - { BMAP_ATTRFORK, "ATTR" } + { BMAP_ATTRFORK, "ATTR" }, \ + { BMAP_COWFORK, "COW" } /* @@ -186,10 +213,15 @@ int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fsblock_t *firstblock, xfs_extlen_t total, struct xfs_bmbt_irec *mval, int *nmap, struct xfs_defer_ops *dfops); +int __xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_fileoff_t bno, xfs_filblks_t *rlen, int flags, + xfs_extnum_t nexts, xfs_fsblock_t *firstblock, + struct xfs_defer_ops *dfops); int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, int flags, xfs_extnum_t nexts, xfs_fsblock_t *firstblock, struct xfs_defer_ops *dfops, int *done); +int xfs_bunmapi_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *del); int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx, xfs_extnum_t num); uint xfs_default_attroffset(struct xfs_inode *ip); @@ -203,8 +235,31 @@ struct xfs_bmbt_rec_host * xfs_bmap_search_extents(struct xfs_inode *ip, xfs_fileoff_t bno, int fork, int *eofp, xfs_extnum_t *lastxp, struct xfs_bmbt_irec *gotp, struct xfs_bmbt_irec *prevp); -int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, xfs_fileoff_t aoff, - xfs_filblks_t len, struct xfs_bmbt_irec *got, - struct xfs_bmbt_irec *prev, xfs_extnum_t *lastx, int eof); +int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork, + xfs_fileoff_t aoff, xfs_filblks_t len, + 
struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *prev, + xfs_extnum_t *lastx, int eof); + +enum xfs_bmap_intent_type { + XFS_BMAP_MAP = 1, + XFS_BMAP_UNMAP, +}; + +struct xfs_bmap_intent { + struct list_head bi_list; + enum xfs_bmap_intent_type bi_type; + struct xfs_inode *bi_owner; + int bi_whichfork; + struct xfs_bmbt_irec bi_bmap; +}; + +int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_defer_ops *dfops, + struct xfs_inode *ip, enum xfs_bmap_intent_type type, + int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, + xfs_filblks_t blockcount, xfs_exntst_t state); +int xfs_bmap_map_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops, + struct xfs_inode *ip, struct xfs_bmbt_irec *imap); +int xfs_bmap_unmap_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops, + struct xfs_inode *ip, struct xfs_bmbt_irec *imap); #endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index cd85274e810c..8007d2ba9aef 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -453,6 +453,7 @@ xfs_bmbt_alloc_block( if (args.fsbno == NULLFSBLOCK) { args.fsbno = be64_to_cpu(start->l); +try_another_ag: args.type = XFS_ALLOCTYPE_START_BNO; /* * Make sure there is sufficient room left in the AG to @@ -482,6 +483,22 @@ xfs_bmbt_alloc_block( if (error) goto error0; + /* + * During a CoW operation, the allocation and bmbt updates occur in + * different transactions. The mapping code tries to put new bmbt + * blocks near extents being mapped, but the only way to guarantee this + * is if the alloc and the mapping happen in a single transaction that + * has a block reservation. That isn't the case here, so if we run out + * of space we'll try again with another AG. 
+ */ + if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) && + args.fsbno == NULLFSBLOCK && + args.type == XFS_ALLOCTYPE_NEAR_BNO) { + cur->bc_private.b.dfops->dop_low = true; + args.fsbno = cur->bc_private.b.firstblock; + goto try_another_ag; + } + if (args.fsbno == NULLFSBLOCK && args.minleft) { /* * Could not find an AG with enough free space to satisfy @@ -777,6 +794,7 @@ xfs_bmbt_init_cursor( { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); struct xfs_btree_cur *cur; + ASSERT(whichfork != XFS_COW_FORK); cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP); diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index aa1752f918b8..5c8e6f2ce44f 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -45,9 +45,10 @@ kmem_zone_t *xfs_btree_cur_zone; */ static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = { { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, 0, XFS_BMAP_MAGIC, XFS_IBT_MAGIC, - XFS_FIBT_MAGIC }, + XFS_FIBT_MAGIC, 0 }, { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, XFS_RMAP_CRC_MAGIC, - XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC } + XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC, + XFS_REFC_CRC_MAGIC } }; #define xfs_btree_magic(cur) \ xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum] @@ -1216,6 +1217,9 @@ xfs_btree_set_refs( case XFS_BTNUM_RMAP: xfs_buf_set_ref(bp, XFS_RMAP_BTREE_REF); break; + case XFS_BTNUM_REFC: + xfs_buf_set_ref(bp, XFS_REFC_BTREE_REF); + break; default: ASSERT(0); } diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 3f8556a5c2ad..c2b01d1c79ee 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -49,6 +49,7 @@ union xfs_btree_key { struct xfs_inobt_key inobt; struct xfs_rmap_key rmap; struct xfs_rmap_key __rmap_bigkey[2]; + struct xfs_refcount_key refc; }; union xfs_btree_rec { @@ -57,6 +58,7 @@ union xfs_btree_rec { struct xfs_alloc_rec alloc; struct xfs_inobt_rec inobt; struct xfs_rmap_rec rmap; + struct xfs_refcount_rec 
refc; }; /* @@ -72,6 +74,7 @@ union xfs_btree_rec { #define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi) #define XFS_BTNUM_FINO ((xfs_btnum_t)XFS_BTNUM_FINOi) #define XFS_BTNUM_RMAP ((xfs_btnum_t)XFS_BTNUM_RMAPi) +#define XFS_BTNUM_REFC ((xfs_btnum_t)XFS_BTNUM_REFCi) /* * For logging record fields. @@ -105,6 +108,7 @@ do { \ case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(__mp, ibt, stat); break; \ case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(__mp, fibt, stat); break; \ case XFS_BTNUM_RMAP: __XFS_BTREE_STATS_INC(__mp, rmap, stat); break; \ + case XFS_BTNUM_REFC: __XFS_BTREE_STATS_INC(__mp, refcbt, stat); break; \ case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ } \ } while (0) @@ -127,6 +131,8 @@ do { \ __XFS_BTREE_STATS_ADD(__mp, fibt, stat, val); break; \ case XFS_BTNUM_RMAP: \ __XFS_BTREE_STATS_ADD(__mp, rmap, stat, val); break; \ + case XFS_BTNUM_REFC: \ + __XFS_BTREE_STATS_ADD(__mp, refcbt, stat, val); break; \ case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ } \ } while (0) @@ -217,6 +223,15 @@ union xfs_btree_irec { struct xfs_bmbt_irec b; struct xfs_inobt_rec_incore i; struct xfs_rmap_irec r; + struct xfs_refcount_irec rc; +}; + +/* Per-AG btree private information. */ +union xfs_btree_cur_private { + struct { + unsigned long nr_ops; /* # record updates */ + int shape_changes; /* # of extent splits */ + } refc; }; /* @@ -243,6 +258,7 @@ typedef struct xfs_btree_cur struct xfs_buf *agbp; /* agf/agi buffer pointer */ struct xfs_defer_ops *dfops; /* deferred updates */ xfs_agnumber_t agno; /* ag number */ + union xfs_btree_cur_private priv; } a; struct { /* needed for BMAP */ struct xfs_inode *ip; /* pointer to our inode */ diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index e96533d178cf..f6e93ef0bffe 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -51,6 +51,8 @@ struct xfs_defer_pending { * find all the space it needs. 
*/ enum xfs_defer_ops_type { + XFS_DEFER_OPS_TYPE_BMAP, + XFS_DEFER_OPS_TYPE_REFCOUNT, XFS_DEFER_OPS_TYPE_RMAP, XFS_DEFER_OPS_TYPE_FREE, XFS_DEFER_OPS_TYPE_MAX, diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 270fb5cf4fa1..f6547fc5e016 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -456,9 +456,11 @@ xfs_sb_has_compat_feature( #define XFS_SB_FEAT_RO_COMPAT_FINOBT (1 << 0) /* free inode btree */ #define XFS_SB_FEAT_RO_COMPAT_RMAPBT (1 << 1) /* reverse map btree */ +#define XFS_SB_FEAT_RO_COMPAT_REFLINK (1 << 2) /* reflinked files */ #define XFS_SB_FEAT_RO_COMPAT_ALL \ (XFS_SB_FEAT_RO_COMPAT_FINOBT | \ - XFS_SB_FEAT_RO_COMPAT_RMAPBT) + XFS_SB_FEAT_RO_COMPAT_RMAPBT | \ + XFS_SB_FEAT_RO_COMPAT_REFLINK) #define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL static inline bool xfs_sb_has_ro_compat_feature( @@ -546,6 +548,12 @@ static inline bool xfs_sb_version_hasrmapbt(struct xfs_sb *sbp) (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_RMAPBT); } +static inline bool xfs_sb_version_hasreflink(struct xfs_sb *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && + (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_REFLINK); +} + /* * end of superblock version macros */ @@ -641,14 +649,17 @@ typedef struct xfs_agf { uuid_t agf_uuid; /* uuid of filesystem */ __be32 agf_rmap_blocks; /* rmapbt blocks used */ - __be32 agf_padding; /* padding */ + __be32 agf_refcount_blocks; /* refcountbt blocks used */ + + __be32 agf_refcount_root; /* refcount tree root block */ + __be32 agf_refcount_level; /* refcount btree levels */ /* * reserve some contiguous space for future logged fields before we add * the unlogged fields. This makes the range logging via flags and * structure offsets much simpler. */ - __be64 agf_spare64[15]; + __be64 agf_spare64[14]; /* unlogged fields, written during buffer writeback. 
*/ __be64 agf_lsn; /* last write sequence */ @@ -674,8 +685,11 @@ typedef struct xfs_agf { #define XFS_AGF_BTREEBLKS 0x00000800 #define XFS_AGF_UUID 0x00001000 #define XFS_AGF_RMAP_BLOCKS 0x00002000 -#define XFS_AGF_SPARE64 0x00004000 -#define XFS_AGF_NUM_BITS 15 +#define XFS_AGF_REFCOUNT_BLOCKS 0x00004000 +#define XFS_AGF_REFCOUNT_ROOT 0x00008000 +#define XFS_AGF_REFCOUNT_LEVEL 0x00010000 +#define XFS_AGF_SPARE64 0x00020000 +#define XFS_AGF_NUM_BITS 18 #define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1) #define XFS_AGF_FLAGS \ @@ -693,6 +707,9 @@ typedef struct xfs_agf { { XFS_AGF_BTREEBLKS, "BTREEBLKS" }, \ { XFS_AGF_UUID, "UUID" }, \ { XFS_AGF_RMAP_BLOCKS, "RMAP_BLOCKS" }, \ + { XFS_AGF_REFCOUNT_BLOCKS, "REFCOUNT_BLOCKS" }, \ + { XFS_AGF_REFCOUNT_ROOT, "REFCOUNT_ROOT" }, \ + { XFS_AGF_REFCOUNT_LEVEL, "REFCOUNT_LEVEL" }, \ { XFS_AGF_SPARE64, "SPARE64" } /* disk block (xfs_daddr_t) in the AG */ @@ -885,7 +902,8 @@ typedef struct xfs_dinode { __be64 di_changecount; /* number of attribute changes */ __be64 di_lsn; /* flush sequence */ __be64 di_flags2; /* more random flags */ - __u8 di_pad2[16]; /* more padding for future expansion */ + __be32 di_cowextsize; /* basic cow extent size for file */ + __u8 di_pad2[12]; /* more padding for future expansion */ /* fields only written to during inode creation */ xfs_timestamp_t di_crtime; /* time created */ @@ -1041,9 +1059,14 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) * 16 bits of the XFS_XFLAG_s range. 
*/ #define XFS_DIFLAG2_DAX_BIT 0 /* use DAX for this inode */ +#define XFS_DIFLAG2_REFLINK_BIT 1 /* file's blocks may be shared */ +#define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */ #define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT) +#define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT) +#define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT) -#define XFS_DIFLAG2_ANY (XFS_DIFLAG2_DAX) +#define XFS_DIFLAG2_ANY \ + (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE) /* * Inode number format: @@ -1353,7 +1376,9 @@ struct xfs_owner_info { #define XFS_RMAP_OWN_AG (-5ULL) /* AG freespace btree blocks */ #define XFS_RMAP_OWN_INOBT (-6ULL) /* Inode btree blocks */ #define XFS_RMAP_OWN_INODES (-7ULL) /* Inode chunk */ -#define XFS_RMAP_OWN_MIN (-8ULL) /* guard */ +#define XFS_RMAP_OWN_REFC (-8ULL) /* refcount tree */ +#define XFS_RMAP_OWN_COW (-9ULL) /* cow allocations */ +#define XFS_RMAP_OWN_MIN (-10ULL) /* guard */ #define XFS_RMAP_NON_INODE_OWNER(owner) (!!((owner) & (1ULL << 63))) @@ -1434,6 +1459,62 @@ typedef __be32 xfs_rmap_ptr_t; XFS_IBT_BLOCK(mp) + 1) /* + * Reference Count Btree format definitions + * + */ +#define XFS_REFC_CRC_MAGIC 0x52334643 /* 'R3FC' */ + +unsigned int xfs_refc_block(struct xfs_mount *mp); + +/* + * Data record/key structure + * + * Each record associates a range of physical blocks (starting at + * rc_startblock and ending rc_blockcount blocks later) with a reference + * count (rc_refcount). Extents that are being used to stage a copy on + * write (CoW) operation are recorded in the refcount btree with a + * refcount of 1. All other records must have a refcount > 1 and must + * track an extent mapped only by file data forks. + * + * Extents with a single owner (attributes, metadata, non-shared file + * data) are not tracked here. Free space is also not tracked here. + * This is consistent with pre-reflink XFS. 
+ */ + +/* + * Extents that are being used to stage a copy on write are stored + * in the refcount btree with a refcount of 1 and the upper bit set + * on the startblock. This speeds up mount time deletion of stale + * staging extents because they're all at the right side of the tree. + */ +#define XFS_REFC_COW_START ((xfs_agblock_t)(1U << 31)) +#define REFCNTBT_COWFLAG_BITLEN 1 +#define REFCNTBT_AGBLOCK_BITLEN 31 + +struct xfs_refcount_rec { + __be32 rc_startblock; /* starting block number */ + __be32 rc_blockcount; /* count of blocks */ + __be32 rc_refcount; /* number of inodes linked here */ +}; + +struct xfs_refcount_key { + __be32 rc_startblock; /* starting block number */ +}; + +struct xfs_refcount_irec { + xfs_agblock_t rc_startblock; /* starting block number */ + xfs_extlen_t rc_blockcount; /* count of free blocks */ + xfs_nlink_t rc_refcount; /* number of inodes linked here */ +}; + +#define MAXREFCOUNT ((xfs_nlink_t)~0U) +#define MAXREFCEXTLEN ((xfs_extlen_t)~0U) + +/* btree pointer type */ +typedef __be32 xfs_refcount_ptr_t; + + +/* * BMAP Btree format definitions * * This includes both the root block definition that sits inside an inode fork diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 79455058b752..b72dc821d78b 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -81,14 +81,16 @@ struct getbmapx { #define BMV_IF_PREALLOC 0x4 /* rtn status BMV_OF_PREALLOC if req */ #define BMV_IF_DELALLOC 0x8 /* rtn status BMV_OF_DELALLOC if req */ #define BMV_IF_NO_HOLES 0x10 /* Do not return holes */ +#define BMV_IF_COWFORK 0x20 /* return CoW fork rather than data */ #define BMV_IF_VALID \ (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC| \ - BMV_IF_DELALLOC|BMV_IF_NO_HOLES) + BMV_IF_DELALLOC|BMV_IF_NO_HOLES|BMV_IF_COWFORK) /* bmv_oflags values - returned for each non-header segment */ #define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */ #define BMV_OF_DELALLOC 0x2 /* segment = delayed allocation */ #define 
BMV_OF_LAST 0x4 /* segment is the last in the file */ +#define BMV_OF_SHARED 0x8 /* segment shared with another file */ /* * Structure for XFS_IOC_FSSETDM. @@ -206,7 +208,8 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_FTYPE 0x10000 /* inode directory types */ #define XFS_FSOP_GEOM_FLAGS_FINOBT 0x20000 /* free inode btree */ #define XFS_FSOP_GEOM_FLAGS_SPINODES 0x40000 /* sparse inode chunks */ -#define XFS_FSOP_GEOM_FLAGS_RMAPBT 0x80000 /* Reverse mapping btree */ +#define XFS_FSOP_GEOM_FLAGS_RMAPBT 0x80000 /* reverse mapping btree */ +#define XFS_FSOP_GEOM_FLAGS_REFLINK 0x100000 /* files can share blocks */ /* * Minimum and maximum sizes need for growth checks. @@ -275,7 +278,8 @@ typedef struct xfs_bstat { #define bs_projid bs_projid_lo /* (previously just bs_projid) */ __u16 bs_forkoff; /* inode fork offset in bytes */ __u16 bs_projid_hi; /* higher part of project id */ - unsigned char bs_pad[10]; /* pad space, unused */ + unsigned char bs_pad[6]; /* pad space, unused */ + __u32 bs_cowextsize; /* cow extent size */ __u32 bs_dmevmask; /* DMIG event mask */ __u16 bs_dmstate; /* DMIG state info */ __u16 bs_aextents; /* attribute number of extents */ diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 4b9769e23c83..8de9a3a29589 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -256,6 +256,7 @@ xfs_inode_from_disk( to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec); to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec); to->di_flags2 = be64_to_cpu(from->di_flags2); + to->di_cowextsize = be32_to_cpu(from->di_cowextsize); } } @@ -305,7 +306,7 @@ xfs_inode_to_disk( to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec); to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec); to->di_flags2 = cpu_to_be64(from->di_flags2); - + to->di_cowextsize = cpu_to_be32(from->di_cowextsize); to->di_ino = cpu_to_be64(ip->i_ino); to->di_lsn = cpu_to_be64(lsn); memset(to->di_pad2, 0, 
sizeof(to->di_pad2)); @@ -357,6 +358,7 @@ xfs_log_dinode_to_disk( to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec); to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec); to->di_flags2 = cpu_to_be64(from->di_flags2); + to->di_cowextsize = cpu_to_be32(from->di_cowextsize); to->di_ino = cpu_to_be64(from->di_ino); to->di_lsn = cpu_to_be64(from->di_lsn); memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); @@ -373,6 +375,9 @@ xfs_dinode_verify( struct xfs_inode *ip, struct xfs_dinode *dip) { + uint16_t flags; + uint64_t flags2; + if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) return false; @@ -389,6 +394,23 @@ xfs_dinode_verify( return false; if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_meta_uuid)) return false; + + flags = be16_to_cpu(dip->di_flags); + flags2 = be64_to_cpu(dip->di_flags2); + + /* don't allow reflink/cowextsize if we don't have reflink */ + if ((flags2 & (XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE)) && + !xfs_sb_version_hasreflink(&mp->m_sb)) + return false; + + /* don't let reflink and realtime mix */ + if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags & XFS_DIFLAG_REALTIME)) + return false; + + /* don't let reflink and dax mix */ + if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags2 & XFS_DIFLAG2_DAX)) + return false; + return true; } diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index 7c4dd321b215..62d9d4681c8c 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -47,6 +47,7 @@ struct xfs_icdinode { __uint16_t di_flags; /* random flags, XFS_DIFLAG_... 
*/ __uint64_t di_flags2; /* more random flags */ + __uint32_t di_cowextsize; /* basic cow extent size for file */ xfs_ictimestamp_t di_crtime; /* time created */ }; diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index bbcc8c7a44b3..5dd56d3dbb3a 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -121,6 +121,26 @@ xfs_iformat_fork( return -EFSCORRUPTED; } + if (unlikely(xfs_is_reflink_inode(ip) && + (VFS_I(ip)->i_mode & S_IFMT) != S_IFREG)) { + xfs_warn(ip->i_mount, + "corrupt dinode %llu, wrong file type for reflink.", + ip->i_ino); + XFS_CORRUPTION_ERROR("xfs_iformat(reflink)", + XFS_ERRLEVEL_LOW, ip->i_mount, dip); + return -EFSCORRUPTED; + } + + if (unlikely(xfs_is_reflink_inode(ip) && + (ip->i_d.di_flags & XFS_DIFLAG_REALTIME))) { + xfs_warn(ip->i_mount, + "corrupt dinode %llu, has reflink+realtime flag set.", + ip->i_ino); + XFS_CORRUPTION_ERROR("xfs_iformat(reflink)", + XFS_ERRLEVEL_LOW, ip->i_mount, dip); + return -EFSCORRUPTED; + } + switch (VFS_I(ip)->i_mode & S_IFMT) { case S_IFIFO: case S_IFCHR: @@ -186,9 +206,14 @@ xfs_iformat_fork( XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount); return -EFSCORRUPTED; } - if (error) { + if (error) return error; + + if (xfs_is_reflink_inode(ip)) { + ASSERT(ip->i_cowfp == NULL); + xfs_ifork_init_cow(ip); } + if (!XFS_DFORK_Q(dip)) return 0; @@ -208,7 +233,8 @@ xfs_iformat_fork( XFS_CORRUPTION_ERROR("xfs_iformat(8)", XFS_ERRLEVEL_LOW, ip->i_mount, dip); - return -EFSCORRUPTED; + error = -EFSCORRUPTED; + break; } error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size); @@ -226,6 +252,9 @@ xfs_iformat_fork( if (error) { kmem_zone_free(xfs_ifork_zone, ip->i_afp); ip->i_afp = NULL; + if (ip->i_cowfp) + kmem_zone_free(xfs_ifork_zone, ip->i_cowfp); + ip->i_cowfp = NULL; xfs_idestroy_fork(ip, XFS_DATA_FORK); } return error; @@ -740,6 +769,9 @@ xfs_idestroy_fork( if (whichfork == XFS_ATTR_FORK) { kmem_zone_free(xfs_ifork_zone, ip->i_afp); ip->i_afp = 
NULL; + } else if (whichfork == XFS_COW_FORK) { + kmem_zone_free(xfs_ifork_zone, ip->i_cowfp); + ip->i_cowfp = NULL; } } @@ -927,6 +959,19 @@ xfs_iext_get_ext( } } +/* Convert bmap state flags to an inode fork. */ +struct xfs_ifork * +xfs_iext_state_to_fork( + struct xfs_inode *ip, + int state) +{ + if (state & BMAP_COWFORK) + return ip->i_cowfp; + else if (state & BMAP_ATTRFORK) + return ip->i_afp; + return &ip->i_df; +} + /* * Insert new item(s) into the extent records for incore inode * fork 'ifp'. 'count' new items are inserted at index 'idx'. @@ -939,7 +984,7 @@ xfs_iext_insert( xfs_bmbt_irec_t *new, /* items to insert */ int state) /* type of extent conversion */ { - xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df; + xfs_ifork_t *ifp = xfs_iext_state_to_fork(ip, state); xfs_extnum_t i; /* extent record index */ trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_); @@ -1189,7 +1234,7 @@ xfs_iext_remove( int ext_diff, /* number of extents to remove */ int state) /* type of extent conversion */ { - xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df; + xfs_ifork_t *ifp = xfs_iext_state_to_fork(ip, state); xfs_extnum_t nextents; /* number of extents in file */ int new_size; /* size of extents after removal */ @@ -1934,3 +1979,20 @@ xfs_iext_irec_update_extoffs( ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff; } } + +/* + * Initialize an inode's copy-on-write fork. + */ +void +xfs_ifork_init_cow( + struct xfs_inode *ip) +{ + if (ip->i_cowfp) + return; + + ip->i_cowfp = kmem_zone_zalloc(xfs_ifork_zone, + KM_SLEEP | KM_NOFS); + ip->i_cowfp->if_flags = XFS_IFEXTENTS; + ip->i_cformat = XFS_DINODE_FMT_EXTENTS; + ip->i_cnextents = 0; +} diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index f95e072ae646..c9476f50e32d 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -92,7 +92,9 @@ typedef struct xfs_ifork { #define XFS_IFORK_PTR(ip,w) \ ((w) == XFS_DATA_FORK ? 
\ &(ip)->i_df : \ - (ip)->i_afp) + ((w) == XFS_ATTR_FORK ? \ + (ip)->i_afp : \ + (ip)->i_cowfp)) #define XFS_IFORK_DSIZE(ip) \ (XFS_IFORK_Q(ip) ? \ XFS_IFORK_BOFF(ip) : \ @@ -105,26 +107,38 @@ typedef struct xfs_ifork { #define XFS_IFORK_SIZE(ip,w) \ ((w) == XFS_DATA_FORK ? \ XFS_IFORK_DSIZE(ip) : \ - XFS_IFORK_ASIZE(ip)) + ((w) == XFS_ATTR_FORK ? \ + XFS_IFORK_ASIZE(ip) : \ + 0)) #define XFS_IFORK_FORMAT(ip,w) \ ((w) == XFS_DATA_FORK ? \ (ip)->i_d.di_format : \ - (ip)->i_d.di_aformat) + ((w) == XFS_ATTR_FORK ? \ + (ip)->i_d.di_aformat : \ + (ip)->i_cformat)) #define XFS_IFORK_FMT_SET(ip,w,n) \ ((w) == XFS_DATA_FORK ? \ ((ip)->i_d.di_format = (n)) : \ - ((ip)->i_d.di_aformat = (n))) + ((w) == XFS_ATTR_FORK ? \ + ((ip)->i_d.di_aformat = (n)) : \ + ((ip)->i_cformat = (n)))) #define XFS_IFORK_NEXTENTS(ip,w) \ ((w) == XFS_DATA_FORK ? \ (ip)->i_d.di_nextents : \ - (ip)->i_d.di_anextents) + ((w) == XFS_ATTR_FORK ? \ + (ip)->i_d.di_anextents : \ + (ip)->i_cnextents)) #define XFS_IFORK_NEXT_SET(ip,w,n) \ ((w) == XFS_DATA_FORK ? \ ((ip)->i_d.di_nextents = (n)) : \ - ((ip)->i_d.di_anextents = (n))) + ((w) == XFS_ATTR_FORK ? 
\ + ((ip)->i_d.di_anextents = (n)) : \ + ((ip)->i_cnextents = (n)))) #define XFS_IFORK_MAXEXT(ip, w) \ (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t)) +struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state); + int xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *); void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *, struct xfs_inode_log_item *, int); @@ -169,4 +183,6 @@ void xfs_iext_irec_update_extoffs(struct xfs_ifork *, int, int); extern struct kmem_zone *xfs_ifork_zone; +extern void xfs_ifork_init_cow(struct xfs_inode *ip); + #endif /* __XFS_INODE_FORK_H__ */ diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index fc5eef85d61e..083cdd6d6c28 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -112,7 +112,11 @@ static inline uint xlog_get_cycle(char *ptr) #define XLOG_REG_TYPE_ICREATE 20 #define XLOG_REG_TYPE_RUI_FORMAT 21 #define XLOG_REG_TYPE_RUD_FORMAT 22 -#define XLOG_REG_TYPE_MAX 22 +#define XLOG_REG_TYPE_CUI_FORMAT 23 +#define XLOG_REG_TYPE_CUD_FORMAT 24 +#define XLOG_REG_TYPE_BUI_FORMAT 25 +#define XLOG_REG_TYPE_BUD_FORMAT 26 +#define XLOG_REG_TYPE_MAX 26 /* * Flags to log operation header @@ -231,6 +235,10 @@ typedef struct xfs_trans_header { #define XFS_LI_ICREATE 0x123f #define XFS_LI_RUI 0x1240 /* rmap update intent */ #define XFS_LI_RUD 0x1241 +#define XFS_LI_CUI 0x1242 /* refcount update intent */ +#define XFS_LI_CUD 0x1243 +#define XFS_LI_BUI 0x1244 /* bmbt update intent */ +#define XFS_LI_BUD 0x1245 #define XFS_LI_TYPE_DESC \ { XFS_LI_EFI, "XFS_LI_EFI" }, \ @@ -242,7 +250,11 @@ typedef struct xfs_trans_header { { XFS_LI_QUOTAOFF, "XFS_LI_QUOTAOFF" }, \ { XFS_LI_ICREATE, "XFS_LI_ICREATE" }, \ { XFS_LI_RUI, "XFS_LI_RUI" }, \ - { XFS_LI_RUD, "XFS_LI_RUD" } + { XFS_LI_RUD, "XFS_LI_RUD" }, \ + { XFS_LI_CUI, "XFS_LI_CUI" }, \ + { XFS_LI_CUD, "XFS_LI_CUD" }, \ + { XFS_LI_BUI, "XFS_LI_BUI" }, \ + { XFS_LI_BUD, "XFS_LI_BUD" } /* * Inode Log Item Format 
definitions. @@ -411,7 +423,8 @@ struct xfs_log_dinode { __uint64_t di_changecount; /* number of attribute changes */ xfs_lsn_t di_lsn; /* flush sequence */ __uint64_t di_flags2; /* more random flags */ - __uint8_t di_pad2[16]; /* more padding for future expansion */ + __uint32_t di_cowextsize; /* basic cow extent size for file */ + __uint8_t di_pad2[12]; /* more padding for future expansion */ /* fields only written to during inode creation */ xfs_ictimestamp_t di_crtime; /* time created */ @@ -622,8 +635,11 @@ struct xfs_map_extent { /* rmap me_flags: upper bits are flags, lower byte is type code */ #define XFS_RMAP_EXTENT_MAP 1 +#define XFS_RMAP_EXTENT_MAP_SHARED 2 #define XFS_RMAP_EXTENT_UNMAP 3 +#define XFS_RMAP_EXTENT_UNMAP_SHARED 4 #define XFS_RMAP_EXTENT_CONVERT 5 +#define XFS_RMAP_EXTENT_CONVERT_SHARED 6 #define XFS_RMAP_EXTENT_ALLOC 7 #define XFS_RMAP_EXTENT_FREE 8 #define XFS_RMAP_EXTENT_TYPE_MASK 0xFF @@ -671,6 +687,102 @@ struct xfs_rud_log_format { }; /* + * CUI/CUD (refcount update) log format definitions + */ +struct xfs_phys_extent { + __uint64_t pe_startblock; + __uint32_t pe_len; + __uint32_t pe_flags; +}; + +/* refcount pe_flags: upper bits are flags, lower byte is type code */ +/* Type codes are taken directly from enum xfs_refcount_intent_type. */ +#define XFS_REFCOUNT_EXTENT_TYPE_MASK 0xFF + +#define XFS_REFCOUNT_EXTENT_FLAGS (XFS_REFCOUNT_EXTENT_TYPE_MASK) + +/* + * This is the structure used to lay out a cui log item in the + * log. The cui_extents field is a variable size array whose + * size is given by cui_nextents. 
+ */ +struct xfs_cui_log_format { + __uint16_t cui_type; /* cui log item type */ + __uint16_t cui_size; /* size of this item */ + __uint32_t cui_nextents; /* # extents to free */ + __uint64_t cui_id; /* cui identifier */ + struct xfs_phys_extent cui_extents[]; /* array of extents */ +}; + +static inline size_t +xfs_cui_log_format_sizeof( + unsigned int nr) +{ + return sizeof(struct xfs_cui_log_format) + + nr * sizeof(struct xfs_phys_extent); +} + +/* + * This is the structure used to lay out a cud log item in the + * log. The cud_extents array is a variable size array whose + * size is given by cud_nextents; + */ +struct xfs_cud_log_format { + __uint16_t cud_type; /* cud log item type */ + __uint16_t cud_size; /* size of this item */ + __uint32_t __pad; + __uint64_t cud_cui_id; /* id of corresponding cui */ +}; + +/* + * BUI/BUD (inode block mapping) log format definitions + */ + +/* bmbt me_flags: upper bits are flags, lower byte is type code */ +/* Type codes are taken directly from enum xfs_bmap_intent_type. */ +#define XFS_BMAP_EXTENT_TYPE_MASK 0xFF + +#define XFS_BMAP_EXTENT_ATTR_FORK (1U << 31) +#define XFS_BMAP_EXTENT_UNWRITTEN (1U << 30) + +#define XFS_BMAP_EXTENT_FLAGS (XFS_BMAP_EXTENT_TYPE_MASK | \ + XFS_BMAP_EXTENT_ATTR_FORK | \ + XFS_BMAP_EXTENT_UNWRITTEN) + +/* + * This is the structure used to lay out an bui log item in the + * log. The bui_extents field is a variable size array whose + * size is given by bui_nextents. + */ +struct xfs_bui_log_format { + __uint16_t bui_type; /* bui log item type */ + __uint16_t bui_size; /* size of this item */ + __uint32_t bui_nextents; /* # extents to free */ + __uint64_t bui_id; /* bui identifier */ + struct xfs_map_extent bui_extents[]; /* array of extents to bmap */ +}; + +static inline size_t +xfs_bui_log_format_sizeof( + unsigned int nr) +{ + return sizeof(struct xfs_bui_log_format) + + nr * sizeof(struct xfs_map_extent); +} + +/* + * This is the structure used to lay out an bud log item in the + * log. 
The bud log item has a fixed size and carries no extent array; it only + * records the id of the corresponding bui. + */ +struct xfs_bud_log_format { + __uint16_t bud_type; /* bud log item type */ + __uint16_t bud_size; /* size of this item */ + __uint32_t __pad; + __uint64_t bud_bui_id; /* id of corresponding bui */ +}; + +/* * Dquot Log format definitions. * * The first two fields must be the type and size fitting into diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c new file mode 100644 index 000000000000..b177ef33cd4c --- /dev/null +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -0,0 +1,1698 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bmap.h" +#include "xfs_refcount_btree.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_trans.h" +#include "xfs_bit.h" +#include "xfs_refcount.h" +#include "xfs_rmap.h" + +/* Allowable refcount adjustment amounts. 
*/ +enum xfs_refc_adjust_op { + XFS_REFCOUNT_ADJUST_INCREASE = 1, + XFS_REFCOUNT_ADJUST_DECREASE = -1, + XFS_REFCOUNT_ADJUST_COW_ALLOC = 0, + XFS_REFCOUNT_ADJUST_COW_FREE = -1, +}; + +STATIC int __xfs_refcount_cow_alloc(struct xfs_btree_cur *rcur, + xfs_agblock_t agbno, xfs_extlen_t aglen, + struct xfs_defer_ops *dfops); +STATIC int __xfs_refcount_cow_free(struct xfs_btree_cur *rcur, + xfs_agblock_t agbno, xfs_extlen_t aglen, + struct xfs_defer_ops *dfops); + +/* + * Look up the first record less than or equal to [bno, len] in the btree + * given by cur. + */ +int +xfs_refcount_lookup_le( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + int *stat) +{ + trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno, + XFS_LOOKUP_LE); + cur->bc_rec.rc.rc_startblock = bno; + cur->bc_rec.rc.rc_blockcount = 0; + return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); +} + +/* + * Look up the first record greater than or equal to [bno, len] in the btree + * given by cur. + */ +int +xfs_refcount_lookup_ge( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + int *stat) +{ + trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno, + XFS_LOOKUP_GE); + cur->bc_rec.rc.rc_startblock = bno; + cur->bc_rec.rc.rc_blockcount = 0; + return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); +} + +/* Convert on-disk record to in-core format. */ +static inline void +xfs_refcount_btrec_to_irec( + union xfs_btree_rec *rec, + struct xfs_refcount_irec *irec) +{ + irec->rc_startblock = be32_to_cpu(rec->refc.rc_startblock); + irec->rc_blockcount = be32_to_cpu(rec->refc.rc_blockcount); + irec->rc_refcount = be32_to_cpu(rec->refc.rc_refcount); +} + +/* + * Get the data from the pointed-to record. 
+ */ +int +xfs_refcount_get_rec( + struct xfs_btree_cur *cur, + struct xfs_refcount_irec *irec, + int *stat) +{ + union xfs_btree_rec *rec; + int error; + + error = xfs_btree_get_rec(cur, &rec, stat); + if (!error && *stat == 1) { + xfs_refcount_btrec_to_irec(rec, irec); + trace_xfs_refcount_get(cur->bc_mp, cur->bc_private.a.agno, + irec); + } + return error; +} + +/* + * Update the record referred to by cur to the value given + * by [bno, len, refcount]. + * This either works (return 0) or gets an EFSCORRUPTED error. + */ +STATIC int +xfs_refcount_update( + struct xfs_btree_cur *cur, + struct xfs_refcount_irec *irec) +{ + union xfs_btree_rec rec; + int error; + + trace_xfs_refcount_update(cur->bc_mp, cur->bc_private.a.agno, irec); + rec.refc.rc_startblock = cpu_to_be32(irec->rc_startblock); + rec.refc.rc_blockcount = cpu_to_be32(irec->rc_blockcount); + rec.refc.rc_refcount = cpu_to_be32(irec->rc_refcount); + error = xfs_btree_update(cur, &rec); + if (error) + trace_xfs_refcount_update_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +/* + * Insert the record referred to by cur to the value given + * by [bno, len, refcount]. + * This either works (return 0) or gets an EFSCORRUPTED error. 
+ */ +STATIC int +xfs_refcount_insert( + struct xfs_btree_cur *cur, + struct xfs_refcount_irec *irec, + int *i) +{ + int error; + + trace_xfs_refcount_insert(cur->bc_mp, cur->bc_private.a.agno, irec); + cur->bc_rec.rc.rc_startblock = irec->rc_startblock; + cur->bc_rec.rc.rc_blockcount = irec->rc_blockcount; + cur->bc_rec.rc.rc_refcount = irec->rc_refcount; + error = xfs_btree_insert(cur, i); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, *i == 1, out_error); +out_error: + if (error) + trace_xfs_refcount_insert_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +/* + * Remove the record referred to by cur, then set the pointer to the spot + * where the record could be re-inserted, in case we want to increment or + * decrement the cursor. + * This either works (return 0) or gets an EFSCORRUPTED error. + */ +STATIC int +xfs_refcount_delete( + struct xfs_btree_cur *cur, + int *i) +{ + struct xfs_refcount_irec irec; + int found_rec; + int error; + + error = xfs_refcount_get_rec(cur, &irec, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + trace_xfs_refcount_delete(cur->bc_mp, cur->bc_private.a.agno, &irec); + error = xfs_btree_delete(cur, i); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, *i == 1, out_error); + if (error) + goto out_error; + error = xfs_refcount_lookup_ge(cur, irec.rc_startblock, &found_rec); +out_error: + if (error) + trace_xfs_refcount_delete_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +/* + * Adjusting the Reference Count + * + * As stated elsewhere, the reference count btree (refcbt) stores + * >1 reference counts for extents of physical blocks. 
* In this operation, we're either raising or lowering the reference
 * count of some subrange stored in the tree:
 *
 *      <------ adjustment range ------>
 * ----+   +---+-----+ +--+--------+---------
 *  2  |   | 3 |  2  | |17|   55   |   10
 * ----+   +---+-----+ +--+--------+---------
 * X axis is physical blocks number;
 * reference counts are the numbers inside the rectangles
 *
 * The first thing we need to do is to ensure that there are no
 * refcount extents crossing either boundary of the range to be
 * adjusted.  For any extent that does cross a boundary, split it into
 * two extents so that we can increment the refcount of one of the
 * pieces later:
 *
 *      <------ adjustment range ------>
 * ----+   +---+-----+ +--+--------+----+----
 *  2  |   | 3 |  2  | |17|   55   | 10 | 10
 * ----+   +---+-----+ +--+--------+----+----
 *
 * For this next step, let's assume that all the physical blocks in
 * the adjustment range are mapped to a file and are therefore in use
 * at least once.  Therefore, we can infer that any gap in the
 * refcount tree within the adjustment range represents a physical
 * extent with refcount == 1:
 *
 *      <------ adjustment range ------>
 * ----+---+---+-----+-+--+--------+----+----
 *  2  |"1"| 3 |  2  |1|17|   55   | 10 | 10
 * ----+---+---+-----+-+--+--------+----+----
 *      ^
 *
 * For each extent that falls within the interval range, figure out
 * which extent is to the left or the right of that extent.  Now we
 * have a left, current, and right extent.  If the new reference count
 * of the center extent enables us to merge left, center, and right
 * into one record covering all three, do so.  If the center extent is
 * at the left end of the range, abuts the left extent, and its new
 * reference count matches the left extent's record, then merge them.
 * If the center extent is at the right end of the range, abuts the
 * right extent, and the reference counts match, merge those.
In the + * example, we can left merge (assuming an increment operation): + * + * <------ adjustment range ------> + * --------+---+-----+-+--+--------+----+---- + * 2 | 3 | 2 |1|17| 55 | 10 | 10 + * --------+---+-----+-+--+--------+----+---- + * ^ + * + * For all other extents within the range, adjust the reference count + * or delete it if the refcount falls below 2. If we were + * incrementing, the end result looks like this: + * + * <------ adjustment range ------> + * --------+---+-----+-+--+--------+----+---- + * 2 | 4 | 3 |2|18| 56 | 11 | 10 + * --------+---+-----+-+--+--------+----+---- + * + * The result of a decrement operation looks as such: + * + * <------ adjustment range ------> + * ----+ +---+ +--+--------+----+---- + * 2 | | 2 | |16| 54 | 9 | 10 + * ----+ +---+ +--+--------+----+---- + * DDDD 111111DD + * + * The blocks marked "D" are freed; the blocks marked "1" are only + * referenced once and therefore the record is removed from the + * refcount btree. + */ + +/* Next block after this extent. */ +static inline xfs_agblock_t +xfs_refc_next( + struct xfs_refcount_irec *rc) +{ + return rc->rc_startblock + rc->rc_blockcount; +} + +/* + * Split a refcount extent that crosses agbno. + */ +STATIC int +xfs_refcount_split_extent( + struct xfs_btree_cur *cur, + xfs_agblock_t agbno, + bool *shape_changed) +{ + struct xfs_refcount_irec rcext, tmp; + int found_rec; + int error; + + *shape_changed = false; + error = xfs_refcount_lookup_le(cur, agbno, &found_rec); + if (error) + goto out_error; + if (!found_rec) + return 0; + + error = xfs_refcount_get_rec(cur, &rcext, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + if (rcext.rc_startblock == agbno || xfs_refc_next(&rcext) <= agbno) + return 0; + + *shape_changed = true; + trace_xfs_refcount_split_extent(cur->bc_mp, cur->bc_private.a.agno, + &rcext, agbno); + + /* Establish the right extent. 
*/ + tmp = rcext; + tmp.rc_startblock = agbno; + tmp.rc_blockcount -= (agbno - rcext.rc_startblock); + error = xfs_refcount_update(cur, &tmp); + if (error) + goto out_error; + + /* Insert the left extent. */ + tmp = rcext; + tmp.rc_blockcount = agbno - rcext.rc_startblock; + error = xfs_refcount_insert(cur, &tmp, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + return error; + +out_error: + trace_xfs_refcount_split_extent_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +/* + * Merge the left, center, and right extents. + */ +STATIC int +xfs_refcount_merge_center_extents( + struct xfs_btree_cur *cur, + struct xfs_refcount_irec *left, + struct xfs_refcount_irec *center, + struct xfs_refcount_irec *right, + unsigned long long extlen, + xfs_agblock_t *agbno, + xfs_extlen_t *aglen) +{ + int error; + int found_rec; + + trace_xfs_refcount_merge_center_extents(cur->bc_mp, + cur->bc_private.a.agno, left, center, right); + + /* + * Make sure the center and right extents are not in the btree. + * If the center extent was synthesized, the first delete call + * removes the right extent and we skip the second deletion. + * If center and right were in the btree, then the first delete + * call removes the center and the second one removes the right + * extent. + */ + error = xfs_refcount_lookup_ge(cur, center->rc_startblock, + &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + + error = xfs_refcount_delete(cur, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + + if (center->rc_refcount > 1) { + error = xfs_refcount_delete(cur, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, + out_error); + } + + /* Enlarge the left extent. 
*/ + error = xfs_refcount_lookup_le(cur, left->rc_startblock, + &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + + left->rc_blockcount = extlen; + error = xfs_refcount_update(cur, left); + if (error) + goto out_error; + + *aglen = 0; + return error; + +out_error: + trace_xfs_refcount_merge_center_extents_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +/* + * Merge with the left extent. + */ +STATIC int +xfs_refcount_merge_left_extent( + struct xfs_btree_cur *cur, + struct xfs_refcount_irec *left, + struct xfs_refcount_irec *cleft, + xfs_agblock_t *agbno, + xfs_extlen_t *aglen) +{ + int error; + int found_rec; + + trace_xfs_refcount_merge_left_extent(cur->bc_mp, + cur->bc_private.a.agno, left, cleft); + + /* If the extent at agbno (cleft) wasn't synthesized, remove it. */ + if (cleft->rc_refcount > 1) { + error = xfs_refcount_lookup_le(cur, cleft->rc_startblock, + &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, + out_error); + + error = xfs_refcount_delete(cur, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, + out_error); + } + + /* Enlarge the left extent. */ + error = xfs_refcount_lookup_le(cur, left->rc_startblock, + &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + + left->rc_blockcount += cleft->rc_blockcount; + error = xfs_refcount_update(cur, left); + if (error) + goto out_error; + + *agbno += cleft->rc_blockcount; + *aglen -= cleft->rc_blockcount; + return error; + +out_error: + trace_xfs_refcount_merge_left_extent_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +/* + * Merge with the right extent. 
+ */ +STATIC int +xfs_refcount_merge_right_extent( + struct xfs_btree_cur *cur, + struct xfs_refcount_irec *right, + struct xfs_refcount_irec *cright, + xfs_agblock_t *agbno, + xfs_extlen_t *aglen) +{ + int error; + int found_rec; + + trace_xfs_refcount_merge_right_extent(cur->bc_mp, + cur->bc_private.a.agno, cright, right); + + /* + * If the extent ending at agbno+aglen (cright) wasn't synthesized, + * remove it. + */ + if (cright->rc_refcount > 1) { + error = xfs_refcount_lookup_le(cur, cright->rc_startblock, + &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, + out_error); + + error = xfs_refcount_delete(cur, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, + out_error); + } + + /* Enlarge the right extent. */ + error = xfs_refcount_lookup_le(cur, right->rc_startblock, + &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + + right->rc_startblock -= cright->rc_blockcount; + right->rc_blockcount += cright->rc_blockcount; + error = xfs_refcount_update(cur, right); + if (error) + goto out_error; + + *aglen -= cright->rc_blockcount; + return error; + +out_error: + trace_xfs_refcount_merge_right_extent_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +#define XFS_FIND_RCEXT_SHARED 1 +#define XFS_FIND_RCEXT_COW 2 +/* + * Find the left extent and the one after it (cleft). This function assumes + * that we've already split any extent crossing agbno. 
+ */ +STATIC int +xfs_refcount_find_left_extents( + struct xfs_btree_cur *cur, + struct xfs_refcount_irec *left, + struct xfs_refcount_irec *cleft, + xfs_agblock_t agbno, + xfs_extlen_t aglen, + int flags) +{ + struct xfs_refcount_irec tmp; + int error; + int found_rec; + + left->rc_startblock = cleft->rc_startblock = NULLAGBLOCK; + error = xfs_refcount_lookup_le(cur, agbno - 1, &found_rec); + if (error) + goto out_error; + if (!found_rec) + return 0; + + error = xfs_refcount_get_rec(cur, &tmp, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + + if (xfs_refc_next(&tmp) != agbno) + return 0; + if ((flags & XFS_FIND_RCEXT_SHARED) && tmp.rc_refcount < 2) + return 0; + if ((flags & XFS_FIND_RCEXT_COW) && tmp.rc_refcount > 1) + return 0; + /* We have a left extent; retrieve (or invent) the next right one */ + *left = tmp; + + error = xfs_btree_increment(cur, 0, &found_rec); + if (error) + goto out_error; + if (found_rec) { + error = xfs_refcount_get_rec(cur, &tmp, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, + out_error); + + /* if tmp starts at the end of our range, just use that */ + if (tmp.rc_startblock == agbno) + *cleft = tmp; + else { + /* + * There's a gap in the refcntbt at the start of the + * range we're interested in (refcount == 1) so + * synthesize the implied extent and pass it back. + * We assume here that the agbno/aglen range was + * passed in from a data fork extent mapping and + * therefore is allocated to exactly one owner. + */ + cleft->rc_startblock = agbno; + cleft->rc_blockcount = min(aglen, + tmp.rc_startblock - agbno); + cleft->rc_refcount = 1; + } + } else { + /* + * No extents, so pretend that there's one covering the whole + * range. 
+ */ + cleft->rc_startblock = agbno; + cleft->rc_blockcount = aglen; + cleft->rc_refcount = 1; + } + trace_xfs_refcount_find_left_extent(cur->bc_mp, cur->bc_private.a.agno, + left, cleft, agbno); + return error; + +out_error: + trace_xfs_refcount_find_left_extent_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +/* + * Find the right extent and the one before it (cright). This function + * assumes that we've already split any extents crossing agbno + aglen. + */ +STATIC int +xfs_refcount_find_right_extents( + struct xfs_btree_cur *cur, + struct xfs_refcount_irec *right, + struct xfs_refcount_irec *cright, + xfs_agblock_t agbno, + xfs_extlen_t aglen, + int flags) +{ + struct xfs_refcount_irec tmp; + int error; + int found_rec; + + right->rc_startblock = cright->rc_startblock = NULLAGBLOCK; + error = xfs_refcount_lookup_ge(cur, agbno + aglen, &found_rec); + if (error) + goto out_error; + if (!found_rec) + return 0; + + error = xfs_refcount_get_rec(cur, &tmp, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + + if (tmp.rc_startblock != agbno + aglen) + return 0; + if ((flags & XFS_FIND_RCEXT_SHARED) && tmp.rc_refcount < 2) + return 0; + if ((flags & XFS_FIND_RCEXT_COW) && tmp.rc_refcount > 1) + return 0; + /* We have a right extent; retrieve (or invent) the next left one */ + *right = tmp; + + error = xfs_btree_decrement(cur, 0, &found_rec); + if (error) + goto out_error; + if (found_rec) { + error = xfs_refcount_get_rec(cur, &tmp, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, + out_error); + + /* if tmp ends at the end of our range, just use that */ + if (xfs_refc_next(&tmp) == agbno + aglen) + *cright = tmp; + else { + /* + * There's a gap in the refcntbt at the end of the + * range we're interested in (refcount == 1) so + * create the implied extent and pass it back. 
+ * We assume here that the agbno/aglen range was + * passed in from a data fork extent mapping and + * therefore is allocated to exactly one owner. + */ + cright->rc_startblock = max(agbno, xfs_refc_next(&tmp)); + cright->rc_blockcount = right->rc_startblock - + cright->rc_startblock; + cright->rc_refcount = 1; + } + } else { + /* + * No extents, so pretend that there's one covering the whole + * range. + */ + cright->rc_startblock = agbno; + cright->rc_blockcount = aglen; + cright->rc_refcount = 1; + } + trace_xfs_refcount_find_right_extent(cur->bc_mp, cur->bc_private.a.agno, + cright, right, agbno + aglen); + return error; + +out_error: + trace_xfs_refcount_find_right_extent_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +/* Is this extent valid? */ +static inline bool +xfs_refc_valid( + struct xfs_refcount_irec *rc) +{ + return rc->rc_startblock != NULLAGBLOCK; +} + +/* + * Try to merge with any extents on the boundaries of the adjustment range. + */ +STATIC int +xfs_refcount_merge_extents( + struct xfs_btree_cur *cur, + xfs_agblock_t *agbno, + xfs_extlen_t *aglen, + enum xfs_refc_adjust_op adjust, + int flags, + bool *shape_changed) +{ + struct xfs_refcount_irec left = {0}, cleft = {0}; + struct xfs_refcount_irec cright = {0}, right = {0}; + int error; + unsigned long long ulen; + bool cequal; + + *shape_changed = false; + /* + * Find the extent just below agbno [left], just above agbno [cleft], + * just below (agbno + aglen) [cright], and just above (agbno + aglen) + * [right]. + */ + error = xfs_refcount_find_left_extents(cur, &left, &cleft, *agbno, + *aglen, flags); + if (error) + return error; + error = xfs_refcount_find_right_extents(cur, &right, &cright, *agbno, + *aglen, flags); + if (error) + return error; + + /* No left or right extent to merge; exit. 
*/ + if (!xfs_refc_valid(&left) && !xfs_refc_valid(&right)) + return 0; + + cequal = (cleft.rc_startblock == cright.rc_startblock) && + (cleft.rc_blockcount == cright.rc_blockcount); + + /* Try to merge left, cleft, and right. cleft must == cright. */ + ulen = (unsigned long long)left.rc_blockcount + cleft.rc_blockcount + + right.rc_blockcount; + if (xfs_refc_valid(&left) && xfs_refc_valid(&right) && + xfs_refc_valid(&cleft) && xfs_refc_valid(&cright) && cequal && + left.rc_refcount == cleft.rc_refcount + adjust && + right.rc_refcount == cleft.rc_refcount + adjust && + ulen < MAXREFCEXTLEN) { + *shape_changed = true; + return xfs_refcount_merge_center_extents(cur, &left, &cleft, + &right, ulen, agbno, aglen); + } + + /* Try to merge left and cleft. */ + ulen = (unsigned long long)left.rc_blockcount + cleft.rc_blockcount; + if (xfs_refc_valid(&left) && xfs_refc_valid(&cleft) && + left.rc_refcount == cleft.rc_refcount + adjust && + ulen < MAXREFCEXTLEN) { + *shape_changed = true; + error = xfs_refcount_merge_left_extent(cur, &left, &cleft, + agbno, aglen); + if (error) + return error; + + /* + * If we just merged left + cleft and cleft == cright, + * we no longer have a cright to merge with right. We're done. + */ + if (cequal) + return 0; + } + + /* Try to merge cright and right. */ + ulen = (unsigned long long)right.rc_blockcount + cright.rc_blockcount; + if (xfs_refc_valid(&right) && xfs_refc_valid(&cright) && + right.rc_refcount == cright.rc_refcount + adjust && + ulen < MAXREFCEXTLEN) { + *shape_changed = true; + return xfs_refcount_merge_right_extent(cur, &right, &cright, + agbno, aglen); + } + + return error; +} + +/* + * While we're adjusting the refcounts records of an extent, we have + * to keep an eye on the number of extents we're dirtying -- run too + * many in a single transaction and we'll exceed the transaction's + * reservation and crash the fs. 
Each record adds 12 bytes to the + * log (plus any key updates) so we'll conservatively assume 24 bytes + * per record. We must also leave space for btree splits on both ends + * of the range and space for the CUD and a new CUI. + * + * XXX: This is a pretty hand-wavy estimate. The penalty for guessing + * true incorrectly is a shutdown FS; the penalty for guessing false + * incorrectly is more transaction rolls than might be necessary. + * Be conservative here. + */ +static bool +xfs_refcount_still_have_space( + struct xfs_btree_cur *cur) +{ + unsigned long overhead; + + overhead = cur->bc_private.a.priv.refc.shape_changes * + xfs_allocfree_log_count(cur->bc_mp, 1); + overhead *= cur->bc_mp->m_sb.sb_blocksize; + + /* + * Only allow 2 refcount extent updates per transaction if the + * refcount continue update "error" has been injected. + */ + if (cur->bc_private.a.priv.refc.nr_ops > 2 && + XFS_TEST_ERROR(false, cur->bc_mp, + XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE, + XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE)) + return false; + + if (cur->bc_private.a.priv.refc.nr_ops == 0) + return true; + else if (overhead > cur->bc_tp->t_log_res) + return false; + return cur->bc_tp->t_log_res - overhead > + cur->bc_private.a.priv.refc.nr_ops * 32; +} + +/* + * Adjust the refcounts of middle extents. At this point we should have + * split extents that crossed the adjustment range; merged with adjacent + * extents; and updated agbno/aglen to reflect the merges. Therefore, + * all we have to do is update the extents inside [agbno, agbno + aglen]. + */ +STATIC int +xfs_refcount_adjust_extents( + struct xfs_btree_cur *cur, + xfs_agblock_t *agbno, + xfs_extlen_t *aglen, + enum xfs_refc_adjust_op adj, + struct xfs_defer_ops *dfops, + struct xfs_owner_info *oinfo) +{ + struct xfs_refcount_irec ext, tmp; + int error; + int found_rec, found_tmp; + xfs_fsblock_t fsbno; + + /* Merging did all the work already. 
*/ + if (*aglen == 0) + return 0; + + error = xfs_refcount_lookup_ge(cur, *agbno, &found_rec); + if (error) + goto out_error; + + while (*aglen > 0 && xfs_refcount_still_have_space(cur)) { + error = xfs_refcount_get_rec(cur, &ext, &found_rec); + if (error) + goto out_error; + if (!found_rec) { + ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks; + ext.rc_blockcount = 0; + ext.rc_refcount = 0; + } + + /* + * Deal with a hole in the refcount tree; if a file maps to + * these blocks and there's no refcountbt record, pretend that + * there is one with refcount == 1. + */ + if (ext.rc_startblock != *agbno) { + tmp.rc_startblock = *agbno; + tmp.rc_blockcount = min(*aglen, + ext.rc_startblock - *agbno); + tmp.rc_refcount = 1 + adj; + trace_xfs_refcount_modify_extent(cur->bc_mp, + cur->bc_private.a.agno, &tmp); + + /* + * Either cover the hole (increment) or + * delete the range (decrement). + */ + if (tmp.rc_refcount) { + error = xfs_refcount_insert(cur, &tmp, + &found_tmp); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, + found_tmp == 1, out_error); + cur->bc_private.a.priv.refc.nr_ops++; + } else { + fsbno = XFS_AGB_TO_FSB(cur->bc_mp, + cur->bc_private.a.agno, + tmp.rc_startblock); + xfs_bmap_add_free(cur->bc_mp, dfops, fsbno, + tmp.rc_blockcount, oinfo); + } + + (*agbno) += tmp.rc_blockcount; + (*aglen) -= tmp.rc_blockcount; + + error = xfs_refcount_lookup_ge(cur, *agbno, + &found_rec); + if (error) + goto out_error; + } + + /* Stop if there's nothing left to modify */ + if (*aglen == 0 || !xfs_refcount_still_have_space(cur)) + break; + + /* + * Adjust the reference count and either update the tree + * (incr) or free the blocks (decr). 
+ */ + if (ext.rc_refcount == MAXREFCOUNT) + goto skip; + ext.rc_refcount += adj; + trace_xfs_refcount_modify_extent(cur->bc_mp, + cur->bc_private.a.agno, &ext); + if (ext.rc_refcount > 1) { + error = xfs_refcount_update(cur, &ext); + if (error) + goto out_error; + cur->bc_private.a.priv.refc.nr_ops++; + } else if (ext.rc_refcount == 1) { + error = xfs_refcount_delete(cur, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, + found_rec == 1, out_error); + cur->bc_private.a.priv.refc.nr_ops++; + goto advloop; + } else { + fsbno = XFS_AGB_TO_FSB(cur->bc_mp, + cur->bc_private.a.agno, + ext.rc_startblock); + xfs_bmap_add_free(cur->bc_mp, dfops, fsbno, + ext.rc_blockcount, oinfo); + } + +skip: + error = xfs_btree_increment(cur, 0, &found_rec); + if (error) + goto out_error; + +advloop: + (*agbno) += ext.rc_blockcount; + (*aglen) -= ext.rc_blockcount; + } + + return error; +out_error: + trace_xfs_refcount_modify_extent_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +/* Adjust the reference count of a range of AG blocks. */ +STATIC int +xfs_refcount_adjust( + struct xfs_btree_cur *cur, + xfs_agblock_t agbno, + xfs_extlen_t aglen, + xfs_agblock_t *new_agbno, + xfs_extlen_t *new_aglen, + enum xfs_refc_adjust_op adj, + struct xfs_defer_ops *dfops, + struct xfs_owner_info *oinfo) +{ + bool shape_changed; + int shape_changes = 0; + int error; + + *new_agbno = agbno; + *new_aglen = aglen; + if (adj == XFS_REFCOUNT_ADJUST_INCREASE) + trace_xfs_refcount_increase(cur->bc_mp, cur->bc_private.a.agno, + agbno, aglen); + else + trace_xfs_refcount_decrease(cur->bc_mp, cur->bc_private.a.agno, + agbno, aglen); + + /* + * Ensure that no rcextents cross the boundary of the adjustment range. 
+ */ + error = xfs_refcount_split_extent(cur, agbno, &shape_changed); + if (error) + goto out_error; + if (shape_changed) + shape_changes++; + + error = xfs_refcount_split_extent(cur, agbno + aglen, &shape_changed); + if (error) + goto out_error; + if (shape_changed) + shape_changes++; + + /* + * Try to merge with the left or right extents of the range. + */ + error = xfs_refcount_merge_extents(cur, new_agbno, new_aglen, adj, + XFS_FIND_RCEXT_SHARED, &shape_changed); + if (error) + goto out_error; + if (shape_changed) + shape_changes++; + if (shape_changes) + cur->bc_private.a.priv.refc.shape_changes++; + + /* Now that we've taken care of the ends, adjust the middle extents */ + error = xfs_refcount_adjust_extents(cur, new_agbno, new_aglen, + adj, dfops, oinfo); + if (error) + goto out_error; + + return 0; + +out_error: + trace_xfs_refcount_adjust_error(cur->bc_mp, cur->bc_private.a.agno, + error, _RET_IP_); + return error; +} + +/* Clean up after calling xfs_refcount_finish_one. */ +void +xfs_refcount_finish_one_cleanup( + struct xfs_trans *tp, + struct xfs_btree_cur *rcur, + int error) +{ + struct xfs_buf *agbp; + + if (rcur == NULL) + return; + agbp = rcur->bc_private.a.agbp; + xfs_btree_del_cursor(rcur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + if (error) + xfs_trans_brelse(tp, agbp); +} + +/* + * Process one of the deferred refcount operations. We pass back the + * btree cursor to maintain our lock on the btree between calls. + * This saves time and eliminates a buffer deadlock between the + * superblock and the AGF because we'll always grab them in the same + * order. 
+ */ +int +xfs_refcount_finish_one( + struct xfs_trans *tp, + struct xfs_defer_ops *dfops, + enum xfs_refcount_intent_type type, + xfs_fsblock_t startblock, + xfs_extlen_t blockcount, + xfs_fsblock_t *new_fsb, + xfs_extlen_t *new_len, + struct xfs_btree_cur **pcur) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_btree_cur *rcur; + struct xfs_buf *agbp = NULL; + int error = 0; + xfs_agnumber_t agno; + xfs_agblock_t bno; + xfs_agblock_t new_agbno; + unsigned long nr_ops = 0; + int shape_changes = 0; + + agno = XFS_FSB_TO_AGNO(mp, startblock); + ASSERT(agno != NULLAGNUMBER); + bno = XFS_FSB_TO_AGBNO(mp, startblock); + + trace_xfs_refcount_deferred(mp, XFS_FSB_TO_AGNO(mp, startblock), + type, XFS_FSB_TO_AGBNO(mp, startblock), + blockcount); + + if (XFS_TEST_ERROR(false, mp, + XFS_ERRTAG_REFCOUNT_FINISH_ONE, + XFS_RANDOM_REFCOUNT_FINISH_ONE)) + return -EIO; + + /* + * If we haven't gotten a cursor or the cursor AG doesn't match + * the startblock, get one now. + */ + rcur = *pcur; + if (rcur != NULL && rcur->bc_private.a.agno != agno) { + nr_ops = rcur->bc_private.a.priv.refc.nr_ops; + shape_changes = rcur->bc_private.a.priv.refc.shape_changes; + xfs_refcount_finish_one_cleanup(tp, rcur, 0); + rcur = NULL; + *pcur = NULL; + } + if (rcur == NULL) { + error = xfs_alloc_read_agf(tp->t_mountp, tp, agno, + XFS_ALLOC_FLAG_FREEING, &agbp); + if (error) + return error; + if (!agbp) + return -EFSCORRUPTED; + + rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, dfops); + if (!rcur) { + error = -ENOMEM; + goto out_cur; + } + rcur->bc_private.a.priv.refc.nr_ops = nr_ops; + rcur->bc_private.a.priv.refc.shape_changes = shape_changes; + } + *pcur = rcur; + + switch (type) { + case XFS_REFCOUNT_INCREASE: + error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno, + new_len, XFS_REFCOUNT_ADJUST_INCREASE, dfops, NULL); + *new_fsb = XFS_AGB_TO_FSB(mp, agno, new_agbno); + break; + case XFS_REFCOUNT_DECREASE: + error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno, + 
new_len, XFS_REFCOUNT_ADJUST_DECREASE, dfops, NULL); + *new_fsb = XFS_AGB_TO_FSB(mp, agno, new_agbno); + break; + case XFS_REFCOUNT_ALLOC_COW: + *new_fsb = startblock + blockcount; + *new_len = 0; + error = __xfs_refcount_cow_alloc(rcur, bno, blockcount, dfops); + break; + case XFS_REFCOUNT_FREE_COW: + *new_fsb = startblock + blockcount; + *new_len = 0; + error = __xfs_refcount_cow_free(rcur, bno, blockcount, dfops); + break; + default: + ASSERT(0); + error = -EFSCORRUPTED; + } + if (!error && *new_len > 0) + trace_xfs_refcount_finish_one_leftover(mp, agno, type, + bno, blockcount, new_agbno, *new_len); + return error; + +out_cur: + xfs_trans_brelse(tp, agbp); + + return error; +} + +/* + * Record a refcount intent for later processing. + */ +static int +__xfs_refcount_add( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + enum xfs_refcount_intent_type type, + xfs_fsblock_t startblock, + xfs_extlen_t blockcount) +{ + struct xfs_refcount_intent *ri; + + trace_xfs_refcount_defer(mp, XFS_FSB_TO_AGNO(mp, startblock), + type, XFS_FSB_TO_AGBNO(mp, startblock), + blockcount); + + ri = kmem_alloc(sizeof(struct xfs_refcount_intent), + KM_SLEEP | KM_NOFS); + INIT_LIST_HEAD(&ri->ri_list); + ri->ri_type = type; + ri->ri_startblock = startblock; + ri->ri_blockcount = blockcount; + + xfs_defer_add(dfops, XFS_DEFER_OPS_TYPE_REFCOUNT, &ri->ri_list); + return 0; +} + +/* + * Increase the reference count of the blocks backing a file's extent. + */ +int +xfs_refcount_increase_extent( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + struct xfs_bmbt_irec *PREV) +{ + if (!xfs_sb_version_hasreflink(&mp->m_sb)) + return 0; + + return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_INCREASE, + PREV->br_startblock, PREV->br_blockcount); +} + +/* + * Decrease the reference count of the blocks backing a file's extent. 
+ */ +int +xfs_refcount_decrease_extent( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + struct xfs_bmbt_irec *PREV) +{ + if (!xfs_sb_version_hasreflink(&mp->m_sb)) + return 0; + + return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_DECREASE, + PREV->br_startblock, PREV->br_blockcount); +} + +/* + * Given an AG extent, find the lowest-numbered run of shared blocks + * within that range and return the range in fbno/flen. If + * find_end_of_shared is set, return the longest contiguous extent of + * shared blocks; if not, just return the first extent we find. If no + * shared blocks are found, fbno and flen will be set to NULLAGBLOCK + * and 0, respectively. + */ +int +xfs_refcount_find_shared( + struct xfs_btree_cur *cur, + xfs_agblock_t agbno, + xfs_extlen_t aglen, + xfs_agblock_t *fbno, + xfs_extlen_t *flen, + bool find_end_of_shared) +{ + struct xfs_refcount_irec tmp; + int i; + int have; + int error; + + trace_xfs_refcount_find_shared(cur->bc_mp, cur->bc_private.a.agno, + agbno, aglen); + + /* By default, skip the whole range */ + *fbno = NULLAGBLOCK; + *flen = 0; + + /* Try to find a refcount extent that crosses the start */ + error = xfs_refcount_lookup_le(cur, agbno, &have); + if (error) + goto out_error; + if (!have) { + /* No left extent, look at the next one */ + error = xfs_btree_increment(cur, 0, &have); + if (error) + goto out_error; + if (!have) + goto done; + } + error = xfs_refcount_get_rec(cur, &tmp, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, out_error); + + /* If the extent ends before the start, look at the next one */ + if (tmp.rc_startblock + tmp.rc_blockcount <= agbno) { + error = xfs_btree_increment(cur, 0, &have); + if (error) + goto out_error; + if (!have) + goto done; + error = xfs_refcount_get_rec(cur, &tmp, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, out_error); + } + + /* If the extent starts after the range we want, bail out */ + if (tmp.rc_startblock >= 
agbno + aglen) + goto done; + + /* We found the start of a shared extent! */ + if (tmp.rc_startblock < agbno) { + tmp.rc_blockcount -= (agbno - tmp.rc_startblock); + tmp.rc_startblock = agbno; + } + + *fbno = tmp.rc_startblock; + *flen = min(tmp.rc_blockcount, agbno + aglen - *fbno); + if (!find_end_of_shared) + goto done; + + /* Otherwise, find the end of this shared extent */ + while (*fbno + *flen < agbno + aglen) { + error = xfs_btree_increment(cur, 0, &have); + if (error) + goto out_error; + if (!have) + break; + error = xfs_refcount_get_rec(cur, &tmp, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, out_error); + if (tmp.rc_startblock >= agbno + aglen || + tmp.rc_startblock != *fbno + *flen) + break; + *flen = min(*flen + tmp.rc_blockcount, agbno + aglen - *fbno); + } + +done: + trace_xfs_refcount_find_shared_result(cur->bc_mp, + cur->bc_private.a.agno, *fbno, *flen); + +out_error: + if (error) + trace_xfs_refcount_find_shared_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +/* + * Recovering CoW Blocks After a Crash + * + * Due to the way that the copy on write mechanism works, there's a window of + * opportunity in which we can lose track of allocated blocks during a crash. + * Because CoW uses delayed allocation in the in-core CoW fork, writeback + * causes blocks to be allocated and stored in the CoW fork. The blocks are + * no longer in the free space btree but are not otherwise recorded anywhere + * until the write completes and the blocks are mapped into the file. A crash + * in between allocation and remapping results in the replacement blocks being + * lost. This situation is exacerbated by the CoW extent size hint because + * allocations can hang around for a long time. + * + * However, there is a place where we can record these allocations before they + * become mappings -- the reference count btree.
The btree does not record + * extents with refcount == 1, so we can record allocations with a refcount of + * 1. Blocks being used for CoW writeout cannot be shared, so there should be + * no conflict with shared block records. These mappings should be created + * when we allocate blocks to the CoW fork and deleted when they're removed + * from the CoW fork. + * + * Minor nit: records for in-progress CoW allocations and records for shared + * extents must never be merged, to preserve the property that (except for CoW + * allocations) there are no refcount btree entries with refcount == 1. The + * only time this could potentially happen is when unsharing a block that's + * adjacent to CoW allocations, so we must be careful to avoid this. + * + * At mount time we recover lost CoW allocations by searching the refcount + * btree for these refcount == 1 mappings. These represent CoW allocations + * that were in progress at the time the filesystem went down, so we can free + * them to get the space back. + * + * This mechanism is superior to creating EFIs for unmapped CoW extents for + * several reasons -- first, EFIs pin the tail of the log and would have to be + * periodically relogged to avoid filling up the log. Second, CoW completions + * will have to file an EFD and create new EFIs for whatever remains in the + * CoW fork; this partially takes care of (1) but extent-size reservations + * will have to periodically relog even if there's no writeout in progress. + * This can happen if the CoW extent size hint is set, which you really want. + * Third, EFIs cannot currently be automatically relogged into newer + * transactions to advance the log tail. Fourth, stuffing the log full of + * EFIs places an upper bound on the number of CoW allocations that can be + * held filesystem-wide at any given time. Recording them in the refcount + * btree doesn't require us to maintain any state in memory and doesn't pin + * the log. 
+ */ +/* + * Adjust the refcounts of CoW allocations. These allocations are "magic" + * in that they're not referenced anywhere else in the filesystem, so we + * stash them in the refcount btree with a refcount of 1 until either file + * remapping (or CoW cancellation) happens. + */ +STATIC int +xfs_refcount_adjust_cow_extents( + struct xfs_btree_cur *cur, + xfs_agblock_t agbno, + xfs_extlen_t aglen, + enum xfs_refc_adjust_op adj, + struct xfs_defer_ops *dfops, + struct xfs_owner_info *oinfo) +{ + struct xfs_refcount_irec ext, tmp; + int error; + int found_rec, found_tmp; + + if (aglen == 0) + return 0; + + /* Find any overlapping refcount records */ + error = xfs_refcount_lookup_ge(cur, agbno, &found_rec); + if (error) + goto out_error; + error = xfs_refcount_get_rec(cur, &ext, &found_rec); + if (error) + goto out_error; + if (!found_rec) { + ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks + + XFS_REFC_COW_START; + ext.rc_blockcount = 0; + ext.rc_refcount = 0; + } + + switch (adj) { + case XFS_REFCOUNT_ADJUST_COW_ALLOC: + /* Adding a CoW reservation, there should be nothing here. */ + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, + ext.rc_startblock >= agbno + aglen, out_error); + + tmp.rc_startblock = agbno; + tmp.rc_blockcount = aglen; + tmp.rc_refcount = 1; + trace_xfs_refcount_modify_extent(cur->bc_mp, + cur->bc_private.a.agno, &tmp); + + error = xfs_refcount_insert(cur, &tmp, + &found_tmp); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, + found_tmp == 1, out_error); + break; + case XFS_REFCOUNT_ADJUST_COW_FREE: + /* Removing a CoW reservation, there should be one extent. 
*/ + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, + ext.rc_startblock == agbno, out_error); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, + ext.rc_blockcount == aglen, out_error); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, + ext.rc_refcount == 1, out_error); + + ext.rc_refcount = 0; + trace_xfs_refcount_modify_extent(cur->bc_mp, + cur->bc_private.a.agno, &ext); + error = xfs_refcount_delete(cur, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, + found_rec == 1, out_error); + break; + default: + ASSERT(0); + } + + return error; +out_error: + trace_xfs_refcount_modify_extent_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +/* + * Add or remove refcount btree entries for CoW reservations. + */ +STATIC int +xfs_refcount_adjust_cow( + struct xfs_btree_cur *cur, + xfs_agblock_t agbno, + xfs_extlen_t aglen, + enum xfs_refc_adjust_op adj, + struct xfs_defer_ops *dfops) +{ + bool shape_changed; + int error; + + agbno += XFS_REFC_COW_START; + + /* + * Ensure that no rcextents cross the boundary of the adjustment range. + */ + error = xfs_refcount_split_extent(cur, agbno, &shape_changed); + if (error) + goto out_error; + + error = xfs_refcount_split_extent(cur, agbno + aglen, &shape_changed); + if (error) + goto out_error; + + /* + * Try to merge with the left or right extents of the range. + */ + error = xfs_refcount_merge_extents(cur, &agbno, &aglen, adj, + XFS_FIND_RCEXT_COW, &shape_changed); + if (error) + goto out_error; + + /* Now that we've taken care of the ends, adjust the middle extents */ + error = xfs_refcount_adjust_cow_extents(cur, agbno, aglen, adj, + dfops, NULL); + if (error) + goto out_error; + + return 0; + +out_error: + trace_xfs_refcount_adjust_cow_error(cur->bc_mp, cur->bc_private.a.agno, + error, _RET_IP_); + return error; +} + +/* + * Record a CoW allocation in the refcount btree. 
+ */ +STATIC int +__xfs_refcount_cow_alloc( + struct xfs_btree_cur *rcur, + xfs_agblock_t agbno, + xfs_extlen_t aglen, + struct xfs_defer_ops *dfops) +{ + int error; + + trace_xfs_refcount_cow_increase(rcur->bc_mp, rcur->bc_private.a.agno, + agbno, aglen); + + /* Add refcount btree reservation */ + error = xfs_refcount_adjust_cow(rcur, agbno, aglen, + XFS_REFCOUNT_ADJUST_COW_ALLOC, dfops); + if (error) + return error; + + /* Add rmap entry */ + if (xfs_sb_version_hasrmapbt(&rcur->bc_mp->m_sb)) { + error = xfs_rmap_alloc_extent(rcur->bc_mp, dfops, + rcur->bc_private.a.agno, + agbno, aglen, XFS_RMAP_OWN_COW); + if (error) + return error; + } + + return error; +} + +/* + * Remove a CoW allocation from the refcount btree. + */ +STATIC int +__xfs_refcount_cow_free( + struct xfs_btree_cur *rcur, + xfs_agblock_t agbno, + xfs_extlen_t aglen, + struct xfs_defer_ops *dfops) +{ + int error; + + trace_xfs_refcount_cow_decrease(rcur->bc_mp, rcur->bc_private.a.agno, + agbno, aglen); + + /* Remove refcount btree reservation */ + error = xfs_refcount_adjust_cow(rcur, agbno, aglen, + XFS_REFCOUNT_ADJUST_COW_FREE, dfops); + if (error) + return error; + + /* Remove rmap entry */ + if (xfs_sb_version_hasrmapbt(&rcur->bc_mp->m_sb)) { + error = xfs_rmap_free_extent(rcur->bc_mp, dfops, + rcur->bc_private.a.agno, + agbno, aglen, XFS_RMAP_OWN_COW); + if (error) + return error; + } + + return error; +} + +/* Record a CoW staging extent in the refcount btree. */ +int +xfs_refcount_alloc_cow_extent( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + xfs_fsblock_t fsb, + xfs_extlen_t len) +{ + if (!xfs_sb_version_hasreflink(&mp->m_sb)) + return 0; + + return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_ALLOC_COW, + fsb, len); +} + +/* Forget a CoW staging extent in the refcount btree.
*/ +int +xfs_refcount_free_cow_extent( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + xfs_fsblock_t fsb, + xfs_extlen_t len) +{ + if (!xfs_sb_version_hasreflink(&mp->m_sb)) + return 0; + + return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_FREE_COW, + fsb, len); +} + +struct xfs_refcount_recovery { + struct list_head rr_list; + struct xfs_refcount_irec rr_rrec; +}; + +/* Stuff an extent on the recovery list. */ +STATIC int +xfs_refcount_recover_extent( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec, + void *priv) +{ + struct list_head *debris = priv; + struct xfs_refcount_recovery *rr; + + if (be32_to_cpu(rec->refc.rc_refcount) != 1) + return -EFSCORRUPTED; + + rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), KM_SLEEP); + xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec); + list_add_tail(&rr->rr_list, debris); + + return 0; +} + +/* Find and remove leftover CoW reservations. */ +int +xfs_refcount_recover_cow_leftovers( + struct xfs_mount *mp, + xfs_agnumber_t agno) +{ + struct xfs_trans *tp; + struct xfs_btree_cur *cur; + struct xfs_buf *agbp; + struct xfs_refcount_recovery *rr, *n; + struct list_head debris; + union xfs_btree_irec low; + union xfs_btree_irec high; + struct xfs_defer_ops dfops; + xfs_fsblock_t fsb; + xfs_agblock_t agbno; + int error; + + if (mp->m_sb.sb_agblocks >= XFS_REFC_COW_START) + return -EOPNOTSUPP; + + error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); + if (error) + return error; + cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL); + + /* Find all the leftover CoW staging extents. 
*/ + INIT_LIST_HEAD(&debris); + memset(&low, 0, sizeof(low)); + memset(&high, 0, sizeof(high)); + low.rc.rc_startblock = XFS_REFC_COW_START; + high.rc.rc_startblock = -1U; + error = xfs_btree_query_range(cur, &low, &high, + xfs_refcount_recover_extent, &debris); + if (error) + goto out_cursor; + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + xfs_buf_relse(agbp); + + /* Now iterate the list to free the leftovers */ + list_for_each_entry(rr, &debris, rr_list) { + /* Set up transaction. */ + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp); + if (error) + goto out_free; + + trace_xfs_refcount_recover_extent(mp, agno, &rr->rr_rrec); + + /* Free the orphan record */ + xfs_defer_init(&dfops, &fsb); + agbno = rr->rr_rrec.rc_startblock - XFS_REFC_COW_START; + fsb = XFS_AGB_TO_FSB(mp, agno, agbno); + error = xfs_refcount_free_cow_extent(mp, &dfops, fsb, + rr->rr_rrec.rc_blockcount); + if (error) + goto out_defer; + + /* Free the block. */ + xfs_bmap_add_free(mp, &dfops, fsb, + rr->rr_rrec.rc_blockcount, NULL); + + error = xfs_defer_finish(&tp, &dfops, NULL); + if (error) + goto out_defer; + + error = xfs_trans_commit(tp); + if (error) + goto out_free; + } + +out_free: + /* Free the leftover list */ + list_for_each_entry_safe(rr, n, &debris, rr_list) { + list_del(&rr->rr_list); + kmem_free(rr); + } + return error; + +out_cursor: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + xfs_buf_relse(agbp); + goto out_free; + +out_defer: + xfs_defer_cancel(&dfops); + xfs_trans_cancel(tp); + goto out_free; +} diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h new file mode 100644 index 000000000000..098dc668ab2c --- /dev/null +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -0,0 +1,70 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. 
Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#ifndef __XFS_REFCOUNT_H__ +#define __XFS_REFCOUNT_H__ + +extern int xfs_refcount_lookup_le(struct xfs_btree_cur *cur, + xfs_agblock_t bno, int *stat); +extern int xfs_refcount_lookup_ge(struct xfs_btree_cur *cur, + xfs_agblock_t bno, int *stat); +extern int xfs_refcount_get_rec(struct xfs_btree_cur *cur, + struct xfs_refcount_irec *irec, int *stat); + +enum xfs_refcount_intent_type { + XFS_REFCOUNT_INCREASE = 1, + XFS_REFCOUNT_DECREASE, + XFS_REFCOUNT_ALLOC_COW, + XFS_REFCOUNT_FREE_COW, +}; + +struct xfs_refcount_intent { + struct list_head ri_list; + enum xfs_refcount_intent_type ri_type; + xfs_fsblock_t ri_startblock; + xfs_extlen_t ri_blockcount; +}; + +extern int xfs_refcount_increase_extent(struct xfs_mount *mp, + struct xfs_defer_ops *dfops, struct xfs_bmbt_irec *irec); +extern int xfs_refcount_decrease_extent(struct xfs_mount *mp, + struct xfs_defer_ops *dfops, struct xfs_bmbt_irec *irec); + +extern void xfs_refcount_finish_one_cleanup(struct xfs_trans *tp, + struct xfs_btree_cur *rcur, int error); +extern int xfs_refcount_finish_one(struct xfs_trans *tp, + struct xfs_defer_ops *dfops, enum xfs_refcount_intent_type type, + xfs_fsblock_t startblock, xfs_extlen_t blockcount, + xfs_fsblock_t *new_fsb, 
xfs_extlen_t *new_len, + struct xfs_btree_cur **pcur); + +extern int xfs_refcount_find_shared(struct xfs_btree_cur *cur, + xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, + xfs_extlen_t *flen, bool find_end_of_shared); + +extern int xfs_refcount_alloc_cow_extent(struct xfs_mount *mp, + struct xfs_defer_ops *dfops, xfs_fsblock_t fsb, + xfs_extlen_t len); +extern int xfs_refcount_free_cow_extent(struct xfs_mount *mp, + struct xfs_defer_ops *dfops, xfs_fsblock_t fsb, + xfs_extlen_t len); +extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp, + xfs_agnumber_t agno); + +#endif /* __XFS_REFCOUNT_H__ */ diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c new file mode 100644 index 000000000000..453bb2757ec2 --- /dev/null +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -0,0 +1,451 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_bmap.h" +#include "xfs_refcount_btree.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_trans.h" +#include "xfs_bit.h" +#include "xfs_rmap.h" + +static struct xfs_btree_cur * +xfs_refcountbt_dup_cursor( + struct xfs_btree_cur *cur) +{ + return xfs_refcountbt_init_cursor(cur->bc_mp, cur->bc_tp, + cur->bc_private.a.agbp, cur->bc_private.a.agno, + cur->bc_private.a.dfops); +} + +STATIC void +xfs_refcountbt_set_root( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + int inc) +{ + struct xfs_buf *agbp = cur->bc_private.a.agbp; + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); + struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno); + + ASSERT(ptr->s != 0); + + agf->agf_refcount_root = ptr->s; + be32_add_cpu(&agf->agf_refcount_level, inc); + pag->pagf_refcount_level += inc; + xfs_perag_put(pag); + + xfs_alloc_log_agf(cur->bc_tp, agbp, + XFS_AGF_REFCOUNT_ROOT | XFS_AGF_REFCOUNT_LEVEL); +} + +STATIC int +xfs_refcountbt_alloc_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) +{ + struct xfs_buf *agbp = cur->bc_private.a.agbp; + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_alloc_arg args; /* block allocation args */ + int error; /* error return value */ + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + + memset(&args, 0, sizeof(args)); + args.tp = cur->bc_tp; + args.mp = cur->bc_mp; + args.type = XFS_ALLOCTYPE_NEAR_BNO; + args.fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno, + xfs_refc_block(args.mp)); + args.firstblock = args.fsbno; + xfs_rmap_ag_owner(&args.oinfo, XFS_RMAP_OWN_REFC); + args.minlen = args.maxlen = args.prod = 1; + args.resv = 
XFS_AG_RESV_METADATA; + + error = xfs_alloc_vextent(&args); + if (error) + goto out_error; + trace_xfs_refcountbt_alloc_block(cur->bc_mp, cur->bc_private.a.agno, + args.agbno, 1); + if (args.fsbno == NULLFSBLOCK) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + } + ASSERT(args.agno == cur->bc_private.a.agno); + ASSERT(args.len == 1); + + new->s = cpu_to_be32(args.agbno); + be32_add_cpu(&agf->agf_refcount_blocks, 1); + xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS); + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + +out_error: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +STATIC int +xfs_refcountbt_free_block( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_buf *agbp = cur->bc_private.a.agbp; + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp)); + struct xfs_owner_info oinfo; + int error; + + trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_private.a.agno, + XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1); + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_REFC); + be32_add_cpu(&agf->agf_refcount_blocks, -1); + xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS); + error = xfs_free_extent(cur->bc_tp, fsbno, 1, &oinfo, + XFS_AG_RESV_METADATA); + if (error) + return error; + + return error; +} + +STATIC int +xfs_refcountbt_get_minrecs( + struct xfs_btree_cur *cur, + int level) +{ + return cur->bc_mp->m_refc_mnr[level != 0]; +} + +STATIC int +xfs_refcountbt_get_maxrecs( + struct xfs_btree_cur *cur, + int level) +{ + return cur->bc_mp->m_refc_mxr[level != 0]; +} + +STATIC void +xfs_refcountbt_init_key_from_rec( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + key->refc.rc_startblock = rec->refc.rc_startblock; +} + +STATIC void +xfs_refcountbt_init_high_key_from_rec( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + __u32 x; + + x = be32_to_cpu(rec->refc.rc_startblock); + 
x += be32_to_cpu(rec->refc.rc_blockcount) - 1; + key->refc.rc_startblock = cpu_to_be32(x); +} + +STATIC void +xfs_refcountbt_init_rec_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec) +{ + rec->refc.rc_startblock = cpu_to_be32(cur->bc_rec.rc.rc_startblock); + rec->refc.rc_blockcount = cpu_to_be32(cur->bc_rec.rc.rc_blockcount); + rec->refc.rc_refcount = cpu_to_be32(cur->bc_rec.rc.rc_refcount); +} + +STATIC void +xfs_refcountbt_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + + ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno)); + ASSERT(agf->agf_refcount_root != 0); + + ptr->s = agf->agf_refcount_root; +} + +STATIC __int64_t +xfs_refcountbt_key_diff( + struct xfs_btree_cur *cur, + union xfs_btree_key *key) +{ + struct xfs_refcount_irec *rec = &cur->bc_rec.rc; + struct xfs_refcount_key *kp = &key->refc; + + return (__int64_t)be32_to_cpu(kp->rc_startblock) - rec->rc_startblock; +} + +STATIC __int64_t +xfs_refcountbt_diff_two_keys( + struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2) +{ + return (__int64_t)be32_to_cpu(k1->refc.rc_startblock) - + be32_to_cpu(k2->refc.rc_startblock); +} + +STATIC bool +xfs_refcountbt_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_perag *pag = bp->b_pag; + unsigned int level; + + if (block->bb_magic != cpu_to_be32(XFS_REFC_CRC_MAGIC)) + return false; + + if (!xfs_sb_version_hasreflink(&mp->m_sb)) + return false; + if (!xfs_btree_sblock_v5hdr_verify(bp)) + return false; + + level = be16_to_cpu(block->bb_level); + if (pag && pag->pagf_init) { + if (level >= pag->pagf_refcount_level) + return false; + } else if (level >= mp->m_refc_maxlevels) + return false; + + return xfs_btree_sblock_verify(bp, mp->m_refc_mxr[level != 0]); +} + +STATIC void +xfs_refcountbt_read_verify( + struct xfs_buf 
*bp) +{ + if (!xfs_btree_sblock_verify_crc(bp)) + xfs_buf_ioerror(bp, -EFSBADCRC); + else if (!xfs_refcountbt_verify(bp)) + xfs_buf_ioerror(bp, -EFSCORRUPTED); + + if (bp->b_error) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_verifier_error(bp); + } +} + +STATIC void +xfs_refcountbt_write_verify( + struct xfs_buf *bp) +{ + if (!xfs_refcountbt_verify(bp)) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + xfs_btree_sblock_calc_crc(bp); + +} + +const struct xfs_buf_ops xfs_refcountbt_buf_ops = { + .name = "xfs_refcountbt", + .verify_read = xfs_refcountbt_read_verify, + .verify_write = xfs_refcountbt_write_verify, +}; + +#if defined(DEBUG) || defined(XFS_WARN) +STATIC int +xfs_refcountbt_keys_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2) +{ + return be32_to_cpu(k1->refc.rc_startblock) < + be32_to_cpu(k2->refc.rc_startblock); +} + +STATIC int +xfs_refcountbt_recs_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_rec *r1, + union xfs_btree_rec *r2) +{ + return be32_to_cpu(r1->refc.rc_startblock) + + be32_to_cpu(r1->refc.rc_blockcount) <= + be32_to_cpu(r2->refc.rc_startblock); +} +#endif + +static const struct xfs_btree_ops xfs_refcountbt_ops = { + .rec_len = sizeof(struct xfs_refcount_rec), + .key_len = sizeof(struct xfs_refcount_key), + + .dup_cursor = xfs_refcountbt_dup_cursor, + .set_root = xfs_refcountbt_set_root, + .alloc_block = xfs_refcountbt_alloc_block, + .free_block = xfs_refcountbt_free_block, + .get_minrecs = xfs_refcountbt_get_minrecs, + .get_maxrecs = xfs_refcountbt_get_maxrecs, + .init_key_from_rec = xfs_refcountbt_init_key_from_rec, + .init_high_key_from_rec = xfs_refcountbt_init_high_key_from_rec, + .init_rec_from_cur = xfs_refcountbt_init_rec_from_cur, + .init_ptr_from_cur = xfs_refcountbt_init_ptr_from_cur, + .key_diff = xfs_refcountbt_key_diff, + .buf_ops = &xfs_refcountbt_buf_ops, + .diff_two_keys = 
xfs_refcountbt_diff_two_keys, +#if defined(DEBUG) || defined(XFS_WARN) + .keys_inorder = xfs_refcountbt_keys_inorder, + .recs_inorder = xfs_refcountbt_recs_inorder, +#endif +}; + +/* + * Allocate a new refcount btree cursor. + */ +struct xfs_btree_cur * +xfs_refcountbt_init_cursor( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agnumber_t agno, + struct xfs_defer_ops *dfops) +{ + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_btree_cur *cur; + + ASSERT(agno != NULLAGNUMBER); + ASSERT(agno < mp->m_sb.sb_agcount); + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); + + cur->bc_tp = tp; + cur->bc_mp = mp; + cur->bc_btnum = XFS_BTNUM_REFC; + cur->bc_blocklog = mp->m_sb.sb_blocklog; + cur->bc_ops = &xfs_refcountbt_ops; + + cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level); + + cur->bc_private.a.agbp = agbp; + cur->bc_private.a.agno = agno; + cur->bc_private.a.dfops = dfops; + cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; + + cur->bc_private.a.priv.refc.nr_ops = 0; + cur->bc_private.a.priv.refc.shape_changes = 0; + + return cur; +} + +/* + * Calculate the number of records in a refcount btree block. + */ +int +xfs_refcountbt_maxrecs( + struct xfs_mount *mp, + int blocklen, + bool leaf) +{ + blocklen -= XFS_REFCOUNT_BLOCK_LEN; + + if (leaf) + return blocklen / sizeof(struct xfs_refcount_rec); + return blocklen / (sizeof(struct xfs_refcount_key) + + sizeof(xfs_refcount_ptr_t)); +} + +/* Compute the maximum height of a refcount btree. */ +void +xfs_refcountbt_compute_maxlevels( + struct xfs_mount *mp) +{ + mp->m_refc_maxlevels = xfs_btree_compute_maxlevels(mp, + mp->m_refc_mnr, mp->m_sb.sb_agblocks); +} + +/* Calculate the refcount btree size for some records. */ +xfs_extlen_t +xfs_refcountbt_calc_size( + struct xfs_mount *mp, + unsigned long long len) +{ + return xfs_btree_calc_size(mp, mp->m_refc_mnr, len); +} + +/* + * Calculate the maximum refcount btree size. 
+ */ +xfs_extlen_t +xfs_refcountbt_max_size( + struct xfs_mount *mp) +{ + /* Bail out if we're uninitialized, which can happen in mkfs. */ + if (mp->m_refc_mxr[0] == 0) + return 0; + + return xfs_refcountbt_calc_size(mp, mp->m_sb.sb_agblocks); +} + +/* + * Figure out how many blocks to reserve and how many are used by this btree. + */ +int +xfs_refcountbt_calc_reserves( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_extlen_t *ask, + xfs_extlen_t *used) +{ + struct xfs_buf *agbp; + struct xfs_agf *agf; + xfs_extlen_t tree_len; + int error; + + if (!xfs_sb_version_hasreflink(&mp->m_sb)) + return 0; + + *ask += xfs_refcountbt_max_size(mp); + + error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); + if (error) + return error; + + agf = XFS_BUF_TO_AGF(agbp); + tree_len = be32_to_cpu(agf->agf_refcount_blocks); + xfs_buf_relse(agbp); + + *used += tree_len; + + return error; +} diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h new file mode 100644 index 000000000000..3be7768bd51a --- /dev/null +++ b/fs/xfs/libxfs/xfs_refcount_btree.h @@ -0,0 +1,74 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ +#ifndef __XFS_REFCOUNT_BTREE_H__ +#define __XFS_REFCOUNT_BTREE_H__ + +/* + * Reference Count Btree on-disk structures + */ + +struct xfs_buf; +struct xfs_btree_cur; +struct xfs_mount; + +/* + * Btree block header size + */ +#define XFS_REFCOUNT_BLOCK_LEN XFS_BTREE_SBLOCK_CRC_LEN + +/* + * Record, key, and pointer address macros for btree blocks. + * + * (note that some of these may appear unused, but they are used in userspace) + */ +#define XFS_REFCOUNT_REC_ADDR(block, index) \ + ((struct xfs_refcount_rec *) \ + ((char *)(block) + \ + XFS_REFCOUNT_BLOCK_LEN + \ + (((index) - 1) * sizeof(struct xfs_refcount_rec)))) + +#define XFS_REFCOUNT_KEY_ADDR(block, index) \ + ((struct xfs_refcount_key *) \ + ((char *)(block) + \ + XFS_REFCOUNT_BLOCK_LEN + \ + ((index) - 1) * sizeof(struct xfs_refcount_key))) + +#define XFS_REFCOUNT_PTR_ADDR(block, index, maxrecs) \ + ((xfs_refcount_ptr_t *) \ + ((char *)(block) + \ + XFS_REFCOUNT_BLOCK_LEN + \ + (maxrecs) * sizeof(struct xfs_refcount_key) + \ + ((index) - 1) * sizeof(xfs_refcount_ptr_t))) + +extern struct xfs_btree_cur *xfs_refcountbt_init_cursor(struct xfs_mount *mp, + struct xfs_trans *tp, struct xfs_buf *agbp, xfs_agnumber_t agno, + struct xfs_defer_ops *dfops); +extern int xfs_refcountbt_maxrecs(struct xfs_mount *mp, int blocklen, + bool leaf); +extern void xfs_refcountbt_compute_maxlevels(struct xfs_mount *mp); + +extern xfs_extlen_t xfs_refcountbt_calc_size(struct xfs_mount *mp, + unsigned long long len); +extern xfs_extlen_t xfs_refcountbt_max_size(struct xfs_mount *mp); + +extern int xfs_refcountbt_calc_reserves(struct xfs_mount *mp, + xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used); + +#endif /* __XFS_REFCOUNT_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 73d05407d663..3a8cc7139912 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -148,6 +148,37 @@ done: return error; } +STATIC int +xfs_rmap_delete( + struct xfs_btree_cur *rcur, + 
xfs_agblock_t agbno, + xfs_extlen_t len, + uint64_t owner, + uint64_t offset, + unsigned int flags) +{ + int i; + int error; + + trace_xfs_rmap_delete(rcur->bc_mp, rcur->bc_private.a.agno, agbno, + len, owner, offset, flags); + + error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(rcur->bc_mp, i == 1, done); + + error = xfs_btree_delete(rcur, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(rcur->bc_mp, i == 1, done); +done: + if (error) + trace_xfs_rmap_delete_error(rcur->bc_mp, + rcur->bc_private.a.agno, error, _RET_IP_); + return error; +} + static int xfs_rmap_btrec_to_irec( union xfs_btree_rec *rec, @@ -180,6 +211,160 @@ xfs_rmap_get_rec( return xfs_rmap_btrec_to_irec(rec, irec); } +struct xfs_find_left_neighbor_info { + struct xfs_rmap_irec high; + struct xfs_rmap_irec *irec; + int *stat; +}; + +/* For each rmap given, figure out if it matches the key we want. */ +STATIC int +xfs_rmap_find_left_neighbor_helper( + struct xfs_btree_cur *cur, + struct xfs_rmap_irec *rec, + void *priv) +{ + struct xfs_find_left_neighbor_info *info = priv; + + trace_xfs_rmap_find_left_neighbor_candidate(cur->bc_mp, + cur->bc_private.a.agno, rec->rm_startblock, + rec->rm_blockcount, rec->rm_owner, rec->rm_offset, + rec->rm_flags); + + if (rec->rm_owner != info->high.rm_owner) + return XFS_BTREE_QUERY_RANGE_CONTINUE; + if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) && + !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) && + rec->rm_offset + rec->rm_blockcount - 1 != info->high.rm_offset) + return XFS_BTREE_QUERY_RANGE_CONTINUE; + + *info->irec = *rec; + *info->stat = 1; + return XFS_BTREE_QUERY_RANGE_ABORT; +} + +/* + * Find the record to the left of the given extent, being careful only to + * return a match with the same owner and adjacent physical and logical + * block ranges. 
+ */ +int +xfs_rmap_find_left_neighbor( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + uint64_t owner, + uint64_t offset, + unsigned int flags, + struct xfs_rmap_irec *irec, + int *stat) +{ + struct xfs_find_left_neighbor_info info; + int error; + + *stat = 0; + if (bno == 0) + return 0; + info.high.rm_startblock = bno - 1; + info.high.rm_owner = owner; + if (!XFS_RMAP_NON_INODE_OWNER(owner) && + !(flags & XFS_RMAP_BMBT_BLOCK)) { + if (offset == 0) + return 0; + info.high.rm_offset = offset - 1; + } else + info.high.rm_offset = 0; + info.high.rm_flags = flags; + info.high.rm_blockcount = 0; + info.irec = irec; + info.stat = stat; + + trace_xfs_rmap_find_left_neighbor_query(cur->bc_mp, + cur->bc_private.a.agno, bno, 0, owner, offset, flags); + + error = xfs_rmap_query_range(cur, &info.high, &info.high, + xfs_rmap_find_left_neighbor_helper, &info); + if (error == XFS_BTREE_QUERY_RANGE_ABORT) + error = 0; + if (*stat) + trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, + cur->bc_private.a.agno, irec->rm_startblock, + irec->rm_blockcount, irec->rm_owner, + irec->rm_offset, irec->rm_flags); + return error; +} + +/* For each rmap given, figure out if it matches the key we want. 
*/ +STATIC int +xfs_rmap_lookup_le_range_helper( + struct xfs_btree_cur *cur, + struct xfs_rmap_irec *rec, + void *priv) +{ + struct xfs_find_left_neighbor_info *info = priv; + + trace_xfs_rmap_lookup_le_range_candidate(cur->bc_mp, + cur->bc_private.a.agno, rec->rm_startblock, + rec->rm_blockcount, rec->rm_owner, rec->rm_offset, + rec->rm_flags); + + if (rec->rm_owner != info->high.rm_owner) + return XFS_BTREE_QUERY_RANGE_CONTINUE; + if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) && + !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) && + (rec->rm_offset > info->high.rm_offset || + rec->rm_offset + rec->rm_blockcount <= info->high.rm_offset)) + return XFS_BTREE_QUERY_RANGE_CONTINUE; + + *info->irec = *rec; + *info->stat = 1; + return XFS_BTREE_QUERY_RANGE_ABORT; +} + +/* + * Find the record to the left of the given extent, being careful only to + * return a match with the same owner and overlapping physical and logical + * block ranges. This is the overlapping-interval version of + * xfs_rmap_lookup_le. 
+ */ +int +xfs_rmap_lookup_le_range( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + uint64_t owner, + uint64_t offset, + unsigned int flags, + struct xfs_rmap_irec *irec, + int *stat) +{ + struct xfs_find_left_neighbor_info info; + int error; + + info.high.rm_startblock = bno; + info.high.rm_owner = owner; + if (!XFS_RMAP_NON_INODE_OWNER(owner) && !(flags & XFS_RMAP_BMBT_BLOCK)) + info.high.rm_offset = offset; + else + info.high.rm_offset = 0; + info.high.rm_flags = flags; + info.high.rm_blockcount = 0; + *stat = 0; + info.irec = irec; + info.stat = stat; + + trace_xfs_rmap_lookup_le_range(cur->bc_mp, + cur->bc_private.a.agno, bno, 0, owner, offset, flags); + error = xfs_rmap_query_range(cur, &info.high, &info.high, + xfs_rmap_lookup_le_range_helper, &info); + if (error == XFS_BTREE_QUERY_RANGE_ABORT) + error = 0; + if (*stat) + trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, + cur->bc_private.a.agno, irec->rm_startblock, + irec->rm_blockcount, irec->rm_owner, + irec->rm_offset, irec->rm_flags); + return error; +} + /* * Find the extent in the rmap btree and remove it. * @@ -1093,11 +1278,704 @@ done: return error; } +/* + * Convert an unwritten extent to a real extent or vice versa. If there is no + * possibility of overlapping extents, delegate to the simpler convert + * function. + */ +STATIC int +xfs_rmap_convert_shared( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + bool unwritten, + struct xfs_owner_info *oinfo) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_rmap_irec r[4]; /* neighbor extent entries */ + /* left is 0, right is 1, prev is 2 */ + /* new is 3 */ + uint64_t owner; + uint64_t offset; + uint64_t new_endoff; + unsigned int oldext; + unsigned int newext; + unsigned int flags = 0; + int i; + int state = 0; + int error; + + xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); + ASSERT(!(XFS_RMAP_NON_INODE_OWNER(owner) || + (flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)))); + oldext = unwritten ? 
XFS_RMAP_UNWRITTEN : 0; + new_endoff = offset + len; + trace_xfs_rmap_convert(mp, cur->bc_private.a.agno, bno, len, + unwritten, oinfo); + + /* + * For the initial lookup, look for and exact match or the left-adjacent + * record for our insertion point. This will also give us the record for + * start block contiguity tests. + */ + error = xfs_rmap_lookup_le_range(cur, bno, owner, offset, flags, + &PREV, &i); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + + ASSERT(PREV.rm_offset <= offset); + ASSERT(PREV.rm_offset + PREV.rm_blockcount >= new_endoff); + ASSERT((PREV.rm_flags & XFS_RMAP_UNWRITTEN) == oldext); + newext = ~oldext & XFS_RMAP_UNWRITTEN; + + /* + * Set flags determining what part of the previous oldext allocation + * extent is being replaced by a newext allocation. + */ + if (PREV.rm_offset == offset) + state |= RMAP_LEFT_FILLING; + if (PREV.rm_offset + PREV.rm_blockcount == new_endoff) + state |= RMAP_RIGHT_FILLING; + + /* Is there a left record that abuts our range? */ + error = xfs_rmap_find_left_neighbor(cur, bno, owner, offset, newext, + &LEFT, &i); + if (error) + goto done; + if (i) { + state |= RMAP_LEFT_VALID; + XFS_WANT_CORRUPTED_GOTO(mp, + LEFT.rm_startblock + LEFT.rm_blockcount <= bno, + done); + if (xfs_rmap_is_mergeable(&LEFT, owner, newext)) + state |= RMAP_LEFT_CONTIG; + } + + /* Is there a right record that abuts our range? 
*/ + error = xfs_rmap_lookup_eq(cur, bno + len, len, owner, offset + len, + newext, &i); + if (error) + goto done; + if (i) { + state |= RMAP_RIGHT_VALID; + error = xfs_rmap_get_rec(cur, &RIGHT, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, bno + len <= RIGHT.rm_startblock, + done); + trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, + cur->bc_private.a.agno, RIGHT.rm_startblock, + RIGHT.rm_blockcount, RIGHT.rm_owner, + RIGHT.rm_offset, RIGHT.rm_flags); + if (xfs_rmap_is_mergeable(&RIGHT, owner, newext)) + state |= RMAP_RIGHT_CONTIG; + } + + /* check that left + prev + right is not too long */ + if ((state & (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | + RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG)) == + (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | + RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG) && + (unsigned long)LEFT.rm_blockcount + len + + RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX) + state &= ~RMAP_RIGHT_CONTIG; + + trace_xfs_rmap_convert_state(mp, cur->bc_private.a.agno, state, + _RET_IP_); + /* + * Switch out based on the FILLING and CONTIG state bits. + */ + switch (state & (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | + RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG)) { + case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | + RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG: + /* + * Setting all of a previous oldext extent to newext. + * The left and right neighbors are both contiguous with new. 
+ */ + error = xfs_rmap_delete(cur, RIGHT.rm_startblock, + RIGHT.rm_blockcount, RIGHT.rm_owner, + RIGHT.rm_offset, RIGHT.rm_flags); + if (error) + goto done; + error = xfs_rmap_delete(cur, PREV.rm_startblock, + PREV.rm_blockcount, PREV.rm_owner, + PREV.rm_offset, PREV.rm_flags); + if (error) + goto done; + NEW = LEFT; + error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + NEW.rm_blockcount += PREV.rm_blockcount + RIGHT.rm_blockcount; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + break; + + case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG: + /* + * Setting all of a previous oldext extent to newext. + * The left neighbor is contiguous, the right is not. + */ + error = xfs_rmap_delete(cur, PREV.rm_startblock, + PREV.rm_blockcount, PREV.rm_owner, + PREV.rm_offset, PREV.rm_flags); + if (error) + goto done; + NEW = LEFT; + error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + NEW.rm_blockcount += PREV.rm_blockcount; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + break; + + case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG: + /* + * Setting all of a previous oldext extent to newext. + * The right neighbor is contiguous, the left is not. 
+ */ + error = xfs_rmap_delete(cur, RIGHT.rm_startblock, + RIGHT.rm_blockcount, RIGHT.rm_owner, + RIGHT.rm_offset, RIGHT.rm_flags); + if (error) + goto done; + NEW = PREV; + error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + NEW.rm_blockcount += RIGHT.rm_blockcount; + NEW.rm_flags = RIGHT.rm_flags; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + break; + + case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING: + /* + * Setting all of a previous oldext extent to newext. + * Neither the left nor right neighbors are contiguous with + * the new one. + */ + NEW = PREV; + error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + NEW.rm_flags = newext; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + break; + + case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG: + /* + * Setting the first part of a previous oldext extent to newext. + * The left neighbor is contiguous. + */ + NEW = PREV; + error = xfs_rmap_delete(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags); + if (error) + goto done; + NEW.rm_offset += len; + NEW.rm_startblock += len; + NEW.rm_blockcount -= len; + error = xfs_rmap_insert(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags); + if (error) + goto done; + NEW = LEFT; + error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + NEW.rm_blockcount += len; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + break; + + case RMAP_LEFT_FILLING: + /* + * Setting the first part of a previous oldext extent to newext. 
+ * The left neighbor is not contiguous. + */ + NEW = PREV; + error = xfs_rmap_delete(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags); + if (error) + goto done; + NEW.rm_offset += len; + NEW.rm_startblock += len; + NEW.rm_blockcount -= len; + error = xfs_rmap_insert(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags); + if (error) + goto done; + error = xfs_rmap_insert(cur, bno, len, owner, offset, newext); + if (error) + goto done; + break; + + case RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG: + /* + * Setting the last part of a previous oldext extent to newext. + * The right neighbor is contiguous with the new allocation. + */ + NEW = PREV; + error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + NEW.rm_blockcount = offset - NEW.rm_offset; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + NEW = RIGHT; + error = xfs_rmap_delete(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags); + if (error) + goto done; + NEW.rm_offset = offset; + NEW.rm_startblock = bno; + NEW.rm_blockcount += len; + error = xfs_rmap_insert(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags); + if (error) + goto done; + break; + + case RMAP_RIGHT_FILLING: + /* + * Setting the last part of a previous oldext extent to newext. + * The right neighbor is not contiguous. 
+ */ + NEW = PREV; + error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + NEW.rm_blockcount -= len; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + error = xfs_rmap_insert(cur, bno, len, owner, offset, newext); + if (error) + goto done; + break; + + case 0: + /* + * Setting the middle part of a previous oldext extent to + * newext. Contiguity is impossible here. + * One extent becomes three extents. + */ + /* new right extent - oldext */ + NEW.rm_startblock = bno + len; + NEW.rm_owner = owner; + NEW.rm_offset = new_endoff; + NEW.rm_blockcount = PREV.rm_offset + PREV.rm_blockcount - + new_endoff; + NEW.rm_flags = PREV.rm_flags; + error = xfs_rmap_insert(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, NEW.rm_offset, + NEW.rm_flags); + if (error) + goto done; + /* new left extent - oldext */ + NEW = PREV; + error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + NEW.rm_blockcount = offset - NEW.rm_offset; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + /* new middle extent - newext */ + NEW.rm_startblock = bno; + NEW.rm_blockcount = len; + NEW.rm_owner = owner; + NEW.rm_offset = offset; + NEW.rm_flags = newext; + error = xfs_rmap_insert(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, NEW.rm_offset, + NEW.rm_flags); + if (error) + goto done; + break; + + case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG: + case RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG: + case RMAP_LEFT_FILLING | RMAP_RIGHT_CONTIG: + case RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG: + case RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG: + case RMAP_LEFT_CONTIG: + case RMAP_RIGHT_CONTIG: + /* + * These cases are all impossible. 
+ */ + ASSERT(0); + } + + trace_xfs_rmap_convert_done(mp, cur->bc_private.a.agno, bno, len, + unwritten, oinfo); +done: + if (error) + trace_xfs_rmap_convert_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + #undef NEW #undef LEFT #undef RIGHT #undef PREV +/* + * Find an extent in the rmap btree and unmap it. For rmap extent types that + * can overlap (data fork rmaps on reflink filesystems) we must be careful + * that the prev/next records in the btree might belong to another owner. + * Therefore we must use delete+insert to alter any of the key fields. + * + * For every other situation there can only be one owner for a given extent, + * so we can call the regular _free function. + */ +STATIC int +xfs_rmap_unmap_shared( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + bool unwritten, + struct xfs_owner_info *oinfo) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_rmap_irec ltrec; + uint64_t ltoff; + int error = 0; + int i; + uint64_t owner; + uint64_t offset; + unsigned int flags; + + xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); + if (unwritten) + flags |= XFS_RMAP_UNWRITTEN; + trace_xfs_rmap_unmap(mp, cur->bc_private.a.agno, bno, len, + unwritten, oinfo); + + /* + * We should always have a left record because there's a static record + * for the AG headers at rm_startblock == 0 created by mkfs/growfs that + * will not ever be removed from the tree. + */ + error = xfs_rmap_lookup_le_range(cur, bno, owner, offset, flags, + <rec, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + ltoff = ltrec.rm_offset; + + /* Make sure the extent we found covers the entire freeing range. */ + XFS_WANT_CORRUPTED_GOTO(mp, ltrec.rm_startblock <= bno && + ltrec.rm_startblock + ltrec.rm_blockcount >= + bno + len, out_error); + + /* Make sure the owner matches what we expect to find in the tree. 
*/ + XFS_WANT_CORRUPTED_GOTO(mp, owner == ltrec.rm_owner, out_error); + + /* Make sure the unwritten flag matches. */ + XFS_WANT_CORRUPTED_GOTO(mp, (flags & XFS_RMAP_UNWRITTEN) == + (ltrec.rm_flags & XFS_RMAP_UNWRITTEN), out_error); + + /* Check the offset. */ + XFS_WANT_CORRUPTED_GOTO(mp, ltrec.rm_offset <= offset, out_error); + XFS_WANT_CORRUPTED_GOTO(mp, offset <= ltoff + ltrec.rm_blockcount, + out_error); + + if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) { + /* Exact match, simply remove the record from rmap tree. */ + error = xfs_rmap_delete(cur, ltrec.rm_startblock, + ltrec.rm_blockcount, ltrec.rm_owner, + ltrec.rm_offset, ltrec.rm_flags); + if (error) + goto out_error; + } else if (ltrec.rm_startblock == bno) { + /* + * Overlap left hand side of extent: move the start, trim the + * length and update the current record. + * + * ltbno ltlen + * Orig: |oooooooooooooooooooo| + * Freeing: |fffffffff| + * Result: |rrrrrrrrrr| + * bno len + */ + + /* Delete prev rmap. */ + error = xfs_rmap_delete(cur, ltrec.rm_startblock, + ltrec.rm_blockcount, ltrec.rm_owner, + ltrec.rm_offset, ltrec.rm_flags); + if (error) + goto out_error; + + /* Add an rmap at the new offset. */ + ltrec.rm_startblock += len; + ltrec.rm_blockcount -= len; + ltrec.rm_offset += len; + error = xfs_rmap_insert(cur, ltrec.rm_startblock, + ltrec.rm_blockcount, ltrec.rm_owner, + ltrec.rm_offset, ltrec.rm_flags); + if (error) + goto out_error; + } else if (ltrec.rm_startblock + ltrec.rm_blockcount == bno + len) { + /* + * Overlap right hand side of extent: trim the length and + * update the current record. 
+ * + * ltbno ltlen + * Orig: |oooooooooooooooooooo| + * Freeing: |fffffffff| + * Result: |rrrrrrrrrr| + * bno len + */ + error = xfs_rmap_lookup_eq(cur, ltrec.rm_startblock, + ltrec.rm_blockcount, ltrec.rm_owner, + ltrec.rm_offset, ltrec.rm_flags, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + ltrec.rm_blockcount -= len; + error = xfs_rmap_update(cur, <rec); + if (error) + goto out_error; + } else { + /* + * Overlap middle of extent: trim the length of the existing + * record to the length of the new left-extent size, increment + * the insertion position so we can insert a new record + * containing the remaining right-extent space. + * + * ltbno ltlen + * Orig: |oooooooooooooooooooo| + * Freeing: |fffffffff| + * Result: |rrrrr| |rrrr| + * bno len + */ + xfs_extlen_t orig_len = ltrec.rm_blockcount; + + /* Shrink the left side of the rmap */ + error = xfs_rmap_lookup_eq(cur, ltrec.rm_startblock, + ltrec.rm_blockcount, ltrec.rm_owner, + ltrec.rm_offset, ltrec.rm_flags, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + ltrec.rm_blockcount = bno - ltrec.rm_startblock; + error = xfs_rmap_update(cur, <rec); + if (error) + goto out_error; + + /* Add an rmap at the new offset */ + error = xfs_rmap_insert(cur, bno + len, + orig_len - len - ltrec.rm_blockcount, + ltrec.rm_owner, offset + len, + ltrec.rm_flags); + if (error) + goto out_error; + } + + trace_xfs_rmap_unmap_done(mp, cur->bc_private.a.agno, bno, len, + unwritten, oinfo); +out_error: + if (error) + trace_xfs_rmap_unmap_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +/* + * Find an extent in the rmap btree and map it. For rmap extent types that + * can overlap (data fork rmaps on reflink filesystems) we must be careful + * that the prev/next records in the btree might belong to another owner. + * Therefore we must use delete+insert to alter any of the key fields. 
+ * + * For every other situation there can only be one owner for a given extent, + * so we can call the regular _alloc function. + */ +STATIC int +xfs_rmap_map_shared( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + bool unwritten, + struct xfs_owner_info *oinfo) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_rmap_irec ltrec; + struct xfs_rmap_irec gtrec; + int have_gt; + int have_lt; + int error = 0; + int i; + uint64_t owner; + uint64_t offset; + unsigned int flags = 0; + + xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); + if (unwritten) + flags |= XFS_RMAP_UNWRITTEN; + trace_xfs_rmap_map(mp, cur->bc_private.a.agno, bno, len, + unwritten, oinfo); + + /* Is there a left record that abuts our range? */ + error = xfs_rmap_find_left_neighbor(cur, bno, owner, offset, flags, + <rec, &have_lt); + if (error) + goto out_error; + if (have_lt && + !xfs_rmap_is_mergeable(<rec, owner, flags)) + have_lt = 0; + + /* Is there a right record that abuts our range? */ + error = xfs_rmap_lookup_eq(cur, bno + len, len, owner, offset + len, + flags, &have_gt); + if (error) + goto out_error; + if (have_gt) { + error = xfs_rmap_get_rec(cur, >rec, &have_gt); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, have_gt == 1, out_error); + trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, + cur->bc_private.a.agno, gtrec.rm_startblock, + gtrec.rm_blockcount, gtrec.rm_owner, + gtrec.rm_offset, gtrec.rm_flags); + + if (!xfs_rmap_is_mergeable(>rec, owner, flags)) + have_gt = 0; + } + + if (have_lt && + ltrec.rm_startblock + ltrec.rm_blockcount == bno && + ltrec.rm_offset + ltrec.rm_blockcount == offset) { + /* + * Left edge contiguous, merge into left record. 
+ * + * ltbno ltlen + * orig: |ooooooooo| + * adding: |aaaaaaaaa| + * result: |rrrrrrrrrrrrrrrrrrr| + * bno len + */ + ltrec.rm_blockcount += len; + if (have_gt && + bno + len == gtrec.rm_startblock && + offset + len == gtrec.rm_offset) { + /* + * Right edge also contiguous, delete right record + * and merge into left record. + * + * ltbno ltlen gtbno gtlen + * orig: |ooooooooo| |ooooooooo| + * adding: |aaaaaaaaa| + * result: |rrrrrrrrrrrrrrrrrrrrrrrrrrrrr| + */ + ltrec.rm_blockcount += gtrec.rm_blockcount; + error = xfs_rmap_delete(cur, gtrec.rm_startblock, + gtrec.rm_blockcount, gtrec.rm_owner, + gtrec.rm_offset, gtrec.rm_flags); + if (error) + goto out_error; + } + + /* Point the cursor back to the left record and update. */ + error = xfs_rmap_lookup_eq(cur, ltrec.rm_startblock, + ltrec.rm_blockcount, ltrec.rm_owner, + ltrec.rm_offset, ltrec.rm_flags, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + + error = xfs_rmap_update(cur, <rec); + if (error) + goto out_error; + } else if (have_gt && + bno + len == gtrec.rm_startblock && + offset + len == gtrec.rm_offset) { + /* + * Right edge contiguous, merge into right record. + * + * gtbno gtlen + * Orig: |ooooooooo| + * adding: |aaaaaaaaa| + * Result: |rrrrrrrrrrrrrrrrrrr| + * bno len + */ + /* Delete the old record. */ + error = xfs_rmap_delete(cur, gtrec.rm_startblock, + gtrec.rm_blockcount, gtrec.rm_owner, + gtrec.rm_offset, gtrec.rm_flags); + if (error) + goto out_error; + + /* Move the start and re-add it. */ + gtrec.rm_startblock = bno; + gtrec.rm_blockcount += len; + gtrec.rm_offset = offset; + error = xfs_rmap_insert(cur, gtrec.rm_startblock, + gtrec.rm_blockcount, gtrec.rm_owner, + gtrec.rm_offset, gtrec.rm_flags); + if (error) + goto out_error; + } else { + /* + * No contiguous edge with identical owner, insert + * new record at current cursor position. 
+ */ + error = xfs_rmap_insert(cur, bno, len, owner, offset, flags); + if (error) + goto out_error; + } + + trace_xfs_rmap_map_done(mp, cur->bc_private.a.agno, bno, len, + unwritten, oinfo); +out_error: + if (error) + trace_xfs_rmap_map_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + struct xfs_rmap_query_range_info { xfs_rmap_query_range_fn fn; void *priv; @@ -1237,15 +2115,27 @@ xfs_rmap_finish_one( case XFS_RMAP_MAP: error = xfs_rmap_map(rcur, bno, blockcount, unwritten, &oinfo); break; + case XFS_RMAP_MAP_SHARED: + error = xfs_rmap_map_shared(rcur, bno, blockcount, unwritten, + &oinfo); + break; case XFS_RMAP_FREE: case XFS_RMAP_UNMAP: error = xfs_rmap_unmap(rcur, bno, blockcount, unwritten, &oinfo); break; + case XFS_RMAP_UNMAP_SHARED: + error = xfs_rmap_unmap_shared(rcur, bno, blockcount, unwritten, + &oinfo); + break; case XFS_RMAP_CONVERT: error = xfs_rmap_convert(rcur, bno, blockcount, !unwritten, &oinfo); break; + case XFS_RMAP_CONVERT_SHARED: + error = xfs_rmap_convert_shared(rcur, bno, blockcount, + !unwritten, &oinfo); + break; default: ASSERT(0); error = -EFSCORRUPTED; @@ -1263,9 +2153,10 @@ out_cur: */ static bool xfs_rmap_update_is_needed( - struct xfs_mount *mp) + struct xfs_mount *mp, + int whichfork) { - return xfs_sb_version_hasrmapbt(&mp->m_sb); + return xfs_sb_version_hasrmapbt(&mp->m_sb) && whichfork != XFS_COW_FORK; } /* @@ -1311,10 +2202,11 @@ xfs_rmap_map_extent( int whichfork, struct xfs_bmbt_irec *PREV) { - if (!xfs_rmap_update_is_needed(mp)) + if (!xfs_rmap_update_is_needed(mp, whichfork)) return 0; - return __xfs_rmap_add(mp, dfops, XFS_RMAP_MAP, ip->i_ino, + return __xfs_rmap_add(mp, dfops, xfs_is_reflink_inode(ip) ? 
+ XFS_RMAP_MAP_SHARED : XFS_RMAP_MAP, ip->i_ino, whichfork, PREV); } @@ -1327,10 +2219,11 @@ xfs_rmap_unmap_extent( int whichfork, struct xfs_bmbt_irec *PREV) { - if (!xfs_rmap_update_is_needed(mp)) + if (!xfs_rmap_update_is_needed(mp, whichfork)) return 0; - return __xfs_rmap_add(mp, dfops, XFS_RMAP_UNMAP, ip->i_ino, + return __xfs_rmap_add(mp, dfops, xfs_is_reflink_inode(ip) ? + XFS_RMAP_UNMAP_SHARED : XFS_RMAP_UNMAP, ip->i_ino, whichfork, PREV); } @@ -1343,10 +2236,11 @@ xfs_rmap_convert_extent( int whichfork, struct xfs_bmbt_irec *PREV) { - if (!xfs_rmap_update_is_needed(mp)) + if (!xfs_rmap_update_is_needed(mp, whichfork)) return 0; - return __xfs_rmap_add(mp, dfops, XFS_RMAP_CONVERT, ip->i_ino, + return __xfs_rmap_add(mp, dfops, xfs_is_reflink_inode(ip) ? + XFS_RMAP_CONVERT_SHARED : XFS_RMAP_CONVERT, ip->i_ino, whichfork, PREV); } @@ -1362,7 +2256,7 @@ xfs_rmap_alloc_extent( { struct xfs_bmbt_irec bmap; - if (!xfs_rmap_update_is_needed(mp)) + if (!xfs_rmap_update_is_needed(mp, XFS_DATA_FORK)) return 0; bmap.br_startblock = XFS_AGB_TO_FSB(mp, agno, bno); @@ -1386,7 +2280,7 @@ xfs_rmap_free_extent( { struct xfs_bmbt_irec bmap; - if (!xfs_rmap_update_is_needed(mp)) + if (!xfs_rmap_update_is_needed(mp, XFS_DATA_FORK)) return 0; bmap.br_startblock = XFS_AGB_TO_FSB(mp, agno, bno); diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index 71cf99a4acba..789930599339 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -206,4 +206,11 @@ int xfs_rmap_finish_one(struct xfs_trans *tp, enum xfs_rmap_intent_type type, xfs_fsblock_t startblock, xfs_filblks_t blockcount, xfs_exntst_t state, struct xfs_btree_cur **pcur); +int xfs_rmap_find_left_neighbor(struct xfs_btree_cur *cur, xfs_agblock_t bno, + uint64_t owner, uint64_t offset, unsigned int flags, + struct xfs_rmap_irec *irec, int *stat); +int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno, + uint64_t owner, uint64_t offset, unsigned int flags, + struct xfs_rmap_irec 
*irec, int *stat); + #endif /* __XFS_RMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 17b8eeb34ac8..83e672ff7577 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -35,6 +35,7 @@ #include "xfs_cksum.h" #include "xfs_error.h" #include "xfs_extent_busy.h" +#include "xfs_ag_resv.h" /* * Reverse map btree. @@ -512,6 +513,83 @@ void xfs_rmapbt_compute_maxlevels( struct xfs_mount *mp) { - mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(mp, - mp->m_rmap_mnr, mp->m_sb.sb_agblocks); + /* + * On a non-reflink filesystem, the maximum number of rmap + * records is the number of blocks in the AG, hence the max + * rmapbt height is log_$maxrecs($agblocks). However, with + * reflink each AG block can have up to 2^32 (per the refcount + * record format) owners, which means that theoretically we + * could face up to 2^64 rmap records. + * + * That effectively means that the max rmapbt height must be + * XFS_BTREE_MAXLEVELS. "Fortunately" we'll run out of AG + * blocks to feed the rmapbt long before the rmapbt reaches + * maximum height. The reflink code uses ag_resv_critical to + * disallow reflinking when less than 10% of the per-AG metadata + * block reservation since the fallback is a regular file copy. + */ + if (xfs_sb_version_hasreflink(&mp->m_sb)) + mp->m_rmap_maxlevels = XFS_BTREE_MAXLEVELS; + else + mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(mp, + mp->m_rmap_mnr, mp->m_sb.sb_agblocks); +} + +/* Calculate the rmap btree size for some records. */ +xfs_extlen_t +xfs_rmapbt_calc_size( + struct xfs_mount *mp, + unsigned long long len) +{ + return xfs_btree_calc_size(mp, mp->m_rmap_mnr, len); +} + +/* + * Calculate the maximum rmap btree size. + */ +xfs_extlen_t +xfs_rmapbt_max_size( + struct xfs_mount *mp) +{ + /* Bail out if we're uninitialized, which can happen in mkfs. 
*/ + if (mp->m_rmap_mxr[0] == 0) + return 0; + + return xfs_rmapbt_calc_size(mp, mp->m_sb.sb_agblocks); +} + +/* + * Figure out how many blocks to reserve and how many are used by this btree. + */ +int +xfs_rmapbt_calc_reserves( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_extlen_t *ask, + xfs_extlen_t *used) +{ + struct xfs_buf *agbp; + struct xfs_agf *agf; + xfs_extlen_t pool_len; + xfs_extlen_t tree_len; + int error; + + if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) + return 0; + + /* Reserve 1% of the AG or enough for 1 block per record. */ + pool_len = max(mp->m_sb.sb_agblocks / 100, xfs_rmapbt_max_size(mp)); + *ask += pool_len; + + error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); + if (error) + return error; + + agf = XFS_BUF_TO_AGF(agbp); + tree_len = be32_to_cpu(agf->agf_rmap_blocks); + xfs_buf_relse(agbp); + + *used += tree_len; + + return error; } diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h index e73a55357dab..2a9ac472fb15 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.h +++ b/fs/xfs/libxfs/xfs_rmap_btree.h @@ -58,4 +58,11 @@ struct xfs_btree_cur *xfs_rmapbt_init_cursor(struct xfs_mount *mp, int xfs_rmapbt_maxrecs(struct xfs_mount *mp, int blocklen, int leaf); extern void xfs_rmapbt_compute_maxlevels(struct xfs_mount *mp); +extern xfs_extlen_t xfs_rmapbt_calc_size(struct xfs_mount *mp, + unsigned long long len); +extern xfs_extlen_t xfs_rmapbt_max_size(struct xfs_mount *mp); + +extern int xfs_rmapbt_calc_reserves(struct xfs_mount *mp, + xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used); + #endif /* __XFS_RMAP_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 4aecc5fefe96..a70aec910626 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -38,6 +38,8 @@ #include "xfs_ialloc_btree.h" #include "xfs_log.h" #include "xfs_rmap_btree.h" +#include "xfs_bmap.h" +#include "xfs_refcount_btree.h" /* * Physical superblock buffer manipulations. Shared with libxfs in userspace. 
@@ -737,6 +739,13 @@ xfs_sb_mount_common( mp->m_rmap_mnr[0] = mp->m_rmap_mxr[0] / 2; mp->m_rmap_mnr[1] = mp->m_rmap_mxr[1] / 2; + mp->m_refc_mxr[0] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize, + true); + mp->m_refc_mxr[1] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize, + false); + mp->m_refc_mnr[0] = mp->m_refc_mxr[0] / 2; + mp->m_refc_mnr[1] = mp->m_refc_mxr[1] / 2; + mp->m_bsize = XFS_FSB_TO_BB(mp, 1); mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK, sbp->sb_inopblock); diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 0c5b30bd884c..c6f4eb46fe26 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -39,6 +39,7 @@ extern const struct xfs_buf_ops xfs_agf_buf_ops; extern const struct xfs_buf_ops xfs_agfl_buf_ops; extern const struct xfs_buf_ops xfs_allocbt_buf_ops; extern const struct xfs_buf_ops xfs_rmapbt_buf_ops; +extern const struct xfs_buf_ops xfs_refcountbt_buf_ops; extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops; extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops; extern const struct xfs_buf_ops xfs_bmbt_buf_ops; @@ -122,6 +123,7 @@ int xfs_log_calc_minimum_size(struct xfs_mount *); #define XFS_INO_REF 2 #define XFS_ATTR_BTREE_REF 1 #define XFS_DQUOT_REF 1 +#define XFS_REFC_BTREE_REF 1 /* * Flags for xfs_trans_ichgtime(). diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index 301ef2f4dbd6..b456cca1bfb2 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -67,13 +67,14 @@ xfs_calc_buf_res( * Per-extent log reservation for the btree changes involved in freeing or * allocating an extent. In classic XFS there were two trees that will be * modified (bnobt + cntbt). With rmap enabled, there are three trees - * (rmapbt). The number of blocks reserved is based on the formula: + * (rmapbt). With reflink, there are four trees (refcountbt). 
The number of + * blocks reserved is based on the formula: * * num trees * ((2 blocks/level * max depth) - 1) * * Keep in mind that max depth is calculated separately for each type of tree. */ -static uint +uint xfs_allocfree_log_count( struct xfs_mount *mp, uint num_ops) @@ -83,6 +84,8 @@ xfs_allocfree_log_count( blocks = num_ops * 2 * (2 * mp->m_ag_maxlevels - 1); if (xfs_sb_version_hasrmapbt(&mp->m_sb)) blocks += num_ops * (2 * mp->m_rmap_maxlevels - 1); + if (xfs_sb_version_hasreflink(&mp->m_sb)) + blocks += num_ops * (2 * mp->m_refc_maxlevels - 1); return blocks; } @@ -809,11 +812,18 @@ xfs_trans_resv_calc( * require a permanent reservation on space. */ resp->tr_write.tr_logres = xfs_calc_write_reservation(mp); - resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT; + if (xfs_sb_version_hasreflink(&mp->m_sb)) + resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK; + else + resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT; resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES; resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp); - resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT; + if (xfs_sb_version_hasreflink(&mp->m_sb)) + resp->tr_itruncate.tr_logcount = + XFS_ITRUNCATE_LOG_COUNT_REFLINK; + else + resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT; resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES; resp->tr_rename.tr_logres = xfs_calc_rename_reservation(mp); @@ -870,7 +880,10 @@ xfs_trans_resv_calc( resp->tr_growrtalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES; resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp); - resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT; + if (xfs_sb_version_hasreflink(&mp->m_sb)) + resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK; + else + resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT; resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES; /* diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h index 0eb46ed6d404..b7e5357d060a 
100644 --- a/fs/xfs/libxfs/xfs_trans_resv.h +++ b/fs/xfs/libxfs/xfs_trans_resv.h @@ -87,6 +87,7 @@ struct xfs_trans_resv { #define XFS_DEFAULT_LOG_COUNT 1 #define XFS_DEFAULT_PERM_LOG_COUNT 2 #define XFS_ITRUNCATE_LOG_COUNT 2 +#define XFS_ITRUNCATE_LOG_COUNT_REFLINK 8 #define XFS_INACTIVE_LOG_COUNT 2 #define XFS_CREATE_LOG_COUNT 2 #define XFS_CREATE_TMPFILE_LOG_COUNT 2 @@ -96,11 +97,13 @@ struct xfs_trans_resv { #define XFS_LINK_LOG_COUNT 2 #define XFS_RENAME_LOG_COUNT 2 #define XFS_WRITE_LOG_COUNT 2 +#define XFS_WRITE_LOG_COUNT_REFLINK 8 #define XFS_ADDAFORK_LOG_COUNT 2 #define XFS_ATTRINVAL_LOG_COUNT 1 #define XFS_ATTRSET_LOG_COUNT 3 #define XFS_ATTRRM_LOG_COUNT 3 void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp); +uint xfs_allocfree_log_count(struct xfs_mount *mp, uint num_ops); #endif /* __XFS_TRANS_RESV_H__ */ diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h index 41e0428d8175..7917f6e44286 100644 --- a/fs/xfs/libxfs/xfs_trans_space.h +++ b/fs/xfs/libxfs/xfs_trans_space.h @@ -21,6 +21,8 @@ /* * Components of space reservations. */ +#define XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) \ + (((mp)->m_rmap_mxr[0]) - ((mp)->m_rmap_mnr[0])) #define XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) \ (((mp)->m_alloc_mxr[0]) - ((mp)->m_alloc_mnr[0])) #define XFS_EXTENTADD_SPACE_RES(mp,w) (XFS_BM_MAXLEVELS(mp,w) - 1) @@ -28,6 +30,13 @@ (((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \ XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \ XFS_EXTENTADD_SPACE_RES(mp,w)) +#define XFS_SWAP_RMAP_SPACE_RES(mp,b,w)\ + (((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \ + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \ + XFS_EXTENTADD_SPACE_RES(mp,w) + \ + ((b + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) - 1) / \ + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) * \ + (mp)->m_rmap_maxlevels) #define XFS_DAENTER_1B(mp,w) \ ((w) == XFS_DATA_FORK ? 
(mp)->m_dir_geo->fsbcount : 1) #define XFS_DAENTER_DBS(mp,w) \ diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 3d503647f26b..8d74870468c2 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -90,6 +90,7 @@ typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */ */ #define XFS_DATA_FORK 0 #define XFS_ATTR_FORK 1 +#define XFS_COW_FORK 2 /* * Min numbers of data/attr fork btree root pointers. @@ -109,7 +110,7 @@ typedef enum { typedef enum { XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_RMAPi, XFS_BTNUM_BMAPi, - XFS_BTNUM_INOi, XFS_BTNUM_FINOi, XFS_BTNUM_MAX + XFS_BTNUM_INOi, XFS_BTNUM_FINOi, XFS_BTNUM_REFCi, XFS_BTNUM_MAX } xfs_btnum_t; struct xfs_name { diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 4a28fa91e3b1..3e57a56cf829 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -31,6 +31,7 @@ #include "xfs_bmap.h" #include "xfs_bmap_util.h" #include "xfs_bmap_btree.h" +#include "xfs_reflink.h" #include <linux/gfp.h> #include <linux/mpage.h> #include <linux/pagevec.h> @@ -39,6 +40,7 @@ /* flags for direct write completions */ #define XFS_DIO_FLAG_UNWRITTEN (1 << 0) #define XFS_DIO_FLAG_APPEND (1 << 1) +#define XFS_DIO_FLAG_COW (1 << 2) /* * structure owned by writepages passed to individual writepage calls @@ -287,6 +289,25 @@ xfs_end_io( error = -EIO; /* + * For a CoW extent, we need to move the mapping from the CoW fork + * to the data fork. If instead an error happened, just dump the + * new blocks. + */ + if (ioend->io_type == XFS_IO_COW) { + if (error) + goto done; + if (ioend->io_bio->bi_error) { + error = xfs_reflink_cancel_cow_range(ip, + ioend->io_offset, ioend->io_size); + goto done; + } + error = xfs_reflink_end_cow(ip, ioend->io_offset, + ioend->io_size); + if (error) + goto done; + } + + /* * For unwritten extents we need to issue transactions to convert a * range to normal written extens after the data I/O has finished. 
* Detecting and handling completion IO errors is done individually @@ -301,7 +322,8 @@ xfs_end_io( } else if (ioend->io_append_trans) { error = xfs_setfilesize_ioend(ioend, error); } else { - ASSERT(!xfs_ioend_is_append(ioend)); + ASSERT(!xfs_ioend_is_append(ioend) || + ioend->io_type == XFS_IO_COW); } done: @@ -315,7 +337,7 @@ xfs_end_bio( struct xfs_ioend *ioend = bio->bi_private; struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; - if (ioend->io_type == XFS_IO_UNWRITTEN) + if (ioend->io_type == XFS_IO_UNWRITTEN || ioend->io_type == XFS_IO_COW) queue_work(mp->m_unwritten_workqueue, &ioend->io_work); else if (ioend->io_append_trans) queue_work(mp->m_data_workqueue, &ioend->io_work); @@ -341,6 +363,7 @@ xfs_map_blocks( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; + ASSERT(type != XFS_IO_COW); if (type == XFS_IO_UNWRITTEN) bmapi_flags |= XFS_BMAPI_IGSTATE; @@ -355,6 +378,13 @@ xfs_map_blocks( offset_fsb = XFS_B_TO_FSBT(mp, offset); error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, imap, &nimaps, bmapi_flags); + /* + * Truncate an overwrite extent if there's a pending CoW + * reservation before the end of this extent. This forces us + * to come back to writepage to take care of the CoW. 
+ */ + if (nimaps && type == XFS_IO_OVERWRITE) + xfs_reflink_trim_irec_to_next_cow(ip, offset_fsb, imap); xfs_iunlock(ip, XFS_ILOCK_SHARED); if (error) @@ -362,7 +392,8 @@ xfs_map_blocks( if (type == XFS_IO_DELALLOC && (!nimaps || isnullstartblock(imap->br_startblock))) { - error = xfs_iomap_write_allocate(ip, offset, imap); + error = xfs_iomap_write_allocate(ip, XFS_DATA_FORK, offset, + imap); if (!error) trace_xfs_map_blocks_alloc(ip, offset, count, type, imap); return error; @@ -737,6 +768,56 @@ out_invalidate: return; } +static int +xfs_map_cow( + struct xfs_writepage_ctx *wpc, + struct inode *inode, + loff_t offset, + unsigned int *new_type) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_bmbt_irec imap; + bool is_cow = false, need_alloc = false; + int error; + + /* + * If we already have a valid COW mapping keep using it. + */ + if (wpc->io_type == XFS_IO_COW) { + wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, offset); + if (wpc->imap_valid) { + *new_type = XFS_IO_COW; + return 0; + } + } + + /* + * Else we need to check if there is a COW mapping at this offset. + */ + xfs_ilock(ip, XFS_ILOCK_SHARED); + is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap, &need_alloc); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (!is_cow) + return 0; + + /* + * And if the COW mapping has a delayed extent here we need to + * allocate real space for it now. 
+ */ + if (need_alloc) { + error = xfs_iomap_write_allocate(ip, XFS_COW_FORK, offset, + &imap); + if (error) + return error; + } + + wpc->io_type = *new_type = XFS_IO_COW; + wpc->imap_valid = true; + wpc->imap = imap; + return 0; +} + /* * We implement an immediate ioend submission policy here to avoid needing to * chain multiple ioends and hence nest mempool allocations which can violate @@ -769,6 +850,7 @@ xfs_writepage_map( int error = 0; int count = 0; int uptodate = 1; + unsigned int new_type; bh = head = page_buffers(page); offset = page_offset(page); @@ -789,22 +871,13 @@ xfs_writepage_map( continue; } - if (buffer_unwritten(bh)) { - if (wpc->io_type != XFS_IO_UNWRITTEN) { - wpc->io_type = XFS_IO_UNWRITTEN; - wpc->imap_valid = false; - } - } else if (buffer_delay(bh)) { - if (wpc->io_type != XFS_IO_DELALLOC) { - wpc->io_type = XFS_IO_DELALLOC; - wpc->imap_valid = false; - } - } else if (buffer_uptodate(bh)) { - if (wpc->io_type != XFS_IO_OVERWRITE) { - wpc->io_type = XFS_IO_OVERWRITE; - wpc->imap_valid = false; - } - } else { + if (buffer_unwritten(bh)) + new_type = XFS_IO_UNWRITTEN; + else if (buffer_delay(bh)) + new_type = XFS_IO_DELALLOC; + else if (buffer_uptodate(bh)) + new_type = XFS_IO_OVERWRITE; + else { if (PageUptodate(page)) ASSERT(buffer_mapped(bh)); /* @@ -817,6 +890,17 @@ xfs_writepage_map( continue; } + if (xfs_is_reflink_inode(XFS_I(inode))) { + error = xfs_map_cow(wpc, inode, offset, &new_type); + if (error) + goto out; + } + + if (wpc->io_type != new_type) { + wpc->io_type = new_type; + wpc->imap_valid = false; + } + if (wpc->imap_valid) wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, offset); @@ -1107,18 +1191,24 @@ xfs_map_direct( struct inode *inode, struct buffer_head *bh_result, struct xfs_bmbt_irec *imap, - xfs_off_t offset) + xfs_off_t offset, + bool is_cow) { uintptr_t *flags = (uintptr_t *)&bh_result->b_private; xfs_off_t size = bh_result->b_size; trace_xfs_get_blocks_map_direct(XFS_I(inode), offset, size, - ISUNWRITTEN(imap) ? 
XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, imap); + ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : is_cow ? XFS_IO_COW : + XFS_IO_OVERWRITE, imap); if (ISUNWRITTEN(imap)) { *flags |= XFS_DIO_FLAG_UNWRITTEN; set_buffer_defer_completion(bh_result); - } else if (offset + size > i_size_read(inode) || offset + size < 0) { + } else if (is_cow) { + *flags |= XFS_DIO_FLAG_COW; + set_buffer_defer_completion(bh_result); + } + if (offset + size > i_size_read(inode) || offset + size < 0) { *flags |= XFS_DIO_FLAG_APPEND; set_buffer_defer_completion(bh_result); } @@ -1164,6 +1254,44 @@ xfs_map_trim_size( bh_result->b_size = mapping_size; } +/* Bounce unaligned directio writes to the page cache. */ +static int +xfs_bounce_unaligned_dio_write( + struct xfs_inode *ip, + xfs_fileoff_t offset_fsb, + struct xfs_bmbt_irec *imap) +{ + struct xfs_bmbt_irec irec; + xfs_fileoff_t delta; + bool shared; + bool x; + int error; + + irec = *imap; + if (offset_fsb > irec.br_startoff) { + delta = offset_fsb - irec.br_startoff; + irec.br_blockcount -= delta; + irec.br_startblock += delta; + irec.br_startoff = offset_fsb; + } + error = xfs_reflink_trim_around_shared(ip, &irec, &shared, &x); + if (error) + return error; + + /* + * We're here because we're trying to do a directio write to a + * region that isn't aligned to a filesystem block. If any part + * of the extent is shared, fall back to buffered mode to handle + * the RMW. This is done by returning -EREMCHG ("remote addr + * changed"), which is caught further up the call stack. 
+ */ + if (shared) { + trace_xfs_reflink_bounce_dio_write(ip, imap); + return -EREMCHG; + } + return 0; +} + STATIC int __xfs_get_blocks( struct inode *inode, @@ -1183,6 +1311,8 @@ __xfs_get_blocks( xfs_off_t offset; ssize_t size; int new = 0; + bool is_cow = false; + bool need_alloc = false; BUG_ON(create && !direct); @@ -1208,8 +1338,26 @@ __xfs_get_blocks( end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); offset_fsb = XFS_B_TO_FSBT(mp, offset); - error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, - &imap, &nimaps, XFS_BMAPI_ENTIRE); + if (create && direct && xfs_is_reflink_inode(ip)) + is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap, + &need_alloc); + if (!is_cow) { + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, + &imap, &nimaps, XFS_BMAPI_ENTIRE); + /* + * Truncate an overwrite extent if there's a pending CoW + * reservation before the end of this extent. This + * forces us to come back to get_blocks to take care of + * the CoW. + */ + if (create && direct && nimaps && + imap.br_startblock != HOLESTARTBLOCK && + imap.br_startblock != DELAYSTARTBLOCK && + !ISUNWRITTEN(&imap)) + xfs_reflink_trim_irec_to_next_cow(ip, offset_fsb, + &imap); + } + ASSERT(!need_alloc); if (error) goto out_unlock; @@ -1261,6 +1409,13 @@ __xfs_get_blocks( if (imap.br_startblock != HOLESTARTBLOCK && imap.br_startblock != DELAYSTARTBLOCK && (create || !ISUNWRITTEN(&imap))) { + if (create && direct && !is_cow) { + error = xfs_bounce_unaligned_dio_write(ip, offset_fsb, + &imap); + if (error) + return error; + } + xfs_map_buffer(inode, bh_result, &imap, offset); if (ISUNWRITTEN(&imap)) set_buffer_unwritten(bh_result); @@ -1269,7 +1424,8 @@ __xfs_get_blocks( if (dax_fault) ASSERT(!ISUNWRITTEN(&imap)); else - xfs_map_direct(inode, bh_result, &imap, offset); + xfs_map_direct(inode, bh_result, &imap, offset, + is_cow); } } @@ -1391,11 +1547,14 @@ xfs_end_io_direct_write( i_size_write(inode, offset + size); spin_unlock(&ip->i_flags_lock); + if (flags & 
XFS_DIO_FLAG_COW) + error = xfs_reflink_end_cow(ip, offset, size); if (flags & XFS_DIO_FLAG_UNWRITTEN) { trace_xfs_end_io_direct_write_unwritten(ip, offset, size); error = xfs_iomap_write_unwritten(ip, offset, size); - } else if (flags & XFS_DIO_FLAG_APPEND) { + } + if (flags & XFS_DIO_FLAG_APPEND) { trace_xfs_end_io_direct_write_append(ip, offset, size); error = xfs_setfilesize(ip, offset, size); @@ -1425,6 +1584,17 @@ xfs_vm_bmap( trace_xfs_vm_bmap(XFS_I(inode)); xfs_ilock(ip, XFS_IOLOCK_SHARED); + + /* + * The swap code (ab-)uses ->bmap to get a block mapping and then + * bypasses the file system for actual I/O. We really can't allow + * that on reflink inodes, so we have to skip out here. And yes, + * 0 is the magic code for a bmap error.. + */ + if (xfs_is_reflink_inode(ip)) { + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return 0; + } filemap_write_and_wait(mapping); xfs_iunlock(ip, XFS_IOLOCK_SHARED); return generic_block_bmap(mapping, block, xfs_get_blocks); diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index 1950e3bca2ac..b3c6634f9518 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -28,13 +28,15 @@ enum { XFS_IO_DELALLOC, /* covers delalloc region */ XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */ XFS_IO_OVERWRITE, /* covers already allocated extent */ + XFS_IO_COW, /* covers copy-on-write extent */ }; #define XFS_IO_TYPES \ { XFS_IO_INVALID, "invalid" }, \ { XFS_IO_DELALLOC, "delalloc" }, \ { XFS_IO_UNWRITTEN, "unwritten" }, \ - { XFS_IO_OVERWRITE, "overwrite" } + { XFS_IO_OVERWRITE, "overwrite" }, \ + { XFS_IO_COW, "CoW" } /* * Structure for buffered I/O completions. diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c new file mode 100644 index 000000000000..9bf57c76623b --- /dev/null +++ b/fs/xfs/xfs_bmap_item.c @@ -0,0 +1,508 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J.
Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_buf_item.h" +#include "xfs_bmap_item.h" +#include "xfs_log.h" +#include "xfs_bmap.h" +#include "xfs_icache.h" +#include "xfs_trace.h" + + +kmem_zone_t *xfs_bui_zone; +kmem_zone_t *xfs_bud_zone; + +static inline struct xfs_bui_log_item *BUI_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_bui_log_item, bui_item); +} + +void +xfs_bui_item_free( + struct xfs_bui_log_item *buip) +{ + kmem_zone_free(xfs_bui_zone, buip); +} + +STATIC void +xfs_bui_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + struct xfs_bui_log_item *buip = BUI_ITEM(lip); + + *nvecs += 1; + *nbytes += xfs_bui_log_format_sizeof(buip->bui_format.bui_nextents); +} + +/* + * This is called to fill in the vector of log iovecs for the + * given bui log item. We use only 1 iovec, and we point that + * at the bui_log_format structure embedded in the bui item. 
+ * It is at this point that we assert that all of the extent + * slots in the bui item have been filled. + */ +STATIC void +xfs_bui_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_bui_log_item *buip = BUI_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; + + ASSERT(atomic_read(&buip->bui_next_extent) == + buip->bui_format.bui_nextents); + + buip->bui_format.bui_type = XFS_LI_BUI; + buip->bui_format.bui_size = 1; + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_BUI_FORMAT, &buip->bui_format, + xfs_bui_log_format_sizeof(buip->bui_format.bui_nextents)); +} + +/* + * Pinning has no meaning for an bui item, so just return. + */ +STATIC void +xfs_bui_item_pin( + struct xfs_log_item *lip) +{ +} + +/* + * The unpin operation is the last place an BUI is manipulated in the log. It is + * either inserted in the AIL or aborted in the event of a log I/O error. In + * either case, the BUI transaction has been successfully committed to make it + * this far. Therefore, we expect whoever committed the BUI to either construct + * and commit the BUD or drop the BUD's reference in the event of error. Simply + * drop the log's BUI reference now that the log is done with it. + */ +STATIC void +xfs_bui_item_unpin( + struct xfs_log_item *lip, + int remove) +{ + struct xfs_bui_log_item *buip = BUI_ITEM(lip); + + xfs_bui_release(buip); +} + +/* + * BUI items have no locking or pushing. However, since BUIs are pulled from + * the AIL when their corresponding BUDs are committed to disk, their situation + * is very similar to being pinned. Return XFS_ITEM_PINNED so that the caller + * will eventually flush the log. This should help in getting the BUI out of + * the AIL. + */ +STATIC uint +xfs_bui_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) +{ + return XFS_ITEM_PINNED; +} + +/* + * The BUI has been either committed or aborted if the transaction has been + * cancelled. 
If the transaction was cancelled, an BUD isn't going to be + * constructed and thus we free the BUI here directly. + */ +STATIC void +xfs_bui_item_unlock( + struct xfs_log_item *lip) +{ + if (lip->li_flags & XFS_LI_ABORTED) + xfs_bui_item_free(BUI_ITEM(lip)); +} + +/* + * The BUI is logged only once and cannot be moved in the log, so simply return + * the lsn at which it's been logged. + */ +STATIC xfs_lsn_t +xfs_bui_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + return lsn; +} + +/* + * The BUI dependency tracking op doesn't do squat. It can't because + * it doesn't know where the free extent is coming from. The dependency + * tracking has to be handled by the "enclosing" metadata object. For + * example, for inodes, the inode is locked throughout the extent freeing + * so the dependency should be recorded there. + */ +STATIC void +xfs_bui_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ +} + +/* + * This is the ops vector shared by all bui log items. + */ +static const struct xfs_item_ops xfs_bui_item_ops = { + .iop_size = xfs_bui_item_size, + .iop_format = xfs_bui_item_format, + .iop_pin = xfs_bui_item_pin, + .iop_unpin = xfs_bui_item_unpin, + .iop_unlock = xfs_bui_item_unlock, + .iop_committed = xfs_bui_item_committed, + .iop_push = xfs_bui_item_push, + .iop_committing = xfs_bui_item_committing, +}; + +/* + * Allocate and initialize an bui item with the given number of extents. + */ +struct xfs_bui_log_item * +xfs_bui_init( + struct xfs_mount *mp) + +{ + struct xfs_bui_log_item *buip; + + buip = kmem_zone_zalloc(xfs_bui_zone, KM_SLEEP); + + xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops); + buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS; + buip->bui_format.bui_id = (uintptr_t)(void *)buip; + atomic_set(&buip->bui_next_extent, 0); + atomic_set(&buip->bui_refcount, 2); + + return buip; +} + +/* + * Freeing the BUI requires that we remove it from the AIL if it has already + * been placed there. 
However, the BUI may not yet have been placed in the AIL + * when called by xfs_bui_release() from BUD processing due to the ordering of + * committed vs unpin operations in bulk insert operations. Hence the reference + * count to ensure only the last caller frees the BUI. + */ +void +xfs_bui_release( + struct xfs_bui_log_item *buip) +{ + if (atomic_dec_and_test(&buip->bui_refcount)) { + xfs_trans_ail_remove(&buip->bui_item, SHUTDOWN_LOG_IO_ERROR); + xfs_bui_item_free(buip); + } +} + +static inline struct xfs_bud_log_item *BUD_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_bud_log_item, bud_item); +} + +STATIC void +xfs_bud_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + *nvecs += 1; + *nbytes += sizeof(struct xfs_bud_log_format); +} + +/* + * This is called to fill in the vector of log iovecs for the + * given bud log item. We use only 1 iovec, and we point that + * at the bud_log_format structure embedded in the bud item. + * It is at this point that we assert that all of the extent + * slots in the bud item have been filled. + */ +STATIC void +xfs_bud_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_bud_log_item *budp = BUD_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; + + budp->bud_format.bud_type = XFS_LI_BUD; + budp->bud_format.bud_size = 1; + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_BUD_FORMAT, &budp->bud_format, + sizeof(struct xfs_bud_log_format)); +} + +/* + * Pinning has no meaning for an bud item, so just return. + */ +STATIC void +xfs_bud_item_pin( + struct xfs_log_item *lip) +{ +} + +/* + * Since pinning has no meaning for an bud item, unpinning does + * not either. + */ +STATIC void +xfs_bud_item_unpin( + struct xfs_log_item *lip, + int remove) +{ +} + +/* + * There isn't much you can do to push on an bud item. It is simply stuck + * waiting for the log to be flushed to disk. 
+ */ +STATIC uint +xfs_bud_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) +{ + return XFS_ITEM_PINNED; +} + +/* + * The BUD is either committed or aborted if the transaction is cancelled. If + * the transaction is cancelled, drop our reference to the BUI and free the + * BUD. + */ +STATIC void +xfs_bud_item_unlock( + struct xfs_log_item *lip) +{ + struct xfs_bud_log_item *budp = BUD_ITEM(lip); + + if (lip->li_flags & XFS_LI_ABORTED) { + xfs_bui_release(budp->bud_buip); + kmem_zone_free(xfs_bud_zone, budp); + } +} + +/* + * When the bud item is committed to disk, all we need to do is delete our + * reference to our partner bui item and then free ourselves. Since we're + * freeing ourselves we must return -1 to keep the transaction code from + * further referencing this item. + */ +STATIC xfs_lsn_t +xfs_bud_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + struct xfs_bud_log_item *budp = BUD_ITEM(lip); + + /* + * Drop the BUI reference regardless of whether the BUD has been + * aborted. Once the BUD transaction is constructed, it is the sole + * responsibility of the BUD to release the BUI (even if the BUI is + * aborted due to log I/O error). + */ + xfs_bui_release(budp->bud_buip); + kmem_zone_free(xfs_bud_zone, budp); + + return (xfs_lsn_t)-1; +} + +/* + * The BUD dependency tracking op doesn't do squat. It can't because + * it doesn't know where the free extent is coming from. The dependency + * tracking has to be handled by the "enclosing" metadata object. For + * example, for inodes, the inode is locked throughout the extent freeing + * so the dependency should be recorded there. + */ +STATIC void +xfs_bud_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ +} + +/* + * This is the ops vector shared by all bud log items. 
+ */ +static const struct xfs_item_ops xfs_bud_item_ops = { + .iop_size = xfs_bud_item_size, + .iop_format = xfs_bud_item_format, + .iop_pin = xfs_bud_item_pin, + .iop_unpin = xfs_bud_item_unpin, + .iop_unlock = xfs_bud_item_unlock, + .iop_committed = xfs_bud_item_committed, + .iop_push = xfs_bud_item_push, + .iop_committing = xfs_bud_item_committing, +}; + +/* + * Allocate and initialize an bud item with the given number of extents. + */ +struct xfs_bud_log_item * +xfs_bud_init( + struct xfs_mount *mp, + struct xfs_bui_log_item *buip) + +{ + struct xfs_bud_log_item *budp; + + budp = kmem_zone_zalloc(xfs_bud_zone, KM_SLEEP); + xfs_log_item_init(mp, &budp->bud_item, XFS_LI_BUD, &xfs_bud_item_ops); + budp->bud_buip = buip; + budp->bud_format.bud_bui_id = buip->bui_format.bui_id; + + return budp; +} + +/* + * Process a bmap update intent item that was recovered from the log. + * We need to update some inode's bmbt. + */ +int +xfs_bui_recover( + struct xfs_mount *mp, + struct xfs_bui_log_item *buip) +{ + int error = 0; + unsigned int bui_type; + struct xfs_map_extent *bmap; + xfs_fsblock_t startblock_fsb; + xfs_fsblock_t inode_fsb; + bool op_ok; + struct xfs_bud_log_item *budp; + enum xfs_bmap_intent_type type; + int whichfork; + xfs_exntst_t state; + struct xfs_trans *tp; + struct xfs_inode *ip = NULL; + struct xfs_defer_ops dfops; + xfs_fsblock_t firstfsb; + + ASSERT(!test_bit(XFS_BUI_RECOVERED, &buip->bui_flags)); + + /* Only one mapping operation per BUI... */ + if (buip->bui_format.bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) { + set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); + xfs_bui_release(buip); + return -EIO; + } + + /* + * First check the validity of the extent described by the + * BUI. If anything is bad, then toss the BUI. 
+ */ + bmap = &buip->bui_format.bui_extents[0]; + startblock_fsb = XFS_BB_TO_FSB(mp, + XFS_FSB_TO_DADDR(mp, bmap->me_startblock)); + inode_fsb = XFS_BB_TO_FSB(mp, XFS_FSB_TO_DADDR(mp, + XFS_INO_TO_FSB(mp, bmap->me_owner))); + switch (bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK) { + case XFS_BMAP_MAP: + case XFS_BMAP_UNMAP: + op_ok = true; + break; + default: + op_ok = false; + break; + } + if (!op_ok || startblock_fsb == 0 || + bmap->me_len == 0 || + inode_fsb == 0 || + startblock_fsb >= mp->m_sb.sb_dblocks || + bmap->me_len >= mp->m_sb.sb_agblocks || + inode_fsb >= mp->m_sb.sb_dblocks || + (bmap->me_flags & ~XFS_BMAP_EXTENT_FLAGS)) { + /* + * This will pull the BUI from the AIL and + * free the memory associated with it. + */ + set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); + xfs_bui_release(buip); + return -EIO; + } + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); + if (error) + return error; + budp = xfs_trans_get_bud(tp, buip); + + /* Grab the inode. */ + error = xfs_iget(mp, tp, bmap->me_owner, 0, XFS_ILOCK_EXCL, &ip); + if (error) + goto err_inode; + + if (VFS_I(ip)->i_nlink == 0) + xfs_iflags_set(ip, XFS_IRECOVERY); + xfs_defer_init(&dfops, &firstfsb); + + /* Process deferred bmap item. */ + state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? + XFS_EXT_UNWRITTEN : XFS_EXT_NORM; + whichfork = (bmap->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + bui_type = bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; + switch (bui_type) { + case XFS_BMAP_MAP: + case XFS_BMAP_UNMAP: + type = bui_type; + break; + default: + error = -EFSCORRUPTED; + goto err_dfops; + } + xfs_trans_ijoin(tp, ip, 0); + + error = xfs_trans_log_finish_bmap_update(tp, budp, &dfops, type, + ip, whichfork, bmap->me_startoff, + bmap->me_startblock, bmap->me_len, + state); + if (error) + goto err_dfops; + + /* Finish transaction, free inodes. 
*/ + error = xfs_defer_finish(&tp, &dfops, NULL); + if (error) + goto err_dfops; + + set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + IRELE(ip); + + return error; + +err_dfops: + xfs_defer_cancel(&dfops); +err_inode: + xfs_trans_cancel(tp); + if (ip) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + IRELE(ip); + } + return error; +} diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h new file mode 100644 index 000000000000..c867daae4a3c --- /dev/null +++ b/fs/xfs/xfs_bmap_item.h @@ -0,0 +1,98 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#ifndef __XFS_BMAP_ITEM_H__ +#define __XFS_BMAP_ITEM_H__ + +/* + * There are (currently) two pairs of bmap btree redo item types: map & unmap. + * The common abbreviations for these are BUI (bmap update intent) and BUD + * (bmap update done). The redo item type is encoded in the flags field of + * each xfs_map_extent. + * + * *I items should be recorded in the *first* of a series of rolled + * transactions, and the *D items should be recorded in the same transaction + * that records the associated bmbt updates. 
+ * + * Should the system crash after the commit of the first transaction but + * before the commit of the final transaction in a series, log recovery will + * use the redo information recorded by the intent items to replay the + * bmbt metadata updates in the non-first transaction. + */ + +/* kernel only BUI/BUD definitions */ + +struct xfs_mount; +struct kmem_zone; + +/* + * Max number of extents in fast allocation path. + */ +#define XFS_BUI_MAX_FAST_EXTENTS 1 + +/* + * Define BUI flag bits. Manipulated by set/clear/test_bit operators. + */ +#define XFS_BUI_RECOVERED 1 + +/* + * This is the "bmap update intent" log item. It is used to log the fact that + * some reverse mappings need to change. It is used in conjunction with the + * "bmap update done" log item described below. + * + * These log items follow the same rules as struct xfs_efi_log_item; see the + * comments about that structure (in xfs_extfree_item.h) for more details. + */ +struct xfs_bui_log_item { + struct xfs_log_item bui_item; + atomic_t bui_refcount; + atomic_t bui_next_extent; + unsigned long bui_flags; /* misc flags */ + struct xfs_bui_log_format bui_format; +}; + +static inline size_t +xfs_bui_log_item_sizeof( + unsigned int nr) +{ + return offsetof(struct xfs_bui_log_item, bui_format) + + xfs_bui_log_format_sizeof(nr); +} + +/* + * This is the "bmap update done" log item. It is used to log the fact that + * some bmbt updates mentioned in an earlier bui item have been performed. 
+ */ +struct xfs_bud_log_item { + struct xfs_log_item bud_item; + struct xfs_bui_log_item *bud_buip; + struct xfs_bud_log_format bud_format; +}; + +extern struct kmem_zone *xfs_bui_zone; +extern struct kmem_zone *xfs_bud_zone; + +struct xfs_bui_log_item *xfs_bui_init(struct xfs_mount *); +struct xfs_bud_log_item *xfs_bud_init(struct xfs_mount *, + struct xfs_bui_log_item *); +void xfs_bui_item_free(struct xfs_bui_log_item *); +void xfs_bui_release(struct xfs_bui_log_item *); +int xfs_bui_recover(struct xfs_mount *mp, struct xfs_bui_log_item *buip); + +#endif /* __XFS_BMAP_ITEM_H__ */ diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index e827d657c314..552465e011ec 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -42,6 +42,9 @@ #include "xfs_icache.h" #include "xfs_log.h" #include "xfs_rmap_btree.h" +#include "xfs_iomap.h" +#include "xfs_reflink.h" +#include "xfs_refcount.h" /* Kernel only BMAP related definitions and functions */ @@ -389,11 +392,13 @@ xfs_bmap_count_blocks( STATIC int xfs_getbmapx_fix_eof_hole( xfs_inode_t *ip, /* xfs incore inode pointer */ + int whichfork, struct getbmapx *out, /* output structure */ int prealloced, /* this is a file with * preallocated data space */ __int64_t end, /* last block requested */ - xfs_fsblock_t startblock) + xfs_fsblock_t startblock, + bool moretocome) { __int64_t fixlen; xfs_mount_t *mp; /* file system mount point */ @@ -418,8 +423,9 @@ xfs_getbmapx_fix_eof_hole( else out->bmv_block = xfs_fsb_to_db(ip, startblock); fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset); - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); - if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) && + ifp = XFS_IFORK_PTR(ip, whichfork); + if (!moretocome && + xfs_iext_bno_to_ext(ifp, fileblock, &lastx) && (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1)) out->bmv_oflags |= BMV_OF_LAST; } @@ -427,6 +433,81 @@ xfs_getbmapx_fix_eof_hole( return 1; } +/* Adjust the reported bmap around shared/unshared extent 
transitions. */ +STATIC int +xfs_getbmap_adjust_shared( + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *map, + struct getbmapx *out, + struct xfs_bmbt_irec *next_map) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_agnumber_t agno; + xfs_agblock_t agbno; + xfs_agblock_t ebno; + xfs_extlen_t elen; + xfs_extlen_t nlen; + int error; + + next_map->br_startblock = NULLFSBLOCK; + next_map->br_startoff = NULLFILEOFF; + next_map->br_blockcount = 0; + + /* Only written data blocks can be shared. */ + if (!xfs_is_reflink_inode(ip) || whichfork != XFS_DATA_FORK || + map->br_startblock == DELAYSTARTBLOCK || + map->br_startblock == HOLESTARTBLOCK || + ISUNWRITTEN(map)) + return 0; + + agno = XFS_FSB_TO_AGNO(mp, map->br_startblock); + agbno = XFS_FSB_TO_AGBNO(mp, map->br_startblock); + error = xfs_reflink_find_shared(mp, agno, agbno, map->br_blockcount, + &ebno, &elen, true); + if (error) + return error; + + if (ebno == NULLAGBLOCK) { + /* No shared blocks at all. */ + return 0; + } else if (agbno == ebno) { + /* + * Shared extent at (agbno, elen). Shrink the reported + * extent length and prepare to move the start of map[i] + * to agbno+elen, with the aim of (re)formatting the new + * map[i] the next time through the inner loop. + */ + out->bmv_length = XFS_FSB_TO_BB(mp, elen); + out->bmv_oflags |= BMV_OF_SHARED; + if (elen != map->br_blockcount) { + *next_map = *map; + next_map->br_startblock += elen; + next_map->br_startoff += elen; + next_map->br_blockcount -= elen; + } + map->br_blockcount -= elen; + } else { + /* + * There's an unshared extent (agbno, ebno - agbno) + * followed by shared extent at (ebno, elen). Shrink + * the reported extent length to cover only the unshared + * extent and prepare to move up the start of map[i] to + * ebno, with the aim of (re)formatting the new map[i] + * the next time through the inner loop. 
+ */ + *next_map = *map; + nlen = ebno - agbno; + out->bmv_length = XFS_FSB_TO_BB(mp, nlen); + next_map->br_startblock += nlen; + next_map->br_startoff += nlen; + next_map->br_blockcount -= nlen; + map->br_blockcount -= nlen; + } + + return 0; +} + /* * Get inode's extents as described in bmv, and format for output. * Calls formatter to fill the user's buffer until all extents @@ -459,12 +540,28 @@ xfs_getbmap( int iflags; /* interface flags */ int bmapi_flags; /* flags for xfs_bmapi */ int cur_ext = 0; + struct xfs_bmbt_irec inject_map; mp = ip->i_mount; iflags = bmv->bmv_iflags; - whichfork = iflags & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK; - if (whichfork == XFS_ATTR_FORK) { +#ifndef DEBUG + /* Only allow CoW fork queries if we're debugging. */ + if (iflags & BMV_IF_COWFORK) + return -EINVAL; +#endif + if ((iflags & BMV_IF_ATTRFORK) && (iflags & BMV_IF_COWFORK)) + return -EINVAL; + + if (iflags & BMV_IF_ATTRFORK) + whichfork = XFS_ATTR_FORK; + else if (iflags & BMV_IF_COWFORK) + whichfork = XFS_COW_FORK; + else + whichfork = XFS_DATA_FORK; + + switch (whichfork) { + case XFS_ATTR_FORK: if (XFS_IFORK_Q(ip)) { if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS && ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE && @@ -480,7 +577,20 @@ xfs_getbmap( prealloced = 0; fixlen = 1LL << 32; - } else { + break; + case XFS_COW_FORK: + if (ip->i_cformat != XFS_DINODE_FMT_EXTENTS) + return -EINVAL; + + if (xfs_get_cowextsz_hint(ip)) { + prealloced = 1; + fixlen = mp->m_super->s_maxbytes; + } else { + prealloced = 0; + fixlen = XFS_ISIZE(ip); + } + break; + default: if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS && ip->i_d.di_format != XFS_DINODE_FMT_BTREE && ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) @@ -494,6 +604,7 @@ xfs_getbmap( prealloced = 0; fixlen = XFS_ISIZE(ip); } + break; } if (bmv->bmv_length == -1) { @@ -520,7 +631,8 @@ xfs_getbmap( return -ENOMEM; xfs_ilock(ip, XFS_IOLOCK_SHARED); - if (whichfork == XFS_DATA_FORK) { + switch (whichfork) { + case XFS_DATA_FORK: 
if (!(iflags & BMV_IF_DELALLOC) && (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) { error = filemap_write_and_wait(VFS_I(ip)->i_mapping); @@ -538,8 +650,14 @@ xfs_getbmap( } lock = xfs_ilock_data_map_shared(ip); - } else { + break; + case XFS_COW_FORK: + lock = XFS_ILOCK_SHARED; + xfs_ilock(ip, lock); + break; + case XFS_ATTR_FORK: lock = xfs_ilock_attr_map_shared(ip); + break; } /* @@ -581,7 +699,8 @@ xfs_getbmap( goto out_free_map; ASSERT(nmap <= subnex); - for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) { + for (i = 0; i < nmap && nexleft && bmv->bmv_length && + cur_ext < bmv->bmv_count; i++) { out[cur_ext].bmv_oflags = 0; if (map[i].br_state == XFS_EXT_UNWRITTEN) out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC; @@ -614,9 +733,16 @@ xfs_getbmap( goto out_free_map; } - if (!xfs_getbmapx_fix_eof_hole(ip, &out[cur_ext], - prealloced, bmvend, - map[i].br_startblock)) + /* Is this a shared block? */ + error = xfs_getbmap_adjust_shared(ip, whichfork, + &map[i], &out[cur_ext], &inject_map); + if (error) + goto out_free_map; + + if (!xfs_getbmapx_fix_eof_hole(ip, whichfork, + &out[cur_ext], prealloced, bmvend, + map[i].br_startblock, + inject_map.br_startblock != NULLFSBLOCK)) goto out_free_map; bmv->bmv_offset = @@ -636,11 +762,16 @@ xfs_getbmap( continue; } - nexleft--; + if (inject_map.br_startblock != NULLFSBLOCK) { + map[i] = inject_map; + i--; + } else + nexleft--; bmv->bmv_entries++; cur_ext++; } - } while (nmap && nexleft && bmv->bmv_length); + } while (nmap && nexleft && bmv->bmv_length && + cur_ext < bmv->bmv_count); out_free_map: kmem_free(map); @@ -1433,8 +1564,8 @@ xfs_insert_file_space( */ static int xfs_swap_extents_check_format( - xfs_inode_t *ip, /* target inode */ - xfs_inode_t *tip) /* tmp inode */ + struct xfs_inode *ip, /* target inode */ + struct xfs_inode *tip) /* tmp inode */ { /* Should never get a local format */ @@ -1450,6 +1581,13 @@ xfs_swap_extents_check_format( return -EINVAL; /* + * If we have to use the (expensive) rmap 
swap method, we can + * handle any number of extents and any format. + */ + if (xfs_sb_version_hasrmapbt(&ip->i_mount->m_sb)) + return 0; + + /* * if the target inode is in extent form and the temp inode is in btree * form then we will end up with the target inode in the wrong format * as we already know there are less extents in the temp inode. @@ -1518,125 +1656,161 @@ xfs_swap_extent_flush( return 0; } -int -xfs_swap_extents( - xfs_inode_t *ip, /* target inode */ - xfs_inode_t *tip, /* tmp inode */ - xfs_swapext_t *sxp) +/* + * Move extents from one file to another, when rmap is enabled. + */ +STATIC int +xfs_swap_extent_rmap( + struct xfs_trans **tpp, + struct xfs_inode *ip, + struct xfs_inode *tip) { - xfs_mount_t *mp = ip->i_mount; - xfs_trans_t *tp; - xfs_bstat_t *sbp = &sxp->sx_stat; - xfs_ifork_t *tempifp, *ifp, *tifp; - int src_log_flags, target_log_flags; - int error = 0; - int aforkblks = 0; - int taforkblks = 0; - __uint64_t tmp; - int lock_flags; - - /* XXX: we can't do this with rmap, will fix later */ - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) - return -EOPNOTSUPP; - - tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL); - if (!tempifp) { - error = -ENOMEM; - goto out; - } + struct xfs_bmbt_irec irec; + struct xfs_bmbt_irec uirec; + struct xfs_bmbt_irec tirec; + xfs_fileoff_t offset_fsb; + xfs_fileoff_t end_fsb; + xfs_filblks_t count_fsb; + xfs_fsblock_t firstfsb; + struct xfs_defer_ops dfops; + int error; + xfs_filblks_t ilen; + xfs_filblks_t rlen; + int nimaps; + __uint64_t tip_flags2; /* - * Lock the inodes against other IO, page faults and truncate to - * begin with. Then we can ensure the inodes are flushed and have no - * page cache safely. Once we have done this we can take the ilocks and - * do the rest of the checks. + * If the source file has shared blocks, we must flag the donor + * file as having shared blocks so that we get the shared-block + * rmap functions when we go to fix up the rmaps. 
The flags + * will be switch for reals later. */ - lock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; - xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); - xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL); - - /* Verify that both files have the same format */ - if ((VFS_I(ip)->i_mode & S_IFMT) != (VFS_I(tip)->i_mode & S_IFMT)) { - error = -EINVAL; - goto out_unlock; - } + tip_flags2 = tip->i_d.di_flags2; + if (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK) + tip->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK; + + offset_fsb = 0; + end_fsb = XFS_B_TO_FSB(ip->i_mount, i_size_read(VFS_I(ip))); + count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb); + + while (count_fsb) { + /* Read extent from the donor file */ + nimaps = 1; + error = xfs_bmapi_read(tip, offset_fsb, count_fsb, &tirec, + &nimaps, 0); + if (error) + goto out; + ASSERT(nimaps == 1); + ASSERT(tirec.br_startblock != DELAYSTARTBLOCK); + + trace_xfs_swap_extent_rmap_remap(tip, &tirec); + ilen = tirec.br_blockcount; + + /* Unmap the old blocks in the source file. */ + while (tirec.br_blockcount) { + xfs_defer_init(&dfops, &firstfsb); + trace_xfs_swap_extent_rmap_remap_piece(tip, &tirec); + + /* Read extent from the source file */ + nimaps = 1; + error = xfs_bmapi_read(ip, tirec.br_startoff, + tirec.br_blockcount, &irec, + &nimaps, 0); + if (error) + goto out_defer; + ASSERT(nimaps == 1); + ASSERT(tirec.br_startoff == irec.br_startoff); + trace_xfs_swap_extent_rmap_remap_piece(ip, &irec); + + /* Trim the extent. */ + uirec = tirec; + uirec.br_blockcount = rlen = min_t(xfs_filblks_t, + tirec.br_blockcount, + irec.br_blockcount); + trace_xfs_swap_extent_rmap_remap_piece(tip, &uirec); + + /* Remove the mapping from the donor file. 
*/ + error = xfs_bmap_unmap_extent((*tpp)->t_mountp, &dfops, + tip, &uirec); + if (error) + goto out_defer; - /* Verify both files are either real-time or non-realtime */ - if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) { - error = -EINVAL; - goto out_unlock; - } + /* Remove the mapping from the source file. */ + error = xfs_bmap_unmap_extent((*tpp)->t_mountp, &dfops, + ip, &irec); + if (error) + goto out_defer; - error = xfs_swap_extent_flush(ip); - if (error) - goto out_unlock; - error = xfs_swap_extent_flush(tip); - if (error) - goto out_unlock; + /* Map the donor file's blocks into the source file. */ + error = xfs_bmap_map_extent((*tpp)->t_mountp, &dfops, + ip, &uirec); + if (error) + goto out_defer; - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); - if (error) - goto out_unlock; + /* Map the source file's blocks into the donor file. */ + error = xfs_bmap_map_extent((*tpp)->t_mountp, &dfops, + tip, &irec); + if (error) + goto out_defer; - /* - * Lock and join the inodes to the tansaction so that transaction commit - * or cancel will unlock the inodes from this point onwards. - */ - xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); - lock_flags |= XFS_ILOCK_EXCL; - xfs_trans_ijoin(tp, ip, lock_flags); - xfs_trans_ijoin(tp, tip, lock_flags); + error = xfs_defer_finish(tpp, &dfops, ip); + if (error) + goto out_defer; + tirec.br_startoff += rlen; + if (tirec.br_startblock != HOLESTARTBLOCK && + tirec.br_startblock != DELAYSTARTBLOCK) + tirec.br_startblock += rlen; + tirec.br_blockcount -= rlen; + } - /* Verify all data are being swapped */ - if (sxp->sx_offset != 0 || - sxp->sx_length != ip->i_d.di_size || - sxp->sx_length != tip->i_d.di_size) { - error = -EFAULT; - goto out_trans_cancel; + /* Roll on... 
*/ + count_fsb -= ilen; + offset_fsb += ilen; } - trace_xfs_swap_extent_before(ip, 0); - trace_xfs_swap_extent_before(tip, 1); + tip->i_d.di_flags2 = tip_flags2; + return 0; - /* check inode formats now that data is flushed */ - error = xfs_swap_extents_check_format(ip, tip); - if (error) { - xfs_notice(mp, - "%s: inode 0x%llx format is incompatible for exchanging.", - __func__, ip->i_ino); - goto out_trans_cancel; - } +out_defer: + xfs_defer_cancel(&dfops); +out: + trace_xfs_swap_extent_rmap_error(ip, error, _RET_IP_); + tip->i_d.di_flags2 = tip_flags2; + return error; +} + +/* Swap the extents of two files by swapping data forks. */ +STATIC int +xfs_swap_extent_forks( + struct xfs_trans *tp, + struct xfs_inode *ip, + struct xfs_inode *tip, + int *src_log_flags, + int *target_log_flags) +{ + struct xfs_ifork tempifp, *ifp, *tifp; + int aforkblks = 0; + int taforkblks = 0; + __uint64_t tmp; + int error; - /* - * Compare the current change & modify times with that - * passed in. If they differ, we abort this swap. - * This is the mechanism used to ensure the calling - * process that the file was not changed out from - * under it. 
- */ - if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) || - (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) || - (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) || - (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) { - error = -EBUSY; - goto out_trans_cancel; - } /* * Count the number of extended attribute blocks */ if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) && (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { - error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks); + error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, + &aforkblks); if (error) - goto out_trans_cancel; + return error; } if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) && (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, - &taforkblks); + &taforkblks); if (error) - goto out_trans_cancel; + return error; } /* @@ -1645,31 +1819,23 @@ xfs_swap_extents( * buffers, and so the validation done on read will expect the owner * field to be correctly set. Once we change the owners, we can swap the * inode forks. - * - * Note the trickiness in setting the log flags - we set the owner log - * flag on the opposite inode (i.e. the inode we are setting the new - * owner to be) because once we swap the forks and log that, log - * recovery is going to see the fork as owned by the swapped inode, - * not the pre-swapped inodes. 
*/ - src_log_flags = XFS_ILOG_CORE; - target_log_flags = XFS_ILOG_CORE; if (ip->i_d.di_version == 3 && ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { - target_log_flags |= XFS_ILOG_DOWNER; + (*target_log_flags) |= XFS_ILOG_DOWNER; error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, tip->i_ino, NULL); if (error) - goto out_trans_cancel; + return error; } if (tip->i_d.di_version == 3 && tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { - src_log_flags |= XFS_ILOG_DOWNER; + (*src_log_flags) |= XFS_ILOG_DOWNER; error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK, ip->i_ino, NULL); if (error) - goto out_trans_cancel; + return error; } /* @@ -1677,9 +1843,9 @@ xfs_swap_extents( */ ifp = &ip->i_df; tifp = &tip->i_df; - *tempifp = *ifp; /* struct copy */ + tempifp = *ifp; /* struct copy */ *ifp = *tifp; /* struct copy */ - *tifp = *tempifp; /* struct copy */ + *tifp = tempifp; /* struct copy */ /* * Fix the on-disk inode values @@ -1719,12 +1885,12 @@ xfs_swap_extents( ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; } - src_log_flags |= XFS_ILOG_DEXT; + (*src_log_flags) |= XFS_ILOG_DEXT; break; case XFS_DINODE_FMT_BTREE: ASSERT(ip->i_d.di_version < 3 || - (src_log_flags & XFS_ILOG_DOWNER)); - src_log_flags |= XFS_ILOG_DBROOT; + (*src_log_flags & XFS_ILOG_DOWNER)); + (*src_log_flags) |= XFS_ILOG_DBROOT; break; } @@ -1738,15 +1904,166 @@ xfs_swap_extents( tifp->if_u1.if_extents = tifp->if_u2.if_inline_ext; } - target_log_flags |= XFS_ILOG_DEXT; + (*target_log_flags) |= XFS_ILOG_DEXT; break; case XFS_DINODE_FMT_BTREE: - target_log_flags |= XFS_ILOG_DBROOT; + (*target_log_flags) |= XFS_ILOG_DBROOT; ASSERT(tip->i_d.di_version < 3 || - (target_log_flags & XFS_ILOG_DOWNER)); + (*target_log_flags & XFS_ILOG_DOWNER)); break; } + return 0; +} + +int +xfs_swap_extents( + struct xfs_inode *ip, /* target inode */ + struct xfs_inode *tip, /* tmp inode */ + struct xfs_swapext *sxp) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + struct xfs_bstat *sbp = &sxp->sx_stat; + 
int src_log_flags, target_log_flags; + int error = 0; + int lock_flags; + struct xfs_ifork *cowfp; + __uint64_t f; + int resblks; + + /* + * Lock the inodes against other IO, page faults and truncate to + * begin with. Then we can ensure the inodes are flushed and have no + * page cache safely. Once we have done this we can take the ilocks and + * do the rest of the checks. + */ + lock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; + xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); + xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL); + + /* Verify that both files have the same format */ + if ((VFS_I(ip)->i_mode & S_IFMT) != (VFS_I(tip)->i_mode & S_IFMT)) { + error = -EINVAL; + goto out_unlock; + } + + /* Verify both files are either real-time or non-realtime */ + if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) { + error = -EINVAL; + goto out_unlock; + } + + error = xfs_swap_extent_flush(ip); + if (error) + goto out_unlock; + error = xfs_swap_extent_flush(tip); + if (error) + goto out_unlock; + + /* + * Extent "swapping" with rmap requires a permanent reservation and + * a block reservation because it's really just a remap operation + * performed with log redo items! + */ + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { + /* + * Conceptually this shouldn't affect the shape of either + * bmbt, but since we atomically move extents one by one, + * we reserve enough space to rebuild both trees. + */ + resblks = XFS_SWAP_RMAP_SPACE_RES(mp, + XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK), + XFS_DATA_FORK) + + XFS_SWAP_RMAP_SPACE_RES(mp, + XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK), + XFS_DATA_FORK); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, + 0, 0, &tp); + } else + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, + 0, 0, &tp); + if (error) + goto out_unlock; + + /* + * Lock and join the inodes to the tansaction so that transaction commit + * or cancel will unlock the inodes from this point onwards. 
+ */ + xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); + lock_flags |= XFS_ILOCK_EXCL; + xfs_trans_ijoin(tp, ip, 0); + xfs_trans_ijoin(tp, tip, 0); + + + /* Verify all data are being swapped */ + if (sxp->sx_offset != 0 || + sxp->sx_length != ip->i_d.di_size || + sxp->sx_length != tip->i_d.di_size) { + error = -EFAULT; + goto out_trans_cancel; + } + + trace_xfs_swap_extent_before(ip, 0); + trace_xfs_swap_extent_before(tip, 1); + + /* check inode formats now that data is flushed */ + error = xfs_swap_extents_check_format(ip, tip); + if (error) { + xfs_notice(mp, + "%s: inode 0x%llx format is incompatible for exchanging.", + __func__, ip->i_ino); + goto out_trans_cancel; + } + + /* + * Compare the current change & modify times with that + * passed in. If they differ, we abort this swap. + * This is the mechanism used to ensure the calling + * process that the file was not changed out from + * under it. + */ + if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) || + (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) || + (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) || + (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) { + error = -EBUSY; + goto out_trans_cancel; + } + + /* + * Note the trickiness in setting the log flags - we set the owner log + * flag on the opposite inode (i.e. the inode we are setting the new + * owner to be) because once we swap the forks and log that, log + * recovery is going to see the fork as owned by the swapped inode, + * not the pre-swapped inodes. + */ + src_log_flags = XFS_ILOG_CORE; + target_log_flags = XFS_ILOG_CORE; + + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + error = xfs_swap_extent_rmap(&tp, ip, tip); + else + error = xfs_swap_extent_forks(tp, ip, tip, &src_log_flags, + &target_log_flags); + if (error) + goto out_trans_cancel; + + /* Do we have to swap reflink flags? 
*/ + if ((ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK) ^ + (tip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK)) { + f = ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK; + ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; + ip->i_d.di_flags2 |= tip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK; + tip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; + tip->i_d.di_flags2 |= f & XFS_DIFLAG2_REFLINK; + cowfp = ip->i_cowfp; + ip->i_cowfp = tip->i_cowfp; + tip->i_cowfp = cowfp; + xfs_inode_set_cowblocks_tag(ip); + xfs_inode_set_cowblocks_tag(tip); + } + xfs_trans_log_inode(tp, ip, src_log_flags); xfs_trans_log_inode(tp, tip, target_log_flags); @@ -1761,16 +2078,16 @@ xfs_swap_extents( trace_xfs_swap_extent_after(ip, 0); trace_xfs_swap_extent_after(tip, 1); -out: - kmem_free(tempifp); - return error; -out_unlock: xfs_iunlock(ip, lock_flags); xfs_iunlock(tip, lock_flags); - goto out; + return error; out_trans_cancel: xfs_trans_cancel(tp); - goto out; + +out_unlock: + xfs_iunlock(ip, lock_flags); + xfs_iunlock(tip, lock_flags); + return error; } diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index f44f79996978..29816981b50a 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -84,7 +84,8 @@ xfs_dir2_sf_getdents( sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; - ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); + if (dp->i_d.di_size < xfs_dir2_sf_hdr_size(sfp->i8count)) + return -EFSCORRUPTED; /* * If the block number in the offset is out of range, we're done. 
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 3d224702fbc0..05f8666733a0 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -92,7 +92,11 @@ extern void xfs_verifier_error(struct xfs_buf *bp); #define XFS_ERRTAG_BMAPIFORMAT 21 #define XFS_ERRTAG_FREE_EXTENT 22 #define XFS_ERRTAG_RMAP_FINISH_ONE 23 -#define XFS_ERRTAG_MAX 24 +#define XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE 24 +#define XFS_ERRTAG_REFCOUNT_FINISH_ONE 25 +#define XFS_ERRTAG_BMAP_FINISH_ONE 26 +#define XFS_ERRTAG_AG_RESV_CRITICAL 27 +#define XFS_ERRTAG_MAX 28 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. @@ -121,6 +125,10 @@ extern void xfs_verifier_error(struct xfs_buf *bp); #define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT #define XFS_RANDOM_FREE_EXTENT 1 #define XFS_RANDOM_RMAP_FINISH_ONE 1 +#define XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE 1 +#define XFS_RANDOM_REFCOUNT_FINISH_ONE 1 +#define XFS_RANDOM_BMAP_FINISH_ONE 1 +#define XFS_RANDOM_AG_RESV_CRITICAL 4 #ifdef DEBUG extern int xfs_error_test_active; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 2bc58b3fd37d..a314fc7b56fa 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -38,6 +38,7 @@ #include "xfs_icache.h" #include "xfs_pnfs.h" #include "xfs_iomap.h" +#include "xfs_reflink.h" #include <linux/dcache.h> #include <linux/falloc.h> @@ -634,6 +635,13 @@ xfs_file_dio_aio_write( trace_xfs_file_direct_write(ip, count, iocb->ki_pos); + /* If this is a block-aligned directio CoW, remap immediately. 
*/ + if (xfs_is_reflink_inode(ip) && !unaligned_io) { + ret = xfs_reflink_allocate_cow_range(ip, iocb->ki_pos, count); + if (ret) + goto out; + } + data = *from; ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, xfs_get_blocks_direct, xfs_end_io_direct_write, @@ -735,6 +743,9 @@ write_retry: enospc = xfs_inode_free_quota_eofblocks(ip); if (enospc) goto write_retry; + enospc = xfs_inode_free_quota_cowblocks(ip); + if (enospc) + goto write_retry; } else if (ret == -ENOSPC && !enospc) { struct xfs_eofblocks eofb = {0}; @@ -774,10 +785,20 @@ xfs_file_write_iter( if (IS_DAX(inode)) ret = xfs_file_dax_write(iocb, from); - else if (iocb->ki_flags & IOCB_DIRECT) + else if (iocb->ki_flags & IOCB_DIRECT) { + /* + * Allow a directio write to fall back to a buffered + * write *only* in the case that we're doing a reflink + * CoW. In all other directio scenarios we do not + * allow an operation to fall back to buffered mode. + */ ret = xfs_file_dio_aio_write(iocb, from); - else + if (ret == -EREMCHG) + goto buffered; + } else { +buffered: ret = xfs_file_buffered_aio_write(iocb, from); + } if (ret > 0) { XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret); @@ -791,7 +812,7 @@ xfs_file_write_iter( #define XFS_FALLOC_FL_SUPPORTED \ (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \ - FALLOC_FL_INSERT_RANGE) + FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE) STATIC long xfs_file_fallocate( @@ -881,9 +902,15 @@ xfs_file_fallocate( if (mode & FALLOC_FL_ZERO_RANGE) error = xfs_zero_file_space(ip, offset, len); - else + else { + if (mode & FALLOC_FL_UNSHARE_RANGE) { + error = xfs_reflink_unshare(ip, offset, len); + if (error) + goto out_unlock; + } error = xfs_alloc_file_space(ip, offset, len, XFS_BMAPI_PREALLOC); + } if (error) goto out_unlock; } @@ -920,6 +947,189 @@ out_unlock: return error; } +/* + * Flush all file writes out to disk. 
+ */ +static int +xfs_file_wait_for_io( + struct inode *inode, + loff_t offset, + size_t len) +{ + loff_t rounding; + loff_t ioffset; + loff_t iendoffset; + loff_t bs; + int ret; + + bs = inode->i_sb->s_blocksize; + inode_dio_wait(inode); + + rounding = max_t(xfs_off_t, bs, PAGE_SIZE); + ioffset = round_down(offset, rounding); + iendoffset = round_up(offset + len, rounding) - 1; + ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, + iendoffset); + return ret; +} + +/* Hook up to the VFS reflink function */ +STATIC int +xfs_file_share_range( + struct file *file_in, + loff_t pos_in, + struct file *file_out, + loff_t pos_out, + u64 len, + bool is_dedupe) +{ + struct inode *inode_in; + struct inode *inode_out; + ssize_t ret; + loff_t bs; + loff_t isize; + int same_inode; + loff_t blen; + unsigned int flags = 0; + + inode_in = file_inode(file_in); + inode_out = file_inode(file_out); + bs = inode_out->i_sb->s_blocksize; + + /* Don't touch certain kinds of inodes */ + if (IS_IMMUTABLE(inode_out)) + return -EPERM; + if (IS_SWAPFILE(inode_in) || + IS_SWAPFILE(inode_out)) + return -ETXTBSY; + + /* Reflink only works within this filesystem. */ + if (inode_in->i_sb != inode_out->i_sb) + return -EXDEV; + same_inode = (inode_in->i_ino == inode_out->i_ino); + + /* Don't reflink dirs, pipes, sockets... */ + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) + return -EISDIR; + if (S_ISFIFO(inode_in->i_mode) || S_ISFIFO(inode_out->i_mode)) + return -EINVAL; + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) + return -EINVAL; + + /* Don't share DAX file data for now. */ + if (IS_DAX(inode_in) || IS_DAX(inode_out)) + return -EINVAL; + + /* Are we going all the way to the end? 
*/ + isize = i_size_read(inode_in); + if (isize == 0) + return 0; + if (len == 0) + len = isize - pos_in; + + /* Ensure offsets don't wrap and the input is inside i_size */ + if (pos_in + len < pos_in || pos_out + len < pos_out || + pos_in + len > isize) + return -EINVAL; + + /* Don't allow dedupe past EOF in the dest file */ + if (is_dedupe) { + loff_t disize; + + disize = i_size_read(inode_out); + if (pos_out >= disize || pos_out + len > disize) + return -EINVAL; + } + + /* If we're linking to EOF, continue to the block boundary. */ + if (pos_in + len == isize) + blen = ALIGN(isize, bs) - pos_in; + else + blen = len; + + /* Only reflink if we're aligned to block boundaries */ + if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) || + !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs)) + return -EINVAL; + + /* Don't allow overlapped reflink within the same file */ + if (same_inode && pos_out + blen > pos_in && pos_out < pos_in + blen) + return -EINVAL; + + /* Wait for the completion of any pending IOs on srcfile */ + ret = xfs_file_wait_for_io(inode_in, pos_in, len); + if (ret) + goto out; + ret = xfs_file_wait_for_io(inode_out, pos_out, len); + if (ret) + goto out; + + if (is_dedupe) + flags |= XFS_REFLINK_DEDUPE; + ret = xfs_reflink_remap_range(XFS_I(inode_in), pos_in, XFS_I(inode_out), + pos_out, len, flags); + if (ret < 0) + goto out; + +out: + return ret; +} + +STATIC ssize_t +xfs_file_copy_range( + struct file *file_in, + loff_t pos_in, + struct file *file_out, + loff_t pos_out, + size_t len, + unsigned int flags) +{ + int error; + + error = xfs_file_share_range(file_in, pos_in, file_out, pos_out, + len, false); + if (error) + return error; + return len; +} + +STATIC int +xfs_file_clone_range( + struct file *file_in, + loff_t pos_in, + struct file *file_out, + loff_t pos_out, + u64 len) +{ + return xfs_file_share_range(file_in, pos_in, file_out, pos_out, + len, false); +} + +#define XFS_MAX_DEDUPE_LEN (16 * 1024 * 1024) +STATIC ssize_t 
+xfs_file_dedupe_range( + struct file *src_file, + u64 loff, + u64 len, + struct file *dst_file, + u64 dst_loff) +{ + int error; + + /* + * Limit the total length we will dedupe for each operation. + * This is intended to bound the total time spent in this + * ioctl to something sane. + */ + if (len > XFS_MAX_DEDUPE_LEN) + len = XFS_MAX_DEDUPE_LEN; + + error = xfs_file_share_range(src_file, loff, dst_file, dst_loff, + len, true); + if (error) + return error; + return len; +} STATIC int xfs_file_open( @@ -1581,6 +1791,9 @@ const struct file_operations xfs_file_operations = { .fsync = xfs_file_fsync, .get_unmapped_area = thp_get_unmapped_area, .fallocate = xfs_file_fallocate, + .copy_file_range = xfs_file_copy_range, + .clone_file_range = xfs_file_clone_range, + .dedupe_file_range = xfs_file_dedupe_range, }; const struct file_operations xfs_dir_file_operations = { diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 94ac06f3d908..93d12fa2670d 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -43,6 +43,7 @@ #include "xfs_log.h" #include "xfs_filestream.h" #include "xfs_rmap.h" +#include "xfs_ag_resv.h" /* * File system operations @@ -108,7 +109,9 @@ xfs_fs_geometry( (xfs_sb_version_hassparseinodes(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_SPINODES : 0) | (xfs_sb_version_hasrmapbt(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_RMAPBT : 0); + XFS_FSOP_GEOM_FLAGS_RMAPBT : 0) | + (xfs_sb_version_hasreflink(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_REFLINK : 0); geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ? 
mp->m_sb.sb_logsectsize : BBSIZE; geo->rtsectsize = mp->m_sb.sb_blocksize; @@ -259,6 +262,12 @@ xfs_growfs_data_private( agf->agf_longest = cpu_to_be32(tmpsize); if (xfs_sb_version_hascrc(&mp->m_sb)) uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid); + if (xfs_sb_version_hasreflink(&mp->m_sb)) { + agf->agf_refcount_root = cpu_to_be32( + xfs_refc_block(mp)); + agf->agf_refcount_level = cpu_to_be32(1); + agf->agf_refcount_blocks = cpu_to_be32(1); + } error = xfs_bwrite(bp); xfs_buf_relse(bp); @@ -450,6 +459,17 @@ xfs_growfs_data_private( rrec->rm_offset = 0; be16_add_cpu(&block->bb_numrecs, 1); + /* account for refc btree root */ + if (xfs_sb_version_hasreflink(&mp->m_sb)) { + rrec = XFS_RMAP_REC_ADDR(block, 5); + rrec->rm_startblock = cpu_to_be32( + xfs_refc_block(mp)); + rrec->rm_blockcount = cpu_to_be32(1); + rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_REFC); + rrec->rm_offset = 0; + be16_add_cpu(&block->bb_numrecs, 1); + } + error = xfs_bwrite(bp); xfs_buf_relse(bp); if (error) @@ -507,6 +527,28 @@ xfs_growfs_data_private( goto error0; } + /* + * refcount btree root block + */ + if (xfs_sb_version_hasreflink(&mp->m_sb)) { + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AGB_TO_DADDR(mp, agno, xfs_refc_block(mp)), + BTOBB(mp->m_sb.sb_blocksize), 0, + &xfs_refcountbt_buf_ops); + if (!bp) { + error = -ENOMEM; + goto error0; + } + + xfs_btree_init_block(mp, bp, XFS_REFC_CRC_MAGIC, + 0, 0, agno, + XFS_BTREE_CRC_BLOCKS); + + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) + goto error0; + } } xfs_trans_agblocks_delta(tp, nfree); /* @@ -589,6 +631,11 @@ xfs_growfs_data_private( xfs_set_low_space_thresholds(mp); mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); + /* Reserve AG metadata blocks. */ + error = xfs_fs_reserve_ag_blocks(mp); + if (error && error != -ENOSPC) + goto out; + /* update secondary superblocks. */ for (agno = 1; agno < nagcount; agno++) { error = 0; @@ -639,6 +686,8 @@ xfs_growfs_data_private( continue; } } + + out: return saved_error ? 
saved_error : error; error0: @@ -948,3 +997,59 @@ xfs_do_force_shutdown( "Please umount the filesystem and rectify the problem(s)"); } } + +/* + * Reserve free space for per-AG metadata. + */ +int +xfs_fs_reserve_ag_blocks( + struct xfs_mount *mp) +{ + xfs_agnumber_t agno; + struct xfs_perag *pag; + int error = 0; + int err2; + + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { + pag = xfs_perag_get(mp, agno); + err2 = xfs_ag_resv_init(pag); + xfs_perag_put(pag); + if (err2 && !error) + error = err2; + } + + if (error && error != -ENOSPC) { + xfs_warn(mp, + "Error %d reserving per-AG metadata reserve pool.", error); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + } + + return error; +} + +/* + * Free space reserved for per-AG metadata. + */ +int +xfs_fs_unreserve_ag_blocks( + struct xfs_mount *mp) +{ + xfs_agnumber_t agno; + struct xfs_perag *pag; + int error = 0; + int err2; + + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { + pag = xfs_perag_get(mp, agno); + err2 = xfs_ag_resv_free(pag); + xfs_perag_put(pag); + if (err2 && !error) + error = err2; + } + + if (error) + xfs_warn(mp, + "Error %d freeing per-AG metadata reserve pool.", error); + + return error; +} diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index f32713f14f9a..f34915898fea 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -26,4 +26,7 @@ extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, xfs_fsop_resblks_t *outval); extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); +extern int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp); +extern int xfs_fs_unreserve_ag_blocks(struct xfs_mount *mp); + #endif /* __XFS_FSOPS_H__ */ diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index 4d41b241298f..687a4b01fc53 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -21,8 +21,8 @@ /* * Tunable XFS parameters. xfs_params is required even when CONFIG_SYSCTL=n, * other XFS code uses these values. Times are measured in centisecs (i.e. 
- * 100ths of a second) with the exception of eofb_timer, which is measured in - * seconds. + * 100ths of a second) with the exception of eofb_timer and cowb_timer, which + * are measured in seconds. */ xfs_param_t xfs_params = { /* MIN DFLT MAX */ @@ -42,6 +42,7 @@ xfs_param_t xfs_params = { .inherit_nodfrg = { 0, 1, 1 }, .fstrm_timer = { 1, 30*100, 3600*100}, .eofb_timer = { 1, 300, 3600*24}, + .cowb_timer = { 1, 1800, 3600*24}, }; struct xfs_globals xfs_globals = { diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 65b2e3f85f52..14796b744e0a 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -33,6 +33,7 @@ #include "xfs_bmap_util.h" #include "xfs_dquot_item.h" #include "xfs_dquot.h" +#include "xfs_reflink.h" #include <linux/kthread.h> #include <linux/freezer.h> @@ -76,6 +77,9 @@ xfs_inode_alloc( ip->i_mount = mp; memset(&ip->i_imap, 0, sizeof(struct xfs_imap)); ip->i_afp = NULL; + ip->i_cowfp = NULL; + ip->i_cnextents = 0; + ip->i_cformat = XFS_DINODE_FMT_EXTENTS; memset(&ip->i_df, 0, sizeof(xfs_ifork_t)); ip->i_flags = 0; ip->i_delayed_blks = 0; @@ -101,6 +105,8 @@ xfs_inode_free_callback( if (ip->i_afp) xfs_idestroy_fork(ip, XFS_ATTR_FORK); + if (ip->i_cowfp) + xfs_idestroy_fork(ip, XFS_COW_FORK); if (ip->i_itemp) { ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL)); @@ -787,6 +793,33 @@ xfs_eofblocks_worker( xfs_queue_eofblocks(mp); } +/* + * Background scanning to trim preallocated CoW space. This is queued + * based on the 'speculative_cow_prealloc_lifetime' tunable (30m by default). + * (We'll just piggyback on the post-EOF prealloc space workqueue.) 
+ */ +STATIC void +xfs_queue_cowblocks( + struct xfs_mount *mp) +{ + rcu_read_lock(); + if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_COWBLOCKS_TAG)) + queue_delayed_work(mp->m_eofblocks_workqueue, + &mp->m_cowblocks_work, + msecs_to_jiffies(xfs_cowb_secs * 1000)); + rcu_read_unlock(); +} + +void +xfs_cowblocks_worker( + struct work_struct *work) +{ + struct xfs_mount *mp = container_of(to_delayed_work(work), + struct xfs_mount, m_cowblocks_work); + xfs_icache_free_cowblocks(mp, NULL); + xfs_queue_cowblocks(mp); +} + int xfs_inode_ag_iterator( struct xfs_mount *mp, @@ -1343,18 +1376,30 @@ xfs_inode_free_eofblocks( return ret; } -int -xfs_icache_free_eofblocks( +static int +__xfs_icache_free_eofblocks( struct xfs_mount *mp, - struct xfs_eofblocks *eofb) + struct xfs_eofblocks *eofb, + int (*execute)(struct xfs_inode *ip, int flags, + void *args), + int tag) { int flags = SYNC_TRYLOCK; if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC)) flags = SYNC_WAIT; - return xfs_inode_ag_iterator_tag(mp, xfs_inode_free_eofblocks, flags, - eofb, XFS_ICI_EOFBLOCKS_TAG); + return xfs_inode_ag_iterator_tag(mp, execute, flags, + eofb, tag); +} + +int +xfs_icache_free_eofblocks( + struct xfs_mount *mp, + struct xfs_eofblocks *eofb) +{ + return __xfs_icache_free_eofblocks(mp, eofb, xfs_inode_free_eofblocks, + XFS_ICI_EOFBLOCKS_TAG); } /* @@ -1363,9 +1408,11 @@ xfs_icache_free_eofblocks( * failure. We make a best effort by including each quota under low free space * conditions (less than 1% free space) in the scan. 
*/ -int -xfs_inode_free_quota_eofblocks( - struct xfs_inode *ip) +static int +__xfs_inode_free_quota_eofblocks( + struct xfs_inode *ip, + int (*execute)(struct xfs_mount *mp, + struct xfs_eofblocks *eofb)) { int scan = 0; struct xfs_eofblocks eofb = {0}; @@ -1401,14 +1448,25 @@ xfs_inode_free_quota_eofblocks( } if (scan) - xfs_icache_free_eofblocks(ip->i_mount, &eofb); + execute(ip->i_mount, &eofb); return scan; } -void -xfs_inode_set_eofblocks_tag( - xfs_inode_t *ip) +int +xfs_inode_free_quota_eofblocks( + struct xfs_inode *ip) +{ + return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_eofblocks); +} + +static void +__xfs_inode_set_eofblocks_tag( + xfs_inode_t *ip, + void (*execute)(struct xfs_mount *mp), + void (*set_tp)(struct xfs_mount *mp, xfs_agnumber_t agno, + int error, unsigned long caller_ip), + int tag) { struct xfs_mount *mp = ip->i_mount; struct xfs_perag *pag; @@ -1426,26 +1484,22 @@ xfs_inode_set_eofblocks_tag( pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); spin_lock(&pag->pag_ici_lock); - trace_xfs_inode_set_eofblocks_tag(ip); - tagged = radix_tree_tagged(&pag->pag_ici_root, - XFS_ICI_EOFBLOCKS_TAG); + tagged = radix_tree_tagged(&pag->pag_ici_root, tag); radix_tree_tag_set(&pag->pag_ici_root, - XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), - XFS_ICI_EOFBLOCKS_TAG); + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag); if (!tagged) { /* propagate the eofblocks tag up into the perag radix tree */ spin_lock(&ip->i_mount->m_perag_lock); radix_tree_tag_set(&ip->i_mount->m_perag_tree, XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), - XFS_ICI_EOFBLOCKS_TAG); + tag); spin_unlock(&ip->i_mount->m_perag_lock); /* kick off background trimming */ - xfs_queue_eofblocks(ip->i_mount); + execute(ip->i_mount); - trace_xfs_perag_set_eofblocks(ip->i_mount, pag->pag_agno, - -1, _RET_IP_); + set_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_); } spin_unlock(&pag->pag_ici_lock); @@ -1453,9 +1507,22 @@ xfs_inode_set_eofblocks_tag( } void -xfs_inode_clear_eofblocks_tag( 
+xfs_inode_set_eofblocks_tag( xfs_inode_t *ip) { + trace_xfs_inode_set_eofblocks_tag(ip); + return __xfs_inode_set_eofblocks_tag(ip, xfs_queue_eofblocks, + trace_xfs_perag_set_eofblocks, + XFS_ICI_EOFBLOCKS_TAG); +} + +static void +__xfs_inode_clear_eofblocks_tag( + xfs_inode_t *ip, + void (*clear_tp)(struct xfs_mount *mp, xfs_agnumber_t agno, + int error, unsigned long caller_ip), + int tag) +{ struct xfs_mount *mp = ip->i_mount; struct xfs_perag *pag; @@ -1465,23 +1532,141 @@ xfs_inode_clear_eofblocks_tag( pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); spin_lock(&pag->pag_ici_lock); - trace_xfs_inode_clear_eofblocks_tag(ip); radix_tree_tag_clear(&pag->pag_ici_root, - XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), - XFS_ICI_EOFBLOCKS_TAG); - if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_EOFBLOCKS_TAG)) { + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag); + if (!radix_tree_tagged(&pag->pag_ici_root, tag)) { /* clear the eofblocks tag from the perag radix tree */ spin_lock(&ip->i_mount->m_perag_lock); radix_tree_tag_clear(&ip->i_mount->m_perag_tree, XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), - XFS_ICI_EOFBLOCKS_TAG); + tag); spin_unlock(&ip->i_mount->m_perag_lock); - trace_xfs_perag_clear_eofblocks(ip->i_mount, pag->pag_agno, - -1, _RET_IP_); + clear_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_); } spin_unlock(&pag->pag_ici_lock); xfs_perag_put(pag); } +void +xfs_inode_clear_eofblocks_tag( + xfs_inode_t *ip) +{ + trace_xfs_inode_clear_eofblocks_tag(ip); + return __xfs_inode_clear_eofblocks_tag(ip, + trace_xfs_perag_clear_eofblocks, XFS_ICI_EOFBLOCKS_TAG); +} + +/* + * Automatic CoW Reservation Freeing + * + * These functions automatically garbage collect leftover CoW reservations + * that were made on behalf of a cowextsize hint when we start to run out + * of quota or when the reservations sit around for too long. If the file + * has dirty pages or is undergoing writeback, its CoW reservations will + * be retained. 
+ * + * The actual garbage collection piggybacks off the same code that runs + * the speculative EOF preallocation garbage collector. + */ +STATIC int +xfs_inode_free_cowblocks( + struct xfs_inode *ip, + int flags, + void *args) +{ + int ret; + struct xfs_eofblocks *eofb = args; + bool need_iolock = true; + int match; + + ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0)); + + if (!xfs_reflink_has_real_cow_blocks(ip)) { + trace_xfs_inode_free_cowblocks_invalid(ip); + xfs_inode_clear_cowblocks_tag(ip); + return 0; + } + + /* + * If the mapping is dirty or under writeback we cannot touch the + * CoW fork. Leave it alone if we're in the midst of a directio. + */ + if (mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) || + mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) || + atomic_read(&VFS_I(ip)->i_dio_count)) + return 0; + + if (eofb) { + if (eofb->eof_flags & XFS_EOF_FLAGS_UNION) + match = xfs_inode_match_id_union(ip, eofb); + else + match = xfs_inode_match_id(ip, eofb); + if (!match) + return 0; + + /* skip the inode if the file size is too small */ + if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && + XFS_ISIZE(ip) < eofb->eof_min_file_size) + return 0; + + /* + * A scan owner implies we already hold the iolock. Skip it in + * xfs_free_eofblocks() to avoid deadlock. This also eliminates + * the possibility of EAGAIN being returned. 
+ */ + if (eofb->eof_scan_owner == ip->i_ino) + need_iolock = false; + } + + /* Free the CoW blocks */ + if (need_iolock) { + xfs_ilock(ip, XFS_IOLOCK_EXCL); + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + } + + ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF); + + if (need_iolock) { + xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + } + + return ret; +} + +int +xfs_icache_free_cowblocks( + struct xfs_mount *mp, + struct xfs_eofblocks *eofb) +{ + return __xfs_icache_free_eofblocks(mp, eofb, xfs_inode_free_cowblocks, + XFS_ICI_COWBLOCKS_TAG); +} + +int +xfs_inode_free_quota_cowblocks( + struct xfs_inode *ip) +{ + return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_cowblocks); +} + +void +xfs_inode_set_cowblocks_tag( + xfs_inode_t *ip) +{ + trace_xfs_inode_set_eofblocks_tag(ip); + return __xfs_inode_set_eofblocks_tag(ip, xfs_queue_cowblocks, + trace_xfs_perag_set_eofblocks, + XFS_ICI_COWBLOCKS_TAG); +} + +void +xfs_inode_clear_cowblocks_tag( + xfs_inode_t *ip) +{ + trace_xfs_inode_clear_eofblocks_tag(ip); + return __xfs_inode_clear_eofblocks_tag(ip, + trace_xfs_perag_clear_eofblocks, XFS_ICI_COWBLOCKS_TAG); +} diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 05bac99bef75..a1e02f4708ab 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -40,6 +40,7 @@ struct xfs_eofblocks { in xfs_inode_ag_iterator */ #define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */ #define XFS_ICI_EOFBLOCKS_TAG 1 /* inode has blocks beyond EOF */ +#define XFS_ICI_COWBLOCKS_TAG 2 /* inode can have cow blocks to gc */ /* * Flags for xfs_iget() @@ -70,6 +71,12 @@ int xfs_inode_free_quota_eofblocks(struct xfs_inode *ip); void xfs_eofblocks_worker(struct work_struct *); void xfs_queue_eofblocks(struct xfs_mount *); +void xfs_inode_set_cowblocks_tag(struct xfs_inode *ip); +void xfs_inode_clear_cowblocks_tag(struct xfs_inode *ip); +int xfs_icache_free_cowblocks(struct xfs_mount *, struct xfs_eofblocks *); +int 
xfs_inode_free_quota_cowblocks(struct xfs_inode *ip); +void xfs_cowblocks_worker(struct work_struct *); + int xfs_inode_ag_iterator(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, int flags, void *args), int flags, void *args); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 624e1dfa716b..4e560e6a12c1 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -49,6 +49,7 @@ #include "xfs_trans_priv.h" #include "xfs_log.h" #include "xfs_bmap_btree.h" +#include "xfs_reflink.h" kmem_zone_t *xfs_inode_zone; @@ -77,6 +78,29 @@ xfs_get_extsz_hint( } /* + * Helper function to extract CoW extent size hint from inode. + * Between the extent size hint and the CoW extent size hint, we + * return the greater of the two. If the value is zero (automatic), + * use the default size. + */ +xfs_extlen_t +xfs_get_cowextsz_hint( + struct xfs_inode *ip) +{ + xfs_extlen_t a, b; + + a = 0; + if (ip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) + a = ip->i_d.di_cowextsize; + b = xfs_get_extsz_hint(ip); + + a = max(a, b); + if (a == 0) + return XFS_DEFAULT_COWEXTSZ_HINT; + return a; +} + +/* * These two are wrapper routines around the xfs_ilock() routine used to * centralize some grungy code. They are used in places that wish to lock the * inode solely for reading the extents. 
The reason these places can't just @@ -651,6 +675,8 @@ _xfs_dic2xflags( if (di_flags2 & XFS_DIFLAG2_ANY) { if (di_flags2 & XFS_DIFLAG2_DAX) flags |= FS_XFLAG_DAX; + if (di_flags2 & XFS_DIFLAG2_COWEXTSIZE) + flags |= FS_XFLAG_COWEXTSIZE; } if (has_attr) @@ -834,6 +860,7 @@ xfs_ialloc( if (ip->i_d.di_version == 3) { inode->i_version = 1; ip->i_d.di_flags2 = 0; + ip->i_d.di_cowextsize = 0; ip->i_d.di_crtime.t_sec = (__int32_t)tv.tv_sec; ip->i_d.di_crtime.t_nsec = (__int32_t)tv.tv_nsec; } @@ -896,6 +923,15 @@ xfs_ialloc( ip->i_d.di_flags |= di_flags; ip->i_d.di_flags2 |= di_flags2; } + if (pip && + (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY) && + pip->i_d.di_version == 3 && + ip->i_d.di_version == 3) { + if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) { + ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; + ip->i_d.di_cowextsize = pip->i_d.di_cowextsize; + } + } /* FALLTHROUGH */ case S_IFLNK: ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; @@ -1586,6 +1622,20 @@ xfs_itruncate_extents( goto out; } + /* Remove all pending CoW reservations. */ + error = xfs_reflink_cancel_cow_blocks(ip, &tp, first_unmap_block, + last_block); + if (error) + goto out; + + /* + * Clear the reflink flag if we truncated everything. + */ + if (ip->i_d.di_nblocks == 0 && xfs_is_reflink_inode(ip)) { + ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; + xfs_inode_clear_cowblocks_tag(ip); + } + /* * Always re-log the inode so that our permanent transaction can keep * on rolling it forward in the log. @@ -1850,6 +1900,7 @@ xfs_inactive( } mp = ip->i_mount; + ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY)); /* If this is a read-only mount, don't do this (would generate I/O) */ if (mp->m_flags & XFS_MOUNT_RDONLY) diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 8f30d2533b48..f14c1de2549d 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -47,6 +47,7 @@ typedef struct xfs_inode { /* Extent information. 
*/ xfs_ifork_t *i_afp; /* attribute fork pointer */ + xfs_ifork_t *i_cowfp; /* copy on write extents */ xfs_ifork_t i_df; /* data fork */ /* operations vectors */ @@ -65,6 +66,9 @@ typedef struct xfs_inode { struct xfs_icdinode i_d; /* most of ondisk inode */ + xfs_extnum_t i_cnextents; /* # of extents in cow fork */ + unsigned int i_cformat; /* format of cow fork */ + /* VFS inode */ struct inode i_vnode; /* embedded VFS inode */ } xfs_inode_t; @@ -202,6 +206,11 @@ xfs_get_initial_prid(struct xfs_inode *dp) return XFS_PROJID_DEFAULT; } +static inline bool xfs_is_reflink_inode(struct xfs_inode *ip) +{ + return ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK; +} + /* * In-core inode flags. */ @@ -217,6 +226,12 @@ xfs_get_initial_prid(struct xfs_inode *dp) #define XFS_IPINNED (1 << __XFS_IPINNED_BIT) #define XFS_IDONTCACHE (1 << 9) /* don't cache the inode long term */ #define XFS_IEOFBLOCKS (1 << 10)/* has the preallocblocks tag set */ +/* + * If this unlinked inode is in the middle of recovery, don't let drop_inode + * truncate and free the inode. This can happen if we iget the inode during + * log recovery to replay a bmap operation on the inode. + */ +#define XFS_IRECOVERY (1 << 11) /* * Per-lifetime flags need to be reset when re-using a reclaimable inode during @@ -411,6 +426,7 @@ int xfs_iflush(struct xfs_inode *, struct xfs_buf **); void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); +xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip); int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t, xfs_nlink_t, xfs_dev_t, prid_t, int, @@ -474,4 +490,7 @@ do { \ extern struct kmem_zone *xfs_inode_zone; +/* The default CoW extent size hint. 
*/ +#define XFS_DEFAULT_COWEXTSZ_HINT 32 + #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 892c2aced207..9610e9c00952 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -368,7 +368,7 @@ xfs_inode_to_log_dinode( to->di_crtime.t_sec = from->di_crtime.t_sec; to->di_crtime.t_nsec = from->di_crtime.t_nsec; to->di_flags2 = from->di_flags2; - + to->di_cowextsize = from->di_cowextsize; to->di_ino = ip->i_ino; to->di_lsn = lsn; memset(to->di_pad2, 0, sizeof(to->di_pad2)); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 0d9021f0551e..c245bed3249b 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -903,6 +903,8 @@ xfs_ioc_fsgetxattr( xfs_ilock(ip, XFS_ILOCK_SHARED); fa.fsx_xflags = xfs_ip2xflags(ip); fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog; + fa.fsx_cowextsize = ip->i_d.di_cowextsize << + ip->i_mount->m_sb.sb_blocklog; fa.fsx_projid = xfs_get_projid(ip); if (attr) { @@ -973,12 +975,13 @@ xfs_set_diflags( if (ip->i_d.di_version < 3) return; - di_flags2 = 0; + di_flags2 = (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK); if (xflags & FS_XFLAG_DAX) di_flags2 |= XFS_DIFLAG2_DAX; + if (xflags & FS_XFLAG_COWEXTSIZE) + di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; ip->i_d.di_flags2 = di_flags2; - } STATIC void @@ -1031,6 +1034,14 @@ xfs_ioctl_setattr_xflags( return -EINVAL; } + /* Clear reflink if we are actually able to set the rt flag. */ + if ((fa->fsx_xflags & FS_XFLAG_REALTIME) && xfs_is_reflink_inode(ip)) + ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; + + /* Don't allow us to set DAX mode for a reflinked file for now. */ + if ((fa->fsx_xflags & FS_XFLAG_DAX) && xfs_is_reflink_inode(ip)) + return -EINVAL; + /* * Can't modify an immutable/append-only file unless * we have appropriate permission. @@ -1219,6 +1230,56 @@ xfs_ioctl_setattr_check_extsize( return 0; } +/* + * CoW extent size hint validation rules are: + * + * 1. 
CoW extent size hint can only be set if reflink is enabled on the fs. + * The inode does not have to have any shared blocks, but it must be a v3. + * 2. FS_XFLAG_COWEXTSIZE is only valid for directories and regular files; + * for a directory, the hint is propagated to new files. + * 3. Can be changed on files & directories at any time. + * 4. CoW extsize hint of 0 turns off hints, clears inode flags. + * 5. Extent size must be a multiple of the appropriate block size. + * 6. The extent size hint must be limited to half the AG size to avoid + * alignment extending the extent beyond the limits of the AG. + */ +static int +xfs_ioctl_setattr_check_cowextsize( + struct xfs_inode *ip, + struct fsxattr *fa) +{ + struct xfs_mount *mp = ip->i_mount; + + if (!(fa->fsx_xflags & FS_XFLAG_COWEXTSIZE)) + return 0; + + if (!xfs_sb_version_hasreflink(&ip->i_mount->m_sb) || + ip->i_d.di_version != 3) + return -EINVAL; + + if (!S_ISREG(VFS_I(ip)->i_mode) && !S_ISDIR(VFS_I(ip)->i_mode)) + return -EINVAL; + + if (fa->fsx_cowextsize != 0) { + xfs_extlen_t size; + xfs_fsblock_t cowextsize_fsb; + + cowextsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_cowextsize); + if (cowextsize_fsb > MAXEXTLEN) + return -EINVAL; + + size = mp->m_sb.sb_blocksize; + if (cowextsize_fsb > mp->m_sb.sb_agblocks / 2) + return -EINVAL; + + if (fa->fsx_cowextsize % size) + return -EINVAL; + } else + fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE; + + return 0; +} + static int xfs_ioctl_setattr_check_projid( struct xfs_inode *ip, @@ -1311,6 +1372,10 @@ xfs_ioctl_setattr( if (code) goto error_trans_cancel; + code = xfs_ioctl_setattr_check_cowextsize(ip, fa); + if (code) + goto error_trans_cancel; + code = xfs_ioctl_setattr_xflags(tp, ip, fa); if (code) goto error_trans_cancel; @@ -1346,6 +1411,12 @@ xfs_ioctl_setattr( ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog; else ip->i_d.di_extsize = 0; + if (ip->i_d.di_version == 3 && + (ip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE)) + ip->i_d.di_cowextsize = fa->fsx_cowextsize 
>> + mp->m_sb.sb_blocklog; + else + ip->i_d.di_cowextsize = 0; code = xfs_trans_commit(tp); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index c08253e11545..d907eb9f8ef3 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -39,6 +39,7 @@ #include "xfs_quota.h" #include "xfs_dquot_item.h" #include "xfs_dquot.h" +#include "xfs_reflink.h" #define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ @@ -70,7 +71,7 @@ xfs_bmbt_to_iomap( iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip)); } -static xfs_extlen_t +xfs_extlen_t xfs_eof_alignment( struct xfs_inode *ip, xfs_extlen_t extsize) @@ -609,7 +610,7 @@ xfs_file_iomap_begin_delay( } retry: - error = xfs_bmapi_reserve_delalloc(ip, offset_fsb, + error = xfs_bmapi_reserve_delalloc(ip, XFS_DATA_FORK, offset_fsb, end_fsb - offset_fsb, &got, &prev, &idx, eof); switch (error) { @@ -666,6 +667,7 @@ out_unlock: int xfs_iomap_write_allocate( xfs_inode_t *ip, + int whichfork, xfs_off_t offset, xfs_bmbt_irec_t *imap) { @@ -678,8 +680,12 @@ xfs_iomap_write_allocate( xfs_trans_t *tp; int nimaps; int error = 0; + int flags = 0; int nres; + if (whichfork == XFS_COW_FORK) + flags |= XFS_BMAPI_COWFORK; + /* * Make sure that the dquots are there. */ @@ -773,7 +779,7 @@ xfs_iomap_write_allocate( * pointer that the caller gave to us. 
*/ error = xfs_bmapi_write(tp, ip, map_start_fsb, - count_fsb, 0, &first_block, + count_fsb, flags, &first_block, nres, imap, &nimaps, &dfops); if (error) @@ -955,14 +961,22 @@ xfs_file_iomap_begin( struct xfs_mount *mp = ip->i_mount; struct xfs_bmbt_irec imap; xfs_fileoff_t offset_fsb, end_fsb; + bool shared, trimmed; int nimaps = 1, error = 0; unsigned lockmode; if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - if ((flags & IOMAP_WRITE) && - !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) { + if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) { + error = xfs_reflink_reserve_cow_range(ip, offset, length); + if (error < 0) + return error; + } + + if ((flags & IOMAP_WRITE) && !IS_DAX(inode) && + !xfs_get_extsz_hint(ip)) { + /* Reserve delalloc blocks for regular writeback. */ return xfs_file_iomap_begin_delay(inode, offset, length, flags, iomap); } @@ -976,7 +990,14 @@ xfs_file_iomap_begin( end_fsb = XFS_B_TO_FSB(mp, offset + length); error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, - &nimaps, XFS_BMAPI_ENTIRE); + &nimaps, 0); + if (error) { + xfs_iunlock(ip, lockmode); + return error; + } + + /* Trim the mapping to the nearest shared extent boundary. 
*/ + error = xfs_reflink_trim_around_shared(ip, &imap, &shared, &trimmed); if (error) { xfs_iunlock(ip, lockmode); return error; @@ -1015,6 +1036,8 @@ xfs_file_iomap_begin( } xfs_bmbt_to_iomap(ip, iomap, &imap); + if (shared) + iomap->flags |= IOMAP_F_SHARED; return 0; } diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 6498be485932..6d45cf01fcff 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -25,12 +25,13 @@ struct xfs_bmbt_irec; int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, struct xfs_bmbt_irec *, int); -int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, +int xfs_iomap_write_allocate(struct xfs_inode *, int, xfs_off_t, struct xfs_bmbt_irec *); int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t); void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, struct xfs_bmbt_irec *); +xfs_extlen_t xfs_eof_alignment(struct xfs_inode *ip, xfs_extlen_t extsize); extern struct iomap_ops xfs_iomap_ops; extern struct iomap_ops xfs_xattr_iomap_ops; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index c5da95eb79b8..405a65cd9d6b 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1159,6 +1159,7 @@ xfs_diflags_to_iflags( inode->i_flags |= S_NOATIME; if (S_ISREG(inode->i_mode) && ip->i_mount->m_sb.sb_blocksize == PAGE_SIZE && + !xfs_is_reflink_inode(ip) && (ip->i_mount->m_flags & XFS_MOUNT_DAX || ip->i_d.di_flags2 & XFS_DIFLAG2_DAX)) inode->i_flags |= S_DAX; diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index ce73eb34620d..66e881790c17 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -66,7 +66,7 @@ xfs_bulkstat_one_int( if (!buffer || xfs_internal_inum(mp, ino)) return -EINVAL; - buf = kmem_alloc(sizeof(*buf), KM_SLEEP | KM_MAYFAIL); + buf = kmem_zalloc(sizeof(*buf), KM_SLEEP | KM_MAYFAIL); if (!buf) return -ENOMEM; @@ -111,6 +111,12 @@ xfs_bulkstat_one_int( buf->bs_aextents = dic->di_anextents; buf->bs_forkoff = XFS_IFORK_BOFF(ip); + if (dic->di_version == 3) { + if 
(dic->di_flags2 & XFS_DIFLAG2_COWEXTSIZE) + buf->bs_cowextsize = dic->di_cowextsize << + mp->m_sb.sb_blocklog; + } + switch (dic->di_format) { case XFS_DINODE_FMT_DEV: buf->bs_rdev = ip->i_df.if_u2.if_rdev; diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index b8d64d520e12..68640fb63a54 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -116,6 +116,7 @@ typedef __u32 xfs_nlink_t; #define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val #define xfs_fstrm_centisecs xfs_params.fstrm_timer.val #define xfs_eofb_secs xfs_params.eofb_timer.val +#define xfs_cowb_secs xfs_params.cowb_timer.val #define current_cpu() (raw_smp_processor_id()) #define current_pid() (current->pid) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 846483d56949..9b3d7c76915d 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -45,6 +45,8 @@ #include "xfs_dir2.h" #include "xfs_rmap_item.h" #include "xfs_buf_item.h" +#include "xfs_refcount_item.h" +#include "xfs_bmap_item.h" #define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1) @@ -1924,6 +1926,10 @@ xlog_recover_reorder_trans( case XFS_LI_EFI: case XFS_LI_RUI: case XFS_LI_RUD: + case XFS_LI_CUI: + case XFS_LI_CUD: + case XFS_LI_BUI: + case XFS_LI_BUD: trace_xfs_log_recover_item_reorder_tail(log, trans, item, pass); list_move_tail(&item->ri_list, &inode_list); @@ -2242,6 +2248,7 @@ xlog_recover_get_buf_lsn( case XFS_ABTB_MAGIC: case XFS_ABTC_MAGIC: case XFS_RMAP_CRC_MAGIC: + case XFS_REFC_CRC_MAGIC: case XFS_IBT_CRC_MAGIC: case XFS_IBT_MAGIC: { struct xfs_btree_block *btb = blk; @@ -2415,6 +2422,9 @@ xlog_recover_validate_buf_type( case XFS_RMAP_CRC_MAGIC: bp->b_ops = &xfs_rmapbt_buf_ops; break; + case XFS_REFC_CRC_MAGIC: + bp->b_ops = &xfs_refcountbt_buf_ops; + break; default: warnmsg = "Bad btree block magic!"; break; @@ -3547,6 +3557,242 @@ xlog_recover_rud_pass2( } /* + * Copy a CUI format buffer from the given buf, and into the destination + * CUI format structure. 
The CUI/CUD items were designed not to need any + * special alignment handling. + */ +static int +xfs_cui_copy_format( + struct xfs_log_iovec *buf, + struct xfs_cui_log_format *dst_cui_fmt) +{ + struct xfs_cui_log_format *src_cui_fmt; + uint len; + + src_cui_fmt = buf->i_addr; + len = xfs_cui_log_format_sizeof(src_cui_fmt->cui_nextents); + + if (buf->i_len == len) { + memcpy(dst_cui_fmt, src_cui_fmt, len); + return 0; + } + return -EFSCORRUPTED; +} + +/* + * This routine is called to create an in-core extent refcount update + * item from the cui format structure which was logged on disk. + * It allocates an in-core cui, copies the extents from the format + * structure into it, and adds the cui to the AIL with the given + * LSN. + */ +STATIC int +xlog_recover_cui_pass2( + struct xlog *log, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + int error; + struct xfs_mount *mp = log->l_mp; + struct xfs_cui_log_item *cuip; + struct xfs_cui_log_format *cui_formatp; + + cui_formatp = item->ri_buf[0].i_addr; + + cuip = xfs_cui_init(mp, cui_formatp->cui_nextents); + error = xfs_cui_copy_format(&item->ri_buf[0], &cuip->cui_format); + if (error) { + xfs_cui_item_free(cuip); + return error; + } + atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents); + + spin_lock(&log->l_ailp->xa_lock); + /* + * The CUI has two references. One for the CUD and one for CUI to ensure + * it makes it into the AIL. Insert the CUI into the AIL directly and + * drop the CUI reference. Note that xfs_trans_ail_update() drops the + * AIL lock. + */ + xfs_trans_ail_update(log->l_ailp, &cuip->cui_item, lsn); + xfs_cui_release(cuip); + return 0; +} + + +/* + * This routine is called when an CUD format structure is found in a committed + * transaction in the log. Its purpose is to cancel the corresponding CUI if it + * was still in the log. To do this it searches the AIL for the CUI with an id + * equal to that in the CUD format structure. 
If we find it we drop the CUD + * reference, which removes the CUI from the AIL and frees it. + */ +STATIC int +xlog_recover_cud_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + struct xfs_cud_log_format *cud_formatp; + struct xfs_cui_log_item *cuip = NULL; + struct xfs_log_item *lip; + __uint64_t cui_id; + struct xfs_ail_cursor cur; + struct xfs_ail *ailp = log->l_ailp; + + cud_formatp = item->ri_buf[0].i_addr; + if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) + return -EFSCORRUPTED; + cui_id = cud_formatp->cud_cui_id; + + /* + * Search for the CUI with the id in the CUD format structure in the + * AIL. + */ + spin_lock(&ailp->xa_lock); + lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); + while (lip != NULL) { + if (lip->li_type == XFS_LI_CUI) { + cuip = (struct xfs_cui_log_item *)lip; + if (cuip->cui_format.cui_id == cui_id) { + /* + * Drop the CUD reference to the CUI. This + * removes the CUI from the AIL and frees it. + */ + spin_unlock(&ailp->xa_lock); + xfs_cui_release(cuip); + spin_lock(&ailp->xa_lock); + break; + } + } + lip = xfs_trans_ail_cursor_next(ailp, &cur); + } + + xfs_trans_ail_cursor_done(&cur); + spin_unlock(&ailp->xa_lock); + + return 0; +} + +/* + * Copy a BUI format buffer from the given buf, and into the destination + * BUI format structure. The BUI/BUD items were designed not to need any + * special alignment handling. + */ +static int +xfs_bui_copy_format( + struct xfs_log_iovec *buf, + struct xfs_bui_log_format *dst_bui_fmt) +{ + struct xfs_bui_log_format *src_bui_fmt; + uint len; + + src_bui_fmt = buf->i_addr; + len = xfs_bui_log_format_sizeof(src_bui_fmt->bui_nextents); + + if (buf->i_len == len) { + memcpy(dst_bui_fmt, src_bui_fmt, len); + return 0; + } + return -EFSCORRUPTED; +} + +/* + * This routine is called to create an in-core extent bmap update + * item from the bui format structure which was logged on disk. 
+ * It allocates an in-core bui, copies the extents from the format + * structure into it, and adds the bui to the AIL with the given + * LSN. + */ +STATIC int +xlog_recover_bui_pass2( + struct xlog *log, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + int error; + struct xfs_mount *mp = log->l_mp; + struct xfs_bui_log_item *buip; + struct xfs_bui_log_format *bui_formatp; + + bui_formatp = item->ri_buf[0].i_addr; + + /* + * Only a BUI with exactly XFS_BUI_MAX_FAST_EXTENTS extents is accepted; + * xfs_bui_init() below takes no extent count. + */ + if (bui_formatp->bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) + return -EFSCORRUPTED; + buip = xfs_bui_init(mp); + error = xfs_bui_copy_format(&item->ri_buf[0], &buip->bui_format); + if (error) { + xfs_bui_item_free(buip); + return error; + } + atomic_set(&buip->bui_next_extent, bui_formatp->bui_nextents); + + spin_lock(&log->l_ailp->xa_lock); + /* + * The BUI has two references. One for the BUD and one for BUI to ensure + * it makes it into the AIL. Insert the BUI into the AIL directly and + * drop the BUI reference. Note that xfs_trans_ail_update() drops the + * AIL lock. + */ + xfs_trans_ail_update(log->l_ailp, &buip->bui_item, lsn); + xfs_bui_release(buip); + return 0; +} + + +/* + * This routine is called when a BUD format structure is found in a committed + * transaction in the log. Its purpose is to cancel the corresponding BUI if it + * was still in the log. To do this it searches the AIL for the BUI with an id + * equal to that in the BUD format structure. If we find it we drop the BUD + * reference, which removes the BUI from the AIL and frees it. 
+ */ +STATIC int +xlog_recover_bud_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + struct xfs_bud_log_format *bud_formatp; + struct xfs_bui_log_item *buip = NULL; + struct xfs_log_item *lip; + __uint64_t bui_id; + struct xfs_ail_cursor cur; + struct xfs_ail *ailp = log->l_ailp; + + bud_formatp = item->ri_buf[0].i_addr; + if (item->ri_buf[0].i_len != sizeof(struct xfs_bud_log_format)) + return -EFSCORRUPTED; + bui_id = bud_formatp->bud_bui_id; + + /* + * Search for the BUI with the id in the BUD format structure in the + * AIL. + */ + spin_lock(&ailp->xa_lock); + lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); + while (lip != NULL) { + if (lip->li_type == XFS_LI_BUI) { + buip = (struct xfs_bui_log_item *)lip; + if (buip->bui_format.bui_id == bui_id) { + /* + * Drop the BUD reference to the BUI. This + * removes the BUI from the AIL and frees it. + */ + spin_unlock(&ailp->xa_lock); + xfs_bui_release(buip); + spin_lock(&ailp->xa_lock); + break; + } + } + lip = xfs_trans_ail_cursor_next(ailp, &cur); + } + + xfs_trans_ail_cursor_done(&cur); + spin_unlock(&ailp->xa_lock); + + return 0; +} + +/* * This routine is called when an inode create format structure is found in a * committed transaction in the log. It's purpose is to initialise the inodes * being allocated on disk. 
This requires us to get inode cluster buffers that @@ -3773,6 +4019,10 @@ xlog_recover_ra_pass2( case XFS_LI_QUOTAOFF: case XFS_LI_RUI: case XFS_LI_RUD: + case XFS_LI_CUI: + case XFS_LI_CUD: + case XFS_LI_BUI: + case XFS_LI_BUD: default: break; } @@ -3798,6 +4048,10 @@ xlog_recover_commit_pass1( case XFS_LI_ICREATE: case XFS_LI_RUI: case XFS_LI_RUD: + case XFS_LI_CUI: + case XFS_LI_CUD: + case XFS_LI_BUI: + case XFS_LI_BUD: /* nothing to do in pass 1 */ return 0; default: @@ -3832,6 +4086,14 @@ xlog_recover_commit_pass2( return xlog_recover_rui_pass2(log, item, trans->r_lsn); case XFS_LI_RUD: return xlog_recover_rud_pass2(log, item); + case XFS_LI_CUI: + return xlog_recover_cui_pass2(log, item, trans->r_lsn); + case XFS_LI_CUD: + return xlog_recover_cud_pass2(log, item); + case XFS_LI_BUI: + return xlog_recover_bui_pass2(log, item, trans->r_lsn); + case XFS_LI_BUD: + return xlog_recover_bud_pass2(log, item); case XFS_LI_DQUOT: return xlog_recover_dquot_pass2(log, buffer_list, item, trans->r_lsn); @@ -4419,12 +4681,94 @@ xlog_recover_cancel_rui( spin_lock(&ailp->xa_lock); } +/* Recover the CUI if necessary. */ +STATIC int +xlog_recover_process_cui( + struct xfs_mount *mp, + struct xfs_ail *ailp, + struct xfs_log_item *lip) +{ + struct xfs_cui_log_item *cuip; + int error; + + /* + * Skip CUIs that we've already processed. + */ + cuip = container_of(lip, struct xfs_cui_log_item, cui_item); + if (test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags)) + return 0; + + spin_unlock(&ailp->xa_lock); + error = xfs_cui_recover(mp, cuip); + spin_lock(&ailp->xa_lock); + + return error; +} + +/* Release the CUI since we're cancelling everything. */ +STATIC void +xlog_recover_cancel_cui( + struct xfs_mount *mp, + struct xfs_ail *ailp, + struct xfs_log_item *lip) +{ + struct xfs_cui_log_item *cuip; + + cuip = container_of(lip, struct xfs_cui_log_item, cui_item); + + spin_unlock(&ailp->xa_lock); + xfs_cui_release(cuip); + spin_lock(&ailp->xa_lock); +} + +/* Recover the BUI if necessary. 
*/ +STATIC int +xlog_recover_process_bui( + struct xfs_mount *mp, + struct xfs_ail *ailp, + struct xfs_log_item *lip) +{ + struct xfs_bui_log_item *buip; + int error; + + /* + * Skip BUIs that we've already processed. + */ + buip = container_of(lip, struct xfs_bui_log_item, bui_item); + if (test_bit(XFS_BUI_RECOVERED, &buip->bui_flags)) + return 0; + + spin_unlock(&ailp->xa_lock); + error = xfs_bui_recover(mp, buip); + spin_lock(&ailp->xa_lock); + + return error; +} + +/* Release the BUI since we're cancelling everything. */ +STATIC void +xlog_recover_cancel_bui( + struct xfs_mount *mp, + struct xfs_ail *ailp, + struct xfs_log_item *lip) +{ + struct xfs_bui_log_item *buip; + + buip = container_of(lip, struct xfs_bui_log_item, bui_item); + + spin_unlock(&ailp->xa_lock); + xfs_bui_release(buip); + spin_lock(&ailp->xa_lock); +} + /* Is this log item a deferred action intent? */ static inline bool xlog_item_is_intent(struct xfs_log_item *lip) { switch (lip->li_type) { case XFS_LI_EFI: case XFS_LI_RUI: + case XFS_LI_CUI: + case XFS_LI_BUI: return true; default: return false; @@ -4488,6 +4832,12 @@ xlog_recover_process_intents( case XFS_LI_RUI: error = xlog_recover_process_rui(log->l_mp, ailp, lip); break; + case XFS_LI_CUI: + error = xlog_recover_process_cui(log->l_mp, ailp, lip); + break; + case XFS_LI_BUI: + error = xlog_recover_process_bui(log->l_mp, ailp, lip); + break; } if (error) goto out; @@ -4535,6 +4885,12 @@ xlog_recover_cancel_intents( case XFS_LI_RUI: xlog_recover_cancel_rui(log->l_mp, ailp, lip); break; + case XFS_LI_CUI: + xlog_recover_cancel_cui(log->l_mp, ailp, lip); + break; + case XFS_LI_BUI: + xlog_recover_cancel_bui(log->l_mp, ailp, lip); + break; } lip = xfs_trans_ail_cursor_next(ailp, &cur); @@ -4613,6 +4969,7 @@ xlog_recover_process_one_iunlink( if (error) goto fail_iput; + xfs_iflags_clear(ip, XFS_IRECOVERY); ASSERT(VFS_I(ip)->i_nlink == 0); ASSERT(VFS_I(ip)->i_mode != 0); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 
56e85a6c85c7..fc7873942bea 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -43,6 +43,8 @@ #include "xfs_icache.h" #include "xfs_sysfs.h" #include "xfs_rmap_btree.h" +#include "xfs_refcount_btree.h" +#include "xfs_reflink.h" static DEFINE_MUTEX(xfs_uuid_table_mutex); @@ -684,6 +686,7 @@ xfs_mountfs( xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK); xfs_ialloc_compute_maxlevels(mp); xfs_rmapbt_compute_maxlevels(mp); + xfs_refcountbt_compute_maxlevels(mp); xfs_set_maxicount(mp); @@ -923,6 +926,15 @@ xfs_mountfs( } /* + * During the second phase of log recovery, we need iget and + * iput to behave like they do for an active filesystem. + * xfs_fs_drop_inode needs to be able to prevent the deletion + * of inodes before we're done replaying log items on those + * inodes. + */ + mp->m_super->s_flags |= MS_ACTIVE; + + /* * Finish recovering the file system. This part needed to be delayed * until after the root and real-time bitmap inodes were consistently * read in. @@ -974,10 +986,28 @@ xfs_mountfs( if (error) xfs_warn(mp, "Unable to allocate reserve blocks. Continuing without reserve pool."); + + /* Recover any CoW blocks that never got remapped. */ + error = xfs_reflink_recover_cow(mp); + if (error) { + xfs_err(mp, + "Error %d recovering leftover CoW allocations.", error); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + goto out_quota; + } + + /* Reserve AG blocks for future btree expansion. 
*/ + error = xfs_fs_reserve_ag_blocks(mp); + if (error && error != -ENOSPC) + goto out_agresv; } return 0; + out_agresv: + xfs_fs_unreserve_ag_blocks(mp); + out_quota: + xfs_qm_unmount_quotas(mp); out_rtunmount: xfs_rtunmount_inodes(mp); out_rele_rip: @@ -1019,7 +1049,9 @@ xfs_unmountfs( int error; cancel_delayed_work_sync(&mp->m_eofblocks_work); + cancel_delayed_work_sync(&mp->m_cowblocks_work); + xfs_fs_unreserve_ag_blocks(mp); xfs_qm_unmount_quotas(mp); xfs_rtunmount_inodes(mp); IRELE(mp->m_rootip); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 041d9493e798..819b80b15bfb 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -124,10 +124,13 @@ typedef struct xfs_mount { uint m_inobt_mnr[2]; /* min inobt btree records */ uint m_rmap_mxr[2]; /* max rmap btree records */ uint m_rmap_mnr[2]; /* min rmap btree records */ + uint m_refc_mxr[2]; /* max refc btree records */ + uint m_refc_mnr[2]; /* min refc btree records */ uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */ uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */ uint m_in_maxlevels; /* max inobt btree levels. */ uint m_rmap_maxlevels; /* max rmap btree levels */ + uint m_refc_maxlevels; /* max refcount btree level */ xfs_extlen_t m_ag_prealloc_blocks; /* reserved ag blocks */ uint m_alloc_set_aside; /* space we can't use */ uint m_ag_max_usable; /* max space per AG */ @@ -161,6 +164,8 @@ typedef struct xfs_mount { struct delayed_work m_reclaim_work; /* background inode reclaim */ struct delayed_work m_eofblocks_work; /* background eof blocks trimming */ + struct delayed_work m_cowblocks_work; /* background cow blocks + trimming */ bool m_update_sb; /* sb needs update in mount */ int64_t m_low_space[XFS_LOWSP_MAX]; /* low free space thresholds */ @@ -399,6 +404,9 @@ typedef struct xfs_perag { struct xfs_ag_resv pag_meta_resv; /* Blocks reserved for just AGFL-based metadata. 
*/ struct xfs_ag_resv pag_agfl_resv; + + /* reference count */ + __uint8_t pagf_refcount_level; } xfs_perag_t; static inline struct xfs_ag_resv * diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index 69e2986a3776..0c381d71b242 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -49,6 +49,8 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_dsymlink_hdr, 56); XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_key, 4); XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_rec, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_refcount_key, 4); + XFS_CHECK_STRUCT_SIZE(struct xfs_refcount_rec, 12); XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_key, 20); XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_rec, 24); XFS_CHECK_STRUCT_SIZE(struct xfs_timestamp, 8); @@ -56,6 +58,7 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(xfs_alloc_ptr_t, 4); XFS_CHECK_STRUCT_SIZE(xfs_alloc_rec_t, 8); XFS_CHECK_STRUCT_SIZE(xfs_inobt_ptr_t, 4); + XFS_CHECK_STRUCT_SIZE(xfs_refcount_ptr_t, 4); XFS_CHECK_STRUCT_SIZE(xfs_rmap_ptr_t, 4); /* dir/attr trees */ diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 0f14b2e4bf6c..93a7aafa56d6 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -114,6 +114,13 @@ xfs_fs_map_blocks( return -ENXIO; /* + * The pNFS block layout spec actually supports reflink like + * functionality, but the Linux pNFS server doesn't implement it yet. + */ + if (xfs_is_reflink_inode(ip)) + return -ENXIO; + + /* * Lock out any other I/O before we flush and invalidate the pagecache, * and then hand out a layout to the remote system. This is very * similar to direct I/O, except that the synchronization is much more diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c new file mode 100644 index 000000000000..fe86a668a57e --- /dev/null +++ b/fs/xfs/xfs_refcount_item.c @@ -0,0 +1,539 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. 
Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_buf_item.h" +#include "xfs_refcount_item.h" +#include "xfs_log.h" +#include "xfs_refcount.h" + + +kmem_zone_t *xfs_cui_zone; +kmem_zone_t *xfs_cud_zone; + +static inline struct xfs_cui_log_item *CUI_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_cui_log_item, cui_item); +} + +void +xfs_cui_item_free( + struct xfs_cui_log_item *cuip) +{ + if (cuip->cui_format.cui_nextents > XFS_CUI_MAX_FAST_EXTENTS) + kmem_free(cuip); + else + kmem_zone_free(xfs_cui_zone, cuip); +} + +STATIC void +xfs_cui_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + struct xfs_cui_log_item *cuip = CUI_ITEM(lip); + + *nvecs += 1; + *nbytes += xfs_cui_log_format_sizeof(cuip->cui_format.cui_nextents); +} + +/* + * This is called to fill in the vector of log iovecs for the + * given cui log item. We use only 1 iovec, and we point that + * at the cui_log_format structure embedded in the cui item. 
+ * It is at this point that we assert that all of the extent + * slots in the cui item have been filled. + */ +STATIC void +xfs_cui_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_cui_log_item *cuip = CUI_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; + + ASSERT(atomic_read(&cuip->cui_next_extent) == + cuip->cui_format.cui_nextents); + + cuip->cui_format.cui_type = XFS_LI_CUI; + cuip->cui_format.cui_size = 1; + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_CUI_FORMAT, &cuip->cui_format, + xfs_cui_log_format_sizeof(cuip->cui_format.cui_nextents)); +} + +/* + * Pinning has no meaning for an cui item, so just return. + */ +STATIC void +xfs_cui_item_pin( + struct xfs_log_item *lip) +{ +} + +/* + * The unpin operation is the last place an CUI is manipulated in the log. It is + * either inserted in the AIL or aborted in the event of a log I/O error. In + * either case, the CUI transaction has been successfully committed to make it + * this far. Therefore, we expect whoever committed the CUI to either construct + * and commit the CUD or drop the CUD's reference in the event of error. Simply + * drop the log's CUI reference now that the log is done with it. + */ +STATIC void +xfs_cui_item_unpin( + struct xfs_log_item *lip, + int remove) +{ + struct xfs_cui_log_item *cuip = CUI_ITEM(lip); + + xfs_cui_release(cuip); +} + +/* + * CUI items have no locking or pushing. However, since CUIs are pulled from + * the AIL when their corresponding CUDs are committed to disk, their situation + * is very similar to being pinned. Return XFS_ITEM_PINNED so that the caller + * will eventually flush the log. This should help in getting the CUI out of + * the AIL. + */ +STATIC uint +xfs_cui_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) +{ + return XFS_ITEM_PINNED; +} + +/* + * The CUI has been either committed or aborted if the transaction has been + * cancelled. 
If the transaction was cancelled, an CUD isn't going to be + * constructed and thus we free the CUI here directly. + */ +STATIC void +xfs_cui_item_unlock( + struct xfs_log_item *lip) +{ + if (lip->li_flags & XFS_LI_ABORTED) + xfs_cui_item_free(CUI_ITEM(lip)); +} + +/* + * The CUI is logged only once and cannot be moved in the log, so simply return + * the lsn at which it's been logged. + */ +STATIC xfs_lsn_t +xfs_cui_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + return lsn; +} + +/* + * The CUI dependency tracking op doesn't do squat. It can't because + * it doesn't know where the free extent is coming from. The dependency + * tracking has to be handled by the "enclosing" metadata object. For + * example, for inodes, the inode is locked throughout the extent freeing + * so the dependency should be recorded there. + */ +STATIC void +xfs_cui_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ +} + +/* + * This is the ops vector shared by all cui log items. + */ +static const struct xfs_item_ops xfs_cui_item_ops = { + .iop_size = xfs_cui_item_size, + .iop_format = xfs_cui_item_format, + .iop_pin = xfs_cui_item_pin, + .iop_unpin = xfs_cui_item_unpin, + .iop_unlock = xfs_cui_item_unlock, + .iop_committed = xfs_cui_item_committed, + .iop_push = xfs_cui_item_push, + .iop_committing = xfs_cui_item_committing, +}; + +/* + * Allocate and initialize an cui item with the given number of extents. 
+ */ +struct xfs_cui_log_item * +xfs_cui_init( + struct xfs_mount *mp, + uint nextents) + +{ + struct xfs_cui_log_item *cuip; + + ASSERT(nextents > 0); + if (nextents > XFS_CUI_MAX_FAST_EXTENTS) + cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents), + KM_SLEEP); + else + cuip = kmem_zone_zalloc(xfs_cui_zone, KM_SLEEP); + + xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops); + cuip->cui_format.cui_nextents = nextents; + cuip->cui_format.cui_id = (uintptr_t)(void *)cuip; + atomic_set(&cuip->cui_next_extent, 0); + atomic_set(&cuip->cui_refcount, 2); + + return cuip; +} + +/* + * Freeing the CUI requires that we remove it from the AIL if it has already + * been placed there. However, the CUI may not yet have been placed in the AIL + * when called by xfs_cui_release() from CUD processing due to the ordering of + * committed vs unpin operations in bulk insert operations. Hence the reference + * count to ensure only the last caller frees the CUI. + */ +void +xfs_cui_release( + struct xfs_cui_log_item *cuip) +{ + if (atomic_dec_and_test(&cuip->cui_refcount)) { + xfs_trans_ail_remove(&cuip->cui_item, SHUTDOWN_LOG_IO_ERROR); + xfs_cui_item_free(cuip); + } +} + +static inline struct xfs_cud_log_item *CUD_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_cud_log_item, cud_item); +} + +STATIC void +xfs_cud_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + *nvecs += 1; + *nbytes += sizeof(struct xfs_cud_log_format); +} + +/* + * This is called to fill in the vector of log iovecs for the + * given cud log item. We use only 1 iovec, and we point that + * at the cud_log_format structure embedded in the cud item. + * It is at this point that we assert that all of the extent + * slots in the cud item have been filled. 
+ */ +STATIC void +xfs_cud_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_cud_log_item *cudp = CUD_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; + + cudp->cud_format.cud_type = XFS_LI_CUD; + cudp->cud_format.cud_size = 1; + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_CUD_FORMAT, &cudp->cud_format, + sizeof(struct xfs_cud_log_format)); +} + +/* + * Pinning has no meaning for an cud item, so just return. + */ +STATIC void +xfs_cud_item_pin( + struct xfs_log_item *lip) +{ +} + +/* + * Since pinning has no meaning for an cud item, unpinning does + * not either. + */ +STATIC void +xfs_cud_item_unpin( + struct xfs_log_item *lip, + int remove) +{ +} + +/* + * There isn't much you can do to push on an cud item. It is simply stuck + * waiting for the log to be flushed to disk. + */ +STATIC uint +xfs_cud_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) +{ + return XFS_ITEM_PINNED; +} + +/* + * The CUD is either committed or aborted if the transaction is cancelled. If + * the transaction is cancelled, drop our reference to the CUI and free the + * CUD. + */ +STATIC void +xfs_cud_item_unlock( + struct xfs_log_item *lip) +{ + struct xfs_cud_log_item *cudp = CUD_ITEM(lip); + + if (lip->li_flags & XFS_LI_ABORTED) { + xfs_cui_release(cudp->cud_cuip); + kmem_zone_free(xfs_cud_zone, cudp); + } +} + +/* + * When the cud item is committed to disk, all we need to do is delete our + * reference to our partner cui item and then free ourselves. Since we're + * freeing ourselves we must return -1 to keep the transaction code from + * further referencing this item. + */ +STATIC xfs_lsn_t +xfs_cud_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + struct xfs_cud_log_item *cudp = CUD_ITEM(lip); + + /* + * Drop the CUI reference regardless of whether the CUD has been + * aborted. 
Once the CUD transaction is constructed, it is the sole + * responsibility of the CUD to release the CUI (even if the CUI is + * aborted due to log I/O error). + */ + xfs_cui_release(cudp->cud_cuip); + kmem_zone_free(xfs_cud_zone, cudp); + + return (xfs_lsn_t)-1; +} + +/* + * The CUD dependency tracking op doesn't do squat. It can't because + * it doesn't know where the free extent is coming from. The dependency + * tracking has to be handled by the "enclosing" metadata object. For + * example, for inodes, the inode is locked throughout the extent freeing + * so the dependency should be recorded there. + */ +STATIC void +xfs_cud_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ +} + +/* + * This is the ops vector shared by all cud log items. + */ +static const struct xfs_item_ops xfs_cud_item_ops = { + .iop_size = xfs_cud_item_size, + .iop_format = xfs_cud_item_format, + .iop_pin = xfs_cud_item_pin, + .iop_unpin = xfs_cud_item_unpin, + .iop_unlock = xfs_cud_item_unlock, + .iop_committed = xfs_cud_item_committed, + .iop_push = xfs_cud_item_push, + .iop_committing = xfs_cud_item_committing, +}; + +/* + * Allocate and initialize an cud item with the given number of extents. + */ +struct xfs_cud_log_item * +xfs_cud_init( + struct xfs_mount *mp, + struct xfs_cui_log_item *cuip) + +{ + struct xfs_cud_log_item *cudp; + + cudp = kmem_zone_zalloc(xfs_cud_zone, KM_SLEEP); + xfs_log_item_init(mp, &cudp->cud_item, XFS_LI_CUD, &xfs_cud_item_ops); + cudp->cud_cuip = cuip; + cudp->cud_format.cud_cui_id = cuip->cui_format.cui_id; + + return cudp; +} + +/* + * Process a refcount update intent item that was recovered from the log. + * We need to update the refcountbt. 
+ * + * Every extent in the CUI is validated first; if any is bad the CUI is + * released and -EIO is returned. Otherwise the updates are replayed in a + * new transaction, any unfinished remainder is requeued on a local + * defer_ops, and the transaction is committed. Returns 0 on success or + * a negative error code. + */ +int +xfs_cui_recover( + struct xfs_mount *mp, + struct xfs_cui_log_item *cuip) +{ + int i; + int error = 0; + unsigned int refc_type; + struct xfs_phys_extent *refc; + xfs_fsblock_t startblock_fsb; + bool op_ok; + struct xfs_cud_log_item *cudp; + struct xfs_trans *tp; + struct xfs_btree_cur *rcur = NULL; + enum xfs_refcount_intent_type type; + xfs_fsblock_t firstfsb; + xfs_fsblock_t new_fsb; + xfs_extlen_t new_len; + struct xfs_bmbt_irec irec; + struct xfs_defer_ops dfops; + bool requeue_only = false; + + ASSERT(!test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags)); + + /* + * First check the validity of the extents described by the + * CUI. If any are bad, then assume that all are bad and + * just toss the CUI. + */ + for (i = 0; i < cuip->cui_format.cui_nextents; i++) { + refc = &cuip->cui_format.cui_extents[i]; + startblock_fsb = XFS_BB_TO_FSB(mp, + XFS_FSB_TO_DADDR(mp, refc->pe_startblock)); + switch (refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK) { + case XFS_REFCOUNT_INCREASE: + case XFS_REFCOUNT_DECREASE: + case XFS_REFCOUNT_ALLOC_COW: + case XFS_REFCOUNT_FREE_COW: + op_ok = true; + break; + default: + op_ok = false; + break; + } + if (!op_ok || startblock_fsb == 0 || + refc->pe_len == 0 || + startblock_fsb >= mp->m_sb.sb_dblocks || + refc->pe_len >= mp->m_sb.sb_agblocks || + (refc->pe_flags & ~XFS_REFCOUNT_EXTENT_FLAGS)) { + /* + * This will pull the CUI from the AIL and + * free the memory associated with it. + */ + set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags); + xfs_cui_release(cuip); + return -EIO; + } + } + + /* + * Under normal operation, refcount updates are deferred, so we + * wouldn't be adding them directly to a transaction. All + * refcount updates manage reservation usage internally and + * dynamically by deferring work that won't fit in the + * transaction. Normally, any work that needs to be deferred + * gets attached to the same defer_ops that scheduled the + * refcount update. 
However, we're in log recovery here, so we + * create our own defer_ops and use that to finish up any + * work that doesn't fit. + */ + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); + if (error) + return error; + cudp = xfs_trans_get_cud(tp, cuip); + + xfs_defer_init(&dfops, &firstfsb); + for (i = 0; i < cuip->cui_format.cui_nextents; i++) { + refc = &cuip->cui_format.cui_extents[i]; + refc_type = refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK; + switch (refc_type) { + case XFS_REFCOUNT_INCREASE: + case XFS_REFCOUNT_DECREASE: + case XFS_REFCOUNT_ALLOC_COW: + case XFS_REFCOUNT_FREE_COW: + type = refc_type; + break; + default: + error = -EFSCORRUPTED; + goto abort_error; + } + if (requeue_only) { + new_fsb = refc->pe_startblock; + new_len = refc->pe_len; + } else + error = xfs_trans_log_finish_refcount_update(tp, cudp, + &dfops, type, refc->pe_startblock, refc->pe_len, + &new_fsb, &new_len, &rcur); + if (error) + goto abort_error; + + /* Requeue what we didn't finish. 
*/ + if (new_len > 0) { + irec.br_startblock = new_fsb; + irec.br_blockcount = new_len; + switch (type) { + case XFS_REFCOUNT_INCREASE: + error = xfs_refcount_increase_extent( + tp->t_mountp, &dfops, &irec); + break; + case XFS_REFCOUNT_DECREASE: + error = xfs_refcount_decrease_extent( + tp->t_mountp, &dfops, &irec); + break; + case XFS_REFCOUNT_ALLOC_COW: + error = xfs_refcount_alloc_cow_extent( + tp->t_mountp, &dfops, + irec.br_startblock, + irec.br_blockcount); + break; + case XFS_REFCOUNT_FREE_COW: + error = xfs_refcount_free_cow_extent( + tp->t_mountp, &dfops, + irec.br_startblock, + irec.br_blockcount); + break; + default: + ASSERT(0); + } + if (error) + goto abort_error; + requeue_only = true; + } + } + + xfs_refcount_finish_one_cleanup(tp, rcur, error); + /* + * rcur has already been handed to xfs_refcount_finish_one_cleanup() + * above, so a failure from here on must not go through abort_error, + * which would pass the same cursor to that helper a second time. + */ + error = xfs_defer_finish(&tp, &dfops, NULL); + if (error) + goto abort_defer; + set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags); + error = xfs_trans_commit(tp); + return error; + +abort_error: + xfs_refcount_finish_one_cleanup(tp, rcur, error); +abort_defer: + xfs_defer_cancel(&dfops); + xfs_trans_cancel(tp); + return error; +} diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h new file mode 100644 index 000000000000..5b74dddfa64b --- /dev/null +++ b/fs/xfs/xfs_refcount_item.h @@ -0,0 +1,101 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#ifndef __XFS_REFCOUNT_ITEM_H__ +#define __XFS_REFCOUNT_ITEM_H__ + +/* + * There are (currently) four refcount btree redo item types: increase, + * decrease, CoW allocation and CoW free. The log items for these are CUI + * (refcount update intent) and CUD (refcount update done). The redo item + * type is encoded in the flags field of each xfs_phys_extent. + * + * *I items should be recorded in the *first* of a series of rolled + * transactions, and the *D items should be recorded in the same + * transaction that records the associated refcountbt updates. + * + * Should the system crash after the commit of the first transaction + * but before the commit of the final transaction in a series, log + * recovery will use the redo information recorded by the intent items + * to replay the refcountbt metadata updates. + */ + +/* kernel only CUI/CUD definitions */ + +struct xfs_mount; +struct kmem_zone; + +/* + * Max number of extents in fast allocation path. CUIs with more extents + * than this are allocated with kmem_zalloc() instead of from xfs_cui_zone + * (see xfs_cui_init()). + */ +#define XFS_CUI_MAX_FAST_EXTENTS 16 + +/* + * Define CUI flag bits. Manipulated by set/clear/test_bit operators. + * XFS_CUI_RECOVERED is set by xfs_cui_recover() once it has replayed or + * rejected the CUI. + */ +#define XFS_CUI_RECOVERED 1 + +/* + * This is the "refcount update intent" log item. It is used to log + * the fact that some refcount btree records need to change. It is used in + * conjunction with the "refcount update done" log item described + * below. + * + * These log items follow the same rules as struct xfs_efi_log_item; + * see the comments about that structure (in xfs_extfree_item.h) for + * more details. 
+ */ +struct xfs_cui_log_item { + struct xfs_log_item cui_item; + atomic_t cui_refcount; + atomic_t cui_next_extent; + unsigned long cui_flags; /* misc flags */ + struct xfs_cui_log_format cui_format; +}; + +static inline size_t +xfs_cui_log_item_sizeof( + unsigned int nr) +{ + return offsetof(struct xfs_cui_log_item, cui_format) + + xfs_cui_log_format_sizeof(nr); +} + +/* + * This is the "refcount update done" log item. It is used to log the + * fact that some refcountbt updates mentioned in an earlier cui item + * have been performed. + */ +struct xfs_cud_log_item { + struct xfs_log_item cud_item; + struct xfs_cui_log_item *cud_cuip; + struct xfs_cud_log_format cud_format; +}; + +extern struct kmem_zone *xfs_cui_zone; +extern struct kmem_zone *xfs_cud_zone; + +struct xfs_cui_log_item *xfs_cui_init(struct xfs_mount *, uint); +struct xfs_cud_log_item *xfs_cud_init(struct xfs_mount *, + struct xfs_cui_log_item *); +void xfs_cui_item_free(struct xfs_cui_log_item *); +void xfs_cui_release(struct xfs_cui_log_item *); +int xfs_cui_recover(struct xfs_mount *mp, struct xfs_cui_log_item *cuip); + +#endif /* __XFS_REFCOUNT_ITEM_H__ */ diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c new file mode 100644 index 000000000000..5965e9455d91 --- /dev/null +++ b/fs/xfs/xfs_reflink.c @@ -0,0 +1,1688 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_error.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_ioctl.h" +#include "xfs_trace.h" +#include "xfs_log.h" +#include "xfs_icache.h" +#include "xfs_pnfs.h" +#include "xfs_btree.h" +#include "xfs_refcount_btree.h" +#include "xfs_refcount.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" +#include "xfs_bit.h" +#include "xfs_alloc.h" +#include "xfs_quota_defs.h" +#include "xfs_quota.h" +#include "xfs_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_reflink.h" +#include "xfs_iomap.h" +#include "xfs_rmap_btree.h" +#include "xfs_sb.h" +#include "xfs_ag_resv.h" + +/* + * Copy on Write of Shared Blocks + * + * XFS must preserve "the usual" file semantics even when two files share + * the same physical blocks. This means that a write to one file must not + * alter the blocks in a different file; the way that we'll do that is + * through the use of a copy-on-write mechanism. At a high level, that + * means that when we want to write to a shared block, we allocate a new + * block, write the data to the new block, and if that succeeds we map the + * new block into the file. + * + * XFS provides a "delayed allocation" mechanism that defers the allocation + * of disk blocks to dirty-but-not-yet-mapped file blocks as long as + * possible. 
This reduces fragmentation by enabling the filesystem to ask + * for bigger chunks less often, which is exactly what we want for CoW. + * + * The delalloc mechanism begins when the kernel wants to make a block + * writable (write_begin or page_mkwrite). If the offset is not mapped, we + * create a delalloc mapping, which is a regular in-core extent, but without + * a real startblock. (For delalloc mappings, the startblock encodes both + * a flag that this is a delalloc mapping, and a worst-case estimate of how + * many blocks might be required to put the mapping into the BMBT.) delalloc + * mappings are a reservation against the free space in the filesystem; + * adjacent mappings can also be combined into fewer larger mappings. + * + * When dirty pages are being written out (typically in writepage), the + * delalloc reservations are converted into real mappings by allocating + * blocks and replacing the delalloc mapping with real ones. A delalloc + * mapping can be replaced by several real ones if the free space is + * fragmented. + * + * We want to adapt the delalloc mechanism for copy-on-write, since the + * write paths are similar. The first two steps (creating the reservation + * and allocating the blocks) are exactly the same as delalloc except that + * the mappings must be stored in a separate CoW fork because we do not want + * to disturb the mapping in the data fork until we're sure that the write + * succeeded. IO completion in this case is the process of removing the old + * mapping from the data fork and moving the new mapping from the CoW fork to + * the data fork. This will be discussed shortly. + * + * For now, unaligned directio writes will be bounced back to the page cache. + * Block-aligned directio writes will use the same mechanism as buffered + * writes. + * + * CoW remapping must be done after the data block write completes, + * because we don't want to destroy the old data fork map until we're sure + * the new block has been written. 
Since the new mappings are kept in a + * separate fork, we can simply iterate these mappings to find the ones + * that cover the file blocks that we just CoW'd. For each extent, simply + * unmap the corresponding range in the data fork, map the new range into + * the data fork, and remove the extent from the CoW fork. + * + * Since the remapping operation can be applied to an arbitrary file + * range, we record the need for the remap step as a flag in the ioend + * instead of declaring a new IO type. This is required for direct io + * because we only have ioend for the whole dio, and we have to be able to + * remember the presence of unwritten blocks and CoW blocks with a single + * ioend structure. Better yet, the more ground we can cover with one + * ioend, the better. + */ + +/* + * Given an AG extent, find the lowest-numbered run of shared blocks + * within that range and return the range in fbno/flen. If + * find_end_of_shared is true, return the longest contiguous extent of + * shared blocks. If there are no shared extents, fbno and flen will + * be set to NULLAGBLOCK and 0, respectively. + */ +int +xfs_reflink_find_shared( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agblock_t agbno, + xfs_extlen_t aglen, + xfs_agblock_t *fbno, + xfs_extlen_t *flen, + bool find_end_of_shared) +{ + struct xfs_buf *agbp; + struct xfs_btree_cur *cur; + int error; + + error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); + if (error) + return error; + + cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL); + + error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen, + find_end_of_shared); + + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + + xfs_buf_relse(agbp); + return error; +} + +/* + * Trim the mapping to the next block where there's a change in the + * shared/unshared status. More specifically, this means that we + * find the lowest-numbered extent of shared blocks that coincides with + * the given block mapping. 
If the shared extent overlaps the start of
 * the mapping, trim the mapping to the end of the shared extent.  If
 * the shared region intersects the mapping, trim the mapping to the
 * start of the shared extent.  If there are no shared regions that
 * overlap, just return the original extent.
 *
 * On a successful return *shared says whether the (possibly shortened)
 * mapping in irec is shared and *trimmed says whether irec->br_blockcount
 * was reduced.  Both are written on every successful return, including
 * the early exit for inodes/extents that cannot be shared.  On error the
 * outputs are left untouched and the error from the shared-block lookup
 * is returned.
 */
int
xfs_reflink_trim_around_shared(
	struct xfs_inode	*ip,
	struct xfs_bmbt_irec	*irec,
	bool			*shared,
	bool			*trimmed)
{
	xfs_agnumber_t		agno;
	xfs_agblock_t		agbno;
	xfs_extlen_t		aglen;
	xfs_agblock_t		fbno;
	xfs_extlen_t		flen;
	int			error = 0;

	/* Holes, unwritten, and delalloc extents cannot be shared */
	if (!xfs_is_reflink_inode(ip) ||
	    ISUNWRITTEN(irec) ||
	    irec->br_startblock == HOLESTARTBLOCK ||
	    irec->br_startblock == DELAYSTARTBLOCK) {
		/*
		 * Report "not trimmed" as well, so that a caller which did
		 * not pre-initialise its flag never reads an unset value.
		 */
		*shared = false;
		*trimmed = false;
		return 0;
	}

	trace_xfs_reflink_trim_around_shared(ip, irec);

	agno = XFS_FSB_TO_AGNO(ip->i_mount, irec->br_startblock);
	agbno = XFS_FSB_TO_AGBNO(ip->i_mount, irec->br_startblock);
	aglen = irec->br_blockcount;

	error = xfs_reflink_find_shared(ip->i_mount, agno, agbno,
			aglen, &fbno, &flen, true);
	if (error)
		return error;

	*shared = *trimmed = false;
	if (fbno == NULLAGBLOCK) {
		/* No shared blocks at all. */
		return 0;
	}

	if (fbno == agbno) {
		/*
		 * The start of this extent is shared.  Truncate the
		 * mapping at the end of the shared region so that a
		 * subsequent iteration starts at the start of the
		 * unshared region.
		 */
		irec->br_blockcount = flen;
		*shared = true;
		if (flen != aglen)
			*trimmed = true;
		return 0;
	}

	/*
	 * There's a shared extent midway through this extent.
	 * Truncate the mapping at the start of the shared
	 * extent so that a subsequent iteration starts at the
	 * start of the shared region.
	 */
	irec->br_blockcount = fbno - agbno;
	*trimmed = true;
	return 0;
}

/* Create a CoW reservation for a range of blocks within a file.
*/ +static int +__xfs_reflink_reserve_cow( + struct xfs_inode *ip, + xfs_fileoff_t *offset_fsb, + xfs_fileoff_t end_fsb, + bool *skipped) +{ + struct xfs_bmbt_irec got, prev, imap; + xfs_fileoff_t orig_end_fsb; + int nimaps, eof = 0, error = 0; + bool shared = false, trimmed = false; + xfs_extnum_t idx; + xfs_extlen_t align; + + /* Already reserved? Skip the refcount btree access. */ + xfs_bmap_search_extents(ip, *offset_fsb, XFS_COW_FORK, &eof, &idx, + &got, &prev); + if (!eof && got.br_startoff <= *offset_fsb) { + end_fsb = orig_end_fsb = got.br_startoff + got.br_blockcount; + trace_xfs_reflink_cow_found(ip, &got); + goto done; + } + + /* Read extent from the source file. */ + nimaps = 1; + error = xfs_bmapi_read(ip, *offset_fsb, end_fsb - *offset_fsb, + &imap, &nimaps, 0); + if (error) + goto out_unlock; + ASSERT(nimaps == 1); + + /* Trim the mapping to the nearest shared extent boundary. */ + error = xfs_reflink_trim_around_shared(ip, &imap, &shared, &trimmed); + if (error) + goto out_unlock; + + end_fsb = orig_end_fsb = imap.br_startoff + imap.br_blockcount; + + /* Not shared? Just report the (potentially capped) extent. */ + if (!shared) { + *skipped = true; + goto done; + } + + /* + * Fork all the shared blocks from our write offset until the end of + * the extent. 
+ */ + error = xfs_qm_dqattach_locked(ip, 0); + if (error) + goto out_unlock; + + align = xfs_eof_alignment(ip, xfs_get_cowextsz_hint(ip)); + if (align) + end_fsb = roundup_64(end_fsb, align); + +retry: + error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, *offset_fsb, + end_fsb - *offset_fsb, &got, + &prev, &idx, eof); + switch (error) { + case 0: + break; + case -ENOSPC: + case -EDQUOT: + /* retry without any preallocation */ + trace_xfs_reflink_cow_enospc(ip, &imap); + if (end_fsb != orig_end_fsb) { + end_fsb = orig_end_fsb; + goto retry; + } + /*FALLTHRU*/ + default: + goto out_unlock; + } + + if (end_fsb != orig_end_fsb) + xfs_inode_set_cowblocks_tag(ip); + + trace_xfs_reflink_cow_alloc(ip, &got); +done: + *offset_fsb = end_fsb; +out_unlock: + return error; +} + +/* Create a CoW reservation for part of a file. */ +int +xfs_reflink_reserve_cow_range( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t count) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb, end_fsb; + bool skipped = false; + int error; + + trace_xfs_reflink_reserve_cow_range(ip, offset, count); + + offset_fsb = XFS_B_TO_FSBT(mp, offset); + end_fsb = XFS_B_TO_FSB(mp, offset + count); + + xfs_ilock(ip, XFS_ILOCK_EXCL); + while (offset_fsb < end_fsb) { + error = __xfs_reflink_reserve_cow(ip, &offset_fsb, end_fsb, + &skipped); + if (error) { + trace_xfs_reflink_reserve_cow_range_error(ip, error, + _RET_IP_); + break; + } + } + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + return error; +} + +/* Allocate all CoW reservations covering a range of blocks in a file. 
*/ +static int +__xfs_reflink_allocate_cow( + struct xfs_inode *ip, + xfs_fileoff_t *offset_fsb, + xfs_fileoff_t end_fsb) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_bmbt_irec imap; + struct xfs_defer_ops dfops; + struct xfs_trans *tp; + xfs_fsblock_t first_block; + xfs_fileoff_t next_fsb; + int nimaps = 1, error; + bool skipped = false; + + xfs_defer_init(&dfops, &first_block); + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, + XFS_TRANS_RESERVE, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + + next_fsb = *offset_fsb; + error = __xfs_reflink_reserve_cow(ip, &next_fsb, end_fsb, &skipped); + if (error) + goto out_trans_cancel; + + if (skipped) { + *offset_fsb = next_fsb; + goto out_trans_cancel; + } + + xfs_trans_ijoin(tp, ip, 0); + error = xfs_bmapi_write(tp, ip, *offset_fsb, next_fsb - *offset_fsb, + XFS_BMAPI_COWFORK, &first_block, + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), + &imap, &nimaps, &dfops); + if (error) + goto out_trans_cancel; + + /* We might not have been able to map the whole delalloc extent */ + *offset_fsb = min(*offset_fsb + imap.br_blockcount, next_fsb); + + error = xfs_defer_finish(&tp, &dfops, NULL); + if (error) + goto out_trans_cancel; + + error = xfs_trans_commit(tp); + +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +out_trans_cancel: + xfs_defer_cancel(&dfops); + xfs_trans_cancel(tp); + goto out_unlock; +} + +/* Allocate all CoW reservations covering a part of a file. */ +int +xfs_reflink_allocate_cow_range( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t count) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); + int error; + + ASSERT(xfs_is_reflink_inode(ip)); + + trace_xfs_reflink_allocate_cow_range(ip, offset, count); + + /* + * Make sure that the dquots are there. 
+ */ + error = xfs_qm_dqattach(ip, 0); + if (error) + return error; + + while (offset_fsb < end_fsb) { + error = __xfs_reflink_allocate_cow(ip, &offset_fsb, end_fsb); + if (error) { + trace_xfs_reflink_allocate_cow_range_error(ip, error, + _RET_IP_); + break; + } + } + + return error; +} + +/* + * Find the CoW reservation (and whether or not it needs block allocation) + * for a given byte offset of a file. + */ +bool +xfs_reflink_find_cow_mapping( + struct xfs_inode *ip, + xfs_off_t offset, + struct xfs_bmbt_irec *imap, + bool *need_alloc) +{ + struct xfs_bmbt_irec irec; + struct xfs_ifork *ifp; + struct xfs_bmbt_rec_host *gotp; + xfs_fileoff_t bno; + xfs_extnum_t idx; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)); + ASSERT(xfs_is_reflink_inode(ip)); + + /* Find the extent in the CoW fork. */ + ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + bno = XFS_B_TO_FSBT(ip->i_mount, offset); + gotp = xfs_iext_bno_to_ext(ifp, bno, &idx); + if (!gotp) + return false; + + xfs_bmbt_get_all(gotp, &irec); + if (bno >= irec.br_startoff + irec.br_blockcount || + bno < irec.br_startoff) + return false; + + trace_xfs_reflink_find_cow_mapping(ip, offset, 1, XFS_IO_OVERWRITE, + &irec); + + /* If it's still delalloc, we must allocate later. */ + *imap = irec; + *need_alloc = !!(isnullstartblock(irec.br_startblock)); + + return true; +} + +/* + * Trim an extent to end at the next CoW reservation past offset_fsb. + */ +int +xfs_reflink_trim_irec_to_next_cow( + struct xfs_inode *ip, + xfs_fileoff_t offset_fsb, + struct xfs_bmbt_irec *imap) +{ + struct xfs_bmbt_irec irec; + struct xfs_ifork *ifp; + struct xfs_bmbt_rec_host *gotp; + xfs_extnum_t idx; + + if (!xfs_is_reflink_inode(ip)) + return 0; + + /* Find the extent in the CoW fork. */ + ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + gotp = xfs_iext_bno_to_ext(ifp, offset_fsb, &idx); + if (!gotp) + return 0; + xfs_bmbt_get_all(gotp, &irec); + + /* This is the extent before; try sliding up one. 
*/ + if (irec.br_startoff < offset_fsb) { + idx++; + if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) + return 0; + gotp = xfs_iext_get_ext(ifp, idx); + xfs_bmbt_get_all(gotp, &irec); + } + + if (irec.br_startoff >= imap->br_startoff + imap->br_blockcount) + return 0; + + imap->br_blockcount = irec.br_startoff - imap->br_startoff; + trace_xfs_reflink_trim_irec(ip, imap); + + return 0; +} + +/* + * Cancel all pending CoW reservations for some block range of an inode. + */ +int +xfs_reflink_cancel_cow_blocks( + struct xfs_inode *ip, + struct xfs_trans **tpp, + xfs_fileoff_t offset_fsb, + xfs_fileoff_t end_fsb) +{ + struct xfs_bmbt_irec irec; + xfs_filblks_t count_fsb; + xfs_fsblock_t firstfsb; + struct xfs_defer_ops dfops; + int error = 0; + int nimaps; + + if (!xfs_is_reflink_inode(ip)) + return 0; + + /* Go find the old extent in the CoW fork. */ + while (offset_fsb < end_fsb) { + nimaps = 1; + count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb); + error = xfs_bmapi_read(ip, offset_fsb, count_fsb, &irec, + &nimaps, XFS_BMAPI_COWFORK); + if (error) + break; + ASSERT(nimaps == 1); + + trace_xfs_reflink_cancel_cow(ip, &irec); + + if (irec.br_startblock == DELAYSTARTBLOCK) { + /* Free a delayed allocation. */ + xfs_mod_fdblocks(ip->i_mount, irec.br_blockcount, + false); + ip->i_delayed_blks -= irec.br_blockcount; + + /* Remove the mapping from the CoW fork. */ + error = xfs_bunmapi_cow(ip, &irec); + if (error) + break; + } else if (irec.br_startblock == HOLESTARTBLOCK) { + /* empty */ + } else { + xfs_trans_ijoin(*tpp, ip, 0); + xfs_defer_init(&dfops, &firstfsb); + + /* Free the CoW orphan record. 
*/ + error = xfs_refcount_free_cow_extent(ip->i_mount, + &dfops, irec.br_startblock, + irec.br_blockcount); + if (error) + break; + + xfs_bmap_add_free(ip->i_mount, &dfops, + irec.br_startblock, irec.br_blockcount, + NULL); + + /* Update quota accounting */ + xfs_trans_mod_dquot_byino(*tpp, ip, XFS_TRANS_DQ_BCOUNT, + -(long)irec.br_blockcount); + + /* Roll the transaction */ + error = xfs_defer_finish(tpp, &dfops, ip); + if (error) { + xfs_defer_cancel(&dfops); + break; + } + + /* Remove the mapping from the CoW fork. */ + error = xfs_bunmapi_cow(ip, &irec); + if (error) + break; + } + + /* Roll on... */ + offset_fsb = irec.br_startoff + irec.br_blockcount; + } + + return error; +} + +/* + * Cancel all pending CoW reservations for some byte range of an inode. + */ +int +xfs_reflink_cancel_cow_range( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t count) +{ + struct xfs_trans *tp; + xfs_fileoff_t offset_fsb; + xfs_fileoff_t end_fsb; + int error; + + trace_xfs_reflink_cancel_cow_range(ip, offset, count); + ASSERT(xfs_is_reflink_inode(ip)); + + offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); + if (count == NULLFILEOFF) + end_fsb = NULLFILEOFF; + else + end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count); + + /* Start a rolling transaction to remove the mappings */ + error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write, + 0, 0, 0, &tp); + if (error) + goto out; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + /* Scrape out the old CoW reservations */ + error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb); + if (error) + goto out_cancel; + + error = xfs_trans_commit(tp); + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; + +out_cancel: + xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); +out: + trace_xfs_reflink_cancel_cow_range_error(ip, error, _RET_IP_); + return error; +} + +/* + * Remap parts of a file's data fork after a successful CoW. 
+ */ +int +xfs_reflink_end_cow( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t count) +{ + struct xfs_bmbt_irec irec; + struct xfs_bmbt_irec uirec; + struct xfs_trans *tp; + xfs_fileoff_t offset_fsb; + xfs_fileoff_t end_fsb; + xfs_filblks_t count_fsb; + xfs_fsblock_t firstfsb; + struct xfs_defer_ops dfops; + int error; + unsigned int resblks; + xfs_filblks_t ilen; + xfs_filblks_t rlen; + int nimaps; + + trace_xfs_reflink_end_cow(ip, offset, count); + + offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); + end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count); + count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb); + + /* Start a rolling transaction to switch the mappings */ + resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK); + error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write, + resblks, 0, 0, &tp); + if (error) + goto out; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + /* Go find the old extent in the CoW fork. */ + while (offset_fsb < end_fsb) { + /* Read extent from the source file */ + nimaps = 1; + count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb); + error = xfs_bmapi_read(ip, offset_fsb, count_fsb, &irec, + &nimaps, XFS_BMAPI_COWFORK); + if (error) + goto out_cancel; + ASSERT(nimaps == 1); + + ASSERT(irec.br_startblock != DELAYSTARTBLOCK); + trace_xfs_reflink_cow_remap(ip, &irec); + + /* + * We can have a hole in the CoW fork if part of a directio + * write is CoW but part of it isn't. + */ + rlen = ilen = irec.br_blockcount; + if (irec.br_startblock == HOLESTARTBLOCK) + goto next_extent; + + /* Unmap the old blocks in the data fork. */ + while (rlen) { + xfs_defer_init(&dfops, &firstfsb); + error = __xfs_bunmapi(tp, ip, irec.br_startoff, + &rlen, 0, 1, &firstfsb, &dfops); + if (error) + goto out_defer; + + /* + * Trim the extent to whatever got unmapped. + * Remember, bunmapi works backwards. 
+ */ + uirec.br_startblock = irec.br_startblock + rlen; + uirec.br_startoff = irec.br_startoff + rlen; + uirec.br_blockcount = irec.br_blockcount - rlen; + irec.br_blockcount = rlen; + trace_xfs_reflink_cow_remap_piece(ip, &uirec); + + /* Free the CoW orphan record. */ + error = xfs_refcount_free_cow_extent(tp->t_mountp, + &dfops, uirec.br_startblock, + uirec.br_blockcount); + if (error) + goto out_defer; + + /* Map the new blocks into the data fork. */ + error = xfs_bmap_map_extent(tp->t_mountp, &dfops, + ip, &uirec); + if (error) + goto out_defer; + + /* Remove the mapping from the CoW fork. */ + error = xfs_bunmapi_cow(ip, &uirec); + if (error) + goto out_defer; + + error = xfs_defer_finish(&tp, &dfops, ip); + if (error) + goto out_defer; + } + +next_extent: + /* Roll on... */ + offset_fsb = irec.br_startoff + ilen; + } + + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) + goto out; + return 0; + +out_defer: + xfs_defer_cancel(&dfops); +out_cancel: + xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); +out: + trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_); + return error; +} + +/* + * Free leftover CoW reservations that didn't get cleaned out. + */ +int +xfs_reflink_recover_cow( + struct xfs_mount *mp) +{ + xfs_agnumber_t agno; + int error = 0; + + if (!xfs_sb_version_hasreflink(&mp->m_sb)) + return 0; + + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { + error = xfs_refcount_recover_cow_leftovers(mp, agno); + if (error) + break; + } + + return error; +} + +/* + * Reflinking (Block) Ranges of Two Files Together + * + * First, ensure that the reflink flag is set on both inodes. The flag is an + * optimization to avoid unnecessary refcount btree lookups in the write path. + * + * Now we can iteratively remap the range of extents (and holes) in src to the + * corresponding ranges in dest. Let drange and srange denote the ranges of + * logical blocks in dest and src touched by the reflink operation. 
+ * + * While the length of drange is greater than zero, + * - Read src's bmbt at the start of srange ("imap") + * - If imap doesn't exist, make imap appear to start at the end of srange + * with zero length. + * - If imap starts before srange, advance imap to start at srange. + * - If imap goes beyond srange, truncate imap to end at the end of srange. + * - Punch (imap start - srange start + imap len) blocks from dest at + * offset (drange start). + * - If imap points to a real range of pblks, + * > Increase the refcount of the imap's pblks + * > Map imap's pblks into dest at the offset + * (drange start + imap start - srange start) + * - Advance drange and srange by (imap start - srange start + imap len) + * + * Finally, if the reflink made dest longer, update both the in-core and + * on-disk file sizes. + * + * ASCII Art Demonstration: + * + * Let's say we want to reflink this source file: + * + * ----SSSSSSS-SSSSS----SSSSSS (src file) + * <--------------------> + * + * into this destination file: + * + * --DDDDDDDDDDDDDDDDDDD--DDD (dest file) + * <--------------------> + * '-' means a hole, and 'S' and 'D' are written blocks in the src and dest. + * Observe that the range has different logical offsets in either file. + * + * Consider that the first extent in the source file doesn't line up with our + * reflink range. Unmapping and remapping are separate operations, so we can + * unmap more blocks from the destination file than we remap. + * + * ----SSSSSSS-SSSSS----SSSSSS + * <-------> + * --DDDDD---------DDDDD--DDD + * <-------> + * + * Now remap the source extent into the destination file: + * + * ----SSSSSSS-SSSSS----SSSSSS + * <-------> + * --DDDDD--SSSSSSSDDDDD--DDD + * <-------> + * + * Do likewise with the second hole and extent in our range. Holes in the + * unmap range don't affect our operation. + * + * ----SSSSSSS-SSSSS----SSSSSS + * <----> + * --DDDDD--SSSSSSS-SSSSS-DDD + * <----> + * + * Finally, unmap and remap part of the third extent. 
This will increase the + * size of the destination file. + * + * ----SSSSSSS-SSSSS----SSSSSS + * <-----> + * --DDDDD--SSSSSSS-SSSSS----SSS + * <-----> + * + * Once we update the destination file's i_size, we're done. + */ + +/* + * Ensure the reflink bit is set in both inodes. + */ +STATIC int +xfs_reflink_set_inode_flag( + struct xfs_inode *src, + struct xfs_inode *dest) +{ + struct xfs_mount *mp = src->i_mount; + int error; + struct xfs_trans *tp; + + if (xfs_is_reflink_inode(src) && xfs_is_reflink_inode(dest)) + return 0; + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); + if (error) + goto out_error; + + /* Lock both files against IO */ + if (src->i_ino == dest->i_ino) + xfs_ilock(src, XFS_ILOCK_EXCL); + else + xfs_lock_two_inodes(src, dest, XFS_ILOCK_EXCL); + + if (!xfs_is_reflink_inode(src)) { + trace_xfs_reflink_set_inode_flag(src); + xfs_trans_ijoin(tp, src, XFS_ILOCK_EXCL); + src->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK; + xfs_trans_log_inode(tp, src, XFS_ILOG_CORE); + xfs_ifork_init_cow(src); + } else + xfs_iunlock(src, XFS_ILOCK_EXCL); + + if (src->i_ino == dest->i_ino) + goto commit_flags; + + if (!xfs_is_reflink_inode(dest)) { + trace_xfs_reflink_set_inode_flag(dest); + xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL); + dest->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK; + xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE); + xfs_ifork_init_cow(dest); + } else + xfs_iunlock(dest, XFS_ILOCK_EXCL); + +commit_flags: + error = xfs_trans_commit(tp); + if (error) + goto out_error; + return error; + +out_error: + trace_xfs_reflink_set_inode_flag_error(dest, error, _RET_IP_); + return error; +} + +/* + * Update destination inode size & cowextsize hint, if necessary. 
+ */ +STATIC int +xfs_reflink_update_dest( + struct xfs_inode *dest, + xfs_off_t newlen, + xfs_extlen_t cowextsize) +{ + struct xfs_mount *mp = dest->i_mount; + struct xfs_trans *tp; + int error; + + if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0) + return 0; + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); + if (error) + goto out_error; + + xfs_ilock(dest, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL); + + if (newlen > i_size_read(VFS_I(dest))) { + trace_xfs_reflink_update_inode_size(dest, newlen); + i_size_write(VFS_I(dest), newlen); + dest->i_d.di_size = newlen; + } + + if (cowextsize) { + dest->i_d.di_cowextsize = cowextsize; + dest->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; + } + + xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE); + + error = xfs_trans_commit(tp); + if (error) + goto out_error; + return error; + +out_error: + trace_xfs_reflink_update_inode_size_error(dest, error, _RET_IP_); + return error; +} + +/* + * Do we have enough reserve in this AG to handle a reflink? The refcount + * btree already reserved all the space it needs, but the rmap btree can grow + * infinitely, so we won't allow more reflinks when the AG is down to the + * btree reserves. + */ +static int +xfs_reflink_ag_has_free_space( + struct xfs_mount *mp, + xfs_agnumber_t agno) +{ + struct xfs_perag *pag; + int error = 0; + + if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) + return 0; + + pag = xfs_perag_get(mp, agno); + if (xfs_ag_resv_critical(pag, XFS_AG_RESV_AGFL) || + xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA)) + error = -ENOSPC; + xfs_perag_put(pag); + return error; +} + +/* + * Unmap a range of blocks from a file, then map other blocks into the hole. + * The range to unmap is (destoff : destoff + srcioff + irec->br_blockcount). + * The extent irec is mapped into dest at irec->br_startoff. 
+ */ +STATIC int +xfs_reflink_remap_extent( + struct xfs_inode *ip, + struct xfs_bmbt_irec *irec, + xfs_fileoff_t destoff, + xfs_off_t new_isize) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + xfs_fsblock_t firstfsb; + unsigned int resblks; + struct xfs_defer_ops dfops; + struct xfs_bmbt_irec uirec; + bool real_extent; + xfs_filblks_t rlen; + xfs_filblks_t unmap_len; + xfs_off_t newlen; + int error; + + unmap_len = irec->br_startoff + irec->br_blockcount - destoff; + trace_xfs_reflink_punch_range(ip, destoff, unmap_len); + + /* Only remap normal extents. */ + real_extent = (irec->br_startblock != HOLESTARTBLOCK && + irec->br_startblock != DELAYSTARTBLOCK && + !ISUNWRITTEN(irec)); + + /* No reflinking if we're low on space */ + if (real_extent) { + error = xfs_reflink_ag_has_free_space(mp, + XFS_FSB_TO_AGNO(mp, irec->br_startblock)); + if (error) + goto out; + } + + /* Start a rolling transaction to switch the mappings */ + resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); + if (error) + goto out; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + /* If we're not just clearing space, then do we have enough quota? */ + if (real_extent) { + error = xfs_trans_reserve_quota_nblks(tp, ip, + irec->br_blockcount, 0, XFS_QMOPT_RES_REGBLKS); + if (error) + goto out_cancel; + } + + trace_xfs_reflink_remap(ip, irec->br_startoff, + irec->br_blockcount, irec->br_startblock); + + /* Unmap the old blocks in the data fork. */ + rlen = unmap_len; + while (rlen) { + xfs_defer_init(&dfops, &firstfsb); + error = __xfs_bunmapi(tp, ip, destoff, &rlen, 0, 1, + &firstfsb, &dfops); + if (error) + goto out_defer; + + /* + * Trim the extent to whatever got unmapped. + * Remember, bunmapi works backwards. 
+ */ + uirec.br_startblock = irec->br_startblock + rlen; + uirec.br_startoff = irec->br_startoff + rlen; + uirec.br_blockcount = unmap_len - rlen; + unmap_len = rlen; + + /* If this isn't a real mapping, we're done. */ + if (!real_extent || uirec.br_blockcount == 0) + goto next_extent; + + trace_xfs_reflink_remap(ip, uirec.br_startoff, + uirec.br_blockcount, uirec.br_startblock); + + /* Update the refcount tree */ + error = xfs_refcount_increase_extent(mp, &dfops, &uirec); + if (error) + goto out_defer; + + /* Map the new blocks into the data fork. */ + error = xfs_bmap_map_extent(mp, &dfops, ip, &uirec); + if (error) + goto out_defer; + + /* Update quota accounting. */ + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, + uirec.br_blockcount); + + /* Update dest isize if needed. */ + newlen = XFS_FSB_TO_B(mp, + uirec.br_startoff + uirec.br_blockcount); + newlen = min_t(xfs_off_t, newlen, new_isize); + if (newlen > i_size_read(VFS_I(ip))) { + trace_xfs_reflink_update_inode_size(ip, newlen); + i_size_write(VFS_I(ip), newlen); + ip->i_d.di_size = newlen; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + } + +next_extent: + /* Process all the deferred stuff. */ + error = xfs_defer_finish(&tp, &dfops, ip); + if (error) + goto out_defer; + } + + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) + goto out; + return 0; + +out_defer: + xfs_defer_cancel(&dfops); +out_cancel: + xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); +out: + trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_); + return error; +} + +/* + * Iteratively remap one file's extents (and holes) to another's. 
+ */ +STATIC int +xfs_reflink_remap_blocks( + struct xfs_inode *src, + xfs_fileoff_t srcoff, + struct xfs_inode *dest, + xfs_fileoff_t destoff, + xfs_filblks_t len, + xfs_off_t new_isize) +{ + struct xfs_bmbt_irec imap; + int nimaps; + int error = 0; + xfs_filblks_t range_len; + + /* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */ + while (len) { + trace_xfs_reflink_remap_blocks_loop(src, srcoff, len, + dest, destoff); + /* Read extent from the source file */ + nimaps = 1; + xfs_ilock(src, XFS_ILOCK_EXCL); + error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0); + xfs_iunlock(src, XFS_ILOCK_EXCL); + if (error) + goto err; + ASSERT(nimaps == 1); + + trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_IO_OVERWRITE, + &imap); + + /* Translate imap into the destination file. */ + range_len = imap.br_startoff + imap.br_blockcount - srcoff; + imap.br_startoff += destoff - srcoff; + + /* Clear dest from destoff to the end of imap and map it in. */ + error = xfs_reflink_remap_extent(dest, &imap, destoff, + new_isize); + if (error) + goto err; + + if (fatal_signal_pending(current)) { + error = -EINTR; + goto err; + } + + /* Advance drange/srange */ + srcoff += range_len; + destoff += range_len; + len -= range_len; + } + + return 0; + +err: + trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_); + return error; +} + +/* + * Read a page's worth of file data into the page cache. Return the page + * locked. + */ +static struct page * +xfs_get_page( + struct inode *inode, + xfs_off_t offset) +{ + struct address_space *mapping; + struct page *page; + pgoff_t n; + + n = offset >> PAGE_SHIFT; + mapping = inode->i_mapping; + page = read_mapping_page(mapping, n, NULL); + if (IS_ERR(page)) + return page; + if (!PageUptodate(page)) { + put_page(page); + return ERR_PTR(-EIO); + } + lock_page(page); + return page; +} + +/* + * Compare extents of two files to see if they are the same. 
+ */ +static int +xfs_compare_extents( + struct inode *src, + xfs_off_t srcoff, + struct inode *dest, + xfs_off_t destoff, + xfs_off_t len, + bool *is_same) +{ + xfs_off_t src_poff; + xfs_off_t dest_poff; + void *src_addr; + void *dest_addr; + struct page *src_page; + struct page *dest_page; + xfs_off_t cmp_len; + bool same; + int error; + + error = -EINVAL; + same = true; + while (len) { + src_poff = srcoff & (PAGE_SIZE - 1); + dest_poff = destoff & (PAGE_SIZE - 1); + cmp_len = min(PAGE_SIZE - src_poff, + PAGE_SIZE - dest_poff); + cmp_len = min(cmp_len, len); + ASSERT(cmp_len > 0); + + trace_xfs_reflink_compare_extents(XFS_I(src), srcoff, cmp_len, + XFS_I(dest), destoff); + + src_page = xfs_get_page(src, srcoff); + if (IS_ERR(src_page)) { + error = PTR_ERR(src_page); + goto out_error; + } + dest_page = xfs_get_page(dest, destoff); + if (IS_ERR(dest_page)) { + error = PTR_ERR(dest_page); + unlock_page(src_page); + put_page(src_page); + goto out_error; + } + src_addr = kmap_atomic(src_page); + dest_addr = kmap_atomic(dest_page); + + flush_dcache_page(src_page); + flush_dcache_page(dest_page); + + if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len)) + same = false; + + kunmap_atomic(dest_addr); + kunmap_atomic(src_addr); + unlock_page(dest_page); + unlock_page(src_page); + put_page(dest_page); + put_page(src_page); + + if (!same) + break; + + srcoff += cmp_len; + destoff += cmp_len; + len -= cmp_len; + } + + *is_same = same; + return 0; + +out_error: + trace_xfs_reflink_compare_extents_error(XFS_I(dest), error, _RET_IP_); + return error; +} + +/* + * Link a range of blocks from one file to another. 
+ */
+int
+xfs_reflink_remap_range(
+	struct xfs_inode	*src,		/* file to share blocks from */
+	xfs_off_t		srcoff,		/* byte offset into src */
+	struct xfs_inode	*dest,		/* file to share blocks into */
+	xfs_off_t		destoff,	/* byte offset into dest */
+	xfs_off_t		len,		/* length of the range in bytes */
+	unsigned int		flags)		/* XFS_REFLINK_* flags */
+{
+	struct xfs_mount	*mp = src->i_mount;
+	xfs_fileoff_t		sfsbno, dfsbno;
+	xfs_filblks_t		fsblen;
+	int			error;
+	xfs_extlen_t		cowextsize;
+	bool			is_same;
+
+	/*
+	 * Early rejections, all before any lock is taken:
+	 * -EOPNOTSUPP without the reflink feature, -EIO after a forced
+	 * shutdown, -EINVAL for realtime inodes or unknown flag bits.
+	 */
+	if (!xfs_sb_version_hasreflink(&mp->m_sb))
+		return -EOPNOTSUPP;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	/* Don't reflink realtime inodes */
+	if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest))
+		return -EINVAL;
+
+	if (flags & ~XFS_REFLINK_ALL)
+		return -EINVAL;
+
+	trace_xfs_reflink_remap_range(src, srcoff, len, dest, destoff);
+
+	/* Lock both files against IO */
+	/* Same inode: lock it once.  IOLOCK is taken before MMAPLOCK. */
+	if (src->i_ino == dest->i_ino) {
+		xfs_ilock(src, XFS_IOLOCK_EXCL);
+		xfs_ilock(src, XFS_MMAPLOCK_EXCL);
+	} else {
+		xfs_lock_two_inodes(src, dest, XFS_IOLOCK_EXCL);
+		xfs_lock_two_inodes(src, dest, XFS_MMAPLOCK_EXCL);
+	}
+
+	/*
+	 * Check that the extents are the same.
+	 * A dedupe request whose contents differ fails with -EBADE.
+	 */
+	if (flags & XFS_REFLINK_DEDUPE) {
+		is_same = false;
+		error = xfs_compare_extents(VFS_I(src), srcoff, VFS_I(dest),
+				destoff, len, &is_same);
+		if (error)
+			goto out_error;
+		if (!is_same) {
+			error = -EBADE;
+			goto out_error;
+		}
+	}
+
+	error = xfs_reflink_set_inode_flag(src, dest);
+	if (error)
+		goto out_error;
+
+	/*
+	 * Invalidate the page cache so that we can clear any CoW mappings
+	 * in the destination file.
+	 */
+	truncate_inode_pages_range(&VFS_I(dest)->i_data, destoff,
+				   PAGE_ALIGN(destoff + len) - 1);
+
+	/*
+	 * Convert the byte range to file blocks: start offsets use
+	 * XFS_B_TO_FSBT, the length uses XFS_B_TO_FSB.  destoff + len is
+	 * handed down as the prospective new size of dest.
+	 */
+	dfsbno = XFS_B_TO_FSBT(mp, destoff);
+	sfsbno = XFS_B_TO_FSBT(mp, srcoff);
+	fsblen = XFS_B_TO_FSB(mp, len);
+	error = xfs_reflink_remap_blocks(src, sfsbno, dest, dfsbno, fsblen,
+			destoff + len);
+	if (error)
+		goto out_error;
+
+	/*
+	 * Carry the cowextsize hint from src to dest if we're sharing the
+	 * entire source file to the entire destination file, the source file
+	 * has a cowextsize hint, and the destination file does not.
+	 */
+	cowextsize = 0;
+	if (srcoff == 0 && len == i_size_read(VFS_I(src)) &&
+	    (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) &&
+	    destoff == 0 && len >= i_size_read(VFS_I(dest)) &&
+	    !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
+		cowextsize = src->i_d.di_cowextsize;
+
+	error = xfs_reflink_update_dest(dest, destoff + len, cowextsize);
+	if (error)
+		goto out_error;
+
+	/* Success falls through: the label below is the common unlock path. */
+out_error:
+	xfs_iunlock(src, XFS_MMAPLOCK_EXCL);
+	xfs_iunlock(src, XFS_IOLOCK_EXCL);
+	if (src->i_ino != dest->i_ino) {
+		xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
+		xfs_iunlock(dest, XFS_IOLOCK_EXCL);
+	}
+	if (error)
+		trace_xfs_reflink_remap_range_error(dest, error, _RET_IP_);
+	return error;
+}
+
+/*
+ * The user wants to preemptively CoW all shared blocks in this file,
+ * which enables us to turn off the reflink flag.  Iterate all
+ * extents which are not prealloc/delalloc to see which ranges are
+ * mentioned in the refcount tree, then read those blocks into the
+ * pagecache, dirty them, fsync them back out, and then we can update
+ * the inode flag.  What happens if we run out of memory? :)
+ *
+ * @fbno and @end are file block numbers bounding the scan ([fbno, end));
+ * @isize is a byte size used to clamp the range that gets dirtied.  The
+ * function drops and retakes XFS_ILOCK_EXCL around iomap_file_dirty();
+ * xfs_reflink_unshare() calls it with that lock held.
+ */
+STATIC int
+xfs_reflink_dirty_extents(
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		fbno,
+	xfs_filblks_t		end,
+	xfs_off_t		isize)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_agnumber_t		agno;
+	xfs_agblock_t		agbno;
+	xfs_extlen_t		aglen;
+	xfs_agblock_t		rbno;
+	xfs_extlen_t		rlen;
+	xfs_off_t		fpos;
+	xfs_off_t		flen;
+	struct xfs_bmbt_irec	map[2];
+	int			nmaps;
+	int			error = 0;
+
+	while (end - fbno > 0) {
+		nmaps = 1;
+		/*
+		 * Look for extents in the file.  Skip holes, delalloc, or
+		 * unwritten extents; they can't be reflinked.
+		 */
+		error = xfs_bmapi_read(ip, fbno, end - fbno, map, &nmaps, 0);
+		if (error)
+			goto out;
+		if (nmaps == 0)
+			break;
+		if (map[0].br_startblock == HOLESTARTBLOCK ||
+		    map[0].br_startblock == DELAYSTARTBLOCK ||
+		    ISUNWRITTEN(&map[0]))
+			goto next;
+
+		/*
+		 * map[0] keeps the full mapping so the outer loop can step
+		 * past it; map[1] is a working copy that shrinks as shared
+		 * regions inside it are processed.
+		 */
+		map[1] = map[0];
+		while (map[1].br_blockcount) {
+			agno = XFS_FSB_TO_AGNO(mp, map[1].br_startblock);
+			agbno = XFS_FSB_TO_AGBNO(mp, map[1].br_startblock);
+			aglen = map[1].br_blockcount;
+
+			error = xfs_reflink_find_shared(mp, agno, agbno, aglen,
+					&rbno, &rlen, true);
+			if (error)
+				goto out;
+			/* NULLAGBLOCK: nothing (more) shared in this mapping. */
+			if (rbno == NULLAGBLOCK)
+				break;
+
+			/* Dirty the pages */
+			xfs_iunlock(ip, XFS_ILOCK_EXCL);
+			fpos = XFS_FSB_TO_B(mp, map[1].br_startoff +
+					(rbno - agbno));
+			flen = XFS_FSB_TO_B(mp, rlen);
+			/* Do not dirty past the size the caller passed in. */
+			if (fpos + flen > isize)
+				flen = isize - fpos;
+			error = iomap_file_dirty(VFS_I(ip), fpos, flen,
+					&xfs_iomap_ops);
+			xfs_ilock(ip, XFS_ILOCK_EXCL);
+			if (error)
+				goto out;
+
+			/* Skip the unshared lead-in plus the shared region. */
+			map[1].br_blockcount -= (rbno - agbno + rlen);
+			map[1].br_startoff += (rbno - agbno + rlen);
+			map[1].br_startblock += (rbno - agbno + rlen);
+		}
+
+next:
+		fbno = map[0].br_startoff + map[0].br_blockcount;
+	}
+out:
+	return error;
+}
+
+/*
+ * Clear the inode reflink flag if there are no shared extents.
+ *
+ * Scans the mappings from file block 0 up to the block covering i_size.  If
+ * a shared block is found, returns 0 and leaves the flag alone.  Otherwise
+ * the CoW blocks are cancelled, XFS_DIFLAG2_REFLINK is cleared, and the inode
+ * is joined to *tpp and logged.  Returns 0 or a negative errno.
+ */
+int
+xfs_reflink_clear_inode_flag(
+	struct xfs_inode	*ip,
+	struct xfs_trans	**tpp)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_fileoff_t		fbno;
+	xfs_filblks_t		end;
+	xfs_agnumber_t		agno;
+	xfs_agblock_t		agbno;
+	xfs_extlen_t		aglen;
+	xfs_agblock_t		rbno;
+	xfs_extlen_t		rlen;
+	struct xfs_bmbt_irec	map;
+	int			nmaps;
+	int			error = 0;
+
+	ASSERT(xfs_is_reflink_inode(ip));
+
+	fbno = 0;
+	end = XFS_B_TO_FSB(mp, i_size_read(VFS_I(ip)));
+	while (end - fbno > 0) {
+		nmaps = 1;
+		/*
+		 * Look for extents in the file.  Skip holes, delalloc, or
+		 * unwritten extents; they can't be reflinked.
+		 */
+		error = xfs_bmapi_read(ip, fbno, end - fbno, &map, &nmaps, 0);
+		if (error)
+			return error;
+		if (nmaps == 0)
+			break;
+		if (map.br_startblock == HOLESTARTBLOCK ||
+		    map.br_startblock == DELAYSTARTBLOCK ||
+		    ISUNWRITTEN(&map))
+			goto next;
+
+		agno = XFS_FSB_TO_AGNO(mp, map.br_startblock);
+		agbno = XFS_FSB_TO_AGBNO(mp, map.br_startblock);
+		aglen = map.br_blockcount;
+
+		error = xfs_reflink_find_shared(mp, agno, agbno, aglen,
+				&rbno, &rlen, false);
+		if (error)
+			return error;
+		/* Is there still a shared block here? */
+		if (rbno != NULLAGBLOCK)
+			return 0;
+next:
+		fbno = map.br_startoff + map.br_blockcount;
+	}
+
+	/*
+	 * We didn't find any shared blocks so turn off the reflink flag.
+	 * First, get rid of any leftover CoW mappings.
+	 */
+	error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF);
+	if (error)
+		return error;
+
+	/* Clear the inode flag. */
+	trace_xfs_reflink_unset_inode_flag(ip);
+	ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
+	xfs_inode_clear_cowblocks_tag(ip);
+	xfs_trans_ijoin(*tpp, ip, 0);
+	xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
+
+	return error;
+}
+
+/*
+ * Clear the inode reflink flag if there are no shared extents and the size
+ * hasn't changed.
+ */
+STATIC int
+xfs_reflink_try_clear_inode_flag(
+	struct xfs_inode	*ip)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_trans	*tp;
+	int			error = 0;
+
+	/* Start a rolling transaction to remove the mappings */
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp);
+	if (error)
+		return error;
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, 0);
+
+	error = xfs_reflink_clear_inode_flag(ip, &tp);
+	if (error)
+		goto cancel;
+
+	/* A failed commit skips the cancel below and only drops the lock. */
+	error = xfs_trans_commit(tp);
+	if (error)
+		goto out;
+
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return 0;
+cancel:
+	xfs_trans_cancel(tp);
+out:
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return error;
+}
+
+/*
+ * Pre-COW all shared blocks within a given byte range of a file and turn off
+ * the reflink flag if we unshare all of the file's blocks.
+ *
+ * @offset and @len are in bytes.  Returns 0 immediately for an inode that
+ * does not have the reflink flag set; otherwise 0 on success or a negative
+ * errno, which is also reported through the unshare_error tracepoint.
+ */
+int
+xfs_reflink_unshare(
+	struct xfs_inode	*ip,
+	xfs_off_t		offset,
+	xfs_off_t		len)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_fileoff_t		fbno;
+	xfs_filblks_t		end;
+	xfs_off_t		isize;
+	int			error;
+
+	if (!xfs_is_reflink_inode(ip))
+		return 0;
+
+	trace_xfs_reflink_unshare(ip, offset, len);
+
+	/* Wait for outstanding direct I/O before scanning the mappings. */
+	inode_dio_wait(VFS_I(ip));
+
+	/* Try to CoW the selected ranges */
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	fbno = XFS_B_TO_FSBT(mp, offset);
+	isize = i_size_read(VFS_I(ip));
+	end = XFS_B_TO_FSB(mp, offset + len);
+	error = xfs_reflink_dirty_extents(ip, fbno, end, isize);
+	if (error)
+		goto out_unlock;
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+	/* Wait for the IO to finish */
+	error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
+	if (error)
+		goto out;
+
+	/* Turn off the reflink flag if possible. */
+	error = xfs_reflink_try_clear_inode_flag(ip);
+	if (error)
+		goto out;
+
+	return 0;
+
+out_unlock:
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+out:
+	trace_xfs_reflink_unshare_error(ip, error, _RET_IP_);
+	return error;
+}
+
+/*
+ * Does this inode have any real CoW reservations?
+ */ +bool +xfs_reflink_has_real_cow_blocks( + struct xfs_inode *ip) +{ + struct xfs_bmbt_irec irec; + struct xfs_ifork *ifp; + struct xfs_bmbt_rec_host *gotp; + xfs_extnum_t idx; + + if (!xfs_is_reflink_inode(ip)) + return false; + + /* Go find the old extent in the CoW fork. */ + ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + gotp = xfs_iext_bno_to_ext(ifp, 0, &idx); + while (gotp) { + xfs_bmbt_get_all(gotp, &irec); + + if (!isnullstartblock(irec.br_startblock)) + return true; + + /* Roll on... */ + idx++; + if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) + break; + gotp = xfs_iext_get_ext(ifp, idx); + } + + return false; +} diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h new file mode 100644 index 000000000000..5dc3c8ac12aa --- /dev/null +++ b/fs/xfs/xfs_reflink.h @@ -0,0 +1,58 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */
+#ifndef __XFS_REFLINK_H
+#define __XFS_REFLINK_H 1
+
+extern int xfs_reflink_find_shared(struct xfs_mount *mp, xfs_agnumber_t agno,
+		xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno,
+		xfs_extlen_t *flen, bool find_maximal);
+extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
+		struct xfs_bmbt_irec *irec, bool *shared, bool *trimmed);
+
+extern int xfs_reflink_reserve_cow_range(struct xfs_inode *ip,
+		xfs_off_t offset, xfs_off_t count);
+extern int xfs_reflink_allocate_cow_range(struct xfs_inode *ip,
+		xfs_off_t offset, xfs_off_t count);
+extern bool xfs_reflink_find_cow_mapping(struct xfs_inode *ip, xfs_off_t offset,
+		struct xfs_bmbt_irec *imap, bool *need_alloc);
+extern int xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip,
+		xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap);
+
+extern int xfs_reflink_cancel_cow_blocks(struct xfs_inode *ip,
+		struct xfs_trans **tpp, xfs_fileoff_t offset_fsb,
+		xfs_fileoff_t end_fsb);
+extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset,
+		xfs_off_t count);
+extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset,
+		xfs_off_t count);
+extern int xfs_reflink_recover_cow(struct xfs_mount *mp);
+/* Flag bits accepted by xfs_reflink_remap_range(); others yield -EINVAL. */
+#define XFS_REFLINK_DEDUPE	1	/* only reflink if contents match */
+#define XFS_REFLINK_ALL		(XFS_REFLINK_DEDUPE)
+/* Share a byte range of src into dest; offsets and len are in bytes. */
+extern int xfs_reflink_remap_range(struct xfs_inode *src, xfs_off_t srcoff,
+		struct xfs_inode *dest, xfs_off_t destoff, xfs_off_t len,
+		unsigned int flags);
+/* Drop the reflink inode flag when no shared block is found in the file. */
+extern int xfs_reflink_clear_inode_flag(struct xfs_inode *ip,
+		struct xfs_trans **tpp);
+/* CoW the shared blocks in a byte range, then try to drop the inode flag. */
+extern int xfs_reflink_unshare(struct xfs_inode *ip, xfs_off_t offset,
+		xfs_off_t len);
+
+/* True if the CoW fork holds a record whose start block is not null. */
+extern bool xfs_reflink_has_real_cow_blocks(struct xfs_inode *ip);
+
+#endif /* __XFS_REFLINK_H */
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 0432a459871c..73c827831551 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -441,8 +441,11 @@ xfs_rui_recover(
XFS_FSB_TO_DADDR(mp, rmap->me_startblock)); switch (rmap->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { case XFS_RMAP_EXTENT_MAP: + case XFS_RMAP_EXTENT_MAP_SHARED: case XFS_RMAP_EXTENT_UNMAP: + case XFS_RMAP_EXTENT_UNMAP_SHARED: case XFS_RMAP_EXTENT_CONVERT: + case XFS_RMAP_EXTENT_CONVERT_SHARED: case XFS_RMAP_EXTENT_ALLOC: case XFS_RMAP_EXTENT_FREE: op_ok = true; @@ -481,12 +484,21 @@ xfs_rui_recover( case XFS_RMAP_EXTENT_MAP: type = XFS_RMAP_MAP; break; + case XFS_RMAP_EXTENT_MAP_SHARED: + type = XFS_RMAP_MAP_SHARED; + break; case XFS_RMAP_EXTENT_UNMAP: type = XFS_RMAP_UNMAP; break; + case XFS_RMAP_EXTENT_UNMAP_SHARED: + type = XFS_RMAP_UNMAP_SHARED; + break; case XFS_RMAP_EXTENT_CONVERT: type = XFS_RMAP_CONVERT; break; + case XFS_RMAP_EXTENT_CONVERT_SHARED: + type = XFS_RMAP_CONVERT_SHARED; + break; case XFS_RMAP_EXTENT_ALLOC: type = XFS_RMAP_ALLOC; break; diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index 6e812fe0fd43..12d48cd8f8a4 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -62,6 +62,7 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) { "ibt2", XFSSTAT_END_IBT_V2 }, { "fibt2", XFSSTAT_END_FIBT_V2 }, { "rmapbt", XFSSTAT_END_RMAP_V2 }, + { "refcntbt", XFSSTAT_END_REFCOUNT }, /* we print both series of quota information together */ { "qm", XFSSTAT_END_QM }, }; diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h index 657865f51e78..79ad2e69fc33 100644 --- a/fs/xfs/xfs_stats.h +++ b/fs/xfs/xfs_stats.h @@ -213,7 +213,23 @@ struct xfsstats { __uint32_t xs_rmap_2_alloc; __uint32_t xs_rmap_2_free; __uint32_t xs_rmap_2_moves; -#define XFSSTAT_END_XQMSTAT (XFSSTAT_END_RMAP_V2+6) +#define XFSSTAT_END_REFCOUNT (XFSSTAT_END_RMAP_V2 + 15) + __uint32_t xs_refcbt_2_lookup; + __uint32_t xs_refcbt_2_compare; + __uint32_t xs_refcbt_2_insrec; + __uint32_t xs_refcbt_2_delrec; + __uint32_t xs_refcbt_2_newroot; + __uint32_t xs_refcbt_2_killroot; + __uint32_t xs_refcbt_2_increment; + __uint32_t xs_refcbt_2_decrement; + __uint32_t 
xs_refcbt_2_lshift; + __uint32_t xs_refcbt_2_rshift; + __uint32_t xs_refcbt_2_split; + __uint32_t xs_refcbt_2_join; + __uint32_t xs_refcbt_2_alloc; + __uint32_t xs_refcbt_2_free; + __uint32_t xs_refcbt_2_moves; +#define XFSSTAT_END_XQMSTAT (XFSSTAT_END_REFCOUNT + 6) __uint32_t xs_qm_dqreclaims; __uint32_t xs_qm_dqreclaim_misses; __uint32_t xs_qm_dquot_dups; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 2d092f9577ca..ade4691e3f74 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -47,6 +47,9 @@ #include "xfs_sysfs.h" #include "xfs_ondisk.h" #include "xfs_rmap_item.h" +#include "xfs_refcount_item.h" +#include "xfs_bmap_item.h" +#include "xfs_reflink.h" #include <linux/namei.h> #include <linux/init.h> @@ -936,6 +939,7 @@ xfs_fs_destroy_inode( struct inode *inode) { struct xfs_inode *ip = XFS_I(inode); + int error; trace_xfs_destroy_inode(ip); @@ -943,6 +947,14 @@ xfs_fs_destroy_inode( XFS_STATS_INC(ip->i_mount, vn_rele); XFS_STATS_INC(ip->i_mount, vn_remove); + if (xfs_is_reflink_inode(ip)) { + error = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF); + if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount)) + xfs_warn(ip->i_mount, +"Error %d while evicting CoW blocks for inode %llu.", + error, ip->i_ino); + } + xfs_inactive(ip); ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); @@ -1006,6 +1018,16 @@ xfs_fs_drop_inode( { struct xfs_inode *ip = XFS_I(inode); + /* + * If this unlinked inode is in the middle of recovery, don't + * drop the inode just yet; log recovery will take care of + * that. See the comment for this inode flag. + */ + if (ip->i_flags & XFS_IRECOVERY) { + ASSERT(ip->i_mount->m_log->l_flags & XLOG_RECOVERY_NEEDED); + return 0; + } + return generic_drop_inode(inode) || (ip->i_flags & XFS_IDONTCACHE); } @@ -1296,10 +1318,31 @@ xfs_fs_remount( xfs_restore_resvblks(mp); xfs_log_work_queue(mp); xfs_queue_eofblocks(mp); + + /* Recover any CoW blocks that never got remapped. 
*/ + error = xfs_reflink_recover_cow(mp); + if (error) { + xfs_err(mp, + "Error %d recovering leftover CoW allocations.", error); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return error; + } + + /* Create the per-AG metadata reservation pool .*/ + error = xfs_fs_reserve_ag_blocks(mp); + if (error && error != -ENOSPC) + return error; } /* rw -> ro */ if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) { + /* Free the per-AG metadata reservation pool. */ + error = xfs_fs_unreserve_ag_blocks(mp); + if (error) { + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return error; + } + /* * Before we sync the metadata, we need to free up the reserve * block pool so that the used block count in the superblock on @@ -1490,6 +1533,7 @@ xfs_fs_fill_super( atomic_set(&mp->m_active_trans, 0); INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker); + INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker); mp->m_kobj.kobject.kset = xfs_kset; mp->m_super = sb; @@ -1572,6 +1616,9 @@ xfs_fs_fill_super( "DAX unsupported by block device. Turning off DAX."); mp->m_flags &= ~XFS_MOUNT_DAX; } + if (xfs_sb_version_hasreflink(&mp->m_sb)) + xfs_alert(mp, + "DAX and reflink have not been tested together!"); } if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { @@ -1585,6 +1632,10 @@ xfs_fs_fill_super( "EXPERIMENTAL reverse mapping btree feature enabled. Use at your own risk!"); } + if (xfs_sb_version_hasreflink(&mp->m_sb)) + xfs_alert(mp, + "EXPERIMENTAL reflink feature enabled. 
Use at your own risk!"); + error = xfs_mountfs(mp); if (error) goto out_filestream_unmount; @@ -1788,8 +1839,38 @@ xfs_init_zones(void) if (!xfs_rui_zone) goto out_destroy_rud_zone; + xfs_cud_zone = kmem_zone_init(sizeof(struct xfs_cud_log_item), + "xfs_cud_item"); + if (!xfs_cud_zone) + goto out_destroy_rui_zone; + + xfs_cui_zone = kmem_zone_init( + xfs_cui_log_item_sizeof(XFS_CUI_MAX_FAST_EXTENTS), + "xfs_cui_item"); + if (!xfs_cui_zone) + goto out_destroy_cud_zone; + + xfs_bud_zone = kmem_zone_init(sizeof(struct xfs_bud_log_item), + "xfs_bud_item"); + if (!xfs_bud_zone) + goto out_destroy_cui_zone; + + xfs_bui_zone = kmem_zone_init( + xfs_bui_log_item_sizeof(XFS_BUI_MAX_FAST_EXTENTS), + "xfs_bui_item"); + if (!xfs_bui_zone) + goto out_destroy_bud_zone; + return 0; + out_destroy_bud_zone: + kmem_zone_destroy(xfs_bud_zone); + out_destroy_cui_zone: + kmem_zone_destroy(xfs_cui_zone); + out_destroy_cud_zone: + kmem_zone_destroy(xfs_cud_zone); + out_destroy_rui_zone: + kmem_zone_destroy(xfs_rui_zone); out_destroy_rud_zone: kmem_zone_destroy(xfs_rud_zone); out_destroy_icreate_zone: @@ -1832,6 +1913,10 @@ xfs_destroy_zones(void) * destroy caches. 
*/ rcu_barrier(); + kmem_zone_destroy(xfs_bui_zone); + kmem_zone_destroy(xfs_bud_zone); + kmem_zone_destroy(xfs_cui_zone); + kmem_zone_destroy(xfs_cud_zone); kmem_zone_destroy(xfs_rui_zone); kmem_zone_destroy(xfs_rud_zone); kmem_zone_destroy(xfs_icreate_zone); @@ -1885,6 +1970,8 @@ init_xfs_fs(void) xfs_extent_free_init_defer_op(); xfs_rmap_update_init_defer_op(); + xfs_refcount_update_init_defer_op(); + xfs_bmap_update_init_defer_op(); xfs_dir_startup(); diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index aed74d3f8da9..afe1f66aaa69 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -184,6 +184,15 @@ static struct ctl_table xfs_table[] = { .extra1 = &xfs_params.eofb_timer.min, .extra2 = &xfs_params.eofb_timer.max, }, + { + .procname = "speculative_cow_prealloc_lifetime", + .data = &xfs_params.cowb_timer.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.cowb_timer.min, + .extra2 = &xfs_params.cowb_timer.max, + }, /* please keep this the last entry */ #ifdef CONFIG_PROC_FS { diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index ffef45375754..984a3499cfe3 100644 --- a/fs/xfs/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h @@ -48,6 +48,7 @@ typedef struct xfs_param { xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */ xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. 
*/ xfs_sysctl_val_t eofb_timer; /* Interval between eofb scan wakeups */ + xfs_sysctl_val_t cowb_timer; /* Interval between cowb scan wakeups */ } xfs_param_t; /* diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 16093c7dacde..ad188d3a83f3 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -39,6 +39,7 @@ struct xfs_buf_log_format; struct xfs_inode_log_format; struct xfs_bmbt_irec; struct xfs_btree_cur; +struct xfs_refcount_irec; DECLARE_EVENT_CLASS(xfs_attr_list_class, TP_PROTO(struct xfs_attr_list_context *ctx), @@ -135,6 +136,8 @@ DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim); DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim); DEFINE_PERAG_REF_EVENT(xfs_perag_set_eofblocks); DEFINE_PERAG_REF_EVENT(xfs_perag_clear_eofblocks); +DEFINE_PERAG_REF_EVENT(xfs_perag_set_cowblocks); +DEFINE_PERAG_REF_EVENT(xfs_perag_clear_cowblocks); DECLARE_EVENT_CLASS(xfs_ag_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno), @@ -268,10 +271,10 @@ DECLARE_EVENT_CLASS(xfs_bmap_class, __field(unsigned long, caller_ip) ), TP_fast_assign( - struct xfs_ifork *ifp = (state & BMAP_ATTRFORK) ? 
- ip->i_afp : &ip->i_df; + struct xfs_ifork *ifp; struct xfs_bmbt_irec r; + ifp = xfs_iext_state_to_fork(ip, state); xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &r); __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; @@ -686,6 +689,9 @@ DEFINE_INODE_EVENT(xfs_dquot_dqdetach); DEFINE_INODE_EVENT(xfs_inode_set_eofblocks_tag); DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag); DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid); +DEFINE_INODE_EVENT(xfs_inode_set_cowblocks_tag); +DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag); +DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid); DEFINE_INODE_EVENT(xfs_filemap_fault); DEFINE_INODE_EVENT(xfs_filemap_pmd_fault); @@ -2581,10 +2587,20 @@ DEFINE_RMAPBT_EVENT(xfs_rmap_delete); DEFINE_AG_ERROR_EVENT(xfs_rmap_insert_error); DEFINE_AG_ERROR_EVENT(xfs_rmap_delete_error); DEFINE_AG_ERROR_EVENT(xfs_rmap_update_error); + +DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_candidate); +DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_query); +DEFINE_RMAPBT_EVENT(xfs_rmap_lookup_le_range_candidate); +DEFINE_RMAPBT_EVENT(xfs_rmap_lookup_le_range); DEFINE_RMAPBT_EVENT(xfs_rmap_lookup_le_range_result); DEFINE_RMAPBT_EVENT(xfs_rmap_find_right_neighbor_result); DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_result); +/* deferred bmbt updates */ +#define DEFINE_BMAP_DEFERRED_EVENT DEFINE_RMAP_DEFERRED_EVENT +DEFINE_BMAP_DEFERRED_EVENT(xfs_bmap_defer); +DEFINE_BMAP_DEFERRED_EVENT(xfs_bmap_deferred); + /* per-AG reservation */ DECLARE_EVENT_CLASS(xfs_ag_resv_class, TP_PROTO(struct xfs_perag *pag, enum xfs_ag_resv_type resv, @@ -2639,6 +2655,728 @@ DEFINE_AG_RESV_EVENT(xfs_ag_resv_needed); DEFINE_AG_ERROR_EVENT(xfs_ag_resv_free_error); DEFINE_AG_ERROR_EVENT(xfs_ag_resv_init_error); +/* refcount tracepoint classes */ + +/* reuse the discard trace class for agbno/aglen-based traces */ +#define DEFINE_AG_EXTENT_EVENT(name) DEFINE_DISCARD_EVENT(name) + +/* ag btree lookup tracepoint class */ +#define XFS_AG_BTREE_CMP_FORMAT_STR \ 
+ { XFS_LOOKUP_EQ, "eq" }, \ + { XFS_LOOKUP_LE, "le" }, \ + { XFS_LOOKUP_GE, "ge" } +DECLARE_EVENT_CLASS(xfs_ag_btree_lookup_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_lookup_t dir), + TP_ARGS(mp, agno, agbno, dir), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_lookup_t, dir) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->dir = dir; + ), + TP_printk("dev %d:%d agno %u agbno %u cmp %s(%d)\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __print_symbolic(__entry->dir, XFS_AG_BTREE_CMP_FORMAT_STR), + __entry->dir) +) + +#define DEFINE_AG_BTREE_LOOKUP_EVENT(name) \ +DEFINE_EVENT(xfs_ag_btree_lookup_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + xfs_agblock_t agbno, xfs_lookup_t dir), \ + TP_ARGS(mp, agno, agbno, dir)) + +/* single-rcext tracepoint class */ +DECLARE_EVENT_CLASS(xfs_refcount_extent_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + struct xfs_refcount_irec *irec), + TP_ARGS(mp, agno, irec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, startblock) + __field(xfs_extlen_t, blockcount) + __field(xfs_nlink_t, refcount) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->startblock = irec->rc_startblock; + __entry->blockcount = irec->rc_blockcount; + __entry->refcount = irec->rc_refcount; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->startblock, + __entry->blockcount, + __entry->refcount) +) + +#define DEFINE_REFCOUNT_EXTENT_EVENT(name) \ +DEFINE_EVENT(xfs_refcount_extent_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + struct xfs_refcount_irec *irec), \ + TP_ARGS(mp, agno, irec)) + +/* 
single-rcext and an agbno tracepoint class */ +DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + struct xfs_refcount_irec *irec, xfs_agblock_t agbno), + TP_ARGS(mp, agno, irec, agbno), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, startblock) + __field(xfs_extlen_t, blockcount) + __field(xfs_nlink_t, refcount) + __field(xfs_agblock_t, agbno) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->startblock = irec->rc_startblock; + __entry->blockcount = irec->rc_blockcount; + __entry->refcount = irec->rc_refcount; + __entry->agbno = agbno; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u @ agbno %u\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->startblock, + __entry->blockcount, + __entry->refcount, + __entry->agbno) +) + +#define DEFINE_REFCOUNT_EXTENT_AT_EVENT(name) \ +DEFINE_EVENT(xfs_refcount_extent_at_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + struct xfs_refcount_irec *irec, xfs_agblock_t agbno), \ + TP_ARGS(mp, agno, irec, agbno)) + +/* double-rcext tracepoint class */ +DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2), + TP_ARGS(mp, agno, i1, i2), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, i1_startblock) + __field(xfs_extlen_t, i1_blockcount) + __field(xfs_nlink_t, i1_refcount) + __field(xfs_agblock_t, i2_startblock) + __field(xfs_extlen_t, i2_blockcount) + __field(xfs_nlink_t, i2_refcount) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->i1_startblock = i1->rc_startblock; + __entry->i1_blockcount = i1->rc_blockcount; + __entry->i1_refcount = i1->rc_refcount; + __entry->i2_startblock = i2->rc_startblock; + 
__entry->i2_blockcount = i2->rc_blockcount; + __entry->i2_refcount = i2->rc_refcount; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- " + "agbno %u len %u refcount %u\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->i1_startblock, + __entry->i1_blockcount, + __entry->i1_refcount, + __entry->i2_startblock, + __entry->i2_blockcount, + __entry->i2_refcount) +) + +#define DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(name) \ +DEFINE_EVENT(xfs_refcount_double_extent_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2), \ + TP_ARGS(mp, agno, i1, i2)) + +/* double-rcext and an agbno tracepoint class */ +DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2, + xfs_agblock_t agbno), + TP_ARGS(mp, agno, i1, i2, agbno), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, i1_startblock) + __field(xfs_extlen_t, i1_blockcount) + __field(xfs_nlink_t, i1_refcount) + __field(xfs_agblock_t, i2_startblock) + __field(xfs_extlen_t, i2_blockcount) + __field(xfs_nlink_t, i2_refcount) + __field(xfs_agblock_t, agbno) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->i1_startblock = i1->rc_startblock; + __entry->i1_blockcount = i1->rc_blockcount; + __entry->i1_refcount = i1->rc_refcount; + __entry->i2_startblock = i2->rc_startblock; + __entry->i2_blockcount = i2->rc_blockcount; + __entry->i2_refcount = i2->rc_refcount; + __entry->agbno = agbno; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- " + "agbno %u len %u refcount %u @ agbno %u\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->i1_startblock, + __entry->i1_blockcount, + __entry->i1_refcount, + __entry->i2_startblock, + __entry->i2_blockcount, + __entry->i2_refcount, + 
__entry->agbno) +) + +#define DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(name) \ +DEFINE_EVENT(xfs_refcount_double_extent_at_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2, \ + xfs_agblock_t agbno), \ + TP_ARGS(mp, agno, i1, i2, agbno)) + +/* triple-rcext tracepoint class */ +DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2, + struct xfs_refcount_irec *i3), + TP_ARGS(mp, agno, i1, i2, i3), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, i1_startblock) + __field(xfs_extlen_t, i1_blockcount) + __field(xfs_nlink_t, i1_refcount) + __field(xfs_agblock_t, i2_startblock) + __field(xfs_extlen_t, i2_blockcount) + __field(xfs_nlink_t, i2_refcount) + __field(xfs_agblock_t, i3_startblock) + __field(xfs_extlen_t, i3_blockcount) + __field(xfs_nlink_t, i3_refcount) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->i1_startblock = i1->rc_startblock; + __entry->i1_blockcount = i1->rc_blockcount; + __entry->i1_refcount = i1->rc_refcount; + __entry->i2_startblock = i2->rc_startblock; + __entry->i2_blockcount = i2->rc_blockcount; + __entry->i2_refcount = i2->rc_refcount; + __entry->i3_startblock = i3->rc_startblock; + __entry->i3_blockcount = i3->rc_blockcount; + __entry->i3_refcount = i3->rc_refcount; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- " + "agbno %u len %u refcount %u -- " + "agbno %u len %u refcount %u\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->i1_startblock, + __entry->i1_blockcount, + __entry->i1_refcount, + __entry->i2_startblock, + __entry->i2_blockcount, + __entry->i2_refcount, + __entry->i3_startblock, + __entry->i3_blockcount, + __entry->i3_refcount) +); + +#define DEFINE_REFCOUNT_TRIPLE_EXTENT_EVENT(name) \ 
+DEFINE_EVENT(xfs_refcount_triple_extent_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2, \ + struct xfs_refcount_irec *i3), \ + TP_ARGS(mp, agno, i1, i2, i3)) + +/* refcount btree tracepoints */ +DEFINE_BUSY_EVENT(xfs_refcountbt_alloc_block); +DEFINE_BUSY_EVENT(xfs_refcountbt_free_block); +DEFINE_AG_BTREE_LOOKUP_EVENT(xfs_refcount_lookup); +DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_get); +DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_update); +DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_insert); +DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_delete); +DEFINE_AG_ERROR_EVENT(xfs_refcount_insert_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_delete_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_update_error); + +/* refcount adjustment tracepoints */ +DEFINE_AG_EXTENT_EVENT(xfs_refcount_increase); +DEFINE_AG_EXTENT_EVENT(xfs_refcount_decrease); +DEFINE_AG_EXTENT_EVENT(xfs_refcount_cow_increase); +DEFINE_AG_EXTENT_EVENT(xfs_refcount_cow_decrease); +DEFINE_REFCOUNT_TRIPLE_EXTENT_EVENT(xfs_refcount_merge_center_extents); +DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_modify_extent); +DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_recover_extent); +DEFINE_REFCOUNT_EXTENT_AT_EVENT(xfs_refcount_split_extent); +DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(xfs_refcount_merge_left_extent); +DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(xfs_refcount_merge_right_extent); +DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(xfs_refcount_find_left_extent); +DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(xfs_refcount_find_right_extent); +DEFINE_AG_ERROR_EVENT(xfs_refcount_adjust_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_adjust_cow_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_merge_center_extents_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_modify_extent_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_split_extent_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_merge_left_extent_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_merge_right_extent_error); 
+DEFINE_AG_ERROR_EVENT(xfs_refcount_find_left_extent_error);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_find_right_extent_error);
+
+/* reflink helpers */
+DEFINE_AG_EXTENT_EVENT(xfs_refcount_find_shared);
+DEFINE_AG_EXTENT_EVENT(xfs_refcount_find_shared_result);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_find_shared_error);
+#define DEFINE_REFCOUNT_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT
+DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_defer);
+DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_deferred);
+
+/*
+ * Records the operation type and the (agbno, len) extent together with
+ * the (new_agbno, new_len) extent passed by the caller.
+ */
+TRACE_EVENT(xfs_refcount_finish_one_leftover,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+		 int type, xfs_agblock_t agbno, xfs_extlen_t len,
+		 xfs_agblock_t new_agbno, xfs_extlen_t new_len),
+	TP_ARGS(mp, agno, type, agbno, len, new_agbno, new_len),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(int, type)
+		__field(xfs_agblock_t, agbno)
+		__field(xfs_extlen_t, len)
+		__field(xfs_agblock_t, new_agbno)
+		__field(xfs_extlen_t, new_len)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->type = type;
+		__entry->agbno = agbno;
+		__entry->len = len;
+		__entry->new_agbno = new_agbno;
+		__entry->new_len = new_len;
+	),
+	TP_printk("dev %d:%d type %d agno %u agbno %u len %u new_agbno %u new_len %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->type,
+		  __entry->agno,
+		  __entry->agbno,
+		  __entry->len,
+		  __entry->new_agbno,
+		  __entry->new_len)
+);
+
+/*
+ * simple inode-based error/%ip tracepoint class
+ *
+ * The inode number is printed as 0x-prefixed hex, like the other inode
+ * tracepoints, so the value cannot be mistaken for decimal.
+ */
+DECLARE_EVENT_CLASS(xfs_inode_error_class,
+	TP_PROTO(struct xfs_inode *ip, int error, unsigned long caller_ip),
+	TP_ARGS(ip, error, caller_ip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(int, error)
+		__field(unsigned long, caller_ip)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->error = error;
+		__entry->caller_ip = caller_ip;
+	),
+	TP_printk("dev %d:%d ino 0x%llx error %d caller %ps",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->error,
+		  (char *)__entry->caller_ip)
+);
+
+#define DEFINE_INODE_ERROR_EVENT(name) \
+DEFINE_EVENT(xfs_inode_error_class, name, \
+	TP_PROTO(struct xfs_inode *ip, int error, \
+		 unsigned long caller_ip), \
+	TP_ARGS(ip, error, caller_ip))
+
+/* reflink allocator; the length is hex and therefore 0x-prefixed */
+TRACE_EVENT(xfs_bmap_remap_alloc,
+	TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t fsbno,
+		 xfs_extlen_t len),
+	TP_ARGS(ip, fsbno, len),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_fsblock_t, fsbno)
+		__field(xfs_extlen_t, len)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->fsbno = fsbno;
+		__entry->len = len;
+	),
+	TP_printk("dev %d:%d ino 0x%llx fsbno 0x%llx len 0x%x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->fsbno,
+		  __entry->len)
+);
+DEFINE_INODE_ERROR_EVENT(xfs_bmap_remap_alloc_error);
+
+/* reflink tracepoint classes */
+
+/* two-file io tracepoint class */
+DECLARE_EVENT_CLASS(xfs_double_io_class,
+	TP_PROTO(struct xfs_inode *src, xfs_off_t soffset, xfs_off_t len,
+		 struct xfs_inode *dest, xfs_off_t doffset),
+	TP_ARGS(src, soffset, len, dest, doffset),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, src_ino)
+		__field(loff_t, src_isize)
+		__field(loff_t, src_disize)
+		__field(loff_t, src_offset)
+		__field(size_t, len)
+		__field(xfs_ino_t, dest_ino)
+		__field(loff_t, dest_isize)
+		__field(loff_t, dest_disize)
+		__field(loff_t, dest_offset)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(src)->i_sb->s_dev;
+		__entry->src_ino = src->i_ino;
+		__entry->src_isize = VFS_I(src)->i_size;
+		__entry->src_disize = src->i_d.di_size;
+		__entry->src_offset = soffset;
+		__entry->len = len;
+		__entry->dest_ino = dest->i_ino;
+		__entry->dest_isize = VFS_I(dest)->i_size;
+		__entry->dest_disize = dest->i_d.di_size;
+		__entry->dest_offset = doffset;
+	),
+	TP_printk("dev %d:%d count %zd "
+		  "ino 0x%llx isize 0x%llx disize 0x%llx offset 0x%llx -> "
+		  "ino 0x%llx isize 0x%llx disize 0x%llx offset 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->len,
+		  __entry->src_ino,
+		  __entry->src_isize,
+		  __entry->src_disize,
+		  __entry->src_offset,
+		  __entry->dest_ino,
+		  __entry->dest_isize,
+		  __entry->dest_disize,
+		  __entry->dest_offset)
+)
+
+#define DEFINE_DOUBLE_IO_EVENT(name)	\
+DEFINE_EVENT(xfs_double_io_class, name,	\
+	TP_PROTO(struct xfs_inode *src, xfs_off_t soffset, xfs_off_t len, \
+		 struct xfs_inode *dest, xfs_off_t doffset), \
+	TP_ARGS(src, soffset, len, dest, doffset))
+
+/* two-file vfs io tracepoint class */
+DECLARE_EVENT_CLASS(xfs_double_vfs_io_class,
+	TP_PROTO(struct inode *src, u64 soffset, u64 len,
+		 struct inode *dest, u64 doffset),
+	TP_ARGS(src, soffset, len, dest, doffset),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(unsigned long, src_ino)
+		__field(loff_t, src_isize)
+		__field(loff_t, src_offset)
+		__field(size_t, len)
+		__field(unsigned long, dest_ino)
+		__field(loff_t, dest_isize)
+		__field(loff_t, dest_offset)
+	),
+	TP_fast_assign(
+		__entry->dev = src->i_sb->s_dev;
+		__entry->src_ino = src->i_ino;
+		__entry->src_isize = i_size_read(src);
+		__entry->src_offset = soffset;
+		__entry->len = len;
+		__entry->dest_ino = dest->i_ino;
+		__entry->dest_isize = i_size_read(dest);
+		__entry->dest_offset = doffset;
+	),
+	TP_printk("dev %d:%d count %zd "
+		  "ino 0x%lx isize 0x%llx offset 0x%llx -> "
+		  "ino 0x%lx isize 0x%llx offset 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->len,
+		  __entry->src_ino,
+		  __entry->src_isize,
+		  __entry->src_offset,
+		  __entry->dest_ino,
+		  __entry->dest_isize,
+		  __entry->dest_offset)
+)
+
+#define DEFINE_DOUBLE_VFS_IO_EVENT(name)	\
+DEFINE_EVENT(xfs_double_vfs_io_class, name,	\
+	TP_PROTO(struct inode *src, u64 soffset, u64 len, \
+		 struct inode *dest, u64 doffset), \
+	TP_ARGS(src, soffset, len, dest, doffset))
+
+/* CoW write tracepoint */
+DECLARE_EVENT_CLASS(xfs_copy_on_write_class,
+	TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, xfs_fsblock_t pblk,
+		 xfs_extlen_t len, xfs_fsblock_t new_pblk),
+	TP_ARGS(ip, lblk, pblk, len, new_pblk),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_fileoff_t, lblk)
+		__field(xfs_fsblock_t, pblk)
+		__field(xfs_extlen_t, len)
+		__field(xfs_fsblock_t, new_pblk)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->lblk = lblk;
+		__entry->pblk = pblk;
+		__entry->len = len;
+		__entry->new_pblk = new_pblk;
+	),
+	TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx pblk 0x%llx "
+		  "len 0x%x new_pblk %llu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->lblk,
+		  __entry->pblk,
+		  __entry->len,
+		  __entry->new_pblk)
+)
+
+#define DEFINE_COW_EVENT(name)	\
+DEFINE_EVENT(xfs_copy_on_write_class, name,	\
+	TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, xfs_fsblock_t pblk, \
+		 xfs_extlen_t len, xfs_fsblock_t new_pblk), \
+	TP_ARGS(ip, lblk, pblk, len, new_pblk))
+
+/* inode/irec events */
+DECLARE_EVENT_CLASS(xfs_inode_irec_class,
+	TP_PROTO(struct xfs_inode *ip, struct xfs_bmbt_irec *irec),
+	TP_ARGS(ip, irec),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_fileoff_t, lblk)
+		__field(xfs_extlen_t, len)
+		__field(xfs_fsblock_t, pblk)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->lblk = irec->br_startoff;
+		__entry->len = irec->br_blockcount;
+		__entry->pblk = irec->br_startblock;
+	),
+	TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x pblk %llu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->lblk,
+		  __entry->len,
+		  __entry->pblk)
+);
+#define DEFINE_INODE_IREC_EVENT(name) \
+DEFINE_EVENT(xfs_inode_irec_class, name, \
+	TP_PROTO(struct xfs_inode *ip, struct xfs_bmbt_irec *irec), \
+	TP_ARGS(ip, irec))
+
+/* refcount/reflink tracepoint definitions */
+
+/* reflink tracepoints */
+DEFINE_INODE_EVENT(xfs_reflink_set_inode_flag);
+DEFINE_INODE_EVENT(xfs_reflink_unset_inode_flag);
+DEFINE_ITRUNC_EVENT(xfs_reflink_update_inode_size);
+DEFINE_IOMAP_EVENT(xfs_reflink_remap_imap);
+/* Source/destination inode and file block offsets plus a block count. */
+TRACE_EVENT(xfs_reflink_remap_blocks_loop,
+	TP_PROTO(struct xfs_inode *src, xfs_fileoff_t soffset,
+		 xfs_filblks_t len, struct xfs_inode *dest,
+		 xfs_fileoff_t doffset),
+	TP_ARGS(src, soffset, len, dest, doffset),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, src_ino)
+		__field(xfs_fileoff_t, src_lblk)
+		__field(xfs_filblks_t, len)
+		__field(xfs_ino_t, dest_ino)
+		__field(xfs_fileoff_t, dest_lblk)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(src)->i_sb->s_dev;
+		__entry->src_ino = src->i_ino;
+		__entry->src_lblk = soffset;
+		__entry->len = len;
+		__entry->dest_ino = dest->i_ino;
+		__entry->dest_lblk = doffset;
+	),
+	TP_printk("dev %d:%d len 0x%llx "
+		  "ino 0x%llx offset 0x%llx blocks -> "
+		  "ino 0x%llx offset 0x%llx blocks",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->len,
+		  __entry->src_ino,
+		  __entry->src_lblk,
+		  __entry->dest_ino,
+		  __entry->dest_lblk)
+);
+/* Inode, starting file block and length of a range. */
+TRACE_EVENT(xfs_reflink_punch_range,
+	TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk,
+		 xfs_extlen_t len),
+	TP_ARGS(ip, lblk, len),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_fileoff_t, lblk)
+		__field(xfs_extlen_t, len)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->lblk = lblk;
+		__entry->len = len;
+	),
+	TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->lblk,
+		  __entry->len)
+);
+/* Inode, starting file block, length and the new physical block. */
+TRACE_EVENT(xfs_reflink_remap,
+	TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk,
+		 xfs_extlen_t len, xfs_fsblock_t new_pblk),
+	TP_ARGS(ip, lblk, len, new_pblk),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_fileoff_t, lblk)
+		__field(xfs_extlen_t, len)
+		__field(xfs_fsblock_t, new_pblk)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->lblk = lblk;
+		__entry->len = len;
+		__entry->new_pblk = new_pblk;
+	),
+	TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x new_pblk %llu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->lblk,
+		  __entry->len,
+		  __entry->new_pblk)
+);
+DEFINE_DOUBLE_IO_EVENT(xfs_reflink_remap_range);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_range_error);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_set_inode_flag_error);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_update_inode_size_error);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_reflink_main_loop_error);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_read_iomap_error);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_blocks_error);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_extent_error);
+
+/* dedupe tracepoints */
+DEFINE_DOUBLE_IO_EVENT(xfs_reflink_compare_extents);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_compare_extents_error);
+
+/* ioctl tracepoints */
+DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_reflink);
+DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_clone_range);
+DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_file_extent_same);
+/*
+ * Inode numbers and sizes (read with i_size_read) of both VFS inodes.
+ * No trailing newline in the format string: the trace output code ends
+ * each event line itself, so "\n" here would produce a blank line.
+ */
+TRACE_EVENT(xfs_ioctl_clone,
+	TP_PROTO(struct inode *src, struct inode *dest),
+	TP_ARGS(src, dest),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(unsigned long, src_ino)
+		__field(loff_t, src_isize)
+		__field(unsigned long, dest_ino)
+		__field(loff_t, dest_isize)
+	),
+	TP_fast_assign(
+		__entry->dev = src->i_sb->s_dev;
+		__entry->src_ino = src->i_ino;
+		__entry->src_isize = i_size_read(src);
+		__entry->dest_ino = dest->i_ino;
+		__entry->dest_isize = i_size_read(dest);
+	),
+	TP_printk("dev %d:%d "
+		  "ino 0x%lx isize 0x%llx -> "
+		  "ino 0x%lx isize 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->src_ino,
+		  __entry->src_isize,
+		  __entry->dest_ino,
+		  __entry->dest_isize)
+);
+
+/* unshare tracepoints */
+DEFINE_SIMPLE_IO_EVENT(xfs_reflink_unshare);
+DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cow_eof_block);
+DEFINE_PAGE_EVENT(xfs_reflink_unshare_page); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_cow_eof_block_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_dirty_page_error); + +/* copy on write */ +DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared); +DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc); +DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found); +DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc); + +DEFINE_RW_EVENT(xfs_reflink_reserve_cow_range); +DEFINE_RW_EVENT(xfs_reflink_allocate_cow_range); + +DEFINE_INODE_IREC_EVENT(xfs_reflink_bounce_dio_write); +DEFINE_IOMAP_EVENT(xfs_reflink_find_cow_mapping); +DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_irec); + +DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range); +DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow); +DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap); +DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_piece); + +DEFINE_INODE_ERROR_EVENT(xfs_reflink_reserve_cow_range_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_allocate_cow_range_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error); + +DEFINE_COW_EVENT(xfs_reflink_fork_buf); +DEFINE_COW_EVENT(xfs_reflink_finish_fork_buf); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_fork_buf_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_finish_fork_buf_error); + +DEFINE_INODE_EVENT(xfs_reflink_cancel_pending_cow); +DEFINE_INODE_IREC_EVENT(xfs_reflink_cancel_cow); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_pending_cow_error); + +/* rmap swapext tracepoints */ +DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap); +DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap_piece); +DEFINE_INODE_ERROR_EVENT(xfs_swap_extent_rmap_error); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index e2bf86aad33d..61b7fbdd3ebd 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -36,6 +36,11 @@ struct xfs_busy_extent; struct 
xfs_rud_log_item; struct xfs_rui_log_item; struct xfs_btree_cur; +struct xfs_cui_log_item; +struct xfs_cud_log_item; +struct xfs_defer_ops; +struct xfs_bui_log_item; +struct xfs_bud_log_item; typedef struct xfs_log_item { struct list_head li_ail; /* AIL pointers */ @@ -248,4 +253,28 @@ int xfs_trans_log_finish_rmap_update(struct xfs_trans *tp, xfs_fsblock_t startblock, xfs_filblks_t blockcount, xfs_exntst_t state, struct xfs_btree_cur **pcur); +/* refcount updates */ +enum xfs_refcount_intent_type; + +void xfs_refcount_update_init_defer_op(void); +struct xfs_cud_log_item *xfs_trans_get_cud(struct xfs_trans *tp, + struct xfs_cui_log_item *cuip); +int xfs_trans_log_finish_refcount_update(struct xfs_trans *tp, + struct xfs_cud_log_item *cudp, struct xfs_defer_ops *dfops, + enum xfs_refcount_intent_type type, xfs_fsblock_t startblock, + xfs_extlen_t blockcount, xfs_fsblock_t *new_fsb, + xfs_extlen_t *new_len, struct xfs_btree_cur **pcur); + +/* mapping updates */ +enum xfs_bmap_intent_type; + +void xfs_bmap_update_init_defer_op(void); +struct xfs_bud_log_item *xfs_trans_get_bud(struct xfs_trans *tp, + struct xfs_bui_log_item *buip); +int xfs_trans_log_finish_bmap_update(struct xfs_trans *tp, + struct xfs_bud_log_item *rudp, struct xfs_defer_ops *dfops, + enum xfs_bmap_intent_type type, struct xfs_inode *ip, + int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, + xfs_filblks_t blockcount, xfs_exntst_t state); + #endif /* __XFS_TRANS_H__ */ diff --git a/fs/xfs/xfs_trans_bmap.c b/fs/xfs/xfs_trans_bmap.c new file mode 100644 index 000000000000..6408e7d7c08c --- /dev/null +++ b/fs/xfs/xfs_trans_bmap.c @@ -0,0 +1,249 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. 
Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_bmap_item.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_inode.h"
+
+/*
+ * Build the "bmap update done" (BUD) log item for the given BUI with
+ * xfs_bud_init(), attach it to the transaction and hand it back.
+ */
+struct xfs_bud_log_item *
+xfs_trans_get_bud(
+	struct xfs_trans		*tp,
+	struct xfs_bui_log_item		*buip)
+{
+	struct xfs_bud_log_item		*done;
+
+	done = xfs_bud_init(tp->t_mountp, buip);
+
+	/* The transaction tracks the new item from here on. */
+	xfs_trans_add_item(tp, &done->bud_item);
+	return done;
+}
+
+/*
+ * Finish a bmap update and log it to the BUD. Note that the
+ * transaction is marked dirty regardless of whether the bmap update
+ * succeeds or fails to support the BUI/BUD lifecycle rules.
+ */
+int
+xfs_trans_log_finish_bmap_update(
+	struct xfs_trans		*tp,
+	struct xfs_bud_log_item		*budp,
+	struct xfs_defer_ops		*dop,
+	enum xfs_bmap_intent_type	type,
+	struct xfs_inode		*ip,
+	int				whichfork,
+	xfs_fileoff_t			startoff,
+	xfs_fsblock_t			startblock,
+	xfs_filblks_t			blockcount,
+	xfs_exntst_t			state)
+{
+	int				error;
+
+	error = xfs_bmap_finish_one(tp, dop, ip, type, whichfork, startoff,
+			startblock, blockcount, state);
+
+	/*
+	 * Mark the transaction dirty, even on error. This ensures the
+	 * transaction is aborted, which:
+	 *
+	 * 1.) releases the BUI and frees the BUD
+	 * 2.) shuts down the filesystem
+	 */
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	budp->bud_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+
+	return error;
+}
+
+/*
+ * Sort bmap intents by the inode number of their owner.
+ *
+ * Returns a negative value, zero or a positive value when @a sorts
+ * before, equal to or after @b.  The inode numbers are compared
+ * explicitly rather than subtracted: i_ino is a 64-bit xfs_ino_t, and
+ * squeezing the difference into the int return value can flip the sign
+ * or report two different inodes as equal.
+ */
+static int
+xfs_bmap_update_diff_items(
+	void				*priv,
+	struct list_head		*a,
+	struct list_head		*b)
+{
+	struct xfs_bmap_intent		*ba;
+	struct xfs_bmap_intent		*bb;
+
+	ba = container_of(a, struct xfs_bmap_intent, bi_list);
+	bb = container_of(b, struct xfs_bmap_intent, bi_list);
+	if (ba->bi_owner->i_ino < bb->bi_owner->i_ino)
+		return -1;
+	return ba->bi_owner->i_ino > bb->bi_owner->i_ino;
+}
+
+/*
+ * Get a BUI: initialise a new bmap update intent item for the mount and
+ * attach it to the transaction.  @count must be
+ * XFS_BUI_MAX_FAST_EXTENTS (asserted).
+ */
+STATIC void *
+xfs_bmap_update_create_intent(
+	struct xfs_trans		*tp,
+	unsigned int			count)
+{
+	struct xfs_bui_log_item		*buip;
+
+	ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS);
+	ASSERT(tp != NULL);
+
+	buip = xfs_bui_init(tp->t_mountp);
+	ASSERT(buip != NULL);
+
+	/*
+	 * Get a log_item_desc to point at the new item.
+	 */
+	xfs_trans_add_item(tp, &buip->bui_item);
+	return buip;
+}
+
+/* Set the map extent flags for this mapping.
*/
+static void
+xfs_trans_set_bmap_flags(
+	struct xfs_map_extent		*bmap,
+	enum xfs_bmap_intent_type	type,
+	int				whichfork,
+	xfs_exntst_t			state)
+{
+	/* Map and unmap are the only intent types expected here. */
+	if (type == XFS_BMAP_MAP || type == XFS_BMAP_UNMAP) {
+		bmap->me_flags = type;
+	} else {
+		bmap->me_flags = 0;
+		ASSERT(0);
+	}
+
+	/* Fold the fork and extent state into the flag word. */
+	if (whichfork == XFS_ATTR_FORK)
+		bmap->me_flags |= XFS_BMAP_EXTENT_ATTR_FORK;
+	if (state == XFS_EXT_UNWRITTEN)
+		bmap->me_flags |= XFS_BMAP_EXTENT_UNWRITTEN;
+}
+
+/* Copy one queued bmap intent into the next free extent slot of the BUI. */
+STATIC void
+xfs_bmap_update_log_item(
+	struct xfs_trans		*tp,
+	void				*intent,
+	struct list_head		*item)
+{
+	struct xfs_bui_log_item		*buip = intent;
+	struct xfs_bmap_intent		*bi =
+		container_of(item, struct xfs_bmap_intent, bi_list);
+	struct xfs_map_extent		*ext;
+	uint				slot;
+
+	/* Both the transaction and the intent item are dirty from now on. */
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	buip->bui_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+
+	/*
+	 * atomic_inc_return hands back the count after the increment, so
+	 * the array slot to fill is one below that value.
+	 */
+	slot = atomic_inc_return(&buip->bui_next_extent) - 1;
+	ASSERT(slot < buip->bui_format.bui_nextents);
+
+	ext = &buip->bui_format.bui_extents[slot];
+	ext->me_owner = bi->bi_owner->i_ino;
+	ext->me_startoff = bi->bi_bmap.br_startoff;
+	ext->me_startblock = bi->bi_bmap.br_startblock;
+	ext->me_len = bi->bi_bmap.br_blockcount;
+	xfs_trans_set_bmap_flags(ext, bi->bi_type, bi->bi_whichfork,
+			bi->bi_bmap.br_state);
+}
+
+/* Get a BUD so we can process all the deferred bmap updates. */
+STATIC void *
+xfs_bmap_update_create_done(
+	struct xfs_trans		*tp,
+	void				*intent,
+	unsigned int			count)
+{
+	struct xfs_bui_log_item		*buip = intent;
+
+	return xfs_trans_get_bud(tp, buip);
+}
+
+/* Process a deferred bmap update.
*/
+STATIC int
+xfs_bmap_update_finish_item(
+	struct xfs_trans		*tp,
+	struct xfs_defer_ops		*dop,
+	struct list_head		*item,
+	void				*done_item,
+	void				**state)
+{
+	struct xfs_bmap_intent		*bi =
+		container_of(item, struct xfs_bmap_intent, bi_list);
+	int				ret;
+
+	ret = xfs_trans_log_finish_bmap_update(tp, done_item, dop,
+			bi->bi_type, bi->bi_owner, bi->bi_whichfork,
+			bi->bi_bmap.br_startoff,
+			bi->bi_bmap.br_startblock,
+			bi->bi_bmap.br_blockcount,
+			bi->bi_bmap.br_state);
+
+	/* The intent is freed whether or not the update succeeded. */
+	kmem_free(bi);
+	return ret;
+}
+
+/* Abort a pending BUI by releasing it. */
+STATIC void
+xfs_bmap_update_abort_intent(
+	void				*intent)
+{
+	xfs_bui_release(intent);
+}
+
+/* Cancel a deferred bmap update by freeing its intent structure. */
+STATIC void
+xfs_bmap_update_cancel_item(
+	struct list_head		*item)
+{
+	kmem_free(container_of(item, struct xfs_bmap_intent, bi_list));
+}
+
+/* Callbacks wiring bmap update intents into the deferred-ops machinery. */
+static const struct xfs_defer_op_type xfs_bmap_update_defer_type = {
+	.type		= XFS_DEFER_OPS_TYPE_BMAP,
+	.max_items	= XFS_BUI_MAX_FAST_EXTENTS,
+	.diff_items	= xfs_bmap_update_diff_items,
+	.create_intent	= xfs_bmap_update_create_intent,
+	.abort_intent	= xfs_bmap_update_abort_intent,
+	.log_item	= xfs_bmap_update_log_item,
+	.create_done	= xfs_bmap_update_create_done,
+	.finish_item	= xfs_bmap_update_finish_item,
+	.cancel_item	= xfs_bmap_update_cancel_item,
+};
+
+/* Register the bmap update deferred op type. */
+void
+xfs_bmap_update_init_defer_op(void)
+{
+	xfs_defer_init_op_type(&xfs_bmap_update_defer_type);
+}
diff --git a/fs/xfs/xfs_trans_refcount.c b/fs/xfs/xfs_trans_refcount.c
new file mode 100644
index 000000000000..94c1877af834
--- /dev/null
+++ b/fs/xfs/xfs_trans_refcount.c
@@ -0,0 +1,264 @@
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J.
Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_refcount_item.h"
+#include "xfs_alloc.h"
+#include "xfs_refcount.h"
+
+/*
+ * Build the "refcount update done" (CUD) log item for the given CUI with
+ * xfs_cud_init(), attach it to the transaction and hand it back.
+ */
+struct xfs_cud_log_item *
+xfs_trans_get_cud(
+	struct xfs_trans		*tp,
+	struct xfs_cui_log_item		*cuip)
+{
+	struct xfs_cud_log_item		*done;
+
+	done = xfs_cud_init(tp->t_mountp, cuip);
+
+	/* The transaction tracks the new item from here on. */
+	xfs_trans_add_item(tp, &done->cud_item);
+	return done;
+}
+
+/*
+ * Finish a refcount update and log it to the CUD. Note that the
+ * transaction is marked dirty regardless of whether the refcount
+ * update succeeds or fails to support the CUI/CUD lifecycle rules.
+ */
+int
+xfs_trans_log_finish_refcount_update(
+	struct xfs_trans		*tp,
+	struct xfs_cud_log_item		*cudp,
+	struct xfs_defer_ops		*dop,
+	enum xfs_refcount_intent_type	type,
+	xfs_fsblock_t			startblock,
+	xfs_extlen_t			blockcount,
+	xfs_fsblock_t			*new_fsb,
+	xfs_extlen_t			*new_len,
+	struct xfs_btree_cur		**pcur)
+{
+	int				ret = xfs_refcount_finish_one(tp, dop, type,
+						startblock, blockcount,
+						new_fsb, new_len, pcur);
+
+	/*
+	 * Dirty the transaction and the CUD unconditionally.  On failure
+	 * this makes sure the transaction gets aborted, which:
+	 *
+	 * 1.) releases the CUI and frees the CUD
+	 * 2.) shuts down the filesystem
+	 */
+	cudp->cud_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+	tp->t_flags |= XFS_TRANS_DIRTY;
+
+	return ret;
+}
+
+/*
+ * Sort refcount intents by the AG number of their start block.
+ * @priv is the xfs_mount used to convert fs blocks to AG numbers.
+ */
+static int
+xfs_refcount_update_diff_items(
+	void				*priv,
+	struct list_head		*a,
+	struct list_head		*b)
+{
+	struct xfs_mount		*mp = priv;
+	struct xfs_refcount_intent	*first =
+		container_of(a, struct xfs_refcount_intent, ri_list);
+	struct xfs_refcount_intent	*second =
+		container_of(b, struct xfs_refcount_intent, ri_list);
+
+	return XFS_FSB_TO_AGNO(mp, first->ri_startblock) -
+	       XFS_FSB_TO_AGNO(mp, second->ri_startblock);
+}
+
+/*
+ * Get a CUI: initialise a refcount update intent item sized for @count
+ * extents and attach it to the transaction.
+ */
+STATIC void *
+xfs_refcount_update_create_intent(
+	struct xfs_trans		*tp,
+	unsigned int			count)
+{
+	struct xfs_cui_log_item		*intent;
+
+	ASSERT(tp != NULL);
+	ASSERT(count > 0);
+
+	intent = xfs_cui_init(tp->t_mountp, count);
+	ASSERT(intent != NULL);
+
+	/* Get a log_item_desc to point at the new item. */
+	xfs_trans_add_item(tp, &intent->cui_item);
+	return intent;
+}
+
+/* Set the phys extent flags for this refcount update.
*/
+static void
+xfs_trans_set_refcount_flags(
+	struct xfs_phys_extent		*refc,
+	enum xfs_refcount_intent_type	type)
+{
+	switch (type) {
+	case XFS_REFCOUNT_INCREASE:
+	case XFS_REFCOUNT_DECREASE:
+	case XFS_REFCOUNT_ALLOC_COW:
+	case XFS_REFCOUNT_FREE_COW:
+		/* Known intent type: the flag word is the type itself. */
+		refc->pe_flags = type;
+		return;
+	default:
+		/* Unknown type: leave the flags clear and complain. */
+		refc->pe_flags = 0;
+		ASSERT(0);
+	}
+}
+
+/* Copy one queued refcount intent into the next free extent slot of the CUI. */
+STATIC void
+xfs_refcount_update_log_item(
+	struct xfs_trans		*tp,
+	void				*intent,
+	struct list_head		*item)
+{
+	struct xfs_cui_log_item		*cuip = intent;
+	struct xfs_refcount_intent	*ri =
+		container_of(item, struct xfs_refcount_intent, ri_list);
+	struct xfs_phys_extent		*slot_ext;
+	uint				slot;
+
+	/* Both the transaction and the intent item are dirty from now on. */
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	cuip->cui_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+
+	/*
+	 * atomic_inc_return hands back the count after the increment, so
+	 * the array slot to fill is one below that value.
+	 */
+	slot = atomic_inc_return(&cuip->cui_next_extent) - 1;
+	ASSERT(slot < cuip->cui_format.cui_nextents);
+
+	slot_ext = &cuip->cui_format.cui_extents[slot];
+	slot_ext->pe_startblock = ri->ri_startblock;
+	slot_ext->pe_len = ri->ri_blockcount;
+	xfs_trans_set_refcount_flags(slot_ext, ri->ri_type);
+}
+
+/* Get a CUD so we can process all the deferred refcount updates. */
+STATIC void *
+xfs_refcount_update_create_done(
+	struct xfs_trans		*tp,
+	void				*intent,
+	unsigned int			count)
+{
+	struct xfs_cui_log_item		*cuip = intent;
+
+	return xfs_trans_get_cud(tp, cuip);
+}
+
+/* Process a deferred refcount update.
*/
+STATIC int
+xfs_refcount_update_finish_item(
+	struct xfs_trans		*tp,
+	struct xfs_defer_ops		*dop,
+	struct list_head		*item,
+	void				*done_item,
+	void				**state)
+{
+	struct xfs_refcount_intent	*ri =
+		container_of(item, struct xfs_refcount_intent, ri_list);
+	xfs_fsblock_t			next_fsb;
+	xfs_extlen_t			left;
+	int				ret;
+
+	ret = xfs_trans_log_finish_refcount_update(tp, done_item, dop,
+			ri->ri_type, ri->ri_startblock, ri->ri_blockcount,
+			&next_fsb, &left, (struct xfs_btree_cur **)state);
+
+	/* Failed, or nothing left over: this intent is finished with. */
+	if (ret || !(left > 0)) {
+		kmem_free(ri);
+		return ret;
+	}
+
+	/*
+	 * Part of the extent is still unprocessed.  Shrink the intent to
+	 * the remainder and return -EAGAIN without freeing it.
+	 */
+	ASSERT(ri->ri_type == XFS_REFCOUNT_INCREASE ||
+	       ri->ri_type == XFS_REFCOUNT_DECREASE);
+	ri->ri_startblock = next_fsb;
+	ri->ri_blockcount = left;
+	return -EAGAIN;
+}
+
+/*
+ * Clean up after processing deferred refcounts; @state is the btree
+ * cursor that the finish_item calls stored through their state pointer.
+ */
+STATIC void
+xfs_refcount_update_finish_cleanup(
+	struct xfs_trans		*tp,
+	void				*state,
+	int				error)
+{
+	xfs_refcount_finish_one_cleanup(tp, state, error);
+}
+
+/* Abort a pending CUI by releasing it. */
+STATIC void
+xfs_refcount_update_abort_intent(
+	void				*intent)
+{
+	xfs_cui_release(intent);
+}
+
+/* Cancel a deferred refcount update.
*/
+STATIC void
+xfs_refcount_update_cancel_item(
+	struct list_head		*item)
+{
+	/* Nothing was logged for this intent yet; just free it. */
+	kmem_free(container_of(item, struct xfs_refcount_intent, ri_list));
+}
+
+/* Callbacks wiring refcount update intents into the deferred-ops machinery. */
+static const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
+	.type		= XFS_DEFER_OPS_TYPE_REFCOUNT,
+	.max_items	= XFS_CUI_MAX_FAST_EXTENTS,
+	.diff_items	= xfs_refcount_update_diff_items,
+	.create_intent	= xfs_refcount_update_create_intent,
+	.abort_intent	= xfs_refcount_update_abort_intent,
+	.log_item	= xfs_refcount_update_log_item,
+	.create_done	= xfs_refcount_update_create_done,
+	.finish_item	= xfs_refcount_update_finish_item,
+	.finish_cleanup = xfs_refcount_update_finish_cleanup,
+	.cancel_item	= xfs_refcount_update_cancel_item,
+};
+
+/* Register the refcount update deferred op type. */
+void
+xfs_refcount_update_init_defer_op(void)
+{
+	xfs_defer_init_op_type(&xfs_refcount_update_defer_type);
+}
diff --git a/fs/xfs/xfs_trans_rmap.c b/fs/xfs/xfs_trans_rmap.c index 5a50ef881568..9ead064b5e90 100644 --- a/fs/xfs/xfs_trans_rmap.c +++ b/fs/xfs/xfs_trans_rmap.c @@ -48,12 +48,21 @@ xfs_trans_set_rmap_flags( case XFS_RMAP_MAP: rmap->me_flags |= XFS_RMAP_EXTENT_MAP; break; + case XFS_RMAP_MAP_SHARED: + rmap->me_flags |= XFS_RMAP_EXTENT_MAP_SHARED; + break; case XFS_RMAP_UNMAP: rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP; break; + case XFS_RMAP_UNMAP_SHARED: + rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP_SHARED; + break; case XFS_RMAP_CONVERT: rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT; break; + case XFS_RMAP_CONVERT_SHARED: + rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT_SHARED; + break; case XFS_RMAP_ALLOC: rmap->me_flags |= XFS_RMAP_EXTENT_ALLOC; break; |