diff options
Diffstat (limited to 'fs')
71 files changed, 1440 insertions, 752 deletions
diff --git a/fs/9p/fid.c b/fs/9p/fid.c index de009a33e0e2..f84412290a30 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -131,10 +131,9 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, kuid_t uid, int any) } } spin_unlock(&dentry->d_lock); - } else { - if (dentry->d_inode) - ret = v9fs_fid_find_inode(dentry->d_inode, false, uid, any); } + if (!ret && dentry->d_inode) + ret = v9fs_fid_find_inode(dentry->d_inode, false, uid, any); return ret; } diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index fd72fc38c8f5..effb3aa1f3ed 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -295,6 +295,7 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, inode->i_op = &v9fs_file_inode_operations; inode->i_fop = &v9fs_file_operations; } + mapping_set_large_folios(inode->i_mapping); break; case S_IFLNK: diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 4e4a448f6931..6e161f8ffe8d 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -639,6 +639,16 @@ int bch2_alloc_read(struct bch_fs *c) continue; } + if (k.k->p.offset < ca->mi.first_bucket) { + bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode, ca->mi.first_bucket)); + continue; + } + + if (k.k->p.offset >= ca->mi.nbuckets) { + bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); + continue; + } + struct bch_alloc_v4 a; *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen; 0; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 84832c2d4df9..5004f6ba997c 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -678,7 +678,8 @@ struct bch_sb_field_ext { x(disk_accounting_v2, BCH_VERSION(1, 9)) \ x(disk_accounting_v3, BCH_VERSION(1, 10)) \ x(disk_accounting_inum, BCH_VERSION(1, 11)) \ - x(rebalance_work_acct_fix, BCH_VERSION(1, 12)) + x(rebalance_work_acct_fix, BCH_VERSION(1, 12)) \ + x(inode_has_child_snapshots, BCH_VERSION(1, 13)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 771154e3a291..94bbd8505582 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1224,17 +1224,20 @@ int bch2_gc_gens(struct bch_fs *c) u64 b, start_time = local_clock(); int ret; - /* - * Ideally we would be using state_lock and not gc_gens_lock here, but that - * introduces a deadlock in the RO path - we currently take the state - * lock at the start of going RO, thus the gc thread may get stuck: - */ if (!mutex_trylock(&c->gc_gens_lock)) return 0; trace_and_count(c, gc_gens_start, c); - down_read(&c->state_lock); + /* + * We have to use trylock here. Otherwise, we would + * introduce a deadlock in the RO path - we take the + * state lock at the start of going RO. + */ + if (!down_read_trylock(&c->state_lock)) { + mutex_unlock(&c->gc_gens_lock); + return 0; + } for_each_member_device(c, ca) { struct bucket_gens *gens = bucket_gens(ca); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 1c1448b52207..cf933409d385 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1838,10 +1838,11 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b) struct btree_trans *trans = bch2_trans_get(c); btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); - __btree_node_write_done(c, b); - six_unlock_read(&b->c.lock); + /* we don't need transaction context anymore after we got the lock. */ bch2_trans_put(trans); + __btree_node_write_done(c, b); + six_unlock_read(&b->c.lock); } static void btree_node_write_work(struct work_struct *work) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index bfe9f0c1e1be..0883cf6e1a3e 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2381,9 +2381,9 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e else iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k)); - if (unlikely(!(iter->flags & BTREE_ITER_is_extents) - ? bkey_gt(iter_pos, end) - : bkey_ge(iter_pos, end))) + if (unlikely(iter->flags & BTREE_ITER_all_snapshots ? bpos_gt(iter_pos, end) : + iter->flags & BTREE_ITER_is_extents ? bkey_ge(iter_pos, end) : + bkey_gt(iter_pos, end))) goto end; break; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 78e63ad7d380..31a58bf46fdb 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -857,6 +857,14 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *); for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, _start,\ SPOS_MAX, _flags, _k, _ret) +#define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id, \ + _start, _flags, _k, _ret) \ + for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + (_k) = bch2_btree_iter_peek_prev_type(&(_iter), _flags), \ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_rewind(&(_iter))) + #define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \ for_each_btree_key_upto_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret) diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index 1e694fedc5da..a7aedb134e9f 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -171,6 +171,9 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH) return; + if (BTREE_NODE_ID(bn) >= BTREE_ID_NR_MAX) + return; + rcu_read_lock(); struct found_btree_node n = { .btree_id = BTREE_NODE_ID(bn), diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 462b1a2fe1ad..a6ee0beee6b0 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -80,6 +80,7 @@ static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struc if (ptr2 == ptr) break; + ca = bch2_dev_have_ref(c, ptr2->dev); bucket = PTR_BUCKET_POS(ca, ptr2); bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0); } diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 9f3133e3e7e5..e309fb78529b 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -242,6 +242,14 @@ void bch2_accounting_swab(struct bkey_s k) *p = swab64(*p); } +static inline void __accounting_to_replicas(struct bch_replicas_entry_v1 *r, + struct disk_accounting_pos acc) +{ + unsafe_memcpy(r, &acc.replicas, + replicas_entry_bytes(&acc.replicas), + "variable length struct"); +} + static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struct bpos p) { struct disk_accounting_pos acc_k; @@ -249,9 +257,7 @@ static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struc switch (acc_k.type) { case BCH_DISK_ACCOUNTING_replicas: - unsafe_memcpy(r, &acc_k.replicas, - replicas_entry_bytes(&acc_k.replicas), - "variable length struct"); + __accounting_to_replicas(r, acc_k); return true; default: return false; @@ -608,6 +614,81 @@ static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k) return ret; } +static int bch2_disk_accounting_validate_late(struct btree_trans *trans, + struct disk_accounting_pos acc, + u64 *v, unsigned nr) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + int ret = 0, invalid_dev = -1; + + switch (acc.type) { + case BCH_DISK_ACCOUNTING_replicas: { + struct bch_replicas_padded r; + __accounting_to_replicas(&r.e, acc); + + for (unsigned i = 0; i < r.e.nr_devs; i++) + if (r.e.devs[i] != BCH_SB_MEMBER_INVALID && + !bch2_dev_exists(c, r.e.devs[i])) { + invalid_dev = r.e.devs[i]; + goto invalid_device; + } + + /* + * All replicas entry checks except for invalid device are done + * in bch2_accounting_validate + */ + BUG_ON(bch2_replicas_entry_validate(&r.e, c, &buf)); + + if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e), + trans, accounting_replicas_not_marked, + "accounting not marked in superblock replicas\n %s", + (printbuf_reset(&buf), + bch2_accounting_key_to_text(&buf, &acc), + buf.buf))) { + /* + * We're not RW yet and still single threaded, dropping + * and retaking lock is ok: + */ + percpu_up_write(&c->mark_lock); + ret = bch2_mark_replicas(c, &r.e); + if (ret) + goto fsck_err; + percpu_down_write(&c->mark_lock); + } + break; + } + + case BCH_DISK_ACCOUNTING_dev_data_type: + if (!bch2_dev_exists(c, acc.dev_data_type.dev)) { + invalid_dev = acc.dev_data_type.dev; + goto invalid_device; + } + break; + } + +fsck_err: + printbuf_exit(&buf); + return ret; +invalid_device: + if (fsck_err(trans, accounting_to_invalid_device, + "accounting entry points to invalid device %i\n %s", + invalid_dev, + (printbuf_reset(&buf), + bch2_accounting_key_to_text(&buf, &acc), + buf.buf))) { + for (unsigned i = 0; i < nr; i++) + v[i] = -v[i]; + + ret = commit_do(trans, NULL, NULL, 0, + bch2_disk_accounting_mod(trans, &acc, v, nr, false)) ?: + -BCH_ERR_remove_disk_accounting_entry; + } else { + ret = -BCH_ERR_remove_disk_accounting_entry; + } + goto fsck_err; +} + /* * At startup time, initialize the in memory accounting from the btree (and * journal) @@ -666,44 +747,42 @@ int bch2_accounting_read(struct bch_fs *c) } keys->gap = keys->nr = dst - keys->data; - percpu_down_read(&c->mark_lock); - for (unsigned i = 0; i < acc->k.nr; i++) { - u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; - bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false); + percpu_down_write(&c->mark_lock); + unsigned i = 0; + while (i < acc->k.nr) { + unsigned idx = inorder_to_eytzinger0(i, acc->k.nr); - if (bch2_is_zero(v, sizeof(v[0]) * acc->k.data[i].nr_counters)) - continue; + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, acc->k.data[idx].pos); - struct bch_replicas_padded r; - if (!accounting_to_replicas(&r.e, acc->k.data[i].pos)) - continue; + u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; + bch2_accounting_mem_read_counters(acc, idx, v, ARRAY_SIZE(v), false); /* - * If the replicas entry is invalid it'll get cleaned up by - * check_allocations: + * If the entry counters are zeroed, it should be treated as + * nonexistent - it might point to an invalid device. + * + * Remove it, so that if it's re-added it gets re-marked in the + * superblock: */ - if (bch2_replicas_entry_validate(&r.e, c, &buf)) + ret = bch2_is_zero(v, sizeof(v[0]) * acc->k.data[idx].nr_counters) + ? -BCH_ERR_remove_disk_accounting_entry + : bch2_disk_accounting_validate_late(trans, acc_k, + v, acc->k.data[idx].nr_counters); + + if (ret == -BCH_ERR_remove_disk_accounting_entry) { + free_percpu(acc->k.data[idx].v[0]); + free_percpu(acc->k.data[idx].v[1]); + darray_remove_item(&acc->k, &acc->k.data[idx]); + eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, NULL); + ret = 0; continue; - - struct disk_accounting_pos k; - bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos); - - if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e), - trans, accounting_replicas_not_marked, - "accounting not marked in superblock replicas\n %s", - (printbuf_reset(&buf), - bch2_accounting_key_to_text(&buf, &k), - buf.buf))) { - /* - * We're not RW yet and still single threaded, dropping - * and retaking lock is ok: - */ - percpu_up_read(&c->mark_lock); - ret = bch2_mark_replicas(c, &r.e); - if (ret) - goto fsck_err; - percpu_down_read(&c->mark_lock); } + + if (ret) + goto fsck_err; + i++; } preempt_disable(); @@ -742,7 +821,7 @@ int bch2_accounting_read(struct bch_fs *c) } preempt_enable(); fsck_err: - percpu_up_read(&c->mark_lock); + percpu_up_write(&c->mark_lock); err: printbuf_exit(&buf); bch2_trans_put(trans); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 1587c6e1866a..e410cfe37b1a 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -124,6 +124,11 @@ int bch2_stripe_validate(struct bch_fs *c, struct bkey_s_c k, "incorrect value size (%zu < %u)", bkey_val_u64s(k.k), stripe_val_u64s(s)); + bkey_fsck_err_on(s->csum_granularity_bits >= 64, + c, stripe_csum_granularity_bad, + "invalid csum granularity (%u >= 64)", + s->csum_granularity_bits); + ret = bch2_bkey_ptrs_validate(c, k, flags); fsck_err: return ret; @@ -145,7 +150,11 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, nr_data, s.nr_redundant); bch2_prt_csum_type(out, s.csum_type); - prt_printf(out, " gran %u", 1U << s.csum_granularity_bits); + prt_str(out, " gran "); + if (s.csum_granularity_bits < 64) + prt_printf(out, "%llu", 1ULL << s.csum_granularity_bits); + else + prt_printf(out, "(invalid shift %u)", s.csum_granularity_bits); if (s.disk_label) { prt_str(out, " label"); @@ -1197,47 +1206,62 @@ void bch2_do_stripe_deletes(struct bch_fs *c) /* stripe creation: */ static int ec_stripe_key_update(struct btree_trans *trans, - struct bkey_i_stripe *new, - bool create) + struct bkey_i_stripe *old, + struct bkey_i_stripe *new) { struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - int ret; + bool create = !old; - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, - new->k.p, BTREE_ITER_intent); - ret = bkey_err(k); + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, + new->k.p, BTREE_ITER_intent); + int ret = bkey_err(k); if (ret) goto err; - if (k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe)) { - bch2_fs_inconsistent(c, "error %s stripe: got existing key type %s", - create ? "creating" : "updating", - bch2_bkey_types[k.k->type]); + if (bch2_fs_inconsistent_on(k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe), + c, "error %s stripe: got existing key type %s", + create ? "creating" : "updating", + bch2_bkey_types[k.k->type])) { ret = -EINVAL; goto err; } if (k.k->type == KEY_TYPE_stripe) { - const struct bch_stripe *old = bkey_s_c_to_stripe(k).v; - unsigned i; + const struct bch_stripe *v = bkey_s_c_to_stripe(k).v; - if (old->nr_blocks != new->v.nr_blocks) { - bch_err(c, "error updating stripe: nr_blocks does not match"); - ret = -EINVAL; - goto err; - } + BUG_ON(old->v.nr_blocks != new->v.nr_blocks); + BUG_ON(old->v.nr_blocks != v->nr_blocks); - for (i = 0; i < new->v.nr_blocks; i++) { - unsigned v = stripe_blockcount_get(old, i); + for (unsigned i = 0; i < new->v.nr_blocks; i++) { + unsigned sectors = stripe_blockcount_get(v, i); - BUG_ON(v && - (old->ptrs[i].dev != new->v.ptrs[i].dev || - old->ptrs[i].gen != new->v.ptrs[i].gen || - old->ptrs[i].offset != new->v.ptrs[i].offset)); + if (!bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i]) && sectors) { + struct printbuf buf = PRINTBUF; - stripe_blockcount_set(&new->v, i, v); + prt_printf(&buf, "stripe changed nonempty block %u", i); + prt_str(&buf, "\nold: "); + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, "\nnew: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new->k_i)); + bch2_fs_inconsistent(c, "%s", buf.buf); + printbuf_exit(&buf); + ret = -EINVAL; + goto err; + } + + /* + * If the stripe ptr changed underneath us, it must have + * been dev_remove_stripes() -> * invalidate_stripe_to_dev() + */ + if (!bch2_extent_ptr_eq(old->v.ptrs[i], v->ptrs[i])) { + BUG_ON(v->ptrs[i].dev != BCH_SB_MEMBER_INVALID); + + if (bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i])) + new->v.ptrs[i].dev = BCH_SB_MEMBER_INVALID; + } + + stripe_blockcount_set(&new->v, i, sectors); } } @@ -1499,8 +1523,10 @@ static void ec_stripe_create(struct ec_stripe_new *s) BCH_TRANS_COMMIT_no_check_rw| BCH_TRANS_COMMIT_no_enospc, ec_stripe_key_update(trans, - bkey_i_to_stripe(&s->new_stripe.key), - !s->have_existing_stripe)); + s->have_existing_stripe + ? bkey_i_to_stripe(&s->existing_stripe.key) + : NULL, + bkey_i_to_stripe(&s->new_stripe.key))); bch_err_msg(c, ret, "creating stripe key"); if (ret) { goto err; @@ -1876,7 +1902,15 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX); for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) { - __clear_bit(v->ptrs[i].dev, devs.d); + /* + * Note: we don't yet repair invalid blocks (failed/removed + * devices) when reusing stripes - we still need a codepath to + * walk backpointers and update all extents that point to that + * block when updating the stripe + */ + if (v->ptrs[i].dev != BCH_SB_MEMBER_INVALID) + __clear_bit(v->ptrs[i].dev, devs.d); + if (i < h->s->nr_data) nr_have_data++; else diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 60b7875adada..649263516ab1 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -268,7 +268,8 @@ x(BCH_ERR_nopromote, nopromote_no_writes) \ x(BCH_ERR_nopromote, nopromote_enomem) \ x(0, invalid_snapshot_node) \ - x(0, option_needs_open_fs) + x(0, option_needs_open_fs) \ + x(0, remove_disk_accounting_entry) enum bch_errcode { BCH_ERR_START = 2048, diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index ed5001dd662e..923a5f1849a8 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -695,6 +695,16 @@ void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, int bch2_bkey_ptrs_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +static inline bool bch2_extent_ptr_eq(struct bch_extent_ptr ptr1, + struct bch_extent_ptr ptr2) +{ + return (ptr1.cached == ptr2.cached && + ptr1.unwritten == ptr2.unwritten && + ptr1.offset == ptr2.offset && + ptr1.dev == ptr2.dev && + ptr1.dev == ptr2.dev); +} + void bch2_ptr_swab(struct bkey_s); const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c); diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index ee1c0325f313..6d3a05ae5da8 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -369,6 +369,7 @@ static noinline void bch2_dio_write_flush(struct dio_write *dio) static __always_inline long bch2_dio_write_done(struct dio_write *dio) { + struct bch_fs *c = dio->op.c; struct kiocb *req = dio->req; struct bch_inode_info *inode = dio->inode; bool sync = dio->sync; @@ -387,7 +388,7 @@ static __always_inline long bch2_dio_write_done(struct dio_write *dio) ret = dio->op.error ?: ((long) dio->written << 9); bio_put(&dio->op.wbio.bio); - bch2_write_ref_put(dio->op.c, BCH_WRITE_REF_dio_write); + bch2_write_ref_put(c, BCH_WRITE_REF_dio_write); /* inode->i_dio_count is our ref on inode and thus bch_fs */ inode_dio_end(&inode->v); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 02969dff165d..d7bd28bdaffc 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -157,6 +157,20 @@ static bool subvol_inum_eq(subvol_inum a, subvol_inum b) return a.subvol == b.subvol && a.inum == b.inum; } +static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed) +{ + const subvol_inum *inum = data; + + return jhash(&inum->inum, sizeof(inum->inum), seed); +} + +static u32 bch2_vfs_inode_obj_hash_fn(const void *data, u32 len, u32 seed) +{ + const struct bch_inode_info *inode = data; + + return bch2_vfs_inode_hash_fn(&inode->ei_inum, sizeof(inode->ei_inum), seed); +} + static int bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg *arg, const void *obj) { @@ -170,11 +184,91 @@ static const struct rhashtable_params bch2_vfs_inodes_params = { .head_offset = offsetof(struct bch_inode_info, hash), .key_offset = offsetof(struct bch_inode_info, ei_inum), .key_len = sizeof(subvol_inum), + .hashfn = bch2_vfs_inode_hash_fn, + .obj_hashfn = bch2_vfs_inode_obj_hash_fn, .obj_cmpfn = bch2_vfs_inode_cmp_fn, .automatic_shrinking = true, }; -struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum) +int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) +{ + struct bch_fs *c = trans->c; + struct rhashtable *ht = &c->vfs_inodes_table; + subvol_inum inum = (subvol_inum) { .inum = p.offset }; + DARRAY(u32) subvols; + int ret = 0; + + if (!test_bit(BCH_FS_started, &c->flags)) + return false; + + darray_init(&subvols); +restart_from_top: + + /* + * Tweaked version of __rhashtable_lookup(); we need to get a list of + * subvolumes in which the given inode number is open. + * + * For this to work, we don't include the subvolume ID in the key that + * we hash - all inodes with the same inode number regardless of + * subvolume will hash to the same slot. + * + * This will be less than ideal if the same file is ever open + * simultaneously in many different snapshots: + */ + rcu_read_lock(); + struct rhash_lock_head __rcu *const *bkt; + struct rhash_head *he; + unsigned int hash; + struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); +restart: + hash = rht_key_hashfn(ht, tbl, &inum, bch2_vfs_inodes_params); + bkt = rht_bucket(tbl, hash); + do { + struct bch_inode_info *inode; + + rht_for_each_entry_rcu_from(inode, he, rht_ptr_rcu(bkt), tbl, hash, hash) { + if (inode->ei_inum.inum == inum.inum) { + ret = darray_push_gfp(&subvols, inode->ei_inum.subvol, + GFP_NOWAIT|__GFP_NOWARN); + if (ret) { + rcu_read_unlock(); + ret = darray_make_room(&subvols, 1); + if (ret) + goto err; + subvols.nr = 0; + goto restart_from_top; + } + } + } + /* An object might have been moved to a different hash chain, + * while we walk along it - better check and retry. + */ + } while (he != RHT_NULLS_MARKER(bkt)); + + /* Ensure we see any new tables. */ + smp_rmb(); + + tbl = rht_dereference_rcu(tbl->future_tbl, ht); + if (unlikely(tbl)) + goto restart; + rcu_read_unlock(); + + darray_for_each(subvols, i) { + u32 snap; + ret = bch2_subvolume_get_snapshot(trans, *i, &snap); + if (ret) + goto err; + + ret = bch2_snapshot_is_ancestor(c, snap, p.snapshot); + if (ret) + break; + } +err: + darray_exit(&subvols); + return ret; +} + +static struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum) { return rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params); } @@ -184,7 +278,8 @@ static void __wait_on_freeing_inode(struct bch_fs *c, subvol_inum inum) { wait_queue_head_t *wq; - DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW); + struct wait_bit_queue_entry wait; + wq = inode_bit_waitqueue(&wait, &inode->v, __I_NEW); prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); spin_unlock(&inode->v.i_lock); @@ -252,7 +347,8 @@ static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c, set_bit(EI_INODE_HASHED, &inode->ei_flags); retry: - if (unlikely(rhashtable_lookup_insert_fast(&c->vfs_inodes_table, + if (unlikely(rhashtable_lookup_insert_key(&c->vfs_inodes_table, + &inode->ei_inum, &inode->hash, bch2_vfs_inodes_params))) { old = bch2_inode_hash_find(c, trans, inode->ei_inum); diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index da74ecc236e7..59f9f7ae728d 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -54,8 +54,6 @@ static inline subvol_inum inode_inum(struct bch_inode_info *inode) return inode->ei_inum; } -struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *, subvol_inum); - /* * Set if we've gotten a btree error for this inode, and thus the vfs inode and * btree inode may be inconsistent: @@ -148,6 +146,8 @@ struct bch_inode_info * __bch2_create(struct mnt_idmap *, struct bch_inode_info *, struct dentry *, umode_t, dev_t, subvol_inum, unsigned); +int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p); + int bch2_fs_quota_transfer(struct bch_fs *, struct bch_inode_info *, struct bch_qid, @@ -198,10 +198,7 @@ int bch2_vfs_init(void); #define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields) ({ do {} while (0); }) -static inline struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum) -{ - return NULL; -} +static inline int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) { return 0; } static inline void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s) {} diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index b8a6ceb0cc7a..a1087fd292e4 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -326,17 +326,54 @@ err: return ret; } +static inline bool inode_should_reattach(struct bch_inode_unpacked *inode) +{ + if (inode->bi_inum == BCACHEFS_ROOT_INO && + inode->bi_subvol == BCACHEFS_ROOT_SUBVOL) + return false; + + return !inode->bi_dir && !(inode->bi_flags & BCH_INODE_unlinked); +} + +static int maybe_delete_dirent(struct btree_trans *trans, struct bpos d_pos, u32 snapshot) +{ + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_dirents, + SPOS(d_pos.inode, d_pos.offset, snapshot), + BTREE_ITER_intent| + BTREE_ITER_with_updates); + int ret = bkey_err(k); + if (ret) + return ret; + + if (bpos_eq(k.k->p, d_pos)) { + /* + * delet_at() doesn't work because the update path doesn't + * internally use BTREE_ITER_with_updates yet + */ + struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k)); + ret = PTR_ERR_OR_ZERO(k); + if (ret) + goto err; + + bkey_init(&k->k); + k->k.type = KEY_TYPE_whiteout; + k->k.p = iter.pos; + ret = bch2_trans_update(trans, &iter, k, BTREE_UPDATE_internal_snapshot_node); + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode) { struct bch_fs *c = trans->c; - struct bch_hash_info dir_hash; struct bch_inode_unpacked lostfound; char name_buf[20]; - struct qstr name; - u64 dir_offset = 0; - u32 dirent_snapshot = inode->bi_snapshot; int ret; + u32 dirent_snapshot = inode->bi_snapshot; if (inode->bi_subvol) { inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL; @@ -367,9 +404,10 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked * if (ret) return ret; - dir_hash = bch2_hash_info_init(c, &lostfound); + struct bch_hash_info dir_hash = bch2_hash_info_init(c, &lostfound); + struct qstr name = (struct qstr) QSTR(name_buf); - name = (struct qstr) QSTR(name_buf); + inode->bi_dir = lostfound.bi_inum; ret = bch2_dirent_create_snapshot(trans, inode->bi_parent_subvol, lostfound.bi_inum, @@ -378,17 +416,70 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked * inode_d_type(inode), &name, inode->bi_subvol ?: inode->bi_inum, - &dir_offset, + &inode->bi_dir_offset, STR_HASH_must_create); if (ret) { bch_err_msg(c, ret, "error creating dirent"); return ret; } - inode->bi_dir = lostfound.bi_inum; - inode->bi_dir_offset = dir_offset; + ret = __bch2_fsck_write_inode(trans, inode); + if (ret) + return ret; + + /* + * Fix up inodes in child snapshots: if they should also be reattached + * update the backpointer field, if they should not be we need to emit + * whiteouts for the dirent we just created. + */ + if (!inode->bi_subvol && bch2_snapshot_is_leaf(c, inode->bi_snapshot) <= 0) { + snapshot_id_list whiteouts_done; + struct btree_iter iter; + struct bkey_s_c k; + + darray_init(&whiteouts_done); + + for_each_btree_key_reverse_norestart(trans, iter, + BTREE_ID_inodes, SPOS(0, inode->bi_inum, inode->bi_snapshot - 1), + BTREE_ITER_all_snapshots|BTREE_ITER_intent, k, ret) { + if (k.k->p.offset != inode->bi_inum) + break; + + if (!bkey_is_inode(k.k) || + !bch2_snapshot_is_ancestor(c, k.k->p.snapshot, inode->bi_snapshot) || + snapshot_list_has_ancestor(c, &whiteouts_done, k.k->p.snapshot)) + continue; + + struct bch_inode_unpacked child_inode; + bch2_inode_unpack(k, &child_inode); + + if (!inode_should_reattach(&child_inode)) { + ret = maybe_delete_dirent(trans, + SPOS(lostfound.bi_inum, inode->bi_dir_offset, + dirent_snapshot), + k.k->p.snapshot); + if (ret) + break; + + ret = snapshot_list_add(c, &whiteouts_done, k.k->p.snapshot); + if (ret) + break; + } else { + iter.snapshot = k.k->p.snapshot; + child_inode.bi_dir = inode->bi_dir; + child_inode.bi_dir_offset = inode->bi_dir_offset; + + ret = bch2_inode_write_flags(trans, &iter, &child_inode, + BTREE_UPDATE_internal_snapshot_node); + if (ret) + break; + } + } + darray_exit(&whiteouts_done); + bch2_trans_iter_exit(trans, &iter); + } - return __bch2_fsck_write_inode(trans, inode); + return ret; } static int remove_backpointer(struct btree_trans *trans, @@ -994,7 +1085,6 @@ static int check_inode_dirent_inode(struct btree_trans *trans, */ inode->bi_dir = 0; inode->bi_dir_offset = 0; - inode->bi_flags &= ~BCH_INODE_backptr_untrusted; *write_inode = true; } @@ -1006,28 +1096,11 @@ fsck_err: return ret; } -static bool bch2_inode_is_open(struct bch_fs *c, struct bpos p) -{ - subvol_inum inum = { - .subvol = snapshot_t(c, p.snapshot)->subvol, - .inum = p.offset, - }; - - /* snapshot tree corruption, can't safely delete */ - if (!inum.subvol) { - bch_warn_ratelimited(c, "%s(): snapshot %u has no subvol, unlinked but can't safely delete", __func__, p.snapshot); - return true; - } - - return __bch2_inode_hash_find(c, inum) != NULL; -} - static int check_inode(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, struct bch_inode_unpacked *prev, - struct snapshots_seen *s, - bool full) + struct snapshots_seen *s) { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; @@ -1050,12 +1123,6 @@ static int check_inode(struct btree_trans *trans, BUG_ON(bch2_inode_unpack(k, &u)); - if (!full && - !(u.bi_flags & (BCH_INODE_i_size_dirty| - BCH_INODE_i_sectors_dirty| - BCH_INODE_unlinked))) - return 0; - if (prev->bi_inum != u.bi_inum) *prev = u; @@ -1101,28 +1168,27 @@ static int check_inode(struct btree_trans *trans, ret = 0; } - if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) && - bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) { - struct bpos new_min_pos; - - ret = bch2_propagate_key_to_snapshot_leaves(trans, iter->btree_id, k, &new_min_pos); - if (ret) - goto err; - - u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked; - - ret = __bch2_fsck_write_inode(trans, &u); + ret = bch2_inode_has_child_snapshots(trans, k.k->p); + if (ret < 0) + goto err; - bch_err_msg(c, ret, "in fsck updating inode"); + if (fsck_err_on(ret != !!(u.bi_flags & BCH_INODE_has_child_snapshot), + trans, inode_has_child_snapshots_wrong, + "inode has_child_snapshots flag wrong (should be %u)\n%s", + ret, + (printbuf_reset(&buf), + bch2_inode_unpacked_to_text(&buf, &u), + buf.buf))) { if (ret) - goto err_noprint; - - if (!bpos_eq(new_min_pos, POS_MIN)) - bch2_btree_iter_set_pos(iter, bpos_predecessor(new_min_pos)); - goto err_noprint; + u.bi_flags |= BCH_INODE_has_child_snapshot; + else + u.bi_flags &= ~BCH_INODE_has_child_snapshot; + do_update = true; } + ret = 0; - if (u.bi_flags & BCH_INODE_unlinked) { + if ((u.bi_flags & BCH_INODE_unlinked) && + !(u.bi_flags & BCH_INODE_has_child_snapshot)) { if (!test_bit(BCH_FS_started, &c->flags)) { /* * If we're not in online fsck, don't delete unlinked @@ -1147,7 +1213,11 @@ static int check_inode(struct btree_trans *trans, if (ret) goto err; } else { - if (fsck_err_on(!bch2_inode_is_open(c, k.k->p), + ret = bch2_inode_or_descendents_is_open(trans, k.k->p); + if (ret < 0) + goto err; + + if (fsck_err_on(!ret, trans, inode_unlinked_and_not_open, "inode %llu%u unlinked and not open", u.bi_inum, u.bi_snapshot)) { @@ -1155,69 +1225,10 @@ static int check_inode(struct btree_trans *trans, bch_err_msg(c, ret, "in fsck deleting inode"); goto err_noprint; } + ret = 0; } } - /* i_size_dirty is vestigal, since we now have logged ops for truncate * */ - if (u.bi_flags & BCH_INODE_i_size_dirty && - (!test_bit(BCH_FS_clean_recovery, &c->flags) || - fsck_err(trans, inode_i_size_dirty_but_clean, - "filesystem marked clean, but inode %llu has i_size dirty", - u.bi_inum))) { - bch_verbose(c, "truncating inode %llu", u.bi_inum); - - /* - * XXX: need to truncate partial blocks too here - or ideally - * just switch units to bytes and that issue goes away - */ - ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, - SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9, - iter->pos.snapshot), - POS(u.bi_inum, U64_MAX), - 0, NULL); - bch_err_msg(c, ret, "in fsck truncating inode"); - if (ret) - return ret; - - /* - * We truncated without our normal sector accounting hook, just - * make sure we recalculate it: - */ - u.bi_flags |= BCH_INODE_i_sectors_dirty; - - u.bi_flags &= ~BCH_INODE_i_size_dirty; - do_update = true; - } - - /* i_sectors_dirty is vestigal, i_sectors is always updated transactionally */ - if (u.bi_flags & BCH_INODE_i_sectors_dirty && - (!test_bit(BCH_FS_clean_recovery, &c->flags) || - fsck_err(trans, inode_i_sectors_dirty_but_clean, - "filesystem marked clean, but inode %llu has i_sectors dirty", - u.bi_inum))) { - s64 sectors; - - bch_verbose(c, "recounting sectors for inode %llu", - u.bi_inum); - - sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot); - if (sectors < 0) { - bch_err_msg(c, sectors, "in fsck recounting inode sectors"); - return sectors; - } - - u.bi_sectors = sectors; - u.bi_flags &= ~BCH_INODE_i_sectors_dirty; - do_update = true; - } - - if (u.bi_flags & BCH_INODE_backptr_untrusted) { - u.bi_dir = 0; - u.bi_dir_offset = 0; - u.bi_flags &= ~BCH_INODE_backptr_untrusted; - do_update = true; - } - if (fsck_err_on(u.bi_parent_subvol && (u.bi_subvol == 0 || u.bi_subvol == BCACHEFS_ROOT_SUBVOL), @@ -1274,7 +1285,6 @@ err_noprint: int bch2_check_inodes(struct bch_fs *c) { - bool full = c->opts.fsck; struct bch_inode_unpacked prev = { 0 }; struct snapshots_seen s; @@ -1285,13 +1295,104 @@ int bch2_check_inodes(struct bch_fs *c) POS_MIN, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_inode(trans, &iter, k, &prev, &s, full))); + check_inode(trans, &iter, k, &prev, &s))); snapshots_seen_exit(&s); bch_err_fn(c, ret); return ret; } +static int find_oldest_inode_needs_reattach(struct btree_trans *trans, + struct bch_inode_unpacked *inode) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + /* + * We look for inodes to reattach in natural key order, leaves first, + * but we should do the reattach at the oldest version that needs to be + * reattached: + */ + for_each_btree_key_norestart(trans, iter, + BTREE_ID_inodes, + SPOS(0, inode->bi_inum, inode->bi_snapshot + 1), + BTREE_ITER_all_snapshots, k, ret) { + if (k.k->p.offset != inode->bi_inum) + break; + + if (!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, k.k->p.snapshot)) + continue; + + if (!bkey_is_inode(k.k)) + break; + + struct bch_inode_unpacked parent_inode; + bch2_inode_unpack(k, &parent_inode); + + if (!inode_should_reattach(&parent_inode)) + break; + + *inode = parent_inode; + } + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + +static int check_unreachable_inode(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct printbuf buf = PRINTBUF; + int ret = 0; + + if (!bkey_is_inode(k.k)) + return 0; + + struct bch_inode_unpacked inode; + BUG_ON(bch2_inode_unpack(k, &inode)); + + if (!inode_should_reattach(&inode)) + return 0; + + ret = find_oldest_inode_needs_reattach(trans, &inode); + if (ret) + return ret; + + if (fsck_err(trans, inode_unreachable, + "unreachable inode:\n%s", + (bch2_inode_unpacked_to_text(&buf, &inode), + buf.buf))) + ret = reattach_inode(trans, &inode); +fsck_err: + printbuf_exit(&buf); + return ret; +} + +/* + * Reattach unreachable (but not unlinked) inodes + * + * Run after check_inodes() and check_dirents(), so we node that inode + * backpointer fields point to valid dirents, and every inode that has a dirent + * that points to it has its backpointer field set - so we're just looking for + * non-unlinked inodes without backpointers: + * + * XXX: this is racy w.r.t. hardlink removal in online fsck + */ +int bch2_check_unreachable_inodes(struct bch_fs *c) +{ + int ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, + POS_MIN, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + check_unreachable_inode(trans, &iter, k))); + bch_err_fn(c, ret); + return ret; +} + static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode) { switch (btree) { @@ -1694,8 +1795,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot)) continue; - if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) && - k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 && + if (fsck_err_on(k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 && !bkey_extent_is_reservation(k), trans, extent_past_end_of_inode, "extent type past end of inode %llu:%u, i_size %llu\n %s", @@ -2450,22 +2550,6 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, if (ret) break; - /* - * We've checked that inode backpointers point to valid dirents; - * here, it's sufficient to check that the subvolume root has a - * dirent: - */ - if (fsck_err_on(!subvol_root.bi_dir, - trans, subvol_unreachable, - "unreachable subvolume %s", - (bch2_bkey_val_to_text(&buf, c, s.s_c), - prt_newline(&buf), - bch2_inode_unpacked_to_text(&buf, &subvol_root), - buf.buf))) { - ret = reattach_subvol(trans, s); - break; - } - u32 parent = le32_to_cpu(s.v->fs_path_parent); if (darray_u32_has(&subvol_path, parent)) { @@ -2526,12 +2610,6 @@ static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) return false; } -/* - * Check that a given inode is reachable from its subvolume root - we already - * verified subvolume connectivity: - * - * XXX: we should also be verifying that inodes are in the right subvolumes - */ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c inode_k) { struct bch_fs *c = trans->c; @@ -2545,6 +2623,9 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino BUG_ON(bch2_inode_unpack(inode_k, &inode)); + if (!S_ISDIR(inode.bi_mode)) + return 0; + while (!inode.bi_subvol) { struct btree_iter dirent_iter; struct bkey_s_c_dirent d; @@ -2559,21 +2640,15 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino bch2_trans_iter_exit(trans, &dirent_iter); if (bch2_err_matches(ret, ENOENT)) { - ret = 0; - if (fsck_err(trans, inode_unreachable, - "unreachable inode\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, inode_k), - buf.buf))) - ret = reattach_inode(trans, &inode); + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, inode_k); + bch_err(c, "unreachable inode in check_directory_structure: %s\n%s", + bch2_err_str(ret), buf.buf); goto out; } bch2_trans_iter_exit(trans, &dirent_iter); - if (!S_ISDIR(inode.bi_mode)) - break; - ret = darray_push(p, ((struct pathbuf_entry) { .inum = inode.bi_inum, .snapshot = snapshot, @@ -2626,9 +2701,8 @@ fsck_err: } /* - * Check for unreachable inodes, as well as loops in the directory structure: - * After bch2_check_dirents(), if an inode backpointer doesn't exist that means it's - * unreachable: + * Check for loops in the directory structure: all other connectivity issues + * have been fixed by prior passes */ int bch2_check_directory_structure(struct bch_fs *c) { @@ -2756,6 +2830,10 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, if (S_ISDIR(u.bi_mode)) continue; + /* + * Previous passes ensured that bi_nlink is nonzero if + * it had multiple hardlinks: + */ if (!u.bi_nlink) continue; diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h index a4ef94271784..1cca31011530 100644 --- a/fs/bcachefs/fsck.h +++ b/fs/bcachefs/fsck.h @@ -9,6 +9,7 @@ int bch2_check_dirents(struct bch_fs *); int bch2_check_xattrs(struct bch_fs *); int bch2_check_root(struct bch_fs *); int bch2_check_subvolume_structure(struct bch_fs *); +int bch2_check_unreachable_inodes(struct bch_fs *); int bch2_check_directory_structure(struct bch_fs *); int bch2_check_nlinks(struct bch_fs *); int bch2_fix_reflink_p(struct bch_fs *); diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 3e5bc01961b8..344ccb7a824c 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -12,6 +12,7 @@ #include "error.h" #include "extents.h" #include "extent_update.h" +#include "fs.h" #include "inode.h" #include "str_hash.h" #include "snapshot.h" @@ -34,6 +35,8 @@ static const char * const bch2_inode_flag_strs[] = { }; #undef x +static int delete_ancestor_snapshot_inodes(struct btree_trans *, struct bpos); + static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 }; static int inode_decode_field(const u8 *in, const u8 *end, @@ -575,9 +578,137 @@ static inline u64 bkey_inode_flags(struct bkey_s_c k) } } -static inline bool bkey_is_deleted_inode(struct bkey_s_c k) +static inline void bkey_inode_flags_set(struct bkey_s k, u64 f) +{ + switch (k.k->type) { + case KEY_TYPE_inode: + bkey_s_to_inode(k).v->bi_flags = cpu_to_le32(f); + return; + case KEY_TYPE_inode_v2: + bkey_s_to_inode_v2(k).v->bi_flags = cpu_to_le64(f); + return; + case KEY_TYPE_inode_v3: + bkey_s_to_inode_v3(k).v->bi_flags = cpu_to_le64(f); + return; + default: + BUG(); + } +} + +static inline bool bkey_is_unlinked_inode(struct bkey_s_c k) +{ + unsigned f = bkey_inode_flags(k) & BCH_INODE_unlinked; + + return (f & BCH_INODE_unlinked) && !(f & BCH_INODE_has_child_snapshot); +} + +static struct bkey_s_c +bch2_bkey_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter, + enum btree_id btree, struct bpos pos, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c k; + int ret = 0; + + for_each_btree_key_upto_norestart(trans, *iter, btree, + bpos_successor(pos), + SPOS(pos.inode, pos.offset, U32_MAX), + flags|BTREE_ITER_all_snapshots, k, ret) + if (bch2_snapshot_is_ancestor(c, pos.snapshot, k.k->p.snapshot)) + return k; + + bch2_trans_iter_exit(trans, iter); + return ret ? bkey_s_c_err(ret) : bkey_s_c_null; +} + +static struct bkey_s_c +bch2_inode_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter, + struct bpos pos, unsigned flags) +{ + struct bkey_s_c k; +again: + k = bch2_bkey_get_iter_snapshot_parent(trans, iter, BTREE_ID_inodes, pos, flags); + if (!k.k || + bkey_err(k) || + bkey_is_inode(k.k)) + return k; + + bch2_trans_iter_exit(trans, iter); + pos = k.k->p; + goto again; +} + +int __bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos) { - return bkey_inode_flags(k) & BCH_INODE_unlinked; + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + for_each_btree_key_upto_norestart(trans, iter, + BTREE_ID_inodes, POS(0, pos.offset), bpos_predecessor(pos), + BTREE_ITER_all_snapshots| + BTREE_ITER_with_updates, k, ret) + if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot) && + bkey_is_inode(k.k)) { + ret = 1; + break; + } + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int update_inode_has_children(struct btree_trans *trans, + struct bkey_s k, + bool have_child) +{ + if (!have_child) { + int ret = bch2_inode_has_child_snapshots(trans, k.k->p); + if (ret) + return ret < 0 ? ret : 0; + } + + u64 f = bkey_inode_flags(k.s_c); + if (have_child != !!(f & BCH_INODE_has_child_snapshot)) + bkey_inode_flags_set(k, f ^ BCH_INODE_has_child_snapshot); + + return 0; +} + +static int update_parent_inode_has_children(struct btree_trans *trans, struct bpos pos, + bool have_child) +{ + struct btree_iter iter; + struct bkey_s_c k = bch2_inode_get_iter_snapshot_parent(trans, + &iter, pos, BTREE_ITER_with_updates); + int ret = bkey_err(k); + if (ret) + return ret; + if (!k.k) + return 0; + + if (!have_child) { + ret = bch2_inode_has_child_snapshots(trans, k.k->p); + if (ret) { + ret = ret < 0 ? ret : 0; + goto err; + } + } + + u64 f = bkey_inode_flags(k); + if (have_child != !!(f & BCH_INODE_has_child_snapshot)) { + struct bkey_i *update = bch2_bkey_make_mut(trans, &iter, &k, + BTREE_UPDATE_internal_snapshot_node); + ret = PTR_ERR_OR_ZERO(update); + if (ret) + goto err; + + bkey_inode_flags_set(bkey_i_to_s(update), f ^ BCH_INODE_has_child_snapshot); + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret; } int bch2_trigger_inode(struct btree_trans *trans, @@ -586,6 +717,8 @@ int bch2_trigger_inode(struct btree_trans *trans, struct bkey_s new, enum btree_iter_update_trigger_flags flags) { + struct bch_fs *c = trans->c; + if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) { BUG_ON(!trans->journal_res.seq); bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq); @@ -599,13 +732,41 @@ int bch2_trigger_inode(struct btree_trans *trans, return ret; } - int deleted_delta = (int) bkey_is_deleted_inode(new.s_c) - - (int) bkey_is_deleted_inode(old); - if ((flags & BTREE_TRIGGER_transactional) && deleted_delta) { - int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, - new.k->p, deleted_delta > 0); - if (ret) - return ret; + if (flags & BTREE_TRIGGER_transactional) { + int unlinked_delta = (int) bkey_is_unlinked_inode(new.s_c) - + (int) bkey_is_unlinked_inode(old); + if (unlinked_delta) { + int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, + new.k->p, unlinked_delta > 0); + if (ret) + return ret; + } + + /* + * If we're creating or deleting an inode at this snapshot ID, + * and there might be an inode in a parent snapshot ID, we might + * need to set or clear the has_child_snapshot flag on the + * parent. + */ + int deleted_delta = (int) bkey_is_inode(new.k) - + (int) bkey_is_inode(old.k); + if (deleted_delta && + bch2_snapshot_parent(c, new.k->p.snapshot)) { + int ret = update_parent_inode_has_children(trans, new.k->p, + deleted_delta > 0); + if (ret) + return ret; + } + + /* + * When an inode is first updated in a new snapshot, we may need + * to clear has_child_snapshot + */ + if (deleted_delta > 0) { + int ret = update_inode_has_children(trans, new, false); + if (ret) + return ret; + } } return 0; @@ -888,6 +1049,11 @@ err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; + if (ret) + goto err2; + + ret = delete_ancestor_snapshot_inodes(trans, SPOS(0, inum.inum, snapshot)); +err2: bch2_trans_put(trans); return ret; } @@ -992,7 +1158,7 @@ int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_i return 0; } -int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) +static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) { struct bch_fs *c = trans->c; struct btree_iter iter = { NULL }; @@ -1055,6 +1221,45 @@ err: return ret ?: -BCH_ERR_transaction_restart_nested; } +/* + * After deleting an inode, there may be versions in older snapshots that should + * also be deleted - if they're not referenced by sibling snapshots and not open + * in other subvolumes: + */ +static int delete_ancestor_snapshot_inodes(struct btree_trans *trans, struct bpos pos) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; +next_parent: + ret = lockrestart_do(trans, + bkey_err(k = bch2_inode_get_iter_snapshot_parent(trans, &iter, pos, 0))); + if (ret || !k.k) + return ret; + + bool unlinked = bkey_is_unlinked_inode(k); + pos = k.k->p; + bch2_trans_iter_exit(trans, &iter); + + if (!unlinked) + return 0; + + ret = lockrestart_do(trans, bch2_inode_or_descendents_is_open(trans, pos)); + if (ret) + return ret < 0 ? ret : 0; + + ret = __bch2_inode_rm_snapshot(trans, pos.offset, pos.snapshot); + if (ret) + return ret; + goto next_parent; +} + +int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) +{ + return __bch2_inode_rm_snapshot(trans, inum, snapshot) ?: + delete_ancestor_snapshot_inodes(trans, SPOS(0, inum, snapshot)); +} + static int may_delete_deleted_inode(struct btree_trans *trans, struct btree_iter *iter, struct bpos pos, @@ -1064,6 +1269,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans, struct btree_iter inode_iter; struct bkey_s_c k; struct bch_inode_unpacked inode; + struct printbuf buf = PRINTBUF; int ret; k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_cached); @@ -1099,6 +1305,31 @@ static int may_delete_deleted_inode(struct btree_trans *trans, pos.offset, pos.snapshot)) goto delete; + if (fsck_err_on(inode.bi_flags & BCH_INODE_has_child_snapshot, + trans, deleted_inode_has_child_snapshots, + "inode with child snapshots %llu:%u in deleted_inodes btree", + pos.offset, pos.snapshot)) + goto delete; + + ret = bch2_inode_has_child_snapshots(trans, k.k->p); + if (ret < 0) + goto out; + + if (ret) { + if (fsck_err(trans, inode_has_child_snapshots_wrong, + "inode has_child_snapshots flag wrong (should be set)\n%s", + (printbuf_reset(&buf), + bch2_inode_unpacked_to_text(&buf, &inode), + buf.buf))) { + inode.bi_flags |= BCH_INODE_has_child_snapshot; + ret = __bch2_fsck_write_inode(trans, &inode); + if (ret) + goto out; + } + goto delete; + + } + if (test_bit(BCH_FS_clean_recovery, &c->flags) && !fsck_err(trans, deleted_inode_but_clean, "filesystem marked as clean but have deleted inode %llu:%u", @@ -1107,33 +1338,11 @@ static int may_delete_deleted_inode(struct btree_trans *trans, goto out; } - if (bch2_snapshot_is_internal_node(c, pos.snapshot)) { - struct bpos new_min_pos; - - ret = bch2_propagate_key_to_snapshot_leaves(trans, inode_iter.btree_id, k, &new_min_pos); - if (ret) - goto out; - - inode.bi_flags &= ~BCH_INODE_unlinked; - - ret = bch2_inode_write_flags(trans, &inode_iter, &inode, - BTREE_UPDATE_internal_snapshot_node); - bch_err_msg(c, ret, "clearing inode unlinked flag"); - if (ret) - goto out; - - /* - * We'll need another write buffer flush to pick up the new - * unlinked inodes in the snapshot leaves: - */ - *need_another_pass = true; - goto out; - } - ret = 1; out: fsck_err: bch2_trans_iter_exit(trans, &inode_iter); + printbuf_exit(&buf); return ret; delete: ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, pos, false); diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 9c1f67705684..c8e98443e2d4 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -5,6 +5,7 @@ #include "bkey.h" #include "bkey_methods.h" #include "opts.h" +#include "snapshot.h" enum bch_validate_flags; extern const char * const bch2_inode_opts[]; @@ -17,6 +18,15 @@ int bch2_inode_v3_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +int __bch2_inode_has_child_snapshots(struct btree_trans *, struct bpos); + +static inline int bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos) +{ + return bch2_snapshot_is_leaf(trans->c, pos.snapshot) <= 0 + ? __bch2_inode_has_child_snapshots(trans, pos) + : 0; +} + int bch2_trigger_inode(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, enum btree_iter_update_trigger_flags); diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h index 83d107331edf..a204e46b6b47 100644 --- a/fs/bcachefs/inode_format.h +++ b/fs/bcachefs/inode_format.h @@ -133,7 +133,8 @@ enum inode_opt_id { x(i_size_dirty, 5) \ x(i_sectors_dirty, 6) \ x(unlinked, 7) \ - x(backptr_untrusted, 8) + x(backptr_untrusted, 8) \ + x(has_child_snapshot, 9) /* bits 20+ reserved for packed fields below: */ diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index f5f7db50ca31..dc099f06341f 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -603,6 +603,19 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, { int ret; + if (closure_wait_event_timeout(&j->async_wait, + (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || + (flags & JOURNAL_RES_GET_NONBLOCK), + HZ * 10)) + return ret; + + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct printbuf buf = PRINTBUF; + bch2_journal_debug_to_text(&buf, j); + bch_err(c, "Journal stuck? Waited for 10 seconds...\n%s", + buf.buf); + printbuf_exit(&buf); + closure_wait_event(&j->async_wait, (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || (flags & JOURNAL_RES_GET_NONBLOCK)); diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 232be8a44051..84097235eea9 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -427,7 +427,9 @@ void bch2_opt_to_text(struct printbuf *out, prt_printf(out, "%lli", v); break; case BCH_OPT_STR: - if (flags & OPT_SHOW_FULL_LIST) + if (v < opt->min || v >= opt->max - 1) + prt_printf(out, "(invalid option %lli)", v); + else if (flags & OPT_SHOW_FULL_LIST) prt_string_option(out, opt->choices, v); else prt_str(out, opt->choices[v]); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 6db72d3bad7d..55e1504a8130 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -287,7 +287,8 @@ int bch2_journal_replay(struct bch_fs *c) BCH_TRANS_COMMIT_no_enospc| BCH_TRANS_COMMIT_journal_reclaim| BCH_TRANS_COMMIT_skip_accounting_apply| - BCH_TRANS_COMMIT_no_journal_res, + BCH_TRANS_COMMIT_no_journal_res| + BCH_WATERMARK_reclaim, bch2_journal_replay_accounting_key(trans, k)); if (bch2_fs_fatal_err_on(ret, c, "error replaying accounting; %s", bch2_err_str(ret))) goto err; diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h index 50406ce0e4ef..9d96c06e365c 100644 --- a/fs/bcachefs/recovery_passes_types.h +++ b/fs/bcachefs/recovery_passes_types.h @@ -46,6 +46,7 @@ x(check_dirents, 27, PASS_FSCK) \ x(check_xattrs, 28, PASS_FSCK) \ x(check_root, 29, PASS_ONLINE|PASS_FSCK) \ + x(check_unreachable_inodes, 40, PASS_ONLINE|PASS_FSCK) \ x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \ x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ x(check_nlinks, 31, PASS_FSCK) \ diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index bcb3276747e0..477ef0997949 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -66,9 +66,9 @@ void bch2_replicas_entry_to_text(struct printbuf *out, prt_printf(out, "]"); } -static int bch2_replicas_entry_validate_locked(struct bch_replicas_entry_v1 *r, - struct bch_sb *sb, - struct printbuf *err) +static int bch2_replicas_entry_sb_validate(struct bch_replicas_entry_v1 *r, + struct bch_sb *sb, + struct printbuf *err) { if (!r->nr_devs) { prt_printf(err, "no devices in entry "); @@ -98,10 +98,28 @@ int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r, struct bch_fs *c, struct printbuf *err) { - mutex_lock(&c->sb_lock); - int ret = bch2_replicas_entry_validate_locked(r, c->disk_sb.sb, err); - mutex_unlock(&c->sb_lock); - return ret; + if (!r->nr_devs) { + prt_printf(err, "no devices in entry "); + goto bad; + } + + if (r->nr_required > 1 && + r->nr_required >= r->nr_devs) { + prt_printf(err, "bad nr_required in entry "); + goto bad; + } + + for (unsigned i = 0; i < r->nr_devs; i++) + if (r->devs[i] != BCH_SB_MEMBER_INVALID && + !bch2_dev_exists(c, r->devs[i])) { + prt_printf(err, "invalid device %u in entry ", r->devs[i]); + goto bad; + } + + return 0; +bad: + bch2_replicas_entry_to_text(err, r); + return -BCH_ERR_invalid_replicas_entry; } void bch2_cpu_replicas_to_text(struct printbuf *out, @@ -686,7 +704,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, struct bch_replicas_entry_v1 *e = cpu_replicas_entry(cpu_r, i); - int ret = bch2_replicas_entry_validate_locked(e, sb, err); + int ret = bch2_replicas_entry_sb_validate(e, sb, err); if (ret) return ret; @@ -803,6 +821,11 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, rcu_read_lock(); for (unsigned i = 0; i < e->nr_devs; i++) { + if (e->devs[i] == BCH_SB_MEMBER_INVALID) { + nr_failed++; + continue; + } + nr_online += test_bit(e->devs[i], devs.d); struct bch_dev *ca = bch2_dev_rcu_noerror(c, e->devs[i]); diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 5102059a0f1d..ae715ff658e8 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -78,7 +78,10 @@ BCH_FSCK_ERR_accounting_mismatch) \ x(rebalance_work_acct_fix, \ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ - BCH_FSCK_ERR_accounting_mismatch) + BCH_FSCK_ERR_accounting_mismatch) \ + x(inode_has_child_snapshots, \ + BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \ + BCH_FSCK_ERR_inode_has_child_snapshots_wrong) #define DOWNGRADE_TABLE() \ x(bucket_stripe_sectors, \ diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 4135b1ea2fec..aab328ac6dfa 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -180,6 +180,7 @@ enum bch_fsck_flags { x(reflink_p_to_missing_reflink_v, 166, 0) \ x(stripe_pos_bad, 167, 0) \ x(stripe_val_size_bad, 168, 0) \ + x(stripe_csum_granularity_bad, 290, 0) \ x(stripe_sector_count_wrong, 169, 0) \ x(snapshot_tree_pos_bad, 170, 0) \ x(snapshot_tree_to_missing_snapshot, 171, 0) \ @@ -225,11 +226,13 @@ enum bch_fsck_flags { x(inode_multiple_links_but_nlink_0, 207, FSCK_AUTOFIX) \ x(inode_wrong_backpointer, 208, FSCK_AUTOFIX) \ x(inode_wrong_nlink, 209, FSCK_AUTOFIX) \ + x(inode_has_child_snapshots_wrong, 287, 0) \ x(inode_unreachable, 210, FSCK_AUTOFIX) \ x(deleted_inode_but_clean, 211, FSCK_AUTOFIX) \ x(deleted_inode_missing, 212, FSCK_AUTOFIX) \ x(deleted_inode_is_dir, 213, FSCK_AUTOFIX) \ x(deleted_inode_not_unlinked, 214, FSCK_AUTOFIX) \ + x(deleted_inode_has_child_snapshots, 288, FSCK_AUTOFIX) \ x(extent_overlapping, 215, 0) \ x(key_in_missing_inode, 216, 0) \ x(key_in_wrong_inode_type, 217, 0) \ @@ -289,6 +292,7 @@ enum bch_fsck_flags { x(alloc_key_stripe_sectors_wrong, 271, FSCK_AUTOFIX) \ x(accounting_mismatch, 272, FSCK_AUTOFIX) \ x(accounting_replicas_not_marked, 273, 0) \ + x(accounting_to_invalid_device, 289, 0) \ x(invalid_btree_id, 274, 0) \ x(alloc_key_io_time_bad, 275, 0) \ x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX) \ @@ -298,7 +302,7 @@ enum bch_fsck_flags { x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \ x(accounting_key_version_0, 282, FSCK_AUTOFIX) \ x(logged_op_but_clean, 283, FSCK_AUTOFIX) \ - x(MAX, 287, 0) + x(MAX, 291, 0) enum bch_sb_error_id { #define x(t, n, ...) BCH_FSCK_ERR_##t = n, diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index 02bcde3c1b02..fb08dd680dac 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -163,6 +163,11 @@ static int validate_member(struct printbuf *err, return -BCH_ERR_invalid_sb_members; } + if (m.btree_bitmap_shift >= 64) { + prt_printf(err, "device %u: invalid btree_bitmap_shift %u", i, m.btree_bitmap_shift); + return -BCH_ERR_invalid_sb_members; + } + return 0; } @@ -247,7 +252,10 @@ static void member_to_text(struct printbuf *out, prt_newline(out); prt_printf(out, "Btree allocated bitmap blocksize:\t"); - prt_units_u64(out, 1ULL << m.btree_bitmap_shift); + if (m.btree_bitmap_shift < 64) + prt_units_u64(out, 1ULL << m.btree_bitmap_shift); + else + prt_printf(out, "(invalid shift %u)", m.btree_bitmap_shift); prt_newline(out); prt_printf(out, "Btree allocated bitmap:\t"); diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 1809442b00ee..ae57638506c3 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -905,12 +905,30 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id) if (bch2_snapshot_equiv(c, id)) return 0; - /* 0 is an invalid tree ID */ + /* Do we need to reconstruct the snapshot_tree entry as well? */ + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; u32 tree_id = 0; - int ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id); + + for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshot_trees, POS_MIN, + 0, k, ret) { + if (le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) { + tree_id = k.k->p.offset; + break; + } + } + bch2_trans_iter_exit(trans, &iter); + if (ret) return ret; + if (!tree_id) { + ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id); + if (ret) + return ret; + } + struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, sizeof(*snapshot)); ret = PTR_ERR_OR_ZERO(snapshot); if (ret) @@ -921,6 +939,16 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id) snapshot->v.tree = cpu_to_le32(tree_id); snapshot->v.btime.lo = cpu_to_le64(bch2_current_time(c)); + for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, + 0, k, ret) { + if (le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) { + snapshot->v.subvol = cpu_to_le32(k.k->p.offset); + SET_BCH_SNAPSHOT_SUBVOL(&snapshot->v, true); + break; + } + } + bch2_trans_iter_exit(trans, &iter); + return bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?: bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0) ?: @@ -1732,103 +1760,6 @@ int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans, return ret; } -static u32 bch2_snapshot_smallest_child(struct bch_fs *c, u32 id) -{ - const struct snapshot_t *s = snapshot_t(c, id); - - return s->children[1] ?: s->children[0]; -} - -static u32 bch2_snapshot_smallest_descendent(struct bch_fs *c, u32 id) -{ - u32 child; - - while ((child = bch2_snapshot_smallest_child(c, id))) - id = child; - return id; -} - -static int bch2_propagate_key_to_snapshot_leaf(struct btree_trans *trans, - enum btree_id btree, - struct bkey_s_c interior_k, - u32 leaf_id, struct bpos *new_min_pos) -{ - struct btree_iter iter; - struct bpos pos = interior_k.k->p; - struct bkey_s_c k; - struct bkey_i *new; - int ret; - - pos.snapshot = leaf_id; - - bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent); - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - goto out; - - /* key already overwritten in this snapshot? */ - if (k.k->p.snapshot != interior_k.k->p.snapshot) - goto out; - - if (bpos_eq(*new_min_pos, POS_MIN)) { - *new_min_pos = k.k->p; - new_min_pos->snapshot = leaf_id; - } - - new = bch2_bkey_make_mut_noupdate(trans, interior_k); - ret = PTR_ERR_OR_ZERO(new); - if (ret) - goto out; - - new->k.p.snapshot = leaf_id; - ret = bch2_trans_update(trans, &iter, new, 0); -out: - bch2_set_btree_iter_dontneed(&iter); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *trans, - enum btree_id btree, - struct bkey_s_c k, - struct bpos *new_min_pos) -{ - struct bch_fs *c = trans->c; - struct bkey_buf sk; - u32 restart_count = trans->restart_count; - int ret = 0; - - bch2_bkey_buf_init(&sk); - bch2_bkey_buf_reassemble(&sk, c, k); - k = bkey_i_to_s_c(sk.k); - - *new_min_pos = POS_MIN; - - for (u32 id = bch2_snapshot_smallest_descendent(c, k.k->p.snapshot); - id < k.k->p.snapshot; - id++) { - if (!bch2_snapshot_is_ancestor(c, id, k.k->p.snapshot) || - !bch2_snapshot_is_leaf(c, id)) - continue; -again: - ret = btree_trans_too_many_iters(trans) ?: - bch2_propagate_key_to_snapshot_leaf(trans, btree, k, id, new_min_pos) ?: - bch2_trans_commit(trans, NULL, NULL, 0); - if (ret && bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - bch2_trans_begin(trans); - goto again; - } - - if (ret) - break; - } - - bch2_bkey_buf_exit(&sk, c); - - return ret ?: trans_was_restarted(trans, restart_count); -} - static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k) { struct bch_fs *c = trans->c; diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index eb5ef64221d6..29c94716293e 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -259,9 +259,6 @@ static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans, return __bch2_key_has_snapshot_overwrites(trans, id, pos); } -int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *, enum btree_id, - struct bkey_s_c, struct bpos *); - int bch2_snapshots_read(struct bch_fs *); void bch2_fs_snapshots_exit(struct bch_fs *); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 873e4be7e1dc..77d811a539af 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -184,6 +184,7 @@ static DEFINE_MUTEX(bch_fs_list_lock); DECLARE_WAIT_QUEUE_HEAD(bch2_read_only_wait); +static void bch2_dev_unlink(struct bch_dev *); static void bch2_dev_free(struct bch_dev *); static int bch2_dev_alloc(struct bch_fs *, unsigned); static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *); @@ -620,9 +621,7 @@ void __bch2_fs_stop(struct bch_fs *c) up_write(&c->state_lock); for_each_member_device(c, ca) - if (ca->kobj.state_in_sysfs && - ca->disk_sb.bdev) - sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); + bch2_dev_unlink(ca); if (c->kobj.state_in_sysfs) kobject_del(&c->kobj); @@ -1187,9 +1186,7 @@ static void bch2_dev_free(struct bch_dev *ca) { cancel_work_sync(&ca->io_error_work); - if (ca->kobj.state_in_sysfs && - ca->disk_sb.bdev) - sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); + bch2_dev_unlink(ca); if (ca->kobj.state_in_sysfs) kobject_del(&ca->kobj); @@ -1226,10 +1223,7 @@ static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca) percpu_ref_kill(&ca->io_ref); wait_for_completion(&ca->io_ref_completion); - if (ca->kobj.state_in_sysfs) { - sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); - sysfs_remove_link(&ca->kobj, "block"); - } + bch2_dev_unlink(ca); bch2_free_super(&ca->disk_sb); bch2_dev_journal_exit(ca); @@ -1251,6 +1245,26 @@ static void bch2_dev_io_ref_complete(struct percpu_ref *ref) complete(&ca->io_ref_completion); } +static void bch2_dev_unlink(struct bch_dev *ca) +{ + struct kobject *b; + + /* + * This is racy w.r.t. the underlying block device being hot-removed, + * which removes it from sysfs. + * + * It'd be lovely if we had a way to handle this race, but the sysfs + * code doesn't appear to provide a good method and block/holder.c is + * susceptible as well: + */ + if (ca->kobj.state_in_sysfs && + ca->disk_sb.bdev && + (b = bdev_kobj(ca->disk_sb.bdev))->state_in_sysfs) { + sysfs_remove_link(b, "bcachefs"); + sysfs_remove_link(&ca->kobj, "block"); + } +} + static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca) { int ret; diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 32f719b9e661..115b90d29b1d 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -849,6 +849,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, struct btrfs_qgroup_extent_record *qrecord, int action, bool *qrecord_inserted_ret) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_root *delayed_refs; bool qrecord_inserted = false; @@ -859,11 +860,11 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, if (qrecord) { int ret; - ret = btrfs_qgroup_trace_extent_nolock(trans->fs_info, - delayed_refs, qrecord); + ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, qrecord); if (ret) { /* Clean up if insertion fails or item exists. */ - xa_release(&delayed_refs->dirty_extents, qrecord->bytenr); + xa_release(&delayed_refs->dirty_extents, + qrecord->bytenr >> fs_info->sectorsize_bits); /* Caller responsible for freeing qrecord on error. */ if (ret < 0) return ERR_PTR(ret); @@ -873,7 +874,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, } } - trace_add_delayed_ref_head(trans->fs_info, head_ref, action); + trace_add_delayed_ref_head(fs_info, head_ref, action); existing = htree_insert(&delayed_refs->href_root, &head_ref->href_node); @@ -895,8 +896,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, if (head_ref->is_data && head_ref->ref_mod < 0) { delayed_refs->pending_csums += head_ref->num_bytes; trans->delayed_ref_csum_deletions += - btrfs_csum_bytes_to_leaves(trans->fs_info, - head_ref->num_bytes); + btrfs_csum_bytes_to_leaves(fs_info, head_ref->num_bytes); } delayed_refs->num_heads++; delayed_refs->num_heads_ready++; @@ -1030,7 +1030,8 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans, goto free_head_ref; } if (xa_reserve(&trans->transaction->delayed_refs.dirty_extents, - generic_ref->bytenr, GFP_NOFS)) { + generic_ref->bytenr >> fs_info->sectorsize_bits, + GFP_NOFS)) { ret = -ENOMEM; goto free_record; } diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 085f30968aba..352921e76c74 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -202,7 +202,15 @@ struct btrfs_delayed_ref_root { /* head ref rbtree */ struct rb_root_cached href_root; - /* Track dirty extent records. */ + /* + * Track dirty extent records. + * The keys correspond to the logical address of the extent ("bytenr") + * right shifted by fs_info->sectorsize_bits. This is both to get a more + * dense index space (optimizes xarray structure) and because indexes in + * xarrays are of "unsigned long" type, meaning they are 32 bits wide on + * 32 bits platforms, limiting the extent range to 4G which is too low + * and makes it unusable (truncated index values) on 32 bits platforms. + */ struct xarray dirty_extents; /* this spin lock protects the rbtree and the entries inside */ diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index c297909f1506..1332ec59c539 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2005,16 +2005,26 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, struct btrfs_qgroup_extent_record *record) { struct btrfs_qgroup_extent_record *existing, *ret; - unsigned long bytenr = record->bytenr; + const unsigned long index = (record->bytenr >> fs_info->sectorsize_bits); if (!btrfs_qgroup_full_accounting(fs_info)) return 1; +#if BITS_PER_LONG == 32 + if (record->bytenr >= MAX_LFS_FILESIZE) { + btrfs_err_rl(fs_info, +"qgroup record for extent at %llu is beyond 32bit page cache and xarray index limit", + record->bytenr); + btrfs_err_32bit_limit(fs_info); + return -EOVERFLOW; + } +#endif + lockdep_assert_held(&delayed_refs->lock); trace_btrfs_qgroup_trace_extent(fs_info, record); xa_lock(&delayed_refs->dirty_extents); - existing = xa_load(&delayed_refs->dirty_extents, bytenr); + existing = xa_load(&delayed_refs->dirty_extents, index); if (existing) { if (record->data_rsv && !existing->data_rsv) { existing->data_rsv = record->data_rsv; @@ -2024,7 +2034,7 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, return 1; } - ret = __xa_store(&delayed_refs->dirty_extents, record->bytenr, record, GFP_ATOMIC); + ret = __xa_store(&delayed_refs->dirty_extents, index, record, GFP_ATOMIC); xa_unlock(&delayed_refs->dirty_extents); if (xa_is_err(ret)) { qgroup_mark_inconsistent(fs_info); @@ -2129,6 +2139,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_qgroup_extent_record *record; struct btrfs_delayed_ref_root *delayed_refs; + const unsigned long index = (bytenr >> fs_info->sectorsize_bits); int ret; if (!btrfs_qgroup_full_accounting(fs_info) || bytenr == 0 || num_bytes == 0) @@ -2137,7 +2148,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, if (!record) return -ENOMEM; - if (xa_reserve(&trans->transaction->delayed_refs.dirty_extents, bytenr, GFP_NOFS)) { + if (xa_reserve(&trans->transaction->delayed_refs.dirty_extents, index, GFP_NOFS)) { kfree(record); return -ENOMEM; } @@ -2152,7 +2163,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, spin_unlock(&delayed_refs->lock); if (ret) { /* Clean up if insertion fails or item exists. */ - xa_release(&delayed_refs->dirty_extents, record->bytenr); + xa_release(&delayed_refs->dirty_extents, index); kfree(record); return 0; } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 27306d98ec43..b068469871f8 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -7190,13 +7190,11 @@ static int changed_extent(struct send_ctx *sctx, static int changed_verity(struct send_ctx *sctx, enum btrfs_compare_tree_result result) { - int ret = 0; - if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) { if (result == BTRFS_COMPARE_TREE_NEW) sctx->cur_inode_needs_verity = true; } - return ret; + return 0; } static int dir_changed(struct send_ctx *sctx, u64 dir) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index e2ed2a791f8f..9637c7cdc0cf 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1374,7 +1374,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, struct inode *inode = NULL; unsigned long ref_ptr; unsigned long ref_end; - struct fscrypt_str name; + struct fscrypt_str name = { 0 }; int ret; int log_ref_ver = 0; u64 parent_objectid; @@ -1845,7 +1845,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, struct btrfs_dir_item *di, struct btrfs_key *key) { - struct fscrypt_str name; + struct fscrypt_str name = { 0 }; struct btrfs_dir_item *dir_dst_di; struct btrfs_dir_item *index_dst_di; bool dir_dst_matches = false; @@ -2125,7 +2125,7 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, struct extent_buffer *eb; int slot; struct btrfs_dir_item *di; - struct fscrypt_str name; + struct fscrypt_str name = { 0 }; struct inode *inode = NULL; struct btrfs_key location; diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 666873f745da..320d586c3896 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -191,10 +191,14 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, if (IS_ERR(file)) return PTR_ERR(file); - dif->file = file; - if (!erofs_is_fileio_mode(sbi)) + if (!erofs_is_fileio_mode(sbi)) { dif->dax_dev = fs_dax_get_by_bdev(file_bdev(file), &dif->dax_part_off, NULL, NULL); + } else if (!S_ISREG(file_inode(file)->i_mode)) { + fput(file); + return -EINVAL; + } + dif->file = file; } dif->blocks = le32_to_cpu(dis->blocks); @@ -714,7 +718,10 @@ static int erofs_fc_get_tree(struct fs_context *fc) if (IS_ERR(sbi->fdev)) return PTR_ERR(sbi->fdev); - return get_tree_nodev(fc, erofs_fc_fill_super); + if (S_ISREG(file_inode(sbi->fdev)->i_mode) && + sbi->fdev->f_mapping->a_ops->read_folio) + return get_tree_nodev(fc, erofs_fc_fill_super); + fput(sbi->fdev); } #endif return ret; diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 8936790618c6..a569ff9dfd04 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -710,24 +710,6 @@ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, return ret; } -static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) -{ - struct z_erofs_pcluster *pcl = f->pcl; - z_erofs_next_pcluster_t *owned_head = &f->owned_head; - - /* type 1, nil pcluster (this pcluster doesn't belong to any chain.) */ - if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL, - *owned_head) == Z_EROFS_PCLUSTER_NIL) { - *owned_head = &pcl->next; - /* so we can attach this pcluster to our submission chain. */ - f->mode = Z_EROFS_PCLUSTER_FOLLOWED; - return; - } - - /* type 2, it belongs to an ongoing chain */ - f->mode = Z_EROFS_PCLUSTER_INFLIGHT; -} - static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) { struct erofs_map_blocks *map = &fe->map; @@ -803,7 +785,6 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) int ret; DBG_BUGON(fe->pcl); - /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */ DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL); @@ -823,7 +804,15 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) if (ret == -EEXIST) { mutex_lock(&fe->pcl->lock); - z_erofs_try_to_claim_pcluster(fe); + /* check if this pcluster hasn't been linked into any chain. */ + if (cmpxchg(&fe->pcl->next, Z_EROFS_PCLUSTER_NIL, + fe->owned_head) == Z_EROFS_PCLUSTER_NIL) { + /* .. so it can be attached to our submission chain */ + fe->owned_head = &fe->pcl->next; + fe->mode = Z_EROFS_PCLUSTER_FOLLOWED; + } else { /* otherwise, it belongs to an inflight chain */ + fe->mode = Z_EROFS_PCLUSTER_INFLIGHT; + } } else if (ret) { return ret; } diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 1253a8456e59..a076cca1f547 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -10,8 +10,6 @@ struct z_erofs_maprecorder { struct inode *inode; struct erofs_map_blocks *map; - void *kaddr; - unsigned long lcn; /* compression extent information gathered */ u8 type, headtype; @@ -33,14 +31,11 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m, struct z_erofs_lcluster_index *di; unsigned int advise; - m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb, - pos, EROFS_KMAP); - if (IS_ERR(m->kaddr)) - return PTR_ERR(m->kaddr); - - m->nextpackoff = pos + sizeof(struct z_erofs_lcluster_index); + di = erofs_read_metabuf(&m->map->buf, inode->i_sb, pos, EROFS_KMAP); + if (IS_ERR(di)) + return PTR_ERR(di); m->lcn = lcn; - di = m->kaddr; + m->nextpackoff = pos + sizeof(struct z_erofs_lcluster_index); advise = le16_to_cpu(di->di_advise); m->type = advise & Z_EROFS_LI_LCLUSTER_TYPE_MASK; @@ -53,8 +48,7 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m, DBG_BUGON(1); return -EFSCORRUPTED; } - m->compressedblks = m->delta[0] & - ~Z_EROFS_LI_D0_CBLKCNT; + m->compressedblks = m->delta[0] & ~Z_EROFS_LI_D0_CBLKCNT; m->delta[0] = 1; } m->delta[1] = le16_to_cpu(di->di_u.delta[1]); @@ -110,9 +104,9 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, struct erofs_inode *const vi = EROFS_I(m->inode); const unsigned int lclusterbits = vi->z_logical_clusterbits; unsigned int vcnt, lo, lobits, encodebits, nblk, bytes; - int i; - u8 *in, type; bool big_pcluster; + u8 *in, type; + int i; if (1 << amortizedshift == 4 && lclusterbits <= 14) vcnt = 2; @@ -121,6 +115,10 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, else return -EOPNOTSUPP; + in = erofs_read_metabuf(&m->map->buf, m->inode->i_sb, pos, EROFS_KMAP); + if (IS_ERR(in)) + return PTR_ERR(in); + /* it doesn't equal to round_up(..) */ m->nextpackoff = round_down(pos, vcnt << amortizedshift) + (vcnt << amortizedshift); @@ -128,9 +126,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, lobits = max(lclusterbits, ilog2(Z_EROFS_LI_D0_CBLKCNT) + 1U); encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt; bytes = pos & ((vcnt << amortizedshift) - 1); - - in = m->kaddr - bytes; - + in -= bytes; i = bytes >> amortizedshift; lo = decode_compactedbits(lobits, in, encodebits * i, &type); @@ -255,10 +251,6 @@ static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m, amortizedshift = 2; out: pos += lcn * (1 << amortizedshift); - m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb, - pos, EROFS_KMAP); - if (IS_ERR(m->kaddr)) - return PTR_ERR(m->kaddr); return unpack_compacted_index(m, amortizedshift, pos, lookahead); } diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 9ae54c4c72fe..321d8ffbab6e 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4647,7 +4647,8 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) iov_iter_count(to), READ); /* In LFS mode, if there is inflight dio, wait for its completion */ - if (f2fs_lfs_mode(F2FS_I_SB(inode))) + if (f2fs_lfs_mode(F2FS_I_SB(inode)) && + get_pages(F2FS_I_SB(inode), F2FS_DIO_WRITE)) inode_dio_wait(inode); if (f2fs_should_use_dio(inode, iocb, to)) { diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 6423e1dedf14..15bf32c21ac0 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -1037,7 +1037,7 @@ error_inode: if (corrupt < 0) { fat_fs_error(new_dir->i_sb, "%s: Filesystem corrupted (i_pos %lld)", - __func__, sinfo.i_pos); + __func__, new_i_pos); } goto out; } diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 78ebd265f425..aa587b2142e2 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1145,10 +1145,36 @@ static void iomap_write_delalloc_scan(struct inode *inode, } /* + * When a short write occurs, the filesystem might need to use ->iomap_end + * to remove space reservations created in ->iomap_begin. + * + * For filesystems that use delayed allocation, there can be dirty pages over + * the delalloc extent outside the range of a short write but still within the + * delalloc extent allocated for this iomap if the write raced with page + * faults. + * * Punch out all the delalloc blocks in the range given except for those that * have dirty data still pending in the page cache - those are going to be * written and so must still retain the delalloc backing for writeback. * + * The punch() callback *must* only punch delalloc extents in the range passed + * to it. It must skip over all other types of extents in the range and leave + * them completely unchanged. It must do this punch atomically with respect to + * other extent modifications. + * + * The punch() callback may be called with a folio locked to prevent writeback + * extent allocation racing at the edge of the range we are currently punching. + * The locked folio may or may not cover the range being punched, so it is not + * safe for the punch() callback to lock folios itself. + * + * Lock order is: + * + * inode->i_rwsem (shared or exclusive) + * inode->i_mapping->invalidate_lock (exclusive) + * folio_lock() + * ->punch + * internal filesystem allocation lock + * * As we are scanning the page cache for data, we don't need to reimplement the * wheel - mapping_seek_hole_data() does exactly what we need to identify the * start and end of data ranges correctly even for sub-folio block sizes. This @@ -1177,7 +1203,7 @@ static void iomap_write_delalloc_scan(struct inode *inode, * require sprinkling this code with magic "+ 1" and "- 1" arithmetic and expose * the code to subtle off-by-one bugs.... */ -static void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, +void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, loff_t end_byte, unsigned flags, struct iomap *iomap, iomap_punch_t punch) { @@ -1185,12 +1211,13 @@ static void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, loff_t scan_end_byte = min(i_size_read(inode), end_byte); /* - * Lock the mapping to avoid races with page faults re-instantiating - * folios and dirtying them via ->page_mkwrite whilst we walk the - * cache and perform delalloc extent removal. Failing to do this can - * leave dirty pages with no space reservation in the cache. + * The caller must hold invalidate_lock to avoid races with page faults + * re-instantiating folios and dirtying them via ->page_mkwrite whilst + * we walk the cache and perform delalloc extent removal. Failing to do + * this can leave dirty pages with no space reservation in the cache. */ - filemap_invalidate_lock(inode->i_mapping); + lockdep_assert_held_write(&inode->i_mapping->invalidate_lock); + while (start_byte < scan_end_byte) { loff_t data_end; @@ -1207,7 +1234,7 @@ static void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, if (start_byte == -ENXIO || start_byte == scan_end_byte) break; if (WARN_ON_ONCE(start_byte < 0)) - goto out_unlock; + return; WARN_ON_ONCE(start_byte < punch_start_byte); WARN_ON_ONCE(start_byte > scan_end_byte); @@ -1218,7 +1245,7 @@ static void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, data_end = mapping_seek_hole_data(inode->i_mapping, start_byte, scan_end_byte, SEEK_HOLE); if (WARN_ON_ONCE(data_end < 0)) - goto out_unlock; + return; /* * If we race with post-direct I/O invalidation of the page cache, @@ -1240,74 +1267,8 @@ static void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, if (punch_start_byte < end_byte) punch(inode, punch_start_byte, end_byte - punch_start_byte, iomap); -out_unlock: - filemap_invalidate_unlock(inode->i_mapping); -} - -/* - * When a short write occurs, the filesystem may need to remove reserved space - * that was allocated in ->iomap_begin from it's ->iomap_end method. For - * filesystems that use delayed allocation, we need to punch out delalloc - * extents from the range that are not dirty in the page cache. As the write can - * race with page faults, there can be dirty pages over the delalloc extent - * outside the range of a short write but still within the delalloc extent - * allocated for this iomap. - * - * This function uses [start_byte, end_byte) intervals (i.e. open ended) to - * simplify range iterations. - * - * The punch() callback *must* only punch delalloc extents in the range passed - * to it. It must skip over all other types of extents in the range and leave - * them completely unchanged. It must do this punch atomically with respect to - * other extent modifications. - * - * The punch() callback may be called with a folio locked to prevent writeback - * extent allocation racing at the edge of the range we are currently punching. - * The locked folio may or may not cover the range being punched, so it is not - * safe for the punch() callback to lock folios itself. - * - * Lock order is: - * - * inode->i_rwsem (shared or exclusive) - * inode->i_mapping->invalidate_lock (exclusive) - * folio_lock() - * ->punch - * internal filesystem allocation lock - */ -void iomap_file_buffered_write_punch_delalloc(struct inode *inode, - loff_t pos, loff_t length, ssize_t written, unsigned flags, - struct iomap *iomap, iomap_punch_t punch) -{ - loff_t start_byte; - loff_t end_byte; - unsigned int blocksize = i_blocksize(inode); - - if (iomap->type != IOMAP_DELALLOC) - return; - - /* If we didn't reserve the blocks, we're not allowed to punch them. */ - if (!(iomap->flags & IOMAP_F_NEW)) - return; - - /* - * start_byte refers to the first unused block after a short write. If - * nothing was written, round offset down to point at the first block in - * the range. - */ - if (unlikely(!written)) - start_byte = round_down(pos, blocksize); - else - start_byte = round_up(pos + written, blocksize); - end_byte = round_up(pos + length, blocksize); - - /* Nothing to do if we've written the entire delalloc extent */ - if (start_byte >= end_byte) - return; - - iomap_write_delalloc_release(inode, start_byte, end_byte, flags, iomap, - punch); } -EXPORT_SYMBOL_GPL(iomap_file_buffered_write_punch_delalloc); +EXPORT_SYMBOL_GPL(iomap_write_delalloc_release); static loff_t iomap_unshare_iter(struct iomap_iter *iter) { diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index fe5b1a30c509..a8602729586a 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -289,7 +289,7 @@ static int nilfs_readdir(struct file *file, struct dir_context *ctx) * The folio is mapped and unlocked. When the caller is finished with * the entry, it should call folio_release_kmap(). * - * On failure, returns NULL and the caller should ignore foliop. + * On failure, returns an error pointer and the caller should ignore foliop. */ struct nilfs_dir_entry *nilfs_find_entry(struct inode *dir, const struct qstr *qstr, struct folio **foliop) @@ -312,22 +312,24 @@ struct nilfs_dir_entry *nilfs_find_entry(struct inode *dir, do { char *kaddr = nilfs_get_folio(dir, n, foliop); - if (!IS_ERR(kaddr)) { - de = (struct nilfs_dir_entry *)kaddr; - kaddr += nilfs_last_byte(dir, n) - reclen; - while ((char *) de <= kaddr) { - if (de->rec_len == 0) { - nilfs_error(dir->i_sb, - "zero-length directory entry"); - folio_release_kmap(*foliop, kaddr); - goto out; - } - if (nilfs_match(namelen, name, de)) - goto found; - de = nilfs_next_entry(de); + if (IS_ERR(kaddr)) + return ERR_CAST(kaddr); + + de = (struct nilfs_dir_entry *)kaddr; + kaddr += nilfs_last_byte(dir, n) - reclen; + while ((char *)de <= kaddr) { + if (de->rec_len == 0) { + nilfs_error(dir->i_sb, + "zero-length directory entry"); + folio_release_kmap(*foliop, kaddr); + goto out; } - folio_release_kmap(*foliop, kaddr); + if (nilfs_match(namelen, name, de)) + goto found; + de = nilfs_next_entry(de); } + folio_release_kmap(*foliop, kaddr); + if (++n >= npages) n = 0; /* next folio is past the blocks we've got */ @@ -340,7 +342,7 @@ struct nilfs_dir_entry *nilfs_find_entry(struct inode *dir, } } while (n != start); out: - return NULL; + return ERR_PTR(-ENOENT); found: ei->i_dir_start_lookup = n; @@ -384,18 +386,18 @@ fail: return NULL; } -ino_t nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr) +int nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr, ino_t *ino) { - ino_t res = 0; struct nilfs_dir_entry *de; struct folio *folio; de = nilfs_find_entry(dir, qstr, &folio); - if (de) { - res = le64_to_cpu(de->inode); - folio_release_kmap(folio, de); - } - return res; + if (IS_ERR(de)) + return PTR_ERR(de); + + *ino = le64_to_cpu(de->inode); + folio_release_kmap(folio, de); + return 0; } void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de, diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index c950139db6ef..4905063790c5 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -55,12 +55,20 @@ nilfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct inode *inode; ino_t ino; + int res; if (dentry->d_name.len > NILFS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); - ino = nilfs_inode_by_name(dir, &dentry->d_name); - inode = ino ? nilfs_iget(dir->i_sb, NILFS_I(dir)->i_root, ino) : NULL; + res = nilfs_inode_by_name(dir, &dentry->d_name, &ino); + if (res) { + if (res != -ENOENT) + return ERR_PTR(res); + inode = NULL; + } else { + inode = nilfs_iget(dir->i_sb, NILFS_I(dir)->i_root, ino); + } + return d_splice_alias(inode, dentry); } @@ -263,10 +271,11 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry) struct folio *folio; int err; - err = -ENOENT; de = nilfs_find_entry(dir, &dentry->d_name, &folio); - if (!de) + if (IS_ERR(de)) { + err = PTR_ERR(de); goto out; + } inode = d_inode(dentry); err = -EIO; @@ -362,10 +371,11 @@ static int nilfs_rename(struct mnt_idmap *idmap, if (unlikely(err)) return err; - err = -ENOENT; old_de = nilfs_find_entry(old_dir, &old_dentry->d_name, &old_folio); - if (!old_de) + if (IS_ERR(old_de)) { + err = PTR_ERR(old_de); goto out; + } if (S_ISDIR(old_inode->i_mode)) { err = -EIO; @@ -382,10 +392,12 @@ static int nilfs_rename(struct mnt_idmap *idmap, if (dir_de && !nilfs_empty_dir(new_inode)) goto out_dir; - err = -ENOENT; - new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, &new_folio); - if (!new_de) + new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, + &new_folio); + if (IS_ERR(new_de)) { + err = PTR_ERR(new_de); goto out_dir; + } nilfs_set_link(new_dir, new_de, new_folio, old_inode); folio_release_kmap(new_folio, new_de); nilfs_mark_inode_dirty(new_dir); @@ -440,12 +452,13 @@ out: */ static struct dentry *nilfs_get_parent(struct dentry *child) { - unsigned long ino; + ino_t ino; + int res; struct nilfs_root *root; - ino = nilfs_inode_by_name(d_inode(child), &dotdot_name); - if (!ino) - return ERR_PTR(-ENOENT); + res = nilfs_inode_by_name(d_inode(child), &dotdot_name, &ino); + if (res) + return ERR_PTR(res); root = NILFS_I(d_inode(child))->i_root; diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index fb1c4c5bae7c..45d03826eaf1 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -254,7 +254,7 @@ static inline __u32 nilfs_mask_flags(umode_t mode, __u32 flags) /* dir.c */ int nilfs_add_link(struct dentry *, struct inode *); -ino_t nilfs_inode_by_name(struct inode *, const struct qstr *); +int nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr, ino_t *ino); int nilfs_make_empty(struct inode *, struct inode *); struct nilfs_dir_entry *nilfs_find_entry(struct inode *, const struct qstr *, struct folio **); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 72f14fd59c2d..e52bd96137a6 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -909,8 +909,15 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) { /* * Don't forget to update Documentation/ on changes. + * + * The length of the second argument of mnemonics[] + * needs to be 3 instead of previously set 2 + * (i.e. from [BITS_PER_LONG][2] to [BITS_PER_LONG][3]) + * to avoid spurious + * -Werror=unterminated-string-initialization warning + * with GCC 15 */ - static const char mnemonics[BITS_PER_LONG][2] = { + static const char mnemonics[BITS_PER_LONG][3] = { /* * In case if we meet a flag we don't know about. */ @@ -987,11 +994,8 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) for (i = 0; i < BITS_PER_LONG; i++) { if (!mnemonics[i][0]) continue; - if (vma->vm_flags & (1UL << i)) { - seq_putc(m, mnemonics[i][0]); - seq_putc(m, mnemonics[i][1]); - seq_putc(m, ' '); - } + if (vma->vm_flags & (1UL << i)) + seq_printf(m, "%s ", mnemonics[i]); } seq_putc(m, '\n'); } diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index 68c716e6261b..1d3470bca45e 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -252,10 +252,6 @@ extern int cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, unsigned int to_read); extern ssize_t cifs_discard_from_socket(struct TCP_Server_Info *server, size_t to_read); -extern int cifs_read_page_from_socket(struct TCP_Server_Info *server, - struct page *page, - unsigned int page_offset, - unsigned int to_read); int cifs_read_iter_from_socket(struct TCP_Server_Info *server, struct iov_iter *iter, unsigned int to_read); @@ -623,8 +619,6 @@ enum securityEnum cifs_select_sectype(struct TCP_Server_Info *, int cifs_alloc_hash(const char *name, struct shash_desc **sdesc); void cifs_free_hash(struct shash_desc **sdesc); -struct cifs_chan * -cifs_ses_find_chan(struct cifs_ses *ses, struct TCP_Server_Info *server); int cifs_try_adding_channels(struct cifs_ses *ses); bool is_server_using_iface(struct TCP_Server_Info *server, struct cifs_server_iface *iface); @@ -640,9 +634,6 @@ cifs_chan_set_in_reconnect(struct cifs_ses *ses, void cifs_chan_clear_in_reconnect(struct cifs_ses *ses, struct TCP_Server_Info *server); -bool -cifs_chan_in_reconnect(struct cifs_ses *ses, - struct TCP_Server_Info *server); void cifs_chan_set_need_reconnect(struct cifs_ses *ses, struct TCP_Server_Info *server); diff --git a/fs/smb/client/compress.c b/fs/smb/client/compress.c index 63b5a55b7a57..766b4de13da7 100644 --- a/fs/smb/client/compress.c +++ b/fs/smb/client/compress.c @@ -166,7 +166,6 @@ static int collect_sample(const struct iov_iter *iter, ssize_t max, u8 *sample) loff_t start = iter->xarray_start + iter->iov_offset; pgoff_t last, index = start / PAGE_SIZE; size_t len, off, foff; - ssize_t ret = 0; void *p; int s = 0; @@ -193,9 +192,6 @@ static int collect_sample(const struct iov_iter *iter, ssize_t max, u8 *sample) memcpy(&sample[s], p, len2); kunmap_local(p); - if (ret < 0) - return ret; - s += len2; if (len2 < SZ_2K || s >= max - SZ_2K) diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index adf8758847f6..15d94ac4095e 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -795,18 +795,6 @@ cifs_discard_from_socket(struct TCP_Server_Info *server, size_t to_read) } int -cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page, - unsigned int page_offset, unsigned int to_read) -{ - struct msghdr smb_msg = {}; - struct bio_vec bv; - - bvec_set_page(&bv, page, to_read, page_offset); - iov_iter_bvec(&smb_msg.msg_iter, ITER_DEST, &bv, 1, to_read); - return cifs_readv_from_socket(server, &smb_msg); -} - -int cifs_read_iter_from_socket(struct TCP_Server_Info *server, struct iov_iter *iter, unsigned int to_read) { diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c index 3216f786908f..c88e9657f47a 100644 --- a/fs/smb/client/sess.c +++ b/fs/smb/client/sess.c @@ -115,18 +115,6 @@ cifs_chan_clear_in_reconnect(struct cifs_ses *ses, ses->chans[chan_index].in_reconnect = false; } -bool -cifs_chan_in_reconnect(struct cifs_ses *ses, - struct TCP_Server_Info *server) -{ - unsigned int chan_index = cifs_ses_get_chan_index(ses, server); - - if (chan_index == CIFS_INVAL_CHAN_INDEX) - return true; /* err on the safer side */ - - return CIFS_CHAN_IN_RECONNECT(ses, chan_index); -} - void cifs_chan_set_need_reconnect(struct cifs_ses *ses, struct TCP_Server_Info *server) @@ -487,26 +475,6 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server) spin_unlock(&ses->chan_lock); } -/* - * If server is a channel of ses, return the corresponding enclosing - * cifs_chan otherwise return NULL. - */ -struct cifs_chan * -cifs_ses_find_chan(struct cifs_ses *ses, struct TCP_Server_Info *server) -{ - int i; - - spin_lock(&ses->chan_lock); - for (i = 0; i < ses->chan_count; i++) { - if (ses->chans[i].server == server) { - spin_unlock(&ses->chan_lock); - return &ses->chans[i]; - } - } - spin_unlock(&ses->chan_lock); - return NULL; -} - static int cifs_ses_add_channel(struct cifs_ses *ses, struct cifs_server_iface *iface) diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 6b385fce3f2a..24a2aa04a108 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -1158,7 +1158,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid fid; unsigned int size[1]; void *data[1]; - struct smb2_file_full_ea_info *ea = NULL; + struct smb2_file_full_ea_info *ea; struct smb2_query_info_rsp *rsp; int rc, used_len = 0; int retries = 0, cur_sleep = 1; @@ -1179,6 +1179,7 @@ replay_again: if (!utf16_path) return -ENOMEM; + ea = NULL; resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER; vars = kzalloc(sizeof(*vars), GFP_KERNEL); if (!vars) { diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index b2f16a7b696d..6584b5cddc28 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -3313,6 +3313,15 @@ SMB2_ioctl_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, return rc; if (indatalen) { + unsigned int len; + + if (WARN_ON_ONCE(smb3_encryption_required(tcon) && + (check_add_overflow(total_len - 1, + ALIGN(indatalen, 8), &len) || + len > MAX_CIFS_SMALL_BUFFER_SIZE))) { + cifs_small_buf_release(req); + return -EIO; + } /* * indatalen is usually small at a couple of bytes max, so * just allocate through generic pool diff --git a/fs/smb/server/auth.c b/fs/smb/server/auth.c index 09b20039636e..611716bc8f27 100644 --- a/fs/smb/server/auth.c +++ b/fs/smb/server/auth.c @@ -512,6 +512,7 @@ int ksmbd_krb5_authenticate(struct ksmbd_session *sess, char *in_blob, int in_len, char *out_blob, int *out_len) { struct ksmbd_spnego_authen_response *resp; + struct ksmbd_login_response_ext *resp_ext = NULL; struct ksmbd_user *user = NULL; int retval; @@ -540,7 +541,10 @@ int ksmbd_krb5_authenticate(struct ksmbd_session *sess, char *in_blob, goto out; } - user = ksmbd_alloc_user(&resp->login_response); + if (resp->login_response.status & KSMBD_USER_FLAG_EXTENSION) + resp_ext = ksmbd_ipc_login_request_ext(resp->login_response.account); + + user = ksmbd_alloc_user(&resp->login_response, resp_ext); if (!user) { ksmbd_debug(AUTH, "login failure\n"); retval = -ENOMEM; diff --git a/fs/smb/server/ksmbd_netlink.h b/fs/smb/server/ksmbd_netlink.h index 38e6fd2da3b8..3d01d9d15293 100644 --- a/fs/smb/server/ksmbd_netlink.h +++ b/fs/smb/server/ksmbd_netlink.h @@ -51,6 +51,9 @@ * - KSMBD_EVENT_SPNEGO_AUTHEN_REQUEST/RESPONSE(ksmbd_spnego_authen_request/response) * This event is to make kerberos authentication to be processed in * userspace. + * + * - KSMBD_EVENT_LOGIN_REQUEST_EXT/RESPONSE_EXT(ksmbd_login_request_ext/response_ext) + * This event is to get user account extension info to user IPC daemon. */ #define KSMBD_GENL_NAME "SMBD_GENL" @@ -146,6 +149,16 @@ struct ksmbd_login_response { }; /* + * IPC user login response extension. + */ +struct ksmbd_login_response_ext { + __u32 handle; + __s32 ngroups; /* supplementary group count */ + __s8 reserved[128]; /* Reserved room */ + __s8 ____payload[]; +}; + +/* * IPC request to fetch net share config. */ struct ksmbd_share_config_request { @@ -306,6 +319,9 @@ enum ksmbd_event { KSMBD_EVENT_SPNEGO_AUTHEN_REQUEST, KSMBD_EVENT_SPNEGO_AUTHEN_RESPONSE = 15, + KSMBD_EVENT_LOGIN_REQUEST_EXT, + KSMBD_EVENT_LOGIN_RESPONSE_EXT, + __KSMBD_EVENT_MAX, KSMBD_EVENT_MAX = __KSMBD_EVENT_MAX - 1 }; @@ -336,6 +352,7 @@ enum KSMBD_TREE_CONN_STATUS { #define KSMBD_USER_FLAG_BAD_USER BIT(3) #define KSMBD_USER_FLAG_GUEST_ACCOUNT BIT(4) #define KSMBD_USER_FLAG_DELAY_SESSION BIT(5) +#define KSMBD_USER_FLAG_EXTENSION BIT(6) /* * Share config flags. diff --git a/fs/smb/server/mgmt/user_config.c b/fs/smb/server/mgmt/user_config.c index 279d00feff21..421a4a95e216 100644 --- a/fs/smb/server/mgmt/user_config.c +++ b/fs/smb/server/mgmt/user_config.c @@ -12,6 +12,7 @@ struct ksmbd_user *ksmbd_login_user(const char *account) { struct ksmbd_login_response *resp; + struct ksmbd_login_response_ext *resp_ext = NULL; struct ksmbd_user *user = NULL; resp = ksmbd_ipc_login_request(account); @@ -21,15 +22,19 @@ struct ksmbd_user *ksmbd_login_user(const char *account) if (!(resp->status & KSMBD_USER_FLAG_OK)) goto out; - user = ksmbd_alloc_user(resp); + if (resp->status & KSMBD_USER_FLAG_EXTENSION) + resp_ext = ksmbd_ipc_login_request_ext(account); + + user = ksmbd_alloc_user(resp, resp_ext); out: kvfree(resp); return user; } -struct ksmbd_user *ksmbd_alloc_user(struct ksmbd_login_response *resp) +struct ksmbd_user *ksmbd_alloc_user(struct ksmbd_login_response *resp, + struct ksmbd_login_response_ext *resp_ext) { - struct ksmbd_user *user = NULL; + struct ksmbd_user *user; user = kmalloc(sizeof(struct ksmbd_user), GFP_KERNEL); if (!user) @@ -44,18 +49,42 @@ struct ksmbd_user *ksmbd_alloc_user(struct ksmbd_login_response *resp) if (user->passkey) memcpy(user->passkey, resp->hash, resp->hash_sz); - if (!user->name || !user->passkey) { - kfree(user->name); - kfree(user->passkey); - kfree(user); - user = NULL; + user->ngroups = 0; + user->sgid = NULL; + + if (!user->name || !user->passkey) + goto err_free; + + if (resp_ext) { + if (resp_ext->ngroups > NGROUPS_MAX) { + pr_err("ngroups(%u) from login response exceeds max groups(%d)\n", + resp_ext->ngroups, NGROUPS_MAX); + goto err_free; + } + + user->sgid = kmemdup(resp_ext->____payload, + resp_ext->ngroups * sizeof(gid_t), + GFP_KERNEL); + if (!user->sgid) + goto err_free; + + user->ngroups = resp_ext->ngroups; + ksmbd_debug(SMB, "supplementary groups : %d\n", user->ngroups); } + return user; + +err_free: + kfree(user->name); + kfree(user->passkey); + kfree(user); + return NULL; } void ksmbd_free_user(struct ksmbd_user *user) { ksmbd_ipc_logout_request(user->name, user->flags); + kfree(user->sgid); kfree(user->name); kfree(user->passkey); kfree(user); diff --git a/fs/smb/server/mgmt/user_config.h b/fs/smb/server/mgmt/user_config.h index e068a19fd904..8c227b8d4954 100644 --- a/fs/smb/server/mgmt/user_config.h +++ b/fs/smb/server/mgmt/user_config.h @@ -18,6 +18,8 @@ struct ksmbd_user { size_t passkey_sz; char *passkey; + int ngroups; + gid_t *sgid; }; static inline bool user_guest(struct ksmbd_user *user) @@ -60,7 +62,8 @@ static inline unsigned int user_gid(struct ksmbd_user *user) } struct ksmbd_user *ksmbd_login_user(const char *account); -struct ksmbd_user *ksmbd_alloc_user(struct ksmbd_login_response *resp); +struct ksmbd_user *ksmbd_alloc_user(struct ksmbd_login_response *resp, + struct ksmbd_login_response_ext *resp_ext); void ksmbd_free_user(struct ksmbd_user *user); int ksmbd_anonymous_user(struct ksmbd_user *user); bool ksmbd_compare_user(struct ksmbd_user *u1, struct ksmbd_user *u2); diff --git a/fs/smb/server/mgmt/user_session.c b/fs/smb/server/mgmt/user_session.c index 99416ce9f501..1e4624e9d434 100644 --- a/fs/smb/server/mgmt/user_session.c +++ b/fs/smb/server/mgmt/user_session.c @@ -177,9 +177,10 @@ static void ksmbd_expire_session(struct ksmbd_conn *conn) down_write(&conn->session_lock); xa_for_each(&conn->sessions, id, sess) { - if (sess->state != SMB2_SESSION_VALID || - time_after(jiffies, - sess->last_active + SMB2_SESSION_TIMEOUT)) { + if (atomic_read(&sess->refcnt) == 0 && + (sess->state != SMB2_SESSION_VALID || + time_after(jiffies, + sess->last_active + SMB2_SESSION_TIMEOUT))) { xa_erase(&conn->sessions, sess->id); hash_del(&sess->hlist); ksmbd_session_destroy(sess); @@ -269,8 +270,6 @@ struct ksmbd_session *ksmbd_session_lookup_slowpath(unsigned long long id) down_read(&sessions_table_lock); sess = __session_lookup(id); - if (sess) - sess->last_active = jiffies; up_read(&sessions_table_lock); return sess; @@ -289,6 +288,22 @@ struct ksmbd_session *ksmbd_session_lookup_all(struct ksmbd_conn *conn, return sess; } +void ksmbd_user_session_get(struct ksmbd_session *sess) +{ + atomic_inc(&sess->refcnt); +} + +void ksmbd_user_session_put(struct ksmbd_session *sess) +{ + if (!sess) + return; + + if (atomic_read(&sess->refcnt) <= 0) + WARN_ON(1); + else + atomic_dec(&sess->refcnt); +} + struct preauth_session *ksmbd_preauth_session_alloc(struct ksmbd_conn *conn, u64 sess_id) { @@ -393,6 +408,7 @@ static struct ksmbd_session *__session_create(int protocol) xa_init(&sess->rpc_handle_list); sess->sequence_number = 1; rwlock_init(&sess->tree_conns_lock); + atomic_set(&sess->refcnt, 1); ret = __init_smb2_session(sess); if (ret) diff --git a/fs/smb/server/mgmt/user_session.h b/fs/smb/server/mgmt/user_session.h index dc9fded2cd43..c1c4b20bd5c6 100644 --- a/fs/smb/server/mgmt/user_session.h +++ b/fs/smb/server/mgmt/user_session.h @@ -61,6 +61,8 @@ struct ksmbd_session { struct ksmbd_file_table file_table; unsigned long last_active; rwlock_t tree_conns_lock; + + atomic_t refcnt; }; static inline int test_session_flag(struct ksmbd_session *sess, int bit) @@ -104,4 +106,6 @@ void ksmbd_release_tree_conn_id(struct ksmbd_session *sess, int id); int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name); void ksmbd_session_rpc_close(struct ksmbd_session *sess, int id); int ksmbd_session_rpc_method(struct ksmbd_session *sess, int id); +void ksmbd_user_session_get(struct ksmbd_session *sess); +void ksmbd_user_session_put(struct ksmbd_session *sess); #endif /* __USER_SESSION_MANAGEMENT_H__ */ diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c index 231d2d224656..9670c97f14b3 100644 --- a/fs/smb/server/server.c +++ b/fs/smb/server/server.c @@ -238,6 +238,8 @@ static void __handle_ksmbd_work(struct ksmbd_work *work, } while (is_chained == true); send: + if (work->sess) + ksmbd_user_session_put(work->sess); if (work->tcon) ksmbd_tree_connect_put(work->tcon); smb3_preauth_hash_rsp(work); diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 797b0f24097b..599118aed205 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -605,8 +605,10 @@ int smb2_check_user_session(struct ksmbd_work *work) /* Check for validity of user session */ work->sess = ksmbd_session_lookup_all(conn, sess_id); - if (work->sess) + if (work->sess) { + ksmbd_user_session_get(work->sess); return 1; + } ksmbd_debug(SMB, "Invalid user session, Uid %llu\n", sess_id); return -ENOENT; } @@ -1740,6 +1742,7 @@ int smb2_sess_setup(struct ksmbd_work *work) } conn->binding = true; + ksmbd_user_session_get(sess); } else if ((conn->dialect < SMB30_PROT_ID || server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL) && (req->Flags & SMB2_SESSION_REQ_FLAG_BINDING)) { @@ -1766,6 +1769,7 @@ int smb2_sess_setup(struct ksmbd_work *work) } conn->binding = false; + ksmbd_user_session_get(sess); } work->sess = sess; @@ -2228,7 +2232,9 @@ int smb2_session_logoff(struct ksmbd_work *work) } ksmbd_destroy_file_table(&sess->file_table); + down_write(&conn->session_lock); sess->state = SMB2_SESSION_EXPIRED; + up_write(&conn->session_lock); ksmbd_free_user(sess->user); sess->user = NULL; diff --git a/fs/smb/server/smb_common.c b/fs/smb/server/smb_common.c index 5b8d75e78ffb..a2ebbe604c8c 100644 --- a/fs/smb/server/smb_common.c +++ b/fs/smb/server/smb_common.c @@ -736,13 +736,15 @@ int __ksmbd_override_fsids(struct ksmbd_work *work, struct ksmbd_share_config *share) { struct ksmbd_session *sess = work->sess; + struct ksmbd_user *user = sess->user; struct cred *cred; struct group_info *gi; unsigned int uid; unsigned int gid; + int i; - uid = user_uid(sess->user); - gid = user_gid(sess->user); + uid = user_uid(user); + gid = user_gid(user); if (share->force_uid != KSMBD_SHARE_INVALID_UID) uid = share->force_uid; if (share->force_gid != KSMBD_SHARE_INVALID_GID) @@ -755,11 +757,18 @@ int __ksmbd_override_fsids(struct ksmbd_work *work, cred->fsuid = make_kuid(&init_user_ns, uid); cred->fsgid = make_kgid(&init_user_ns, gid); - gi = groups_alloc(0); + gi = groups_alloc(user->ngroups); if (!gi) { abort_creds(cred); return -ENOMEM; } + + for (i = 0; i < user->ngroups; i++) + gi->gid[i] = make_kgid(&init_user_ns, user->sgid[i]); + + if (user->ngroups) + groups_sort(gi); + set_groups(cred, gi); put_group_info(gi); diff --git a/fs/smb/server/transport_ipc.c b/fs/smb/server/transport_ipc.c index 8752ac82c557..2f27afb695f6 100644 --- a/fs/smb/server/transport_ipc.c +++ b/fs/smb/server/transport_ipc.c @@ -120,6 +120,12 @@ static const struct nla_policy ksmbd_nl_policy[KSMBD_EVENT_MAX + 1] = { }, [KSMBD_EVENT_SPNEGO_AUTHEN_RESPONSE] = { }, + [KSMBD_EVENT_LOGIN_REQUEST_EXT] = { + .len = sizeof(struct ksmbd_login_request), + }, + [KSMBD_EVENT_LOGIN_RESPONSE_EXT] = { + .len = sizeof(struct ksmbd_login_response_ext), + }, }; static struct genl_ops ksmbd_genl_ops[] = { @@ -187,6 +193,14 @@ static struct genl_ops ksmbd_genl_ops[] = { .cmd = KSMBD_EVENT_SPNEGO_AUTHEN_RESPONSE, .doit = handle_generic_event, }, + { + .cmd = KSMBD_EVENT_LOGIN_REQUEST_EXT, + .doit = handle_unsupported_event, + }, + { + .cmd = KSMBD_EVENT_LOGIN_RESPONSE_EXT, + .doit = handle_generic_event, + }, }; static struct genl_family ksmbd_genl_family = { @@ -198,7 +212,7 @@ static struct genl_family ksmbd_genl_family = { .module = THIS_MODULE, .ops = ksmbd_genl_ops, .n_ops = ARRAY_SIZE(ksmbd_genl_ops), - .resv_start_op = KSMBD_EVENT_SPNEGO_AUTHEN_RESPONSE + 1, + .resv_start_op = KSMBD_EVENT_LOGIN_RESPONSE_EXT + 1, }; static void ksmbd_nl_init_fixup(void) @@ -459,16 +473,24 @@ static int ipc_validate_msg(struct ipc_msg_table_entry *entry) { unsigned int msg_sz = entry->msg_sz; - if (entry->type == KSMBD_EVENT_RPC_REQUEST) { + switch (entry->type) { + case KSMBD_EVENT_RPC_REQUEST: + { struct ksmbd_rpc_command *resp = entry->response; msg_sz = sizeof(struct ksmbd_rpc_command) + resp->payload_sz; - } else if (entry->type == KSMBD_EVENT_SPNEGO_AUTHEN_REQUEST) { + break; + } + case KSMBD_EVENT_SPNEGO_AUTHEN_REQUEST: + { struct ksmbd_spnego_authen_response *resp = entry->response; msg_sz = sizeof(struct ksmbd_spnego_authen_response) + resp->session_key_len + resp->spnego_blob_len; - } else if (entry->type == KSMBD_EVENT_SHARE_CONFIG_REQUEST) { + break; + } + case KSMBD_EVENT_SHARE_CONFIG_REQUEST: + { struct ksmbd_share_config_response *resp = entry->response; if (resp->payload_sz) { @@ -478,6 +500,17 @@ static int ipc_validate_msg(struct ipc_msg_table_entry *entry) msg_sz = sizeof(struct ksmbd_share_config_response) + resp->payload_sz; } + break; + } + case KSMBD_EVENT_LOGIN_REQUEST_EXT: + { + struct ksmbd_login_response_ext *resp = entry->response; + + if (resp->ngroups) { + msg_sz = sizeof(struct ksmbd_login_response_ext) + + resp->ngroups * sizeof(gid_t); + } + } } return entry->msg_sz != msg_sz ? -EINVAL : 0; @@ -560,6 +593,29 @@ struct ksmbd_login_response *ksmbd_ipc_login_request(const char *account) return resp; } +struct ksmbd_login_response_ext *ksmbd_ipc_login_request_ext(const char *account) +{ + struct ksmbd_ipc_msg *msg; + struct ksmbd_login_request *req; + struct ksmbd_login_response_ext *resp; + + if (strlen(account) >= KSMBD_REQ_MAX_ACCOUNT_NAME_SZ) + return NULL; + + msg = ipc_msg_alloc(sizeof(struct ksmbd_login_request)); + if (!msg) + return NULL; + + msg->type = KSMBD_EVENT_LOGIN_REQUEST_EXT; + req = (struct ksmbd_login_request *)msg->payload; + req->handle = ksmbd_acquire_id(&ipc_ida); + strscpy(req->account, account, KSMBD_REQ_MAX_ACCOUNT_NAME_SZ); + resp = ipc_msg_send_request(msg, req->handle); + ipc_msg_handle_free(req->handle); + ipc_msg_free(msg); + return resp; +} + struct ksmbd_spnego_authen_response * ksmbd_ipc_spnego_authen_request(const char *spnego_blob, int blob_len) { diff --git a/fs/smb/server/transport_ipc.h b/fs/smb/server/transport_ipc.h index 5e5b90a0c187..d9b6737f8cd0 100644 --- a/fs/smb/server/transport_ipc.h +++ b/fs/smb/server/transport_ipc.h @@ -12,6 +12,8 @@ struct ksmbd_login_response * ksmbd_ipc_login_request(const char *account); +struct ksmbd_login_response_ext * +ksmbd_ipc_login_request_ext(const char *account); struct ksmbd_session; struct ksmbd_share_config; diff --git a/fs/xfs/scrub/bmap_repair.c b/fs/xfs/scrub/bmap_repair.c index 49dc38acc66b..4505f4829d53 100644 --- a/fs/xfs/scrub/bmap_repair.c +++ b/fs/xfs/scrub/bmap_repair.c @@ -801,7 +801,7 @@ xrep_bmap( { struct xrep_bmap *rb; char *descr; - unsigned int max_bmbt_recs; + xfs_extnum_t max_bmbt_recs; bool large_extcount; int error = 0; diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 6dead20338e2..559a3a577097 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -116,7 +116,7 @@ xfs_end_ioend( if (unlikely(error)) { if (ioend->io_flags & IOMAP_F_SHARED) { xfs_reflink_cancel_cow_range(ip, offset, size, true); - xfs_bmap_punch_delalloc_range(ip, offset, + xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset, offset + size); } goto done; @@ -456,7 +456,7 @@ xfs_discard_folio( * byte of the next folio. Hence the end offset is only dependent on the * folio itself and not the start offset that is passed in. */ - xfs_bmap_punch_delalloc_range(ip, pos, + xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos, folio_pos(folio) + folio_size(folio)); } diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 053d567c9108..4719ec90029c 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -442,11 +442,12 @@ out_unlock_iolock: void xfs_bmap_punch_delalloc_range( struct xfs_inode *ip, + int whichfork, xfs_off_t start_byte, xfs_off_t end_byte) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = &ip->i_df; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, start_byte); xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, end_byte); struct xfs_bmbt_irec got, del; @@ -474,11 +475,14 @@ xfs_bmap_punch_delalloc_range( continue; } - xfs_bmap_del_extent_delay(ip, XFS_DATA_FORK, &icur, &got, &del); + xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del); if (!xfs_iext_get_extent(ifp, &icur, &got)) break; } + if (whichfork == XFS_COW_FORK && !ifp->if_bytes) + xfs_inode_clear_cowblocks_tag(ip); + out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); } @@ -580,7 +584,7 @@ xfs_free_eofblocks( */ if (ip->i_diflags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) { if (ip->i_delayed_blks) { - xfs_bmap_punch_delalloc_range(ip, + xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, round_up(XFS_ISIZE(ip), mp->m_sb.sb_blocksize), LLONG_MAX); } diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index eb0895bfb9da..b29760d36e1a 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -30,7 +30,7 @@ xfs_bmap_rtalloc(struct xfs_bmalloca *ap) } #endif /* CONFIG_XFS_RT */ -void xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, +void xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, int whichfork, xfs_off_t start_byte, xfs_off_t end_byte); struct kgetbmap { diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 412b1d71b52b..b19916b11fd5 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -348,9 +348,82 @@ xfs_file_splice_read( } /* + * Take care of zeroing post-EOF blocks when they might exist. + * + * Returns 0 if successfully, a negative error for a failure, or 1 if this + * function dropped the iolock and reacquired it exclusively and the caller + * needs to restart the write sanity checks. + */ +static ssize_t +xfs_file_write_zero_eof( + struct kiocb *iocb, + struct iov_iter *from, + unsigned int *iolock, + size_t count, + bool *drained_dio) +{ + struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host); + loff_t isize; + int error; + + /* + * We need to serialise against EOF updates that occur in IO completions + * here. We want to make sure that nobody is changing the size while + * we do this check until we have placed an IO barrier (i.e. hold + * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The + * spinlock effectively forms a memory barrier once we have + * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and + * hence be able to correctly determine if we need to run zeroing. + */ + spin_lock(&ip->i_flags_lock); + isize = i_size_read(VFS_I(ip)); + if (iocb->ki_pos <= isize) { + spin_unlock(&ip->i_flags_lock); + return 0; + } + spin_unlock(&ip->i_flags_lock); + + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + + if (!*drained_dio) { + /* + * If zeroing is needed and we are currently holding the iolock + * shared, we need to update it to exclusive which implies + * having to redo all checks before. + */ + if (*iolock == XFS_IOLOCK_SHARED) { + xfs_iunlock(ip, *iolock); + *iolock = XFS_IOLOCK_EXCL; + xfs_ilock(ip, *iolock); + iov_iter_reexpand(from, count); + } + + /* + * We now have an IO submission barrier in place, but AIO can do + * EOF updates during IO completion and hence we now need to + * wait for all of them to drain. Non-AIO DIO will have drained + * before we are given the XFS_IOLOCK_EXCL, and so for most + * cases this wait is a no-op. + */ + inode_dio_wait(VFS_I(ip)); + *drained_dio = true; + return 1; + } + + trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize); + + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL); + xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); + + return error; +} + +/* * Common pre-write limit and setup checks. * - * Called with the iolocked held either shared and exclusive according to + * Called with the iolock held either shared and exclusive according to * @iolock, and returns with it held. Might upgrade the iolock to exclusive * if called for a direct write beyond i_size. */ @@ -360,13 +433,10 @@ xfs_file_write_checks( struct iov_iter *from, unsigned int *iolock) { - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - struct xfs_inode *ip = XFS_I(inode); - ssize_t error = 0; + struct inode *inode = iocb->ki_filp->f_mapping->host; size_t count = iov_iter_count(from); bool drained_dio = false; - loff_t isize; + ssize_t error; restart: error = generic_write_checks(iocb, from); @@ -389,7 +459,7 @@ restart: * exclusively. */ if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) { - xfs_iunlock(ip, *iolock); + xfs_iunlock(XFS_I(inode), *iolock); *iolock = XFS_IOLOCK_EXCL; error = xfs_ilock_iocb(iocb, *iolock); if (error) { @@ -400,64 +470,24 @@ restart: } /* - * If the offset is beyond the size of the file, we need to zero any + * If the offset is beyond the size of the file, we need to zero all * blocks that fall between the existing EOF and the start of this - * write. If zeroing is needed and we are currently holding the iolock - * shared, we need to update it to exclusive which implies having to - * redo all checks before. + * write. * - * We need to serialise against EOF updates that occur in IO completions - * here. We want to make sure that nobody is changing the size while we - * do this check until we have placed an IO barrier (i.e. hold the - * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The - * spinlock effectively forms a memory barrier once we have the - * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and - * hence be able to correctly determine if we need to run zeroing. - * - * We can do an unlocked check here safely as IO completion can only - * extend EOF. Truncate is locked out at this point, so the EOF can - * not move backwards, only forwards. Hence we only need to take the - * slow path and spin locks when we are at or beyond the current EOF. + * We can do an unlocked check for i_size here safely as I/O completion + * can only extend EOF. Truncate is locked out at this point, so the + * EOF can not move backwards, only forwards. Hence we only need to take + * the slow path when we are at or beyond the current EOF. */ - if (iocb->ki_pos <= i_size_read(inode)) - goto out; - - spin_lock(&ip->i_flags_lock); - isize = i_size_read(inode); - if (iocb->ki_pos > isize) { - spin_unlock(&ip->i_flags_lock); - - if (iocb->ki_flags & IOCB_NOWAIT) - return -EAGAIN; - - if (!drained_dio) { - if (*iolock == XFS_IOLOCK_SHARED) { - xfs_iunlock(ip, *iolock); - *iolock = XFS_IOLOCK_EXCL; - xfs_ilock(ip, *iolock); - iov_iter_reexpand(from, count); - } - /* - * We now have an IO submission barrier in place, but - * AIO can do EOF updates during IO completion and hence - * we now need to wait for all of them to drain. Non-AIO - * DIO will have drained before we are given the - * XFS_IOLOCK_EXCL, and so for most cases this wait is a - * no-op. - */ - inode_dio_wait(inode); - drained_dio = true; + if (iocb->ki_pos > i_size_read(inode)) { + error = xfs_file_write_zero_eof(iocb, from, iolock, count, + &drained_dio); + if (error == 1) goto restart; - } - - trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize); - error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL); if (error) return error; - } else - spin_unlock(&ip->i_flags_lock); + } -out: return kiocb_modified(iocb); } diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 1e11f48814c0..916531d9f83c 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -975,6 +975,7 @@ xfs_buffered_write_iomap_begin( int allocfork = XFS_DATA_FORK; int error = 0; unsigned int lockmode = XFS_ILOCK_EXCL; + unsigned int iomap_flags = 0; u64 seq; if (xfs_is_shutdown(mp)) @@ -1145,6 +1146,11 @@ xfs_buffered_write_iomap_begin( } } + /* + * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch + * them out if the write happens to fail. + */ + iomap_flags |= IOMAP_F_NEW; if (allocfork == XFS_COW_FORK) { error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb, end_fsb - offset_fsb, prealloc_blocks, &cmap, @@ -1162,19 +1168,11 @@ xfs_buffered_write_iomap_begin( if (error) goto out_unlock; - /* - * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch - * them out if the write happens to fail. - */ - seq = xfs_iomap_inode_sequence(ip, IOMAP_F_NEW); - xfs_iunlock(ip, lockmode); trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap); - return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW, seq); - found_imap: - seq = xfs_iomap_inode_sequence(ip, 0); + seq = xfs_iomap_inode_sequence(ip, iomap_flags); xfs_iunlock(ip, lockmode); - return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags, seq); convert_delay: xfs_iunlock(ip, lockmode); @@ -1188,20 +1186,20 @@ convert_delay: return 0; found_cow: - seq = xfs_iomap_inode_sequence(ip, 0); if (imap.br_startoff <= offset_fsb) { - error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq); + error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, + xfs_iomap_inode_sequence(ip, 0)); if (error) goto out_unlock; - seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); - xfs_iunlock(ip, lockmode); - return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, - IOMAP_F_SHARED, seq); + } else { + xfs_trim_extent(&cmap, offset_fsb, + imap.br_startoff - offset_fsb); } - xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb); + iomap_flags |= IOMAP_F_SHARED; + seq = xfs_iomap_inode_sequence(ip, iomap_flags); xfs_iunlock(ip, lockmode); - return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0, seq); + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, iomap_flags, seq); out_unlock: xfs_iunlock(ip, lockmode); @@ -1215,7 +1213,10 @@ xfs_buffered_write_delalloc_punch( loff_t length, struct iomap *iomap) { - xfs_bmap_punch_delalloc_range(XFS_I(inode), offset, offset + length); + xfs_bmap_punch_delalloc_range(XFS_I(inode), + (iomap->flags & IOMAP_F_SHARED) ? + XFS_COW_FORK : XFS_DATA_FORK, + offset, offset + length); } static int @@ -1227,8 +1228,30 @@ xfs_buffered_write_iomap_end( unsigned flags, struct iomap *iomap) { - iomap_file_buffered_write_punch_delalloc(inode, offset, length, written, - flags, iomap, &xfs_buffered_write_delalloc_punch); + loff_t start_byte, end_byte; + + /* If we didn't reserve the blocks, we're not allowed to punch them. */ + if (iomap->type != IOMAP_DELALLOC || !(iomap->flags & IOMAP_F_NEW)) + return 0; + + /* Nothing to do if we've written the entire delalloc extent */ + start_byte = iomap_last_written_block(inode, offset, written); + end_byte = round_up(offset + length, i_blocksize(inode)); + if (start_byte >= end_byte) + return 0; + + /* For zeroing operations the callers already hold invalidate_lock. */ + if (flags & (IOMAP_UNSHARE | IOMAP_ZERO)) { + rwsem_assert_held_write(&inode->i_mapping->invalidate_lock); + iomap_write_delalloc_release(inode, start_byte, end_byte, flags, + iomap, xfs_buffered_write_delalloc_punch); + } else { + filemap_invalidate_lock(inode->i_mapping); + iomap_write_delalloc_release(inode, start_byte, end_byte, flags, + iomap, xfs_buffered_write_delalloc_punch); + filemap_invalidate_unlock(inode->i_mapping); + } + return 0; } @@ -1435,6 +1458,8 @@ xfs_zero_range( { struct inode *inode = VFS_I(ip); + xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL); + if (IS_DAX(inode)) return dax_zero_range(inode, pos, len, did_zero, &xfs_dax_write_iomap_ops); |