diff options
Diffstat (limited to 'fs')
40 files changed, 507 insertions, 219 deletions
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 4ff56fa4d539..534ba2b02bd6 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -244,10 +244,10 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k, struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); int ret = 0; - bkey_fsck_err_on(alloc_v4_u64s(a.v) > bkey_val_u64s(k.k), c, err, + bkey_fsck_err_on(alloc_v4_u64s_noerror(a.v) > bkey_val_u64s(k.k), c, err, alloc_v4_val_size_bad, "bad val size (%u > %zu)", - alloc_v4_u64s(a.v), bkey_val_u64s(k.k)); + alloc_v4_u64s_noerror(a.v), bkey_val_u64s(k.k)); bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) && BCH_ALLOC_V4_NR_BACKPOINTERS(a.v), c, err, diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 052b2fac25d6..2790e516383d 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -126,13 +126,17 @@ static inline struct bpos alloc_freespace_pos(struct bpos pos, struct bch_alloc_ return pos; } -static inline unsigned alloc_v4_u64s(const struct bch_alloc_v4 *a) +static inline unsigned alloc_v4_u64s_noerror(const struct bch_alloc_v4 *a) { - unsigned ret = (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?: + return (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?: BCH_ALLOC_V4_U64s_V0) + BCH_ALLOC_V4_NR_BACKPOINTERS(a) * (sizeof(struct bch_backpointer) / sizeof(u64)); +} +static inline unsigned alloc_v4_u64s(const struct bch_alloc_v4 *a) +{ + unsigned ret = alloc_v4_u64s_noerror(a); BUG_ON(ret > U8_MAX - BKEY_U64s); return ret; } diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index a20044201002..af7a71de1bdf 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -54,7 +54,7 @@ int bch2_backpointer_invalid(struct bch_fs *c, struct bkey_s_c k, int ret = 0; bkey_fsck_err_on((bp.v->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT) >= ca->mi.bucket_size || - !bpos_eq(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset)), + !bpos_eq(bp.k->p, bucket_pos_to_bp_noerror(ca, bucket, bp.v->bucket_offset)), c, err, backpointer_bucket_offset_wrong, "backpointer bucket_offset wrong"); diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index 85949b9fd880..c1b274eadda1 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -45,6 +45,15 @@ static inline struct bpos bp_pos_to_bucket(const struct bch_fs *c, return POS(bp_pos.inode, sector_to_bucket(ca, bucket_sector)); } +static inline struct bpos bucket_pos_to_bp_noerror(const struct bch_dev *ca, + struct bpos bucket, + u64 bucket_offset) +{ + return POS(bucket.inode, + (bucket_to_sector(ca, bucket.offset) << + MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset); +} + /* * Convert from pos in alloc btree + bucket offset to pos in backpointer btree: */ @@ -53,10 +62,7 @@ static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c, u64 bucket_offset) { struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode); - struct bpos ret = POS(bucket.inode, - (bucket_to_sector(ca, bucket.offset) << - MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset); - + struct bpos ret = bucket_pos_to_bp_noerror(ca, bucket, bucket_offset); EBUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(c, ret))); return ret; } diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index f7fbfccd2b1e..2e8b1a489c20 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -591,6 +591,12 @@ struct bch_member { __le64 btree_allocated_bitmap; }; +/* + * This limit comes from the bucket_gens array - it's a single allocation, and + * kernel allocation are limited to INT_MAX + */ +#define BCH_MEMBER_NBUCKETS_MAX (INT_MAX - 64) + #define BCH_MEMBER_V1_BYTES 56 LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags, 0, 4) @@ -897,6 +903,8 @@ unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_re #define BCH_SB_SECTOR 8 #define BCH_SB_MEMBERS_MAX 64 /* XXX kill */ +#define BCH_SB_LAYOUT_SIZE_BITS_MAX 16 /* 32 MB */ + struct bch_sb_layout { __uuid_t magic; /* bcachefs superblock UUID */ __u8 layout_type; diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index db336a43fc08..a275a9e8e341 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -171,8 +171,8 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, if (type >= BKEY_TYPE_NR) return 0; - bkey_fsck_err_on((type == BKEY_TYPE_btree || - (flags & BKEY_INVALID_COMMIT)) && + bkey_fsck_err_on(k.k->type < KEY_TYPE_MAX && + (type == BKEY_TYPE_btree || (flags & BKEY_INVALID_COMMIT)) && !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), c, err, bkey_invalid_type_for_btree, "invalid key type for btree %s (%s)", diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index e8c1c530cd95..7dafa1accec2 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -956,13 +956,15 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) } #ifdef __KERNEL__ - for_each_possible_cpu(cpu) { - struct btree_key_cache_freelist *f = - per_cpu_ptr(bc->pcpu_freed, cpu); - - for (i = 0; i < f->nr; i++) { - ck = f->objs[i]; - list_add(&ck->list, &items); + if (bc->pcpu_freed) { + for_each_possible_cpu(cpu) { + struct btree_key_cache_freelist *f = + per_cpu_ptr(bc->pcpu_freed, cpu); + + for (i = 0; i < f->nr; i++) { + ck = f->objs[i]; + list_add(&ck->list, &items); + } } } #endif diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 7ed779b411f6..088fd2e7bdf1 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -102,6 +102,7 @@ static inline int do_encrypt_sg(struct crypto_sync_skcipher *tfm, int ret; skcipher_request_set_sync_tfm(req, tfm); + skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, sg, sg, len, nonce.d); ret = crypto_skcipher_encrypt(req); diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 01a79fa3eacb..dbe35b80bc0b 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -175,6 +175,7 @@ x(EINVAL, block_size_too_small) \ x(EINVAL, bucket_size_too_small) \ x(EINVAL, device_size_too_small) \ + x(EINVAL, device_size_too_big) \ x(EINVAL, device_not_a_member_of_filesystem) \ x(EINVAL, device_has_been_removed) \ x(EINVAL, device_splitbrain) \ diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index fce690007edf..6f114803c6f2 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -964,7 +964,6 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, struct btree_iter iter; struct bkey_s_c k; struct bkey_buf cur, prev; - struct bpos end = POS(ei->v.i_ino, (start + len) >> 9); unsigned offset_into_extent, sectors; bool have_extent = false; u32 snapshot; @@ -974,6 +973,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, if (ret) return ret; + struct bpos end = POS(ei->v.i_ino, (start + len) >> 9); if (start + len < start) return -EINVAL; diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index f137252bccc5..40d7df7607df 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -199,9 +199,6 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, u64 new_i_size, s64 i_sectors_delta) { - struct btree_iter iter; - struct bkey_i *k; - struct bkey_i_inode_v3 *inode; /* * Crazy performance optimization: * Every extent update needs to also update the inode: the inode trigger @@ -214,25 +211,36 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, * lost, but that's fine. */ unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL; - int ret; - k = bch2_bkey_get_mut_noupdate(trans, &iter, BTREE_ID_inodes, + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, SPOS(0, extent_iter->pos.inode, extent_iter->snapshot), BTREE_ITER_CACHED); - ret = PTR_ERR_OR_ZERO(k); + int ret = bkey_err(k); if (unlikely(ret)) return ret; - if (unlikely(k->k.type != KEY_TYPE_inode_v3)) { - k = bch2_inode_to_v3(trans, k); - ret = PTR_ERR_OR_ZERO(k); + /* + * varint_decode_fast(), in the inode .invalid method, reads up to 7 + * bytes past the end of the buffer: + */ + struct bkey_i *k_mut = bch2_trans_kmalloc_nomemzero(trans, bkey_bytes(k.k) + 8); + ret = PTR_ERR_OR_ZERO(k_mut); + if (unlikely(ret)) + goto err; + + bkey_reassemble(k_mut, k); + + if (unlikely(k_mut->k.type != KEY_TYPE_inode_v3)) { + k_mut = bch2_inode_to_v3(trans, k_mut); + ret = PTR_ERR_OR_ZERO(k_mut); if (unlikely(ret)) goto err; } - inode = bkey_i_to_inode_v3(k); + struct bkey_i_inode_v3 *inode = bkey_i_to_inode_v3(k_mut); if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_i_size_dirty) && new_i_size > le64_to_cpu(inode->v.bi_size)) { @@ -1505,6 +1513,8 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) unsigned sectors; int ret; + memset(&op->failed, 0, sizeof(op->failed)); + op->flags |= BCH_WRITE_WROTE_DATA_INLINE; op->flags |= BCH_WRITE_DONE; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 9c9a25dbd613..a8b08e76d0d0 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -706,6 +706,12 @@ recheck_need_open: spin_unlock(&j->lock); + /* + * We're called from bch2_journal_flush_seq() -> wait_event(); + * but this might block. We won't usually block, so we won't + * livelock: + */ + sched_annotate_sleep(); ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); if (ret) return ret; @@ -870,6 +876,8 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou { struct journal_buf *ret = NULL; + /* We're inside wait_event(), but using mutex_lock(: */ + sched_annotate_sleep(); mutex_lock(&j->buf_lock); spin_lock(&j->lock); max_seq = min(max_seq, journal_cur_seq(j)); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index bf68ea49447b..4d94b7742dbb 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -968,24 +968,30 @@ static bool migrate_btree_pred(struct bch_fs *c, void *arg, return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); } +/* + * Ancient versions of bcachefs produced packed formats which could represent + * keys that the in memory format cannot represent; this checks for those + * formats so we can get rid of them. + */ static bool bformat_needs_redo(struct bkey_format *f) { - unsigned i; - - for (i = 0; i < f->nr_fields; i++) { + for (unsigned i = 0; i < f->nr_fields; i++) { + unsigned f_bits = f->bits_per_field[i]; unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1)); u64 field_offset = le64_to_cpu(f->field_offset[i]); - if (f->bits_per_field[i] > unpacked_bits) + if (f_bits > unpacked_bits) return true; - if ((f->bits_per_field[i] == unpacked_bits) && field_offset) + if ((f_bits == unpacked_bits) && field_offset) return true; - if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) & - unpacked_mask) < - field_offset) + u64 f_mask = f_bits + ? ~((~0ULL << (f_bits - 1)) << 1) + : 0; + + if (((field_offset + f_mask) & unpacked_mask) < field_offset) return true; } diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index e68b34eab90a..556da0738106 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -560,13 +560,11 @@ static int bch2_fs_quota_read_inode(struct btree_trans *trans, struct bch_fs *c = trans->c; struct bch_inode_unpacked u; struct bch_snapshot_tree s_t; - int ret; + u32 tree = bch2_snapshot_tree(c, k.k->p.snapshot); - ret = bch2_snapshot_tree_lookup(trans, - bch2_snapshot_tree(c, k.k->p.snapshot), &s_t); + int ret = bch2_snapshot_tree_lookup(trans, tree, &s_t); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, - "%s: snapshot tree %u not found", __func__, - snapshot_t(c, k.k->p.snapshot)->tree); + "%s: snapshot tree %u not found", __func__, tree); if (ret) return ret; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index be5b47619327..8091d0686029 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -902,7 +902,8 @@ out: bch2_journal_keys_put_initial(c); bch2_find_btree_nodes_exit(&c->found_btree_nodes); } - kfree(clean); + if (!IS_ERR(clean)) + kfree(clean); if (!ret && test_bit(BCH_FS_need_delete_dead_snapshots, &c->flags) && diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c index 35ca3f138de6..194e55b11137 100644 --- a/fs/bcachefs/sb-clean.c +++ b/fs/bcachefs/sb-clean.c @@ -278,6 +278,17 @@ static int bch2_sb_clean_validate(struct bch_sb *sb, return -BCH_ERR_invalid_sb_clean; } + for (struct jset_entry *entry = clean->start; + entry != vstruct_end(&clean->field); + entry = vstruct_next(entry)) { + if ((void *) vstruct_next(entry) > vstruct_end(&clean->field)) { + prt_str(err, "entry type "); + bch2_prt_jset_entry_type(err, le16_to_cpu(entry->type)); + prt_str(err, " overruns end of section"); + return -BCH_ERR_invalid_sb_clean; + } + } + return 0; } @@ -295,6 +306,9 @@ static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb, for (entry = clean->start; entry != vstruct_end(&clean->field); entry = vstruct_next(entry)) { + if ((void *) vstruct_next(entry) > vstruct_end(&clean->field)) + break; + if (entry->type == BCH_JSET_ENTRY_btree_keys && !entry->u64s) continue; diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index 5b8e621ac5eb..44b3f0cb7b49 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -124,9 +124,9 @@ static int validate_member(struct printbuf *err, struct bch_sb *sb, int i) { - if (le64_to_cpu(m.nbuckets) > LONG_MAX) { - prt_printf(err, "device %u: too many buckets (got %llu, max %lu)", - i, le64_to_cpu(m.nbuckets), LONG_MAX); + if (le64_to_cpu(m.nbuckets) > BCH_MEMBER_NBUCKETS_MAX) { + prt_printf(err, "device %u: too many buckets (got %llu, max %u)", + i, le64_to_cpu(m.nbuckets), BCH_MEMBER_NBUCKETS_MAX); return -BCH_ERR_invalid_sb_members; } diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h index 5efa64eca5f8..5bf27d30ca29 100644 --- a/fs/bcachefs/sb-members.h +++ b/fs/bcachefs/sb-members.h @@ -107,10 +107,10 @@ static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, struct bch_dev * static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev *ca) { + rcu_read_lock(); if (ca) percpu_ref_put(&ca->ref); - rcu_read_lock(); if ((ca = __bch2_next_dev(c, ca, NULL))) percpu_ref_get(&ca->ref); rcu_read_unlock(); @@ -132,10 +132,10 @@ static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, struct bch_dev *ca, unsigned state_mask) { + rcu_read_lock(); if (ca) percpu_ref_put(&ca->io_ref); - rcu_read_lock(); while ((ca = __bch2_next_dev(c, ca, NULL)) && (!((1 << ca->mi.state) & state_mask) || !percpu_ref_tryget(&ca->io_ref))) diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 08ea3dbbbe97..bfdb15e7d778 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -232,7 +232,7 @@ struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb, struct bch_sb_handle *dev_sb = &ca->disk_sb; if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) { - percpu_ref_put(&ca->ref); + percpu_ref_put(&ca->io_ref); return NULL; } } @@ -649,7 +649,7 @@ reread: bytes = vstruct_bytes(sb->sb); - if (bytes > 512 << sb->sb->layout.sb_max_size_bits) { + if (bytes > 512ULL << min(BCH_SB_LAYOUT_SIZE_BITS_MAX, sb->sb->layout.sb_max_size_bits)) { prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %lu)", bytes, 512UL << sb->sb->layout.sb_max_size_bits); return -BCH_ERR_invalid_sb_too_big; @@ -923,6 +923,7 @@ int bch2_write_super(struct bch_fs *c) struct bch_devs_mask sb_written; bool wrote, can_mount_without_written, can_mount_with_written; unsigned degraded_flags = BCH_FORCE_IF_DEGRADED; + DARRAY(struct bch_dev *) online_devices = {}; int ret = 0; trace_and_count(c, write_super, c, _RET_IP_); @@ -935,6 +936,15 @@ int bch2_write_super(struct bch_fs *c) closure_init_stack(cl); memset(&sb_written, 0, sizeof(sb_written)); + for_each_online_member(c, ca) { + ret = darray_push(&online_devices, ca); + if (bch2_fs_fatal_err_on(ret, c, "%s: error allocating online devices", __func__)) { + percpu_ref_put(&ca->io_ref); + goto out; + } + percpu_ref_get(&ca->io_ref); + } + /* Make sure we're using the new magic numbers: */ c->disk_sb.sb->magic = BCHFS_MAGIC; c->disk_sb.sb->layout.magic = BCHFS_MAGIC; @@ -942,8 +952,8 @@ int bch2_write_super(struct bch_fs *c) le64_add_cpu(&c->disk_sb.sb->seq, 1); struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); - for_each_online_member(c, ca) - __bch2_members_v2_get_mut(mi, ca->dev_idx)->seq = c->disk_sb.sb->seq; + darray_for_each(online_devices, ca) + __bch2_members_v2_get_mut(mi, (*ca)->dev_idx)->seq = c->disk_sb.sb->seq; c->disk_sb.sb->write_time = cpu_to_le64(ktime_get_real_seconds()); if (test_bit(BCH_FS_error, &c->flags)) @@ -959,16 +969,15 @@ int bch2_write_super(struct bch_fs *c) bch2_sb_errors_from_cpu(c); bch2_sb_downgrade_update(c); - for_each_online_member(c, ca) - bch2_sb_from_fs(c, ca); + darray_for_each(online_devices, ca) + bch2_sb_from_fs(c, (*ca)); - for_each_online_member(c, ca) { + darray_for_each(online_devices, ca) { printbuf_reset(&err); - ret = bch2_sb_validate(&ca->disk_sb, &err, WRITE); + ret = bch2_sb_validate(&(*ca)->disk_sb, &err, WRITE); if (ret) { bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf); - percpu_ref_put(&ca->io_ref); goto out; } } @@ -995,16 +1004,18 @@ int bch2_write_super(struct bch_fs *c) return -BCH_ERR_sb_not_downgraded; } - for_each_online_member(c, ca) { - __set_bit(ca->dev_idx, sb_written.d); - ca->sb_write_error = 0; + darray_for_each(online_devices, ca) { + __set_bit((*ca)->dev_idx, sb_written.d); + (*ca)->sb_write_error = 0; } - for_each_online_member(c, ca) - read_back_super(c, ca); + darray_for_each(online_devices, ca) + read_back_super(c, *ca); closure_sync(cl); - for_each_online_member(c, ca) { + darray_for_each(online_devices, cap) { + struct bch_dev *ca = *cap; + if (ca->sb_write_error) continue; @@ -1031,17 +1042,20 @@ int bch2_write_super(struct bch_fs *c) do { wrote = false; - for_each_online_member(c, ca) + darray_for_each(online_devices, cap) { + struct bch_dev *ca = *cap; if (!ca->sb_write_error && sb < ca->disk_sb.sb->layout.nr_superblocks) { write_one_super(c, ca, sb); wrote = true; } + } closure_sync(cl); sb++; } while (wrote); - for_each_online_member(c, ca) { + darray_for_each(online_devices, cap) { + struct bch_dev *ca = *cap; if (ca->sb_write_error) __clear_bit(ca->dev_idx, sb_written.d); else @@ -1077,6 +1091,9 @@ int bch2_write_super(struct bch_fs *c) out: /* Make new options visible after they're persistent: */ bch2_sb_update(c); + darray_for_each(online_devices, ca) + percpu_ref_put(&(*ca)->io_ref); + darray_exit(&online_devices); printbuf_exit(&err); return ret; } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 88e214c609bb..dddf57ec4511 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1959,6 +1959,13 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) goto err; } + if (nbuckets > BCH_MEMBER_NBUCKETS_MAX) { + bch_err(ca, "New device size too big (%llu greater than max %u)", + nbuckets, BCH_MEMBER_NBUCKETS_MAX); + ret = -BCH_ERR_device_size_too_big; + goto err; + } + if (bch2_dev_is_online(ca) && get_capacity(ca->disk_sb.bdev->bd_disk) < ca->mi.bucket_size * nbuckets) { @@ -2004,13 +2011,9 @@ err: /* return with ref on ca->ref: */ struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name) { - rcu_read_lock(); - for_each_member_device_rcu(c, ca, NULL) - if (!strcmp(name, ca->name)) { - rcu_read_unlock(); + for_each_member_device(c, ca) + if (!strcmp(name, ca->name)) return ca; - } - rcu_read_unlock(); return ERR_PTR(-BCH_ERR_ENOENT_dev_not_found); } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 55f3ba6a831c..0493272a7668 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3758,15 +3758,43 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) goto drop_write; } - down_write(&fs_info->subvol_sem); - switch (sa->cmd) { case BTRFS_QUOTA_CTL_ENABLE: case BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA: + down_write(&fs_info->subvol_sem); ret = btrfs_quota_enable(fs_info, sa); + up_write(&fs_info->subvol_sem); break; case BTRFS_QUOTA_CTL_DISABLE: + /* + * Lock the cleaner mutex to prevent races with concurrent + * relocation, because relocation may be building backrefs for + * blocks of the quota root while we are deleting the root. This + * is like dropping fs roots of deleted snapshots/subvolumes, we + * need the same protection. + * + * This also prevents races between concurrent tasks trying to + * disable quotas, because we will unlock and relock + * qgroup_ioctl_lock across BTRFS_FS_QUOTA_ENABLED changes. + * + * We take this here because we have the dependency of + * + * inode_lock -> subvol_sem + * + * because of rename. With relocation we can prealloc extents, + * so that makes the dependency chain + * + * cleaner_mutex -> inode_lock -> subvol_sem + * + * so we must take the cleaner_mutex here before we take the + * subvol_sem. The deadlock can't actually happen, but this + * quiets lockdep. + */ + mutex_lock(&fs_info->cleaner_mutex); + down_write(&fs_info->subvol_sem); ret = btrfs_quota_disable(fs_info); + up_write(&fs_info->subvol_sem); + mutex_unlock(&fs_info->cleaner_mutex); break; default: ret = -EINVAL; @@ -3774,7 +3802,6 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) } kfree(sa); - up_write(&fs_info->subvol_sem); drop_write: mnt_drop_write_file(file); return ret; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index b749ba45da2b..c2a42bcde98e 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -1188,6 +1188,7 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent( ordered->disk_bytenr += len; ordered->num_bytes -= len; ordered->disk_num_bytes -= len; + ordered->ram_bytes -= len; if (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)) { ASSERT(ordered->bytes_left == 0); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index cf8820ce7aa2..40e5f7f2fcb7 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1342,16 +1342,10 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) lockdep_assert_held_write(&fs_info->subvol_sem); /* - * Lock the cleaner mutex to prevent races with concurrent relocation, - * because relocation may be building backrefs for blocks of the quota - * root while we are deleting the root. This is like dropping fs roots - * of deleted snapshots/subvolumes, we need the same protection. - * - * This also prevents races between concurrent tasks trying to disable - * quotas, because we will unlock and relock qgroup_ioctl_lock across - * BTRFS_FS_QUOTA_ENABLED changes. + * Relocation will mess with backrefs, so make sure we have the + * cleaner_mutex held to protect us from relocate. */ - mutex_lock(&fs_info->cleaner_mutex); + lockdep_assert_held(&fs_info->cleaner_mutex); mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) @@ -1373,9 +1367,13 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); btrfs_qgroup_wait_for_completion(fs_info, false); + /* + * We have nothing held here and no trans handle, just return the error + * if there is one. + */ ret = flush_reservations(fs_info); if (ret) - goto out_unlock_cleaner; + return ret; /* * 1 For the root item @@ -1439,9 +1437,6 @@ out: btrfs_end_transaction(trans); else if (trans) ret = btrfs_commit_transaction(trans); -out_unlock_cleaner: - mutex_unlock(&fs_info->cleaner_mutex); - return ret; } @@ -3050,6 +3045,8 @@ int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info, struct btrfs_qgroup_inherit *inherit, size_t size) { + if (!btrfs_qgroup_enabled(fs_info)) + return 0; if (inherit->flags & ~BTRFS_QGROUP_INHERIT_FLAGS_SUPP) return -EOPNOTSUPP; if (size < sizeof(*inherit) || size > PAGE_SIZE) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index c8fbcae4e88e..32604e9b31c3 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1797,6 +1797,11 @@ enum btrfs_tree_block_status __btrfs_check_leaf(struct extent_buffer *leaf) return BTRFS_TREE_BLOCK_INVALID_LEVEL; } + if (unlikely(!btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_WRITTEN))) { + generic_err(leaf, 0, "invalid flag for leaf, WRITTEN not set"); + return BTRFS_TREE_BLOCK_WRITTEN_NOT_SET; + } + /* * Extent buffers from a relocation tree have a owner field that * corresponds to the subvolume tree they are based on. So just from an @@ -1858,6 +1863,7 @@ enum btrfs_tree_block_status __btrfs_check_leaf(struct extent_buffer *leaf) for (slot = 0; slot < nritems; slot++) { u32 item_end_expected; u64 item_data_end; + enum btrfs_tree_block_status ret; btrfs_item_key_to_cpu(leaf, &key, slot); @@ -1913,21 +1919,10 @@ enum btrfs_tree_block_status __btrfs_check_leaf(struct extent_buffer *leaf) return BTRFS_TREE_BLOCK_INVALID_OFFSETS; } - /* - * We only want to do this if WRITTEN is set, otherwise the leaf - * may be in some intermediate state and won't appear valid. - */ - if (btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_WRITTEN)) { - enum btrfs_tree_block_status ret; - - /* - * Check if the item size and content meet other - * criteria - */ - ret = check_leaf_item(leaf, &key, slot, &prev_key); - if (unlikely(ret != BTRFS_TREE_BLOCK_CLEAN)) - return ret; - } + /* Check if the item size and content meet other criteria. */ + ret = check_leaf_item(leaf, &key, slot, &prev_key); + if (unlikely(ret != BTRFS_TREE_BLOCK_CLEAN)) + return ret; prev_key.objectid = key.objectid; prev_key.type = key.type; @@ -1957,6 +1952,11 @@ enum btrfs_tree_block_status __btrfs_check_node(struct extent_buffer *node) int level = btrfs_header_level(node); u64 bytenr; + if (unlikely(!btrfs_header_flag(node, BTRFS_HEADER_FLAG_WRITTEN))) { + generic_err(node, 0, "invalid flag for node, WRITTEN not set"); + return BTRFS_TREE_BLOCK_WRITTEN_NOT_SET; + } + if (unlikely(level <= 0 || level >= BTRFS_MAX_LEVEL)) { generic_err(node, 0, "invalid level for node, have %d expect [1, %d]", diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index 5c809b50b2d0..01669cfa6578 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -53,6 +53,7 @@ enum btrfs_tree_block_status { BTRFS_TREE_BLOCK_INVALID_BLOCKPTR, BTRFS_TREE_BLOCK_INVALID_ITEM, BTRFS_TREE_BLOCK_INVALID_OWNER, + BTRFS_TREE_BLOCK_WRITTEN_NOT_SET, }; /* diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f15591f3e54f..ef6bd2f4251b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3455,6 +3455,7 @@ again: * alignment and size). */ ret = -EUCLEAN; + mutex_unlock(&fs_info->reclaim_bgs_lock); goto error; } diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 882b89edc52a..f53ca4f7fced 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -980,6 +980,34 @@ static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int dep } /* + * The ffd.file pointer may be in the process of being torn down due to + * being closed, but we may not have finished eventpoll_release() yet. + * + * Normally, even with the atomic_long_inc_not_zero, the file may have + * been free'd and then gotten re-allocated to something else (since + * files are not RCU-delayed, they are SLAB_TYPESAFE_BY_RCU). + * + * But for epoll, users hold the ep->mtx mutex, and as such any file in + * the process of being free'd will block in eventpoll_release_file() + * and thus the underlying file allocation will not be free'd, and the + * file re-use cannot happen. + * + * For the same reason we can avoid a rcu_read_lock() around the + * operation - 'ffd.file' cannot go away even if the refcount has + * reached zero (but we must still not call out to ->poll() functions + * etc). + */ +static struct file *epi_fget(const struct epitem *epi) +{ + struct file *file; + + file = epi->ffd.file; + if (!atomic_long_inc_not_zero(&file->f_count)) + file = NULL; + return file; +} + +/* * Differs from ep_eventpoll_poll() in that internal callers already have * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested() * is correctly annotated. @@ -987,14 +1015,22 @@ static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int dep static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, int depth) { - struct file *file = epi->ffd.file; + struct file *file = epi_fget(epi); __poll_t res; + /* + * We could return EPOLLERR | EPOLLHUP or something, but let's + * treat this more as "file doesn't exist, poll didn't happen". + */ + if (!file) + return 0; + pt->_key = epi->event.events; if (!is_file_epoll(file)) res = vfs_poll(file, pt); else res = __ep_eventpoll_poll(file, pt, depth); + fput(file); return res & epi->event.events; } diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index 077944d3c2c0..84572e11cc05 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -420,6 +420,7 @@ static void exfat_set_entry_type(struct exfat_dentry *ep, unsigned int type) static void exfat_init_stream_entry(struct exfat_dentry *ep, unsigned int start_clu, unsigned long long size) { + memset(ep, 0, sizeof(*ep)); exfat_set_entry_type(ep, TYPE_STREAM); if (size == 0) ep->dentry.stream.flags = ALLOC_FAT_CHAIN; @@ -457,6 +458,7 @@ void exfat_init_dir_entry(struct exfat_entry_set_cache *es, struct exfat_dentry *ep; ep = exfat_get_dentry_cached(es, ES_IDX_FILE); + memset(ep, 0, sizeof(*ep)); exfat_set_entry_type(ep, type); exfat_set_entry_time(sbi, ts, &ep->dentry.file.create_tz, diff --git a/fs/exfat/file.c b/fs/exfat/file.c index cc00f1a7a1e1..9adfc38ca7da 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -51,7 +51,7 @@ static int exfat_cont_expand(struct inode *inode, loff_t size) clu.flags = ei->flags; ret = exfat_alloc_cluster(inode, new_num_clusters - num_clusters, - &clu, IS_DIRSYNC(inode)); + &clu, inode_needs_sync(inode)); if (ret) return ret; @@ -77,12 +77,11 @@ out: ei->i_size_aligned = round_up(size, sb->s_blocksize); ei->i_size_ondisk = ei->i_size_aligned; inode->i_blocks = round_up(size, sbi->cluster_size) >> 9; + mark_inode_dirty(inode); - if (IS_DIRSYNC(inode)) + if (IS_SYNC(inode)) return write_inode_now(inode, 1); - mark_inode_dirty(inode); - return 0; free_clu: diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c index 1567f0323858..9666d13884ce 100644 --- a/fs/fuse/passthrough.c +++ b/fs/fuse/passthrough.c @@ -225,7 +225,7 @@ int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map) goto out; res = -EINVAL; - if (map->flags) + if (map->flags || map->padding) goto out; file = fget(map->fd); diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 322af827a232..bb3e941b9503 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -170,7 +170,7 @@ static ssize_t tag_show(struct kobject *kobj, { struct virtio_fs *fs = container_of(kobj, struct virtio_fs, kobj); - return sysfs_emit(buf, fs->tag); + return sysfs_emit(buf, "%s\n", fs->tag); } static struct kobj_attribute virtio_fs_tag_attr = __ATTR_RO(tag); diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c index 4978edfb15f9..b9d9116fc2b3 100644 --- a/fs/smb/server/oplock.c +++ b/fs/smb/server/oplock.c @@ -207,9 +207,9 @@ static void opinfo_add(struct oplock_info *opinfo) { struct ksmbd_inode *ci = opinfo->o_fp->f_ci; - write_lock(&ci->m_lock); + down_write(&ci->m_lock); list_add_rcu(&opinfo->op_entry, &ci->m_op_list); - write_unlock(&ci->m_lock); + up_write(&ci->m_lock); } static void opinfo_del(struct oplock_info *opinfo) @@ -221,9 +221,9 @@ static void opinfo_del(struct oplock_info *opinfo) lease_del_list(opinfo); write_unlock(&lease_list_lock); } - write_lock(&ci->m_lock); + down_write(&ci->m_lock); list_del_rcu(&opinfo->op_entry); - write_unlock(&ci->m_lock); + up_write(&ci->m_lock); } static unsigned long opinfo_count(struct ksmbd_file *fp) @@ -526,21 +526,18 @@ static struct oplock_info *same_client_has_lease(struct ksmbd_inode *ci, * Compare lease key and client_guid to know request from same owner * of same client */ - read_lock(&ci->m_lock); + down_read(&ci->m_lock); list_for_each_entry(opinfo, &ci->m_op_list, op_entry) { if (!opinfo->is_lease || !opinfo->conn) continue; - read_unlock(&ci->m_lock); lease = opinfo->o_lease; ret = compare_guid_key(opinfo, client_guid, lctx->lease_key); if (ret) { m_opinfo = opinfo; /* skip upgrading lease about breaking lease */ - if (atomic_read(&opinfo->breaking_cnt)) { - read_lock(&ci->m_lock); + if (atomic_read(&opinfo->breaking_cnt)) continue; - } /* upgrading lease */ if ((atomic_read(&ci->op_count) + @@ -570,9 +567,8 @@ static struct oplock_info *same_client_has_lease(struct ksmbd_inode *ci, lease_none_upgrade(opinfo, lctx->req_state); } } - read_lock(&ci->m_lock); } - read_unlock(&ci->m_lock); + up_read(&ci->m_lock); return m_opinfo; } @@ -613,13 +609,23 @@ static int oplock_break_pending(struct oplock_info *opinfo, int req_op_level) if (opinfo->op_state == OPLOCK_CLOSING) return -ENOENT; - else if (!opinfo->is_lease && opinfo->level <= req_op_level) - return 1; + else if (opinfo->level <= req_op_level) { + if (opinfo->is_lease && + opinfo->o_lease->state != + (SMB2_LEASE_HANDLE_CACHING_LE | + SMB2_LEASE_READ_CACHING_LE)) + return 1; + } } - if (!opinfo->is_lease && opinfo->level <= req_op_level) { - wake_up_oplock_break(opinfo); - return 1; + if (opinfo->level <= req_op_level) { + if (opinfo->is_lease && + opinfo->o_lease->state != + (SMB2_LEASE_HANDLE_CACHING_LE | + SMB2_LEASE_READ_CACHING_LE)) { + wake_up_oplock_break(opinfo); + return 1; + } } return 0; } @@ -887,7 +893,6 @@ static int oplock_break(struct oplock_info *brk_opinfo, int req_op_level) struct lease *lease = brk_opinfo->o_lease; atomic_inc(&brk_opinfo->breaking_cnt); - err = oplock_break_pending(brk_opinfo, req_op_level); if (err) return err < 0 ? err : 0; @@ -1105,7 +1110,7 @@ void smb_send_parent_lease_break_noti(struct ksmbd_file *fp, if (!p_ci) return; - read_lock(&p_ci->m_lock); + down_read(&p_ci->m_lock); list_for_each_entry(opinfo, &p_ci->m_op_list, op_entry) { if (opinfo->conn == NULL || !opinfo->is_lease) continue; @@ -1123,13 +1128,11 @@ void smb_send_parent_lease_break_noti(struct ksmbd_file *fp, continue; } - read_unlock(&p_ci->m_lock); oplock_break(opinfo, SMB2_OPLOCK_LEVEL_NONE); opinfo_conn_put(opinfo); - read_lock(&p_ci->m_lock); } } - read_unlock(&p_ci->m_lock); + up_read(&p_ci->m_lock); ksmbd_inode_put(p_ci); } @@ -1150,7 +1153,7 @@ void smb_lazy_parent_lease_break_close(struct ksmbd_file *fp) if (!p_ci) return; - read_lock(&p_ci->m_lock); + down_read(&p_ci->m_lock); list_for_each_entry(opinfo, &p_ci->m_op_list, op_entry) { if (opinfo->conn == NULL || !opinfo->is_lease) continue; @@ -1164,13 +1167,11 @@ void smb_lazy_parent_lease_break_close(struct ksmbd_file *fp) atomic_dec(&opinfo->conn->r_count); continue; } - read_unlock(&p_ci->m_lock); oplock_break(opinfo, SMB2_OPLOCK_LEVEL_NONE); opinfo_conn_put(opinfo); - read_lock(&p_ci->m_lock); } } - read_unlock(&p_ci->m_lock); + up_read(&p_ci->m_lock); ksmbd_inode_put(p_ci); } @@ -1200,7 +1201,9 @@ int smb_grant_oplock(struct ksmbd_work *work, int req_op_level, u64 pid, /* Only v2 leases handle the directory */ if (S_ISDIR(file_inode(fp->filp)->i_mode)) { - if (!lctx || lctx->version != 2) + if (!lctx || lctx->version != 2 || + (lctx->flags != SMB2_LEASE_FLAG_PARENT_LEASE_KEY_SET_LE && + !lctx->epoch)) return 0; } @@ -1465,8 +1468,9 @@ void create_lease_buf(u8 *rbuf, struct lease *lease) buf->lcontext.LeaseFlags = lease->flags; buf->lcontext.Epoch = cpu_to_le16(lease->epoch); buf->lcontext.LeaseState = lease->state; - memcpy(buf->lcontext.ParentLeaseKey, lease->parent_lease_key, - SMB2_LEASE_KEY_SIZE); + if (lease->flags == SMB2_LEASE_FLAG_PARENT_LEASE_KEY_SET_LE) + memcpy(buf->lcontext.ParentLeaseKey, lease->parent_lease_key, + SMB2_LEASE_KEY_SIZE); buf->ccontext.DataOffset = cpu_to_le16(offsetof (struct create_lease_v2, lcontext)); buf->ccontext.DataLength = cpu_to_le32(sizeof(struct lease_context_v2)); @@ -1525,8 +1529,9 @@ struct lease_ctx_info *parse_lease_state(void *open_req) lreq->flags = lc->lcontext.LeaseFlags; lreq->epoch = lc->lcontext.Epoch; lreq->duration = lc->lcontext.LeaseDuration; - memcpy(lreq->parent_lease_key, lc->lcontext.ParentLeaseKey, - SMB2_LEASE_KEY_SIZE); + if (lreq->flags == SMB2_LEASE_FLAG_PARENT_LEASE_KEY_SET_LE) + memcpy(lreq->parent_lease_key, lc->lcontext.ParentLeaseKey, + SMB2_LEASE_KEY_SIZE); lreq->version = 2; } else { struct create_lease *lc = (struct create_lease *)cc; diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 355824151c2d..b6c5a8ea3887 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -1926,7 +1926,7 @@ int smb2_tree_connect(struct ksmbd_work *work) struct ksmbd_session *sess = work->sess; char *treename = NULL, *name = NULL; struct ksmbd_tree_conn_status status; - struct ksmbd_share_config *share; + struct ksmbd_share_config *share = NULL; int rc = -EINVAL; WORK_BUFFERS(work, req, rsp); @@ -1988,7 +1988,7 @@ int smb2_tree_connect(struct ksmbd_work *work) write_unlock(&sess->tree_conns_lock); rsp->StructureSize = cpu_to_le16(16); out_err1: - if (server_conf.flags & KSMBD_GLOBAL_FLAG_DURABLE_HANDLE && + if (server_conf.flags & KSMBD_GLOBAL_FLAG_DURABLE_HANDLE && share && test_share_config_flag(share, KSMBD_SHARE_FLAG_CONTINUOUS_AVAILABILITY)) rsp->Capabilities = SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY; @@ -3376,9 +3376,9 @@ int smb2_open(struct ksmbd_work *work) * after daccess, saccess, attrib_only, and stream are * initialized. */ - write_lock(&fp->f_ci->m_lock); + down_write(&fp->f_ci->m_lock); list_add(&fp->node, &fp->f_ci->m_fp_list); - write_unlock(&fp->f_ci->m_lock); + up_write(&fp->f_ci->m_lock); /* Check delete pending among previous fp before oplock break */ if (ksmbd_inode_pending_delete(fp)) { diff --git a/fs/smb/server/smb_common.c b/fs/smb/server/smb_common.c index fcaf373cc008..474dadf6b7b8 100644 --- a/fs/smb/server/smb_common.c +++ b/fs/smb/server/smb_common.c @@ -646,7 +646,7 @@ int ksmbd_smb_check_shared_mode(struct file *filp, struct ksmbd_file *curr_fp) * Lookup fp in master fp list, and check desired access and * shared mode between previous open and current open. */ - read_lock(&curr_fp->f_ci->m_lock); + down_read(&curr_fp->f_ci->m_lock); list_for_each_entry(prev_fp, &curr_fp->f_ci->m_fp_list, node) { if (file_inode(filp) != file_inode(prev_fp->filp)) continue; @@ -722,7 +722,7 @@ int ksmbd_smb_check_shared_mode(struct file *filp, struct ksmbd_file *curr_fp) break; } } - read_unlock(&curr_fp->f_ci->m_lock); + up_read(&curr_fp->f_ci->m_lock); return rc; } diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c index 002a3f0dc7c5..6633fa78e9b9 100644 --- a/fs/smb/server/transport_tcp.c +++ b/fs/smb/server/transport_tcp.c @@ -448,6 +448,10 @@ static int create_socket(struct interface *iface) sin6.sin6_family = PF_INET6; sin6.sin6_addr = in6addr_any; sin6.sin6_port = htons(server_conf.tcp_port); + + lock_sock(ksmbd_socket->sk); + ksmbd_socket->sk->sk_ipv6only = false; + release_sock(ksmbd_socket->sk); } ksmbd_tcp_nodelay(ksmbd_socket); diff --git a/fs/smb/server/vfs_cache.c b/fs/smb/server/vfs_cache.c index 030f70700036..6cb599cd287e 100644 --- a/fs/smb/server/vfs_cache.c +++ b/fs/smb/server/vfs_cache.c @@ -165,7 +165,7 @@ static int ksmbd_inode_init(struct ksmbd_inode *ci, struct ksmbd_file *fp) ci->m_fattr = 0; INIT_LIST_HEAD(&ci->m_fp_list); INIT_LIST_HEAD(&ci->m_op_list); - rwlock_init(&ci->m_lock); + init_rwsem(&ci->m_lock); ci->m_de = fp->filp->f_path.dentry; return 0; } @@ -261,14 +261,14 @@ static void __ksmbd_inode_close(struct ksmbd_file *fp) } if (atomic_dec_and_test(&ci->m_count)) { - write_lock(&ci->m_lock); + down_write(&ci->m_lock); if (ci->m_flags & (S_DEL_ON_CLS | S_DEL_PENDING)) { ci->m_flags &= ~(S_DEL_ON_CLS | S_DEL_PENDING); - write_unlock(&ci->m_lock); + up_write(&ci->m_lock); ksmbd_vfs_unlink(filp); - write_lock(&ci->m_lock); + down_write(&ci->m_lock); } - write_unlock(&ci->m_lock); + up_write(&ci->m_lock); ksmbd_inode_free(ci); } @@ -289,9 +289,9 @@ static void __ksmbd_remove_fd(struct ksmbd_file_table *ft, struct ksmbd_file *fp if (!has_file_id(fp->volatile_id)) return; - write_lock(&fp->f_ci->m_lock); + down_write(&fp->f_ci->m_lock); list_del_init(&fp->node); - write_unlock(&fp->f_ci->m_lock); + up_write(&fp->f_ci->m_lock); write_lock(&ft->lock); idr_remove(ft->idr, fp->volatile_id); @@ -523,17 +523,17 @@ struct ksmbd_file *ksmbd_lookup_fd_inode(struct dentry *dentry) if (!ci) return NULL; - read_lock(&ci->m_lock); + down_read(&ci->m_lock); list_for_each_entry(lfp, &ci->m_fp_list, node) { if (inode == file_inode(lfp->filp)) { atomic_dec(&ci->m_count); lfp = ksmbd_fp_get(lfp); - read_unlock(&ci->m_lock); + up_read(&ci->m_lock); return lfp; } } atomic_dec(&ci->m_count); - read_unlock(&ci->m_lock); + up_read(&ci->m_lock); return NULL; } @@ -705,13 +705,13 @@ static bool session_fd_check(struct ksmbd_tree_connect *tcon, conn = fp->conn; ci = fp->f_ci; - write_lock(&ci->m_lock); + down_write(&ci->m_lock); list_for_each_entry_rcu(op, &ci->m_op_list, op_entry) { if (op->conn != conn) continue; op->conn = NULL; } - write_unlock(&ci->m_lock); + up_write(&ci->m_lock); fp->conn = NULL; fp->tcon = NULL; @@ -801,13 +801,13 @@ int ksmbd_reopen_durable_fd(struct ksmbd_work *work, struct ksmbd_file *fp) fp->tcon = work->tcon; ci = fp->f_ci; - write_lock(&ci->m_lock); + down_write(&ci->m_lock); list_for_each_entry_rcu(op, &ci->m_op_list, op_entry) { if (op->conn) continue; op->conn = fp->conn; } - write_unlock(&ci->m_lock); + up_write(&ci->m_lock); __open_id(&work->sess->file_table, fp, OPEN_ID_TYPE_VOLATILE_ID); if (!has_file_id(fp->volatile_id)) { diff --git a/fs/smb/server/vfs_cache.h b/fs/smb/server/vfs_cache.h index ed44fb4e18e7..5a225e7055f1 100644 --- a/fs/smb/server/vfs_cache.h +++ b/fs/smb/server/vfs_cache.h @@ -47,7 +47,7 @@ struct stream { }; struct ksmbd_inode { - rwlock_t m_lock; + struct rw_semaphore m_lock; atomic_t m_count; atomic_t op_count; /* opinfo count for streams */ diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index 894c6ca1e500..a878cea70f4c 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -37,6 +37,7 @@ static DEFINE_MUTEX(eventfs_mutex); struct eventfs_root_inode { struct eventfs_inode ei; + struct inode *parent_inode; struct dentry *events_dir; }; @@ -68,11 +69,25 @@ enum { EVENTFS_SAVE_MODE = BIT(16), EVENTFS_SAVE_UID = BIT(17), EVENTFS_SAVE_GID = BIT(18), - EVENTFS_TOPLEVEL = BIT(19), }; #define EVENTFS_MODE_MASK (EVENTFS_SAVE_MODE - 1) +static void free_ei_rcu(struct rcu_head *rcu) +{ + struct eventfs_inode *ei = container_of(rcu, struct eventfs_inode, rcu); + struct eventfs_root_inode *rei; + + kfree(ei->entry_attrs); + kfree_const(ei->name); + if (ei->is_events) { + rei = get_root_inode(ei); + kfree(rei); + } else { + kfree(ei); + } +} + /* * eventfs_inode reference count management. * @@ -84,18 +99,17 @@ enum { static void release_ei(struct kref *ref) { struct eventfs_inode *ei = container_of(ref, struct eventfs_inode, kref); - struct eventfs_root_inode *rei; + const struct eventfs_entry *entry; WARN_ON_ONCE(!ei->is_freed); - kfree(ei->entry_attrs); - kfree_const(ei->name); - if (ei->is_events) { - rei = get_root_inode(ei); - kfree_rcu(rei, ei.rcu); - } else { - kfree_rcu(ei, rcu); + for (int i = 0; i < ei->nr_entries; i++) { + entry = &ei->entries[i]; + if (entry->release) + entry->release(entry->name, ei->data); } + + call_rcu(&ei->rcu, free_ei_rcu); } static inline void put_ei(struct eventfs_inode *ei) @@ -112,6 +126,18 @@ static inline void free_ei(struct eventfs_inode *ei) } } +/* + * Called when creation of an ei fails, do not call release() functions. + */ +static inline void cleanup_ei(struct eventfs_inode *ei) +{ + if (ei) { + /* Set nr_entries to 0 to prevent release() function being called */ + ei->nr_entries = 0; + free_ei(ei); + } +} + static inline struct eventfs_inode *get_ei(struct eventfs_inode *ei) { if (ei) @@ -181,21 +207,7 @@ static int eventfs_set_attr(struct mnt_idmap *idmap, struct dentry *dentry, * determined by the parent directory. */ if (dentry->d_inode->i_mode & S_IFDIR) { - /* - * The events directory dentry is never freed, unless its - * part of an instance that is deleted. It's attr is the - * default for its child files and directories. - * Do not update it. It's not used for its own mode or ownership. - */ - if (ei->is_events) { - /* But it still needs to know if it was modified */ - if (iattr->ia_valid & ATTR_UID) - ei->attr.mode |= EVENTFS_SAVE_UID; - if (iattr->ia_valid & ATTR_GID) - ei->attr.mode |= EVENTFS_SAVE_GID; - } else { - update_attr(&ei->attr, iattr); - } + update_attr(&ei->attr, iattr); } else { name = dentry->d_name.name; @@ -213,18 +225,25 @@ static int eventfs_set_attr(struct mnt_idmap *idmap, struct dentry *dentry, return ret; } -static void update_top_events_attr(struct eventfs_inode *ei, struct super_block *sb) +static void update_events_attr(struct eventfs_inode *ei, struct super_block *sb) { - struct inode *root; + struct eventfs_root_inode *rei; + struct inode *parent; - /* Only update if the "events" was on the top level */ - if (!ei || !(ei->attr.mode & EVENTFS_TOPLEVEL)) - return; + rei = get_root_inode(ei); - /* Get the tracefs root inode. */ - root = d_inode(sb->s_root); - ei->attr.uid = root->i_uid; - ei->attr.gid = root->i_gid; + /* Use the parent inode permissions unless root set its permissions */ + parent = rei->parent_inode; + + if (rei->ei.attr.mode & EVENTFS_SAVE_UID) + ei->attr.uid = rei->ei.attr.uid; + else + ei->attr.uid = parent->i_uid; + + if (rei->ei.attr.mode & EVENTFS_SAVE_GID) + ei->attr.gid = rei->ei.attr.gid; + else + ei->attr.gid = parent->i_gid; } static void set_top_events_ownership(struct inode *inode) @@ -233,10 +252,10 @@ static void set_top_events_ownership(struct inode *inode) struct eventfs_inode *ei = ti->private; /* The top events directory doesn't get automatically updated */ - if (!ei || !ei->is_events || !(ei->attr.mode & EVENTFS_TOPLEVEL)) + if (!ei || !ei->is_events) return; - update_top_events_attr(ei, inode->i_sb); + update_events_attr(ei, inode->i_sb); if (!(ei->attr.mode & EVENTFS_SAVE_UID)) inode->i_uid = ei->attr.uid; @@ -265,7 +284,7 @@ static int eventfs_permission(struct mnt_idmap *idmap, return generic_permission(idmap, inode, mask); } -static const struct inode_operations eventfs_root_dir_inode_operations = { +static const struct inode_operations eventfs_dir_inode_operations = { .lookup = eventfs_root_lookup, .setattr = eventfs_set_attr, .getattr = eventfs_get_attr, @@ -282,6 +301,35 @@ static const struct file_operations eventfs_file_operations = { .llseek = generic_file_llseek, }; +/* + * On a remount of tracefs, if UID or GID options are set, then + * the mount point inode permissions should be used. + * Reset the saved permission flags appropriately. + */ +void eventfs_remount(struct tracefs_inode *ti, bool update_uid, bool update_gid) +{ + struct eventfs_inode *ei = ti->private; + + if (!ei) + return; + + if (update_uid) + ei->attr.mode &= ~EVENTFS_SAVE_UID; + + if (update_gid) + ei->attr.mode &= ~EVENTFS_SAVE_GID; + + if (!ei->entry_attrs) + return; + + for (int i = 0; i < ei->nr_entries; i++) { + if (update_uid) + ei->entry_attrs[i].mode &= ~EVENTFS_SAVE_UID; + if (update_gid) + ei->entry_attrs[i].mode &= ~EVENTFS_SAVE_GID; + } +} + /* Return the evenfs_inode of the "events" directory */ static struct eventfs_inode *eventfs_find_events(struct dentry *dentry) { @@ -304,7 +352,7 @@ static struct eventfs_inode *eventfs_find_events(struct dentry *dentry) // Walk upwards until you find the events inode } while (!ei->is_events); - update_top_events_attr(ei, dentry->d_sb); + update_events_attr(ei, dentry->d_sb); return ei; } @@ -410,7 +458,7 @@ static struct dentry *lookup_dir_entry(struct dentry *dentry, update_inode_attr(dentry, inode, &ei->attr, S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO); - inode->i_op = &eventfs_root_dir_inode_operations; + inode->i_op = &eventfs_dir_inode_operations; inode->i_fop = &eventfs_file_operations; /* All directories will have the same inode number */ @@ -734,7 +782,7 @@ struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode /* Was the parent freed? */ if (list_empty(&ei->list)) { - free_ei(ei); + cleanup_ei(ei); ei = NULL; } return ei; @@ -781,6 +829,7 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry // Note: we have a ref to the dentry from tracefs_start_creating() rei = get_root_inode(ei); rei->events_dir = dentry; + rei->parent_inode = d_inode(dentry->d_sb->s_root); ei->entries = entries; ei->nr_entries = size; @@ -790,29 +839,26 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry uid = d_inode(dentry->d_parent)->i_uid; gid = d_inode(dentry->d_parent)->i_gid; - /* - * If the events directory is of the top instance, then parent - * is NULL. Set the attr.mode to reflect this and its permissions will - * default to the tracefs root dentry. - */ - if (!parent) - ei->attr.mode = EVENTFS_TOPLEVEL; - - /* This is used as the default ownership of the files and directories */ ei->attr.uid = uid; ei->attr.gid = gid; + /* + * When the "events" directory is created, it takes on the + * permissions of its parent. But can be reset on remount. + */ + ei->attr.mode |= EVENTFS_SAVE_UID | EVENTFS_SAVE_GID; + INIT_LIST_HEAD(&ei->children); INIT_LIST_HEAD(&ei->list); ti = get_tracefs(inode); - ti->flags |= TRACEFS_EVENT_INODE | TRACEFS_EVENT_TOP_INODE; + ti->flags |= TRACEFS_EVENT_INODE; ti->private = ei; inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; inode->i_uid = uid; inode->i_gid = gid; - inode->i_op = &eventfs_root_dir_inode_operations; + inode->i_op = &eventfs_dir_inode_operations; inode->i_fop = &eventfs_file_operations; dentry->d_fsdata = get_ei(ei); @@ -835,7 +881,7 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry return ei; fail: - free_ei(ei); + cleanup_ei(ei); tracefs_failed_creating(dentry); return ERR_PTR(-ENOMEM); } diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 5545e6bf7d26..417c840e6403 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -30,20 +30,47 @@ static struct vfsmount *tracefs_mount; static int tracefs_mount_count; static bool tracefs_registered; +/* + * Keep track of all tracefs_inodes in order to update their + * flags if necessary on a remount. + */ +static DEFINE_SPINLOCK(tracefs_inode_lock); +static LIST_HEAD(tracefs_inodes); + static struct inode *tracefs_alloc_inode(struct super_block *sb) { struct tracefs_inode *ti; + unsigned long flags; ti = kmem_cache_alloc(tracefs_inode_cachep, GFP_KERNEL); if (!ti) return NULL; + spin_lock_irqsave(&tracefs_inode_lock, flags); + list_add_rcu(&ti->list, &tracefs_inodes); + spin_unlock_irqrestore(&tracefs_inode_lock, flags); + return &ti->vfs_inode; } +static void tracefs_free_inode_rcu(struct rcu_head *rcu) +{ + struct tracefs_inode *ti; + + ti = container_of(rcu, struct tracefs_inode, rcu); + kmem_cache_free(tracefs_inode_cachep, ti); +} + static void tracefs_free_inode(struct inode *inode) { - kmem_cache_free(tracefs_inode_cachep, get_tracefs(inode)); + struct tracefs_inode *ti = get_tracefs(inode); + unsigned long flags; + + spin_lock_irqsave(&tracefs_inode_lock, flags); + list_del_rcu(&ti->list); + spin_unlock_irqrestore(&tracefs_inode_lock, flags); + + call_rcu(&ti->rcu, tracefs_free_inode_rcu); } static ssize_t default_read_file(struct file *file, char __user *buf, @@ -153,16 +180,39 @@ static void set_tracefs_inode_owner(struct inode *inode) { struct tracefs_inode *ti = get_tracefs(inode); struct inode *root_inode = ti->private; + kuid_t uid; + kgid_t gid; + + uid = root_inode->i_uid; + gid = root_inode->i_gid; + + /* + * If the root is not the mount point, then check the root's + * permissions. If it was never set, then default to the + * mount point. + */ + if (root_inode != d_inode(root_inode->i_sb->s_root)) { + struct tracefs_inode *rti; + + rti = get_tracefs(root_inode); + root_inode = d_inode(root_inode->i_sb->s_root); + + if (!(rti->flags & TRACEFS_UID_PERM_SET)) + uid = root_inode->i_uid; + + if (!(rti->flags & TRACEFS_GID_PERM_SET)) + gid = root_inode->i_gid; + } /* * If this inode has never been referenced, then update * the permissions to the superblock. */ if (!(ti->flags & TRACEFS_UID_PERM_SET)) - inode->i_uid = root_inode->i_uid; + inode->i_uid = uid; if (!(ti->flags & TRACEFS_GID_PERM_SET)) - inode->i_gid = root_inode->i_gid; + inode->i_gid = gid; } static int tracefs_permission(struct mnt_idmap *idmap, @@ -313,6 +363,8 @@ static int tracefs_apply_options(struct super_block *sb, bool remount) struct tracefs_fs_info *fsi = sb->s_fs_info; struct inode *inode = d_inode(sb->s_root); struct tracefs_mount_opts *opts = &fsi->mount_opts; + struct tracefs_inode *ti; + bool update_uid, update_gid; umode_t tmp_mode; /* @@ -332,6 +384,25 @@ static int tracefs_apply_options(struct super_block *sb, bool remount) if (!remount || opts->opts & BIT(Opt_gid)) inode->i_gid = opts->gid; + if (remount && (opts->opts & BIT(Opt_uid) || opts->opts & BIT(Opt_gid))) { + + update_uid = opts->opts & BIT(Opt_uid); + update_gid = opts->opts & BIT(Opt_gid); + + rcu_read_lock(); + list_for_each_entry_rcu(ti, &tracefs_inodes, list) { + if (update_uid) + ti->flags &= ~TRACEFS_UID_PERM_SET; + + if (update_gid) + ti->flags &= ~TRACEFS_GID_PERM_SET; + + if (ti->flags & TRACEFS_EVENT_INODE) + eventfs_remount(ti, update_uid, update_gid); + } + rcu_read_unlock(); + } + return 0; } @@ -398,7 +469,22 @@ static int tracefs_d_revalidate(struct dentry *dentry, unsigned int flags) return !(ei && ei->is_freed); } +static void tracefs_d_iput(struct dentry *dentry, struct inode *inode) +{ + struct tracefs_inode *ti = get_tracefs(inode); + + /* + * This inode is being freed and cannot be used for + * eventfs. Clear the flag so that it doesn't call into + * eventfs during the remount flag updates. The eventfs_inode + * gets freed after an RCU cycle, so the content will still + * be safe if the iteration is going on now. + */ + ti->flags &= ~TRACEFS_EVENT_INODE; +} + static const struct dentry_operations tracefs_dentry_operations = { + .d_iput = tracefs_d_iput, .d_revalidate = tracefs_d_revalidate, .d_release = tracefs_d_release, }; diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h index 15c26f9aaad4..f704d8348357 100644 --- a/fs/tracefs/internal.h +++ b/fs/tracefs/internal.h @@ -4,15 +4,18 @@ enum { TRACEFS_EVENT_INODE = BIT(1), - TRACEFS_EVENT_TOP_INODE = BIT(2), - TRACEFS_GID_PERM_SET = BIT(3), - TRACEFS_UID_PERM_SET = BIT(4), - TRACEFS_INSTANCE_INODE = BIT(5), + TRACEFS_GID_PERM_SET = BIT(2), + TRACEFS_UID_PERM_SET = BIT(3), + TRACEFS_INSTANCE_INODE = BIT(4), }; struct tracefs_inode { - struct inode vfs_inode; + union { + struct inode vfs_inode; + struct rcu_head rcu; + }; /* The below gets initialized with memset_after(ti, 0, vfs_inode) */ + struct list_head list; unsigned long flags; void *private; }; @@ -73,6 +76,7 @@ struct dentry *tracefs_end_creating(struct dentry *dentry); struct dentry *tracefs_failed_creating(struct dentry *dentry); struct inode *tracefs_get_inode(struct super_block *sb); +void eventfs_remount(struct tracefs_inode *ti, bool update_uid, bool update_gid); void eventfs_d_release(struct dentry *dentry); #endif /* _TRACEFS_INTERNAL_H */ |