Diffstat (limited to 'fs'): 567 files changed, 23393 insertions, 15072 deletions
diff --git a/fs/Kconfig b/fs/Kconfig index 312393f32948..db5dc1598716 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -233,9 +233,13 @@ if NETWORK_FILESYSTEMS source "fs/nfs/Kconfig" source "fs/nfsd/Kconfig" +config GRACE_PERIOD + tristate + config LOCKD tristate depends on FILE_LOCKING + select GRACE_PERIOD config LOCKD_V4 bool @@ -249,7 +253,7 @@ config NFS_ACL_SUPPORT config NFS_COMMON bool - depends on NFSD || NFS_FS + depends on NFSD || NFS_FS || LOCKD default y source "net/sunrpc/Kconfig" diff --git a/fs/Makefile b/fs/Makefile index 4030cbfbc9af..90c88529892b 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \ attr.o bad_inode.o file.o filesystems.o namespace.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ pnode.o splice.o sync.o utimes.o \ - stack.o fs_struct.o statfs.o + stack.o fs_struct.o statfs.o fs_pin.o ifeq ($(CONFIG_BLOCK),y) obj-y += buffer.o block_dev.o direct-io.o mpage.o diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h index c770337c4b45..24575d9d882d 100644 --- a/fs/adfs/adfs.h +++ b/fs/adfs/adfs.h @@ -153,6 +153,7 @@ extern int adfs_map_lookup(struct super_block *sb, unsigned int frag_id, unsigne extern unsigned int adfs_map_free(struct super_block *sb); /* Misc */ +__printf(3, 4) void __adfs_error(struct super_block *sb, const char *function, const char *fmt, ...); #define adfs_error(sb, fmt...) __adfs_error(sb, __func__, fmt) diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c index 0d138c0de293..51c279a29845 100644 --- a/fs/adfs/dir.c +++ b/fs/adfs/dir.c @@ -138,7 +138,7 @@ adfs_dir_lookup_byname(struct inode *inode, struct qstr *name, struct object_inf goto out; if (ADFS_I(inode)->parent_id != dir.parent_id) { - adfs_error(sb, "parent directory changed under me! (%lx but got %lx)\n", + adfs_error(sb, "parent directory changed under me! 
(%lx but got %x)\n", ADFS_I(inode)->parent_id, dir.parent_id); ret = -EIO; goto free_out; diff --git a/fs/adfs/dir_fplus.c b/fs/adfs/dir_fplus.c index d9e3bee4e653..f2ba88ab4aed 100644 --- a/fs/adfs/dir_fplus.c +++ b/fs/adfs/dir_fplus.c @@ -55,10 +55,10 @@ adfs_fplus_read(struct super_block *sb, unsigned int id, unsigned int sz, struct } size >>= sb->s_blocksize_bits; - if (size > sizeof(dir->bh)/sizeof(dir->bh[0])) { + if (size > ARRAY_SIZE(dir->bh)) { /* this directory is too big for fixed bh set, must allocate */ struct buffer_head **bh_fplus = - kzalloc(size * sizeof(struct buffer_head *), + kcalloc(size, sizeof(struct buffer_head *), GFP_KERNEL); if (!bh_fplus) { adfs_error(sb, "not enough memory for" @@ -79,9 +79,8 @@ adfs_fplus_read(struct super_block *sb, unsigned int id, unsigned int sz, struct dir->bh_fplus[blk] = sb_bread(sb, block); if (!dir->bh_fplus[blk]) { - adfs_error(sb, "dir object %X failed read for" - " offset %d, mapped block %X", - id, blk, block); + adfs_error(sb, "dir object %x failed read for offset %d, mapped block %lX", + id, blk, block); goto out; } @@ -141,6 +141,7 @@ struct kioctx { struct { unsigned tail; + unsigned completed_events; spinlock_t completion_lock; } ____cacheline_aligned_in_smp; @@ -192,7 +193,6 @@ static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages) } file->f_flags = O_RDWR; - file->private_data = ctx; return file; } @@ -202,7 +202,7 @@ static struct dentry *aio_mount(struct file_system_type *fs_type, static const struct dentry_operations ops = { .d_dname = simple_dname, }; - return mount_pseudo(fs_type, "aio:", NULL, &ops, 0xa10a10a1); + return mount_pseudo(fs_type, "aio:", NULL, &ops, AIO_RING_MAGIC); } /* aio_setup @@ -506,6 +506,8 @@ static void free_ioctx(struct work_struct *work) aio_free_ring(ctx); free_percpu(ctx->cpu); + percpu_ref_exit(&ctx->reqs); + percpu_ref_exit(&ctx->users); kmem_cache_free(kioctx_cachep, ctx); } @@ -554,8 +556,7 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) struct aio_ring *ring; spin_lock(&mm->ioctx_lock); - rcu_read_lock(); - table = rcu_dereference(mm->ioctx_table); + table = rcu_dereference_raw(mm->ioctx_table); while (1) { if (table) @@ -563,7 +564,6 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) if (!table->table[i]) { ctx->id = i; table->table[i] = ctx; - rcu_read_unlock(); spin_unlock(&mm->ioctx_lock); /* While kioctx setup is in progress, @@ -577,8 +577,6 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) } new_nr = (table ? 
table->nr : 1) * 4; - - rcu_read_unlock(); spin_unlock(&mm->ioctx_lock); table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) * @@ -589,8 +587,7 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) table->nr = new_nr; spin_lock(&mm->ioctx_lock); - rcu_read_lock(); - old = rcu_dereference(mm->ioctx_table); + old = rcu_dereference_raw(mm->ioctx_table); if (!old) { rcu_assign_pointer(mm->ioctx_table, table); @@ -664,10 +661,10 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) INIT_LIST_HEAD(&ctx->active_reqs); - if (percpu_ref_init(&ctx->users, free_ioctx_users)) + if (percpu_ref_init(&ctx->users, free_ioctx_users, 0, GFP_KERNEL)) goto err; - if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs)) + if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs, 0, GFP_KERNEL)) goto err; ctx->cpu = alloc_percpu(struct kioctx_cpu); @@ -715,8 +712,8 @@ err_ctx: err: mutex_unlock(&ctx->ring_lock); free_percpu(ctx->cpu); - free_percpu(ctx->reqs.pcpu_count); - free_percpu(ctx->users.pcpu_count); + percpu_ref_exit(&ctx->reqs); + percpu_ref_exit(&ctx->users); kmem_cache_free(kioctx_cachep, ctx); pr_debug("error allocating ioctx %d\n", err); return ERR_PTR(err); @@ -737,12 +734,9 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx, spin_lock(&mm->ioctx_lock); - rcu_read_lock(); - table = rcu_dereference(mm->ioctx_table); - + table = rcu_dereference_raw(mm->ioctx_table); WARN_ON(ctx != table->table[ctx->id]); table->table[ctx->id] = NULL; - rcu_read_unlock(); spin_unlock(&mm->ioctx_lock); /* percpu_ref_kill() will do the necessary call_rcu() */ @@ -791,40 +785,35 @@ EXPORT_SYMBOL(wait_on_sync_kiocb); */ void exit_aio(struct mm_struct *mm) { - struct kioctx_table *table; - struct kioctx *ctx; - unsigned i = 0; - - while (1) { - rcu_read_lock(); - table = rcu_dereference(mm->ioctx_table); - - do { - if (!table || i >= table->nr) { - rcu_read_unlock(); - rcu_assign_pointer(mm->ioctx_table, NULL); - if (table) - kfree(table); - return; - } + struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table); + int i; - ctx = table->table[i++]; - } while (!ctx); + if (!table) + return; - rcu_read_unlock(); + for (i = 0; i < table->nr; ++i) { + struct kioctx *ctx = table->table[i]; + struct completion requests_done = + COMPLETION_INITIALIZER_ONSTACK(requests_done); + if (!ctx) + continue; /* - * We don't need to bother with munmap() here - - * exit_mmap(mm) is coming and it'll unmap everything. - * Since aio_free_ring() uses non-zero ->mmap_size - * as indicator that it needs to unmap the area, - * just set it to 0; aio_free_ring() is the only - * place that uses ->mmap_size, so it's safe. + * We don't need to bother with munmap() here - exit_mmap(mm) + * is coming and it'll unmap everything. And we simply can't, + * this is not necessarily our ->mm. + * Since kill_ioctx() uses non-zero ->mmap_size as indicator + * that it needs to unmap the area, just set it to 0. */ ctx->mmap_size = 0; + kill_ioctx(mm, ctx, &requests_done); - kill_ioctx(mm, ctx, NULL); + /* Wait until all IO for the context are done. 
*/ + wait_for_completion(&requests_done); } + + RCU_INIT_POINTER(mm->ioctx_table, NULL); + kfree(table); } static void put_reqs_available(struct kioctx *ctx, unsigned nr) @@ -832,10 +821,8 @@ static void put_reqs_available(struct kioctx *ctx, unsigned nr) struct kioctx_cpu *kcpu; unsigned long flags; - preempt_disable(); - kcpu = this_cpu_ptr(ctx->cpu); - local_irq_save(flags); + kcpu = this_cpu_ptr(ctx->cpu); kcpu->reqs_available += nr; while (kcpu->reqs_available >= ctx->req_batch * 2) { @@ -844,7 +831,6 @@ static void put_reqs_available(struct kioctx *ctx, unsigned nr) } local_irq_restore(flags); - preempt_enable(); } static bool get_reqs_available(struct kioctx *ctx) @@ -853,10 +839,8 @@ static bool get_reqs_available(struct kioctx *ctx) bool ret = false; unsigned long flags; - preempt_disable(); - kcpu = this_cpu_ptr(ctx->cpu); - local_irq_save(flags); + kcpu = this_cpu_ptr(ctx->cpu); if (!kcpu->reqs_available) { int old, avail = atomic_read(&ctx->reqs_available); @@ -876,10 +860,71 @@ static bool get_reqs_available(struct kioctx *ctx) kcpu->reqs_available--; out: local_irq_restore(flags); - preempt_enable(); return ret; } +/* refill_reqs_available + * Updates the reqs_available reference counts used for tracking the + * number of free slots in the completion ring. This can be called + * from aio_complete() (to optimistically update reqs_available) or + * from aio_get_req() (the we're out of events case). It must be + * called holding ctx->completion_lock. + */ +static void refill_reqs_available(struct kioctx *ctx, unsigned head, + unsigned tail) +{ + unsigned events_in_ring, completed; + + /* Clamp head since userland can write to it. */ + head %= ctx->nr_events; + if (head <= tail) + events_in_ring = tail - head; + else + events_in_ring = ctx->nr_events - (head - tail); + + completed = ctx->completed_events; + if (events_in_ring < completed) + completed -= events_in_ring; + else + completed = 0; + + if (!completed) + return; + + ctx->completed_events -= completed; + put_reqs_available(ctx, completed); +} + +/* user_refill_reqs_available + * Called to refill reqs_available when aio_get_req() encounters an + * out of space in the completion ring. + */ +static void user_refill_reqs_available(struct kioctx *ctx) +{ + spin_lock_irq(&ctx->completion_lock); + if (ctx->completed_events) { + struct aio_ring *ring; + unsigned head; + + /* Access of ring->head may race with aio_read_events_ring() + * here, but that's okay since whether we read the old version + * or the new version, and either will be valid. The important + * part is that head cannot pass tail since we prevent + * aio_complete() from updating tail by holding + * ctx->completion_lock. Even if head is invalid, the check + * against ctx->completed_events below will make sure we do the + * safe/right thing. + */ + ring = kmap_atomic(ctx->ring_pages[0]); + head = ring->head; + kunmap_atomic(ring); + + refill_reqs_available(ctx, head, ctx->tail); + } + + spin_unlock_irq(&ctx->completion_lock); +} + /* aio_get_req * Allocate a slot for an aio request. * Returns NULL if no requests are free. 
@@ -888,8 +933,11 @@ static inline struct kiocb *aio_get_req(struct kioctx *ctx) { struct kiocb *req; - if (!get_reqs_available(ctx)) - return NULL; + if (!get_reqs_available(ctx)) { + user_refill_reqs_available(ctx); + if (!get_reqs_available(ctx)) + return NULL; + } req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO); if (unlikely(!req)) @@ -948,8 +996,8 @@ void aio_complete(struct kiocb *iocb, long res, long res2) struct kioctx *ctx = iocb->ki_ctx; struct aio_ring *ring; struct io_event *ev_page, *event; + unsigned tail, pos, head; unsigned long flags; - unsigned tail, pos; /* * Special case handling for sync iocbs: @@ -1010,10 +1058,14 @@ void aio_complete(struct kiocb *iocb, long res, long res2) ctx->tail = tail; ring = kmap_atomic(ctx->ring_pages[0]); + head = ring->head; ring->tail = tail; kunmap_atomic(ring); flush_dcache_page(ctx->ring_pages[0]); + ctx->completed_events++; + if (ctx->completed_events > 1) + refill_reqs_available(ctx, head, tail); spin_unlock_irqrestore(&ctx->completion_lock, flags); pr_debug("added to ring %p at [%u]\n", iocb, tail); @@ -1028,7 +1080,6 @@ void aio_complete(struct kiocb *iocb, long res, long res2) /* everything turned out well, dispose of the aiocb. */ kiocb_free(iocb); - put_reqs_available(ctx, 1); /* * We have to order our ring_info tail store above and test @@ -1045,7 +1096,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2) } EXPORT_SYMBOL(aio_complete); -/* aio_read_events +/* aio_read_events_ring * Pull an event off of the ioctx's event ring. Returns the number of * events fetched */ @@ -1065,6 +1116,12 @@ static long aio_read_events_ring(struct kioctx *ctx, tail = ring->tail; kunmap_atomic(ring); + /* + * Ensure that once we've read the current tail pointer, that + * we also see the events that were stored up to the tail. + */ + smp_rmb(); + pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events); if (head == tail) @@ -1268,12 +1325,12 @@ static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb, if (compat) ret = compat_rw_copy_check_uvector(rw, (struct compat_iovec __user *)buf, - *nr_segs, 1, *iovec, iovec); + *nr_segs, UIO_FASTIOV, *iovec, iovec); else #endif ret = rw_copy_check_uvector(rw, (struct iovec __user *)buf, - *nr_segs, 1, *iovec, iovec); + *nr_segs, UIO_FASTIOV, *iovec, iovec); if (ret < 0) return ret; @@ -1297,9 +1354,8 @@ static ssize_t aio_setup_single_vector(struct kiocb *kiocb, } /* - * aio_setup_iocb: - * Performs the initial checks and aio retry method - * setup for the kiocb at the time of io submission. + * aio_run_iocb: + * Performs the initial checks and io submission. 
*/ static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode, char __user *buf, bool compat) @@ -1311,7 +1367,7 @@ static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode, fmode_t mode; aio_rw_op *rw_op; rw_iter_op *iter_op; - struct iovec inline_vec, *iovec = &inline_vec; + struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct iov_iter iter; switch (opcode) { @@ -1346,7 +1402,7 @@ rw_common: if (!ret) ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes); if (ret < 0) { - if (iovec != &inline_vec) + if (iovec != inline_vecs) kfree(iovec); return ret; } @@ -1393,7 +1449,7 @@ rw_common: return -EINVAL; } - if (iovec != &inline_vec) + if (iovec != inline_vecs) kfree(iovec); if (ret != -EIOCBQUEUED) { diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index acf32054edd8..9e359fb20c0a 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -143,20 +143,6 @@ static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) { return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp; } -/* Does a dentry have some pending activity? */ -static inline int autofs4_ispending(struct dentry *dentry) -{ - struct autofs_info *inf = autofs4_dentry_ino(dentry); - - if (inf->flags & AUTOFS_INF_PENDING) - return 1; - - if (inf->flags & AUTOFS_INF_EXPIRING) - return 1; - - return 0; -} - struct inode *autofs4_get_inode(struct super_block *, umode_t); void autofs4_free_ino(struct autofs_info *); @@ -191,55 +177,6 @@ extern const struct file_operations autofs4_root_operations; extern const struct dentry_operations autofs4_dentry_operations; /* VFS automount flags management functions */ - -static inline void __managed_dentry_set_automount(struct dentry *dentry) -{ - dentry->d_flags |= DCACHE_NEED_AUTOMOUNT; -} - -static inline void managed_dentry_set_automount(struct dentry *dentry) -{ - spin_lock(&dentry->d_lock); - __managed_dentry_set_automount(dentry); - spin_unlock(&dentry->d_lock); -} - -static inline void __managed_dentry_clear_automount(struct dentry *dentry) -{ - dentry->d_flags &= ~DCACHE_NEED_AUTOMOUNT; -} - -static inline void managed_dentry_clear_automount(struct dentry *dentry) -{ - spin_lock(&dentry->d_lock); - __managed_dentry_clear_automount(dentry); - spin_unlock(&dentry->d_lock); -} - -static inline void __managed_dentry_set_transit(struct dentry *dentry) -{ - dentry->d_flags |= DCACHE_MANAGE_TRANSIT; -} - -static inline void managed_dentry_set_transit(struct dentry *dentry) -{ - spin_lock(&dentry->d_lock); - __managed_dentry_set_transit(dentry); - spin_unlock(&dentry->d_lock); -} - -static inline void __managed_dentry_clear_transit(struct dentry *dentry) -{ - dentry->d_flags &= ~DCACHE_MANAGE_TRANSIT; -} - -static inline void managed_dentry_clear_transit(struct dentry *dentry) -{ - spin_lock(&dentry->d_lock); - __managed_dentry_clear_transit(dentry); - spin_unlock(&dentry->d_lock); -} - static inline void __managed_dentry_set_managed(struct dentry *dentry) { dentry->d_flags |= (DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT); diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 394e90b02c5e..a7be57e39be7 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -333,7 +333,6 @@ struct dentry *autofs4_expire_direct(struct super_block *sb, if (ino->flags & AUTOFS_INF_PENDING) goto out; if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { - struct autofs_info *ino = autofs4_dentry_ino(root); ino->flags |= AUTOFS_INF_EXPIRING; init_completion(&ino->expire_complete); spin_unlock(&sbi->fs_lock); diff --git a/fs/autofs4/root.c 
b/fs/autofs4/root.c index cc87c1abac97..cdb25ebccc4c 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -166,8 +166,10 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry) const unsigned char *str = name->name; struct list_head *p, *head; - spin_lock(&sbi->lookup_lock); head = &sbi->active_list; + if (list_empty(head)) + return NULL; + spin_lock(&sbi->lookup_lock); list_for_each(p, head) { struct autofs_info *ino; struct dentry *active; @@ -218,8 +220,10 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry) const unsigned char *str = name->name; struct list_head *p, *head; - spin_lock(&sbi->lookup_lock); head = &sbi->expiring_list; + if (list_empty(head)) + return NULL; + spin_lock(&sbi->lookup_lock); list_for_each(p, head) { struct autofs_info *ino; struct dentry *expiring; @@ -373,7 +377,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path) * this because the leaves of the directory tree under the * mount never trigger mounts themselves (they have an autofs * trigger mount mounted on them). But v4 pseudo direct mounts - * do need the leaves to to trigger mounts. In this case we + * do need the leaves to trigger mounts. In this case we * have no choice but to use the list_empty() check and * require user space behave. */ diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 7c93953030fb..afd2b4408adf 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -218,8 +218,9 @@ static int bad_inode_mknod (struct inode *dir, struct dentry *dentry, return -EIO; } -static int bad_inode_rename (struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) +static int bad_inode_rename2(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { return -EIO; } @@ -279,7 +280,7 @@ static const struct inode_operations bad_inode_ops = .mkdir = bad_inode_mkdir, .rmdir = bad_inode_rmdir, .mknod = bad_inode_mknod, - .rename = bad_inode_rename, + .rename2 = bad_inode_rename2, .readlink = bad_inode_readlink, /* follow_link must be no-op, otherwise unmounting this inode won't work */ diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index a16fbd4e8241..4cf61ec6b7a8 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -799,13 +799,11 @@ befs_fill_super(struct super_block *sb, void *data, int silent) befs_debug(sb, "---> %s", __func__); -#ifndef CONFIG_BEFS_RW if (!(sb->s_flags & MS_RDONLY)) { befs_warning(sb, "No write support. Marking filesystem read-only"); sb->s_flags |= MS_RDONLY; } -#endif /* CONFIG_BEFS_RW */ /* * Set dummy blocksize to read super block. 
@@ -834,16 +832,14 @@ befs_fill_super(struct super_block *sb, void *data, int silent) (befs_super_block *) ((void *) bh->b_data + x86_sb_off); } - if (befs_load_sb(sb, disk_sb) != BEFS_OK) + if ((befs_load_sb(sb, disk_sb) != BEFS_OK) || + (befs_check_sb(sb) != BEFS_OK)) goto unacquire_bh; befs_dump_super_block(sb, disk_sb); brelse(bh); - if (befs_check_sb(sb) != BEFS_OK) - goto unacquire_priv_sbp; - if( befs_sb->num_blocks > ~((sector_t)0) ) { befs_error(sb, "blocks count: %llu " "is larger than the host can use", diff --git a/fs/bfs/bfs.h b/fs/bfs/bfs.h index f7f87e233dd9..f40006db36df 100644 --- a/fs/bfs/bfs.h +++ b/fs/bfs/bfs.h @@ -46,6 +46,7 @@ static inline struct bfs_inode_info *BFS_I(struct inode *inode) /* inode.c */ extern struct inode *bfs_iget(struct super_block *sb, unsigned long ino); +extern void bfs_dump_imap(const char *, struct super_block *); /* file.c */ extern const struct inode_operations bfs_file_inops; diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index a399e6d9dc74..08063ae0a17c 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -75,8 +75,6 @@ const struct file_operations bfs_dir_operations = { .llseek = generic_file_llseek, }; -extern void dump_imap(const char *, struct super_block *); - static int bfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { @@ -110,7 +108,7 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, BFS_I(inode)->i_eblock = 0; insert_inode_hash(inode); mark_inode_dirty(inode); - dump_imap("create", s); + bfs_dump_imap("create", s); err = bfs_add_entry(dir, dentry->d_name.name, dentry->d_name.len, inode->i_ino); diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 7041ac35ace8..90bc079d9982 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -30,8 +30,6 @@ MODULE_LICENSE("GPL"); #define dprintf(x...) 
#endif -void dump_imap(const char *prefix, struct super_block *s); - struct inode *bfs_iget(struct super_block *sb, unsigned long ino) { struct bfs_inode *di; @@ -194,7 +192,7 @@ static void bfs_evict_inode(struct inode *inode) info->si_freeb += bi->i_eblock + 1 - bi->i_sblock; info->si_freei++; clear_bit(ino, info->si_imap); - dump_imap("delete_inode", s); + bfs_dump_imap("delete_inode", s); } /* @@ -297,7 +295,7 @@ static const struct super_operations bfs_sops = { .statfs = bfs_statfs, }; -void dump_imap(const char *prefix, struct super_block *s) +void bfs_dump_imap(const char *prefix, struct super_block *s) { #ifdef DEBUG int i; @@ -443,7 +441,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) } brelse(bh); brelse(sbh); - dump_imap("read_super", s); + bfs_dump_imap("read_super", s); return 0; out3: diff --git a/fs/block_dev.c b/fs/block_dev.c index 6d7274619bf9..e2f3ad0879ce 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -304,6 +304,12 @@ static int blkdev_readpage(struct file * file, struct page * page) return block_read_full_page(page, blkdev_get_block); } +static int blkdev_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, blkdev_get_block); +} + static int blkdev_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -1622,6 +1628,7 @@ static int blkdev_releasepage(struct page *page, gfp_t wait) static const struct address_space_operations def_blk_aops = { .readpage = blkdev_readpage, + .readpages = blkdev_readpages, .writepage = blkdev_writepage, .write_begin = blkdev_write_begin, .write_end = blkdev_write_end, diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 5a201d81049c..4dabeb893b7c 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -22,7 +22,6 @@ #include <linux/list.h> #include <linux/spinlock.h> #include <linux/freezer.h> -#include <linux/workqueue.h> #include "async-thread.h" #include "ctree.h" @@ -55,13 +54,45 @@ struct btrfs_workqueue { struct __btrfs_workqueue *high; }; -static inline struct __btrfs_workqueue -*__btrfs_alloc_workqueue(const char *name, int flags, int max_active, +static void normal_work_helper(struct btrfs_work *work); + +#define BTRFS_WORK_HELPER(name) \ +void btrfs_##name(struct work_struct *arg) \ +{ \ + struct btrfs_work *work = container_of(arg, struct btrfs_work, \ + normal_work); \ + normal_work_helper(work); \ +} + +BTRFS_WORK_HELPER(worker_helper); +BTRFS_WORK_HELPER(delalloc_helper); +BTRFS_WORK_HELPER(flush_delalloc_helper); +BTRFS_WORK_HELPER(cache_helper); +BTRFS_WORK_HELPER(submit_helper); +BTRFS_WORK_HELPER(fixup_helper); +BTRFS_WORK_HELPER(endio_helper); +BTRFS_WORK_HELPER(endio_meta_helper); +BTRFS_WORK_HELPER(endio_meta_write_helper); +BTRFS_WORK_HELPER(endio_raid56_helper); +BTRFS_WORK_HELPER(endio_repair_helper); +BTRFS_WORK_HELPER(rmw_helper); +BTRFS_WORK_HELPER(endio_write_helper); +BTRFS_WORK_HELPER(freespace_write_helper); +BTRFS_WORK_HELPER(delayed_meta_helper); +BTRFS_WORK_HELPER(readahead_helper); +BTRFS_WORK_HELPER(qgroup_rescan_helper); +BTRFS_WORK_HELPER(extent_refs_helper); +BTRFS_WORK_HELPER(scrub_helper); +BTRFS_WORK_HELPER(scrubwrc_helper); +BTRFS_WORK_HELPER(scrubnc_helper); + +static struct __btrfs_workqueue * +__btrfs_alloc_workqueue(const char *name, int flags, int max_active, int thresh) { struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS); 
- if (unlikely(!ret)) + if (!ret) return NULL; ret->max_active = max_active; @@ -85,7 +116,7 @@ static inline struct __btrfs_workqueue ret->normal_wq = alloc_workqueue("%s-%s", flags, ret->max_active, "btrfs", name); - if (unlikely(!ret->normal_wq)) { + if (!ret->normal_wq) { kfree(ret); return NULL; } @@ -107,12 +138,12 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, { struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS); - if (unlikely(!ret)) + if (!ret) return NULL; ret->normal = __btrfs_alloc_workqueue(name, flags & ~WQ_HIGHPRI, max_active, thresh); - if (unlikely(!ret->normal)) { + if (!ret->normal) { kfree(ret); return NULL; } @@ -120,7 +151,7 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, if (flags & WQ_HIGHPRI) { ret->high = __btrfs_alloc_workqueue(name, flags, max_active, thresh); - if (unlikely(!ret->high)) { + if (!ret->high) { __btrfs_destroy_workqueue(ret->normal); kfree(ret); return NULL; @@ -232,13 +263,11 @@ static void run_ordered_work(struct __btrfs_workqueue *wq) spin_unlock_irqrestore(lock, flags); } -static void normal_work_helper(struct work_struct *arg) +static void normal_work_helper(struct btrfs_work *work) { - struct btrfs_work *work; struct __btrfs_workqueue *wq; int need_order = 0; - work = container_of(arg, struct btrfs_work, normal_work); /* * We should not touch things inside work in the following cases: * 1) after work->func() if it has no ordered_free @@ -262,7 +291,7 @@ static void normal_work_helper(struct work_struct *arg) trace_btrfs_all_work_done(work); } -void btrfs_init_work(struct btrfs_work *work, +void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t uniq_func, btrfs_func_t func, btrfs_func_t ordered_func, btrfs_func_t ordered_free) @@ -270,7 +299,7 @@ void btrfs_init_work(struct btrfs_work *work, work->func = func; work->ordered_func = ordered_func; work->ordered_free = ordered_free; - INIT_WORK(&work->normal_work, normal_work_helper); + INIT_WORK(&work->normal_work, uniq_func); INIT_LIST_HEAD(&work->ordered_list); work->flags = 0; } diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 9c6b66d15fb0..e386c29ef1f6 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -19,12 +19,14 @@ #ifndef __BTRFS_ASYNC_THREAD_ #define __BTRFS_ASYNC_THREAD_ +#include <linux/workqueue.h> struct btrfs_workqueue; /* Internal use only */ struct __btrfs_workqueue; struct btrfs_work; typedef void (*btrfs_func_t)(struct btrfs_work *arg); +typedef void (*btrfs_work_func_t)(struct work_struct *arg); struct btrfs_work { btrfs_func_t func; @@ -38,11 +40,36 @@ struct btrfs_work { unsigned long flags; }; +#define BTRFS_WORK_HELPER_PROTO(name) \ +void btrfs_##name(struct work_struct *arg) + +BTRFS_WORK_HELPER_PROTO(worker_helper); +BTRFS_WORK_HELPER_PROTO(delalloc_helper); +BTRFS_WORK_HELPER_PROTO(flush_delalloc_helper); +BTRFS_WORK_HELPER_PROTO(cache_helper); +BTRFS_WORK_HELPER_PROTO(submit_helper); +BTRFS_WORK_HELPER_PROTO(fixup_helper); +BTRFS_WORK_HELPER_PROTO(endio_helper); +BTRFS_WORK_HELPER_PROTO(endio_meta_helper); +BTRFS_WORK_HELPER_PROTO(endio_meta_write_helper); +BTRFS_WORK_HELPER_PROTO(endio_raid56_helper); +BTRFS_WORK_HELPER_PROTO(endio_repair_helper); +BTRFS_WORK_HELPER_PROTO(rmw_helper); +BTRFS_WORK_HELPER_PROTO(endio_write_helper); +BTRFS_WORK_HELPER_PROTO(freespace_write_helper); +BTRFS_WORK_HELPER_PROTO(delayed_meta_helper); +BTRFS_WORK_HELPER_PROTO(readahead_helper); +BTRFS_WORK_HELPER_PROTO(qgroup_rescan_helper); +BTRFS_WORK_HELPER_PROTO(extent_refs_helper); 
+BTRFS_WORK_HELPER_PROTO(scrub_helper); +BTRFS_WORK_HELPER_PROTO(scrubwrc_helper); +BTRFS_WORK_HELPER_PROTO(scrubnc_helper); + struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, int flags, int max_active, int thresh); -void btrfs_init_work(struct btrfs_work *work, +void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t helper, btrfs_func_t func, btrfs_func_t ordered_func, btrfs_func_t ordered_free); diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index e25564bfcb46..2d3e32ebfd15 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -25,6 +25,9 @@ #include "delayed-ref.h" #include "locking.h" +/* Just an arbitrary number so we can be sure this happened */ +#define BACKREF_FOUND_SHARED 6 + struct extent_inode_elem { u64 inum; u64 offset; @@ -276,9 +279,8 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, } if (ret > 0) goto next; - ret = ulist_add_merge(parents, eb->start, - (uintptr_t)eie, - (u64 *)&old, GFP_NOFS); + ret = ulist_add_merge_ptr(parents, eb->start, + eie, (void **)&old, GFP_NOFS); if (ret < 0) break; if (!ret && extent_item_pos) { @@ -378,7 +380,8 @@ out: static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 time_seq, struct list_head *head, - const u64 *extent_item_pos, u64 total_refs) + const u64 *extent_item_pos, u64 total_refs, + u64 root_objectid) { int err; int ret = 0; @@ -403,6 +406,10 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, continue; if (ref->count == 0) continue; + if (root_objectid && ref->root_id != root_objectid) { + ret = BACKREF_FOUND_SHARED; + goto out; + } err = __resolve_indirect_ref(fs_info, path, time_seq, ref, parents, extent_item_pos, total_refs); @@ -483,7 +490,7 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info, continue; BUG_ON(!ref->wanted_disk_byte); eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte, - fs_info->tree_root->leafsize, 0); + 0); if (!eb || !extent_buffer_uptodate(eb)) { free_extent_buffer(eb); return -EIO; @@ -562,7 +569,8 @@ static void __merge_refs(struct list_head *head, int mode) * smaller or equal that seq to the list */ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, - struct list_head *prefs, u64 *total_refs) + struct list_head *prefs, u64 *total_refs, + u64 inum) { struct btrfs_delayed_extent_op *extent_op = head->extent_op; struct rb_node *n = &head->node.rb_node; @@ -626,6 +634,16 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, key.objectid = ref->objectid; key.type = BTRFS_EXTENT_DATA_KEY; key.offset = ref->offset; + + /* + * Found a inum that doesn't match our known inum, we + * know it's shared. 
+ */ + if (inum && ref->objectid != inum) { + ret = BACKREF_FOUND_SHARED; + break; + } + ret = __add_prelim_ref(prefs, ref->root, &key, 0, 0, node->bytenr, node->ref_mod * sgn, GFP_ATOMIC); @@ -660,7 +678,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, static int __add_inline_refs(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 bytenr, int *info_level, struct list_head *prefs, - u64 *total_refs) + u64 *total_refs, u64 inum) { int ret = 0; int slot; @@ -745,6 +763,12 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info, dref); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = btrfs_extent_data_ref_offset(leaf, dref); + + if (inum && key.objectid != inum) { + ret = BACKREF_FOUND_SHARED; + break; + } + root = btrfs_extent_data_ref_root(leaf, dref); ret = __add_prelim_ref(prefs, root, &key, 0, 0, bytenr, count, GFP_NOFS); @@ -766,7 +790,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info, */ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 bytenr, - int info_level, struct list_head *prefs) + int info_level, struct list_head *prefs, u64 inum) { struct btrfs_root *extent_root = fs_info->extent_root; int ret; @@ -828,6 +852,12 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, dref); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = btrfs_extent_data_ref_offset(leaf, dref); + + if (inum && key.objectid != inum) { + ret = BACKREF_FOUND_SHARED; + break; + } + root = btrfs_extent_data_ref_root(leaf, dref); ret = __add_prelim_ref(prefs, root, &key, 0, 0, bytenr, count, GFP_NOFS); @@ -855,7 +885,8 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, static int find_parent_nodes(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 time_seq, struct ulist *refs, - struct ulist *roots, const u64 *extent_item_pos) + struct ulist *roots, const u64 *extent_item_pos, + u64 root_objectid, u64 inum) { struct btrfs_key key; struct btrfs_path *path; @@ -930,7 +961,8 @@ again: } spin_unlock(&delayed_refs->lock); ret = __add_delayed_refs(head, time_seq, - &prefs_delayed, &total_refs); + &prefs_delayed, &total_refs, + inum); mutex_unlock(&head->mutex); if (ret) goto out; @@ -952,11 +984,11 @@ again: key.type == BTRFS_METADATA_ITEM_KEY)) { ret = __add_inline_refs(fs_info, path, bytenr, &info_level, &prefs, - &total_refs); + &total_refs, inum); if (ret) goto out; ret = __add_keyed_refs(fs_info, path, bytenr, - info_level, &prefs); + info_level, &prefs, inum); if (ret) goto out; } @@ -972,7 +1004,8 @@ again: __merge_refs(&prefs, 1); ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs, - extent_item_pos, total_refs); + extent_item_pos, total_refs, + root_objectid); if (ret) goto out; @@ -982,6 +1015,11 @@ again: ref = list_first_entry(&prefs, struct __prelim_ref, list); WARN_ON(ref->count < 0); if (roots && ref->count && ref->root_id && ref->parent == 0) { + if (root_objectid && ref->root_id != root_objectid) { + ret = BACKREF_FOUND_SHARED; + goto out; + } + /* no parent == root of tree */ ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS); if (ret < 0) @@ -990,27 +1028,28 @@ again: if (ref->count && ref->parent) { if (extent_item_pos && !ref->inode_list && ref->level == 0) { - u32 bsz; struct extent_buffer *eb; - bsz = btrfs_level_size(fs_info->extent_root, - ref->level); + eb = read_tree_block(fs_info->extent_root, - ref->parent, bsz, 0); + ref->parent, 0); if (!eb || !extent_buffer_uptodate(eb)) { free_extent_buffer(eb); ret = -EIO; goto out; } + 
btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); ret = find_extent_in_eb(eb, bytenr, *extent_item_pos, &eie); + btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); if (ret < 0) goto out; ref->inode_list = eie; } - ret = ulist_add_merge(refs, ref->parent, - (uintptr_t)ref->inode_list, - (u64 *)&eie, GFP_NOFS); + ret = ulist_add_merge_ptr(refs, ref->parent, + ref->inode_list, + (void **)&eie, GFP_NOFS); if (ret < 0) goto out; if (!ret && extent_item_pos) { @@ -1085,7 +1124,7 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, return -ENOMEM; ret = find_parent_nodes(trans, fs_info, bytenr, - time_seq, *leafs, NULL, extent_item_pos); + time_seq, *leafs, NULL, extent_item_pos, 0, 0); if (ret < 0 && ret != -ENOENT) { free_leaf_list(*leafs); return ret; @@ -1128,7 +1167,7 @@ static int __btrfs_find_all_roots(struct btrfs_trans_handle *trans, ULIST_ITER_INIT(&uiter); while (1) { ret = find_parent_nodes(trans, fs_info, bytenr, - time_seq, tmp, *roots, NULL); + time_seq, tmp, *roots, NULL, 0, 0); if (ret < 0 && ret != -ENOENT) { ulist_free(tmp); ulist_free(*roots); @@ -1159,6 +1198,54 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, return ret; } +int btrfs_check_shared(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 root_objectid, + u64 inum, u64 bytenr) +{ + struct ulist *tmp = NULL; + struct ulist *roots = NULL; + struct ulist_iterator uiter; + struct ulist_node *node; + struct seq_list elem = {}; + int ret = 0; + + tmp = ulist_alloc(GFP_NOFS); + roots = ulist_alloc(GFP_NOFS); + if (!tmp || !roots) { + ulist_free(tmp); + ulist_free(roots); + return -ENOMEM; + } + + if (trans) + btrfs_get_tree_mod_seq(fs_info, &elem); + else + down_read(&fs_info->commit_root_sem); + ULIST_ITER_INIT(&uiter); + while (1) { + ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp, + roots, NULL, root_objectid, inum); + if (ret == BACKREF_FOUND_SHARED) { + ret = 1; + break; + } + if (ret < 0 && ret != -ENOENT) + break; + node = ulist_next(tmp, &uiter); + if (!node) + break; + bytenr = node->val; + cond_resched(); + } + if (trans) + btrfs_put_tree_mod_seq(fs_info, &elem); + else + up_read(&fs_info->commit_root_sem); + ulist_free(tmp); + ulist_free(roots); + return ret; +} + /* * this makes the path point to (inum INODE_ITEM ioff) */ @@ -1191,7 +1278,7 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, unsigned long ptr; key.objectid = inode_objectid; - btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY); + key.type = BTRFS_INODE_EXTREF_KEY; key.offset = start_off; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); @@ -1231,7 +1318,7 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, ret = -ENOENT; if (found_key.objectid != inode_objectid) break; - if (btrfs_key_type(&found_key) != BTRFS_INODE_EXTREF_KEY) + if (found_key.type != BTRFS_INODE_EXTREF_KEY) break; ret = 0; @@ -1364,7 +1451,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, } btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]); if (found_key->type == BTRFS_METADATA_ITEM_KEY) - size = fs_info->extent_root->leafsize; + size = fs_info->extent_root->nodesize; else if (found_key->type == BTRFS_EXTENT_ITEM_KEY) size = found_key->offset; diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 86fc20fec282..2a1ac6bfc724 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -71,6 +71,9 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, u64 start_off, struct 
btrfs_path *path, struct btrfs_inode_extref **ret_extref, u64 *found_off); +int btrfs_check_shared(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 root_objectid, + u64 inum, u64 bytenr); int __init btrfs_prelim_ref_init(void); void btrfs_prelim_ref_exit(void); diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 4794923c410c..4aadadcfab20 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -44,6 +44,17 @@ #define BTRFS_INODE_IN_DELALLOC_LIST 9 #define BTRFS_INODE_READDIO_NEED_LOCK 10 #define BTRFS_INODE_HAS_PROPS 11 +/* + * The following 3 bits are meant only for the btree inode. + * When any of them is set, it means an error happened while writing an + * extent buffer belonging to: + * 1) a non-log btree + * 2) a log btree and first log sub-transaction + * 3) a log btree and second log sub-transaction + */ +#define BTRFS_INODE_BTREE_ERR 12 +#define BTRFS_INODE_BTREE_LOG1_ERR 13 +#define BTRFS_INODE_BTREE_LOG2_ERR 14 /* in memory btrfs inode */ struct btrfs_inode { @@ -84,12 +95,6 @@ struct btrfs_inode { */ struct list_head delalloc_inodes; - /* - * list for tracking inodes that must be sent to disk before a - * rename or truncate commit - */ - struct list_head ordered_operations; - /* node for the red-black tree that links inodes in subvolume root */ struct rb_node rb_node; @@ -127,6 +132,12 @@ struct btrfs_inode { u64 delalloc_bytes; /* + * total number of bytes pending defrag, used by stat to check whether + * it needs COW. + */ + u64 defrag_bytes; + + /* * the size of the file stored in the metadata on disk. data=ordered * means the in-memory i_size might be larger than the size on disk * because not all the blocks are written yet. @@ -240,13 +251,25 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation) BTRFS_I(inode)->last_sub_trans <= BTRFS_I(inode)->last_log_commit && BTRFS_I(inode)->last_sub_trans <= - BTRFS_I(inode)->root->last_log_commit) - return 1; + BTRFS_I(inode)->root->last_log_commit) { + /* + * After a ranged fsync we might have left some extent maps + * (that fall outside the fsync's range). So return false + * here if the list isn't empty, to make sure btrfs_log_inode() + * will be called and process those extent maps. 
+ */ + smp_mb(); + if (list_empty(&BTRFS_I(inode)->extent_tree.modified_extents)) + return 1; + } return 0; } +#define BTRFS_DIO_ORIG_BIO_SUBMITTED 0x1 + struct btrfs_dio_private { struct inode *inode; + unsigned long flags; u64 logical_offset; u64 disk_bytenr; u64 bytes; @@ -263,7 +286,12 @@ struct btrfs_dio_private { /* dio_bio came from fs/direct-io.c */ struct bio *dio_bio; - u8 csum[0]; + + /* + * The original bio may be splited to several sub-bios, this is + * done during endio of sub-bios + */ + int (*subio_endio)(struct inode *, struct btrfs_io_bio *, int); }; /* diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index ce92ae30250f..cb7f3fe9c9f6 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -807,7 +807,7 @@ static int btrfsic_process_superblock_dev_mirror( /* super block bytenr is always the unmapped device bytenr */ dev_bytenr = btrfs_sb_offset(superblock_mirror_num); - if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes) + if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->commit_total_bytes) return -1; bh = __bread(superblock_bdev, dev_bytenr / 4096, BTRFS_SUPER_INFO_SIZE); @@ -820,7 +820,6 @@ static int btrfsic_process_superblock_dev_mirror( btrfs_super_magic(super_tmp) != BTRFS_MAGIC || memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) || btrfs_super_nodesize(super_tmp) != state->metablock_size || - btrfs_super_leafsize(super_tmp) != state->metablock_size || btrfs_super_sectorsize(super_tmp) != state->datablock_size) { brelse(bh); return 0; @@ -1252,8 +1251,7 @@ static void btrfsic_read_from_block_data( while (len > 0) { cur = min(len, ((size_t)PAGE_CACHE_SIZE - offset_in_page)); - BUG_ON(i >= (block_ctx->len + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT); + BUG_ON(i >= DIV_ROUND_UP(block_ctx->len, PAGE_CACHE_SIZE)); kaddr = block_ctx->datav[i]; memcpy(dst, kaddr + offset_in_page, cur); @@ -3120,24 +3118,12 @@ int btrfsic_mount(struct btrfs_root *root, struct list_head *dev_head = &fs_devices->devices; struct btrfs_device *device; - if (root->nodesize != root->leafsize) { - printk(KERN_INFO - "btrfsic: cannot handle nodesize %d != leafsize %d!\n", - root->nodesize, root->leafsize); - return -1; - } if (root->nodesize & ((u64)PAGE_CACHE_SIZE - 1)) { printk(KERN_INFO "btrfsic: cannot handle nodesize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n", root->nodesize, PAGE_CACHE_SIZE); return -1; } - if (root->leafsize & ((u64)PAGE_CACHE_SIZE - 1)) { - printk(KERN_INFO - "btrfsic: cannot handle leafsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n", - root->leafsize, PAGE_CACHE_SIZE); - return -1; - } if (root->sectorsize & ((u64)PAGE_CACHE_SIZE - 1)) { printk(KERN_INFO "btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n", diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 1daea0b47187..d3220d31d3cb 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -91,8 +91,7 @@ static inline int compressed_bio_size(struct btrfs_root *root, u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); return sizeof(struct compressed_bio) + - ((disk_size + root->sectorsize - 1) / root->sectorsize) * - csum_size; + (DIV_ROUND_UP(disk_size, root->sectorsize)) * csum_size; } static struct bio *compressed_bio_alloc(struct block_device *bdev, @@ -389,7 +388,8 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, * freed before we're done setting it up */ atomic_inc(&cb->pending_bios); - ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + ret = 
btrfs_bio_wq_end_io(root->fs_info, bio, + BTRFS_WQ_ENDIO_DATA); BUG_ON(ret); /* -ENOMEM */ if (!skip_sum) { @@ -420,7 +420,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, } bio_get(bio); - ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + ret = btrfs_bio_wq_end_io(root->fs_info, bio, BTRFS_WQ_ENDIO_DATA); BUG_ON(ret); /* -ENOMEM */ if (!skip_sum) { @@ -615,8 +615,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, cb->compress_type = extent_compress_type(bio_flags); cb->orig_bio = bio; - nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) / - PAGE_CACHE_SIZE; + nr_pages = DIV_ROUND_UP(compressed_len, PAGE_CACHE_SIZE); cb->compressed_pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); if (!cb->compressed_pages) @@ -670,7 +669,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, PAGE_CACHE_SIZE) { bio_get(comp_bio); - ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); + ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, + BTRFS_WQ_ENDIO_DATA); BUG_ON(ret); /* -ENOMEM */ /* @@ -686,8 +686,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, comp_bio, sums); BUG_ON(ret); /* -ENOMEM */ } - sums += (comp_bio->bi_iter.bi_size + - root->sectorsize - 1) / root->sectorsize; + sums += DIV_ROUND_UP(comp_bio->bi_iter.bi_size, + root->sectorsize); ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); @@ -708,7 +708,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, } bio_get(comp_bio); - ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); + ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, + BTRFS_WQ_ENDIO_DATA); BUG_ON(ret); /* -ENOMEM */ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index aeab453b8e24..19bc6162fb8e 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -258,9 +258,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, else btrfs_node_key(buf, &disk_key, 0); - cow = btrfs_alloc_free_block(trans, root, buf->len, 0, - new_root_objectid, &disk_key, level, - buf->start, 0); + cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid, + &disk_key, level, buf->start, 0); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -280,9 +279,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, WARN_ON(btrfs_header_generation(buf) > trans->transid); if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1, 1); + ret = btrfs_inc_ref(trans, root, cow, 1); else - ret = btrfs_inc_ref(trans, root, cow, 0, 1); + ret = btrfs_inc_ref(trans, root, cow, 0); if (ret) return ret; @@ -1035,14 +1034,14 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if ((owner == root->root_key.objectid || root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { - ret = btrfs_inc_ref(trans, root, buf, 1, 1); + ret = btrfs_inc_ref(trans, root, buf, 1); BUG_ON(ret); /* -ENOMEM */ if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { - ret = btrfs_dec_ref(trans, root, buf, 0, 1); + ret = btrfs_dec_ref(trans, root, buf, 0); BUG_ON(ret); /* -ENOMEM */ - ret = btrfs_inc_ref(trans, root, cow, 1, 1); + ret = btrfs_inc_ref(trans, root, cow, 1); BUG_ON(ret); /* -ENOMEM */ } new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; @@ -1050,9 +1049,9 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1, 1); + ret = 
btrfs_inc_ref(trans, root, cow, 1); else - ret = btrfs_inc_ref(trans, root, cow, 0, 1); + ret = btrfs_inc_ref(trans, root, cow, 0); BUG_ON(ret); /* -ENOMEM */ } if (new_flags != 0) { @@ -1069,11 +1068,11 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1, 1); + ret = btrfs_inc_ref(trans, root, cow, 1); else - ret = btrfs_inc_ref(trans, root, cow, 0, 1); + ret = btrfs_inc_ref(trans, root, cow, 0); BUG_ON(ret); /* -ENOMEM */ - ret = btrfs_dec_ref(trans, root, buf, 1, 1); + ret = btrfs_dec_ref(trans, root, buf, 1); BUG_ON(ret); /* -ENOMEM */ } clean_tree_block(trans, root, buf); @@ -1133,9 +1132,9 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, } else parent_start = 0; - cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, - root->root_key.objectid, &disk_key, - level, search_start, empty_size); + cow = btrfs_alloc_tree_block(trans, root, parent_start, + root->root_key.objectid, &disk_key, level, + search_start, empty_size); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -1425,7 +1424,6 @@ get_old_root(struct btrfs_root *root, u64 time_seq) struct tree_mod_root *old_root = NULL; u64 old_generation = 0; u64 logical; - u32 blocksize; eb_root = btrfs_read_lock_root_node(root); tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq); @@ -1444,8 +1442,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq) if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) { btrfs_tree_read_unlock(eb_root); free_extent_buffer(eb_root); - blocksize = btrfs_level_size(root, old_root->level); - old = read_tree_block(root, logical, blocksize, 0); + old = read_tree_block(root, logical, 0); if (WARN_ON(!old || !extent_buffer_uptodate(old))) { free_extent_buffer(old); btrfs_warn(root->fs_info, @@ -1506,10 +1503,9 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf) { -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + if (btrfs_test_is_dummy_root(root)) return 0; -#endif + /* ensure we can see the force_cow */ smp_rmb(); @@ -1651,7 +1647,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, WARN_ON(trans->transid != root->fs_info->generation); parent_nritems = btrfs_header_nritems(parent); - blocksize = btrfs_level_size(root, parent_level - 1); + blocksize = root->nodesize; end_slot = parent_nritems; if (parent_nritems == 1) @@ -1685,15 +1681,14 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, continue; } - cur = btrfs_find_tree_block(root, blocknr, blocksize); + cur = btrfs_find_tree_block(root, blocknr); if (cur) uptodate = btrfs_buffer_uptodate(cur, gen, 0); else uptodate = 0; if (!cur || !uptodate) { if (!cur) { - cur = read_tree_block(root, blocknr, - blocksize, gen); + cur = read_tree_block(root, blocknr, gen); if (!cur || !extent_buffer_uptodate(cur)) { free_extent_buffer(cur); return -EIO; @@ -1872,7 +1867,6 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root, BUG_ON(level == 0); eb = read_tree_block(root, btrfs_node_blockptr(parent, slot), - btrfs_level_size(root, level - 1), btrfs_node_ptr_generation(parent, slot)); if (eb && !extent_buffer_uptodate(eb)) { free_extent_buffer(eb); @@ -2267,8 +2261,8 @@ static void reada_for_search(struct btrfs_root *root, node = path->nodes[level]; search = btrfs_node_blockptr(node, 
slot); - blocksize = btrfs_level_size(root, level - 1); - eb = btrfs_find_tree_block(root, search, blocksize); + blocksize = root->nodesize; + eb = btrfs_find_tree_block(root, search); if (eb) { free_extent_buffer(eb); return; @@ -2298,7 +2292,7 @@ static void reada_for_search(struct btrfs_root *root, if ((search <= target && target - search <= 65536) || (search > target && search - target <= 65536)) { gen = btrfs_node_ptr_generation(node, nr); - readahead_tree_block(root, search, blocksize, gen); + readahead_tree_block(root, search, blocksize); nread += blocksize; } nscan++; @@ -2325,12 +2319,12 @@ static noinline void reada_for_balance(struct btrfs_root *root, nritems = btrfs_header_nritems(parent); slot = path->slots[level + 1]; - blocksize = btrfs_level_size(root, level); + blocksize = root->nodesize; if (slot > 0) { block1 = btrfs_node_blockptr(parent, slot - 1); gen = btrfs_node_ptr_generation(parent, slot - 1); - eb = btrfs_find_tree_block(root, block1, blocksize); + eb = btrfs_find_tree_block(root, block1); /* * if we get -eagain from btrfs_buffer_uptodate, we * don't want to return eagain here. That will loop @@ -2343,16 +2337,16 @@ static noinline void reada_for_balance(struct btrfs_root *root, if (slot + 1 < nritems) { block2 = btrfs_node_blockptr(parent, slot + 1); gen = btrfs_node_ptr_generation(parent, slot + 1); - eb = btrfs_find_tree_block(root, block2, blocksize); + eb = btrfs_find_tree_block(root, block2); if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0) block2 = 0; free_extent_buffer(eb); } if (block1) - readahead_tree_block(root, block1, blocksize, 0); + readahead_tree_block(root, block1, blocksize); if (block2) - readahead_tree_block(root, block2, blocksize, 0); + readahead_tree_block(root, block2, blocksize); } @@ -2454,16 +2448,14 @@ read_block_for_search(struct btrfs_trans_handle *trans, { u64 blocknr; u64 gen; - u32 blocksize; struct extent_buffer *b = *eb_ret; struct extent_buffer *tmp; int ret; blocknr = btrfs_node_blockptr(b, slot); gen = btrfs_node_ptr_generation(b, slot); - blocksize = btrfs_level_size(root, level - 1); - tmp = btrfs_find_tree_block(root, blocknr, blocksize); + tmp = btrfs_find_tree_block(root, blocknr); if (tmp) { /* first we do an atomic uptodate check */ if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { @@ -2507,7 +2499,7 @@ read_block_for_search(struct btrfs_trans_handle *trans, btrfs_release_path(p); ret = -EAGAIN; - tmp = read_tree_block(root, blocknr, blocksize, 0); + tmp = read_tree_block(root, blocknr, 0); if (tmp) { /* * If the read above didn't mark this buffer up to date, @@ -2792,8 +2784,6 @@ again: if (!should_cow_block(trans, root, b)) goto cow_done; - btrfs_set_path_blocking(p); - /* * must have write locks on this node and the * parent @@ -2807,6 +2797,7 @@ again: goto again; } + btrfs_set_path_blocking(p); err = btrfs_cow_block(trans, root, b, p->nodes[level + 1], p->slots[level + 1], &b); @@ -3362,9 +3353,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, else btrfs_node_key(lower, &lower_key, 0); - c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, - root->root_key.objectid, &lower_key, - level, root->node->start, 0); + c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, + &lower_key, level, root->node->start, 0); if (IS_ERR(c)) return PTR_ERR(c); @@ -3502,9 +3492,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans, mid = (c_nritems + 1) / 2; btrfs_node_key(c, &disk_key, mid); - split = btrfs_alloc_free_block(trans, root, root->nodesize, 0, - 
root->root_key.objectid, - &disk_key, level, c->start, 0); + split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, + &disk_key, level, c->start, 0); if (IS_ERR(split)) return PTR_ERR(split); @@ -4282,13 +4271,12 @@ again: else btrfs_item_key(l, &disk_key, mid); - right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, - root->root_key.objectid, - &disk_key, 0, l->start, 0); + right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, + &disk_key, 0, l->start, 0); if (IS_ERR(right)) return PTR_ERR(right); - root_add_used(root, root->leafsize); + root_add_used(root, root->nodesize); memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); btrfs_set_header_bytenr(right, right->start); @@ -4626,8 +4614,7 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path, ptr = btrfs_item_ptr_offset(leaf, slot); memmove_extent_buffer(leaf, ptr, (unsigned long)fi, - offsetof(struct btrfs_file_extent_item, - disk_bytenr)); + BTRFS_FILE_EXTENT_INLINE_DATA_START); } } @@ -4738,6 +4725,12 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, int slot; struct btrfs_map_token token; + if (path->slots[0] == 0) { + btrfs_cpu_key_to_disk(&disk_key, cpu_key); + fixup_low_keys(root, path, &disk_key, 1); + } + btrfs_unlock_up_safe(path, 1); + btrfs_init_map_token(&token); leaf = path->nodes[0]; @@ -4798,12 +4791,6 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, } btrfs_set_header_nritems(leaf, nritems + nr); - - if (slot == 0) { - btrfs_cpu_key_to_disk(&disk_key, cpu_key); - fixup_low_keys(root, path, &disk_key, 1); - } - btrfs_unlock_up_safe(path, 1); btrfs_mark_buffer_dirty(leaf); if (btrfs_leaf_free_space(root, leaf) < 0) { @@ -5145,8 +5132,9 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, u32 nritems; int level; int ret = 1; + int keep_locks = path->keep_locks; - WARN_ON(!path->keep_locks); + path->keep_locks = 1; again: cur = btrfs_read_lock_root_node(root); level = btrfs_header_level(cur); @@ -5210,7 +5198,6 @@ find_next_key: path->slots[level] = slot; if (level == path->lowest_level) { ret = 0; - unlock_up(path, level, 1, 0, NULL); goto out; } btrfs_set_path_blocking(path); @@ -5225,9 +5212,12 @@ find_next_key: btrfs_clear_path_blocking(path, NULL, 0); } out: - if (ret == 0) + path->keep_locks = keep_locks; + if (ret == 0) { + btrfs_unlock_up_safe(path, path->lowest_level + 1); + btrfs_set_path_blocking(path); memcpy(min_key, &found_key, sizeof(found_key)); - btrfs_set_path_blocking(path); + } return ret; } @@ -5375,7 +5365,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, goto out; } - tmp_buf = kmalloc(left_root->leafsize, GFP_NOFS); + tmp_buf = kmalloc(left_root->nodesize, GFP_NOFS); if (!tmp_buf) { ret = -ENOMEM; goto out; @@ -5520,18 +5510,18 @@ int btrfs_compare_trees(struct btrfs_root *left_root, goto out; advance_right = ADVANCE; } else { - enum btrfs_compare_tree_result cmp; + enum btrfs_compare_tree_result result; WARN_ON(!extent_buffer_uptodate(left_path->nodes[0])); ret = tree_compare_item(left_root, left_path, right_path, tmp_buf); if (ret) - cmp = BTRFS_COMPARE_TREE_CHANGED; + result = BTRFS_COMPARE_TREE_CHANGED; else - cmp = BTRFS_COMPARE_TREE_SAME; + result = BTRFS_COMPARE_TREE_SAME; ret = changed_cb(left_root, right_root, left_path, right_path, - &left_key, cmp, ctx); + &left_key, result, ctx); if (ret < 0) goto out; advance_left = ADVANCE; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index be91397f4e92..d557264ee974 100644 
--- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -34,6 +34,7 @@ #include <linux/pagemap.h> #include <linux/btrfs.h> #include <linux/workqueue.h> +#include <linux/security.h> #include "extent_io.h" #include "extent_map.h" #include "async-thread.h" @@ -62,13 +63,6 @@ struct btrfs_ordered_sum; #define BTRFS_COMPAT_EXTENT_TREE_V0 -/* - * files bigger than this get some pre-flushing when they are added - * to the ordered operations list. That way we limit the total - * work done by the commit - */ -#define BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT (8 * 1024 * 1024) - /* holds pointers to all of the tree roots */ #define BTRFS_ROOT_TREE_OBJECTID 1ULL @@ -391,10 +385,12 @@ struct btrfs_header { sizeof(struct btrfs_header)) / \ sizeof(struct btrfs_key_ptr)) #define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header)) -#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->leafsize)) +#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->nodesize)) +#define BTRFS_FILE_EXTENT_INLINE_DATA_START \ + (offsetof(struct btrfs_file_extent_item, disk_bytenr)) #define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \ sizeof(struct btrfs_item) - \ - sizeof(struct btrfs_file_extent_item)) + BTRFS_FILE_EXTENT_INLINE_DATA_START) #define BTRFS_MAX_XATTR_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \ sizeof(struct btrfs_item) -\ sizeof(struct btrfs_dir_item)) @@ -474,7 +470,7 @@ struct btrfs_super_block { __le64 num_devices; __le32 sectorsize; __le32 nodesize; - __le32 leafsize; + __le32 __unused_leafsize; __le32 stripesize; __le32 sys_chunk_array_size; __le64 chunk_root_generation; @@ -903,6 +899,8 @@ struct btrfs_file_extent_item { /* * disk space consumed by the extent, checksum blocks are included * in these numbers + * + * At this offset in the structure, the inline extent data starts. */ __le64 disk_bytenr; __le64 disk_num_bytes; @@ -1305,8 +1303,8 @@ struct btrfs_block_group_cache { */ struct list_head cluster_list; - /* For delayed block group creation */ - struct list_head new_bg_list; + /* For delayed block group creation or deletion of empty block groups */ + struct list_head bg_list; }; /* delayed seq elem */ @@ -1545,6 +1543,7 @@ struct btrfs_fs_info { struct btrfs_workqueue *endio_workers; struct btrfs_workqueue *endio_meta_workers; struct btrfs_workqueue *endio_raid56_workers; + struct btrfs_workqueue *endio_repair_workers; struct btrfs_workqueue *rmw_workers; struct btrfs_workqueue *endio_meta_write_workers; struct btrfs_workqueue *endio_write_workers; @@ -1574,6 +1573,7 @@ struct btrfs_fs_info { int do_barriers; int closing; int log_root_recovering; + int open; u64 total_pinned; @@ -1723,6 +1723,12 @@ struct btrfs_fs_info { /* Used to reclaim the metadata space in the background. 
*/ struct work_struct async_reclaim_work; + + spinlock_t unused_bgs_lock; + struct list_head unused_bgs; + + /* For btrfs to record security options */ + struct security_mnt_opts security_opts; }; struct btrfs_subvolume_writers { @@ -1776,12 +1782,12 @@ struct btrfs_root { /* free ino cache stuff */ struct btrfs_free_space_ctl *free_ino_ctl; - enum btrfs_caching_type cached; - spinlock_t cache_lock; - wait_queue_head_t cache_wait; + enum btrfs_caching_type ino_cache_state; + spinlock_t ino_cache_lock; + wait_queue_head_t ino_cache_wait; struct btrfs_free_space_ctl *free_ino_pinned; - u64 cache_progress; - struct inode *cache_inode; + u64 ino_cache_progress; + struct inode *ino_cache_inode; struct mutex log_mutex; wait_queue_head_t log_writer_wait; @@ -1806,18 +1812,14 @@ struct btrfs_root { /* node allocations are done in nodesize units */ u32 nodesize; - /* leaf allocations are done in leafsize units */ - u32 leafsize; - u32 stripesize; u32 type; u64 highest_objectid; -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + /* only used when CONFIG_BTRFS_FS_RUN_SANITY_TESTS is enabled */ u64 alloc_bytenr; -#endif u64 defrag_trans_start; struct btrfs_key defrag_progress; @@ -2094,6 +2096,7 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_CHANGE_INODE_CACHE (1 << 24) #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) +#define BTRFS_DEFAULT_MAX_INLINE (8192) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) @@ -2995,8 +2998,6 @@ BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block, sectorsize, 32); BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block, nodesize, 32); -BTRFS_SETGET_STACK_FUNCS(super_leafsize, struct btrfs_super_block, - leafsize, 32); BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block, stripesize, 32); BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block, @@ -3049,14 +3050,12 @@ BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression, static inline unsigned long btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e) { - unsigned long offset = (unsigned long)e; - offset += offsetof(struct btrfs_file_extent_item, disk_bytenr); - return offset; + return (unsigned long)e + BTRFS_FILE_EXTENT_INLINE_DATA_START; } static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) { - return offsetof(struct btrfs_file_extent_item, disk_bytenr) + datasize; + return BTRFS_FILE_EXTENT_INLINE_DATA_START + datasize; } BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item, @@ -3086,9 +3085,7 @@ BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item, static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb, struct btrfs_item *e) { - unsigned long offset; - offset = offsetof(struct btrfs_file_extent_item, disk_bytenr); - return btrfs_item_size(eb, e) - offset; + return btrfs_item_size(eb, e) - BTRFS_FILE_EXTENT_INLINE_DATA_START; } /* this returns the number of file bytes represented by the inline item. 
*/ #define btrfs_item_ptr(leaf, slot, type) \ ((type *)(btrfs_leaf_data(leaf) + \ @@ -3263,7 +3253,7 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, unsigned num_items) { - return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * + return (root->nodesize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * 2 * num_items; } @@ -3274,8 +3264,7 @@ static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root, unsigned num_items) { - return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * - num_items; + return root->nodesize * BTRFS_MAX_LEVEL * num_items; } int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, @@ -3305,9 +3294,9 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group( u64 bytenr); void btrfs_put_block_group(struct btrfs_block_group_cache *cache); int get_block_group_index(struct btrfs_block_group_cache *cache); -struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u32 blocksize, - u64 parent, u64 root_objectid, +struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 parent, + u64 root_objectid, struct btrfs_disk_key *key, int level, u64 hint, u64 empty_size); void btrfs_free_tree_block(struct btrfs_trans_handle *trans, @@ -3326,9 +3315,9 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, struct btrfs_key *ins, int is_data, int delalloc); int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref, int no_quota); + struct extent_buffer *buf, int full_backref); int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref, int no_quota); + struct extent_buffer *buf, int full_backref); int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 flags, @@ -3363,6 +3352,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 size); int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 group_start); +void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info); void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root); u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); @@ -3604,6 +3594,7 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info) kfree(fs_info->uuid_root); kfree(fs_info->super_copy); kfree(fs_info->super_for_commit); + security_free_mnt_opts(&fs_info->security_opts); kfree(fs_info); } @@ -3739,8 +3730,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, struct bio *bio, u32 *dst); int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, - struct btrfs_dio_private *dip, struct bio *bio, - u64 logical_offset); + struct bio *bio, u64 logical_offset); int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 pos, @@ -4141,8 +4131,15 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) /* Sanity test specific functions */ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS void btrfs_test_destroy_inode(struct inode *inode); -int 
btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, - u64 rfer, u64 excl); #endif +static inline int btrfs_test_is_dummy_root(struct btrfs_root *root) +{ +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + return 1; +#endif + return 0; +} + #endif diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index da775bfdebc9..054577bddaf2 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1042,7 +1042,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, int ret; key.objectid = node->inode_id; - btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags)) @@ -1099,7 +1099,7 @@ err_out: search: btrfs_release_path(path); - btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY); + key.type = BTRFS_INODE_EXTREF_KEY; key.offset = -1; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) @@ -1395,8 +1395,8 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, return -ENOMEM; async_work->delayed_root = delayed_root; - btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, - NULL, NULL); + btrfs_init_work(&async_work->work, btrfs_delayed_meta_helper, + btrfs_async_run_delayed_root, NULL, NULL); async_work->nr = nr; btrfs_queue_work(root->fs_info->delayed_workers, &async_work->work); @@ -1473,7 +1473,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, } delayed_item->key.objectid = btrfs_ino(dir); - btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY); + delayed_item->key.type = BTRFS_DIR_INDEX_KEY; delayed_item->key.offset = index; dir_item = (struct btrfs_dir_item *)delayed_item->data; @@ -1542,7 +1542,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, return PTR_ERR(node); item_key.objectid = btrfs_ino(dir); - btrfs_set_key_type(&item_key, BTRFS_DIR_INDEX_KEY); + item_key.type = BTRFS_DIR_INDEX_KEY; item_key.offset = index; ret = btrfs_delete_delayed_insertion_item(root, node, &item_key); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index eea26e1b2fda..6f662b34ba0e 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -168,8 +168,12 @@ no_valid_dev_replace_entry_found: dev_replace->srcdev->total_bytes; dev_replace->tgtdev->disk_total_bytes = dev_replace->srcdev->disk_total_bytes; + dev_replace->tgtdev->commit_total_bytes = + dev_replace->srcdev->commit_total_bytes; dev_replace->tgtdev->bytes_used = dev_replace->srcdev->bytes_used; + dev_replace->tgtdev->commit_bytes_used = + dev_replace->srcdev->commit_bytes_used; } dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1; btrfs_init_dev_replace_tgtdev_for_resume(fs_info, @@ -329,30 +333,34 @@ int btrfs_dev_replace_start(struct btrfs_root *root, args->start.tgtdev_name[0] == '\0') return -EINVAL; - mutex_lock(&fs_info->volume_mutex); - ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name, - &tgt_device); - if (ret) { - btrfs_err(fs_info, "target device %s is invalid!", - args->start.tgtdev_name); - mutex_unlock(&fs_info->volume_mutex); - return -EINVAL; + /* + * Here we commit the transaction to make sure commit_total_bytes + * of all the devices are updated. 
+ */ + trans = btrfs_attach_transaction(root); + if (!IS_ERR(trans)) { + ret = btrfs_commit_transaction(trans, root); + if (ret) + return ret; + } else if (PTR_ERR(trans) != -ENOENT) { + return PTR_ERR(trans); } + /* the disk copy procedure reuses the scrub code */ + mutex_lock(&fs_info->volume_mutex); ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid, args->start.srcdev_name, &src_device); - mutex_unlock(&fs_info->volume_mutex); if (ret) { - ret = -EINVAL; - goto leave_no_lock; + mutex_unlock(&fs_info->volume_mutex); + return ret; } - if (tgt_device->total_bytes < src_device->total_bytes) { - btrfs_err(fs_info, "target device is smaller than source device!"); - ret = -EINVAL; - goto leave_no_lock; - } + ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name, + src_device, &tgt_device); + mutex_unlock(&fs_info->volume_mutex); + if (ret) + return ret; btrfs_dev_replace_lock(dev_replace); switch (dev_replace->replace_state) { @@ -380,10 +388,6 @@ int btrfs_dev_replace_start(struct btrfs_root *root, src_device->devid, rcu_str_deref(tgt_device->name)); - tgt_device->total_bytes = src_device->total_bytes; - tgt_device->disk_total_bytes = src_device->disk_total_bytes; - tgt_device->bytes_used = src_device->bytes_used; - /* * from now on, the writes to the srcdev are all duplicated to * go to the tgtdev as well (refer to btrfs_map_block()). @@ -414,7 +418,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, /* the disk copy procedure reuses the scrub code */ ret = btrfs_scrub_dev(fs_info, src_device->devid, 0, - src_device->total_bytes, + btrfs_device_get_total_bytes(src_device), &dev_replace->scrub_progress, 0, 1); ret = btrfs_dev_replace_finishing(root->fs_info, ret); @@ -426,9 +430,7 @@ leave: dev_replace->srcdev = NULL; dev_replace->tgtdev = NULL; btrfs_dev_replace_unlock(dev_replace); -leave_no_lock: - if (tgt_device) - btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); + btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); return ret; } @@ -507,9 +509,10 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, ret = btrfs_commit_transaction(trans, root); WARN_ON(ret); + mutex_lock(&uuid_mutex); /* keep away write_all_supers() during the finishing procedure */ - mutex_lock(&root->fs_info->chunk_mutex); mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + mutex_lock(&root->fs_info->chunk_mutex); btrfs_dev_replace_lock(dev_replace); dev_replace->replace_state = scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED @@ -532,8 +535,9 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, src_device->devid, rcu_str_deref(tgt_device->name), scrub_ret); btrfs_dev_replace_unlock(dev_replace); - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); mutex_unlock(&root->fs_info->chunk_mutex); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + mutex_unlock(&uuid_mutex); if (tgt_device) btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); @@ -542,7 +546,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, } printk_in_rcu(KERN_INFO - "BTRFS: dev_replace from %s (devid %llu) to %s) finished\n", + "BTRFS: dev_replace from %s (devid %llu) to %s finished\n", src_device->missing ? 
"<missing disk>" : rcu_str_deref(src_device->name), src_device->devid, @@ -550,23 +554,29 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, tgt_device->is_tgtdev_for_dev_replace = 0; tgt_device->devid = src_device->devid; src_device->devid = BTRFS_DEV_REPLACE_DEVID; - tgt_device->bytes_used = src_device->bytes_used; memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp)); memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid)); memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid)); - tgt_device->total_bytes = src_device->total_bytes; - tgt_device->disk_total_bytes = src_device->disk_total_bytes; - tgt_device->bytes_used = src_device->bytes_used; + btrfs_device_set_total_bytes(tgt_device, src_device->total_bytes); + btrfs_device_set_disk_total_bytes(tgt_device, + src_device->disk_total_bytes); + btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used); + ASSERT(list_empty(&src_device->resized_list)); + tgt_device->commit_total_bytes = src_device->commit_total_bytes; + tgt_device->commit_bytes_used = src_device->bytes_used; if (fs_info->sb->s_bdev == src_device->bdev) fs_info->sb->s_bdev = tgt_device->bdev; if (fs_info->fs_devices->latest_bdev == src_device->bdev) fs_info->fs_devices->latest_bdev = tgt_device->bdev; list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); + fs_info->fs_devices->rw_devices++; /* replace the sysfs entry */ btrfs_kobj_rm_device(fs_info, src_device); btrfs_kobj_add_device(fs_info, tgt_device); + btrfs_dev_replace_unlock(dev_replace); + btrfs_rm_dev_replace_blocked(fs_info); btrfs_rm_dev_replace_srcdev(fs_info, src_device); @@ -580,9 +590,9 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, * superblock is scratched out so that it is no longer marked to * belong to this filesystem. 
*/ - btrfs_dev_replace_unlock(dev_replace); - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); mutex_unlock(&root->fs_info->chunk_mutex); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + mutex_unlock(&uuid_mutex); /* write back the superblocks */ trans = btrfs_start_transaction(root, 0); @@ -643,6 +653,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dev_replace_args *args) { struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + struct btrfs_device *srcdev; btrfs_dev_replace_lock(dev_replace); /* even if !dev_replace_is_valid, the values are good enough for @@ -665,8 +676,9 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, break; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + srcdev = dev_replace->srcdev; args->status.progress_1000 = div64_u64(dev_replace->cursor_left, - div64_u64(dev_replace->srcdev->total_bytes, 1000)); + div64_u64(btrfs_device_get_total_bytes(srcdev), 1000)); break; } btrfs_dev_replace_unlock(dev_replace); @@ -825,7 +837,7 @@ static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info) ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid, dev_replace->committed_cursor_left, - dev_replace->srcdev->total_bytes, + btrfs_device_get_total_bytes(dev_replace->srcdev), &dev_replace->scrub_progress, 0, 1); ret = btrfs_dev_replace_finishing(fs_info, ret); WARN_ON(ret); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index a0691df5dcea..fc8df866e919 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -86,7 +86,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, BUG_ON(name_len + data_len > BTRFS_MAX_XATTR_SIZE(root)); key.objectid = objectid; - btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); + key.type = BTRFS_XATTR_ITEM_KEY; key.offset = btrfs_name_hash(name, name_len); data_size = sizeof(*dir_item) + name_len + data_len; @@ -137,7 +137,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root u32 data_size; key.objectid = btrfs_ino(dir); - btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); + key.type = BTRFS_DIR_ITEM_KEY; key.offset = btrfs_name_hash(name, name_len); path = btrfs_alloc_path(); @@ -204,7 +204,7 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, int cow = mod != 0; key.objectid = dir; - btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); + key.type = BTRFS_DIR_ITEM_KEY; key.offset = btrfs_name_hash(name, name_len); @@ -234,7 +234,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, return -ENOMEM; key.objectid = dir; - btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); + key.type = BTRFS_DIR_ITEM_KEY; key.offset = btrfs_name_hash(name, name_len); ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); @@ -297,7 +297,7 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, int cow = mod != 0; key.objectid = dir; - btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); + key.type = BTRFS_DIR_INDEX_KEY; key.offset = objectid; ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); @@ -367,7 +367,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, int cow = mod != 0; key.objectid = dir; - btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); + key.type = BTRFS_XATTR_ITEM_KEY; key.offset = btrfs_name_hash(name, name_len); ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); if (ret < 0) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 08e65e9cf2aa..fa45e3cae40d 100644 --- 
a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -39,7 +39,6 @@ #include "btrfs_inode.h" #include "volumes.h" #include "print-tree.h" -#include "async-thread.h" #include "locking.h" #include "tree-log.h" #include "free-space-cache.h" @@ -60,8 +59,6 @@ static void end_workqueue_fn(struct btrfs_work *work); static void free_fs_root(struct btrfs_root *root); static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, int read_only); -static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t, - struct btrfs_root *root); static void btrfs_destroy_ordered_extents(struct btrfs_root *root); static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, struct btrfs_root *root); @@ -75,21 +72,41 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root); static void btrfs_error_commit_super(struct btrfs_root *root); /* - * end_io_wq structs are used to do processing in task context when an IO is - * complete. This is used during reads to verify checksums, and it is used + * btrfs_end_io_wq structs are used to do processing in task context when an IO + * is complete. This is used during reads to verify checksums, and it is used * by writes to insert metadata for new file extents after IO is complete. */ -struct end_io_wq { +struct btrfs_end_io_wq { struct bio *bio; bio_end_io_t *end_io; void *private; struct btrfs_fs_info *info; int error; - int metadata; + enum btrfs_wq_endio_type metadata; struct list_head list; struct btrfs_work work; }; +static struct kmem_cache *btrfs_end_io_wq_cache; + +int __init btrfs_end_io_wq_init(void) +{ + btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq", + sizeof(struct btrfs_end_io_wq), + 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + NULL); + if (!btrfs_end_io_wq_cache) + return -ENOMEM; + return 0; +} + +void btrfs_end_io_wq_exit(void) +{ + if (btrfs_end_io_wq_cache) + kmem_cache_destroy(btrfs_end_io_wq_cache); +} + /* * async submit bios are used to offload expensive checksumming * onto the worker threads. 
They checksum file and metadata bios @@ -330,8 +347,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, { struct extent_state *cached_state = NULL; int ret; - bool need_lock = (current->journal_info == - (void *)BTRFS_SEND_TRANS_STUB); + bool need_lock = (current->journal_info == BTRFS_SEND_TRANS_STUB); if (!parent_transid || btrfs_header_generation(eb) == parent_transid) return 0; @@ -351,9 +367,9 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, ret = 0; goto out; } - printk_ratelimited("parent transid verify failed on %llu wanted %llu " - "found %llu\n", - eb->start, parent_transid, btrfs_header_generation(eb)); + printk_ratelimited(KERN_INFO "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n", + eb->fs_info->sb->s_id, eb->start, + parent_transid, btrfs_header_generation(eb)); ret = 1; /* @@ -610,22 +626,22 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, goto err; eb->read_mirror = mirror; - if (test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) { + if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) { ret = -EIO; goto err; } found_start = btrfs_header_bytenr(eb); if (found_start != eb->start) { - printk_ratelimited(KERN_INFO "BTRFS: bad tree block start " + printk_ratelimited(KERN_INFO "BTRFS (device %s): bad tree block start " "%llu %llu\n", - found_start, eb->start); + eb->fs_info->sb->s_id, found_start, eb->start); ret = -EIO; goto err; } if (check_tree_block_fsid(root, eb)) { - printk_ratelimited(KERN_INFO "BTRFS: bad fsid on block %llu\n", - eb->start); + printk_ratelimited(KERN_INFO "BTRFS (device %s): bad fsid on block %llu\n", + eb->fs_info->sb->s_id, eb->start); ret = -EIO; goto err; } @@ -683,7 +699,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror) struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; eb = (struct extent_buffer *)page->private; - set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); + set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); eb->read_mirror = failed_mirror; atomic_dec(&eb->io_pages); if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) @@ -693,52 +709,55 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror) static void end_workqueue_bio(struct bio *bio, int err) { - struct end_io_wq *end_io_wq = bio->bi_private; + struct btrfs_end_io_wq *end_io_wq = bio->bi_private; struct btrfs_fs_info *fs_info; + struct btrfs_workqueue *wq; + btrfs_work_func_t func; fs_info = end_io_wq->info; end_io_wq->error = err; - btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL); if (bio->bi_rw & REQ_WRITE) { - if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) - btrfs_queue_work(fs_info->endio_meta_write_workers, - &end_io_wq->work); - else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) - btrfs_queue_work(fs_info->endio_freespace_worker, - &end_io_wq->work); - else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) - btrfs_queue_work(fs_info->endio_raid56_workers, - &end_io_wq->work); - else - btrfs_queue_work(fs_info->endio_write_workers, - &end_io_wq->work); + if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) { + wq = fs_info->endio_meta_write_workers; + func = btrfs_endio_meta_write_helper; + } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) { + wq = fs_info->endio_freespace_worker; + func = btrfs_freespace_write_helper; + } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) { + wq = fs_info->endio_raid56_workers; + func = btrfs_endio_raid56_helper; + } else { + wq = fs_info->endio_write_workers; + func = 
btrfs_endio_write_helper; + } } else { - if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) - btrfs_queue_work(fs_info->endio_raid56_workers, - &end_io_wq->work); - else if (end_io_wq->metadata) - btrfs_queue_work(fs_info->endio_meta_workers, - &end_io_wq->work); - else - btrfs_queue_work(fs_info->endio_workers, - &end_io_wq->work); + if (unlikely(end_io_wq->metadata == + BTRFS_WQ_ENDIO_DIO_REPAIR)) { + wq = fs_info->endio_repair_workers; + func = btrfs_endio_repair_helper; + } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) { + wq = fs_info->endio_raid56_workers; + func = btrfs_endio_raid56_helper; + } else if (end_io_wq->metadata) { + wq = fs_info->endio_meta_workers; + func = btrfs_endio_meta_helper; + } else { + wq = fs_info->endio_workers; + func = btrfs_endio_helper; + } } + + btrfs_init_work(&end_io_wq->work, func, end_workqueue_fn, NULL, NULL); + btrfs_queue_work(wq, &end_io_wq->work); } -/* - * For the metadata arg you want - * - * 0 - if data - * 1 - if normal metadta - * 2 - if writing to the free space cache area - * 3 - raid parity work - */ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, - int metadata) + enum btrfs_wq_endio_type metadata) { - struct end_io_wq *end_io_wq; - end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS); + struct btrfs_end_io_wq *end_io_wq; + + end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS); if (!end_io_wq) return -ENOMEM; @@ -830,7 +849,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, async->submit_bio_start = submit_bio_start; async->submit_bio_done = submit_bio_done; - btrfs_init_work(&async->work, run_one_async_start, + btrfs_init_work(&async->work, btrfs_worker_helper, run_one_async_start, run_one_async_done, run_one_async_free); async->bio_flags = bio_flags; @@ -922,7 +941,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, * can happen in the async kernel threads */ ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, - bio, 1); + bio, BTRFS_WQ_ENDIO_METADATA); if (ret) goto out_w_error; ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, @@ -1054,20 +1073,17 @@ static const struct address_space_operations btree_aops = { .set_page_dirty = btree_set_page_dirty, }; -int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, - u64 parent_transid) +void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize) { struct extent_buffer *buf = NULL; struct inode *btree_inode = root->fs_info->btree_inode; - int ret = 0; buf = btrfs_find_create_tree_block(root, bytenr, blocksize); if (!buf) - return 0; + return; read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, buf, 0, WAIT_NONE, btree_get_extent, 0); free_extent_buffer(buf); - return ret; } int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, @@ -1103,7 +1119,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, } struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, - u64 bytenr, u32 blocksize) + u64 bytenr) { return find_extent_buffer(root->fs_info, bytenr); } @@ -1111,11 +1127,9 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize) { -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + if (btrfs_test_is_dummy_root(root)) return alloc_test_extent_buffer(root->fs_info, bytenr, blocksize); -#endif return 
alloc_extent_buffer(root->fs_info, bytenr, blocksize); } @@ -1133,12 +1147,12 @@ int btrfs_wait_tree_block_writeback(struct extent_buffer *buf) } struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, - u32 blocksize, u64 parent_transid) + u64 parent_transid) { struct extent_buffer *buf = NULL; int ret; - buf = btrfs_find_create_tree_block(root, bytenr, blocksize); + buf = btrfs_find_create_tree_block(root, bytenr, root->nodesize); if (!buf) return NULL; @@ -1180,7 +1194,7 @@ static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void) if (!writers) return ERR_PTR(-ENOMEM); - ret = percpu_counter_init(&writers->counter, 0); + ret = percpu_counter_init(&writers->counter, 0, GFP_KERNEL); if (ret < 0) { kfree(writers); return ERR_PTR(ret); @@ -1197,16 +1211,14 @@ btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers) kfree(writers); } -static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, - u32 stripesize, struct btrfs_root *root, - struct btrfs_fs_info *fs_info, +static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize, + struct btrfs_root *root, struct btrfs_fs_info *fs_info, u64 objectid) { root->node = NULL; root->commit_root = NULL; root->sectorsize = sectorsize; root->nodesize = nodesize; - root->leafsize = leafsize; root->stripesize = stripesize; root->state = 0; root->orphan_cleanup_state = 0; @@ -1292,7 +1304,7 @@ struct btrfs_root *btrfs_alloc_dummy_root(void) root = btrfs_alloc_root(NULL); if (!root) return ERR_PTR(-ENOMEM); - __setup_root(4096, 4096, 4096, 4096, root, NULL, 1); + __setup_root(4096, 4096, 4096, root, NULL, 1); set_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state); root->alloc_bytenr = 0; @@ -1315,15 +1327,13 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, if (!root) return ERR_PTR(-ENOMEM); - __setup_root(tree_root->nodesize, tree_root->leafsize, - tree_root->sectorsize, tree_root->stripesize, - root, fs_info, objectid); + __setup_root(tree_root->nodesize, tree_root->sectorsize, + tree_root->stripesize, root, fs_info, objectid); root->root_key.objectid = objectid; root->root_key.type = BTRFS_ROOT_ITEM_KEY; root->root_key.offset = 0; - leaf = btrfs_alloc_free_block(trans, root, root->leafsize, - 0, objectid, NULL, 0, 0, 0); + leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); leaf = NULL; @@ -1393,9 +1403,9 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, if (!root) return ERR_PTR(-ENOMEM); - __setup_root(tree_root->nodesize, tree_root->leafsize, - tree_root->sectorsize, tree_root->stripesize, - root, fs_info, BTRFS_TREE_LOG_OBJECTID); + __setup_root(tree_root->nodesize, tree_root->sectorsize, + tree_root->stripesize, root, fs_info, + BTRFS_TREE_LOG_OBJECTID); root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID; root->root_key.type = BTRFS_ROOT_ITEM_KEY; @@ -1410,9 +1420,8 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, * updated (along with back refs to the log tree). 
*/ - leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, - BTRFS_TREE_LOG_OBJECTID, NULL, - 0, 0, 0); + leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID, + NULL, 0, 0, 0); if (IS_ERR(leaf)) { kfree(root); return ERR_CAST(leaf); @@ -1462,7 +1471,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, btrfs_set_stack_inode_generation(inode_item, 1); btrfs_set_stack_inode_size(inode_item, 3); btrfs_set_stack_inode_nlink(inode_item, 1); - btrfs_set_stack_inode_nbytes(inode_item, root->leafsize); + btrfs_set_stack_inode_nbytes(inode_item, root->nodesize); btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755); btrfs_set_root_node(&log_root->root_item, log_root->node); @@ -1482,7 +1491,6 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, struct btrfs_fs_info *fs_info = tree_root->fs_info; struct btrfs_path *path; u64 generation; - u32 blocksize; int ret; path = btrfs_alloc_path(); @@ -1495,9 +1503,8 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, goto alloc_fail; } - __setup_root(tree_root->nodesize, tree_root->leafsize, - tree_root->sectorsize, tree_root->stripesize, - root, fs_info, key->objectid); + __setup_root(tree_root->nodesize, tree_root->sectorsize, + tree_root->stripesize, root, fs_info, key->objectid); ret = btrfs_find_root(tree_root, key, path, &root->root_item, &root->root_key); @@ -1508,9 +1515,8 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, } generation = btrfs_root_generation(&root->root_item); - blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), - blocksize, generation); + generation); if (!root->node) { ret = -ENOMEM; goto find_fail; @@ -1570,8 +1576,8 @@ int btrfs_init_fs_root(struct btrfs_root *root) root->subv_writers = writers; btrfs_init_free_ino_ctl(root); - spin_lock_init(&root->cache_lock); - init_waitqueue_head(&root->cache_wait); + spin_lock_init(&root->ino_cache_lock); + init_waitqueue_head(&root->ino_cache_wait); ret = get_anon_bdev(&root->anon_dev); if (ret) @@ -1705,10 +1711,6 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits) return ret; } -/* - * If this fails, caller must call bdi_destroy() to get rid of the - * bdi again. 
- */ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) { int err; @@ -1731,16 +1733,16 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) static void end_workqueue_fn(struct btrfs_work *work) { struct bio *bio; - struct end_io_wq *end_io_wq; + struct btrfs_end_io_wq *end_io_wq; int error; - end_io_wq = container_of(work, struct end_io_wq, work); + end_io_wq = container_of(work, struct btrfs_end_io_wq, work); bio = end_io_wq->bio; error = end_io_wq->error; bio->bi_private = end_io_wq->private; bio->bi_end_io = end_io_wq->end_io; - kfree(end_io_wq); + kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq); bio_endio_nodec(bio, error); } @@ -1769,6 +1771,7 @@ static int cleaner_kthread(void *arg) } btrfs_run_delayed_iputs(root); + btrfs_delete_unused_bgs(root->fs_info); again = btrfs_clean_one_deleted_snapshot(root); mutex_unlock(&root->fs_info->cleaner_mutex); @@ -2060,6 +2063,7 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) btrfs_destroy_workqueue(fs_info->endio_workers); btrfs_destroy_workqueue(fs_info->endio_meta_workers); btrfs_destroy_workqueue(fs_info->endio_raid56_workers); + btrfs_destroy_workqueue(fs_info->endio_repair_workers); btrfs_destroy_workqueue(fs_info->rmw_workers); btrfs_destroy_workqueue(fs_info->endio_meta_write_workers); btrfs_destroy_workqueue(fs_info->endio_write_workers); @@ -2140,8 +2144,6 @@ int open_ctree(struct super_block *sb, { u32 sectorsize; u32 nodesize; - u32 leafsize; - u32 blocksize; u32 stripesize; u64 generation; u64 features; @@ -2185,7 +2187,7 @@ int open_ctree(struct super_block *sb, goto fail_srcu; } - ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0); + ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); if (ret) { err = ret; goto fail_bdi; @@ -2193,13 +2195,13 @@ int open_ctree(struct super_block *sb, fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE * (1 + ilog2(nr_cpu_ids)); - ret = percpu_counter_init(&fs_info->delalloc_bytes, 0); + ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL); if (ret) { err = ret; goto fail_dirty_metadata_bytes; } - ret = percpu_counter_init(&fs_info->bio_counter, 0); + ret = percpu_counter_init(&fs_info->bio_counter, 0, GFP_KERNEL); if (ret) { err = ret; goto fail_delalloc_bytes; @@ -2230,6 +2232,7 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->super_lock); spin_lock_init(&fs_info->qgroup_op_lock); spin_lock_init(&fs_info->buffer_lock); + spin_lock_init(&fs_info->unused_bgs_lock); rwlock_init(&fs_info->tree_mod_log_lock); mutex_init(&fs_info->reloc_mutex); mutex_init(&fs_info->delalloc_root_mutex); @@ -2239,6 +2242,7 @@ int open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); INIT_LIST_HEAD(&fs_info->space_info); INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); + INIT_LIST_HEAD(&fs_info->unused_bgs); btrfs_mapping_init(&fs_info->mapping_tree); btrfs_init_block_rsv(&fs_info->global_block_rsv, BTRFS_BLOCK_RSV_GLOBAL); @@ -2257,7 +2261,7 @@ int open_ctree(struct super_block *sb, atomic_set(&fs_info->qgroup_op_seq, 0); atomic64_set(&fs_info->tree_mod_seq, 0); fs_info->sb = sb; - fs_info->max_inline = 8192 * 1024; + fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE; fs_info->metadata_ratio = 0; fs_info->defrag_inodes = RB_ROOT; fs_info->free_chunk_space = 0; @@ -2386,7 +2390,7 @@ int open_ctree(struct super_block *sb, goto fail_alloc; } - __setup_root(4096, 4096, 4096, 4096, tree_root, + __setup_root(4096, 4096, 4096, tree_root, fs_info, 
BTRFS_ROOT_TREE_OBJECTID); invalidate_bdev(fs_devices->latest_bdev); @@ -2466,19 +2470,22 @@ int open_ctree(struct super_block *sb, goto fail_alloc; } - if (btrfs_super_leafsize(disk_super) != + /* + * Leafsize and nodesize were always equal, this is only a sanity check. + */ + if (le32_to_cpu(disk_super->__unused_leafsize) != btrfs_super_nodesize(disk_super)) { printk(KERN_ERR "BTRFS: couldn't mount because metadata " "blocksizes don't match. node %d leaf %d\n", btrfs_super_nodesize(disk_super), - btrfs_super_leafsize(disk_super)); + le32_to_cpu(disk_super->__unused_leafsize)); err = -EINVAL; goto fail_alloc; } - if (btrfs_super_leafsize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) { + if (btrfs_super_nodesize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) { printk(KERN_ERR "BTRFS: couldn't mount because metadata " "blocksize (%d) was too large\n", - btrfs_super_leafsize(disk_super)); + btrfs_super_nodesize(disk_super)); err = -EINVAL; goto fail_alloc; } @@ -2495,17 +2502,16 @@ int open_ctree(struct super_block *sb, * flag our filesystem as having big metadata blocks if * they are bigger than the page size */ - if (btrfs_super_leafsize(disk_super) > PAGE_CACHE_SIZE) { + if (btrfs_super_nodesize(disk_super) > PAGE_CACHE_SIZE) { if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA)) printk(KERN_INFO "BTRFS: flagging fs with big metadata feature\n"); features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; } nodesize = btrfs_super_nodesize(disk_super); - leafsize = btrfs_super_leafsize(disk_super); sectorsize = btrfs_super_sectorsize(disk_super); stripesize = btrfs_super_stripesize(disk_super); - fs_info->dirty_metadata_batch = leafsize * (1 + ilog2(nr_cpu_ids)); + fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids)); fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids)); /* @@ -2513,7 +2519,7 @@ int open_ctree(struct super_block *sb, * extent buffers for the same range. 
It leads to corruptions */ if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && - (sectorsize != leafsize)) { + (sectorsize != nodesize)) { printk(KERN_WARNING "BTRFS: unequal leaf/node/sector sizes " "are not allowed for mixed block groups on %s\n", sb->s_id); @@ -2576,6 +2582,8 @@ int open_ctree(struct super_block *sb, btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2); fs_info->endio_raid56_workers = btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4); + fs_info->endio_repair_workers = + btrfs_alloc_workqueue("endio-repair", flags, 1, 0); fs_info->rmw_workers = btrfs_alloc_workqueue("rmw", flags, max_active, 2); fs_info->endio_write_workers = @@ -2597,11 +2605,12 @@ int open_ctree(struct super_block *sb, fs_info->submit_workers && fs_info->flush_workers && fs_info->endio_workers && fs_info->endio_meta_workers && fs_info->endio_meta_write_workers && + fs_info->endio_repair_workers && fs_info->endio_write_workers && fs_info->endio_raid56_workers && fs_info->endio_freespace_worker && fs_info->rmw_workers && fs_info->caching_workers && fs_info->readahead_workers && fs_info->fixup_workers && fs_info->delayed_workers && - fs_info->fixup_workers && fs_info->extent_workers && + fs_info->extent_workers && fs_info->qgroup_rescan_workers)) { err = -ENOMEM; goto fail_sb_buffer; @@ -2612,7 +2621,6 @@ int open_ctree(struct super_block *sb, 4 * 1024 * 1024 / PAGE_CACHE_SIZE); tree_root->nodesize = nodesize; - tree_root->leafsize = leafsize; tree_root->sectorsize = sectorsize; tree_root->stripesize = stripesize; @@ -2639,16 +2647,14 @@ int open_ctree(struct super_block *sb, goto fail_sb_buffer; } - blocksize = btrfs_level_size(tree_root, - btrfs_super_chunk_root_level(disk_super)); generation = btrfs_super_chunk_root_generation(disk_super); - __setup_root(nodesize, leafsize, sectorsize, stripesize, - chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID); + __setup_root(nodesize, sectorsize, stripesize, chunk_root, + fs_info, BTRFS_CHUNK_TREE_OBJECTID); chunk_root->node = read_tree_block(chunk_root, btrfs_super_chunk_root(disk_super), - blocksize, generation); + generation); if (!chunk_root->node || !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { printk(KERN_WARNING "BTRFS: failed to read chunk root on %s\n", @@ -2681,13 +2687,11 @@ int open_ctree(struct super_block *sb, } retry_root_backup: - blocksize = btrfs_level_size(tree_root, - btrfs_super_root_level(disk_super)); generation = btrfs_super_generation(disk_super); tree_root->node = read_tree_block(tree_root, btrfs_super_root(disk_super), - blocksize, generation); + generation); if (!tree_root->node || !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) { printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n", @@ -2856,9 +2860,6 @@ retry_root_backup: err = -EIO; goto fail_qgroup; } - blocksize = - btrfs_level_size(tree_root, - btrfs_super_log_root_level(disk_super)); log_tree_root = btrfs_alloc_root(fs_info); if (!log_tree_root) { @@ -2866,11 +2867,10 @@ retry_root_backup: goto fail_qgroup; } - __setup_root(nodesize, leafsize, sectorsize, stripesize, + __setup_root(nodesize, sectorsize, stripesize, log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID); log_tree_root->node = read_tree_block(tree_root, bytenr, - blocksize, generation + 1); if (!log_tree_root->node || !extent_buffer_uptodate(log_tree_root->node)) { @@ -2977,6 +2977,8 @@ retry_root_backup: fs_info->update_uuid_tree_gen = 1; } + fs_info->open = 1; + return 0; fail_qgroup: @@ -3136,7 +3138,8 @@ static int write_dev_supers(struct btrfs_device 
*device, for (i = 0; i < max_mirrors; i++) { bytenr = btrfs_sb_offset(i); - if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes) + if (bytenr + BTRFS_SUPER_INFO_SIZE >= + device->commit_total_bytes) break; if (wait) { @@ -3452,8 +3455,10 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) btrfs_set_stack_device_generation(dev_item, 0); btrfs_set_stack_device_type(dev_item, dev->type); btrfs_set_stack_device_id(dev_item, dev->devid); - btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes); - btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used); + btrfs_set_stack_device_total_bytes(dev_item, + dev->commit_total_bytes); + btrfs_set_stack_device_bytes_used(dev_item, + dev->commit_bytes_used); btrfs_set_stack_device_io_align(dev_item, dev->io_align); btrfs_set_stack_device_io_width(dev_item, dev->io_width); btrfs_set_stack_device_sector_size(dev_item, dev->sector_size); @@ -3528,7 +3533,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, static void free_fs_root(struct btrfs_root *root) { - iput(root->cache_inode); + iput(root->ino_cache_inode); WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); btrfs_free_block_rsv(root, root->orphan_block_rsv); root->orphan_block_rsv = NULL; @@ -3619,7 +3624,7 @@ int btrfs_commit_super(struct btrfs_root *root) return btrfs_commit_transaction(trans, root); } -int close_ctree(struct btrfs_root *root) +void close_ctree(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; int ret; @@ -3685,6 +3690,7 @@ int close_ctree(struct btrfs_root *root) invalidate_inode_pages2(fs_info->btree_inode->i_mapping); btrfs_stop_all_workers(fs_info); + fs_info->open = 0; free_root_pointers(fs_info, 1); iput(fs_info->btree_inode); @@ -3707,8 +3713,6 @@ int close_ctree(struct btrfs_root *root) btrfs_free_block_rsv(root, root->orphan_block_rsv); root->orphan_block_rsv = NULL; - - return 0; } int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, @@ -3810,10 +3814,73 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, int read_only) { + struct btrfs_super_block *sb = fs_info->super_copy; + int ret = 0; + + if (sb->root_level > BTRFS_MAX_LEVEL) { + printk(KERN_ERR "BTRFS: tree_root level too big: %d > %d\n", + sb->root_level, BTRFS_MAX_LEVEL); + ret = -EINVAL; + } + if (sb->chunk_root_level > BTRFS_MAX_LEVEL) { + printk(KERN_ERR "BTRFS: chunk_root level too big: %d > %d\n", + sb->chunk_root_level, BTRFS_MAX_LEVEL); + ret = -EINVAL; + } + if (sb->log_root_level > BTRFS_MAX_LEVEL) { + printk(KERN_ERR "BTRFS: log_root level too big: %d > %d\n", + sb->log_root_level, BTRFS_MAX_LEVEL); + ret = -EINVAL; + } + /* - * Placeholder for checks + * The common minimum: we don't know if we can trust the nodesize/sectorsize + * items yet; they'll be verified later. Issue just a warning. 
*/ - return 0; + if (!IS_ALIGNED(sb->root, 4096)) + printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n", + sb->root); + if (!IS_ALIGNED(sb->chunk_root, 4096)) + printk(KERN_WARNING "BTRFS: chunk_root block unaligned: %llu\n", + sb->chunk_root); + if (!IS_ALIGNED(sb->log_root, 4096)) + printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n", + sb->log_root); + + if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) { + printk(KERN_ERR "BTRFS: dev_item UUID does not match fsid: %pU != %pU\n", + fs_info->fsid, sb->dev_item.fsid); + ret = -EINVAL; + } + + /* + * Hint to catch really bogus numbers, bitflips or so; more exact checks are + * done later + */ + if (sb->num_devices > (1UL << 31)) + printk(KERN_WARNING "BTRFS: suspicious number of devices: %llu\n", + sb->num_devices); + + if (sb->bytenr != BTRFS_SUPER_INFO_OFFSET) { + printk(KERN_ERR "BTRFS: super offset mismatch %llu != %u\n", + sb->bytenr, BTRFS_SUPER_INFO_OFFSET); + ret = -EINVAL; + } + + /* + * The generation is a global counter; we'll trust it more than the others + * but it's still possible that it's the one that's wrong. + */ + if (sb->generation < sb->chunk_root_generation) + printk(KERN_WARNING + "BTRFS: suspicious: generation < chunk_root_generation: %llu < %llu\n", + sb->generation, sb->chunk_root_generation); + if (sb->generation < sb->cache_generation && sb->cache_generation != (u64)-1) + printk(KERN_WARNING + "BTRFS: suspicious: generation < cache_generation: %llu < %llu\n", + sb->generation, sb->cache_generation); + + return ret; } static void btrfs_error_commit_super(struct btrfs_root *root) @@ -3829,34 +3896,6 @@ static void btrfs_error_commit_super(struct btrfs_root *root) btrfs_cleanup_transaction(root); } -static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t, - struct btrfs_root *root) -{ - struct btrfs_inode *btrfs_inode; - struct list_head splice; - - INIT_LIST_HEAD(&splice); - - mutex_lock(&root->fs_info->ordered_operations_mutex); - spin_lock(&root->fs_info->ordered_root_lock); - - list_splice_init(&t->ordered_operations, &splice); - while (!list_empty(&splice)) { - btrfs_inode = list_entry(splice.next, struct btrfs_inode, - ordered_operations); - - list_del_init(&btrfs_inode->ordered_operations); - spin_unlock(&root->fs_info->ordered_root_lock); - - btrfs_invalidate_inodes(btrfs_inode->root); - - spin_lock(&root->fs_info->ordered_root_lock); - } - - spin_unlock(&root->fs_info->ordered_root_lock); - mutex_unlock(&root->fs_info->ordered_operations_mutex); -} - static void btrfs_destroy_ordered_extents(struct btrfs_root *root) { struct btrfs_ordered_extent *ordered; @@ -4033,9 +4072,8 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root, clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); while (start <= end) { - eb = btrfs_find_tree_block(root, start, - root->leafsize); - start += root->leafsize; + eb = btrfs_find_tree_block(root, start); + start += root->nodesize; if (!eb) continue; wait_on_extent_buffer_writeback(eb); @@ -4093,8 +4131,6 @@ again: void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, struct btrfs_root *root) { - btrfs_destroy_ordered_operations(cur_trans, root); - btrfs_destroy_delayed_refs(cur_trans, root); cur_trans->state = TRANS_STATE_COMMIT_START; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 23ce3ceba0a9..414651821fb3 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -25,11 +25,12 @@ #define BTRFS_SUPER_MIRROR_MAX 3 #define BTRFS_SUPER_MIRROR_SHIFT 12 -enum { +enum 
btrfs_wq_endio_type { BTRFS_WQ_ENDIO_DATA = 0, BTRFS_WQ_ENDIO_METADATA = 1, BTRFS_WQ_ENDIO_FREE_SPACE = 2, BTRFS_WQ_ENDIO_RAID56 = 3, + BTRFS_WQ_ENDIO_DIO_REPAIR = 4, }; static inline u64 btrfs_sb_offset(int mirror) @@ -44,9 +45,8 @@ struct btrfs_device; struct btrfs_fs_devices; struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, - u32 blocksize, u64 parent_transid); -int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, - u64 parent_transid); + u64 parent_transid); +void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, int mirror_num, struct extent_buffer **eb); struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, @@ -56,13 +56,13 @@ void clean_tree_block(struct btrfs_trans_handle *trans, int open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, char *options); -int close_ctree(struct btrfs_root *root); +void close_ctree(struct btrfs_root *root); int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root, int max_mirrors); struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); int btrfs_commit_super(struct btrfs_root *root); struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, - u64 bytenr, u32 blocksize); + u64 bytenr); struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, struct btrfs_key *location); int btrfs_init_fs_root(struct btrfs_root *root); @@ -119,7 +119,7 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid); u32 btrfs_csum_data(char *data, u32 seed, size_t len); void btrfs_csum_final(u32 crc, char *result); int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, - int metadata); + enum btrfs_wq_endio_type metadata); int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, int rw, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset, @@ -141,6 +141,8 @@ int btree_lock_page_hook(struct page *page, void *data, void (*flush_fn)(void *)); int btrfs_calc_num_tolerated_disk_barrier_failures( struct btrfs_fs_info *fs_info); +int __init btrfs_end_io_wq_init(void); +void btrfs_end_io_wq_exit(void); #ifdef CONFIG_DEBUG_LOCK_ALLOC void btrfs_init_lockdep(void); diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 41422a3de8ed..37d164540c3a 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -70,7 +70,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, return ERR_PTR(-ESTALE); key.objectid = root_objectid; - btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; index = srcu_read_lock(&fs_info->subvol_srcu); @@ -82,7 +82,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, } key.objectid = objectid; - btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; inode = btrfs_iget(sb, &key, root, NULL); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 813537f362f9..d56589571012 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -491,7 +491,7 @@ next: key.objectid); if (key.type == BTRFS_METADATA_ITEM_KEY) last = key.objectid + - fs_info->tree_root->leafsize; + fs_info->tree_root->nodesize; else last = key.objectid + key.offset; @@ -552,7 +552,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, caching_ctl->block_group = cache; caching_ctl->progress = 
cache->key.objectid; atomic_set(&caching_ctl->count, 1); - btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL); + btrfs_init_work(&caching_ctl->work, btrfs_cache_helper, + caching_thread, NULL, NULL); spin_lock(&cache->lock); /* @@ -764,7 +765,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, * different */ if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) { - offset = root->leafsize; + offset = root->nodesize; metadata = 0; } @@ -798,13 +799,13 @@ again: path->slots[0]); if (key.objectid == bytenr && key.type == BTRFS_EXTENT_ITEM_KEY && - key.offset == root->leafsize) + key.offset == root->nodesize) ret = 0; } if (ret) { key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = root->leafsize; + key.offset = root->nodesize; btrfs_release_path(path); goto again; } @@ -2650,7 +2651,7 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, num_bytes = btrfs_calc_trans_metadata_size(root, 1); num_heads = heads_to_leaves(root, num_heads); if (num_heads > 1) - num_bytes += (num_heads - 1) * root->leafsize; + num_bytes += (num_heads - 1) * root->nodesize; num_bytes <<= 1; global_rsv = &root->fs_info->global_block_rsv; @@ -2749,8 +2750,8 @@ int btrfs_async_run_delayed_refs(struct btrfs_root *root, async->sync = 0; init_completion(&async->wait); - btrfs_init_work(&async->work, delayed_ref_async_start, - NULL, NULL); + btrfs_init_work(&async->work, btrfs_extent_refs_helper, + delayed_ref_async_start, NULL, NULL); btrfs_queue_work(root->fs_info->extent_workers, &async->work); @@ -3057,7 +3058,7 @@ out: static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, - int full_backref, int inc, int no_quota) + int full_backref, int inc) { u64 bytenr; u64 num_bytes; @@ -3072,10 +3073,10 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, u64, u64, u64, u64, u64, u64, int); -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + + if (btrfs_test_is_dummy_root(root)) return 0; -#endif + ref_root = btrfs_header_owner(buf); nritems = btrfs_header_nritems(buf); level = btrfs_header_level(buf); @@ -3096,7 +3097,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, for (i = 0; i < nritems; i++) { if (level == 0) { btrfs_item_key_to_cpu(buf, &key, i); - if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) + if (key.type != BTRFS_EXTENT_DATA_KEY) continue; fi = btrfs_item_ptr(buf, i, struct btrfs_file_extent_item); @@ -3111,15 +3112,15 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, key.offset -= btrfs_file_extent_offset(buf, fi); ret = process_func(trans, root, bytenr, num_bytes, parent, ref_root, key.objectid, - key.offset, no_quota); + key.offset, 1); if (ret) goto fail; } else { bytenr = btrfs_node_blockptr(buf, i); - num_bytes = btrfs_level_size(root, level - 1); + num_bytes = root->nodesize; ret = process_func(trans, root, bytenr, num_bytes, parent, ref_root, level - 1, 0, - no_quota); + 1); if (ret) goto fail; } @@ -3130,15 +3131,15 @@ fail: } int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref, int no_quota) + struct extent_buffer *buf, int full_backref) { - return __btrfs_mod_ref(trans, root, buf, full_backref, 1, no_quota); + return __btrfs_mod_ref(trans, root, buf, full_backref, 1); } int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct 
btrfs_root *root, - struct extent_buffer *buf, int full_backref, int no_quota) + struct extent_buffer *buf, int full_backref) { - return __btrfs_mod_ref(trans, root, buf, full_backref, 0, no_quota); + return __btrfs_mod_ref(trans, root, buf, full_backref, 0); } static int write_one_cache_group(struct btrfs_trans_handle *trans, @@ -3493,7 +3494,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, if (!found) return -ENOMEM; - ret = percpu_counter_init(&found->total_bytes_pinned, 0); + ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL); if (ret) { kfree(found); return ret; @@ -3586,13 +3587,7 @@ static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) */ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) { - /* - * we add in the count of missing devices because we want - * to make sure that any RAID levels on a degraded FS - * continue to be honored. - */ - u64 num_devices = root->fs_info->fs_devices->rw_devices + - root->fs_info->fs_devices->missing_devices; + u64 num_devices = root->fs_info->fs_devices->rw_devices; u64 target; u64 tmp; @@ -4348,11 +4343,21 @@ static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, } static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info, - struct btrfs_fs_info *fs_info) + struct btrfs_fs_info *fs_info, + int flush_state) { u64 used; spin_lock(&space_info->lock); + /* + * We run out of space and have not got any free space via flush_space, + * so don't bother doing async reclaim. + */ + if (flush_state > COMMIT_TRANS && space_info->full) { + spin_unlock(&space_info->lock); + return 0; + } + used = space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_pinned + space_info->bytes_readonly + space_info->bytes_may_use; @@ -4385,11 +4390,12 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) flush_space(fs_info->fs_root, space_info, to_reclaim, to_reclaim, flush_state); flush_state++; - if (!btrfs_need_do_async_reclaim(space_info, fs_info)) + if (!btrfs_need_do_async_reclaim(space_info, fs_info, + flush_state)) return; } while (flush_state <= COMMIT_TRANS); - if (btrfs_need_do_async_reclaim(space_info, fs_info)) + if (btrfs_need_do_async_reclaim(space_info, fs_info, flush_state)) queue_work(system_unbound_wq, work); } @@ -4507,7 +4513,13 @@ again: space_info->flush = 1; } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { used += orig_bytes; - if (need_do_async_reclaim(space_info, root->fs_info, used) && + /* + * We will do the space reservation dance during log replay, + * which means we won't have fs_info->fs_root set, so don't do + * the async reclaim as we will panic. 
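For illustration, the retry policy that the new flush_state parameter introduces can be modeled in a few lines of userspace C. This is a sketch, not kernel code: space_info_model, need_async_reclaim and the first two enum values are invented stand-ins (only COMMIT_TRANS appears in the hunk above), but the early-exit condition mirrors the one added to btrfs_need_do_async_reclaim().

#include <stdbool.h>
#include <stdio.h>

enum flush_state_model { FLUSH_DELAYED = 1, FLUSH_DELALLOC, COMMIT_TRANS };

struct space_info_model {
	bool full;			/* no new chunk can be allocated */
	unsigned long long used, total;
};

/* Mirrors the early exit added to btrfs_need_do_async_reclaim(): once
 * every state up to and including the transaction commit has run and
 * the space_info is still full, retrying cannot free anything. */
static bool need_async_reclaim(struct space_info_model *si, int flush_state)
{
	if (flush_state > COMMIT_TRANS && si->full)
		return false;
	return si->used >= si->total;
}

int main(void)
{
	struct space_info_model si = { .full = true, .used = 100, .total = 100 };
	int flush_state = FLUSH_DELAYED;

	do {
		printf("flush_space(flush_state=%d)\n", flush_state);
		flush_state++;
		if (!need_async_reclaim(&si, flush_state))
			return 0;	/* space freed, or provably hopeless */
	} while (flush_state <= COMMIT_TRANS);
	return 0;
}

Without the flush_state check, a full filesystem would requeue the reclaim work forever even though no flush state can produce free space.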
+ */ + if (!root->fs_info->log_root_recovering && + need_do_async_reclaim(space_info, root->fs_info, used) && !work_busy(&root->fs_info->async_reclaim_work)) queue_work(system_unbound_wq, &root->fs_info->async_reclaim_work); @@ -4844,7 +4856,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) if (num_bytes * 3 > meta_used) num_bytes = div64_u64(meta_used, 3); - return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10); + return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10); } static void update_global_block_rsv(struct btrfs_fs_info *fs_info) @@ -4993,7 +5005,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, if (root->fs_info->quota_enabled) { /* One for parent inode, two for dir entries */ - num_bytes = 3 * root->leafsize; + num_bytes = 3 * root->nodesize; ret = btrfs_qgroup_reserve(root, num_bytes); if (ret) return ret; @@ -5181,7 +5193,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) if (root->fs_info->quota_enabled) { ret = btrfs_qgroup_reserve(root, num_bytes + - nr_extents * root->leafsize); + nr_extents * root->nodesize); if (ret) goto out_fail; } @@ -5190,7 +5202,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) if (unlikely(ret)) { if (root->fs_info->quota_enabled) btrfs_qgroup_free(root, num_bytes + - nr_extents * root->leafsize); + nr_extents * root->nodesize); goto out_fail; } @@ -5306,7 +5318,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) btrfs_ino(inode), to_free, 0); if (root->fs_info->quota_enabled) { btrfs_qgroup_free(root, num_bytes + - dropped * root->leafsize); + dropped * root->nodesize); } btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, @@ -5427,6 +5439,20 @@ static int update_block_group(struct btrfs_root *root, spin_unlock(&cache->space_info->lock); } else { old_val -= num_bytes; + + /* + * No longer have used bytes in this block group, queue + * it for deletion. + */ + if (old_val == 0) { + spin_lock(&info->unused_bgs_lock); + if (list_empty(&cache->bg_list)) { + btrfs_get_block_group(cache); + list_add_tail(&cache->bg_list, + &info->unused_bgs); + } + spin_unlock(&info->unused_bgs_lock); + } btrfs_set_block_group_used(&cache->item, old_val); cache->pinned += num_bytes; cache->space_info->bytes_pinned += num_bytes; @@ -6238,10 +6264,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, int ret; struct btrfs_fs_info *fs_info = root->fs_info; -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + if (btrfs_test_is_dummy_root(root)) return 0; -#endif + add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid); /* @@ -6268,14 +6293,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, return ret; } -static u64 stripe_align(struct btrfs_root *root, - struct btrfs_block_group_cache *cache, - u64 val, u64 num_bytes) -{ - u64 ret = ALIGN(val, root->stripesize); - return ret; -} - /* * when we wait for progress in the block group caching, its because * our allocation attempt failed at least once. 
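A side note on the update_space_info() hunk above: percpu_counter_init() grew a gfp_t argument in this cycle because the per-cpu backing storage is allocated inside init. The fragment below is a kernel-context sketch of the new calling convention (it does not build standalone, and example_counter_setup is an invented name for the sketch).

#include <linux/gfp.h>
#include <linux/percpu_counter.h>

/* example_counter_setup() is an invented name for this sketch. */
static int example_counter_setup(struct percpu_counter *c)
{
	int ret;

	/* The gfp_t argument is the new part: per-cpu storage is
	 * allocated inside init, so the caller states the allocation
	 * context.  GFP_KERNEL matches update_space_info(), which runs
	 * in process context and may sleep. */
	ret = percpu_counter_init(c, 0, GFP_KERNEL);
	if (ret)
		return ret;

	percpu_counter_inc(c);
	percpu_counter_destroy(c);
	return 0;
}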
So, we must sleep @@ -6469,7 +6486,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, bool have_caching_bg = false; WARN_ON(num_bytes < root->sectorsize); - btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); + ins->type = BTRFS_EXTENT_ITEM_KEY; ins->objectid = 0; ins->offset = 0; @@ -6756,8 +6773,7 @@ unclustered_alloc: goto loop; } checks: - search_start = stripe_align(root, block_group, - offset, num_bytes); + search_start = ALIGN(offset, root->stripesize); /* move on to the next group */ if (search_start + num_bytes > @@ -7082,7 +7098,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) { btrfs_free_and_pin_reserved_extent(root, ins->objectid, - root->leafsize); + root->nodesize); return -ENOMEM; } @@ -7091,7 +7107,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, ins, size); if (ret) { btrfs_free_and_pin_reserved_extent(root, ins->objectid, - root->leafsize); + root->nodesize); btrfs_free_path(path); return ret; } @@ -7106,7 +7122,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, if (skinny_metadata) { iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); - num_bytes = root->leafsize; + num_bytes = root->nodesize; } else { block_info = (struct btrfs_tree_block_info *)(extent_item + 1); btrfs_set_tree_block_key(leaf, block_info, key); @@ -7136,14 +7152,14 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, return ret; } - ret = update_block_group(root, ins->objectid, root->leafsize, 1); + ret = update_block_group(root, ins->objectid, root->nodesize, 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", ins->objectid, ins->offset); BUG(); } - trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->leafsize); + trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->nodesize); return ret; } @@ -7218,17 +7234,19 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_buffer_uptodate(buf); if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { + buf->log_index = root->log_transid % 2; /* * we allow two log transactions at a time, use different * EXENT bit to differentiate dirty pages. */ - if (root->log_transid % 2 == 0) + if (buf->log_index == 0) set_extent_dirty(&root->dirty_log_pages, buf->start, buf->start + buf->len - 1, GFP_NOFS); else set_extent_new(&root->dirty_log_pages, buf->start, buf->start + buf->len - 1, GFP_NOFS); } else { + buf->log_index = -1; set_extent_dirty(&trans->transaction->dirty_pages, buf->start, buf->start + buf->len - 1, GFP_NOFS); } @@ -7305,8 +7323,8 @@ static void unuse_block_rsv(struct btrfs_fs_info *fs_info, * * returns the tree buffer or NULL. 
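The hunk that follows renames btrfs_alloc_free_block() to btrfs_alloc_tree_block() and drops the blocksize parameter, since metadata blocks are now uniformly root->nodesize. A hypothetical call site, sketched rather than quoted from the patch, would change like this:

/* before: size passed explicitly, usually the node size anyway */
buf = btrfs_alloc_free_block(trans, root, root->nodesize, parent,
			     root_objectid, &key, level, hint, empty_size);

/* after: the size is implied by the root */
buf = btrfs_alloc_tree_block(trans, root, parent, root_objectid,
			     &key, level, hint, empty_size);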
*/ -struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u32 blocksize, +struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 parent, u64 root_objectid, struct btrfs_disk_key *key, int level, u64 hint, u64 empty_size) @@ -7316,18 +7334,18 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, struct extent_buffer *buf; u64 flags = 0; int ret; + u32 blocksize = root->nodesize; bool skinny_metadata = btrfs_fs_incompat(root->fs_info, SKINNY_METADATA); -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) { + if (btrfs_test_is_dummy_root(root)) { buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, blocksize, level); if (!IS_ERR(buf)) root->alloc_bytenr += blocksize; return buf; } -#endif + block_rsv = use_block_rsv(trans, root, blocksize); if (IS_ERR(block_rsv)) return ERR_CAST(block_rsv); @@ -7422,7 +7440,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, eb = path->nodes[wc->level]; nritems = btrfs_header_nritems(eb); - blocksize = btrfs_level_size(root, wc->level - 1); + blocksize = root->nodesize; for (slot = path->slots[wc->level]; slot < nritems; slot++) { if (nread >= wc->reada_count) @@ -7469,15 +7487,224 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, continue; } reada: - ret = readahead_tree_block(root, bytenr, blocksize, - generation); - if (ret) - break; + readahead_tree_block(root, bytenr, blocksize); nread++; } wc->reada_slot = slot; } +static int account_leaf_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *eb) +{ + int nr = btrfs_header_nritems(eb); + int i, extent_type, ret; + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + u64 bytenr, num_bytes; + + for (i = 0; i < nr; i++) { + btrfs_item_key_to_cpu(eb, &key, i); + + if (key.type != BTRFS_EXTENT_DATA_KEY) + continue; + + fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); + /* filter out non qgroup-accountable extents */ + extent_type = btrfs_file_extent_type(eb, fi); + + if (extent_type == BTRFS_FILE_EXTENT_INLINE) + continue; + + bytenr = btrfs_file_extent_disk_bytenr(eb, fi); + if (!bytenr) + continue; + + num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi); + + ret = btrfs_qgroup_record_ref(trans, root->fs_info, + root->objectid, + bytenr, num_bytes, + BTRFS_QGROUP_OPER_SUB_SUBTREE, 0); + if (ret) + return ret; + } + return 0; +} + +/* + * Walk up the tree from the bottom, freeing leaves and any interior + * nodes which have had all slots visited. If a node (leaf or + * interior) is freed, the node above it will have it's slot + * incremented. The root node will never be freed. + * + * At the end of this function, we should have a path which has all + * slots incremented to the next position for a search. If we need to + * read a new node it will be NULL and the node above it will have the + * correct slot selected for a later read. + * + * If we increment the root nodes slot counter past the number of + * elements, 1 is returned to signal completion of the search. 
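As a cross-check of the description above, the slot walk can be modeled over a toy array-shaped tree in userspace. Everything here is invented for the sketch (per-level item counts instead of real extent buffers, no locking or block reads); leaves are consumed whole, matching the level == 0 case of the real function.

#include <stdio.h>

#define ROOT_LEVEL 2

/* items per level; the leaf entry is unused: leaves are consumed whole */
static int nritems[ROOT_LEVEL + 1] = { 0, 2, 2 };
static int slots[ROOT_LEVEL + 1];

/* Returns 1 when the root slot runs past its item count (walk done),
 * 0 when a valid interior slot was found to descend from.  Resetting
 * the lower slots to zero stands in for freeing the node and reading
 * the next one on the way back down. */
static int adjust_slots_upwards(void)
{
	int level;

	for (level = 0; level <= ROOT_LEVEL; level++) {
		slots[level]++;
		if (level != 0 && slots[level] < nritems[level])
			return 0;
		if (level != ROOT_LEVEL)
			slots[level] = 0;
	}
	return 1;
}

int main(void)
{
	int leaves = 0;

	do {
		printf("account leaf at path %d/%d\n", slots[2], slots[1]);
		leaves++;
	} while (!adjust_slots_upwards());
	printf("walk finished after %d leaves\n", leaves);
	return 0;
}

The walk visits each leaf position exactly once and terminates when the root slot runs past its item count, just as the comment above describes.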
+ */ +static int adjust_slots_upwards(struct btrfs_root *root, + struct btrfs_path *path, int root_level) +{ + int level = 0; + int nr, slot; + struct extent_buffer *eb; + + if (root_level == 0) + return 1; + + while (level <= root_level) { + eb = path->nodes[level]; + nr = btrfs_header_nritems(eb); + path->slots[level]++; + slot = path->slots[level]; + if (slot >= nr || level == 0) { + /* + * Don't free the root - we will detect this + * condition after our loop and return a + * positive value for caller to stop walking the tree. + */ + if (level != root_level) { + btrfs_tree_unlock_rw(eb, path->locks[level]); + path->locks[level] = 0; + + free_extent_buffer(eb); + path->nodes[level] = NULL; + path->slots[level] = 0; + } + } else { + /* + * We have a valid slot to walk back down + * from. Stop here so caller can process these + * new nodes. + */ + break; + } + + level++; + } + + eb = path->nodes[root_level]; + if (path->slots[root_level] >= btrfs_header_nritems(eb)) + return 1; + + return 0; +} + +/* + * root_eb is the subtree root and is locked before this function is called. + */ +static int account_shared_subtree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *root_eb, + u64 root_gen, + int root_level) +{ + int ret = 0; + int level; + struct extent_buffer *eb = root_eb; + struct btrfs_path *path = NULL; + + BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL); + BUG_ON(root_eb == NULL); + + if (!root->fs_info->quota_enabled) + return 0; + + if (!extent_buffer_uptodate(root_eb)) { + ret = btrfs_read_buffer(root_eb, root_gen); + if (ret) + goto out; + } + + if (root_level == 0) { + ret = account_leaf_items(trans, root, root_eb); + goto out; + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* + * Walk down the tree. Missing extent blocks are filled in as + * we go. Metadata is accounted every time we read a new + * extent block. + * + * When we reach a leaf, we account for file extent items in it, + * walk back up the tree (adjusting slot pointers as we go) + * and restart the search process. + */ + extent_buffer_get(root_eb); /* For path */ + path->nodes[root_level] = root_eb; + path->slots[root_level] = 0; + path->locks[root_level] = 0; /* so release_path doesn't try to unlock */ +walk_down: + level = root_level; + while (level >= 0) { + if (path->nodes[level] == NULL) { + int parent_slot; + u64 child_gen; + u64 child_bytenr; + + /* We need to get child blockptr/gen from + * parent before we can read it. 
*/ + eb = path->nodes[level + 1]; + parent_slot = path->slots[level + 1]; + child_bytenr = btrfs_node_blockptr(eb, parent_slot); + child_gen = btrfs_node_ptr_generation(eb, parent_slot); + + eb = read_tree_block(root, child_bytenr, child_gen); + if (!eb || !extent_buffer_uptodate(eb)) { + ret = -EIO; + goto out; + } + + path->nodes[level] = eb; + path->slots[level] = 0; + + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + path->locks[level] = BTRFS_READ_LOCK_BLOCKING; + + ret = btrfs_qgroup_record_ref(trans, root->fs_info, + root->objectid, + child_bytenr, + root->nodesize, + BTRFS_QGROUP_OPER_SUB_SUBTREE, + 0); + if (ret) + goto out; + + } + + if (level == 0) { + ret = account_leaf_items(trans, root, path->nodes[level]); + if (ret) + goto out; + + /* Nonzero return here means we completed our search */ + ret = adjust_slots_upwards(root, path, root_level); + if (ret) + break; + + /* Restart search with new slots */ + goto walk_down; + } + + level--; + } + + ret = 0; +out: + btrfs_free_path(path); + + return ret; +} + /* * helper to process tree block while walking down the tree. * @@ -7532,9 +7759,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, /* wc->stage == UPDATE_BACKREF */ if (!(wc->flags[level] & flag)) { BUG_ON(!path->locks[level]); - ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc); + ret = btrfs_inc_ref(trans, root, eb, 1); BUG_ON(ret); /* -ENOMEM */ - ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc); + ret = btrfs_dec_ref(trans, root, eb, 0); BUG_ON(ret); /* -ENOMEM */ ret = btrfs_set_disk_extent_flags(trans, root, eb->start, eb->len, flag, @@ -7581,6 +7808,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, int level = wc->level; int reada = 0; int ret = 0; + bool need_account = false; generation = btrfs_node_ptr_generation(path->nodes[level], path->slots[level]); @@ -7596,9 +7824,9 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, } bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); - blocksize = btrfs_level_size(root, level - 1); + blocksize = root->nodesize; - next = btrfs_find_tree_block(root, bytenr, blocksize); + next = btrfs_find_tree_block(root, bytenr); if (!next) { next = btrfs_find_create_tree_block(root, bytenr, blocksize); if (!next) @@ -7626,6 +7854,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, if (wc->stage == DROP_REFERENCE) { if (wc->refs[level - 1] > 1) { + need_account = true; if (level == 1 && (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) goto skip; @@ -7659,7 +7888,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, if (!next) { if (reada && level == 1) reada_walk_down(trans, root, wc, path); - next = read_tree_block(root, bytenr, blocksize, generation); + next = read_tree_block(root, bytenr, generation); if (!next || !extent_buffer_uptodate(next)) { free_extent_buffer(next); return -EIO; @@ -7689,6 +7918,16 @@ skip: parent = 0; } + if (need_account) { + ret = account_shared_subtree(trans, root, next, + generation, level - 1); + if (ret) { + printk_ratelimited(KERN_ERR "BTRFS: %s Error " + "%d accounting shared subtree. 
Quota " + "is out of sync, rescan required.\n", + root->fs_info->sb->s_id, ret); + } + } ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, root->root_key.objectid, level - 1, 0, 0); BUG_ON(ret); /* -ENOMEM */ @@ -7769,12 +8008,17 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, if (wc->refs[level] == 1) { if (level == 0) { if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) - ret = btrfs_dec_ref(trans, root, eb, 1, - wc->for_reloc); + ret = btrfs_dec_ref(trans, root, eb, 1); else - ret = btrfs_dec_ref(trans, root, eb, 0, - wc->for_reloc); + ret = btrfs_dec_ref(trans, root, eb, 0); BUG_ON(ret); /* -ENOMEM */ + ret = account_leaf_items(trans, root, eb); + if (ret) { + printk_ratelimited(KERN_ERR "BTRFS: %s Error " + "%d accounting leaf items. Quota " + "is out of sync, rescan required.\n", + root->fs_info->sb->s_id, ret); + } } /* make block locked assertion in clean_tree_block happy */ if (!path->locks[level] && @@ -7900,6 +8144,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int level; bool root_dropped = false; + btrfs_debug(root->fs_info, "Drop subvolume %llu", root->objectid); + path = btrfs_alloc_path(); if (!path) { err = -ENOMEM; @@ -8025,6 +8271,24 @@ int btrfs_drop_snapshot(struct btrfs_root *root, goto out_end_trans; } + /* + * Qgroup update accounting is run from + * delayed ref handling. This usually works + * out because delayed refs are normally the + * only way qgroup updates are added. However, + * we may have added updates during our tree + * walk so run qgroups here to make sure we + * don't lose any updates. + */ + ret = btrfs_delayed_qgroup_accounting(trans, + root->fs_info); + if (ret) + printk_ratelimited(KERN_ERR "BTRFS: Failure %d " + "running qgroup updates " + "during snapshot delete. " + "Quota is out of sync, " + "rescan required.\n", ret); + btrfs_end_transaction_throttle(trans, tree_root); if (!for_reloc && btrfs_need_cleaner_sleep(root)) { pr_debug("BTRFS: drop snapshot early exit\n"); @@ -8078,6 +8342,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } root_dropped = true; out_end_trans: + ret = btrfs_delayed_qgroup_accounting(trans, tree_root->fs_info); + if (ret) + printk_ratelimited(KERN_ERR "BTRFS: Failure %d " + "running qgroup updates " + "during snapshot delete. " + "Quota is out of sync, " + "rescan required.\n", ret); + btrfs_end_transaction_throttle(trans, tree_root); out_free: kfree(wc); @@ -8181,13 +8453,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) if (stripped) return extended_to_chunk(stripped); - /* - * we add in the count of missing devices because we want - * to make sure that any RAID levels on a degraded FS - * continue to be honored. 
- */ - num_devices = root->fs_info->fs_devices->rw_devices + - root->fs_info->fs_devices->missing_devices; + num_devices = root->fs_info->fs_devices->rw_devices; stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | @@ -8605,6 +8871,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) } up_write(&info->commit_root_sem); + spin_lock(&info->unused_bgs_lock); + while (!list_empty(&info->unused_bgs)) { + block_group = list_first_entry(&info->unused_bgs, + struct btrfs_block_group_cache, + bg_list); + list_del_init(&block_group->bg_list); + btrfs_put_block_group(block_group); + } + spin_unlock(&info->unused_bgs_lock); + spin_lock(&info->block_group_cache_lock); while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { block_group = rb_entry(n, struct btrfs_block_group_cache, @@ -8739,7 +9015,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) init_rwsem(&cache->data_rwsem); INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->cluster_list); - INIT_LIST_HEAD(&cache->new_bg_list); + INIT_LIST_HEAD(&cache->bg_list); btrfs_init_free_space_ctl(cache); return cache; @@ -8761,7 +9037,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) root = info->extent_root; key.objectid = 0; key.offset = 0; - btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY); + key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -8880,8 +9156,18 @@ int btrfs_read_block_groups(struct btrfs_root *root) __link_block_group(space_info, cache); set_avail_alloc_bits(root->fs_info, cache->flags); - if (btrfs_chunk_readonly(root, cache->key.objectid)) + if (btrfs_chunk_readonly(root, cache->key.objectid)) { set_block_group_ro(cache, 1); + } else if (btrfs_block_group_used(&cache->item) == 0) { + spin_lock(&info->unused_bgs_lock); + /* Should always be true but just in case. */ + if (list_empty(&cache->bg_list)) { + btrfs_get_block_group(cache); + list_add_tail(&cache->bg_list, + &info->unused_bgs); + } + spin_unlock(&info->unused_bgs_lock); + } } list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) { @@ -8922,10 +9208,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, struct btrfs_key key; int ret = 0; - list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, - new_bg_list) { - list_del_init(&block_group->new_bg_list); - + list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { + list_del_init(&block_group->bg_list); if (ret) continue; @@ -9011,7 +9295,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, __link_block_group(cache->space_info, cache); - list_add_tail(&cache->new_bg_list, &trans->new_bgs); + list_add_tail(&cache->bg_list, &trans->new_bgs); set_avail_alloc_bits(extent_root->fs_info, type); @@ -9165,8 +9449,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, memcpy(&key, &block_group->key, sizeof(key)); - btrfs_clear_space_info_full(root->fs_info); - btrfs_put_block_group(block_group); btrfs_put_block_group(block_group); @@ -9182,6 +9464,101 @@ out: return ret; } +/* + * Process the unused_bgs list and remove any that don't have any allocated + * space inside of them. 
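The function below drains the list with a classic pattern: pop one entry while holding the spinlock, drop the lock for the heavyweight work (read-only flip, transaction join, chunk removal), then retake it before looking at the list again. A minimal userspace model of just that pattern, using POSIX primitives and invented names:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct bg {
	struct bg *next;
	int id;
};

static pthread_mutex_t unused_lock = PTHREAD_MUTEX_INITIALIZER;
static struct bg *unused_bgs;

static void delete_unused_bgs(void)
{
	pthread_mutex_lock(&unused_lock);
	while (unused_bgs) {
		struct bg *b = unused_bgs;

		unused_bgs = b->next;
		pthread_mutex_unlock(&unused_lock);

		/* Heavyweight work (transaction join, chunk removal in
		 * the real function) happens without the list lock. */
		printf("removing empty block group %d\n", b->id);
		free(b);

		pthread_mutex_lock(&unused_lock);
	}
	pthread_mutex_unlock(&unused_lock);
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct bg *b = malloc(sizeof(*b));

		b->id = i;
		b->next = unused_bgs;
		unused_bgs = b;
	}
	delete_unused_bgs();
	return 0;
}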
+ */ +void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_group_cache *block_group; + struct btrfs_space_info *space_info; + struct btrfs_root *root = fs_info->extent_root; + struct btrfs_trans_handle *trans; + int ret = 0; + + if (!fs_info->open) + return; + + spin_lock(&fs_info->unused_bgs_lock); + while (!list_empty(&fs_info->unused_bgs)) { + u64 start, end; + + block_group = list_first_entry(&fs_info->unused_bgs, + struct btrfs_block_group_cache, + bg_list); + space_info = block_group->space_info; + list_del_init(&block_group->bg_list); + if (ret || btrfs_mixed_space_info(space_info)) { + btrfs_put_block_group(block_group); + continue; + } + spin_unlock(&fs_info->unused_bgs_lock); + + /* Don't want to race with allocators so take the groups_sem */ + down_write(&space_info->groups_sem); + spin_lock(&block_group->lock); + if (block_group->reserved || + btrfs_block_group_used(&block_group->item) || + block_group->ro) { + /* + * We want to bail if we made new allocations or have + * outstanding allocations in this block group. We do + * the ro check in case balance is currently acting on + * this block group. + */ + spin_unlock(&block_group->lock); + up_write(&space_info->groups_sem); + goto next; + } + spin_unlock(&block_group->lock); + + /* We don't want to force the issue, only flip if it's ok. */ + ret = set_block_group_ro(block_group, 0); + up_write(&space_info->groups_sem); + if (ret < 0) { + ret = 0; + goto next; + } + + /* + * Want to do this before we do anything else so we can recover + * properly if we fail to join the transaction. + */ + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + btrfs_set_block_group_rw(root, block_group); + ret = PTR_ERR(trans); + goto next; + } + + /* + * We could have pending pinned extents for this block group, + * just delete them, we don't care about them anymore. + */ + start = block_group->key.objectid; + end = start + block_group->key.offset - 1; + clear_extent_bits(&fs_info->freed_extents[0], start, end, + EXTENT_DIRTY, GFP_NOFS); + clear_extent_bits(&fs_info->freed_extents[1], start, end, + EXTENT_DIRTY, GFP_NOFS); + + /* Reset pinned so btrfs_put_block_group doesn't complain */ + block_group->pinned = 0; + + /* + * Btrfs_remove_chunk will abort the transaction if things go + * horribly wrong. + */ + ret = btrfs_remove_chunk(trans, root, + block_group->key.objectid); + btrfs_end_transaction(trans, root); +next: + btrfs_put_block_group(block_group); + spin_lock(&fs_info->unused_bgs_lock); + } + spin_unlock(&fs_info->unused_bgs_lock); +} + int btrfs_init_space_info(struct btrfs_fs_info *fs_info) { struct btrfs_space_info *space_info; @@ -9313,7 +9690,7 @@ void btrfs_end_nocow_write(struct btrfs_root *root) int btrfs_start_nocow_write(struct btrfs_root *root) { - if (unlikely(atomic_read(&root->will_be_snapshoted))) + if (atomic_read(&root->will_be_snapshoted)) return 0; percpu_counter_inc(&root->subv_writers->counter); @@ -9321,7 +9698,7 @@ int btrfs_start_nocow_write(struct btrfs_root *root) * Make sure counter is updated before we check for snapshot creation. 
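The smp_mb() here pairs the writer-count publication with the re-read of will_be_snapshoted. A userspace model of the same publish-then-recheck idiom, using C11 seq_cst atomics in place of percpu_counter and the explicit barrier (not btrfs code):

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int writers;
static atomic_bool will_be_snapshoted;

static bool start_nocow_write(void)
{
	if (atomic_load(&will_be_snapshoted))
		return false;

	atomic_fetch_add(&writers, 1);
	/* seq_cst ordering of the fetch_add plays the role of smp_mb():
	 * the increment is visible before the flag is re-read, so the
	 * snapshot side cannot miss this writer. */
	if (atomic_load(&will_be_snapshoted)) {
		atomic_fetch_sub(&writers, 1);	/* end_nocow_write() */
		return false;
	}
	return true;
}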
*/ smp_mb(); - if (unlikely(atomic_read(&root->will_be_snapshoted))) { + if (atomic_read(&root->will_be_snapshoted)) { btrfs_end_nocow_write(root); return 0; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a389820d158b..bf3f424e0013 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -25,6 +25,11 @@ static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; static struct bio_set *btrfs_bioset; +static inline bool extent_state_in_tree(const struct extent_state *state) +{ + return !RB_EMPTY_NODE(&state->rb_node); +} + #ifdef CONFIG_BTRFS_DEBUG static LIST_HEAD(buffers); static LIST_HEAD(states); @@ -59,9 +64,9 @@ void btrfs_leak_debug_check(void) while (!list_empty(&states)) { state = list_entry(states.next, struct extent_state, leak_list); - printk(KERN_ERR "BTRFS: state leak: start %llu end %llu " - "state %lu in tree %p refs %d\n", - state->start, state->end, state->state, state->tree, + pr_err("BTRFS: state leak: start %llu end %llu state %lu in tree %d refs %d\n", + state->start, state->end, state->state, + extent_state_in_tree(state), atomic_read(&state->refs)); list_del(&state->leak_list); kmem_cache_free(extent_state_cache, state); @@ -209,7 +214,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask) return state; state->state = 0; state->private = 0; - state->tree = NULL; + RB_CLEAR_NODE(&state->rb_node); btrfs_leak_debug_add(&state->leak_list, &states); atomic_set(&state->refs, 1); init_waitqueue_head(&state->wq); @@ -222,7 +227,7 @@ void free_extent_state(struct extent_state *state) if (!state) return; if (atomic_dec_and_test(&state->refs)) { - WARN_ON(state->tree); + WARN_ON(extent_state_in_tree(state)); btrfs_leak_debug_del(&state->leak_list); trace_free_extent_state(state, _RET_IP_); kmem_cache_free(extent_state_cache, state); @@ -371,8 +376,8 @@ static void merge_state(struct extent_io_tree *tree, other->state == state->state) { merge_cb(tree, state, other); state->start = other->start; - other->tree = NULL; rb_erase(&other->rb_node, &tree->state); + RB_CLEAR_NODE(&other->rb_node); free_extent_state(other); } } @@ -383,8 +388,8 @@ static void merge_state(struct extent_io_tree *tree, other->state == state->state) { merge_cb(tree, state, other); state->end = other->end; - other->tree = NULL; rb_erase(&other->rb_node, &tree->state); + RB_CLEAR_NODE(&other->rb_node); free_extent_state(other); } } @@ -442,7 +447,6 @@ static int insert_state(struct extent_io_tree *tree, found->start, found->end, start, end); return -EEXIST; } - state->tree = tree; merge_state(tree, state); return 0; } @@ -486,7 +490,6 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, free_extent_state(prealloc); return -EEXIST; } - prealloc->tree = tree; return 0; } @@ -524,9 +527,9 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, wake_up(&state->wq); if (state->state == 0) { next = next_state(state); - if (state->tree) { + if (extent_state_in_tree(state)) { rb_erase(&state->rb_node, &tree->state); - state->tree = NULL; + RB_CLEAR_NODE(&state->rb_node); free_extent_state(state); } else { WARN_ON(1); @@ -606,8 +609,8 @@ again: cached_state = NULL; } - if (cached && cached->tree && cached->start <= start && - cached->end > start) { + if (cached && extent_state_in_tree(cached) && + cached->start <= start && cached->end > start) { if (clear) atomic_dec(&cached->refs); state = cached; @@ -843,7 +846,7 @@ again: if (cached_state && *cached_state) { state = *cached_state; if (state->start <= 
start && state->end > start && - state->tree) { + extent_state_in_tree(state)) { node = &state->rb_node; goto hit_next; } @@ -1069,7 +1072,7 @@ again: if (cached_state && *cached_state) { state = *cached_state; if (state->start <= start && state->end > start && - state->tree) { + extent_state_in_tree(state)) { node = &state->rb_node; goto hit_next; } @@ -1459,7 +1462,7 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start, spin_lock(&tree->lock); if (cached_state && *cached_state) { state = *cached_state; - if (state->end == start - 1 && state->tree) { + if (state->end == start - 1 && extent_state_in_tree(state)) { n = rb_next(&state->rb_node); while (n) { state = rb_entry(n, struct extent_state, @@ -1905,7 +1908,7 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, int bitset = 0; spin_lock(&tree->lock); - if (cached && cached->tree && cached->start <= start && + if (cached && extent_state_in_tree(cached) && cached->start <= start && cached->end > start) node = &cached->rb_node; else @@ -1959,27 +1962,7 @@ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) SetPageUptodate(page); } -/* - * When IO fails, either with EIO or csum verification fails, we - * try other mirrors that might have a good copy of the data. This - * io_failure_record is used to record state as we go through all the - * mirrors. If another mirror has good data, the page is set up to date - * and things continue. If a good mirror can't be found, the original - * bio end_io callback is called to indicate things have failed. - */ -struct io_failure_record { - struct page *page; - u64 start; - u64 len; - u64 logical; - unsigned long bio_flags; - int this_mirror; - int failed_mirror; - int in_validation; -}; - -static int free_io_failure(struct inode *inode, struct io_failure_record *rec, - int did_repair) +int free_io_failure(struct inode *inode, struct io_failure_record *rec) { int ret; int err = 0; @@ -2012,10 +1995,10 @@ static int free_io_failure(struct inode *inode, struct io_failure_record *rec, * currently, there can be no more than two copies of every data bit. thus, * exactly one rewrite is required. */ -int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, - u64 length, u64 logical, struct page *page, - int mirror_num) +int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, + struct page *page, unsigned int pg_offset, int mirror_num) { + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; struct bio *bio; struct btrfs_device *dev; u64 map_length = 0; @@ -2053,7 +2036,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, return -EIO; } bio->bi_bdev = dev->bdev; - bio_add_page(bio, page, length, start - page_offset(page)); + bio_add_page(bio, page, length, pg_offset); if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) { /* try to remap that extent elsewhere? 
*/ @@ -2063,10 +2046,9 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, } printk_ratelimited_in_rcu(KERN_INFO - "BTRFS: read error corrected: ino %lu off %llu " - "(dev %s sector %llu)\n", page->mapping->host->i_ino, - start, rcu_str_deref(dev->name), sector); - + "BTRFS: read error corrected: ino %llu off %llu (dev %s sector %llu)\n", + btrfs_ino(inode), start, + rcu_str_deref(dev->name), sector); bio_put(bio); return 0; } @@ -2082,9 +2064,11 @@ int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, return -EROFS; for (i = 0; i < num_pages; i++) { - struct page *p = extent_buffer_page(eb, i); - ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE, - start, p, mirror_num); + struct page *p = eb->pages[i]; + + ret = repair_io_failure(root->fs_info->btree_inode, start, + PAGE_CACHE_SIZE, start, p, + start - page_offset(p), mirror_num); if (ret) break; start += PAGE_CACHE_SIZE; @@ -2097,16 +2081,15 @@ int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, * each time an IO finishes, we do a fast check in the IO failure tree * to see if we need to process or clean up an io_failure_record */ -static int clean_io_failure(u64 start, struct page *page) +int clean_io_failure(struct inode *inode, u64 start, struct page *page, + unsigned int pg_offset) { u64 private; u64 private_failure; struct io_failure_record *failrec; - struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; struct extent_state *state; int num_copies; - int did_repair = 0; int ret; private = 0; @@ -2127,7 +2110,6 @@ static int clean_io_failure(u64 start, struct page *page) /* there was no real error, just free the record */ pr_debug("clean_io_failure: freeing dummy error at %llu\n", failrec->start); - did_repair = 1; goto out; } if (fs_info->sb->s_flags & MS_RDONLY) @@ -2144,55 +2126,70 @@ static int clean_io_failure(u64 start, struct page *page) num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len); if (num_copies > 1) { - ret = repair_io_failure(fs_info, start, failrec->len, - failrec->logical, page, - failrec->failed_mirror); - did_repair = !ret; + repair_io_failure(inode, start, failrec->len, + failrec->logical, page, + pg_offset, failrec->failed_mirror); } - ret = 0; } out: - if (!ret) - ret = free_io_failure(inode, failrec, did_repair); + free_io_failure(inode, failrec); - return ret; + return 0; } /* - * this is a generic handler for readpage errors (default - * readpage_io_failed_hook). if other copies exist, read those and write back - * good data to the failed position. 
does not investigate in remapping the - * failed extent elsewhere, hoping the device will be smart enough to do this as - * needed + * Can be called when + * - hold extent lock + * - under ordered extent + * - the inode is freeing */ +void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end) +{ + struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; + struct io_failure_record *failrec; + struct extent_state *state, *next; -static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, - struct page *page, u64 start, u64 end, - int failed_mirror) + if (RB_EMPTY_ROOT(&failure_tree->state)) + return; + + spin_lock(&failure_tree->lock); + state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY); + while (state) { + if (state->start > end) + break; + + ASSERT(state->end <= end); + + next = next_state(state); + + failrec = (struct io_failure_record *)state->private; + free_extent_state(state); + kfree(failrec); + + state = next; + } + spin_unlock(&failure_tree->lock); +} + +int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, + struct io_failure_record **failrec_ret) { - struct io_failure_record *failrec = NULL; + struct io_failure_record *failrec; u64 private; struct extent_map *em; - struct inode *inode = page->mapping->host; struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; - struct bio *bio; - struct btrfs_io_bio *btrfs_failed_bio; - struct btrfs_io_bio *btrfs_bio; - int num_copies; int ret; - int read_mode; u64 logical; - BUG_ON(failed_bio->bi_rw & REQ_WRITE); - ret = get_state_private(failure_tree, start, &private); if (ret) { failrec = kzalloc(sizeof(*failrec), GFP_NOFS); if (!failrec) return -ENOMEM; + failrec->start = start; failrec->len = end - start + 1; failrec->this_mirror = 0; @@ -2212,11 +2209,11 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, em = NULL; } read_unlock(&em_tree->lock); - if (!em) { kfree(failrec); return -EIO; } + logical = start - em->start; logical = em->block_start + logical; if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { @@ -2225,8 +2222,10 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, extent_set_compress_type(&failrec->bio_flags, em->compress_type); } - pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, " - "len=%llu\n", logical, start, failrec->len); + + pr_debug("Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu\n", + logical, start, failrec->len); + failrec->logical = logical; free_extent_map(em); @@ -2246,8 +2245,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, } } else { failrec = (struct io_failure_record *)(unsigned long)private; - pr_debug("bio_readpage_error: (found) logical=%llu, " - "start=%llu, len=%llu, validation=%d\n", + pr_debug("Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d\n", failrec->logical, failrec->start, failrec->len, failrec->in_validation); /* @@ -2256,6 +2254,17 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, * clean_io_failure() clean all those errors at once. 
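The btrfs_check_repairable() helper defined just below decides whether another mirror is worth trying. Its core mirror-selection policy, minus the two-bio in_validation dance, can be modeled in userspace as follows; next_mirror is an invented name for the sketch.

#include <stdio.h>

/* Pick the next mirror to try, skipping the one that already failed.
 * Returns 0 when no candidate is left (repair impossible). */
static int next_mirror(int this_mirror, int failed_mirror, int num_copies)
{
	if (num_copies == 1)
		return 0;		/* nowhere else to read from */

	this_mirror++;
	if (this_mirror == failed_mirror)
		this_mirror++;
	if (this_mirror > num_copies)
		return 0;		/* every copy was tried */
	return this_mirror;
}

int main(void)
{
	int m = 0;

	/* 3 copies, mirror 2 returned bad data: try 1, then 3. */
	while ((m = next_mirror(m, 2, 3)))
		printf("retry read from mirror %d\n", m);
	return 0;
}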
*/ } + + *failrec_ret = failrec; + + return 0; +} + +int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio, + struct io_failure_record *failrec, int failed_mirror) +{ + int num_copies; + num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info, failrec->logical, failrec->len); if (num_copies == 1) { @@ -2264,10 +2273,9 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, * all the retry and error correction code that follows. no * matter what the error is, it is very likely to persist. */ - pr_debug("bio_readpage_error: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n", + pr_debug("Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n", num_copies, failrec->this_mirror, failed_mirror); - free_io_failure(inode, failrec, 0); - return -EIO; + return 0; } /* @@ -2287,7 +2295,6 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, BUG_ON(failrec->in_validation); failrec->in_validation = 1; failrec->this_mirror = failed_mirror; - read_mode = READ_SYNC | REQ_FAILFAST_DEV; } else { /* * we're ready to fulfill a) and b) alongside. get a good copy @@ -2303,25 +2310,36 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, failrec->this_mirror++; if (failrec->this_mirror == failed_mirror) failrec->this_mirror++; - read_mode = READ_SYNC; } if (failrec->this_mirror > num_copies) { - pr_debug("bio_readpage_error: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n", + pr_debug("Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n", num_copies, failrec->this_mirror, failed_mirror); - free_io_failure(inode, failrec, 0); - return -EIO; + return 0; } + return 1; +} + + +struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, + struct io_failure_record *failrec, + struct page *page, int pg_offset, int icsum, + bio_end_io_t *endio_func, void *data) +{ + struct bio *bio; + struct btrfs_io_bio *btrfs_failed_bio; + struct btrfs_io_bio *btrfs_bio; + bio = btrfs_io_bio_alloc(GFP_NOFS, 1); - if (!bio) { - free_io_failure(inode, failrec, 0); - return -EIO; - } - bio->bi_end_io = failed_bio->bi_end_io; + if (!bio) + return NULL; + + bio->bi_end_io = endio_func; bio->bi_iter.bi_sector = failrec->logical >> 9; bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; bio->bi_iter.bi_size = 0; + bio->bi_private = data; btrfs_failed_bio = btrfs_io_bio(failed_bio); if (btrfs_failed_bio->csum) { @@ -2330,21 +2348,73 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, btrfs_bio = btrfs_io_bio(bio); btrfs_bio->csum = btrfs_bio->csum_inline; - phy_offset >>= inode->i_sb->s_blocksize_bits; - phy_offset *= csum_size; - memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + phy_offset, + icsum *= csum_size; + memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + icsum, csum_size); } - bio_add_page(bio, page, failrec->len, start - page_offset(page)); + bio_add_page(bio, page, failrec->len, pg_offset); + + return bio; +} + +/* + * this is a generic handler for readpage errors (default + * readpage_io_failed_hook). if other copies exist, read those and write back + * good data to the failed position. 
does not investigate in remapping the + * failed extent elsewhere, hoping the device will be smart enough to do this as + * needed + */ + +static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, + struct page *page, u64 start, u64 end, + int failed_mirror) +{ + struct io_failure_record *failrec; + struct inode *inode = page->mapping->host; + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; + struct bio *bio; + int read_mode; + int ret; + + BUG_ON(failed_bio->bi_rw & REQ_WRITE); + + ret = btrfs_get_io_failure_record(inode, start, end, &failrec); + if (ret) + return ret; + + ret = btrfs_check_repairable(inode, failed_bio, failrec, failed_mirror); + if (!ret) { + free_io_failure(inode, failrec); + return -EIO; + } + + if (failed_bio->bi_vcnt > 1) + read_mode = READ_SYNC | REQ_FAILFAST_DEV; + else + read_mode = READ_SYNC; + + phy_offset >>= inode->i_sb->s_blocksize_bits; + bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page, + start - page_offset(page), + (int)phy_offset, failed_bio->bi_end_io, + NULL); + if (!bio) { + free_io_failure(inode, failrec); + return -EIO; + } - pr_debug("bio_readpage_error: submitting new read[%#x] to " - "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode, - failrec->this_mirror, num_copies, failrec->in_validation); + pr_debug("Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d\n", + read_mode, failrec->this_mirror, failrec->in_validation); ret = tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror, failrec->bio_flags, 0); + if (ret) { + free_io_failure(inode, failrec); + bio_put(bio); + } + return ret; } @@ -2469,7 +2539,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) struct inode *inode = page->mapping->host; pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, " - "mirror=%lu\n", (u64)bio->bi_iter.bi_sector, err, + "mirror=%u\n", (u64)bio->bi_iter.bi_sector, err, io_bio->mirror_num); tree = &BTRFS_I(inode)->io_tree; @@ -2503,7 +2573,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) if (ret) uptodate = 0; else - clean_io_failure(start, page); + clean_io_failure(inode, start, page, 0); } if (likely(uptodate)) @@ -2532,6 +2602,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) test_bit(BIO_UPTODATE, &bio->bi_flags); if (err) uptodate = 0; + offset += len; continue; } } @@ -2539,12 +2610,12 @@ readpage_ok: if (likely(uptodate)) { loff_t i_size = i_size_read(inode); pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; - unsigned offset; + unsigned off; /* Zero out the end if this page straddles i_size */ - offset = i_size & (PAGE_CACHE_SIZE-1); - if (page->index == end_index && offset) - zero_user_segment(page, offset, PAGE_CACHE_SIZE); + off = i_size & (PAGE_CACHE_SIZE-1); + if (page->index == end_index && off) + zero_user_segment(page, off, PAGE_CACHE_SIZE); SetPageUptodate(page); } else { ClearPageUptodate(page); @@ -2617,9 +2688,18 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask) { - return bio_clone_bioset(bio, gfp_mask, btrfs_bioset); -} + struct btrfs_io_bio *btrfs_bio; + struct bio *new; + new = bio_clone_bioset(bio, gfp_mask, btrfs_bioset); + if (new) { + btrfs_bio = btrfs_io_bio(new); + btrfs_bio->csum = NULL; + btrfs_bio->csum_allocated = NULL; + btrfs_bio->end_io = NULL; + } + return new; +} /* this also allocates from the btrfs_bioset */ struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) @@ -3437,16 
+3517,10 @@ done_unlocked: return 0; } -static int eb_wait(void *word) -{ - io_schedule(); - return 0; -} - void wait_on_extent_buffer_writeback(struct extent_buffer *eb) { - wait_on_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK, eb_wait, - TASK_UNINTERRUPTIBLE); + wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK, + TASK_UNINTERRUPTIBLE); } static noinline_for_stack int @@ -3506,7 +3580,7 @@ lock_extent_buffer_for_io(struct extent_buffer *eb, num_pages = num_extent_pages(eb->start, eb->len); for (i = 0; i < num_pages; i++) { - struct page *p = extent_buffer_page(eb, i); + struct page *p = eb->pages[i]; if (!trylock_page(p)) { if (!flush) { @@ -3527,6 +3601,68 @@ static void end_extent_buffer_writeback(struct extent_buffer *eb) wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); } +static void set_btree_ioerr(struct page *page) +{ + struct extent_buffer *eb = (struct extent_buffer *)page->private; + struct btrfs_inode *btree_ino = BTRFS_I(eb->fs_info->btree_inode); + + SetPageError(page); + if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) + return; + + /* + * If writeback for a btree extent that doesn't belong to a log tree + * failed, increment the counter transaction->eb_write_errors. + * We do this because while the transaction is running and before it's + * committing (when we call filemap_fdata[write|wait]_range against + * the btree inode), we might have + * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it + * returns an error or an error happens during writeback, when we're + * committing the transaction we wouldn't know about it, since the pages + * can be no longer dirty nor marked anymore for writeback (if a + * subsequent modification to the extent buffer didn't happen before the + * transaction commit), which makes filemap_fdata[write|wait]_range not + * able to find the pages tagged with SetPageError at transaction + * commit time. So if this happens we must abort the transaction, + * otherwise we commit a super block with btree roots that point to + * btree nodes/leafs whose content on disk is invalid - either garbage + * or the content of some node/leaf from a past generation that got + * cowed or deleted and is no longer valid. + * + * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would + * not be enough - we need to distinguish between log tree extents vs + * non-log tree extents, and the next filemap_fdatawait_range() call + * will catch and clear such errors in the mapping - and that call might + * be from a log sync and not from a transaction commit. Also, checking + * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is + * not done and would not be reliable - the eb might have been released + * from memory and reading it back again means that flag would not be + * set (since it's a runtime flag, not persisted on disk). + * + * Using the flags below in the btree inode also makes us achieve the + * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started + * writeback for all dirty pages and before filemap_fdatawait_range() + * is called, the writeback for all dirty pages had already finished + * with errors - because we were not using AS_EIO/AS_ENOSPC, + * filemap_fdatawait_range() would return success, as it could not know + * that writeback errors happened (the pages were no longer tagged for + * writeback). 
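The switch that follows routes a write error to one of three inode runtime flags keyed by eb->log_index. A userspace model of that routing (enum values and function name invented for the sketch; the real flags are the BTRFS_INODE_BTREE_*ERR bits set below):

#include <assert.h>
#include <stdio.h>

enum { BTREE_ERR, BTREE_LOG1_ERR, BTREE_LOG2_ERR };
static unsigned long runtime_flags;

static void record_btree_write_error(short log_index)
{
	switch (log_index) {
	case -1:	/* ordinary (non-log) tree block */
		runtime_flags |= 1UL << BTREE_ERR;
		break;
	case 0:
		runtime_flags |= 1UL << BTREE_LOG1_ERR;
		break;
	case 1:
		runtime_flags |= 1UL << BTREE_LOG2_ERR;
		break;
	default:
		assert(0);	/* unexpected, logic error */
	}
}

int main(void)
{
	record_btree_write_error(0);
	/* A sync of log 2 is unaffected by an error in log 1. */
	printf("log2 clean: %d\n", !(runtime_flags & (1UL << BTREE_LOG2_ERR)));
	return 0;
}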
+ */ + switch (eb->log_index) { + case -1: + set_bit(BTRFS_INODE_BTREE_ERR, &btree_ino->runtime_flags); + break; + case 0: + set_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags); + break; + case 1: + set_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags); + break; + default: + BUG(); /* unexpected, logic error */ + } +} + static void end_bio_extent_buffer_writepage(struct bio *bio, int err) { struct bio_vec *bvec; @@ -3540,10 +3676,9 @@ static void end_bio_extent_buffer_writepage(struct bio *bio, int err) BUG_ON(!eb); done = atomic_dec_and_test(&eb->io_pages); - if (err || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) { - set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); + if (err || test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) { ClearPageUptodate(page); - SetPageError(page); + set_btree_ioerr(page); } end_page_writeback(page); @@ -3570,14 +3705,14 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, int rw = (epd->sync_io ? WRITE_SYNC : WRITE) | REQ_META; int ret = 0; - clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); + clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); num_pages = num_extent_pages(eb->start, eb->len); atomic_set(&eb->io_pages, num_pages); if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID) bio_flags = EXTENT_BIO_TREE_LOG; for (i = 0; i < num_pages; i++) { - struct page *p = extent_buffer_page(eb, i); + struct page *p = eb->pages[i]; clear_page_dirty_for_io(p); set_page_writeback(p); @@ -3587,8 +3722,8 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, 0, epd->bio_flags, bio_flags); epd->bio_flags = bio_flags; if (ret) { - set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); - SetPageError(p); + set_btree_ioerr(p); + end_page_writeback(p); if (atomic_sub_and_test(num_pages - i, &eb->io_pages)) end_extent_buffer_writeback(eb); ret = -EIO; @@ -3601,7 +3736,8 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, if (unlikely(ret)) { for (; i < num_pages; i++) { - struct page *p = extent_buffer_page(eb, i); + struct page *p = eb->pages[i]; + clear_page_dirty_for_io(p); unlock_page(p); } } @@ -4171,19 +4307,6 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode, return NULL; } -static noinline int count_ext_ref(u64 inum, u64 offset, u64 root_id, void *ctx) -{ - unsigned long cnt = *((unsigned long *)ctx); - - cnt++; - *((unsigned long *)ctx) = cnt; - - /* Now we're sure that the extent is shared. */ - if (cnt > 1) - return 1; - return 0; -} - int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len, get_extent_t *get_extent) { @@ -4200,6 +4323,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, struct extent_map *em = NULL; struct extent_state *cached_state = NULL; struct btrfs_path *path; + struct btrfs_root *root = BTRFS_I(inode)->root; int end = 0; u64 em_start = 0; u64 em_len = 0; @@ -4213,15 +4337,15 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return -ENOMEM; path->leave_spinning = 1; - start = ALIGN(start, BTRFS_I(inode)->root->sectorsize); - len = ALIGN(len, BTRFS_I(inode)->root->sectorsize); + start = round_down(start, BTRFS_I(inode)->root->sectorsize); + len = round_up(max, BTRFS_I(inode)->root->sectorsize) - start; /* * lookup the last file extent. 
We're not using i_size here * because there might be preallocation past i_size */ - ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root, - path, btrfs_ino(inode), -1, 0); + ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1, + 0); if (ret < 0) { btrfs_free_path(path); return ret; @@ -4229,7 +4353,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, WARN_ON(!ret); path->slots[0]--; btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); - found_type = btrfs_key_type(&found_key); + found_type = found_key.type; /* No extents, but there might be delalloc bits */ if (found_key.objectid != btrfs_ino(inode) || @@ -4314,25 +4438,27 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } else if (em->block_start == EXTENT_MAP_DELALLOC) { flags |= (FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN); - } else { - unsigned long ref_cnt = 0; + } else if (fieinfo->fi_extents_max) { + u64 bytenr = em->block_start - + (em->start - em->orig_start); disko = em->block_start + offset_in_extent; /* * As btrfs supports shared space, this information * can be exported to userspace tools via - * flag FIEMAP_EXTENT_SHARED. + * flag FIEMAP_EXTENT_SHARED. If fi_extents_max == 0 + * then we're just getting a count and we can skip the + * lookup stuff. */ - ret = iterate_inodes_from_logical( - em->block_start, - BTRFS_I(inode)->root->fs_info, - path, count_ext_ref, &ref_cnt); - if (ret < 0 && ret != -ENOENT) + ret = btrfs_check_shared(NULL, root->fs_info, + root->objectid, + btrfs_ino(inode), bytenr); + if (ret < 0) goto out_free; - - if (ref_cnt > 1) + if (ret) flags |= FIEMAP_EXTENT_SHARED; + ret = 0; } if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) flags |= FIEMAP_EXTENT_ENCODED; @@ -4386,24 +4512,21 @@ int extent_buffer_under_io(struct extent_buffer *eb) /* * Helper for releasing extent buffer page. 
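Earlier in this fiemap hunk the queried range is realigned with round_down()/round_up() so that extent lookups always operate on whole sectors. A self-contained sketch of that arithmetic, with simplified power-of-two-only macros mirroring the kernel's:

#include <stdio.h>

#define round_down(x, y)	((x) & ~((unsigned long long)(y) - 1))
#define round_up(x, y)		round_down((x) + (y) - 1, (y))

int main(void)
{
	unsigned long long start = 4500, len = 4000, sectorsize = 4096;
	unsigned long long max = start + len;	/* end of requested range */

	start = round_down(start, sectorsize);
	len = round_up(max, sectorsize) - start;
	printf("aligned range: start=%llu len=%llu\n", start, len);
	/* -> start=4096 len=8192, i.e. the two whole blocks that
	 *    contain the requested bytes */
	return 0;
}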
*/ -static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, - unsigned long start_idx) +static void btrfs_release_extent_buffer_page(struct extent_buffer *eb) { unsigned long index; - unsigned long num_pages; struct page *page; int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags); BUG_ON(extent_buffer_under_io(eb)); - num_pages = num_extent_pages(eb->start, eb->len); - index = start_idx + num_pages; - if (start_idx >= index) + index = num_extent_pages(eb->start, eb->len); + if (index == 0) return; do { index--; - page = extent_buffer_page(eb, index); + page = eb->pages[index]; if (page && mapped) { spin_lock(&page->mapping->private_lock); /* @@ -4434,7 +4557,7 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, /* One for when we alloced the page */ page_cache_release(page); } - } while (index != start_idx); + } while (index != 0); } /* @@ -4442,7 +4565,7 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, */ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) { - btrfs_release_extent_buffer_page(eb, 0); + btrfs_release_extent_buffer_page(eb); __free_extent_buffer(eb); } @@ -4585,7 +4708,8 @@ static void mark_extent_buffer_accessed(struct extent_buffer *eb, num_pages = num_extent_pages(eb->start, eb->len); for (i = 0; i < num_pages; i++) { - struct page *p = extent_buffer_page(eb, i); + struct page *p = eb->pages[i]; + if (p != accessed) mark_page_accessed(p); } @@ -4754,7 +4878,7 @@ again: */ SetPageChecked(eb->pages[0]); for (i = 1; i < num_pages; i++) { - p = extent_buffer_page(eb, i); + p = eb->pages[i]; ClearPageChecked(p); unlock_page(p); } @@ -4799,7 +4923,7 @@ static int release_extent_buffer(struct extent_buffer *eb) } /* Should be safe to release our pages at this point */ - btrfs_release_extent_buffer_page(eb, 0); + btrfs_release_extent_buffer_page(eb); call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); return 1; } @@ -4865,7 +4989,7 @@ void clear_extent_buffer_dirty(struct extent_buffer *eb) num_pages = num_extent_pages(eb->start, eb->len); for (i = 0; i < num_pages; i++) { - page = extent_buffer_page(eb, i); + page = eb->pages[i]; if (!PageDirty(page)) continue; @@ -4901,7 +5025,7 @@ int set_extent_buffer_dirty(struct extent_buffer *eb) WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); for (i = 0; i < num_pages; i++) - set_page_dirty(extent_buffer_page(eb, i)); + set_page_dirty(eb->pages[i]); return was_dirty; } @@ -4914,7 +5038,7 @@ int clear_extent_buffer_uptodate(struct extent_buffer *eb) clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); num_pages = num_extent_pages(eb->start, eb->len); for (i = 0; i < num_pages; i++) { - page = extent_buffer_page(eb, i); + page = eb->pages[i]; if (page) ClearPageUptodate(page); } @@ -4930,7 +5054,7 @@ int set_extent_buffer_uptodate(struct extent_buffer *eb) set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); num_pages = num_extent_pages(eb->start, eb->len); for (i = 0; i < num_pages; i++) { - page = extent_buffer_page(eb, i); + page = eb->pages[i]; SetPageUptodate(page); } return 0; @@ -4970,7 +5094,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, num_pages = num_extent_pages(eb->start, eb->len); for (i = start_i; i < num_pages; i++) { - page = extent_buffer_page(eb, i); + page = eb->pages[i]; if (wait == WAIT_NONE) { if (!trylock_page(page)) goto unlock_exit; @@ -4989,11 +5113,11 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, goto unlock_exit; } - clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); + 
clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); eb->read_mirror = 0; atomic_set(&eb->io_pages, num_reads); for (i = start_i; i < num_pages; i++) { - page = extent_buffer_page(eb, i); + page = eb->pages[i]; if (!PageUptodate(page)) { ClearPageError(page); err = __extent_read_full_page(tree, page, @@ -5018,7 +5142,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, return ret; for (i = start_i; i < num_pages; i++) { - page = extent_buffer_page(eb, i); + page = eb->pages[i]; wait_on_page_locked(page); if (!PageUptodate(page)) ret = -EIO; @@ -5029,7 +5153,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, unlock_exit: i = start_i; while (locked_pages > 0) { - page = extent_buffer_page(eb, i); + page = eb->pages[i]; i++; unlock_page(page); locked_pages--; @@ -5055,7 +5179,7 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv, offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); while (len > 0) { - page = extent_buffer_page(eb, i); + page = eb->pages[i]; cur = min(len, (PAGE_CACHE_SIZE - offset)); kaddr = page_address(page); @@ -5087,7 +5211,7 @@ int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv, offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); while (len > 0) { - page = extent_buffer_page(eb, i); + page = eb->pages[i]; cur = min(len, (PAGE_CACHE_SIZE - offset)); kaddr = page_address(page); @@ -5136,7 +5260,7 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, return -EINVAL; } - p = extent_buffer_page(eb, i); + p = eb->pages[i]; kaddr = page_address(p); *map = kaddr + offset; *map_len = PAGE_CACHE_SIZE - offset; @@ -5162,7 +5286,7 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); while (len > 0) { - page = extent_buffer_page(eb, i); + page = eb->pages[i]; cur = min(len, (PAGE_CACHE_SIZE - offset)); @@ -5196,7 +5320,7 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv, offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); while (len > 0) { - page = extent_buffer_page(eb, i); + page = eb->pages[i]; WARN_ON(!PageUptodate(page)); cur = min(len, PAGE_CACHE_SIZE - offset); @@ -5226,7 +5350,7 @@ void memset_extent_buffer(struct extent_buffer *eb, char c, offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); while (len > 0) { - page = extent_buffer_page(eb, i); + page = eb->pages[i]; WARN_ON(!PageUptodate(page)); cur = min(len, PAGE_CACHE_SIZE - offset); @@ -5257,7 +5381,7 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, (PAGE_CACHE_SIZE - 1); while (len > 0) { - page = extent_buffer_page(dst, i); + page = dst->pages[i]; WARN_ON(!PageUptodate(page)); cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); @@ -5335,8 +5459,7 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, cur = min_t(unsigned long, cur, (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page)); - copy_pages(extent_buffer_page(dst, dst_i), - extent_buffer_page(dst, src_i), + copy_pages(dst->pages[dst_i], dst->pages[src_i], dst_off_in_page, src_off_in_page, cur); src_offset += cur; @@ -5382,8 +5505,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, cur = min_t(unsigned long, len, src_off_in_page + 1); cur = min(cur, dst_off_in_page + 1); - copy_pages(extent_buffer_page(dst, dst_i), - extent_buffer_page(dst, src_i), + copy_pages(dst->pages[dst_i], dst->pages[src_i], dst_off_in_page - cur + 1, src_off_in_page - cur + 1, cur); diff --git 
a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index ccc264e7bde1..6d4b938be986 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -11,8 +11,6 @@ #define EXTENT_NEW (1 << 4) #define EXTENT_DELALLOC (1 << 5) #define EXTENT_DEFRAG (1 << 6) -#define EXTENT_DEFRAG_DONE (1 << 7) -#define EXTENT_BUFFER_FILLED (1 << 8) #define EXTENT_BOUNDARY (1 << 9) #define EXTENT_NODATASUM (1 << 10) #define EXTENT_DO_ACCOUNTING (1 << 11) @@ -34,16 +32,16 @@ /* these are bit numbers for test/set bit */ #define EXTENT_BUFFER_UPTODATE 0 -#define EXTENT_BUFFER_BLOCKING 1 #define EXTENT_BUFFER_DIRTY 2 #define EXTENT_BUFFER_CORRUPT 3 #define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */ #define EXTENT_BUFFER_TREE_REF 5 #define EXTENT_BUFFER_STALE 6 #define EXTENT_BUFFER_WRITEBACK 7 -#define EXTENT_BUFFER_IOERR 8 +#define EXTENT_BUFFER_READ_ERR 8 /* read IO error */ #define EXTENT_BUFFER_DUMMY 9 #define EXTENT_BUFFER_IN_TREE 10 +#define EXTENT_BUFFER_WRITE_ERR 11 /* write IO error */ /* these are flags for extent_clear_unlock_delalloc */ #define PAGE_UNLOCK (1 << 0) @@ -57,7 +55,6 @@ * map has page->private set to one. */ #define EXTENT_PAGE_PRIVATE 1 -#define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3 struct extent_state; struct btrfs_root; @@ -108,7 +105,6 @@ struct extent_state { struct rb_node rb_node; /* ADD NEW ELEMENTS AFTER THIS */ - struct extent_io_tree *tree; wait_queue_head_t wq; atomic_t refs; unsigned long state; @@ -126,8 +122,6 @@ struct extent_state { struct extent_buffer { u64 start; unsigned long len; - unsigned long map_start; - unsigned long map_len; unsigned long bflags; struct btrfs_fs_info *fs_info; spinlock_t refs_lock; @@ -144,7 +138,9 @@ struct extent_buffer { atomic_t blocking_readers; atomic_t spinning_readers; atomic_t spinning_writers; - int lock_nested; + short lock_nested; + /* >= 0 if eb belongs to a log tree, -1 otherwise */ + short log_index; /* protects write locks */ rwlock_t lock; @@ -286,12 +282,6 @@ static inline unsigned long num_extent_pages(u64 start, u64 len) (start >> PAGE_CACHE_SHIFT); } -static inline struct page *extent_buffer_page(struct extent_buffer *eb, - unsigned long i) -{ - return eb->pages[i]; -} - static inline void extent_buffer_get(struct extent_buffer *eb) { atomic_inc(&eb->refs); @@ -341,18 +331,50 @@ struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask); struct btrfs_fs_info; -int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, - u64 length, u64 logical, struct page *page, - int mirror_num); +int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, + struct page *page, unsigned int pg_offset, + int mirror_num); +int clean_io_failure(struct inode *inode, u64 start, struct page *page, + unsigned int pg_offset); int end_extent_writepage(struct page *page, int err, u64 start, u64 end); int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, int mirror_num); + +/* + * When IO fails, either with EIO or csum verification fails, we + * try other mirrors that might have a good copy of the data. This + * io_failure_record is used to record state as we go through all the + * mirrors. If another mirror has good data, the page is set up to date + * and things continue. If a good mirror can't be found, the original + * bio end_io callback is called to indicate things have failed. 
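[How the helpers declared below are meant to chain together, as an editor's sketch rather than code from this patch (the function name and locals here are invented; the real consumers are the read-repair paths in extent_io.c and inode.c):

	static int repair_one_range_sketch(struct inode *inode,
					   struct bio *failed_bio,
					   struct page *page, u64 start,
					   u64 end, int failed_mirror)
	{
		struct io_failure_record *failrec;
		struct bio *bio;
		int ret;

		/* find or create the record tracking retries for this range */
		ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
		if (ret)
			return ret;

		/* returns 0 once every mirror has been tried in vain */
		if (!btrfs_check_repairable(inode, failed_bio, failrec,
					    failed_mirror)) {
			free_io_failure(inode, failrec);
			return -EIO;
		}

		bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
					      0, 0, failed_bio->bi_end_io,
					      NULL);
		if (!bio) {
			free_io_failure(inode, failrec);
			return -EIO;
		}
		/*
		 * The caller then submits bio to failrec->this_mirror; a
		 * subsequent good read ends with clean_io_failure(), which
		 * rewrites the bad mirror and drops the record.
		 */
		return 0;
	}
]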
+ */ +struct io_failure_record { + struct page *page; + u64 start; + u64 len; + u64 logical; + unsigned long bio_flags; + int this_mirror; + int failed_mirror; + int in_validation; +}; + +void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end); +int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, + struct io_failure_record **failrec_ret); +int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio, + struct io_failure_record *failrec, int fail_mirror); +struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, + struct io_failure_record *failrec, + struct page *page, int pg_offset, int icsum, + bio_end_io_t *endio_func, void *data); +int free_io_failure(struct inode *inode, struct io_failure_record *rec); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS noinline u64 find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree, struct page *locked_page, u64 *start, u64 *end, u64 max_bytes); +#endif struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, unsigned long len); #endif -#endif diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index f46cfe45d686..783a94355efd 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -55,7 +55,7 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, return -ENOMEM; file_key.objectid = objectid; file_key.offset = pos; - btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); + file_key.type = BTRFS_EXTENT_DATA_KEY; path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, root, path, &file_key, @@ -100,7 +100,7 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans, file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; file_key.offset = bytenr; - btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY); + file_key.type = BTRFS_EXTENT_CSUM_KEY; ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow); if (ret < 0) goto fail; @@ -111,7 +111,7 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans, goto fail; path->slots[0]--; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY) + if (found_key.type != BTRFS_EXTENT_CSUM_KEY) goto fail; csum_offset = (bytenr - found_key.offset) >> @@ -148,7 +148,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, file_key.objectid = objectid; file_key.offset = offset; - btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); + file_key.type = BTRFS_EXTENT_DATA_KEY; ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow); return ret; } @@ -299,19 +299,9 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, } int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, - struct btrfs_dio_private *dip, struct bio *bio, - u64 offset) + struct bio *bio, u64 offset) { - int len = (bio->bi_iter.bi_sector << 9) - dip->disk_bytenr; - u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); - int ret; - - len >>= inode->i_sb->s_blocksize_bits; - len *= csum_size; - - ret = __btrfs_lookup_bio_sums(root, inode, bio, offset, - (u32 *)(dip->csum + len), 1); - return ret; + return __btrfs_lookup_bio_sums(root, inode, bio, offset, NULL, 1); } int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, @@ -329,8 +319,8 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, u64 csum_end; u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); - ASSERT(start == ALIGN(start, root->sectorsize) && - (end + 1) == ALIGN(end + 1, 
root->sectorsize)); + ASSERT(IS_ALIGNED(start, root->sectorsize) && + IS_ALIGNED(end + 1, root->sectorsize)); path = btrfs_alloc_path(); if (!path) @@ -720,7 +710,7 @@ again: bytenr = sums->bytenr + total_bytes; file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; file_key.offset = bytenr; - btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY); + file_key.type = BTRFS_EXTENT_CSUM_KEY; item = btrfs_lookup_csum(trans, root, path, bytenr, 1); if (!IS_ERR(item)) { @@ -756,7 +746,7 @@ again: found_next = 1; if (ret != 0) goto insert; - slot = 0; + slot = path->slots[0]; } btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot); if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || @@ -790,7 +780,7 @@ again: csum_offset = (bytenr - found_key.offset) >> root->fs_info->sb->s_blocksize_bits; - if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY || + if (found_key.type != BTRFS_EXTENT_CSUM_KEY || found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || csum_offset >= MAX_CSUM_ITEMS(root, csum_size)) { goto insert; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 1f2b99cb55ea..a18ceabd99a8 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -299,7 +299,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, /* get the inode */ key.objectid = defrag->root; - btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; index = srcu_read_lock(&fs_info->subvol_srcu); @@ -311,7 +311,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, } key.objectid = defrag->ino; - btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); if (IS_ERR(inode)) { @@ -452,7 +452,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, if (unlikely(copied == 0)) break; - if (unlikely(copied < PAGE_CACHE_SIZE - offset)) { + if (copied < PAGE_CACHE_SIZE - offset) { offset += copied; } else { pg++; @@ -1481,9 +1481,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, bool force_page_uptodate = false; bool need_unlock; - nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / - PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / - (sizeof(struct page *))); + nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_CACHE_SIZE), + PAGE_CACHE_SIZE / (sizeof(struct page *))); nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied); nrptrs = max(nrptrs, 8); pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); @@ -1497,8 +1496,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, size_t write_bytes = min(iov_iter_count(i), nrptrs * (size_t)PAGE_CACHE_SIZE - offset); - size_t num_pages = (write_bytes + offset + - PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + size_t num_pages = DIV_ROUND_UP(write_bytes + offset, + PAGE_CACHE_SIZE); size_t reserve_bytes; size_t dirty_pages; size_t copied; @@ -1526,9 +1525,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, * our prealloc extent may be smaller than * write_bytes, so scale down. 
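[For reference, the two helpers these cleanups switch to, IS_ALIGNED above and DIV_ROUND_UP below, are defined in include/linux/kernel.h of this era as:

	#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
	#define IS_ALIGNED(x, a)	(((x) & ((typeof(x))(a) - 1)) == 0)

so with the power-of-two PAGE_CACHE_SIZE, DIV_ROUND_UP(write_bytes + offset, PAGE_CACHE_SIZE) computes exactly the old (write_bytes + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT, just stated directly.]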
*/ - num_pages = (write_bytes + offset + - PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; + num_pages = DIV_ROUND_UP(write_bytes + offset, + PAGE_CACHE_SIZE); reserve_bytes = num_pages << PAGE_CACHE_SHIFT; ret = 0; } else { @@ -1590,9 +1588,8 @@ again: dirty_pages = 0; } else { force_page_uptodate = false; - dirty_pages = (copied + offset + - PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; + dirty_pages = DIV_ROUND_UP(copied + offset, + PAGE_CACHE_SIZE); } /* @@ -1653,7 +1650,7 @@ again: cond_resched(); balance_dirty_pages_ratelimited(inode->i_mapping); - if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) + if (dirty_pages < (root->nodesize >> PAGE_CACHE_SHIFT) + 1) btrfs_btree_balance_dirty(root); pos += copied; @@ -1795,7 +1792,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, if (sync) atomic_inc(&BTRFS_I(inode)->sync_writers); - if (unlikely(file->f_flags & O_DIRECT)) { + if (file->f_flags & O_DIRECT) { num_written = __btrfs_direct_write(iocb, from, pos); } else { num_written = __btrfs_buffered_write(file, from, pos); @@ -1838,6 +1835,8 @@ out: int btrfs_release_file(struct inode *inode, struct file *filp) { + if (filp->private_data) + btrfs_ioctl_trans_end(filp); /* * ordered_data_close is set by setattr when we are about to truncate * a file from a non-zero size to a zero size. This tries to @@ -1845,29 +1844,25 @@ int btrfs_release_file(struct inode *inode, struct file *filp) * application were using truncate to replace a file in place. */ if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, - &BTRFS_I(inode)->runtime_flags)) { - struct btrfs_trans_handle *trans; - struct btrfs_root *root = BTRFS_I(inode)->root; - - /* - * We need to block on a committing transaction to keep us from - * throwing a ordered operation on to the list and causing - * something like sync to deadlock trying to flush out this - * inode. - */ - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) - return PTR_ERR(trans); - btrfs_add_ordered_operation(trans, BTRFS_I(inode)->root, inode); - btrfs_end_transaction(trans, root); - if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) + &BTRFS_I(inode)->runtime_flags)) filemap_flush(inode->i_mapping); - } - if (filp->private_data) - btrfs_ioctl_trans_end(filp); return 0; } +static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end) +{ + int ret; + + atomic_inc(&BTRFS_I(inode)->sync_writers); + ret = filemap_fdatawrite_range(inode->i_mapping, start, end); + if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + ret = filemap_fdatawrite_range(inode->i_mapping, start, end); + atomic_dec(&BTRFS_I(inode)->sync_writers); + + return ret; +} + /* * fsync call for both files and directories. This logs the inode into * the tree log instead of forcing full commits whenever possible. @@ -1897,30 +1892,64 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * multi-task, and make the performance up. See * btrfs_wait_ordered_range for an explanation of the ASYNC check. */ - atomic_inc(&BTRFS_I(inode)->sync_writers); - ret = filemap_fdatawrite_range(inode->i_mapping, start, end); - if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) - ret = filemap_fdatawrite_range(inode->i_mapping, start, end); - atomic_dec(&BTRFS_I(inode)->sync_writers); + ret = start_ordered_ops(inode, start, end); if (ret) return ret; mutex_lock(&inode->i_mutex); - - /* - * We flush the dirty pages again to avoid some dirty pages in the - * range being left.
- */ atomic_inc(&root->log_batch); full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); + /* + * We might have had more pages made dirty after calling + * start_ordered_ops and before acquiring the inode's i_mutex. + */ if (full_sync) { + /* + * For a full sync, we need to make sure any ordered operations + * start and finish before we start logging the inode, so that + * all extents are persisted and the respective file extent + * items are in the fs/subvol btree. + */ ret = btrfs_wait_ordered_range(inode, start, end - start + 1); - if (ret) { - mutex_unlock(&inode->i_mutex); - goto out; - } + } else { + /* + * Start any new ordered operations before starting to log the + * inode. We will wait for them to finish in btrfs_sync_log(). + * + * Right before acquiring the inode's mutex, we might have new + * writes dirtying pages, which won't immediately start the + * respective ordered operations - that is done through the + * fill_delalloc callbacks invoked from the writepage and + * writepages address space operations. So make sure we start + * all ordered operations before starting to log our inode. Not + * doing this means that while logging the inode, writeback + * could start and invoke writepage/writepages, which would call + * the fill_delalloc callbacks (cow_file_range, + * submit_compressed_extents). These callbacks add first an + * extent map to the modified list of extents and then create + * the respective ordered operation, which means in + * tree-log.c:btrfs_log_inode() we might capture all existing + * ordered operations (with btrfs_get_logged_extents()) before + * the fill_delalloc callback adds its ordered operation, and by + * the time we visit the modified list of extent maps (with + * btrfs_log_changed_extents()), we see and process the extent + * map they created. We then use the extent map to construct a + * file extent item for logging without waiting for the + * respective ordered operation to finish - this file extent + * item points to a disk location that might not have yet been + * written to, containing random data - so after a crash a log + * replay will make our inode have file extent items that point + * to disk locations containing invalid data, as we returned + * success to userspace without waiting for the respective + * ordered operation to finish, because it wasn't captured by + * btrfs_get_logged_extents(). + */ + ret = start_ordered_ops(inode, start, end); + } + if (ret) { + mutex_unlock(&inode->i_mutex); + goto out; } atomic_inc(&root->log_batch); @@ -1982,7 +2011,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) btrfs_init_log_ctx(&ctx); - ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx); + ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx); if (ret < 0) { /* Fallthrough and commit/free transaction. */ ret = 1; @@ -2000,6 +2029,25 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ mutex_unlock(&inode->i_mutex); + /* + * If any of the ordered extents had an error, just return it to user + * space, so that the application knows some writes didn't succeed and + * can take proper action (e.g., retry). Blindly committing the + * transaction in this case would fool userspace that everything was + * successful.
And we also want to make sure our log doesn't contain + * file extent items pointing to extents that weren't fully written to - + * just like in the non-fast fsync path, where we check for the ordered + * operation's error flag before writing to the log tree and return -EIO + * if any of them had this flag set (btrfs_wait_ordered_range) - + * therefore we need to check for errors in the ordered operations, + * which are indicated by ctx.io_err. + */ + if (ctx.io_err) { + btrfs_end_transaction(trans, root); + ret = ctx.io_err; + goto out; + } + if (ret != BTRFS_NO_LOG_SYNC) { if (!ret) { ret = btrfs_sync_log(trans, root, &ctx); @@ -2112,10 +2160,9 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode, goto out; } - if (hole_mergeable(inode, leaf, path->slots[0]+1, offset, end)) { + if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) { u64 num_bytes; - path->slots[0]++; key.offset = offset; btrfs_set_item_key_safe(root, path, &key); fi = btrfs_item_ptr(leaf, path->slots[0], @@ -2240,7 +2287,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) goto out_only_mutex; } - lockstart = round_up(offset , BTRFS_I(inode)->root->sectorsize); + lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize); lockend = round_down(offset + len, BTRFS_I(inode)->root->sectorsize) - 1; same_page = ((offset >> PAGE_CACHE_SHIFT) == @@ -2301,7 +2348,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) tail_start + tail_len, 0, 1); if (ret) goto out_only_mutex; - } + } } } @@ -2638,23 +2685,28 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence) struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_map *em = NULL; struct extent_state *cached_state = NULL; - u64 lockstart = *offset; - u64 lockend = i_size_read(inode); - u64 start = *offset; - u64 len = i_size_read(inode); + u64 lockstart; + u64 lockend; + u64 start; + u64 len; int ret = 0; - lockend = max_t(u64, root->sectorsize, lockend); + if (inode->i_size == 0) + return -ENXIO; + + /* + * *offset can be negative; in that case we start finding DATA/HOLE from + * the very start of the file.
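[find_desired_extent() is what backs lseek(2)'s SEEK_DATA/SEEK_HOLE on btrfs, and the -ENXIO above is the errno applications see for an empty file. A small userspace sketch of the interface (assumes glibc, which exposes SEEK_DATA/SEEK_HOLE under _GNU_SOURCE):

	#define _GNU_SOURCE	/* SEEK_DATA, SEEK_HOLE */
	#include <stdio.h>
	#include <unistd.h>

	static void dump_data_ranges(int fd)
	{
		off_t data = 0, hole;

		for (;;) {
			data = lseek(fd, data, SEEK_DATA);
			if (data < 0)	/* ENXIO: no data at or after offset */
				break;
			hole = lseek(fd, data, SEEK_HOLE);
			if (hole < 0)
				break;
			printf("data: [%lld, %lld)\n",
			       (long long)data, (long long)hole);
			data = hole;
		}
	}
]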
+ */ + start = max_t(loff_t, 0, *offset); + + lockstart = round_down(start, root->sectorsize); + lockend = round_up(i_size_read(inode), root->sectorsize); if (lockend <= lockstart) lockend = lockstart + root->sectorsize; - lockend--; len = lockend - lockstart + 1; - len = max_t(u64, len, root->sectorsize); - if (inode->i_size == 0) - return -ENXIO; - lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0, &cached_state); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 2b0a627cb5f9..33848196550e 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -279,8 +279,7 @@ static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode, int num_pages; int check_crcs = 0; - num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; + num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE); if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID) check_crcs = 1; @@ -1998,6 +1997,128 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, return merged; } +static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, + bool update_stat) +{ + struct btrfs_free_space *bitmap; + unsigned long i; + unsigned long j; + const u64 end = info->offset + info->bytes; + const u64 bitmap_offset = offset_to_bitmap(ctl, end); + u64 bytes; + + bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0); + if (!bitmap) + return false; + + i = offset_to_bit(bitmap->offset, ctl->unit, end); + j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i); + if (j == i) + return false; + bytes = (j - i) * ctl->unit; + info->bytes += bytes; + + if (update_stat) + bitmap_clear_bits(ctl, bitmap, end, bytes); + else + __bitmap_clear_bits(ctl, bitmap, end, bytes); + + if (!bitmap->bytes) + free_bitmap(ctl, bitmap); + + return true; +} + +static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, + bool update_stat) +{ + struct btrfs_free_space *bitmap; + u64 bitmap_offset; + unsigned long i; + unsigned long j; + unsigned long prev_j; + u64 bytes; + + bitmap_offset = offset_to_bitmap(ctl, info->offset); + /* If we're on a boundary, try the previous logical bitmap. */ + if (bitmap_offset == info->offset) { + if (info->offset == 0) + return false; + bitmap_offset = offset_to_bitmap(ctl, info->offset - 1); + } + + bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0); + if (!bitmap) + return false; + + i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1; + j = 0; + prev_j = (unsigned long)-1; + for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) { + if (j > i) + break; + prev_j = j; + } + if (prev_j == i) + return false; + + if (prev_j == (unsigned long)-1) + bytes = (i + 1) * ctl->unit; + else + bytes = (i - prev_j) * ctl->unit; + + info->offset -= bytes; + info->bytes += bytes; + + if (update_stat) + bitmap_clear_bits(ctl, bitmap, info->offset, bytes); + else + __bitmap_clear_bits(ctl, bitmap, info->offset, bytes); + + if (!bitmap->bytes) + free_bitmap(ctl, bitmap); + + return true; +} + +/* + * We prefer always to allocate from extent entries, both for clustered and + * non-clustered allocation requests. So when attempting to add a new extent + * entry, try to see if there's adjacent free space in bitmap entries, and if + * there is, migrate that space from the bitmaps to the extent. 
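[A worked example of the migration, as an editor's illustration with ctl->unit == 4096: suppose the extent entry being added is {offset = 1 MiB, bytes = 64 KiB} and the bitmap entry covering end = 1 MiB + 64 KiB has the three bits starting at end set (12 KiB of free space). Then in steal_from_bitmap_to_end():

	i = offset_to_bit(bitmap->offset, ctl->unit, end);
	j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);	/* i + 3 */
	bytes = (j - i) * ctl->unit;					/* 12 KiB */

those bits are cleared and the extent entry grows to 76 KiB, so a subsequent 76 KiB allocation is served from a single cache entry.]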
+ * Like this we get better chances of satisfying space allocation requests + * because we attempt to satisfy them based on a single cache entry, and never + * on 2 or more entries - even if the entries represent a contiguous free space + * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry + * ends). + */ +static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, + bool update_stat) +{ + /* + * Only work with disconnected entries, as we can change their offset, + * and must be extent entries. + */ + ASSERT(!info->bitmap); + ASSERT(RB_EMPTY_NODE(&info->offset_index)); + + if (ctl->total_bitmaps > 0) { + bool stole_end; + bool stole_front = false; + + stole_end = steal_from_bitmap_to_end(ctl, info, update_stat); + if (ctl->total_bitmaps > 0) + stole_front = steal_from_bitmap_to_front(ctl, info, + update_stat); + + if (stole_end || stole_front) + try_merge_free_space(ctl, info, update_stat); + } +} + int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl, u64 offset, u64 bytes) { @@ -2010,6 +2131,7 @@ int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl, info->offset = offset; info->bytes = bytes; + RB_CLEAR_NODE(&info->offset_index); spin_lock(&ctl->tree_lock); @@ -2029,6 +2151,14 @@ int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl, goto out; } link: + /* + * Only steal free space from adjacent bitmaps if we're sure we're not + * going to add the new free space to existing bitmap entries - because + * that would mean unnecessary work that would be reverted. Therefore + * attempt to steal space from bitmaps if we're adding an extent entry. + */ + steal_from_bitmap(ctl, info, true); + ret = link_free_space(ctl, info); if (ret) kmem_cache_free(btrfs_free_space_cachep, info); @@ -2205,10 +2335,13 @@ __btrfs_return_cluster_to_free_space( entry = rb_entry(node, struct btrfs_free_space, offset_index); node = rb_next(&entry->offset_index); rb_erase(&entry->offset_index, &cluster->root); + RB_CLEAR_NODE(&entry->offset_index); bitmap = (entry->bitmap != NULL); - if (!bitmap) + if (!bitmap) { try_merge_free_space(ctl, entry, false); + steal_from_bitmap(ctl, entry, false); + } tree_insert_offset(&ctl->free_space_offset, entry->offset, &entry->offset_index, bitmap); } @@ -3033,10 +3166,10 @@ struct inode *lookup_free_ino_inode(struct btrfs_root *root, { struct inode *inode = NULL; - spin_lock(&root->cache_lock); - if (root->cache_inode) - inode = igrab(root->cache_inode); - spin_unlock(&root->cache_lock); + spin_lock(&root->ino_cache_lock); + if (root->ino_cache_inode) + inode = igrab(root->ino_cache_inode); + spin_unlock(&root->ino_cache_lock); if (inode) return inode; @@ -3044,10 +3177,10 @@ struct inode *lookup_free_ino_inode(struct btrfs_root *root, if (IS_ERR(inode)) return inode; - spin_lock(&root->cache_lock); + spin_lock(&root->ino_cache_lock); if (!btrfs_fs_closing(root->fs_info)) - root->cache_inode = igrab(inode); - spin_unlock(&root->cache_lock); + root->ino_cache_inode = igrab(inode); + spin_unlock(&root->ino_cache_lock); return inode; } @@ -3176,6 +3309,7 @@ again: map = NULL; add_new_bitmap(ctl, info, offset); bitmap_info = info; + info = NULL; } bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes); @@ -3186,6 +3320,8 @@ again: if (bytes) goto again; + if (info) + kmem_cache_free(btrfs_free_space_cachep, info); if (map) kfree(map); return 0; @@ -3260,6 +3396,7 @@ have_info: goto have_info; } + ret = 0; goto out; } diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c index 
85889aa82c62..64f15bb30a81 100644 --- a/fs/btrfs/hash.c +++ b/fs/btrfs/hash.c @@ -20,10 +20,8 @@ static struct crypto_shash *tfm; int __init btrfs_hash_init(void) { tfm = crypto_alloc_shash("crc32c", 0, 0); - if (IS_ERR(tfm)) - return PTR_ERR(tfm); - return 0; + return PTR_ERR_OR_ZERO(tfm); } void btrfs_hash_exit(void) diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 2be38df703c9..8ffa4783cbf4 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -135,7 +135,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, u32 item_size; key.objectid = inode_objectid; - btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY); + key.type = BTRFS_INODE_EXTREF_KEY; key.offset = btrfs_extref_hash(ref_objectid, name, name_len); path = btrfs_alloc_path(); @@ -209,7 +209,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, key.objectid = inode_objectid; key.offset = ref_objectid; - btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); + key.type = BTRFS_INODE_REF_KEY; path = btrfs_alloc_path(); if (!path) @@ -337,7 +337,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, key.objectid = inode_objectid; key.offset = ref_objectid; - btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); + key.type = BTRFS_INODE_REF_KEY; path = btrfs_alloc_path(); if (!path) @@ -400,7 +400,7 @@ int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, struct btrfs_key key; int ret; key.objectid = objectid; - btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; ret = btrfs_insert_empty_item(trans, root, path, &key, @@ -420,13 +420,13 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root struct btrfs_key found_key; ret = btrfs_search_slot(trans, root, location, path, ins_len, cow); - if (ret > 0 && btrfs_key_type(location) == BTRFS_ROOT_ITEM_KEY && + if (ret > 0 && location->type == BTRFS_ROOT_ITEM_KEY && location->offset == (u64)-1 && path->slots[0] != 0) { slot = path->slots[0] - 1; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, slot); if (found_key.objectid == location->objectid && - btrfs_key_type(&found_key) == btrfs_key_type(location)) { + found_key.type == location->type) { path->slots[0]--; return 0; } diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 888fbe19079f..83d646bd2e4b 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -87,7 +87,7 @@ again: */ btrfs_item_key_to_cpu(leaf, &key, 0); btrfs_release_path(path); - root->cache_progress = last; + root->ino_cache_progress = last; up_read(&fs_info->commit_root_sem); schedule_timeout(1); goto again; @@ -106,7 +106,7 @@ again: if (last != (u64)-1 && last + 1 != key.objectid) { __btrfs_add_free_space(ctl, last + 1, key.objectid - last - 1); - wake_up(&root->cache_wait); + wake_up(&root->ino_cache_wait); } last = key.objectid; @@ -119,14 +119,14 @@ next: root->highest_objectid - last - 1); } - spin_lock(&root->cache_lock); - root->cached = BTRFS_CACHE_FINISHED; - spin_unlock(&root->cache_lock); + spin_lock(&root->ino_cache_lock); + root->ino_cache_state = BTRFS_CACHE_FINISHED; + spin_unlock(&root->ino_cache_lock); - root->cache_progress = (u64)-1; + root->ino_cache_progress = (u64)-1; btrfs_unpin_free_ino(root); out: - wake_up(&root->cache_wait); + wake_up(&root->ino_cache_wait); up_read(&fs_info->commit_root_sem); btrfs_free_path(path); @@ -144,20 +144,20 @@ static void start_caching(struct btrfs_root *root) if (!btrfs_test_opt(root, INODE_MAP_CACHE)) return; - spin_lock(&root->cache_lock); - if (root->cached != 
BTRFS_CACHE_NO) { - spin_unlock(&root->cache_lock); + spin_lock(&root->ino_cache_lock); + if (root->ino_cache_state != BTRFS_CACHE_NO) { + spin_unlock(&root->ino_cache_lock); return; } - root->cached = BTRFS_CACHE_STARTED; - spin_unlock(&root->cache_lock); + root->ino_cache_state = BTRFS_CACHE_STARTED; + spin_unlock(&root->ino_cache_lock); ret = load_free_ino_cache(root->fs_info, root); if (ret == 1) { - spin_lock(&root->cache_lock); - root->cached = BTRFS_CACHE_FINISHED; - spin_unlock(&root->cache_lock); + spin_lock(&root->ino_cache_lock); + root->ino_cache_state = BTRFS_CACHE_FINISHED; + spin_unlock(&root->ino_cache_lock); return; } @@ -196,11 +196,11 @@ again: start_caching(root); - wait_event(root->cache_wait, - root->cached == BTRFS_CACHE_FINISHED || + wait_event(root->ino_cache_wait, + root->ino_cache_state == BTRFS_CACHE_FINISHED || root->free_ino_ctl->free_space > 0); - if (root->cached == BTRFS_CACHE_FINISHED && + if (root->ino_cache_state == BTRFS_CACHE_FINISHED && root->free_ino_ctl->free_space == 0) return -ENOSPC; else @@ -214,17 +214,17 @@ void btrfs_return_ino(struct btrfs_root *root, u64 objectid) if (!btrfs_test_opt(root, INODE_MAP_CACHE)) return; again: - if (root->cached == BTRFS_CACHE_FINISHED) { + if (root->ino_cache_state == BTRFS_CACHE_FINISHED) { __btrfs_add_free_space(pinned, objectid, 1); } else { down_write(&root->fs_info->commit_root_sem); - spin_lock(&root->cache_lock); - if (root->cached == BTRFS_CACHE_FINISHED) { - spin_unlock(&root->cache_lock); + spin_lock(&root->ino_cache_lock); + if (root->ino_cache_state == BTRFS_CACHE_FINISHED) { + spin_unlock(&root->ino_cache_lock); up_write(&root->fs_info->commit_root_sem); goto again; } - spin_unlock(&root->cache_lock); + spin_unlock(&root->ino_cache_lock); start_caching(root); @@ -235,10 +235,10 @@ again: } /* - * When a transaction is committed, we'll move those inode numbers which - * are smaller than root->cache_progress from pinned tree to free_ino tree, - * and others will just be dropped, because the commit root we were - * searching has changed. + * When a transaction is committed, we'll move those inode numbers which are + * smaller than root->ino_cache_progress from pinned tree to free_ino tree, and + * others will just be dropped, because the commit root we were searching has + * changed. 
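[Editor's example of the partial-overlap case handled below: with root->ino_cache_progress == 1000 and a pinned entry {offset = 990, bytes = 20} covering inos 990..1009, the scan only vouched for inos up to 1000, so count = 1000 - 990 + 1 = 11 numbers move to the free_ino tree and the remaining 9 are dropped.]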
* * Must be called with root->fs_info->commit_root_sem held */ @@ -261,10 +261,10 @@ void btrfs_unpin_free_ino(struct btrfs_root *root) info = rb_entry(n, struct btrfs_free_space, offset_index); BUG_ON(info->bitmap); /* Logic error */ - if (info->offset > root->cache_progress) + if (info->offset > root->ino_cache_progress) goto free; - else if (info->offset + info->bytes > root->cache_progress) - count = root->cache_progress - info->offset + 1; + else if (info->offset + info->bytes > root->ino_cache_progress) + count = root->ino_cache_progress - info->offset + 1; else count = info->bytes; @@ -462,13 +462,13 @@ again: } } - spin_lock(&root->cache_lock); - if (root->cached != BTRFS_CACHE_FINISHED) { + spin_lock(&root->ino_cache_lock); + if (root->ino_cache_state != BTRFS_CACHE_FINISHED) { ret = -1; - spin_unlock(&root->cache_lock); + spin_unlock(&root->ino_cache_lock); goto out_put; } - spin_unlock(&root->cache_lock); + spin_unlock(&root->ino_cache_lock); spin_lock(&ctl->tree_lock); prealloc = sizeof(struct btrfs_free_space) * ctl->free_extents; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3668048e16f8..fc9c0439caa3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -153,7 +153,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, key.objectid = btrfs_ino(inode); key.offset = start; - btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); + key.type = BTRFS_EXTENT_DATA_KEY; datasize = btrfs_file_extent_calc_inline_size(cur_size); path->leave_spinning = 1; @@ -249,8 +249,8 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, data_len = compressed_size; if (start > 0 || - actual_end >= PAGE_CACHE_SIZE || - data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) || + actual_end > PAGE_CACHE_SIZE || + data_len > BTRFS_MAX_INLINE_DATA_SIZE(root) || (!compressed_size && (actual_end & (root->sectorsize - 1)) == 0) || end + 1 < isize || @@ -348,6 +348,23 @@ static noinline int add_async_extent(struct async_cow *cow, return 0; } +static inline int inode_need_compress(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + + /* force compress */ + if (btrfs_test_opt(root, FORCE_COMPRESS)) + return 1; + /* bad compression ratios */ + if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) + return 0; + if (btrfs_test_opt(root, COMPRESS) || + BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS || + BTRFS_I(inode)->force_compress) + return 1; + return 0; +} + /* * we create compressed extents in two phases. The first * phase compresses a range of pages that have already been @@ -444,10 +461,7 @@ again: * inode has not been flagged as nocompress. This flag can * change at any time if we discover bad compression ratios. */ - if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) && - (btrfs_test_opt(root, COMPRESS) || - (BTRFS_I(inode)->force_compress) || - (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) { + if (inode_need_compress(inode)) { WARN_ON(pages); pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); if (!pages) { @@ -709,6 +723,18 @@ retry: unlock_extent(io_tree, async_extent->start, async_extent->start + async_extent->ram_size - 1); + + /* + * we need to redirty the pages if we decide to + * fallback to uncompressed IO, otherwise we + * will not submit these pages down to lower + * layers. 
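[Editor's note: the counterpart call, extent_range_clear_dirty_for_io(), was issued by compress_file_range() on these pages before attempting compression; without the redirty below, a fallback pass through the uncompressed path would find clean pages and writepages would skip them entirely.]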
+ */ + extent_range_redirty_for_io(inode, + async_extent->start, + async_extent->start + + async_extent->ram_size - 1); + goto retry; } goto out_free; @@ -766,8 +792,12 @@ retry: ins.offset, BTRFS_ORDERED_COMPRESSED, async_extent->compress_type); - if (ret) + if (ret) { + btrfs_drop_extent_cache(inode, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, 0); goto out_free_reserve; + } /* * clear dirty, set writeback and unlock the pages. @@ -959,14 +989,14 @@ static noinline int cow_file_range(struct inode *inode, ret = btrfs_add_ordered_extent(inode, start, ins.objectid, ram_size, cur_alloc_size, 0); if (ret) - goto out_reserve; + goto out_drop_extent_cache; if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) { ret = btrfs_reloc_clone_csums(inode, start, cur_alloc_size); if (ret) - goto out_reserve; + goto out_drop_extent_cache; } if (disk_num_bytes < cur_alloc_size) @@ -994,6 +1024,8 @@ static noinline int cow_file_range(struct inode *inode, out: return ret; +out_drop_extent_cache: + btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0); out_reserve: btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); out_unlock: @@ -1076,7 +1108,8 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, async_cow->locked_page = locked_page; async_cow->start = start; - if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) + if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS && + !btrfs_test_opt(root, FORCE_COMPRESS)) cur_end = end; else cur_end = min(end, start + 512 * 1024 - 1); @@ -1084,8 +1117,10 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, async_cow->end = cur_end; INIT_LIST_HEAD(&async_cow->extents); - btrfs_init_work(&async_cow->work, async_cow_start, - async_cow_submit, async_cow_free); + btrfs_init_work(&async_cow->work, + btrfs_delalloc_helper, + async_cow_start, async_cow_submit, + async_cow_free); nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >> PAGE_CACHE_SHIFT; @@ -1425,6 +1460,26 @@ error: return ret; } +static inline int need_force_cow(struct inode *inode, u64 start, u64 end) +{ + + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && + !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)) + return 0; + + /* + * @defrag_bytes is a hint value, no spinlock held here, + * if it is not zero, it means the file is defragging. + * Force cow if given extent needs to be defragged.
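[Editor's note on the check below: the 0 passed as test_range_bit()'s filled argument makes it return true if any part of [start, end] carries EXTENT_DEFRAG, not only when the whole range does, so a partially defragged range still forces CoW.]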
+ */ + if (BTRFS_I(inode)->defrag_bytes && + test_range_bit(&BTRFS_I(inode)->io_tree, start, end, + EXTENT_DEFRAG, 0, NULL)) + return 1; + + return 0; +} + /* * extent_io.c call back to do delayed allocation processing */ @@ -1433,17 +1488,15 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, unsigned long *nr_written) { int ret; - struct btrfs_root *root = BTRFS_I(inode)->root; + int force_cow = need_force_cow(inode, start, end); - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) { + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) { ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 1, nr_written); - } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) { + } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) { ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 0, nr_written); - } else if (!btrfs_test_opt(root, COMPRESS) && - !(BTRFS_I(inode)->force_compress) && - !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) { + } else if (!inode_need_compress(inode)) { ret = cow_file_range(inode, locked_page, start, end, page_started, nr_written, 1); } else { @@ -1535,6 +1588,8 @@ static void btrfs_set_bit_hook(struct inode *inode, struct extent_state *state, unsigned long *bits) { + if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC)) + WARN_ON(1); /* * set_bit and clear bit hooks normally require _irqsave/restore * but in this case, we are only testing for the DELALLOC @@ -1557,6 +1612,8 @@ static void btrfs_set_bit_hook(struct inode *inode, root->fs_info->delalloc_batch); spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->delalloc_bytes += len; + if (*bits & EXTENT_DEFRAG) + BTRFS_I(inode)->defrag_bytes += len; if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, &BTRFS_I(inode)->runtime_flags)) btrfs_add_delalloc_inodes(root, inode); @@ -1571,6 +1628,13 @@ static void btrfs_clear_bit_hook(struct inode *inode, struct extent_state *state, unsigned long *bits) { + u64 len = state->end + 1 - state->start; + + spin_lock(&BTRFS_I(inode)->lock); + if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) + BTRFS_I(inode)->defrag_bytes -= len; + spin_unlock(&BTRFS_I(inode)->lock); + /* * set_bit and clear bit hooks normally require _irqsave/restore * but in this case, we are only testing for the DELALLOC @@ -1578,7 +1642,6 @@ static void btrfs_clear_bit_hook(struct inode *inode, */ if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; - u64 len = state->end + 1 - state->start; bool do_list = !btrfs_is_free_space_inode(inode); if (*bits & EXTENT_FIRST_DELALLOC) { @@ -1869,7 +1932,8 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) SetPageChecked(page); page_cache_get(page); - btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL); + btrfs_init_work(&fixup->work, btrfs_fixup_helper, + btrfs_writepage_fixup_worker, NULL, NULL); fixup->page = page; btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work); return -EBUSY; @@ -2639,6 +2703,10 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } + btrfs_free_io_failure_record(inode, ordered_extent->file_offset, + ordered_extent->file_offset + + ordered_extent->len - 1); + if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { truncated = true; logical_len = ordered_extent->truncated_len; @@ -2810,7 +2878,8 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 
end, struct inode *inode = page->mapping->host; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ordered_extent *ordered_extent = NULL; - struct btrfs_workqueue *workers; + struct btrfs_workqueue *wq; + btrfs_work_func_t func; trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); @@ -2819,15 +2888,53 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, end - start + 1, uptodate)) return 0; - btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL); + if (btrfs_is_free_space_inode(inode)) { + wq = root->fs_info->endio_freespace_worker; + func = btrfs_freespace_write_helper; + } else { + wq = root->fs_info->endio_write_workers; + func = btrfs_endio_write_helper; + } + + btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL, + NULL); + btrfs_queue_work(wq, &ordered_extent->work); - if (btrfs_is_free_space_inode(inode)) - workers = root->fs_info->endio_freespace_worker; - else - workers = root->fs_info->endio_write_workers; - btrfs_queue_work(workers, &ordered_extent->work); + return 0; +} + +static int __readpage_endio_check(struct inode *inode, + struct btrfs_io_bio *io_bio, + int icsum, struct page *page, + int pgoff, u64 start, size_t len) +{ + char *kaddr; + u32 csum_expected; + u32 csum = ~(u32)0; + static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + csum_expected = *(((u32 *)io_bio->csum) + icsum); + + kaddr = kmap_atomic(page); + csum = btrfs_csum_data(kaddr + pgoff, csum, len); + btrfs_csum_final(csum, (char *)&csum); + if (csum != csum_expected) + goto zeroit; + + kunmap_atomic(kaddr); return 0; +zeroit: + if (__ratelimit(&_rs)) + btrfs_info(BTRFS_I(inode)->root->fs_info, + "csum failed ino %llu off %llu csum %u expected csum %u", + btrfs_ino(inode), start, csum, csum_expected); + memset(kaddr + pgoff, 1, len); + flush_dcache_page(page); + kunmap_atomic(kaddr); + if (csum_expected == 0) + return 0; + return -EIO; } /* @@ -2842,20 +2949,15 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio, size_t offset = start - page_offset(page); struct inode *inode = page->mapping->host; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - char *kaddr; struct btrfs_root *root = BTRFS_I(inode)->root; - u32 csum_expected; - u32 csum = ~(u32)0; - static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); if (PageChecked(page)) { ClearPageChecked(page); - goto good; + return 0; } if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) - goto good; + return 0; if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { @@ -2865,28 +2967,8 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio, } phy_offset >>= inode->i_sb->s_blocksize_bits; - csum_expected = *(((u32 *)io_bio->csum) + phy_offset); - - kaddr = kmap_atomic(page); - csum = btrfs_csum_data(kaddr + offset, csum, end - start + 1); - btrfs_csum_final(csum, (char *)&csum); - if (csum != csum_expected) - goto zeroit; - - kunmap_atomic(kaddr); -good: - return 0; - -zeroit: - if (__ratelimit(&_rs)) - btrfs_info(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u", - btrfs_ino(page->mapping->host), start, csum, csum_expected); - memset(kaddr + offset, 1, end - start + 1); - flush_dcache_page(page); - kunmap_atomic(kaddr); - if (csum_expected == 0) - return 0; - return -EIO; + return __readpage_endio_check(inode, io_bio, phy_offset, page, offset, + start, (size_t)(end - start + 
1)); } struct delayed_iput { @@ -3133,7 +3215,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) path->reada = -1; key.objectid = BTRFS_ORPHAN_OBJECTID; - btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); + key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = (u64)-1; while (1) { @@ -3160,7 +3242,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) /* make sure the item matches what we want */ if (found_key.objectid != BTRFS_ORPHAN_OBJECTID) break; - if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY) + if (found_key.type != BTRFS_ORPHAN_ITEM_KEY) break; /* release the path since we're done with it */ @@ -3636,7 +3718,8 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, * without delay */ if (!btrfs_is_free_space_inode(inode) - && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { + && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID + && !root->fs_info->log_root_recovering) { btrfs_update_root_times(trans, root); ret = btrfs_delayed_update_inode(trans, root, inode); @@ -4059,7 +4142,7 @@ search_again: fi = NULL; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - found_type = btrfs_key_type(&found_key); + found_type = found_key.type; if (found_key.objectid != ino) break; @@ -4222,7 +4305,8 @@ out: btrfs_abort_transaction(trans, root, ret); } error: - if (last_size != (u64)-1) + if (last_size != (u64)-1 && + root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) btrfs_ordered_update_i_size(inode, last_size, NULL); btrfs_free_path(path); return err; @@ -4662,6 +4746,11 @@ static void evict_inode_truncate_pages(struct inode *inode) clear_bit(EXTENT_FLAG_LOGGING, &em->flags); remove_extent_mapping(map_tree, em); free_extent_map(em); + if (need_resched()) { + write_unlock(&map_tree->lock); + cond_resched(); + write_lock(&map_tree->lock); + } } write_unlock(&map_tree->lock); @@ -4684,6 +4773,7 @@ static void evict_inode_truncate_pages(struct inode *inode) &cached_state, GFP_NOFS); free_extent_state(state); + cond_resched(); spin_lock(&io_tree->lock); } spin_unlock(&io_tree->lock); @@ -4714,6 +4804,8 @@ void btrfs_evict_inode(struct inode *inode) /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */ btrfs_wait_ordered_range(inode, 0, (u64)-1); + btrfs_free_io_failure_record(inode, 0, (u64)-1); + if (root->fs_info->log_root_recovering) { BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, &BTRFS_I(inode)->runtime_flags)); @@ -5169,6 +5261,42 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) iput(inode); inode = ERR_PTR(ret); } + /* + * If orphan cleanup did remove any orphans, it means the tree + * was modified and therefore the commit root is not the same as + * the current root anymore. This is a problem, because send + * uses the commit root and therefore can see inode items that + * don't exist in the current root anymore, and for example make + * calls to btrfs_iget, which will do tree lookups based on the + * current root and not on the commit root. Those lookups will + * fail, returning a -ESTALE error, and making send fail with + * that error. So make sure a send does not see any orphans we + * have just removed, and that it will see the same inodes + * regardless of whether a transaction commit happened before + * it started (meaning that the commit root will be the same as + * the current root) or not. 
+ */ + if (sub_root->node != sub_root->commit_root) { + u64 sub_flags = btrfs_root_flags(&sub_root->root_item); + + if (sub_flags & BTRFS_ROOT_SUBVOL_RDONLY) { + struct extent_buffer *eb; + + /* + * Assert we can't have races between dentry + * lookup called through the snapshot creation + * ioctl and the VFS. + */ + ASSERT(mutex_is_locked(&dir->i_mutex)); + + down_write(&root->fs_info->commit_root_sem); + eb = sub_root->commit_root; + sub_root->commit_root = + btrfs_root_node(sub_root); + up_write(&root->fs_info->commit_root_sem); + free_extent_buffer(eb); + } + } } return inode; @@ -5262,7 +5390,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) btrfs_get_delayed_items(inode, &ins_list, &del_list); } - btrfs_set_key_type(&key, key_type); + key.type = key_type; key.offset = ctx->pos; key.objectid = btrfs_ino(inode); @@ -5287,7 +5415,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) if (found_key.objectid != key.objectid) break; - if (btrfs_key_type(&found_key) != key_type) + if (found_key.type != key_type) break; if (found_key.offset < ctx->pos) goto next; @@ -5499,7 +5627,7 @@ static int btrfs_set_inode_index_count(struct inode *inode) int ret; key.objectid = btrfs_ino(inode); - btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); + key.type = BTRFS_DIR_INDEX_KEY; key.offset = (u64)-1; path = btrfs_alloc_path(); @@ -5531,7 +5659,7 @@ static int btrfs_set_inode_index_count(struct inode *inode) btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); if (found_key.objectid != btrfs_ino(inode) || - btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) { + found_key.type != BTRFS_DIR_INDEX_KEY) { BTRFS_I(inode)->index_cnt = 2; goto out; } @@ -5565,6 +5693,17 @@ int btrfs_set_inode_index(struct inode *dir, u64 *index) return ret; } +static int btrfs_insert_inode_locked(struct inode *inode) +{ + struct btrfs_iget_args args; + args.location = &BTRFS_I(inode)->location; + args.root = BTRFS_I(inode)->root; + + return insert_inode_locked4(inode, + btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root), + btrfs_find_actor, &args); +} + static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *dir, @@ -5594,6 +5733,13 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, } /* + * O_TMPFILE, set link count to 0, so that after this point, + * we fill in an inode item with the correct link count. + */ + if (!name) + set_nlink(inode, 0); + + /* * we have to initialize this early, so we can reclaim the inode * number if we fail afterwards in this function. */ @@ -5631,7 +5777,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); key[0].objectid = objectid; - btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); + key[0].type = BTRFS_INODE_ITEM_KEY; key[0].offset = 0; sizes[0] = sizeof(struct btrfs_inode_item); @@ -5644,16 +5790,25 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, * add more hard links than can fit in the ref item. 
*/ key[1].objectid = objectid; - btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY); + key[1].type = BTRFS_INODE_REF_KEY; key[1].offset = ref_objectid; sizes[1] = name_len + sizeof(*ref); } + location = &BTRFS_I(inode)->location; + location->objectid = objectid; + location->offset = 0; + location->type = BTRFS_INODE_ITEM_KEY; + + ret = btrfs_insert_inode_locked(inode); + if (ret < 0) + goto fail; + path->leave_spinning = 1; ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems); if (ret != 0) - goto fail; + goto fail_unlock; inode_init_owner(inode, dir, mode); inode_set_bytes(inode, 0); @@ -5676,11 +5831,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); - location = &BTRFS_I(inode)->location; - location->objectid = objectid; - location->offset = 0; - btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); - btrfs_inherit_iflags(inode, dir); if (S_ISREG(mode)) { @@ -5691,7 +5841,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, BTRFS_INODE_NODATASUM; } - btrfs_insert_inode_hash(inode); inode_tree_add(inode); trace_btrfs_inode_new(inode); @@ -5706,6 +5855,9 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, btrfs_ino(inode), root->root_key.objectid, ret); return inode; + +fail_unlock: + unlock_new_inode(inode); fail: if (dir && name) BTRFS_I(dir)->index_cnt--; @@ -5739,7 +5891,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key)); } else { key.objectid = ino; - btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; } @@ -5840,28 +5992,28 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, goto out_unlock; } - err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); - if (err) { - drop_inode = 1; - goto out_unlock; - } - /* * If the active LSM wants to access the inode during * d_instantiate it needs these. Smack checks to see * if the filesystem supports xattrs by looking at the * ops vector. */ - inode->i_op = &btrfs_special_inode_operations; - err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); + init_special_inode(inode, inode->i_mode, rdev); + + err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) - drop_inode = 1; - else { - init_special_inode(inode, inode->i_mode, rdev); + goto out_unlock_inode; + + err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); + if (err) { + goto out_unlock_inode; + } else { btrfs_update_inode(trans, root, inode); + unlock_new_inode(inode); d_instantiate(dentry, inode); } + out_unlock: btrfs_end_transaction(trans, root); btrfs_balance_delayed_items(root); @@ -5871,6 +6023,12 @@ out_unlock: iput(inode); } return err; + +out_unlock_inode: + drop_inode = 1; + unlock_new_inode(inode); + goto out_unlock; + } static int btrfs_create(struct inode *dir, struct dentry *dentry, @@ -5905,15 +6063,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, goto out_unlock; } drop_inode_on_err = 1; - - err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); - if (err) - goto out_unlock; - - err = btrfs_update_inode(trans, root, inode); - if (err) - goto out_unlock; - /* * If the active LSM wants to access the inode during * d_instantiate it needs these. 
Smack checks to see @@ -5922,14 +6071,23 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, */ inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; + inode->i_mapping->a_ops = &btrfs_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + + err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); + if (err) + goto out_unlock_inode; + + err = btrfs_update_inode(trans, root, inode); + if (err) + goto out_unlock_inode; err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) - goto out_unlock; + goto out_unlock_inode; - inode->i_mapping->a_ops = &btrfs_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + unlock_new_inode(inode); d_instantiate(dentry, inode); out_unlock: @@ -5941,6 +6099,11 @@ out_unlock: btrfs_balance_delayed_items(root); btrfs_btree_balance_dirty(root); return err; + +out_unlock_inode: + unlock_new_inode(inode); + goto out_unlock; + } static int btrfs_link(struct dentry *old_dentry, struct inode *dir, @@ -6048,25 +6211,30 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) } drop_on_err = 1; + /* these must be set before we unlock the inode */ + inode->i_op = &btrfs_dir_inode_operations; + inode->i_fop = &btrfs_dir_file_operations; err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) - goto out_fail; - - inode->i_op = &btrfs_dir_inode_operations; - inode->i_fop = &btrfs_dir_file_operations; + goto out_fail_inode; btrfs_i_size_write(inode, 0); err = btrfs_update_inode(trans, root, inode); if (err) - goto out_fail; + goto out_fail_inode; err = btrfs_add_link(trans, dir, inode, dentry->d_name.name, dentry->d_name.len, 0, index); if (err) - goto out_fail; + goto out_fail_inode; d_instantiate(dentry, inode); + /* + * mkdir is special. We're unlocking after we call d_instantiate + * to avoid a race with nfsd calling d_instantiate. + */ + unlock_new_inode(inode); drop_on_err = 0; out_fail: @@ -6076,23 +6244,66 @@ out_fail: btrfs_balance_delayed_items(root); btrfs_btree_balance_dirty(root); return err; + +out_fail_inode: + unlock_new_inode(inode); + goto out_fail; +} + +/* Find next extent map of a given extent map, caller needs to ensure locks */ +static struct extent_map *next_extent_map(struct extent_map *em) +{ + struct rb_node *next; + + next = rb_next(&em->rb_node); + if (!next) + return NULL; + return container_of(next, struct extent_map, rb_node); +} + +static struct extent_map *prev_extent_map(struct extent_map *em) +{ + struct rb_node *prev; + + prev = rb_prev(&em->rb_node); + if (!prev) + return NULL; + return container_of(prev, struct extent_map, rb_node); } /* helper for btfs_get_extent. Given an existing extent in the tree, + * the existing extent is the nearest extent to map_start, * and an extent that you want to insert, deal with overlap and insert - * the new extent into the tree. + * the best fitted new extent into the tree. 
*/ static int merge_extent_mapping(struct extent_map_tree *em_tree, struct extent_map *existing, struct extent_map *em, - u64 map_start, u64 map_len) + u64 map_start) { + struct extent_map *prev; + struct extent_map *next; + u64 start; + u64 end; u64 start_diff; BUG_ON(map_start < em->start || map_start >= extent_map_end(em)); - start_diff = map_start - em->start; - em->start = map_start; - em->len = map_len; + + if (existing->start > map_start) { + next = existing; + prev = prev_extent_map(next); + } else { + prev = existing; + next = next_extent_map(prev); + } + + start = prev ? extent_map_end(prev) : em->start; + start = max_t(u64, start, em->start); + end = next ? next->start : extent_map_end(em); + end = min_t(u64, end, extent_map_end(em)); + start_diff = start - em->start; + em->start = start; + em->len = end - start; if (em->block_start < EXTENT_MAP_LAST_BYTE && !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { em->block_start += start_diff; @@ -6220,7 +6431,7 @@ again: struct btrfs_file_extent_item); /* are we inside the extent that was found? */ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - found_type = btrfs_key_type(&found_key); + found_type = found_key.type; if (found_key.objectid != objectid || found_type != BTRFS_EXTENT_DATA_KEY) { /* @@ -6263,6 +6474,8 @@ next: goto not_found; if (start + len <= found_key.offset) goto not_found; + if (start > found_key.offset) + goto next; em->start = start; em->orig_start = start; em->len = found_key.offset - start; @@ -6367,26 +6580,21 @@ insert: ret = 0; - existing = lookup_extent_mapping(em_tree, start, len); - if (existing && (existing->start > start || - existing->start + existing->len <= start)) { + existing = search_extent_mapping(em_tree, start, len); + /* + * existing will always be non-NULL, since there must be + * extent causing the -EEXIST. 
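The interval arithmetic inside the reworked merge_extent_mapping() — clamp the candidate mapping to the hole between its neighbours — can be stated in a few lines. An illustrative helper (hypothetical, u64 byte ranges, not btrfs code):

	#include <linux/kernel.h>	/* max_t, min_t */

	/* Clamp [em_start, em_end) to the gap between the previous extent's
	 * end and the next extent's start; returns the clamped length, or 0
	 * if the gap is already covered. */
	static u64 clamp_to_hole(u64 em_start, u64 em_end,
				 u64 prev_end, u64 next_start)
	{
		u64 start = max_t(u64, em_start, prev_end);
		u64 end = min_t(u64, em_end, next_start);

		return end > start ? end - start : 0;
	}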
+ */ + if (start >= extent_map_end(existing) || + start <= existing->start) { + /* + * The existing extent map is the one nearest to + * the [start, start + len) range which overlaps + */ + err = merge_extent_mapping(em_tree, existing, + em, start); free_extent_map(existing); - existing = NULL; - } - if (!existing) { - existing = lookup_extent_mapping(em_tree, em->start, - em->len); - if (existing) { - err = merge_extent_mapping(em_tree, existing, - em, start, - root->sectorsize); - free_extent_map(existing); - if (err) { - free_extent_map(em); - em = NULL; - } - } else { - err = -EIO; + if (err) { free_extent_map(em); em = NULL; } @@ -6998,8 +7206,10 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, block_start, len, orig_block_len, ram_bytes, type); - if (IS_ERR(em)) + if (IS_ERR(em)) { + ret = PTR_ERR(em); goto unlock_err; + } } ret = btrfs_add_ordered_extent_dio(inode, start, @@ -7074,45 +7284,277 @@ unlock_err: return ret; } -static void btrfs_endio_direct_read(struct bio *bio, int err) +static inline int submit_dio_repair_bio(struct inode *inode, struct bio *bio, + int rw, int mirror_num) { - struct btrfs_dio_private *dip = bio->bi_private; - struct bio_vec *bvec; - struct inode *inode = dip->inode; struct btrfs_root *root = BTRFS_I(inode)->root; - struct bio *dio_bio; - u32 *csums = (u32 *)dip->csum; + int ret; + + BUG_ON(rw & REQ_WRITE); + + bio_get(bio); + + ret = btrfs_bio_wq_end_io(root->fs_info, bio, + BTRFS_WQ_ENDIO_DIO_REPAIR); + if (ret) + goto err; + + ret = btrfs_map_bio(root, rw, bio, mirror_num, 0); +err: + bio_put(bio); + return ret; +} + +static int btrfs_check_dio_repairable(struct inode *inode, + struct bio *failed_bio, + struct io_failure_record *failrec, + int failed_mirror) +{ + int num_copies; + + num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info, + failrec->logical, failrec->len); + if (num_copies == 1) { + /* + * we only have a single copy of the data, so don't bother with + * all the retry and error correction code that follows. no + * matter what the error is, it is very likely to persist. 
+ */ + pr_debug("Check DIO Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n", + num_copies, failrec->this_mirror, failed_mirror); + return 0; + } + + failrec->failed_mirror = failed_mirror; + failrec->this_mirror++; + if (failrec->this_mirror == failed_mirror) + failrec->this_mirror++; + + if (failrec->this_mirror > num_copies) { + pr_debug("Check DIO Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n", + num_copies, failrec->this_mirror, failed_mirror); + return 0; + } + + return 1; +} + +static int dio_read_error(struct inode *inode, struct bio *failed_bio, + struct page *page, u64 start, u64 end, + int failed_mirror, bio_end_io_t *repair_endio, + void *repair_arg) +{ + struct io_failure_record *failrec; + struct bio *bio; + int isector; + int read_mode; + int ret; + + BUG_ON(failed_bio->bi_rw & REQ_WRITE); + + ret = btrfs_get_io_failure_record(inode, start, end, &failrec); + if (ret) + return ret; + + ret = btrfs_check_dio_repairable(inode, failed_bio, failrec, + failed_mirror); + if (!ret) { + free_io_failure(inode, failrec); + return -EIO; + } + + if (failed_bio->bi_vcnt > 1) + read_mode = READ_SYNC | REQ_FAILFAST_DEV; + else + read_mode = READ_SYNC; + + isector = start - btrfs_io_bio(failed_bio)->logical; + isector >>= inode->i_sb->s_blocksize_bits; + bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page, + 0, isector, repair_endio, repair_arg); + if (!bio) { + free_io_failure(inode, failrec); + return -EIO; + } + + btrfs_debug(BTRFS_I(inode)->root->fs_info, + "Repair DIO Read Error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d\n", + read_mode, failrec->this_mirror, failrec->in_validation); + + ret = submit_dio_repair_bio(inode, bio, read_mode, + failrec->this_mirror); + if (ret) { + free_io_failure(inode, failrec); + bio_put(bio); + } + + return ret; +} + +struct btrfs_retry_complete { + struct completion done; + struct inode *inode; + u64 start; + int uptodate; +}; + +static void btrfs_retry_endio_nocsum(struct bio *bio, int err) +{ + struct btrfs_retry_complete *done = bio->bi_private; + struct bio_vec *bvec; + int i; + + if (err) + goto end; + + done->uptodate = 1; + bio_for_each_segment_all(bvec, bio, i) + clean_io_failure(done->inode, done->start, bvec->bv_page, 0); +end: + complete(&done->done); + bio_put(bio); +} + +static int __btrfs_correct_data_nocsum(struct inode *inode, + struct btrfs_io_bio *io_bio) +{ + struct bio_vec *bvec; + struct btrfs_retry_complete done; u64 start; int i; + int ret; + + start = io_bio->logical; + done.inode = inode; + + bio_for_each_segment_all(bvec, &io_bio->bio, i) { +try_again: + done.uptodate = 0; + done.start = start; + init_completion(&done.done); + + ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start, + start + bvec->bv_len - 1, + io_bio->mirror_num, + btrfs_retry_endio_nocsum, &done); + if (ret) + return ret; + + wait_for_completion(&done.done); + + if (!done.uptodate) { + /* We might have another mirror, so try again */ + goto try_again; + } + + start += bvec->bv_len; + } + + return 0; +} + +static void btrfs_retry_endio(struct bio *bio, int err) +{ + struct btrfs_retry_complete *done = bio->bi_private; + struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); + struct bio_vec *bvec; + int uptodate; + int ret; + int i; - start = dip->logical_offset; + if (err) + goto end; + + uptodate = 1; bio_for_each_segment_all(bvec, bio, i) { - if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { - struct page *page = bvec->bv_page; - char *kaddr; - u32 csum = ~(u32)0; 
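The retry policy enforced by btrfs_check_dio_repairable() reduces to a small stepping rule: advance to the next mirror, skip the one that just failed, and give up once every copy has been tried. The same rule in isolation (standalone sketch, not kernel code):

	/* Returns the next mirror to read from, or 0 when all copies
	 * have been exhausted. */
	static int next_mirror(int this_mirror, int failed_mirror, int num_copies)
	{
		this_mirror++;
		if (this_mirror == failed_mirror)
			this_mirror++;
		return this_mirror > num_copies ? 0 : this_mirror;
	}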
- unsigned long flags; - - local_irq_save(flags); - kaddr = kmap_atomic(page); - csum = btrfs_csum_data(kaddr + bvec->bv_offset, - csum, bvec->bv_len); - btrfs_csum_final(csum, (char *)&csum); - kunmap_atomic(kaddr); - local_irq_restore(flags); - - flush_dcache_page(bvec->bv_page); - if (csum != csums[i]) { - btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u", - btrfs_ino(inode), start, csum, - csums[i]); - err = -EIO; - } + ret = __readpage_endio_check(done->inode, io_bio, i, + bvec->bv_page, 0, + done->start, bvec->bv_len); + if (!ret) + clean_io_failure(done->inode, done->start, + bvec->bv_page, 0); + else + uptodate = 0; + } + + done->uptodate = uptodate; +end: + complete(&done->done); + bio_put(bio); +} + +static int __btrfs_subio_endio_read(struct inode *inode, + struct btrfs_io_bio *io_bio, int err) +{ + struct bio_vec *bvec; + struct btrfs_retry_complete done; + u64 start; + u64 offset = 0; + int i; + int ret; + + err = 0; + start = io_bio->logical; + done.inode = inode; + + bio_for_each_segment_all(bvec, &io_bio->bio, i) { + ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page, + 0, start, bvec->bv_len); + if (likely(!ret)) + goto next; +try_again: + done.uptodate = 0; + done.start = start; + init_completion(&done.done); + + ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start, + start + bvec->bv_len - 1, + io_bio->mirror_num, + btrfs_retry_endio, &done); + if (ret) { + err = ret; + goto next; } + wait_for_completion(&done.done); + + if (!done.uptodate) { + /* We might have another mirror, so try again */ + goto try_again; + } +next: + offset += bvec->bv_len; start += bvec->bv_len; } + return err; +} + +static int btrfs_subio_endio_read(struct inode *inode, + struct btrfs_io_bio *io_bio, int err) +{ + bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; + + if (skip_csum) { + if (unlikely(err)) + return __btrfs_correct_data_nocsum(inode, io_bio); + else + return 0; + } else { + return __btrfs_subio_endio_read(inode, io_bio, err); + } +} + +static void btrfs_endio_direct_read(struct bio *bio, int err) +{ + struct btrfs_dio_private *dip = bio->bi_private; + struct inode *inode = dip->inode; + struct bio *dio_bio; + struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); + + if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED) + err = btrfs_subio_endio_read(inode, io_bio, err); + unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, dip->logical_offset + dip->bytes - 1); dio_bio = dip->dio_bio; @@ -7123,6 +7565,9 @@ static void btrfs_endio_direct_read(struct bio *bio, int err) if (err) clear_bit(BIO_UPTODATE, &dio_bio->bi_flags); dio_end_io(dio_bio, err); + + if (io_bio->end_io) + io_bio->end_io(io_bio, err); bio_put(bio); } @@ -7146,7 +7591,8 @@ again: if (!ret) goto out_test; - btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL); + btrfs_init_work(&ordered->work, btrfs_endio_write_helper, + finish_ordered_fn, NULL, NULL); btrfs_queue_work(root->fs_info->endio_write_workers, &ordered->work); out_test: @@ -7187,12 +7633,17 @@ static void btrfs_end_dio_bio(struct bio *bio, int err) { struct btrfs_dio_private *dip = bio->bi_private; + if (err) + btrfs_warn(BTRFS_I(dip->inode)->root->fs_info, + "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d", + btrfs_ino(dip->inode), bio->bi_rw, + (unsigned long long)bio->bi_iter.bi_sector, + bio->bi_iter.bi_size, err); + + if (dip->subio_endio) + err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err); + if (err) { - btrfs_err(BTRFS_I(dip->inode)->root->fs_info, - 
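The inline per-page loop being removed here is not lost; the same crc32c check is now centralized in __readpage_endio_check(). Conceptually it performs, per page, something like the following (sketch assembled from the helpers visible in the removed lines):

	static int check_page_csum(struct page *page, unsigned int offset,
				   unsigned int len, u32 expected)
	{
		char *kaddr = kmap_atomic(page);
		u32 csum = btrfs_csum_data(kaddr + offset, ~(u32)0, len);

		btrfs_csum_final(csum, (char *)&csum);
		kunmap_atomic(kaddr);
		return csum == expected ? 0 : -EIO;
	}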
"direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d", - btrfs_ino(dip->inode), bio->bi_rw, - (unsigned long long)bio->bi_iter.bi_sector, - bio->bi_iter.bi_size, err); dip->errors = 1; /* @@ -7223,6 +7674,38 @@ static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev, return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags); } +static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root, + struct inode *inode, + struct btrfs_dio_private *dip, + struct bio *bio, + u64 file_offset) +{ + struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); + struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio); + int ret; + + /* + * We load all the csum data we need when we submit + * the first bio to reduce the csum tree search and + * contention. + */ + if (dip->logical_offset == file_offset) { + ret = btrfs_lookup_bio_sums_dio(root, inode, dip->orig_bio, + file_offset); + if (ret) + return ret; + } + + if (bio == dip->orig_bio) + return 0; + + file_offset -= dip->logical_offset; + file_offset >>= inode->i_sb->s_blocksize_bits; + io_bio->csum = (u8 *)(((u32 *)orig_io_bio->csum) + file_offset); + + return 0; +} + static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, int rw, u64 file_offset, int skip_sum, int async_submit) @@ -7238,7 +7721,8 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, bio_get(bio); if (!write) { - ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + ret = btrfs_bio_wq_end_io(root->fs_info, bio, + BTRFS_WQ_ENDIO_DATA); if (ret) goto err; } @@ -7261,13 +7745,12 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1); if (ret) goto err; - } else if (!skip_sum) { - ret = btrfs_lookup_bio_sums_dio(root, inode, dip, bio, - file_offset); + } else { + ret = btrfs_lookup_and_bind_dio_csum(root, inode, dip, bio, + file_offset); if (ret) goto err; } - map: ret = btrfs_map_bio(root, rw, bio, 0, async_submit); err: @@ -7288,19 +7771,18 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, u64 submit_len = 0; u64 map_length; int nr_pages = 0; - int ret = 0; + int ret; int async_submit = 0; map_length = orig_bio->bi_iter.bi_size; ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, &map_length, NULL, 0); - if (ret) { - bio_put(orig_bio); + if (ret) return -EIO; - } if (map_length >= orig_bio->bi_iter.bi_size) { bio = orig_bio; + dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED; goto submit; } @@ -7314,14 +7796,16 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); if (!bio) return -ENOMEM; + bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; + btrfs_io_bio(bio)->logical = file_offset; atomic_inc(&dip->pending_bios); while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) { - if (unlikely(map_length < submit_len + bvec->bv_len || + if (map_length < submit_len + bvec->bv_len || bio_add_page(bio, bvec->bv_page, bvec->bv_len, - bvec->bv_offset) < bvec->bv_len)) { + bvec->bv_offset) < bvec->bv_len) { /* * inc the count before we submit the bio so * we know the end IO handler won't happen before @@ -7350,6 +7834,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, goto out_err; bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; + btrfs_io_bio(bio)->logical = file_offset; map_length = orig_bio->bi_iter.bi_size; ret = btrfs_map_block(root->fs_info, rw, @@ 
-7393,11 +7878,10 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_dio_private *dip; struct bio *io_bio; + struct btrfs_io_bio *btrfs_bio; int skip_sum; - int sum_len; int write = rw & REQ_WRITE; int ret = 0; - u16 csum_size; skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; @@ -7407,16 +7891,7 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, goto free_ordered; } - if (!skip_sum && !write) { - csum_size = btrfs_super_csum_size(root->fs_info->super_copy); - sum_len = dio_bio->bi_iter.bi_size >> - inode->i_sb->s_blocksize_bits; - sum_len *= csum_size; - } else { - sum_len = 0; - } - - dip = kmalloc(sizeof(*dip) + sum_len, GFP_NOFS); + dip = kzalloc(sizeof(*dip), GFP_NOFS); if (!dip) { ret = -ENOMEM; goto free_io_bio; @@ -7428,20 +7903,25 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, dip->bytes = dio_bio->bi_iter.bi_size; dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9; io_bio->bi_private = dip; - dip->errors = 0; dip->orig_bio = io_bio; dip->dio_bio = dio_bio; atomic_set(&dip->pending_bios, 0); + btrfs_bio = btrfs_io_bio(io_bio); + btrfs_bio->logical = file_offset; - if (write) + if (write) { io_bio->bi_end_io = btrfs_endio_direct_write; - else + } else { io_bio->bi_end_io = btrfs_endio_direct_read; + dip->subio_endio = btrfs_subio_endio_read; + } ret = btrfs_submit_direct_hook(rw, dip, skip_sum); if (!ret) return; + if (btrfs_bio->end_io) + btrfs_bio->end_io(btrfs_bio, ret); free_io_bio: bio_put(io_bio); @@ -7522,7 +8002,8 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, count = iov_iter_count(iter); if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &BTRFS_I(inode)->runtime_flags)) - filemap_fdatawrite_range(inode->i_mapping, offset, count); + filemap_fdatawrite_range(inode->i_mapping, offset, + offset + count - 1); if (rw & WRITE) { /* @@ -7537,8 +8018,8 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, ret = btrfs_delalloc_reserve_space(inode, count); if (ret) goto out; - } else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK, - &BTRFS_I(inode)->runtime_flags))) { + } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, + &BTRFS_I(inode)->runtime_flags)) { inode_dio_done(inode); flags = DIO_LOCKING | DIO_SKIP_HOLES; wakeup = false; @@ -7939,27 +8420,6 @@ static int btrfs_truncate(struct inode *inode) BUG_ON(ret); /* - * setattr is responsible for setting the ordered_data_close flag, - * but that is only tested during the last file release. That - * could happen well after the next commit, leaving a great big - * window where new writes may get lost if someone chooses to write - * to this file after truncating to zero - * - * The inode doesn't have any dirty data here, and so if we commit - * this is a noop. If someone immediately starts writing to the inode - * it is very likely we'll catch some of their writes in this - * transaction, and the commit will find this file on the ordered - * data list with good things to send down. - * - * This is a best effort solution, there is still a window where - * using truncate to replace the contents of the file will - * end up with a zero length file after a crash. - */ - if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, - &BTRFS_I(inode)->runtime_flags)) - btrfs_add_ordered_operation(trans, root, inode); - - /* * So if we truncate and then write and fsync we normally would just * write the extents that changed, which is a problem if we need to * first truncate that entire inode. 
So set this flag so we write out @@ -8050,6 +8510,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, set_nlink(inode, 1); btrfs_i_size_write(inode, 0); + unlock_new_inode(inode); err = btrfs_subvol_inherit_props(trans, new_root, parent_root); if (err) @@ -8078,6 +8539,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->last_sub_trans = 0; ei->logged_trans = 0; ei->delalloc_bytes = 0; + ei->defrag_bytes = 0; ei->disk_i_size = 0; ei->flags = 0; ei->csum_bytes = 0; @@ -8106,7 +8568,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) mutex_init(&ei->delalloc_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); INIT_LIST_HEAD(&ei->delalloc_inodes); - INIT_LIST_HEAD(&ei->ordered_operations); RB_CLEAR_NODE(&ei->rb_node); return inode; @@ -8137,6 +8598,7 @@ void btrfs_destroy_inode(struct inode *inode) WARN_ON(BTRFS_I(inode)->reserved_extents); WARN_ON(BTRFS_I(inode)->delalloc_bytes); WARN_ON(BTRFS_I(inode)->csum_bytes); + WARN_ON(BTRFS_I(inode)->defrag_bytes); /* * This can happen where we create an inode, but somebody else also @@ -8146,17 +8608,6 @@ void btrfs_destroy_inode(struct inode *inode) if (!root) goto free; - /* - * Make sure we're properly removed from the ordered operation - * lists. - */ - smp_mb(); - if (!list_empty(&BTRFS_I(inode)->ordered_operations)) { - spin_lock(&root->fs_info->ordered_root_lock); - list_del_init(&BTRFS_I(inode)->ordered_operations); - spin_unlock(&root->fs_info->ordered_root_lock); - } - if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, &BTRFS_I(inode)->runtime_flags)) { btrfs_info(root->fs_info, "inode %llu still on the orphan list", @@ -8338,12 +8789,10 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, ret = 0; /* - * we're using rename to replace one file with another. - * and the replacement file is large. Start IO on it now so - * we don't add too much work to the end of the transaction + * we're using rename to replace one file with another. Start IO on it + * now so we don't add too much work to the end of the transaction */ - if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size && - old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) + if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size) filemap_flush(old_inode->i_mapping); /* close the racy window with snapshot create/destroy ioctl */ @@ -8391,12 +8840,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, */ btrfs_pin_log_trans(root); } - /* - * make sure the inode gets flushed if it is replacing - * something. 
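With the ordered-operations flush gone, the rename path above relies on filemap_flush() alone; the series also wires up the new ->rename2 hook in the hunk that follows. The general shape of such a hook, for filesystems supporting only plain renames plus RENAME_NOREPLACE, is sketched below (the VFS itself performs the target-exists check for RENAME_NOREPLACE, so rejecting unknown flags is all the filesystem must add):

	static int sketch_rename2(struct inode *old_dir, struct dentry *old_dentry,
				  struct inode *new_dir, struct dentry *new_dentry,
				  unsigned int flags)
	{
		if (flags & ~RENAME_NOREPLACE)
			return -EINVAL;

		/* RENAME_NOREPLACE needs no extra work here: the VFS already
		 * failed the call with -EEXIST if the target existed. */
		return sketch_rename(old_dir, old_dentry, new_dir, new_dentry);
	}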
- */ - if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode)) - btrfs_add_ordered_operation(trans, root, old_inode); inode_inc_iversion(old_dir); inode_inc_iversion(new_dir); @@ -8476,6 +8919,16 @@ out_notrans: return ret; } +static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + if (flags & ~RENAME_NOREPLACE) + return -EINVAL; + + return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry); +} + static void btrfs_run_delalloc_work(struct btrfs_work *work) { struct btrfs_delalloc_work *delalloc_work; @@ -8514,7 +8967,9 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, work->inode = inode; work->wait = wait; work->delay_iput = delay_iput; - btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL); + WARN_ON_ONCE(!inode); + btrfs_init_work(&work->work, btrfs_flush_delalloc_helper, + btrfs_run_delalloc_work, NULL, NULL); return work; } @@ -8559,7 +9014,7 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput, spin_unlock(&root->delalloc_lock); work = btrfs_alloc_delalloc_work(inode, 0, delay_iput); - if (unlikely(!work)) { + if (!work) { if (delay_iput) btrfs_add_delayed_iput(inode); else @@ -8718,12 +9173,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, goto out_unlock; } - err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); - if (err) { - drop_inode = 1; - goto out_unlock; - } - /* * If the active LSM wants to access the inode during * d_instantiate it needs these. Smack checks to see @@ -8732,34 +9181,32 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, */ inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; + inode->i_mapping->a_ops = &btrfs_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + + err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); + if (err) + goto out_unlock_inode; err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) - drop_inode = 1; - else { - inode->i_mapping->a_ops = &btrfs_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; - } - if (drop_inode) - goto out_unlock; + goto out_unlock_inode; path = btrfs_alloc_path(); if (!path) { err = -ENOMEM; - drop_inode = 1; - goto out_unlock; + goto out_unlock_inode; } key.objectid = btrfs_ino(inode); key.offset = 0; - btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); + key.type = BTRFS_EXTENT_DATA_KEY; datasize = btrfs_file_extent_calc_inline_size(name_len); err = btrfs_insert_empty_item(trans, root, path, &key, datasize); if (err) { - drop_inode = 1; btrfs_free_path(path); - goto out_unlock; + goto out_unlock_inode; } leaf = path->nodes[0]; ei = btrfs_item_ptr(leaf, path->slots[0], @@ -8783,12 +9230,15 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, inode_set_bytes(inode, name_len); btrfs_i_size_write(inode, name_len); err = btrfs_update_inode(trans, root, inode); - if (err) + if (err) { drop_inode = 1; + goto out_unlock_inode; + } + + unlock_new_inode(inode); + d_instantiate(dentry, inode); out_unlock: - if (!err) - d_instantiate(dentry, inode); btrfs_end_transaction(trans, root); if (drop_inode) { inode_dec_link_count(inode); @@ -8796,6 +9246,11 @@ out_unlock: } btrfs_btree_balance_dirty(root); return err; + +out_unlock_inode: + drop_inode = 1; + unlock_new_inode(inode); + goto 
out_unlock; } static int __btrfs_prealloc_file_range(struct inode *inode, int mode, @@ -8979,14 +9434,6 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) goto out; } - ret = btrfs_init_inode_security(trans, inode, dir, NULL); - if (ret) - goto out; - - ret = btrfs_update_inode(trans, root, inode); - if (ret) - goto out; - inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; @@ -8994,10 +9441,26 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_mapping->backing_dev_info = &root->fs_info->bdi; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + ret = btrfs_init_inode_security(trans, inode, dir, NULL); + if (ret) + goto out_inode; + + ret = btrfs_update_inode(trans, root, inode); + if (ret) + goto out_inode; ret = btrfs_orphan_add(trans, inode); if (ret) - goto out; + goto out_inode; + /* + * We set number of links to 0 in btrfs_new_inode(), and here we set + * it to 1 because d_tmpfile() will issue a warning if the count is 0, + * through: + * + * d_tmpfile() -> inode_dec_link_count() -> drop_nlink() + */ + set_nlink(inode, 1); + unlock_new_inode(inode); d_tmpfile(dentry, inode); mark_inode_dirty(inode); @@ -9007,8 +9470,12 @@ out: iput(inode); btrfs_balance_delayed_items(root); btrfs_btree_balance_dirty(root); - return ret; + +out_inode: + unlock_new_inode(inode); + goto out; + } static const struct inode_operations btrfs_dir_inode_operations = { @@ -9019,7 +9486,7 @@ static const struct inode_operations btrfs_dir_inode_operations = { .link = btrfs_link, .mkdir = btrfs_mkdir, .rmdir = btrfs_rmdir, - .rename = btrfs_rename, + .rename2 = btrfs_rename2, .symlink = btrfs_symlink, .setattr = btrfs_setattr, .mknod = btrfs_mknod, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 47aceb494d1d..e732274f1afd 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -332,6 +332,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) goto out_drop; } else { + ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0); + if (ret && ret != -ENODATA) + goto out_drop; ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); } @@ -477,8 +480,7 @@ static noinline int create_subvol(struct inode *dir, if (ret) goto fail; - leaf = btrfs_alloc_free_block(trans, root, root->leafsize, - 0, objectid, NULL, 0, 0, 0); + leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); goto fail; @@ -503,7 +505,7 @@ static noinline int create_subvol(struct inode *dir, btrfs_set_stack_inode_generation(inode_item, 1); btrfs_set_stack_inode_size(inode_item, 3); btrfs_set_stack_inode_nlink(inode_item, 1); - btrfs_set_stack_inode_nbytes(inode_item, root->leafsize); + btrfs_set_stack_inode_nbytes(inode_item, root->nodesize); btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755); btrfs_set_root_flags(&root_item, 0); @@ -535,7 +537,7 @@ static noinline int create_subvol(struct inode *dir, key.objectid = objectid; key.offset = 0; - btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + key.type = BTRFS_ROOT_ITEM_KEY; ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, &root_item); if (ret) @@ -711,39 +713,6 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (ret) goto fail; - ret = btrfs_orphan_cleanup(pending_snapshot->snap); - if (ret) - goto fail; - - /* - * If orphan cleanup did remove any orphans, it means the tree was - * modified and therefore the commit root is not the same as the - * current root anymore. 
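The nlink juggling in btrfs_tmpfile() above (0 while anonymous, briefly 1 so d_tmpfile() can drop it back) matches what userspace observes for O_TMPFILE: the file has no name, and no link, until it is linked in. An illustrative userspace snippet (assumes kernel headers that define O_TMPFILE):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <unistd.h>

	int make_anon_file(const char *dir)
	{
		/* The inode is created with no directory entry; if the fd is
		 * closed before linkat(), the orphan item lets btrfs reclaim it. */
		int fd = open(dir, O_TMPFILE | O_RDWR, 0600);

		if (fd < 0)
			return -1;
		/* Optionally give it a name (and nlink 1) later:
		 * linkat(fd, "", AT_FDCWD, "dir/name", AT_EMPTY_PATH); */
		return fd;
	}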
This is a problem, because send uses the - * commit root and therefore can see inode items that don't exist - * in the current root anymore, and for example make calls to - * btrfs_iget, which will do tree lookups based on the current root - * and not on the commit root. Those lookups will fail, returning a - * -ESTALE error, and making send fail with that error. So make sure - * a send does not see any orphans we have just removed, and that it - * will see the same inodes regardless of whether a transaction - * commit happened before it started (meaning that the commit root - * will be the same as the current root) or not. - */ - if (readonly && pending_snapshot->snap->node != - pending_snapshot->snap->commit_root) { - trans = btrfs_join_transaction(pending_snapshot->snap); - if (IS_ERR(trans) && PTR_ERR(trans) != -ENOENT) { - ret = PTR_ERR(trans); - goto fail; - } - if (!IS_ERR(trans)) { - ret = btrfs_commit_transaction(trans, - pending_snapshot->snap); - if (ret) - goto fail; - } - } - inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); if (IS_ERR(inode)) { ret = PTR_ERR(inode); @@ -915,7 +884,7 @@ out_unlock: * file you want to defrag, we return 0 to let you know to skip this * part of the file */ -static int check_defrag_in_cache(struct inode *inode, u64 offset, int thresh) +static int check_defrag_in_cache(struct inode *inode, u64 offset, u32 thresh) { struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_map *em = NULL; @@ -950,7 +919,7 @@ static int check_defrag_in_cache(struct inode *inode, u64 offset, int thresh) */ static int find_new_extents(struct btrfs_root *root, struct inode *inode, u64 newer_than, - u64 *off, int thresh) + u64 *off, u32 thresh) { struct btrfs_path *path; struct btrfs_key min_key; @@ -969,12 +938,9 @@ static int find_new_extents(struct btrfs_root *root, min_key.offset = *off; while (1) { - path->keep_locks = 1; ret = btrfs_search_forward(root, &min_key, path, newer_than); if (ret != 0) goto none; - path->keep_locks = 0; - btrfs_unlock_up_safe(path, 1); process_slot: if (min_key.objectid != ino) goto none; @@ -1052,15 +1018,17 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em) return false; next = defrag_lookup_extent(inode, em->start + em->len); - if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE || - (em->block_start + em->block_len == next->block_start)) + if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) + ret = false; + else if ((em->block_start + em->block_len == next->block_start) && + (em->block_len > 128 * 1024 && next->block_len > 128 * 1024)) ret = false; free_extent_map(next); return ret; } -static int should_defrag_range(struct inode *inode, u64 start, int thresh, +static int should_defrag_range(struct inode *inode, u64 start, u32 thresh, u64 *last_len, u64 *skip, u64 *defrag_end, int compress) { @@ -1088,7 +1056,6 @@ static int should_defrag_range(struct inode *inode, u64 start, int thresh, } next_mergeable = defrag_check_next_extent(inode, em); - /* * we hit a real extent, if it is big or the next extent is not a * real extent, don't bother defragging it @@ -1291,7 +1258,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, int ret; int defrag_count = 0; int compress_type = BTRFS_COMPRESS_ZLIB; - int extent_thresh = range->extent_thresh; + u32 extent_thresh = range->extent_thresh; unsigned long max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; unsigned long cluster = max_cluster; u64 new_align = ~((u64)128 * 1024 - 1); @@ -1367,8 +1334,7 @@ int 
btrfs_defrag_file(struct inode *inode, struct file *file, inode->i_mapping->writeback_index = i; while (i <= last_index && defrag_count < max_to_defrag && - (i < (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT)) { + (i < DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE))) { /* * make sure we stop running if someone unmounts * the FS @@ -1391,7 +1357,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, * the should_defrag function tells us how much to skip * bump our counter by the suggested amount */ - next = (skip + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + next = DIV_ROUND_UP(skip, PAGE_CACHE_SIZE); i = max(i + 1, next); continue; } @@ -1586,7 +1552,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, goto out_free; } - old_size = device->total_bytes; + old_size = btrfs_device_get_total_bytes(device); if (mod < 0) { if (new_size > old_size) { @@ -1735,7 +1701,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY | BTRFS_SUBVOL_QGROUP_INHERIT)) { ret = -EOPNOTSUPP; - goto out; + goto free_args; } if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC) @@ -1745,27 +1711,31 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) { if (vol_args->size > PAGE_CACHE_SIZE) { ret = -EINVAL; - goto out; + goto free_args; } inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size); if (IS_ERR(inherit)) { ret = PTR_ERR(inherit); - goto out; + goto free_args; } } ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, vol_args->fd, subvol, ptr, readonly, inherit); + if (ret) + goto free_inherit; - if (ret == 0 && ptr && - copy_to_user(arg + - offsetof(struct btrfs_ioctl_vol_args_v2, - transid), ptr, sizeof(*ptr))) + if (ptr && copy_to_user(arg + + offsetof(struct btrfs_ioctl_vol_args_v2, + transid), + ptr, sizeof(*ptr))) ret = -EFAULT; -out: - kfree(vol_args); + +free_inherit: kfree(inherit); +free_args: + kfree(vol_args); return ret; } @@ -2117,8 +2087,6 @@ static noinline int search_ioctl(struct inode *inode, key.type = sk->min_type; key.offset = sk->min_offset; - path->keep_locks = 1; - while (1) { ret = btrfs_search_forward(root, &key, path, sk->min_transid); if (ret != 0) { @@ -2554,9 +2522,9 @@ out_unlock: ASSERT(dest->send_in_progress == 0); /* the last ref */ - if (dest->cache_inode) { - iput(dest->cache_inode); - dest->cache_inode = NULL; + if (dest->ino_cache_inode) { + iput(dest->ino_cache_inode); + dest->ino_cache_inode = NULL; } } out_dput: @@ -2662,6 +2630,9 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; ret = btrfs_init_new_device(root, vol_args->name); + if (!ret) + btrfs_info(root->fs_info, "disk added %s",vol_args->name); + kfree(vol_args); out: mutex_unlock(&root->fs_info->volume_mutex); @@ -2685,7 +2656,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) { ret = PTR_ERR(vol_args); - goto out; + goto err_drop; } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; @@ -2701,8 +2672,12 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) mutex_unlock(&root->fs_info->volume_mutex); atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); + if (!ret) + btrfs_info(root->fs_info, "disk deleted %s",vol_args->name); + out: kfree(vol_args); +err_drop: mnt_drop_write_file(file); return ret; } @@ -2764,8 +2739,8 @@ static long 
btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg) } di_args->devid = dev->devid; - di_args->bytes_used = dev->bytes_used; - di_args->total_bytes = dev->total_bytes; + di_args->bytes_used = btrfs_device_get_bytes_used(dev); + di_args->total_bytes = btrfs_device_get_total_bytes(dev); memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid)); if (dev->name) { struct rcu_string *name; @@ -3191,7 +3166,7 @@ static void clone_update_extent_map(struct inode *inode, em->start + em->len - 1, 0); } - if (unlikely(ret)) + if (ret) set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); } @@ -3226,7 +3201,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, u64 last_dest_end = destoff; ret = -ENOMEM; - buf = vmalloc(btrfs_level_size(root, 0)); + buf = vmalloc(root->nodesize); if (!buf) return ret; @@ -3279,11 +3254,11 @@ process_slot: slot = path->slots[0]; btrfs_item_key_to_cpu(leaf, &key, slot); - if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY || + if (key.type > BTRFS_EXTENT_DATA_KEY || key.objectid != btrfs_ino(src)) break; - if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) { + if (key.type == BTRFS_EXTENT_DATA_KEY) { struct btrfs_file_extent_item *extent; int type; u32 size; @@ -3527,7 +3502,8 @@ process_slot: btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); - last_dest_end = new_key.offset + datal; + last_dest_end = ALIGN(new_key.offset + datal, + root->sectorsize); ret = clone_finish_inode_update(trans, inode, last_dest_end, destoff, olen); @@ -5309,6 +5285,12 @@ long btrfs_ioctl(struct file *file, unsigned int if (ret) return ret; ret = btrfs_sync_fs(file->f_dentry->d_sb, 1); + /* + * The transaction thread may want to do more work, + * namely it pokes the cleaner ktread that will start + * processing uncleaned subvols. + */ + wake_up_process(root->fs_info->transaction_kthread); return ret; } case BTRFS_IOC_START_SYNC: diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index dfad8514f0da..78285f30909e 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -266,8 +266,7 @@ static int lzo_decompress_biovec(struct list_head *ws, char *data_in; unsigned long page_in_index = 0; unsigned long page_out_index = 0; - unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / - PAGE_CACHE_SIZE; + unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_CACHE_SIZE); unsigned long buf_start; unsigned long buf_offset = 0; unsigned long bytes; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 7187b14faa6c..ac734ec4cc20 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -571,18 +571,6 @@ void btrfs_remove_ordered_extent(struct inode *inode, trace_btrfs_ordered_extent_remove(inode, entry); - /* - * we have no more ordered extents for this inode and - * no dirty pages. 
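Several hunks in this series (defrag, lzo here, and raid56 later) replace the open-coded '(n + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT' rounding with DIV_ROUND_UP(). For a power-of-two page size the two forms are identical; the macro just states the intent:

	#include <linux/kernel.h>	/* DIV_ROUND_UP */
	#include <linux/pagemap.h>	/* PAGE_CACHE_SIZE, PAGE_CACHE_SHIFT */

	static unsigned long nr_pages(unsigned long len)
	{
		/* Equal to (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT. */
		return DIV_ROUND_UP(len, PAGE_CACHE_SIZE);
	}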
We can safely remove it from the - * list of ordered extents - */ - if (RB_EMPTY_ROOT(&tree->tree) && - !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { - spin_lock(&root->fs_info->ordered_root_lock); - list_del_init(&BTRFS_I(inode)->ordered_operations); - spin_unlock(&root->fs_info->ordered_root_lock); - } - if (!root->nr_ordered_extents) { spin_lock(&root->fs_info->ordered_root_lock); BUG_ON(list_empty(&root->ordered_root)); @@ -627,6 +615,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) spin_unlock(&root->ordered_extent_lock); btrfs_init_work(&ordered->flush_work, + btrfs_flush_delalloc_helper, btrfs_run_ordered_extent_work, NULL, NULL); list_add_tail(&ordered->work_list, &works); btrfs_queue_work(root->fs_info->flush_workers, @@ -687,81 +676,6 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr) } /* - * this is used during transaction commit to write all the inodes - * added to the ordered operation list. These files must be fully on - * disk before the transaction commits. - * - * we have two modes here, one is to just start the IO via filemap_flush - * and the other is to wait for all the io. When we wait, we have an - * extra check to make sure the ordered operation list really is empty - * before we return - */ -int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans, - struct btrfs_root *root, int wait) -{ - struct btrfs_inode *btrfs_inode; - struct inode *inode; - struct btrfs_transaction *cur_trans = trans->transaction; - struct list_head splice; - struct list_head works; - struct btrfs_delalloc_work *work, *next; - int ret = 0; - - INIT_LIST_HEAD(&splice); - INIT_LIST_HEAD(&works); - - mutex_lock(&root->fs_info->ordered_extent_flush_mutex); - spin_lock(&root->fs_info->ordered_root_lock); - list_splice_init(&cur_trans->ordered_operations, &splice); - while (!list_empty(&splice)) { - btrfs_inode = list_entry(splice.next, struct btrfs_inode, - ordered_operations); - inode = &btrfs_inode->vfs_inode; - - list_del_init(&btrfs_inode->ordered_operations); - - /* - * the inode may be getting freed (in sys_unlink path). - */ - inode = igrab(inode); - if (!inode) - continue; - - if (!wait) - list_add_tail(&BTRFS_I(inode)->ordered_operations, - &cur_trans->ordered_operations); - spin_unlock(&root->fs_info->ordered_root_lock); - - work = btrfs_alloc_delalloc_work(inode, wait, 1); - if (!work) { - spin_lock(&root->fs_info->ordered_root_lock); - if (list_empty(&BTRFS_I(inode)->ordered_operations)) - list_add_tail(&btrfs_inode->ordered_operations, - &splice); - list_splice_tail(&splice, - &cur_trans->ordered_operations); - spin_unlock(&root->fs_info->ordered_root_lock); - ret = -ENOMEM; - goto out; - } - list_add_tail(&work->list, &works); - btrfs_queue_work(root->fs_info->flush_workers, - &work->work); - - cond_resched(); - spin_lock(&root->fs_info->ordered_root_lock); - } - spin_unlock(&root->fs_info->ordered_root_lock); -out: - list_for_each_entry_safe(work, next, &works, list) { - list_del_init(&work->list); - btrfs_wait_and_free_delalloc_work(work); - } - mutex_unlock(&root->fs_info->ordered_extent_flush_mutex); - return ret; -} - -/* * Used to start IO or wait for a given ordered extent to finish. * * If wait is one, this effectively waits on page writeback for all the pages @@ -1120,42 +1034,6 @@ out: return index; } - -/* - * add a given inode to the list of inodes that must be fully on - * disk before a transaction commit finishes. 
- * - * This basically gives us the ext3 style data=ordered mode, and it is mostly - * used to make sure renamed files are fully on disk. - * - * It is a noop if the inode is already fully on disk. - * - * If trans is not null, we'll do a friendly check for a transaction that - * is already flushing things and force the IO down ourselves. - */ -void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode) -{ - struct btrfs_transaction *cur_trans = trans->transaction; - u64 last_mod; - - last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans); - - /* - * if this file hasn't been changed since the last transaction - * commit, we can safely return without doing anything - */ - if (last_mod <= root->fs_info->last_trans_committed) - return; - - spin_lock(&root->fs_info->ordered_root_lock); - if (list_empty(&BTRFS_I(inode)->ordered_operations)) { - list_add_tail(&BTRFS_I(inode)->ordered_operations, - &cur_trans->ordered_operations); - } - spin_unlock(&root->fs_info->ordered_root_lock); -} - int __init ordered_data_init(void) { btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent", diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 246897058efb..d81a274d621e 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -190,11 +190,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum, int len); -int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans, - struct btrfs_root *root, int wait); -void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *inode); int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr); void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr); void btrfs_get_logged_extents(struct inode *inode, diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c index 65793edb38ca..47767d5b8f0b 100644 --- a/fs/btrfs/orphan.c +++ b/fs/btrfs/orphan.c @@ -27,7 +27,7 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, int ret = 0; key.objectid = BTRFS_ORPHAN_OBJECTID; - btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); + key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = offset; path = btrfs_alloc_path(); @@ -48,7 +48,7 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, int ret = 0; key.objectid = BTRFS_ORPHAN_OBJECTID; - btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); + key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = offset; path = btrfs_alloc_path(); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 9626b4ad3b9a..647ab12fdf5d 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -195,7 +195,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) for (i = 0 ; i < nr ; i++) { item = btrfs_item_nr(i); btrfs_item_key_to_cpu(l, &key, i); - type = btrfs_key_type(&key); + type = key.type; printk(KERN_INFO "\titem %d key (%llu %u %llu) itemoff %d " "itemsize %d\n", i, key.objectid, type, key.offset, @@ -336,7 +336,6 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c) for (i = 0; i < nr; i++) { struct extent_buffer *next = read_tree_block(root, btrfs_node_blockptr(c, i), - btrfs_level_size(root, level - 1), btrfs_node_ptr_generation(c, i)); if (btrfs_is_leaf(next) && level != 1) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 98cb6b2630f9..48b60dbf807f 100644 --- 
a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -539,10 +539,9 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_key key; -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, "a_root->state))) + if (btrfs_test_is_dummy_root(quota_root)) return 0; -#endif + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -551,9 +550,15 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans, key.type = BTRFS_QGROUP_INFO_KEY; key.offset = qgroupid; + /* + * Avoid a transaction abort by catching -EEXIST here. In that + * case, we proceed by re-initializing the existing structure + * on disk. + */ + ret = btrfs_insert_empty_item(trans, quota_root, path, &key, sizeof(*qgroup_info)); - if (ret) + if (ret && ret != -EEXIST) goto out; leaf = path->nodes[0]; @@ -572,7 +577,7 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans, key.type = BTRFS_QGROUP_LIMIT_KEY; ret = btrfs_insert_empty_item(trans, quota_root, path, &key, sizeof(*qgroup_limit)); - if (ret) + if (ret && ret != -EEXIST) goto out; leaf = path->nodes[0]; @@ -692,10 +697,9 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans, int ret; int slot; -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + if (btrfs_test_is_dummy_root(root)) return 0; -#endif + key.objectid = 0; key.type = BTRFS_QGROUP_INFO_KEY; key.offset = qgroup->qgroupid; @@ -1201,6 +1205,50 @@ out: mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; } + +static int comp_oper_exist(struct btrfs_qgroup_operation *oper1, + struct btrfs_qgroup_operation *oper2) +{ + /* + * Ignore seq and type here, we're looking for any operation + * at all related to this extent on that root. + */ + if (oper1->bytenr < oper2->bytenr) + return -1; + if (oper1->bytenr > oper2->bytenr) + return 1; + if (oper1->ref_root < oper2->ref_root) + return -1; + if (oper1->ref_root > oper2->ref_root) + return 1; + return 0; +} + +static int qgroup_oper_exists(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper) +{ + struct rb_node *n; + struct btrfs_qgroup_operation *cur; + int cmp; + + spin_lock(&fs_info->qgroup_op_lock); + n = fs_info->qgroup_op_tree.rb_node; + while (n) { + cur = rb_entry(n, struct btrfs_qgroup_operation, n); + cmp = comp_oper_exist(cur, oper); + if (cmp < 0) { + n = n->rb_right; + } else if (cmp) { + n = n->rb_left; + } else { + spin_unlock(&fs_info->qgroup_op_lock); + return -EEXIST; + } + } + spin_unlock(&fs_info->qgroup_op_lock); + return 0; +} + static int comp_oper(struct btrfs_qgroup_operation *oper1, struct btrfs_qgroup_operation *oper2) { @@ -1290,6 +1338,25 @@ int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq); INIT_LIST_HEAD(&oper->elem.list); oper->elem.seq = 0; + + trace_btrfs_qgroup_record_ref(oper); + + if (type == BTRFS_QGROUP_OPER_SUB_SUBTREE) { + /* + * If any operation for this bytenr/ref_root combo + * exists, then we know it's not exclusively owned and + * shouldn't be queued up. + * + * This also catches the case where we have a cloned + * extent that gets queued up multiple times during + * drop snapshot. + */ + if (qgroup_oper_exists(fs_info, oper)) { + kfree(oper); + return 0; + } + } + ret = insert_qgroup_oper(fs_info, oper); if (ret) { /* Shouldn't happen so have an assert for developers */ @@ -1884,6 +1951,111 @@ out: } /* + * Process a reference to a shared subtree. 
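qgroup_oper_exists() above is a textbook red-black-tree lookup driven by a three-way comparator over (bytenr, ref_root). The same shape with the key made explicit, as a standalone sketch (struct item and its fields are illustrative stand-ins):

	#include <linux/rbtree.h>

	struct item {
		struct rb_node node;
		u64 bytenr;
		u64 ref_root;
	};

	static struct item *item_search(struct rb_root *root, u64 bytenr, u64 ref_root)
	{
		struct rb_node *n = root->rb_node;

		while (n) {
			struct item *cur = rb_entry(n, struct item, node);

			if (bytenr < cur->bytenr ||
			    (bytenr == cur->bytenr && ref_root < cur->ref_root))
				n = n->rb_left;
			else if (bytenr > cur->bytenr ||
				 (bytenr == cur->bytenr && ref_root > cur->ref_root))
				n = n->rb_right;
			else
				return cur;	/* (bytenr, ref_root) already queued */
		}
		return NULL;
	}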
This type of operation is + * queued during snapshot removal when we encounter extents which are + * shared between more than one root. + */ +static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper) +{ + struct ulist *roots = NULL; + struct ulist_node *unode; + struct ulist_iterator uiter; + struct btrfs_qgroup_list *glist; + struct ulist *parents; + int ret = 0; + int err; + struct btrfs_qgroup *qg; + u64 root_obj = 0; + struct seq_list elem = {}; + + parents = ulist_alloc(GFP_NOFS); + if (!parents) + return -ENOMEM; + + btrfs_get_tree_mod_seq(fs_info, &elem); + ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, + elem.seq, &roots); + btrfs_put_tree_mod_seq(fs_info, &elem); + if (ret < 0) + goto out; + + if (roots->nnodes != 1) + goto out; + + ULIST_ITER_INIT(&uiter); + unode = ulist_next(roots, &uiter); /* Only want 1 so no need to loop */ + /* + * If we find our ref root then that means all refs + * this extent has to the root have not yet been + * deleted. In that case, we do nothing and let the + * last ref for this bytenr drive our update. + * + * This can happen for example if an extent is + * referenced multiple times in a snapshot (clone, + * etc). If we are in the middle of snapshot removal, + * queued updates for such an extent will find the + * root if we have not yet finished removing the + * snapshot. + */ + if (unode->val == oper->ref_root) + goto out; + + root_obj = unode->val; + BUG_ON(!root_obj); + + spin_lock(&fs_info->qgroup_lock); + qg = find_qgroup_rb(fs_info, root_obj); + if (!qg) + goto out_unlock; + + qg->excl += oper->num_bytes; + qg->excl_cmpr += oper->num_bytes; + qgroup_dirty(fs_info, qg); + + /* + * Adjust counts for parent groups. First we find all + * parents, then in the 2nd loop we do the adjustment + * while adding parents of the parents to our ulist. + */ + list_for_each_entry(glist, &qg->groups, next_group) { + err = ulist_add(parents, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (err < 0) { + ret = err; + goto out_unlock; + } + } + + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(parents, &uiter))) { + qg = u64_to_ptr(unode->aux); + qg->excl += oper->num_bytes; + qg->excl_cmpr += oper->num_bytes; + qgroup_dirty(fs_info, qg); + + /* Add any parents of the parents */ + list_for_each_entry(glist, &qg->groups, next_group) { + err = ulist_add(parents, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (err < 0) { + ret = err; + goto out_unlock; + } + } + } + +out_unlock: + spin_unlock(&fs_info->qgroup_lock); + +out: + ulist_free(roots); + ulist_free(parents); + return ret; +} + +/* * btrfs_qgroup_account_ref is called for every ref that is added to or deleted * from the fs. First, all roots referencing the extent are searched, and * then the space is accounted accordingly to the different roots. 
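The parent walk in qgroup_subtree_accounting() above uses a ulist as both work queue and visited set, which is what keeps a qgroup reachable through two parent chains from being adjusted twice. Reduced to its skeleton (fragment: parents, uiter, glist, and qg as declared in the function above; add_bytes() is a hypothetical stand-in for the excl/excl_cmpr updates, and error handling is elided):

	ULIST_ITER_INIT(&uiter);
	ulist_add(parents, qg->qgroupid, ptr_to_u64(qg), GFP_ATOMIC);
	while ((unode = ulist_next(parents, &uiter))) {
		qg = u64_to_ptr(unode->aux);
		add_bytes(qg, oper->num_bytes);
		/* ulist_add() ignores duplicates, so shared ancestors are
		 * queued, and therefore adjusted, exactly once. */
		list_for_each_entry(glist, &qg->groups, next_group)
			ulist_add(parents, glist->group->qgroupid,
				  ptr_to_u64(glist->group), GFP_ATOMIC);
	}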
The @@ -1911,6 +2083,8 @@ static int btrfs_qgroup_account(struct btrfs_trans_handle *trans, ASSERT(is_fstree(oper->ref_root)); + trace_btrfs_qgroup_account(oper); + switch (oper->type) { case BTRFS_QGROUP_OPER_ADD_EXCL: case BTRFS_QGROUP_OPER_SUB_EXCL: @@ -1920,6 +2094,9 @@ static int btrfs_qgroup_account(struct btrfs_trans_handle *trans, case BTRFS_QGROUP_OPER_SUB_SHARED: ret = qgroup_shared_accounting(trans, fs_info, oper); break; + case BTRFS_QGROUP_OPER_SUB_SUBTREE: + ret = qgroup_subtree_accounting(trans, fs_info, oper); + break; default: ASSERT(0); } @@ -2068,7 +2245,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, if (srcid) { struct btrfs_root *srcroot; struct btrfs_key srckey; - int srcroot_level; srckey.objectid = srcid; srckey.type = BTRFS_ROOT_ITEM_KEY; @@ -2080,8 +2256,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, } rcu_read_lock(); - srcroot_level = btrfs_header_level(srcroot->node); - level_size = btrfs_level_size(srcroot, srcroot_level); + level_size = srcroot->nodesize; rcu_read_unlock(); } @@ -2397,7 +2572,7 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, found.type != BTRFS_METADATA_ITEM_KEY) continue; if (found.type == BTRFS_METADATA_ITEM_KEY) - num_bytes = fs_info->extent_root->leafsize; + num_bytes = fs_info->extent_root->nodesize; else num_bytes = found.offset; @@ -2551,6 +2726,7 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, memset(&fs_info->qgroup_rescan_work, 0, sizeof(fs_info->qgroup_rescan_work)); btrfs_init_work(&fs_info->qgroup_rescan_work, + btrfs_qgroup_rescan_helper, btrfs_qgroup_rescan_worker, NULL, NULL); if (ret) { diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 5952ff1fbd7a..18cc68ca3090 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -44,6 +44,7 @@ enum btrfs_qgroup_operation_type { BTRFS_QGROUP_OPER_ADD_SHARED, BTRFS_QGROUP_OPER_SUB_EXCL, BTRFS_QGROUP_OPER_SUB_SHARED, + BTRFS_QGROUP_OPER_SUB_SUBTREE, }; struct btrfs_qgroup_operation { diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 4a88f073fdd7..6a41631cb959 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -912,7 +912,7 @@ static struct page *page_in_rbio(struct btrfs_raid_bio *rbio, static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) { unsigned long nr = stripe_len * nr_stripes; - return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + return DIV_ROUND_UP(nr, PAGE_CACHE_SIZE); } /* @@ -1416,7 +1416,8 @@ cleanup: static void async_rmw_stripe(struct btrfs_raid_bio *rbio) { - btrfs_init_work(&rbio->work, rmw_work, NULL, NULL); + btrfs_init_work(&rbio->work, btrfs_rmw_helper, + rmw_work, NULL, NULL); btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work); @@ -1424,7 +1425,8 @@ static void async_rmw_stripe(struct btrfs_raid_bio *rbio) static void async_read_rebuild(struct btrfs_raid_bio *rbio) { - btrfs_init_work(&rbio->work, read_rebuild_work, NULL, NULL); + btrfs_init_work(&rbio->work, btrfs_rmw_helper, + read_rebuild_work, NULL, NULL); btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work); @@ -1440,7 +1442,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) struct btrfs_bio *bbio = rbio->bbio; struct bio_list bio_list; int ret; - int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); int pagenr; int stripe; struct bio *bio; @@ -1665,7 +1667,8 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) plug = container_of(cb, 
struct btrfs_plug_cb, cb); if (from_schedule) { - btrfs_init_work(&plug->work, unplug_work, NULL, NULL); + btrfs_init_work(&plug->work, btrfs_rmw_helper, + unplug_work, NULL, NULL); btrfs_queue_work(plug->info->rmw_workers, &plug->work); return; @@ -1722,7 +1725,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) int pagenr, stripe; void **pointers; int faila = -1, failb = -1; - int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); struct page *page; int err; int i; @@ -1937,7 +1940,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) struct btrfs_bio *bbio = rbio->bbio; struct bio_list bio_list; int ret; - int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); int pagenr; int stripe; struct bio *bio; diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 09230cf3a244..b63ae20618fb 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -347,7 +347,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, if (!re) return NULL; - blocksize = btrfs_level_size(root, level); + blocksize = root->nodesize; re->logical = logical; re->blocksize = blocksize; re->top = *top; @@ -798,7 +798,8 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info) /* FIXME we cannot handle this properly right now */ BUG(); } - btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL); + btrfs_init_work(&rmw->work, btrfs_readahead_helper, + reada_start_machine_worker, NULL, NULL); rmw->fs_info = fs_info; btrfs_queue_work(fs_info->readahead_workers, &rmw->work); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 65245a07275b..74257d6436ad 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -736,7 +736,8 @@ again: err = ret; goto out; } - BUG_ON(!ret || !path1->slots[0]); + ASSERT(ret); + ASSERT(path1->slots[0]); path1->slots[0]--; @@ -746,10 +747,10 @@ again: * the backref was added previously when processing * backref of type BTRFS_TREE_BLOCK_REF_KEY */ - BUG_ON(!list_is_singular(&cur->upper)); + ASSERT(list_is_singular(&cur->upper)); edge = list_entry(cur->upper.next, struct backref_edge, list[LOWER]); - BUG_ON(!list_empty(&edge->list[UPPER])); + ASSERT(list_empty(&edge->list[UPPER])); exist = edge->node[UPPER]; /* * add the upper level block to pending list if we need @@ -831,7 +832,7 @@ again: cur->cowonly = 1; } #else - BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); + ASSERT(key.type != BTRFS_EXTENT_REF_V0_KEY); if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) { #endif if (key.objectid == key.offset) { @@ -840,7 +841,7 @@ again: * backref of this type. 
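The relocation hunks here systematically demote BUG_ON() checks on internal invariants to ASSERT(), and, in the hunks just below, to ASSERT(0) plus a graceful -EINVAL return. A hedged sketch of the idiom in plain C; the real ASSERT is gated on CONFIG_BTRFS_ASSERT, this stand-in on a local DEBUG define:

#include <stdio.h>
#include <stdlib.h>

#define EINVAL 22	/* local stand-in, not <errno.h> */

#ifdef DEBUG
/* Debug builds: blow up loudly, like BUG_ON() used to. */
#define ASSERT(expr) \
	do { \
		if (!(expr)) { \
			fprintf(stderr, "assertion failed: %s:%d\n", \
				__FILE__, __LINE__); \
			abort(); \
		} \
	} while (0)
#else
/* Production builds: compiles away entirely. */
#define ASSERT(expr) ((void)0)
#endif

/* Old style: BUG_ON(!checked) took the whole machine down.
 * New style: assert for developers, return an error for users. */
static int process_node(int checked)
{
	if (!checked) {
		ASSERT(0);	/* still want to blow up in debug builds */
		return -EINVAL;	/* but fail gracefully otherwise */
	}
	return 0;
}

int main(void)
{
	printf("checked node   -> %d\n", process_node(1));
	printf("unchecked node -> %d\n", process_node(0));
	return 0;
}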
*/ root = find_reloc_root(rc, cur->bytenr); - BUG_ON(!root); + ASSERT(root); cur->root = root; break; } @@ -868,7 +869,7 @@ again: } else { upper = rb_entry(rb_node, struct backref_node, rb_node); - BUG_ON(!upper->checked); + ASSERT(upper->checked); INIT_LIST_HEAD(&edge->list[UPPER]); } list_add_tail(&edge->list[LOWER], &cur->upper); @@ -892,7 +893,7 @@ again: if (btrfs_root_level(&root->root_item) == cur->level) { /* tree root */ - BUG_ON(btrfs_root_bytenr(&root->root_item) != + ASSERT(btrfs_root_bytenr(&root->root_item) == cur->bytenr); if (should_ignore_root(root)) list_add(&cur->list, &useless); @@ -927,7 +928,7 @@ again: need_check = true; for (; level < BTRFS_MAX_LEVEL; level++) { if (!path2->nodes[level]) { - BUG_ON(btrfs_root_bytenr(&root->root_item) != + ASSERT(btrfs_root_bytenr(&root->root_item) == lower->bytenr); if (should_ignore_root(root)) list_add(&lower->list, &useless); @@ -977,12 +978,15 @@ again: need_check = false; list_add_tail(&edge->list[UPPER], &list); - } else + } else { + if (upper->checked) + need_check = true; INIT_LIST_HEAD(&edge->list[UPPER]); + } } else { upper = rb_entry(rb_node, struct backref_node, rb_node); - BUG_ON(!upper->checked); + ASSERT(upper->checked); INIT_LIST_HEAD(&edge->list[UPPER]); if (!upper->owner) upper->owner = btrfs_header_owner(eb); @@ -1026,7 +1030,7 @@ next: * everything goes well, connect backref nodes and insert backref nodes * into the cache. */ - BUG_ON(!node->checked); + ASSERT(node->checked); cowonly = node->cowonly; if (!cowonly) { rb_node = tree_insert(&cache->rb_root, node->bytenr, @@ -1062,8 +1066,21 @@ next: continue; } - BUG_ON(!upper->checked); - BUG_ON(cowonly != upper->cowonly); + if (!upper->checked) { + /* + * Still want to blow up for developers since this is a + * logic bug. + */ + ASSERT(0); + err = -EINVAL; + goto out; + } + if (cowonly != upper->cowonly) { + ASSERT(0); + err = -EINVAL; + goto out; + } + if (!cowonly) { rb_node = tree_insert(&cache->rb_root, upper->bytenr, &upper->rb_node); @@ -1086,7 +1103,7 @@ next: while (!list_empty(&useless)) { upper = list_entry(useless.next, struct backref_node, list); list_del_init(&upper->list); - BUG_ON(!list_empty(&upper->upper)); + ASSERT(list_empty(&upper->upper)); if (upper == node) node = NULL; if (upper->lowest) { @@ -1119,29 +1136,45 @@ out: if (err) { while (!list_empty(&useless)) { lower = list_entry(useless.next, - struct backref_node, upper); - list_del_init(&lower->upper); + struct backref_node, list); + list_del_init(&lower->list); } - upper = node; - INIT_LIST_HEAD(&list); - while (upper) { - if (RB_EMPTY_NODE(&upper->rb_node)) { - list_splice_tail(&upper->upper, &list); - free_backref_node(cache, upper); - } - - if (list_empty(&list)) - break; - - edge = list_entry(list.next, struct backref_edge, - list[LOWER]); + while (!list_empty(&list)) { + edge = list_first_entry(&list, struct backref_edge, + list[UPPER]); + list_del(&edge->list[UPPER]); list_del(&edge->list[LOWER]); + lower = edge->node[LOWER]; upper = edge->node[UPPER]; free_backref_edge(cache, edge); + + /* + * Lower is no longer linked to any upper backref nodes + * and isn't in the cache, we can free it ourselves. 
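The reworked error path above drains the edge list iteratively instead of recursing node by node: pop an edge, unlink it from both endpoints, and queue any node that just lost its last link for freeing. A much-simplified userspace model; the kernel threads edges onto per-node lists, which this sketch collapses into a plain reference count, and all names are invented:

#include <stdio.h>
#include <stdlib.h>

struct node {
	int id;
	int links;	/* edges still referencing this node */
};

struct edge {
	struct node *lower, *upper;
	struct edge *next;	/* the work list */
};

/* Free a node once its last edge is gone. */
static void put_node(struct node *n)
{
	if (--n->links == 0) {
		printf("free node %d\n", n->id);
		free(n);
	}
}

/* Drop every edge on @work without recursion, so an arbitrarily deep
 * backref graph cannot overflow the stack on the error path. */
static void teardown(struct edge *work)
{
	while (work) {
		struct edge *e = work;

		work = e->next;
		put_node(e->lower);
		put_node(e->upper);
		free(e);
	}
}

int main(void)
{
	struct node *a = malloc(sizeof(*a));
	struct node *b = malloc(sizeof(*b));
	struct edge *e = malloc(sizeof(*e));

	if (!a || !b || !e)
		return 1;
	a->id = 1; a->links = 1;
	b->id = 2; b->links = 1;
	e->lower = a; e->upper = b; e->next = NULL;
	teardown(e);
	return 0;
}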
+ */ + if (list_empty(&lower->upper) && + RB_EMPTY_NODE(&lower->rb_node)) + list_add(&lower->list, &useless); + + if (!RB_EMPTY_NODE(&upper->rb_node)) + continue; + + /* Add this guy's upper edges to the list to process */ + list_for_each_entry(edge, &upper->upper, list[LOWER]) + list_add_tail(&edge->list[UPPER], &list); + if (list_empty(&upper->upper)) + list_add(&upper->list, &useless); + } + + while (!list_empty(&useless)) { + lower = list_entry(useless.next, + struct backref_node, list); + list_del_init(&lower->list); + free_backref_node(cache, lower); } return ERR_PTR(err); } - BUG_ON(node && node->detached); + ASSERT(!node || !node->detached); return node; } @@ -1787,7 +1820,7 @@ again: btrfs_node_key_to_cpu(parent, next_key, slot + 1); old_bytenr = btrfs_node_blockptr(parent, slot); - blocksize = btrfs_level_size(dest, level - 1); + blocksize = dest->nodesize; old_ptr_gen = btrfs_node_ptr_generation(parent, slot); if (level <= max_level) { @@ -1813,8 +1846,7 @@ again: break; } - eb = read_tree_block(dest, old_bytenr, blocksize, - old_ptr_gen); + eb = read_tree_block(dest, old_bytenr, old_ptr_gen); if (!eb || !extent_buffer_uptodate(eb)) { ret = (!eb) ? -ENOMEM : -EIO; free_extent_buffer(eb); @@ -1944,7 +1976,6 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, u64 bytenr; u64 ptr_gen = 0; u64 last_snapshot; - u32 blocksize; u32 nritems; last_snapshot = btrfs_root_last_snapshot(&root->root_item); @@ -1970,8 +2001,7 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, } bytenr = btrfs_node_blockptr(eb, path->slots[i]); - blocksize = btrfs_level_size(root, i - 1); - eb = read_tree_block(root, bytenr, blocksize, ptr_gen); + eb = read_tree_block(root, bytenr, ptr_gen); if (!eb || !extent_buffer_uptodate(eb)) {
free_extent_buffer(eb); return -EIO; @@ -2858,20 +2886,6 @@ static int get_tree_block_key(struct reloc_control *rc, return 0; } -static int reada_tree_block(struct reloc_control *rc, - struct tree_block *block) -{ - BUG_ON(block->key_ready); - if (block->key.type == BTRFS_METADATA_ITEM_KEY) - readahead_tree_block(rc->extent_root, block->bytenr, - block->key.objectid, - rc->extent_root->leafsize); - else - readahead_tree_block(rc->extent_root, block->bytenr, - block->key.objectid, block->key.offset); - return 0; -} - /* * helper function to relocate a tree block */ @@ -2951,7 +2965,8 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, while (rb_node) { block = rb_entry(rb_node, struct tree_block, rb_node); if (!block->key_ready) - reada_tree_block(rc, block); + readahead_tree_block(rc->extent_root, block->bytenr, + block->key.objectid); rb_node = rb_next(rb_node); } @@ -3313,7 +3328,7 @@ static int add_tree_block(struct reloc_control *rc, return -ENOMEM; block->bytenr = extent_key->objectid; - block->key.objectid = rc->extent_root->leafsize; + block->key.objectid = rc->extent_root->nodesize; block->key.offset = generation; block->level = level; block->key_ready = 0; @@ -3640,7 +3655,7 @@ int add_data_references(struct reloc_control *rc, struct btrfs_extent_inline_ref *iref; unsigned long ptr; unsigned long end; - u32 blocksize = btrfs_level_size(rc->extent_root, 0); + u32 blocksize = rc->extent_root->nodesize; int ret = 0; int err = 0; @@ -3783,7 +3798,7 @@ next: } if (key.type == BTRFS_METADATA_ITEM_KEY && - key.objectid + rc->extent_root->leafsize <= + key.objectid + rc->extent_root->nodesize <= rc->search_start) { path->slots[0]++; goto next; @@ -3801,7 +3816,7 @@ next: rc->search_start = key.objectid + key.offset; else rc->search_start = key.objectid + - rc->extent_root->leafsize; + rc->extent_root->nodesize; memcpy(extent_key, &key, sizeof(key)); return 0; } @@ -4096,7 +4111,6 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC); btrfs_mark_buffer_dirty(leaf); - btrfs_release_path(path); out: btrfs_free_path(path); return ret; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index b6d198f5181e..efa083113827 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -137,7 +137,6 @@ struct scrub_ctx { int pages_per_rd_bio; u32 sectorsize; u32 nodesize; - u32 leafsize; int is_dev_replace; struct scrub_wr_ctx wr_ctx; @@ -178,17 +177,12 @@ struct scrub_copy_nocow_ctx { struct scrub_warning { struct btrfs_path *path; u64 extent_item_size; - char *scratch_buf; - char *msg_buf; const char *errstr; sector_t sector; u64 logical; struct btrfs_device *dev; - int msg_bufsize; - int scratch_bufsize; }; - static void scrub_pending_bio_inc(struct scrub_ctx *sctx); static void scrub_pending_bio_dec(struct scrub_ctx *sctx); static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx); @@ -428,8 +422,8 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) sbio->index = i; sbio->sctx = sctx; sbio->page_count = 0; - btrfs_init_work(&sbio->work, scrub_bio_end_io_worker, - NULL, NULL); + btrfs_init_work(&sbio->work, btrfs_scrub_helper, + scrub_bio_end_io_worker, NULL, NULL); if (i != SCRUB_BIOS_PER_SCTX - 1) sctx->bios[i]->next_free = i + 1; @@ -438,7 +432,6 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) } sctx->first_free = 0; sctx->nodesize = dev->dev_root->nodesize; - sctx->leafsize = dev->dev_root->leafsize; sctx->sectorsize = 
dev->dev_root->sectorsize; atomic_set(&sctx->bios_in_flight, 0); atomic_set(&sctx->workers_pending, 0); @@ -553,7 +546,6 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) u64 ref_root; u32 item_size; u8 ref_level; - const int bufsize = 4096; int ret; WARN_ON(sblock->page_count < 1); @@ -561,18 +553,13 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) fs_info = sblock->sctx->dev_root->fs_info; path = btrfs_alloc_path(); + if (!path) + return; - swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS); - swarn.msg_buf = kmalloc(bufsize, GFP_NOFS); swarn.sector = (sblock->pagev[0]->physical) >> 9; swarn.logical = sblock->pagev[0]->logical; swarn.errstr = errstr; swarn.dev = NULL; - swarn.msg_bufsize = bufsize; - swarn.scratch_bufsize = bufsize; - - if (!path || !swarn.scratch_buf || !swarn.msg_buf) - goto out; ret = extent_from_logical(fs_info, swarn.logical, path, &found_key, &flags); @@ -613,8 +600,6 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) out: btrfs_free_path(path); - kfree(swarn.scratch_buf); - kfree(swarn.msg_buf); } static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) @@ -681,9 +666,9 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) ret = -EIO; goto out; } - fs_info = BTRFS_I(inode)->root->fs_info; - ret = repair_io_failure(fs_info, offset, PAGE_SIZE, + ret = repair_io_failure(inode, offset, PAGE_SIZE, fixup->logical, page, + offset - page_offset(page), fixup->mirror_num); unlock_page(page); corrected = !ret; @@ -999,8 +984,8 @@ nodatasum_case: fixup_nodatasum->root = fs_info->extent_root; fixup_nodatasum->mirror_num = failed_mirror_index + 1; scrub_pending_trans_workers_inc(sctx); - btrfs_init_work(&fixup_nodatasum->work, scrub_fixup_nodatasum, - NULL, NULL); + btrfs_init_work(&fixup_nodatasum->work, btrfs_scrub_helper, + scrub_fixup_nodatasum, NULL, NULL); btrfs_queue_work(fs_info->scrub_workers, &fixup_nodatasum->work); goto out; @@ -1361,6 +1346,16 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, return; } +static inline int scrub_check_fsid(u8 fsid[], + struct scrub_page *spage) +{ + struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices; + int ret; + + ret = memcmp(fsid, fs_devices->fsid, BTRFS_UUID_SIZE); + return !ret; +} + static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, struct scrub_block *sblock, int is_metadata, int have_csum, @@ -1380,7 +1375,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, h = (struct btrfs_header *)mapped_buffer; if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) || - memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) || + !scrub_check_fsid(h->fsid, sblock->pagev[0]) || memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, BTRFS_UUID_SIZE)) { sblock->header_error = 1; @@ -1616,7 +1611,8 @@ static void scrub_wr_bio_end_io(struct bio *bio, int err) sbio->err = err; sbio->bio = bio; - btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL); + btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper, + scrub_wr_bio_end_io_worker, NULL, NULL); btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work); } @@ -1750,14 +1746,13 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h)) ++fail; - if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) + if (!scrub_check_fsid(h->fsid, sblock->pagev[0])) ++fail; if 
(memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, BTRFS_UUID_SIZE)) ++fail; - WARN_ON(sctx->nodesize != sctx->leafsize); len = sctx->nodesize - BTRFS_CSUM_SIZE; mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; @@ -1790,8 +1785,6 @@ static int scrub_checksum_super(struct scrub_block *sblock) { struct btrfs_super_block *s; struct scrub_ctx *sctx = sblock->sctx; - struct btrfs_root *root = sctx->dev_root; - struct btrfs_fs_info *fs_info = root->fs_info; u8 calculated_csum[BTRFS_CSUM_SIZE]; u8 on_disk_csum[BTRFS_CSUM_SIZE]; struct page *page; @@ -1816,7 +1809,7 @@ static int scrub_checksum_super(struct scrub_block *sblock) if (sblock->pagev[0]->generation != btrfs_super_generation(s)) ++fail_gen; - if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) + if (!scrub_check_fsid(s->fsid, sblock->pagev[0])) ++fail_cor; len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE; @@ -2195,7 +2188,6 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, sctx->stat.data_bytes_scrubbed += len; spin_unlock(&sctx->stat_lock); } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - WARN_ON(sctx->nodesize != sctx->leafsize); blocksize = sctx->nodesize; spin_lock(&sctx->stat_lock); sctx->stat.tree_extents_scrubbed++; @@ -2486,7 +2478,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, btrfs_item_key_to_cpu(l, &key, slot); if (key.type == BTRFS_METADATA_ITEM_KEY) - bytes = root->leafsize; + bytes = root->nodesize; else bytes = key.offset; @@ -2713,7 +2705,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, if (found_key.objectid != scrub_dev->devid) break; - if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY) + if (found_key.type != BTRFS_DEV_EXTENT_KEY) break; if (found_key.offset >= end) break; @@ -2827,11 +2819,16 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) return -EIO; - gen = root->fs_info->last_trans_committed; + /* Seed devices of a new filesystem have their own generation.
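scrub_check_fsid() above exists because a seed device carries the fsid (and, per the hunk that follows, the generation) of the filesystem it was created in, so scrub must compare block headers against the device's own fs_devices rather than against the mounted filesystem. A small standalone model of that comparison, with toy UUIDs and invented names:

#include <stdio.h>
#include <string.h>

#define UUID_SIZE 16

/* Per-device fsid check, modeling scrub_check_fsid(): compare the
 * on-disk header fsid with the fsid of the device's own fs_devices,
 * not with the sprouted filesystem's fsid. */
static int check_fsid(const unsigned char hdr_fsid[UUID_SIZE],
		      const unsigned char dev_fsid[UUID_SIZE])
{
	return memcmp(hdr_fsid, dev_fsid, UUID_SIZE) == 0;
}

int main(void)
{
	unsigned char seed[UUID_SIZE] = { 0xaa };	/* seed fs fsid */
	unsigned char sprout[UUID_SIZE] = { 0xbb };	/* new fs fsid */

	/* A block on the seed device matches the seed fsid... */
	printf("against seed fsid:   %d\n", check_fsid(seed, seed));
	/* ...but would spuriously fail against the sprout's fsid. */
	printf("against sprout fsid: %d\n", check_fsid(seed, sprout));
	return 0;
}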
*/ + if (scrub_dev->fs_devices != root->fs_info->fs_devices) + gen = scrub_dev->generation; + else + gen = root->fs_info->last_trans_committed; for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { bytenr = btrfs_sb_offset(i); - if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes) + if (bytenr + BTRFS_SUPER_INFO_SIZE > + scrub_dev->commit_total_bytes) break; ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, @@ -2904,21 +2901,11 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, struct scrub_ctx *sctx; int ret; struct btrfs_device *dev; + struct rcu_string *name; if (btrfs_fs_closing(fs_info)) return -EINVAL; - /* - * check some assumptions - */ - if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) { - btrfs_err(fs_info, - "scrub: size assumption nodesize == leafsize (%d == %d) fails", - fs_info->chunk_root->nodesize, - fs_info->chunk_root->leafsize); - return -EINVAL; - } - if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) { /* * in this case scrub is unable to calculate the checksum @@ -2965,6 +2952,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, return -ENODEV; } + if (!is_dev_replace && !readonly && !dev->writeable) { + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + rcu_read_lock(); + name = rcu_dereference(dev->name); + btrfs_err(fs_info, "scrub: device %s is not writable", + name->str); + rcu_read_unlock(); + return -EROFS; + } + mutex_lock(&fs_info->scrub_lock); if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) { mutex_unlock(&fs_info->scrub_lock); @@ -3203,7 +3200,8 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, nocow_ctx->len = len; nocow_ctx->mirror_num = mirror_num; nocow_ctx->physical_for_dev_replace = physical_for_dev_replace; - btrfs_init_work(&nocow_ctx->work, copy_nocow_pages_worker, NULL, NULL); + btrfs_init_work(&nocow_ctx->work, btrfs_scrubnc_helper, + copy_nocow_pages_worker, NULL, NULL); INIT_LIST_HEAD(&nocow_ctx->inodes); btrfs_queue_work(fs_info->scrub_nocow_workers, &nocow_ctx->work); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 6528aa662181..874828dd0a86 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -515,7 +515,8 @@ static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off) set_fs(KERNEL_DS); while (pos < len) { - ret = vfs_write(filp, (char *)buf + pos, len - pos, off); + ret = vfs_write(filp, (__force const char __user *)buf + pos, + len - pos, off); /* TODO handle that correctly */ /*if (ret == -ERESTARTSYS) { continue; @@ -985,11 +986,13 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, int num; u8 type; - if (found_key->type == BTRFS_XATTR_ITEM_KEY) - buf_len = BTRFS_MAX_XATTR_SIZE(root); - else - buf_len = PATH_MAX; - + /* + * Start with a small buffer (1 page). If later we end up needing more + * space, which can happen for xattrs on a fs with a leaf size greater + * than the page size, attempt to increase the buffer. Typically xattr + * values are small.
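The iterate_dir_item() rework just described grows the buffer on demand: krealloc() first and, when a large physically contiguous block cannot be had, vmalloc(), with kvfree() releasing either kind at the end. Userspace has no kmalloc/vmalloc split, so this sketch only models the grow-on-demand and fail-cleanly parts; the helper name is invented:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Grow *bufp to at least @need bytes, preserving contents. The kernel
 * analog tries krealloc() and falls back to vmalloc(); here plain
 * realloc() stands in for both. On failure the old buffer is freed
 * exactly once and *bufp is left NULL. */
static int grow_buf(char **bufp, size_t *lenp, size_t need)
{
	char *tmp;

	if (need <= *lenp)
		return 0;
	tmp = realloc(*bufp, need);
	if (!tmp) {
		free(*bufp);
		*bufp = NULL;
		return -1;
	}
	*bufp = tmp;
	*lenp = need;
	return 0;
}

int main(void)
{
	size_t len = 16;	/* the small initial buffer */
	char *buf = malloc(len);

	if (!buf)
		return 1;
	strcpy(buf, "xattr value");
	if (grow_buf(&buf, &len, 4096) == 0)	/* big name+data pair */
		printf("grew to %zu bytes: %s\n", len, buf);
	free(buf);	/* free(NULL) is a no-op, like kvfree() */
	return 0;
}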
+ */ + buf_len = PATH_MAX; buf = kmalloc(buf_len, GFP_NOFS); if (!buf) { ret = -ENOMEM; goto out; } @@ -1016,7 +1019,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, ret = -ENAMETOOLONG; goto out; } - if (name_len + data_len > buf_len) { + if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(root)) { ret = -E2BIG; goto out; } @@ -1024,12 +1027,34 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, /* * Path too long */ - if (name_len + data_len > buf_len) { + if (name_len + data_len > PATH_MAX) { ret = -ENAMETOOLONG; goto out; } } + if (name_len + data_len > buf_len) { + buf_len = name_len + data_len; + if (is_vmalloc_addr(buf)) { + vfree(buf); + buf = NULL; + } else { + char *tmp = krealloc(buf, buf_len, + GFP_NOFS | __GFP_NOWARN); + + if (!tmp) + kfree(buf); + buf = tmp; + } + if (!buf) { + buf = vmalloc(buf_len); + if (!buf) { + ret = -ENOMEM; + goto out; + } + } + } + read_extent_buffer(eb, buf, (unsigned long)(di + 1), name_len + data_len); @@ -1050,7 +1075,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, } out: - kfree(buf); + kvfree(buf); return ret; } @@ -3302,7 +3327,7 @@ static int wait_for_parent_move(struct send_ctx *sctx, if (ret < 0 && ret != -ENOENT) { goto out; } else if (ret == -ENOENT) { - ret = 1; + ret = 0; break; } @@ -5703,7 +5728,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) NULL); sort_clone_roots = 1; - current->journal_info = (void *)BTRFS_SEND_TRANS_STUB; + current->journal_info = BTRFS_SEND_TRANS_STUB; ret = send_subvol(sctx); current->journal_info = NULL; if (ret < 0) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 8e16bca69c56..a2b97ef10317 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -60,6 +60,7 @@ #include "backref.h" #include "tests/btrfs-tests.h" +#include "qgroup.h" #define CREATE_TRACE_POINTS #include <trace/events/btrfs.h> @@ -307,13 +308,7 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, static void btrfs_put_super(struct super_block *sb) { - (void)close_ctree(btrfs_sb(sb)->tree_root); - /* FIXME: need to fix VFS to return error? */ - /* AV: return it _where_? ->put_super() can be triggered by any number * of async events, up to and including delivery of SIGKILL to the * last process that kept it busy. Or segfault in the aforementioned * process... Whom would you report that to? */ + close_ctree(btrfs_sb(sb)->tree_root); } enum { @@ -400,7 +395,6 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) int ret = 0; char *compress_type; bool compress_force = false; - bool compress = false; cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); if (cache_gen) @@ -478,7 +472,6 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) /* Fallthrough */ case Opt_compress: case Opt_compress_type: - compress = true; if (token == Opt_compress || token == Opt_compress_force || strcmp(args[0].from, "zlib") == 0) { @@ -508,11 +501,18 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_set_and_info(root, FORCE_COMPRESS, "force %s compression", compress_type); - } else if (compress) { + } else { if (!btrfs_test_opt(root, COMPRESS)) btrfs_info(root->fs_info, "btrfs: use %s compression", compress_type); + /* + * If we remount from compress-force=xxx to + * compress=xxx, we need to clear the FORCE_COMPRESS + * flag, otherwise there is no way for users + * to disable forcible compression separately.
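The compress-force fix in this hunk works because COMPRESS and FORCE_COMPRESS are independent option bits: remounting from compress-force=zlib to compress=zlib must explicitly drop the force bit or it silently persists. A toy model of the bit semantics; the parser and flag names are stand-ins, not the btrfs option code:

#include <stdio.h>
#include <string.h>

#define OPT_COMPRESS		(1U << 0)
#define OPT_FORCE_COMPRESS	(1U << 1)

static unsigned int mount_opt;

/* Minimal stand-in for the Opt_compress / Opt_compress_force cases. */
static void parse_compress(const char *token)
{
	if (strcmp(token, "compress-force") == 0) {
		mount_opt |= OPT_COMPRESS | OPT_FORCE_COMPRESS;
	} else if (strcmp(token, "compress") == 0) {
		mount_opt |= OPT_COMPRESS;
		/* the fix: plain "compress" must drop any force bit
		 * left over from an earlier mount or remount */
		mount_opt &= ~OPT_FORCE_COMPRESS;
	}
}

int main(void)
{
	parse_compress("compress-force");
	printf("after compress-force: %#x\n", mount_opt);	/* 0x3 */
	parse_compress("compress");	/* simulated remount */
	printf("after compress:       %#x\n", mount_opt);	/* 0x1 */
	return 0;
}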
+ */ + btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); } break; case Opt_ssd: @@ -851,7 +851,6 @@ static struct dentry *get_default_root(struct super_block *sb, struct btrfs_path *path; struct btrfs_key location; struct inode *inode; - struct dentry *dentry; u64 dir_id; int new = 0; @@ -922,13 +921,7 @@ setup_root: return dget(sb->s_root); } - dentry = d_obtain_alias(inode); - if (!IS_ERR(dentry)) { - spin_lock(&dentry->d_lock); - dentry->d_flags &= ~DCACHE_DISCONNECTED; - spin_unlock(&dentry->d_lock); - } - return dentry; + return d_obtain_root(inode); } static int btrfs_fill_super(struct super_block *sb, @@ -1021,7 +1014,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",nodatacow"); if (btrfs_test_opt(root, NOBARRIER)) seq_puts(seq, ",nobarrier"); - if (info->max_inline != 8192 * 1024) + if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE) seq_printf(seq, ",max_inline=%llu", info->max_inline); if (info->alloc_start != 0) seq_printf(seq, ",alloc_start=%llu", info->alloc_start); @@ -1222,6 +1215,56 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags, return root; } +static int parse_security_options(char *orig_opts, + struct security_mnt_opts *sec_opts) +{ + char *secdata = NULL; + int ret = 0; + + secdata = alloc_secdata(); + if (!secdata) + return -ENOMEM; + ret = security_sb_copy_data(orig_opts, secdata); + if (ret) { + free_secdata(secdata); + return ret; + } + ret = security_sb_parse_opts_str(secdata, sec_opts); + free_secdata(secdata); + return ret; +} + +static int setup_security_options(struct btrfs_fs_info *fs_info, + struct super_block *sb, + struct security_mnt_opts *sec_opts) +{ + int ret = 0; + + /* + * Call security_sb_set_mnt_opts() to check whether the new sec_opts + * is valid. + */ + ret = security_sb_set_mnt_opts(sb, sec_opts, 0, NULL); + if (ret) + return ret; + +#ifdef CONFIG_SECURITY + if (!fs_info->security_opts.num_mnt_opts) { + /* first time security setup, copy sec_opts to fs_info */ + memcpy(&fs_info->security_opts, sec_opts, sizeof(*sec_opts)); + } else { + /* + * Since SELinux (the only LSM that supports security_mnt_opts) + * does NOT support changing the context during a remount or a + * mount of the same sb, this must be the same or part of the + * same security options, so just free it. + */ + security_free_mnt_opts(sec_opts); + } +#endif + return ret; +} + /* * Find a superblock for the given device / mount point. * @@ -1236,6 +1279,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, struct dentry *root; struct btrfs_fs_devices *fs_devices = NULL; struct btrfs_fs_info *fs_info = NULL; + struct security_mnt_opts new_sec_opts; fmode_t mode = FMODE_READ; char *subvol_name = NULL; u64 subvol_objectid = 0; @@ -1258,9 +1302,16 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, return root; } + security_init_mnt_opts(&new_sec_opts); + if (data) { + error = parse_security_options(data, &new_sec_opts); + if (error) + return ERR_PTR(error); + } + error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices); if (error) - return ERR_PTR(error); + goto error_sec_opts; /* * Setup a dummy root and fs_info for test/set super. This is because * we don't actually fill this stuff out until open_ctree happens, so * then open_ctree will properly initialize everything later.
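setup_security_options() above is deliberately asymmetric: the first mount takes ownership of the parsed options, while a later remount with matching options just frees its copy, since SELinux does not allow the context of a mounted sb to change. A rough userspace model of that ownership handoff; the types and names are invented:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Toy stand-ins for security_mnt_opts and the per-fs copy. */
struct sec_opts { char *ctx; };
struct fs_state { struct sec_opts opts; };

/* First call stores the options on the fs; subsequent calls assume the
 * options match (the real code has the LSM verify this) and free the
 * caller's copy. Either way the caller no longer owns @new->ctx. */
static void setup_security(struct fs_state *fs, struct sec_opts *new)
{
	if (!fs->opts.ctx) {
		fs->opts = *new;	/* first mount: take ownership */
		new->ctx = NULL;
	} else {
		free(new->ctx);		/* remount: just free the copy */
		new->ctx = NULL;
	}
}

int main(void)
{
	struct fs_state fs = { { NULL } };
	struct sec_opts a = { strdup("system_u:object_r:foo_t") };
	struct sec_opts b = { strdup("system_u:object_r:foo_t") };

	setup_security(&fs, &a);	/* stored on the fs */
	setup_security(&fs, &b);	/* duplicate, freed */
	printf("fs context: %s\n", fs.opts.ctx);
	free(fs.opts.ctx);
	return 0;
}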
*/ fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS); - if (!fs_info) - return ERR_PTR(-ENOMEM); + if (!fs_info) { + error = -ENOMEM; + goto error_sec_opts; + } fs_info->fs_devices = fs_devices; fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); + security_init_mnt_opts(&fs_info->security_opts); if (!fs_info->super_copy || !fs_info->super_for_commit) { error = -ENOMEM; goto error_fs_info; @@ -1313,8 +1367,19 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, } root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error); - if (IS_ERR(root)) + if (IS_ERR(root)) { deactivate_locked_super(s); + error = PTR_ERR(root); + goto error_sec_opts; + } + + fs_info = btrfs_sb(s); + error = setup_security_options(fs_info, s, &new_sec_opts); + if (error) { + dput(root); + deactivate_locked_super(s); + goto error_sec_opts; + } return root; @@ -1322,6 +1387,8 @@ error_close_devices: btrfs_close_devices(fs_devices); error_fs_info: free_fs_info(fs_info); +error_sec_opts: + security_free_mnt_opts(&new_sec_opts); return ERR_PTR(error); } @@ -1403,6 +1470,21 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) sync_filesystem(sb); btrfs_remount_prepare(fs_info); + if (data) { + struct security_mnt_opts new_sec_opts; + + security_init_mnt_opts(&new_sec_opts); + ret = parse_security_options(data, &new_sec_opts); + if (ret) + goto restore; + ret = setup_security_options(fs_info, sb, + &new_sec_opts); + if (ret) { + security_free_mnt_opts(&new_sec_opts); + goto restore; + } + } + ret = btrfs_parse_options(root, data); if (ret) { ret = -EINVAL; @@ -1672,6 +1754,21 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) return 0; } +/* + * Calculate numbers for 'df', pessimistic in case of mixed raid profiles. + * + * If there's a redundant raid level at DATA block groups, use the respective + * multiplier to scale the sizes. + * + * Unused device space usage is based on simulating the chunk allocator + * algorithm that respects the device sizes, order of allocations and the + * 'alloc_start' value, this is a close approximation of the actual use but + * there are other factors that may change the result (like a new metadata + * chunk). + * + * FIXME: not accurate for mixed block groups, total and free/used are ok, + * available appears slightly larger. 
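The arithmetic the comment above describes can be seen in miniature below: raw device bytes are divided by a redundancy factor (2 for RAID1/DUP/RAID10 data) before conversion to blocks, and the global block reserve, which is already in logical units, is subtracted from the free count afterwards. All figures here are made up; this is a model of the scaling, not the btrfs code:

#include <stdio.h>

int main(void)
{
	unsigned long long total_bytes = 2ULL << 40;	/* 2 TiB raw */
	unsigned long long used_bytes = 600ULL << 30;	/* 600 GiB raw */
	unsigned long long rsv = 512ULL << 20;	/* global block reserve */
	unsigned int factor = 2;	/* RAID1: every byte stored twice */
	int bits = 12;			/* 4 KiB block size */

	/* div_u64(..., factor) >> bits in the statfs hunk below */
	unsigned long long f_blocks = (total_bytes / factor) >> bits;
	unsigned long long f_bfree =
		f_blocks - ((used_bytes / factor) >> bits);

	/* the reserve is logical already, so no factor applied */
	f_bfree -= rsv >> bits;
	printf("f_blocks=%llu f_bfree=%llu\n", f_blocks, f_bfree);
	return 0;
}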
+ */ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb); @@ -1682,36 +1779,66 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) u64 total_free_data = 0; int bits = dentry->d_sb->s_blocksize_bits; __be32 *fsid = (__be32 *)fs_info->fsid; + unsigned factor = 1; + struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; int ret; - /* holding chunk_muext to avoid allocating new chunks */ + /* + * holding chunk_mutex to avoid allocating new chunks, holding + * device_list_mutex to avoid the device being removed + */ + mutex_lock(&fs_info->fs_devices->device_list_mutex); mutex_lock(&fs_info->chunk_mutex); rcu_read_lock(); list_for_each_entry_rcu(found, head, list) { if (found->flags & BTRFS_BLOCK_GROUP_DATA) { + int i; + total_free_data += found->disk_total - found->disk_used; total_free_data -= btrfs_account_ro_block_groups_free_space(found); + + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { + if (!list_empty(&found->block_groups[i])) { + switch (i) { + case BTRFS_RAID_DUP: + case BTRFS_RAID_RAID1: + case BTRFS_RAID_RAID10: + factor = 2; + } + } + } } total_used += found->disk_used; } + rcu_read_unlock(); - buf->f_namelen = BTRFS_NAME_LEN; - buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; - buf->f_bfree = buf->f_blocks - (total_used >> bits); - buf->f_bsize = dentry->d_sb->s_blocksize; - buf->f_type = BTRFS_SUPER_MAGIC; + buf->f_blocks = div_u64(btrfs_super_total_bytes(disk_super), factor); + buf->f_blocks >>= bits; + buf->f_bfree = buf->f_blocks - (div_u64(total_used, factor) >> bits); + + /* Account global block reserve as used, it's in logical size already */ + spin_lock(&block_rsv->lock); + buf->f_bfree -= block_rsv->size >> bits; + spin_unlock(&block_rsv->lock); + buf->f_bavail = total_free_data; ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data); if (ret) { mutex_unlock(&fs_info->chunk_mutex); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); return ret; } - buf->f_bavail += total_free_data; + buf->f_bavail += div_u64(total_free_data, factor); buf->f_bavail = buf->f_bavail >> bits; mutex_unlock(&fs_info->chunk_mutex); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + + buf->f_type = BTRFS_SUPER_MAGIC; + buf->f_bsize = dentry->d_sb->s_blocksize; + buf->f_namelen = BTRFS_NAME_LEN; /* We treat it as constant endianness (it doesn't matter _which_) because we want the fsid to come out the same whether mounted @@ -1737,7 +1864,7 @@ static struct file_system_type btrfs_fs_type = { .name = "btrfs", .mount = btrfs_mount, .kill_sb = btrfs_kill_super, - .fs_flags = FS_REQUIRES_DEV, + .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA, }; MODULE_ALIAS_FS("btrfs"); @@ -1961,11 +2088,15 @@ static int __init init_btrfs_fs(void) err = btrfs_prelim_ref_init(); if (err) + goto free_delayed_ref; + + err = btrfs_end_io_wq_init(); + if (err) goto free_prelim_ref; err = btrfs_interface_init(); if (err) - goto free_delayed_ref; + goto free_end_io_wq; btrfs_init_lockdep(); @@ -1983,6 +2114,8 @@ static int __init init_btrfs_fs(void) unregister_ioctl: btrfs_interface_exit(); +free_end_io_wq: + btrfs_end_io_wq_exit(); free_prelim_ref: btrfs_prelim_ref_exit(); free_delayed_ref: diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 78699364f537..b2e7bb4393f6 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -242,7 +242,7 @@ static ssize_t global_rsv_size_show(struct kobject *kobj, struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; return
btrfs_show_u64(&block_rsv->size, &block_rsv->lock, buf); } -BTRFS_ATTR(global_rsv_size, 0444, global_rsv_size_show); +BTRFS_ATTR(global_rsv_size, global_rsv_size_show); static ssize_t global_rsv_reserved_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) @@ -251,7 +251,7 @@ static ssize_t global_rsv_reserved_show(struct kobject *kobj, struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; return btrfs_show_u64(&block_rsv->reserved, &block_rsv->lock, buf); } -BTRFS_ATTR(global_rsv_reserved, 0444, global_rsv_reserved_show); +BTRFS_ATTR(global_rsv_reserved, global_rsv_reserved_show); #define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj) #define to_raid_kobj(_kobj) container_of(_kobj, struct raid_kobject, kobj) @@ -306,7 +306,7 @@ static ssize_t btrfs_space_info_show_##field(struct kobject *kobj, \ struct btrfs_space_info *sinfo = to_space_info(kobj); \ return btrfs_show_u64(&sinfo->field, &sinfo->lock, buf); \ } \ -BTRFS_ATTR(field, 0444, btrfs_space_info_show_##field) +BTRFS_ATTR(field, btrfs_space_info_show_##field) static ssize_t btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj, struct kobj_attribute *a, @@ -325,7 +325,7 @@ SPACE_INFO_ATTR(bytes_reserved); SPACE_INFO_ATTR(bytes_may_use); SPACE_INFO_ATTR(disk_used); SPACE_INFO_ATTR(disk_total); -BTRFS_ATTR(total_bytes_pinned, 0444, btrfs_space_info_show_total_bytes_pinned); +BTRFS_ATTR(total_bytes_pinned, btrfs_space_info_show_total_bytes_pinned); static struct attribute *space_info_attrs[] = { BTRFS_ATTR_PTR(flags), @@ -363,7 +363,8 @@ static ssize_t btrfs_label_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return snprintf(buf, PAGE_SIZE, "%s\n", fs_info->super_copy->label); + char *label = fs_info->super_copy->label; + return snprintf(buf, PAGE_SIZE, label[0] ? 
"%s\n" : "%s", label); } static ssize_t btrfs_label_store(struct kobject *kobj, @@ -374,8 +375,18 @@ static ssize_t btrfs_label_store(struct kobject *kobj, struct btrfs_trans_handle *trans; struct btrfs_root *root = fs_info->fs_root; int ret; + size_t p_len; - if (len >= BTRFS_LABEL_SIZE) + if (fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + /* + * p_len is the len until the first occurrence of either + * '\n' or '\0' + */ + p_len = strcspn(buf, "\n"); + + if (p_len >= BTRFS_LABEL_SIZE) return -EINVAL; trans = btrfs_start_transaction(root, 0); @@ -383,7 +394,8 @@ static ssize_t btrfs_label_store(struct kobject *kobj, return PTR_ERR(trans); spin_lock(&root->fs_info->super_lock); - strcpy(fs_info->super_copy->label, buf); + memset(fs_info->super_copy->label, 0, BTRFS_LABEL_SIZE); + memcpy(fs_info->super_copy->label, buf, p_len); spin_unlock(&root->fs_info->super_lock); ret = btrfs_commit_transaction(trans, root); @@ -392,14 +404,7 @@ static ssize_t btrfs_label_store(struct kobject *kobj, return ret; } -BTRFS_ATTR_RW(label, 0644, btrfs_label_show, btrfs_label_store); - -static ssize_t btrfs_no_store(struct kobject *kobj, - struct kobj_attribute *a, - const char *buf, size_t len) -{ - return -EPERM; -} +BTRFS_ATTR_RW(label, btrfs_label_show, btrfs_label_store); static ssize_t btrfs_nodesize_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) @@ -409,7 +414,7 @@ static ssize_t btrfs_nodesize_show(struct kobject *kobj, return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize); } -BTRFS_ATTR_RW(nodesize, 0444, btrfs_nodesize_show, btrfs_no_store); +BTRFS_ATTR(nodesize, btrfs_nodesize_show); static ssize_t btrfs_sectorsize_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) @@ -419,7 +424,7 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj, return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize); } -BTRFS_ATTR_RW(sectorsize, 0444, btrfs_sectorsize_show, btrfs_no_store); +BTRFS_ATTR(sectorsize, btrfs_sectorsize_show); static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) @@ -429,7 +434,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize); } -BTRFS_ATTR_RW(clone_alignment, 0444, btrfs_clone_alignment_show, btrfs_no_store); +BTRFS_ATTR(clone_alignment, btrfs_clone_alignment_show); static struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(label), @@ -614,7 +619,7 @@ int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info, if (!fs_info->device_dir_kobj) return -EINVAL; - if (one_device) { + if (one_device && one_device->bdev) { disk = one_device->bdev->bd_part; disk_kobj = &part_to_dev(disk)->kobj; diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index ac46df37504c..f7dd298b3cf6 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -20,16 +20,20 @@ enum btrfs_feature_set { .store = _store, \ } -#define BTRFS_ATTR_RW(_name, _mode, _show, _store) \ -static struct kobj_attribute btrfs_attr_##_name = \ - __INIT_KOBJ_ATTR(_name, _mode, _show, _store) -#define BTRFS_ATTR(_name, _mode, _show) \ - BTRFS_ATTR_RW(_name, _mode, _show, NULL) +#define BTRFS_ATTR_RW(_name, _show, _store) \ + static struct kobj_attribute btrfs_attr_##_name = \ + __INIT_KOBJ_ATTR(_name, 0644, _show, _store) + +#define BTRFS_ATTR(_name, _show) \ + static struct kobj_attribute btrfs_attr_##_name = \ + __INIT_KOBJ_ATTR(_name, 0444, _show, NULL) + #define BTRFS_ATTR_PTR(_name) (&btrfs_attr_##_name.attr) #define BTRFS_RAID_ATTR(_name, 
_show) \ -static struct kobj_attribute btrfs_raid_attr_##_name = \ + static struct kobj_attribute btrfs_raid_attr_##_name = \ __INIT_KOBJ_ATTR(_name, 0444, _show, NULL) + #define BTRFS_RAID_ATTR_PTR(_name) (&btrfs_raid_attr_##_name.attr) diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index c8d9ddf84c69..2299bfde39ee 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -40,11 +40,12 @@ static struct btrfs_block_group_cache *init_test_block_group(void) cache->key.offset = 1024 * 1024 * 1024; cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; cache->sectorsize = 4096; + cache->full_stripe_len = 4096; spin_lock_init(&cache->lock); INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->cluster_list); - INIT_LIST_HEAD(&cache->new_bg_list); + INIT_LIST_HEAD(&cache->bg_list); btrfs_init_free_space_ctl(cache); @@ -364,6 +365,517 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache) return 0; } +/* Used by test_steal_space_from_bitmap_to_extent(). */ +static bool test_use_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + return ctl->free_extents > 0; +} + +/* Used by test_steal_space_from_bitmap_to_extent(). */ +static int +check_num_extents_and_bitmaps(const struct btrfs_block_group_cache *cache, + const int num_extents, + const int num_bitmaps) +{ + if (cache->free_space_ctl->free_extents != num_extents) { + test_msg("Incorrect # of extent entries in the cache: %d, expected %d\n", + cache->free_space_ctl->free_extents, num_extents); + return -EINVAL; + } + if (cache->free_space_ctl->total_bitmaps != num_bitmaps) { + test_msg("Incorrect # of bitmap entries in the cache: %d, expected %d\n", + cache->free_space_ctl->total_bitmaps, num_bitmaps); + return -EINVAL; + } + return 0; +} + +/* Used by test_steal_space_from_bitmap_to_extent(). */ +static int check_cache_empty(struct btrfs_block_group_cache *cache) +{ + u64 offset; + u64 max_extent_size; + + /* + * Now let's confirm that there's absolutely no free space left to + * allocate. + */ + if (cache->free_space_ctl->free_space != 0) { + test_msg("Cache free space is not 0\n"); + return -EINVAL; + } + + /* And any allocation request, no matter how small, should fail now. */ + offset = btrfs_find_space_for_alloc(cache, 0, 4096, 0, + &max_extent_size); + if (offset != 0) { + test_msg("Space allocation did not fail, returned offset: %llu", + offset); + return -EINVAL; + } + + /* And no extent nor bitmap entries in the cache anymore. */ + return check_num_extents_and_bitmaps(cache, 0, 0); +} + +/* + * Before we were able to steal free space from a bitmap entry to an extent + * entry, we could end up with 2 entries representing a contiguous free space. + * One would be an extent entry and the other a bitmap entry. Since in order + * to allocate space to a caller we use only 1 entry, we couldn't return that + * whole range to the caller if it was requested. This forced the caller to + * either assume ENOSPC or perform several smaller space allocations, which + * wasn't optimal as they could be spread all over the block group while under + * concurrency (extra overhead and fragmentation). + * + * This stealing approach is beneficial, since we always prefer to allocate from + * extent entries, both for clustered and non-clustered allocation requests.
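Concretely, "stealing" lets an extent entry [start, start+len) absorb the leading run of free bits from the bitmap that begins where the extent ends, so a single entry then describes the whole contiguous range. A toy model with a 16-bit bitmap, where a set bit means one free block; the granularity and sizes are invented:

#include <stdio.h>

#define BITS 16

int main(void)
{
	unsigned long long start = 100, len = 4;	/* extent [100, 104) */
	unsigned int bitmap = 0x000F;	/* bits 0-3 set: 4 free blocks
					 * starting right at block 104 */
	unsigned int stolen = 0;

	/* count the leading run of free bits adjacent to the extent */
	while (stolen < BITS && (bitmap & (1U << stolen)))
		stolen++;
	bitmap &= ~((1U << stolen) - 1);	/* clear what we stole */
	len += stolen;				/* extend the extent */

	printf("extent now [%llu, %llu), bitmap now %#x\n",
	       start, start + len, bitmap);
	return 0;
}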
+ */ +static int +test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) +{ + int ret; + u64 offset; + u64 max_extent_size; + + bool (*use_bitmap_op)(struct btrfs_free_space_ctl *, + struct btrfs_free_space *); + + test_msg("Running space stealing from bitmap to extent\n"); + + /* + * For this test, we want to ensure we end up with an extent entry + * immediately adjacent to a bitmap entry, where the bitmap starts + * at an offset where the extent entry ends. We keep adding and + * removing free space to reach into this state, but to get there + * we need to reach a point where marking new free space doesn't + * result in adding new extent entries or merging the new space + * with existing extent entries - the space ends up being marked + * in an existing bitmap that covers the new free space range. + * + * To get there, we need to reach the threshold defined at + * cache->free_space_ctl->extents_thresh, which currently is + * 256 extents on an x86_64 system at least, and a few other + * conditions (check free_space_cache.c). Instead of making the + * test much longer and more complicated, use a "use_bitmap" operation + * that forces use of bitmaps as soon as we have at least 1 + * extent entry. + */ + use_bitmap_op = cache->free_space_ctl->op->use_bitmap; + cache->free_space_ctl->op->use_bitmap = test_use_bitmap; + + /* + * Extent entry covering free space range [128Mb - 256Kb, 128Mb - 128Kb[ + */ + ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 - 256 * 1024, + 128 * 1024, 0); + if (ret) { + test_msg("Couldn't add extent entry %d\n", ret); + return ret; + } + + /* Bitmap entry covering free space range [128Mb + 512Kb, 256Mb[ */ + ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 + 512 * 1024, + 128 * 1024 * 1024 - 512 * 1024, 1); + if (ret) { + test_msg("Couldn't add bitmap entry %d\n", ret); + return ret; + } + + ret = check_num_extents_and_bitmaps(cache, 2, 1); + if (ret) + return ret; + + /* + * Now make only the first 256Kb of the bitmap marked as free, so that + * we end up with only the following ranges marked as free space: + * + * [128Mb - 256Kb, 128Mb - 128Kb[ + * [128Mb + 512Kb, 128Mb + 768Kb[ + */ + ret = btrfs_remove_free_space(cache, + 128 * 1024 * 1024 + 768 * 1024, + 128 * 1024 * 1024 - 768 * 1024); + if (ret) { + test_msg("Failed to free part of bitmap space %d\n", ret); + return ret; + } + + /* Confirm that only those 2 ranges are marked as free. */ + if (!test_check_exists(cache, 128 * 1024 * 1024 - 256 * 1024, + 128 * 1024)) { + test_msg("Free space range missing\n"); + return -ENOENT; + } + if (!test_check_exists(cache, 128 * 1024 * 1024 + 512 * 1024, + 256 * 1024)) { + test_msg("Free space range missing\n"); + return -ENOENT; + } + + /* + * Confirm that the bitmap range [128Mb + 768Kb, 256Mb[ isn't marked + * as free anymore. + */ + if (test_check_exists(cache, 128 * 1024 * 1024 + 768 * 1024, + 128 * 1024 * 1024 - 768 * 1024)) { + test_msg("Bitmap region not removed from space cache\n"); + return -EINVAL; + } + + /* + * Confirm that the region [128Mb + 256Kb, 128Mb + 512Kb[, which is + * covered by the bitmap, isn't marked as free. + */ + if (test_check_exists(cache, 128 * 1024 * 1024 + 256 * 1024, + 256 * 1024)) { + test_msg("Invalid bitmap region marked as free\n"); + return -EINVAL; + } + + /* + * Confirm that the region [128Mb, 128Mb + 256Kb[, which is covered + * by the bitmap too, isn't marked as free either.
+ */ + if (test_check_exists(cache, 128 * 1024 * 1024, + 256 * 1024)) { + test_msg("Invalid bitmap region marked as free\n"); + return -EINVAL; + } + + /* + * Now let's mark the region [128Mb, 128Mb + 512Kb[ as free too. But, + * let's make sure the free space cache marks it as free in the bitmap, + * and doesn't insert a new extent entry to represent this region. + */ + ret = btrfs_add_free_space(cache, 128 * 1024 * 1024, 512 * 1024); + if (ret) { + test_msg("Error adding free space: %d\n", ret); + return ret; + } + /* Confirm the region is marked as free. */ + if (!test_check_exists(cache, 128 * 1024 * 1024, 512 * 1024)) { + test_msg("Bitmap region not marked as free\n"); + return -ENOENT; + } + + /* + * Confirm that no new extent entries or bitmap entries were added to + * the cache after adding that free space region. + */ + ret = check_num_extents_and_bitmaps(cache, 2, 1); + if (ret) + return ret; + + /* + * Now let's add a small free space region to the right of the previous + * one, which is not contiguous with it and is part of the bitmap too. + * The goal is to test that the bitmap entry space stealing doesn't + * steal this space region. + */ + ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 + 16 * 1024 * 1024, + 4096); + if (ret) { + test_msg("Error adding free space: %d\n", ret); + return ret; + } + + /* + * Confirm that no new extent entries or bitmap entries were added to + * the cache after adding that free space region. + */ + ret = check_num_extents_and_bitmaps(cache, 2, 1); + if (ret) + return ret; + + /* + * Now mark the region [128Mb - 128Kb, 128Mb[ as free too. This will + * expand the range covered by the existing extent entry that represents + * the free space [128Mb - 256Kb, 128Mb - 128Kb[. + */ + ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 - 128 * 1024, + 128 * 1024); + if (ret) { + test_msg("Error adding free space: %d\n", ret); + return ret; + } + /* Confirm the region is marked as free. */ + if (!test_check_exists(cache, 128 * 1024 * 1024 - 128 * 1024, + 128 * 1024)) { + test_msg("Extent region not marked as free\n"); + return -ENOENT; + } + + /* + * Confirm that our extent entry didn't steal all free space from the + * bitmap, because of the small 4Kb free space region. + */ + ret = check_num_extents_and_bitmaps(cache, 2, 1); + if (ret) + return ret; + + /* + * So now we have the range [128Mb - 256Kb, 128Mb + 768Kb[ as free + * space. Without stealing bitmap free space into extent entry space, + * we would have all this free space represented by 2 entries in the + * cache: + * + * extent entry covering range: [128Mb - 256Kb, 128Mb[ + * bitmap entry covering range: [128Mb, 128Mb + 768Kb[ + * + * Attempting to allocate the whole free space (1Mb) would fail, because + * we can't allocate from multiple entries. + * With the bitmap free space stealing, we get a single extent entry + * that represents the 1Mb free space, and therefore we're able to + * allocate the whole free space at once.
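The failure mode described above is easy to reproduce in miniature: an allocator that must satisfy each request from a single entry returns ENOSPC for a 1Mb request split across an extent entry and a bitmap entry, but succeeds once stealing has merged them. A sketch using first-fit over toy entries, not btrfs_find_space_for_alloc() itself:

#include <stdio.h>

/* A free-space entry covering [start, start + len). */
struct entry { unsigned long long start, len; };

/* An allocation may come from one entry only, as in btrfs. */
static long long alloc_from(struct entry *e, int n, unsigned long long want)
{
	for (int i = 0; i < n; i++)
		if (e[i].len >= want)
			return (long long)e[i].start;
	return -1;	/* ENOSPC: no single entry is big enough */
}

int main(void)
{
	/* 1 MiB of contiguous free space split across two entries */
	struct entry split[2] = {
		{ 0,         256 << 10 },	/* extent entry, 256 KiB */
		{ 256 << 10, 768 << 10 },	/* bitmap entry, 768 KiB */
	};
	/* the same space after stealing merged it into one entry */
	struct entry merged[1] = { { 0, 1 << 20 } };

	printf("split entries:  %lld\n", alloc_from(split, 2, 1 << 20));
	printf("merged entry:   %lld\n", alloc_from(merged, 1, 1 << 20));
	return 0;
}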
+ */ + if (!test_check_exists(cache, 128 * 1024 * 1024 - 256 * 1024, + 1 * 1024 * 1024)) { + test_msg("Expected region not marked as free\n"); + return -ENOENT; + } + + if (cache->free_space_ctl->free_space != (1 * 1024 * 1024 + 4096)) { + test_msg("Cache free space is not 1Mb + 4Kb\n"); + return -EINVAL; + } + + offset = btrfs_find_space_for_alloc(cache, + 0, 1 * 1024 * 1024, 0, + &max_extent_size); + if (offset != (128 * 1024 * 1024 - 256 * 1024)) { + test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n", + offset); + return -EINVAL; + } + + /* All that remains is a 4Kb free space region in a bitmap. Confirm. */ + ret = check_num_extents_and_bitmaps(cache, 1, 1); + if (ret) + return ret; + + if (cache->free_space_ctl->free_space != 4096) { + test_msg("Cache free space is not 4Kb\n"); + return -EINVAL; + } + + offset = btrfs_find_space_for_alloc(cache, + 0, 4096, 0, + &max_extent_size); + if (offset != (128 * 1024 * 1024 + 16 * 1024 * 1024)) { + test_msg("Failed to allocate 4Kb from space cache, returned offset is: %llu\n", + offset); + return -EINVAL; + } + + ret = check_cache_empty(cache); + if (ret) + return ret; + + __btrfs_remove_free_space_cache(cache->free_space_ctl); + + /* + * Now test a similar scenario, but where our extent entry is located + * to the right of the bitmap entry, so that we can check that stealing + * space from a bitmap to the front of an extent entry works. + */ + + /* + * Extent entry covering free space range [128Mb + 128Kb, 128Mb + 256Kb[ + */ + ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 + 128 * 1024, + 128 * 1024, 0); + if (ret) { + test_msg("Couldn't add extent entry %d\n", ret); + return ret; + } + + /* Bitmap entry covering free space range [0, 128Mb - 512Kb[ */ + ret = test_add_free_space_entry(cache, 0, + 128 * 1024 * 1024 - 512 * 1024, 1); + if (ret) { + test_msg("Couldn't add bitmap entry %d\n", ret); + return ret; + } + + ret = check_num_extents_and_bitmaps(cache, 2, 1); + if (ret) + return ret; + + /* + * Now make only the last 256Kb of the bitmap marked as free, so that + * we end up with only the following ranges marked as free space: + * + * [128Mb + 128Kb, 128Mb + 256Kb[ + * [128Mb - 768Kb, 128Mb - 512Kb[ + */ + ret = btrfs_remove_free_space(cache, + 0, + 128 * 1024 * 1024 - 768 * 1024); + if (ret) { + test_msg("Failed to free part of bitmap space %d\n", ret); + return ret; + } + + /* Confirm that only those 2 ranges are marked as free. */ + if (!test_check_exists(cache, 128 * 1024 * 1024 + 128 * 1024, + 128 * 1024)) { + test_msg("Free space range missing\n"); + return -ENOENT; + } + if (!test_check_exists(cache, 128 * 1024 * 1024 - 768 * 1024, + 256 * 1024)) { + test_msg("Free space range missing\n"); + return -ENOENT; + } + + /* + * Confirm that the bitmap range [0, 128Mb - 768Kb[ isn't marked + * as free anymore. + */ + if (test_check_exists(cache, 0, + 128 * 1024 * 1024 - 768 * 1024)) { + test_msg("Bitmap region not removed from space cache\n"); + return -EINVAL; + } + + /* + * Confirm that the region [128Mb - 512Kb, 128Mb[, which is + * covered by the bitmap, isn't marked as free. + */ + if (test_check_exists(cache, 128 * 1024 * 1024 - 512 * 1024, + 512 * 1024)) { + test_msg("Invalid bitmap region marked as free\n"); + return -EINVAL; + } + + /* + * Now let's mark the region [128Mb - 512Kb, 128Mb[ as free too. But, + * let's make sure the free space cache marks it as free in the bitmap, + * and doesn't insert a new extent entry to represent this region.
+ */ + ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 - 512 * 1024, + 512 * 1024); + if (ret) { + test_msg("Error adding free space: %d\n", ret); + return ret; + } + /* Confirm the region is marked as free. */ + if (!test_check_exists(cache, 128 * 1024 * 1024 - 512 * 1024, + 512 * 1024)) { + test_msg("Bitmap region not marked as free\n"); + return -ENOENT; + } + + /* + * Confirm that no new extent entries or bitmap entries were added to + * the cache after adding that free space region. + */ + ret = check_num_extents_and_bitmaps(cache, 2, 1); + if (ret) + return ret; + + /* + * Now let's add a small free space region to the left of the previous + * one, which is not contiguous with it and is part of the bitmap too. + * The goal is to test that the bitmap entry space stealing doesn't + * steal this space region. + */ + ret = btrfs_add_free_space(cache, 32 * 1024 * 1024, 8192); + if (ret) { + test_msg("Error adding free space: %d\n", ret); + return ret; + } + + /* + * Now mark the region [128Mb, 128Mb + 128Kb[ as free too. This will + * expand the range covered by the existing extent entry that represents + * the free space [128Mb + 128Kb, 128Mb + 256Kb[. + */ + ret = btrfs_add_free_space(cache, 128 * 1024 * 1024, 128 * 1024); + if (ret) { + test_msg("Error adding free space: %d\n", ret); + return ret; + } + /* Confirm the region is marked as free. */ + if (!test_check_exists(cache, 128 * 1024 * 1024, 128 * 1024)) { + test_msg("Extent region not marked as free\n"); + return -ENOENT; + } + + /* + * Confirm that our extent entry didn't steal all free space from the + * bitmap, because of the small 8Kb free space region. + */ + ret = check_num_extents_and_bitmaps(cache, 2, 1); + if (ret) + return ret; + + /* + * So now we have the range [128Mb - 768Kb, 128Mb + 256Kb[ as free + * space. Without stealing bitmap free space into extent entry space, + * we would have all this free space represented by 2 entries in the + * cache: + * + * extent entry covering range: [128Mb, 128Mb + 256Kb[ + * bitmap entry covering range: [128Mb - 768Kb, 128Mb[ + * + * Attempting to allocate the whole free space (1Mb) would fail, because + * we can't allocate from multiple entries. + * With the bitmap free space stealing, we get a single extent entry + * that represents the 1Mb free space, and therefore we're able to + * allocate the whole free space at once. + */ + if (!test_check_exists(cache, 128 * 1024 * 1024 - 768 * 1024, + 1 * 1024 * 1024)) { + test_msg("Expected region not marked as free\n"); + return -ENOENT; + } + + if (cache->free_space_ctl->free_space != (1 * 1024 * 1024 + 8192)) { + test_msg("Cache free space is not 1Mb + 8Kb\n"); + return -EINVAL; + } + + offset = btrfs_find_space_for_alloc(cache, + 0, 1 * 1024 * 1024, 0, + &max_extent_size); + if (offset != (128 * 1024 * 1024 - 768 * 1024)) { + test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n", + offset); + return -EINVAL; + } + + /* All that remains is an 8Kb free space region in a bitmap. Confirm.
*/ + ret = check_num_extents_and_bitmaps(cache, 1, 1); + if (ret) + return ret; + + if (cache->free_space_ctl->free_space != 8192) { + test_msg("Cache free space is not 8Kb\n"); + return -EINVAL; + } + + offset = btrfs_find_space_for_alloc(cache, + 0, 8192, 0, + &max_extent_size); + if (offset != (32 * 1024 * 1024)) { + test_msg("Failed to allocate 8Kb from space cache, returned offset is: %llu\n", + offset); + return -EINVAL; + } + + ret = check_cache_empty(cache); + if (ret) + return ret; + + cache->free_space_ctl->op->use_bitmap = use_bitmap_op; + __btrfs_remove_free_space_cache(cache->free_space_ctl); + + return 0; +} + int btrfs_test_free_space_cache(void) { struct btrfs_block_group_cache *cache; @@ -386,6 +898,8 @@ int btrfs_test_free_space_cache(void) ret = test_bitmaps_and_extents(cache); if (ret) goto out; + + ret = test_steal_space_from_bitmap_to_extent(cache); out: __btrfs_remove_free_space_cache(cache->free_space_ctl); kfree(cache->free_space_ctl); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 5f379affdf23..dcaae3616728 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -218,7 +218,6 @@ loop: spin_lock_init(&cur_trans->delayed_refs.lock); INIT_LIST_HEAD(&cur_trans->pending_snapshots); - INIT_LIST_HEAD(&cur_trans->ordered_operations); INIT_LIST_HEAD(&cur_trans->pending_chunks); INIT_LIST_HEAD(&cur_trans->switch_commits); list_add_tail(&cur_trans->list, &fs_info->trans_list); @@ -387,7 +386,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, int ret; /* Send isn't supposed to start transactions. */ - ASSERT(current->journal_info != (void *)BTRFS_SEND_TRANS_STUB); + ASSERT(current->journal_info != BTRFS_SEND_TRANS_STUB); if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) return ERR_PTR(-EROFS); @@ -409,7 +408,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, if (num_items > 0 && root != root->fs_info->chunk_root) { if (root->fs_info->quota_enabled && is_fstree(root->root_key.objectid)) { - qgroup_reserved = num_items * root->leafsize; + qgroup_reserved = num_items * root->nodesize; ret = btrfs_qgroup_reserve(root, qgroup_reserved); if (ret) return ERR_PTR(ret); @@ -419,7 +418,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, /* * Do the reservation for the relocation root creation */ - if (unlikely(need_reserve_reloc_root(root))) { + if (need_reserve_reloc_root(root)) { num_bytes += root->nodesize; reloc_reserved = true; } @@ -610,7 +609,6 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) if (transid <= root->fs_info->last_trans_committed) goto out; - ret = -EINVAL; /* find specified transaction */ spin_lock(&root->fs_info->trans_lock); list_for_each_entry(t, &root->fs_info->trans_list, list) { @@ -626,9 +624,16 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) } } spin_unlock(&root->fs_info->trans_lock); - /* The specified transaction doesn't exist */ - if (!cur_trans) + + /* + * The specified transaction doesn't exist, or we + * raced with btrfs_commit_transaction + */ + if (!cur_trans) { + if (transid > root->fs_info->last_trans_committed) + ret = -EINVAL; goto out; + } } else { /* find newest transaction that is committing | committed */ spin_lock(&root->fs_info->trans_lock); @@ -852,6 +857,8 @@ int btrfs_wait_marked_extents(struct btrfs_root *root, struct extent_state *cached_state = NULL; u64 start = 0; u64 end; + struct btrfs_inode *btree_ino = BTRFS_I(root->fs_info->btree_inode); + bool errors = 
false; while (!find_first_extent_bit(dirty_pages, start, &start, &end, EXTENT_NEED_WAIT, &cached_state)) { @@ -865,6 +872,26 @@ int btrfs_wait_marked_extents(struct btrfs_root *root, } if (err) werr = err; + + if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { + if ((mark & EXTENT_DIRTY) && + test_and_clear_bit(BTRFS_INODE_BTREE_LOG1_ERR, + &btree_ino->runtime_flags)) + errors = true; + + if ((mark & EXTENT_NEW) && + test_and_clear_bit(BTRFS_INODE_BTREE_LOG2_ERR, + &btree_ino->runtime_flags)) + errors = true; + } else { + if (test_and_clear_bit(BTRFS_INODE_BTREE_ERR, + &btree_ino->runtime_flags)) + errors = true; + } + + if (errors && !werr) + werr = -EIO; + return werr; } @@ -1612,27 +1639,6 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, kmem_cache_free(btrfs_trans_handle_cachep, trans); } -static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - int ret; - - ret = btrfs_run_delayed_items(trans, root); - if (ret) - return ret; - - /* - * rename don't use btrfs_join_transaction, so, once we - * set the transaction to blocked above, we aren't going - * to get any new ordered operations. We can safely run - * it here and no for sure that nothing new will be added - * to the list - */ - ret = btrfs_run_ordered_operations(trans, root, 1); - - return ret; -} - static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) { if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) @@ -1651,15 +1657,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, { struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_transaction *prev_trans = NULL; + struct btrfs_inode *btree_ino = BTRFS_I(root->fs_info->btree_inode); int ret; - ret = btrfs_run_ordered_operations(trans, root, 0); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - btrfs_end_transaction(trans, root); - return ret; - } - /* Stop the commit early if ->aborted is set */ if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { ret = cur_trans->aborted; @@ -1740,7 +1740,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, if (ret) goto cleanup_transaction; - ret = btrfs_flush_all_pending_stuffs(trans, root); + ret = btrfs_run_delayed_items(trans, root); if (ret) goto cleanup_transaction; @@ -1748,7 +1748,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, extwriter_counter_read(cur_trans) == 0); /* some pending stuffs might be added after the previous flush. */ - ret = btrfs_flush_all_pending_stuffs(trans, root); + ret = btrfs_run_delayed_items(trans, root); if (ret) goto cleanup_transaction; @@ -1897,6 +1897,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy, sizeof(*root->fs_info->super_copy)); + btrfs_update_commit_device_size(root->fs_info); + btrfs_update_commit_device_bytes_used(root, cur_trans); + + clear_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags); + clear_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags); + spin_lock(&root->fs_info->trans_lock); cur_trans->state = TRANS_STATE_UNBLOCKED; root->fs_info->running_transaction = NULL; @@ -2010,9 +2016,6 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) ret = btrfs_drop_snapshot(root, NULL, 0, 0); else ret = btrfs_drop_snapshot(root, NULL, 1, 0); - /* - * If we encounter a transaction abort during snapshot cleaning, we - * don't want to crash here - */ + return (ret < 0) ? 
0 : 1; } diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 7dd558ed0716..d8f40e1a5d2d 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -55,7 +55,6 @@ struct btrfs_transaction { wait_queue_head_t writer_wait; wait_queue_head_t commit_wait; struct list_head pending_snapshots; - struct list_head ordered_operations; struct list_head pending_chunks; struct list_head switch_commits; struct btrfs_delayed_ref_root delayed_refs; @@ -80,7 +79,7 @@ struct btrfs_transaction { #define TRANS_EXTWRITERS (__TRANS_USERSPACE | __TRANS_START | \ __TRANS_ATTACH) -#define BTRFS_SEND_TRANS_STUB 1 +#define BTRFS_SEND_TRANS_STUB ((void *)1) struct btrfs_trans_handle { u64 transid; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9e1f2cd5e67a..1475979e5718 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -94,8 +94,11 @@ #define LOG_WALK_REPLAY_ALL 3 static int btrfs_log_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, - int inode_only); + struct btrfs_root *root, struct inode *inode, + int inode_only, + const loff_t start, + const loff_t end, + struct btrfs_log_ctx *ctx); static int link_to_fixup_dir(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid); @@ -1496,7 +1499,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, return -EIO; key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; - btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); + key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = objectid; ret = btrfs_insert_empty_item(trans, root, path, &key, 0); @@ -1635,6 +1638,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, found_key.type == log_key.type && found_key.offset == log_key.offset && btrfs_dir_type(path->nodes[0], dst_di) == log_type) { + update_size = false; goto out; } @@ -2155,7 +2159,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, bytenr = btrfs_node_blockptr(cur, path->slots[*level]); ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); - blocksize = btrfs_level_size(root, *level - 1); + blocksize = root->nodesize; parent = path->nodes[*level]; root_owner = btrfs_header_owner(parent); @@ -2981,8 +2985,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, min_key.type = key_type; min_key.offset = min_offset; - path->keep_locks = 1; - ret = btrfs_search_forward(root, &min_key, path, trans->transid); /* @@ -3298,7 +3300,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, struct list_head ordered_sums; int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; bool has_extents = false; - bool need_find_last_extent = (*last_extent == 0); + bool need_find_last_extent = true; bool done = false; INIT_LIST_HEAD(&ordered_sums); @@ -3352,8 +3354,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, */ if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) { has_extents = true; - if (need_find_last_extent && - first_key.objectid == (u64)-1) + if (first_key.objectid == (u64)-1) first_key = ins_keys[i]; } else { need_find_last_extent = false; @@ -3363,7 +3364,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, * or deletes of this inode don't have to relog the inode * again */ - if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY && + if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY && !skip_csum) { int found_type; extent = btrfs_item_ptr(src, start_slot + i, @@ -3427,6 +3428,16 @@ static noinline int copy_items(struct 
btrfs_trans_handle *trans, if (!has_extents) return ret; + if (need_find_last_extent && *last_extent == first_key.offset) { + /* + * We don't have any leafs between our current one and the one + * we processed before that can have file extent items for our + * inode (and have a generation number smaller than our current + * transaction id). + */ + need_find_last_extent = false; + } + /* * Because we use btrfs_search_forward we could skip leaves that were * not modified and then assume *last_extent is valid when it really @@ -3537,7 +3548,7 @@ fill_holes: 0, 0); if (ret) break; - *last_extent = offset + len; + *last_extent = extent_end; } /* * Need to let the callers know we dropped the path so they should @@ -3562,107 +3573,33 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) return 0; } -static int log_one_extent(struct btrfs_trans_handle *trans, - struct inode *inode, struct btrfs_root *root, - struct extent_map *em, struct btrfs_path *path, - struct list_head *logged_list) +static int wait_ordered_extents(struct btrfs_trans_handle *trans, + struct inode *inode, + struct btrfs_root *root, + const struct extent_map *em, + const struct list_head *logged_list, + bool *ordered_io_error) { - struct btrfs_root *log = root->log_root; - struct btrfs_file_extent_item *fi; - struct extent_buffer *leaf; struct btrfs_ordered_extent *ordered; - struct list_head ordered_sums; - struct btrfs_map_token token; - struct btrfs_key key; + struct btrfs_root *log = root->log_root; u64 mod_start = em->mod_start; u64 mod_len = em->mod_len; + const bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; u64 csum_offset; u64 csum_len; - u64 extent_offset = em->start - em->orig_start; - u64 block_len; - int ret; - bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; - int extent_inserted = 0; - - INIT_LIST_HEAD(&ordered_sums); - btrfs_init_map_token(&token); - - ret = __btrfs_drop_extents(trans, log, inode, path, em->start, - em->start + em->len, NULL, 0, 1, - sizeof(*fi), &extent_inserted); - if (ret) - return ret; - - if (!extent_inserted) { - key.objectid = btrfs_ino(inode); - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = em->start; - - ret = btrfs_insert_empty_item(trans, log, path, &key, - sizeof(*fi)); - if (ret) - return ret; - } - leaf = path->nodes[0]; - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - - btrfs_set_token_file_extent_generation(leaf, fi, em->generation, - &token); - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { - skip_csum = true; - btrfs_set_token_file_extent_type(leaf, fi, - BTRFS_FILE_EXTENT_PREALLOC, - &token); - } else { - btrfs_set_token_file_extent_type(leaf, fi, - BTRFS_FILE_EXTENT_REG, - &token); - if (em->block_start == EXTENT_MAP_HOLE) - skip_csum = true; - } - - block_len = max(em->block_len, em->orig_block_len); - if (em->compress_type != BTRFS_COMPRESS_NONE) { - btrfs_set_token_file_extent_disk_bytenr(leaf, fi, - em->block_start, - &token); - btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, - &token); - } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { - btrfs_set_token_file_extent_disk_bytenr(leaf, fi, - em->block_start - - extent_offset, &token); - btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, - &token); - } else { - btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token); - btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0, - &token); - } + LIST_HEAD(ordered_sums); + int ret = 0; - btrfs_set_token_file_extent_offset(leaf, fi, - em->start - 
em->orig_start, - &token); - btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token); - btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token); - btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type, - &token); - btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token); - btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token); - btrfs_mark_buffer_dirty(leaf); + *ordered_io_error = false; - btrfs_release_path(path); - if (ret) { - return ret; - } - - if (skip_csum) + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || + em->block_start == EXTENT_MAP_HOLE) return 0; /* - * First check and see if our csums are on our outstanding ordered - * extents. + * Wait for any ordered extent that covers our extent map. If it + * finishes without an error, first check and see if our csums are on + * our outstanding ordered extents. */ list_for_each_entry(ordered, logged_list, log_list) { struct btrfs_ordered_sum *sum; @@ -3674,6 +3611,24 @@ static int log_one_extent(struct btrfs_trans_handle *trans, mod_start + mod_len <= ordered->file_offset) continue; + if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) && + !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) && + !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) { + const u64 start = ordered->file_offset; + const u64 end = ordered->file_offset + ordered->len - 1; + + WARN_ON(ordered->inode != inode); + filemap_fdatawrite_range(inode->i_mapping, start, end); + } + + wait_event(ordered->wait, + (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) || + test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))); + + if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) { + *ordered_io_error = true; + break; + } /* * We are going to copy all the csums on this ordered extent, so * go ahead and adjust mod_start and mod_len in case this @@ -3705,6 +3660,9 @@ static int log_one_extent(struct btrfs_trans_handle *trans, } } + if (skip_csum) + continue; + /* * To keep us from looping for the above case of an ordered * extent that falls inside of the logged extent. 
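The hunk above is the heart of this change: before trusting an extent map's checksums, the logging path now forces writeback for any overlapping ordered extent that has not been submitted yet, sleeps until the ordered extent signals either completion or an I/O error, and only copies csums when no error was recorded. A minimal stand-alone sketch of that flush-then-wait shape, using the same flags and helpers as the patch (the function name is illustrative, not from the patch):

/*
 * Sketch of the flush-then-wait pattern in wait_ordered_extents():
 * kick writeback for a not-yet-submitted ordered extent, sleep until
 * it reports completion or error, and say whether the IO was clean.
 */
static bool flush_and_wait_ordered(struct inode *inode,
				   struct btrfs_ordered_extent *ordered)
{
	const u64 start = ordered->file_offset;
	const u64 end = ordered->file_offset + ordered->len - 1;

	if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) &&
	    !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
		filemap_fdatawrite_range(inode->i_mapping, start, end);

	wait_event(ordered->wait,
		   test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) ||
		   test_bit(BTRFS_ORDERED_IOERR, &ordered->flags));

	/* true when the IO completed without error */
	return !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
}

When the wait ends with BTRFS_ORDERED_IOERR set, log_one_extent() records ctx->io_err = -EIO rather than aborting, so the fsync caller sees the writeback error.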
@@ -3722,18 +3680,16 @@ static int log_one_extent(struct btrfs_trans_handle *trans, list_for_each_entry(sum, &ordered->list, list) { ret = btrfs_csum_file_blocks(trans, log, sum); if (ret) - goto unlocked; + break; } - } -unlocked: - if (!mod_len || ret) + if (*ordered_io_error || !mod_len || ret || skip_csum) return ret; if (em->compress_type) { csum_offset = 0; - csum_len = block_len; + csum_len = max(em->block_len, em->orig_block_len); } else { csum_offset = mod_start - em->start; csum_len = mod_len; @@ -3760,11 +3716,106 @@ unlocked: return ret; } +static int log_one_extent(struct btrfs_trans_handle *trans, + struct inode *inode, struct btrfs_root *root, + const struct extent_map *em, + struct btrfs_path *path, + const struct list_head *logged_list, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_root *log = root->log_root; + struct btrfs_file_extent_item *fi; + struct extent_buffer *leaf; + struct btrfs_map_token token; + struct btrfs_key key; + u64 extent_offset = em->start - em->orig_start; + u64 block_len; + int ret; + int extent_inserted = 0; + bool ordered_io_err = false; + + ret = wait_ordered_extents(trans, inode, root, em, logged_list, + &ordered_io_err); + if (ret) + return ret; + + if (ordered_io_err) { + ctx->io_err = -EIO; + return 0; + } + + btrfs_init_map_token(&token); + + ret = __btrfs_drop_extents(trans, log, inode, path, em->start, + em->start + em->len, NULL, 0, 1, + sizeof(*fi), &extent_inserted); + if (ret) + return ret; + + if (!extent_inserted) { + key.objectid = btrfs_ino(inode); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = em->start; + + ret = btrfs_insert_empty_item(trans, log, path, &key, + sizeof(*fi)); + if (ret) + return ret; + } + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + btrfs_set_token_file_extent_generation(leaf, fi, em->generation, + &token); + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + btrfs_set_token_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_PREALLOC, + &token); + else + btrfs_set_token_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_REG, + &token); + + block_len = max(em->block_len, em->orig_block_len); + if (em->compress_type != BTRFS_COMPRESS_NONE) { + btrfs_set_token_file_extent_disk_bytenr(leaf, fi, + em->block_start, + &token); + btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, + &token); + } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { + btrfs_set_token_file_extent_disk_bytenr(leaf, fi, + em->block_start - + extent_offset, &token); + btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, + &token); + } else { + btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token); + btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0, + &token); + } + + btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token); + btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token); + btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token); + btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type, + &token); + btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token); + btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token); + btrfs_mark_buffer_dirty(leaf); + + btrfs_release_path(path); + + return ret; +} + static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, struct btrfs_path *path, - struct list_head *logged_list) + struct list_head *logged_list, + struct btrfs_log_ctx *ctx) { struct extent_map *em, *n; struct list_head extents; @@ 
-3822,7 +3873,8 @@ process: write_unlock(&tree->lock); - ret = log_one_extent(trans, inode, root, em, path, logged_list); + ret = log_one_extent(trans, inode, root, em, path, logged_list, + ctx); write_lock(&tree->lock); clear_em_logging(tree, em); free_extent_map(em); @@ -3849,8 +3901,11 @@ process: * This handles both files and directories. */ static int btrfs_log_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, - int inode_only) + struct btrfs_root *root, struct inode *inode, + int inode_only, + const loff_t start, + const loff_t end, + struct btrfs_log_ctx *ctx) { struct btrfs_path *path; struct btrfs_path *dst_path; @@ -3867,6 +3922,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, int ins_nr; bool fast_search = false; u64 ino = btrfs_ino(inode); + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; path = btrfs_alloc_path(); if (!path) @@ -3950,7 +4006,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, err = ret; goto out_unlock; } - path->keep_locks = 1; while (1) { ins_nr = 0; @@ -3980,7 +4035,8 @@ again: if (ret < 0) { err = ret; goto out_unlock; - } if (ret) { + } + if (ret) { ins_nr = 0; btrfs_release_path(path); continue; @@ -4034,19 +4090,41 @@ log_extents: btrfs_release_path(dst_path); if (fast_search) { ret = btrfs_log_changed_extents(trans, root, inode, dst_path, - &logged_list); + &logged_list, ctx); if (ret) { err = ret; goto out_unlock; } } else if (inode_only == LOG_INODE_ALL) { - struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; struct extent_map *em, *n; - write_lock(&tree->lock); - list_for_each_entry_safe(em, n, &tree->modified_extents, list) - list_del_init(&em->list); - write_unlock(&tree->lock); + write_lock(&em_tree->lock); + /* + * We can't just remove every em if we're called for a ranged + * fsync - that is, one that doesn't cover the whole possible + * file range (0 to LLONG_MAX). This is because we can have + * em's that fall outside the range we're logging and therefore + * their ordered operations haven't completed yet + * (btrfs_finish_ordered_io() not invoked yet). This means we + * didn't get their respective file extent item in the fs/subvol + * tree yet, and need to let the next fast fsync (one which + * consults the list of modified extent maps) find the em so + * that it logs a matching file extent item and waits for the + * respective ordered operation to complete (if it's still + * running). + * + * Removing every em outside the range we're logging would make + * the next fast fsync not log their matching file extent items, + * therefore making us lose data after a log replay. + */ + list_for_each_entry_safe(em, n, &em_tree->modified_extents, + list) { + const u64 mod_end = em->mod_start + em->mod_len - 1; + + if (em->mod_start >= start && mod_end <= end) + list_del_init(&em->list); + } + write_unlock(&em_tree->lock); } if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { @@ -4056,6 +4134,7 @@ log_extents: goto out_unlock; } } + BTRFS_I(inode)->logged_trans = trans->transid; BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; out_unlock: @@ -4152,7 +4231,10 @@ out: */ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, - struct dentry *parent, int exists_only, + struct dentry *parent, + const loff_t start, + const loff_t end, + int exists_only, struct btrfs_log_ctx *ctx) { int inode_only = exists_only ? 
LOG_INODE_EXISTS : LOG_INODE_ALL; @@ -4198,7 +4280,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (ret) goto end_no_trans; - ret = btrfs_log_inode(trans, root, inode, inode_only); + ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx); if (ret) goto end_trans; @@ -4226,7 +4308,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) { - ret = btrfs_log_inode(trans, root, inode, inode_only); + ret = btrfs_log_inode(trans, root, inode, inode_only, + 0, LLONG_MAX, ctx); if (ret) goto end_trans; } @@ -4260,13 +4343,15 @@ end_no_trans: */ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct dentry *dentry, + const loff_t start, + const loff_t end, struct btrfs_log_ctx *ctx) { struct dentry *parent = dget_parent(dentry); int ret; ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, - 0, ctx); + start, end, 0, ctx); dput(parent); return ret; @@ -4316,7 +4401,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) again: key.objectid = BTRFS_TREE_LOG_OBJECTID; key.offset = (u64)-1; - btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + key.type = BTRFS_ROOT_ITEM_KEY; while (1) { ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); @@ -4503,6 +4588,7 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans, root->fs_info->last_trans_committed)) return 0; - return btrfs_log_inode_parent(trans, root, inode, parent, 1, NULL); + return btrfs_log_inode_parent(trans, root, inode, parent, 0, + LLONG_MAX, 1, NULL); } diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 7f5b41bd5373..154990c26dcb 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -28,6 +28,7 @@ struct btrfs_log_ctx { int log_ret; int log_transid; + int io_err; struct list_head list; }; @@ -35,6 +36,7 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx) { ctx->log_ret = 0; ctx->log_transid = 0; + ctx->io_err = 0; INIT_LIST_HEAD(&ctx->list); } @@ -59,6 +61,8 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, int btrfs_recover_log_trees(struct btrfs_root *tree_root); int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct dentry *dentry, + const loff_t start, + const loff_t end, struct btrfs_log_ctx *ctx); int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h index 7f78cbf5cf41..4c29db604bbe 100644 --- a/fs/btrfs/ulist.h +++ b/fs/btrfs/ulist.h @@ -57,6 +57,21 @@ void ulist_free(struct ulist *ulist); int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask); int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, u64 *old_aux, gfp_t gfp_mask); + +/* just like ulist_add_merge() but take a pointer for the aux data */ +static inline int ulist_add_merge_ptr(struct ulist *ulist, u64 val, void *aux, + void **old_aux, gfp_t gfp_mask) +{ +#if BITS_PER_LONG == 32 + u64 old64 = (uintptr_t)*old_aux; + int ret = ulist_add_merge(ulist, val, (uintptr_t)aux, &old64, gfp_mask); + *old_aux = (void *)((uintptr_t)old64); + return ret; +#else + return ulist_add_merge(ulist, val, (u64)aux, (u64 *)old_aux, gfp_mask); +#endif +} + struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter); diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index f6a4c03ee7d8..778282944530 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -279,7 +279,6 @@ int 
btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info, key.offset = 0; again_search_slot: - path->keep_locks = 1; ret = btrfs_search_forward(root, &key, path, 0); if (ret) { if (ret > 0) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 6cb82f62cb7c..d47289c715c8 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -50,7 +50,7 @@ static void __btrfs_reset_dev_stats(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); -static DEFINE_MUTEX(uuid_mutex); +DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); static void lock_chunks(struct btrfs_root *root) @@ -74,6 +74,7 @@ static struct btrfs_fs_devices *__alloc_fs_devices(void) mutex_init(&fs_devs->device_list_mutex); INIT_LIST_HEAD(&fs_devs->devices); + INIT_LIST_HEAD(&fs_devs->resized_devices); INIT_LIST_HEAD(&fs_devs->alloc_list); INIT_LIST_HEAD(&fs_devs->list); @@ -154,11 +155,13 @@ static struct btrfs_device *__alloc_device(void) INIT_LIST_HEAD(&dev->dev_list); INIT_LIST_HEAD(&dev->dev_alloc_list); + INIT_LIST_HEAD(&dev->resized_list); spin_lock_init(&dev->io_lock); spin_lock_init(&dev->reada_lock); atomic_set(&dev->reada_in_flight, 0); + atomic_set(&dev->dev_stats_ccnt, 0); INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT); INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT); @@ -474,14 +477,13 @@ static noinline int device_list_add(const char *path, return PTR_ERR(fs_devices); list_add(&fs_devices->list, &fs_uuids); - fs_devices->latest_devid = devid; - fs_devices->latest_trans = found_transid; device = NULL; } else { device = __find_device(&fs_devices->devices, devid, disk_super->dev_item.uuid); } + if (!device) { if (fs_devices->opened) return -EBUSY; @@ -508,6 +510,43 @@ static noinline int device_list_add(const char *path, ret = 1; device->fs_devices = fs_devices; } else if (!device->name || strcmp(device->name->str, path)) { + /* + * When FS is already mounted. + * 1. If you are here and if the device->name is NULL that + * means this device was missing at the time of FS mount. + * 2. If you are here and if the device->name is different + * from 'path' that means either + * a. The same device disappeared and reappeared with + * a different name, or + * b. The missing-disk-which-was-replaced, has + * reappeared now. + * + * We must allow 1 and 2a above. But 2b would be spurious + * and unintentional. + * + * Further in case of 1 and 2a above, the disk at 'path' + * would have missed some transaction when it was away and + * in case of 2a the stale bdev has to be updated as well. + * 2b must not be allowed at any time. + */ + + /* + * For now, we do allow update to btrfs_fs_device through the + * btrfs dev scan cli after FS has been mounted. We're still + * tracking a problem where systems fail mount by subvolume id + * when we reject replacement on a mounted FS. + */ + if (!fs_devices->opened && found_transid < device->generation) { + /* + * That is if the FS is _not_ mounted and if you + * are here, that means there is more than one + * disk with same uuid and devid. We keep the one + * with the larger generation number or the last-in if + * generations are equal. 
+ */ + return -EEXIST; + } + name = rcu_string_strdup(path, GFP_NOFS); if (!name) return -ENOMEM; @@ -519,10 +558,15 @@ static noinline int device_list_add(const char *path, } } - if (found_transid > fs_devices->latest_trans) { - fs_devices->latest_devid = devid; - fs_devices->latest_trans = found_transid; - } + /* + * Unmount does not free the btrfs_device struct but would zero + * generation along with most of the other members. So just update + * it back. We need it to pick the disk with largest generation + * (as above). + */ + if (!fs_devices->opened) + device->generation = found_transid; + *fs_devices_ret = fs_devices; return ret; @@ -538,8 +582,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) if (IS_ERR(fs_devices)) return fs_devices; - fs_devices->latest_devid = orig->latest_devid; - fs_devices->latest_trans = orig->latest_trans; + mutex_lock(&orig->device_list_mutex); fs_devices->total_devices = orig->total_devices; /* We have held the volume lock, it is safe to get the devices. */ @@ -568,8 +611,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) device->fs_devices = fs_devices; fs_devices->num_devices++; } + mutex_unlock(&orig->device_list_mutex); return fs_devices; error: + mutex_unlock(&orig->device_list_mutex); free_fs_devices(fs_devices); return ERR_PTR(-ENOMEM); } @@ -578,10 +623,7 @@ void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info, struct btrfs_fs_devices *fs_devices, int step) { struct btrfs_device *device, *next; - - struct block_device *latest_bdev = NULL; - u64 latest_devid = 0; - u64 latest_transid = 0; + struct btrfs_device *latest_dev = NULL; mutex_lock(&uuid_mutex); again: @@ -589,11 +631,9 @@ again: list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { if (device->in_fs_metadata) { if (!device->is_tgtdev_for_dev_replace && - (!latest_transid || - device->generation > latest_transid)) { - latest_devid = device->devid; - latest_transid = device->generation; - latest_bdev = device->bdev; + (!latest_dev || + device->generation > latest_dev->generation)) { + latest_dev = device; } continue; } @@ -635,9 +675,7 @@ again: goto again; } - fs_devices->latest_bdev = latest_bdev; - fs_devices->latest_devid = latest_devid; - fs_devices->latest_trans = latest_transid; + fs_devices->latest_bdev = latest_dev->bdev; mutex_unlock(&uuid_mutex); } @@ -686,8 +724,6 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) fs_devices->rw_devices--; } - if (device->can_discard) - fs_devices->num_can_discard--; if (device->missing) fs_devices->missing_devices--; @@ -752,11 +788,9 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, struct block_device *bdev; struct list_head *head = &fs_devices->devices; struct btrfs_device *device; - struct block_device *latest_bdev = NULL; + struct btrfs_device *latest_dev = NULL; struct buffer_head *bh; struct btrfs_super_block *disk_super; - u64 latest_devid = 0; - u64 latest_transid = 0; u64 devid; int seeding = 1; int ret = 0; @@ -784,11 +818,9 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, goto error_brelse; device->generation = btrfs_super_generation(disk_super); - if (!latest_transid || device->generation > latest_transid) { - latest_devid = devid; - latest_transid = device->generation; - latest_bdev = bdev; - } + if (!latest_dev || + device->generation > latest_dev->generation) + latest_dev = device; if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { device->writeable = 0; 
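Both device_list_add() and the open/close paths above reduce the latest-device bookkeeping to a single rule: among a filesystem's devices, the one with the highest superblock generation wins. A minimal sketch of that selection loop, assuming only the dev_list linkage and generation field visible in the patch (the helper name is made up for illustration):

/*
 * Sketch: pick the device whose superblock carries the highest
 * generation; on a tie the first one seen is kept, matching the
 * strict '>' comparison used in the hunks above.
 */
static struct btrfs_device *pick_latest_dev(struct list_head *devices)
{
	struct btrfs_device *dev, *latest = NULL;

	list_for_each_entry(dev, devices, dev_list) {
		if (!latest || dev->generation > latest->generation)
			latest = dev;
	}
	return latest;
}

fs_devices->latest_bdev is then simply latest_dev->bdev, which is why the separate latest_devid/latest_trans fields can be dropped.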
@@ -798,10 +830,8 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, } q = bdev_get_queue(bdev); - if (blk_queue_discard(q)) { + if (blk_queue_discard(q)) device->can_discard = 1; - fs_devices->num_can_discard++; - } device->bdev = bdev; device->in_fs_metadata = 0; @@ -831,9 +861,7 @@ error_brelse: } fs_devices->seeding = seeding; fs_devices->opened = 1; - fs_devices->latest_bdev = latest_bdev; - fs_devices->latest_devid = latest_devid; - fs_devices->latest_trans = latest_transid; + fs_devices->latest_bdev = latest_dev->bdev; fs_devices->total_rw_bytes = 0; out: return ret; @@ -1007,7 +1035,7 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, if (key.objectid > device->devid) break; - if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) + if (key.type != BTRFS_DEV_EXTENT_KEY) goto next; dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); @@ -1159,7 +1187,7 @@ again: if (key.objectid > device->devid) break; - if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) + if (key.type != BTRFS_DEV_EXTENT_KEY) goto next; if (key.offset > search_start) { @@ -1238,7 +1266,7 @@ out: static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, - u64 start) + u64 start, u64 *dev_extent_len) { int ret; struct btrfs_path *path; @@ -1280,13 +1308,8 @@ again: goto out; } - if (device->bytes_used > 0) { - u64 len = btrfs_dev_extent_length(leaf, extent); - device->bytes_used -= len; - spin_lock(&root->fs_info->free_chunk_lock); - root->fs_info->free_chunk_space += len; - spin_unlock(&root->fs_info->free_chunk_lock); - } + *dev_extent_len = btrfs_dev_extent_length(leaf, extent); + ret = btrfs_del_item(trans, root, path); if (ret) { btrfs_error(root->fs_info, ret, @@ -1436,8 +1459,10 @@ static int btrfs_add_device(struct btrfs_trans_handle *trans, btrfs_set_device_io_align(leaf, dev_item, device->io_align); btrfs_set_device_io_width(leaf, dev_item, device->io_width); btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); - btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); - btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); + btrfs_set_device_total_bytes(leaf, dev_item, + btrfs_device_get_disk_total_bytes(device)); + btrfs_set_device_bytes_used(leaf, dev_item, + btrfs_device_get_bytes_used(device)); btrfs_set_device_group(leaf, dev_item, 0); btrfs_set_device_seek_speed(leaf, dev_item, 0); btrfs_set_device_bandwidth(leaf, dev_item, 0); @@ -1493,7 +1518,6 @@ static int btrfs_rm_dev_item(struct btrfs_root *root, key.objectid = BTRFS_DEV_ITEMS_OBJECTID; key.type = BTRFS_DEV_ITEM_KEY; key.offset = device->devid; - lock_chunks(root); ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) @@ -1509,7 +1533,6 @@ static int btrfs_rm_dev_item(struct btrfs_root *root, goto out; out: btrfs_free_path(path); - unlock_chunks(root); btrfs_commit_transaction(trans, root); return ret; } @@ -1625,8 +1648,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) if (device->writeable) { lock_chunks(root); list_del_init(&device->dev_alloc_list); + device->fs_devices->rw_devices--; unlock_chunks(root); - root->fs_info->fs_devices->rw_devices--; clear_super = true; } @@ -1645,11 +1668,6 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) if (ret) goto error_undo; - spin_lock(&root->fs_info->free_chunk_lock); - root->fs_info->free_chunk_space = device->total_bytes - - device->bytes_used; - spin_unlock(&root->fs_info->free_chunk_lock); - device->in_fs_metadata = 0; 
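The free_chunk_space adjustment deleted just above does not disappear: btrfs_free_dev_extent() now reports the extent's length through the new dev_extent_len out parameter, and the accounting moves to the caller (btrfs_remove_chunk(), shown further down), where it can happen under lock_chunks() together with the rest of the chunk teardown. A paraphrase of that caller-side shape, with an illustrative wrapper name that is not in the patch:

/*
 * Paraphrase of the per-stripe accounting in btrfs_remove_chunk():
 * delete the dev extent item, then credit its length back to the
 * device and to free_chunk_space under the chunk mutex.
 */
static int free_one_dev_extent(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_device *device, u64 physical)
{
	u64 dev_extent_len = 0;
	int ret;

	ret = btrfs_free_dev_extent(trans, device, physical, &dev_extent_len);
	if (ret)
		return ret;

	if (device->bytes_used > 0) {
		lock_chunks(root);
		btrfs_device_set_bytes_used(device,
					device->bytes_used - dev_extent_len);
		spin_lock(&root->fs_info->free_chunk_lock);
		root->fs_info->free_chunk_space += dev_extent_len;
		spin_unlock(&root->fs_info->free_chunk_lock);
		unlock_chunks(root);
	}
	return 0;
}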
btrfs_scrub_cancel_dev(root->fs_info, device); @@ -1671,7 +1689,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) device->fs_devices->total_devices--; if (device->missing) - root->fs_info->fs_devices->missing_devices--; + device->fs_devices->missing_devices--; next_device = list_entry(root->fs_info->fs_devices->devices.next, struct btrfs_device, dev_list); @@ -1703,9 +1721,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) fs_devices = fs_devices->seed; } cur_devices->seed = NULL; - lock_chunks(root); __btrfs_close_devices(cur_devices); - unlock_chunks(root); free_fs_devices(cur_devices); } @@ -1778,8 +1794,8 @@ error_undo: lock_chunks(root); list_add(&device->dev_alloc_list, &root->fs_info->fs_devices->alloc_list); + device->fs_devices->rw_devices++; unlock_chunks(root); - root->fs_info->fs_devices->rw_devices++; } goto error_brelse; } @@ -1787,25 +1803,57 @@ error_undo: void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, struct btrfs_device *srcdev) { + struct btrfs_fs_devices *fs_devices; + WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex)); + /* + * in case of a fs with no seed, srcdev->fs_devices will point + * to fs_devices of fs_info. However when the dev being replaced is + * a seed dev it will point to the seed's local fs_devices. In short + * srcdev will have its correct fs_devices in both cases. + */ + fs_devices = srcdev->fs_devices; + list_del_rcu(&srcdev->dev_list); list_del_rcu(&srcdev->dev_alloc_list); - fs_info->fs_devices->num_devices--; - if (srcdev->missing) { - fs_info->fs_devices->missing_devices--; - fs_info->fs_devices->rw_devices++; - } - if (srcdev->can_discard) - fs_info->fs_devices->num_can_discard--; - if (srcdev->bdev) { - fs_info->fs_devices->open_devices--; + fs_devices->num_devices--; + if (srcdev->missing) + fs_devices->missing_devices--; - /* zero out the old super */ + if (srcdev->writeable) { + fs_devices->rw_devices--; + /* zero out the old super if it is writable */ btrfs_scratch_superblock(srcdev); } + if (srcdev->bdev) + fs_devices->open_devices--; + call_rcu(&srcdev->rcu, free_device); + + /* + * unless fs_devices is a seed fs, num_devices shouldn't go + * to zero + */ + BUG_ON(!fs_devices->num_devices && !fs_devices->seeding); + + /* if there are no devs left we'd rather delete the fs_devices */ + if (!fs_devices->num_devices) { + struct btrfs_fs_devices *tmp_fs_devices; + + tmp_fs_devices = fs_info->fs_devices; + while (tmp_fs_devices) { + if (tmp_fs_devices->seed == fs_devices) { + tmp_fs_devices->seed = fs_devices->seed; + break; + } + tmp_fs_devices = tmp_fs_devices->seed; + } + fs_devices->seed = NULL; + __btrfs_close_devices(fs_devices); + free_fs_devices(fs_devices); + } } void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, @@ -1813,6 +1861,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, { struct btrfs_device *next_device; + mutex_lock(&uuid_mutex); WARN_ON(!tgtdev); mutex_lock(&fs_info->fs_devices->device_list_mutex); if (tgtdev->bdev) { @@ -1820,8 +1869,6 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, fs_info->fs_devices->open_devices--; } fs_info->fs_devices->num_devices--; - if (tgtdev->can_discard) - fs_info->fs_devices->num_can_discard++; next_device = list_entry(fs_info->fs_devices->devices.next, struct btrfs_device, dev_list); @@ -1834,6 +1881,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, call_rcu(&tgtdev->rcu, free_device); mutex_unlock(&fs_info->fs_devices->device_list_mutex); + 
mutex_unlock(&uuid_mutex); } static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, @@ -1932,15 +1980,18 @@ static int btrfs_prepare_sprout(struct btrfs_root *root) mutex_lock(&root->fs_info->fs_devices->device_list_mutex); list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, synchronize_rcu); + list_for_each_entry(device, &seed_devices->devices, dev_list) + device->fs_devices = seed_devices; + lock_chunks(root); list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); - list_for_each_entry(device, &seed_devices->devices, dev_list) { - device->fs_devices = seed_devices; - } + unlock_chunks(root); fs_devices->seeding = 0; fs_devices->num_devices = 0; fs_devices->open_devices = 0; + fs_devices->missing_devices = 0; + fs_devices->rotating = 0; fs_devices->seed = seed_devices; generate_random_uuid(fs_devices->fsid); @@ -2039,7 +2090,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) struct list_head *devices; struct super_block *sb = root->fs_info->sb; struct rcu_string *name; - u64 total_bytes; + u64 tmp; int seeding_dev = 0; int ret = 0; @@ -2095,8 +2146,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) goto error; } - lock_chunks(root); - q = bdev_get_queue(bdev); if (blk_queue_discard(q)) device->can_discard = 1; @@ -2107,6 +2156,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) device->sector_size = root->sectorsize; device->total_bytes = i_size_read(bdev->bd_inode); device->disk_total_bytes = device->total_bytes; + device->commit_total_bytes = device->total_bytes; device->dev_root = root->fs_info->dev_root; device->bdev = bdev; device->in_fs_metadata = 1; @@ -2124,6 +2174,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) device->fs_devices = root->fs_info->fs_devices; mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + lock_chunks(root); list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices); list_add(&device->dev_alloc_list, &root->fs_info->fs_devices->alloc_list); @@ -2131,8 +2182,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) root->fs_info->fs_devices->open_devices++; root->fs_info->fs_devices->rw_devices++; root->fs_info->fs_devices->total_devices++; - if (device->can_discard) - root->fs_info->fs_devices->num_can_discard++; root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; spin_lock(&root->fs_info->free_chunk_lock); @@ -2142,26 +2191,45 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) if (!blk_queue_nonrot(bdev_get_queue(bdev))) root->fs_info->fs_devices->rotating = 1; - total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy); + tmp = btrfs_super_total_bytes(root->fs_info->super_copy); btrfs_set_super_total_bytes(root->fs_info->super_copy, - total_bytes + device->total_bytes); + tmp + device->total_bytes); - total_bytes = btrfs_super_num_devices(root->fs_info->super_copy); + tmp = btrfs_super_num_devices(root->fs_info->super_copy); btrfs_set_super_num_devices(root->fs_info->super_copy, - total_bytes + 1); + tmp + 1); /* add sysfs device entry */ btrfs_kobj_add_device(root->fs_info, device); + /* + * we've got more storage, clear any full flags on the space + * infos + */ + btrfs_clear_space_info_full(root->fs_info); + + unlock_chunks(root); mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); if (seeding_dev) { - char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; + lock_chunks(root); ret = init_first_rw_device(trans, root, device); + 
unlock_chunks(root); if (ret) { btrfs_abort_transaction(trans, root, ret); goto error_trans; } + } + + ret = btrfs_add_device(trans, root, device); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto error_trans; + } + + if (seeding_dev) { + char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; + ret = btrfs_finish_sprout(trans, root); if (ret) { btrfs_abort_transaction(trans, root, ret); @@ -2175,21 +2243,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) root->fs_info->fsid); if (kobject_rename(&root->fs_info->super_kobj, fsid_buf)) goto error_trans; - } else { - ret = btrfs_add_device(trans, root, device); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto error_trans; - } } - /* - * we've got more storage, clear any full flags on the space - * infos - */ - btrfs_clear_space_info_full(root->fs_info); - - unlock_chunks(root); root->fs_info->num_tolerated_disk_barrier_failures = btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info); ret = btrfs_commit_transaction(trans, root); @@ -2221,7 +2276,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) return ret; error_trans: - unlock_chunks(root); btrfs_end_transaction(trans, root); rcu_string_free(device->name); btrfs_kobj_rm_device(root->fs_info, device); @@ -2236,6 +2290,7 @@ error: } int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, + struct btrfs_device *srcdev, struct btrfs_device **device_out) { struct request_queue *q; @@ -2248,24 +2303,38 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, int ret = 0; *device_out = NULL; - if (fs_info->fs_devices->seeding) + if (fs_info->fs_devices->seeding) { + btrfs_err(fs_info, "the filesystem is a seed filesystem!"); return -EINVAL; + } bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, fs_info->bdev_holder); - if (IS_ERR(bdev)) + if (IS_ERR(bdev)) { + btrfs_err(fs_info, "target device %s is invalid!", device_path); return PTR_ERR(bdev); + } filemap_write_and_wait(bdev->bd_inode->i_mapping); devices = &fs_info->fs_devices->devices; list_for_each_entry(device, devices, dev_list) { if (device->bdev == bdev) { + btrfs_err(fs_info, "target device is in the filesystem!"); ret = -EEXIST; goto error; } } + + if (i_size_read(bdev->bd_inode) < + btrfs_device_get_total_bytes(srcdev)) { + btrfs_err(fs_info, "target device is smaller than source device!"); + ret = -EINVAL; + goto error; + } + + device = btrfs_alloc_device(NULL, &devid, NULL); if (IS_ERR(device)) { ret = PTR_ERR(device); @@ -2289,8 +2358,12 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, device->io_width = root->sectorsize; device->io_align = root->sectorsize; device->sector_size = root->sectorsize; - device->total_bytes = i_size_read(bdev->bd_inode); - device->disk_total_bytes = device->total_bytes; + device->total_bytes = btrfs_device_get_total_bytes(srcdev); + device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev); + device->bytes_used = btrfs_device_get_bytes_used(srcdev); + ASSERT(list_empty(&srcdev->resized_list)); + device->commit_total_bytes = srcdev->commit_total_bytes; + device->commit_bytes_used = device->bytes_used; device->dev_root = fs_info->dev_root; device->bdev = bdev; device->in_fs_metadata = 1; @@ -2302,8 +2375,6 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, list_add(&device->dev_list, &fs_info->fs_devices->devices); fs_info->fs_devices->num_devices++; fs_info->fs_devices->open_devices++; - if 
(device->can_discard) - fs_info->fs_devices->num_can_discard++; mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); *device_out = device; @@ -2362,8 +2433,10 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, btrfs_set_device_io_align(leaf, dev_item, device->io_align); btrfs_set_device_io_width(leaf, dev_item, device->io_width); btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); - btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes); - btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); + btrfs_set_device_total_bytes(leaf, dev_item, + btrfs_device_get_disk_total_bytes(device)); + btrfs_set_device_bytes_used(leaf, dev_item, + btrfs_device_get_bytes_used(device)); btrfs_mark_buffer_dirty(leaf); out: @@ -2371,40 +2444,44 @@ out: return ret; } -static int __btrfs_grow_device(struct btrfs_trans_handle *trans, +int btrfs_grow_device(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 new_size) { struct btrfs_super_block *super_copy = device->dev_root->fs_info->super_copy; - u64 old_total = btrfs_super_total_bytes(super_copy); - u64 diff = new_size - device->total_bytes; + struct btrfs_fs_devices *fs_devices; + u64 old_total; + u64 diff; if (!device->writeable) return -EACCES; + + lock_chunks(device->dev_root); + old_total = btrfs_super_total_bytes(super_copy); + diff = new_size - device->total_bytes; + if (new_size <= device->total_bytes || - device->is_tgtdev_for_dev_replace) + device->is_tgtdev_for_dev_replace) { + unlock_chunks(device->dev_root); return -EINVAL; + } + + fs_devices = device->dev_root->fs_info->fs_devices; btrfs_set_super_total_bytes(super_copy, old_total + diff); device->fs_devices->total_rw_bytes += diff; - device->total_bytes = new_size; - device->disk_total_bytes = new_size; + btrfs_device_set_total_bytes(device, new_size); + btrfs_device_set_disk_total_bytes(device, new_size); btrfs_clear_space_info_full(device->dev_root->fs_info); + if (list_empty(&device->resized_list)) + list_add_tail(&device->resized_list, + &fs_devices->resized_devices); + unlock_chunks(device->dev_root); return btrfs_update_device(trans, device); } -int btrfs_grow_device(struct btrfs_trans_handle *trans, - struct btrfs_device *device, u64 new_size) -{ - int ret; - lock_chunks(device->dev_root); - ret = __btrfs_grow_device(trans, device, new_size); - unlock_chunks(device->dev_root); - return ret; -} - static int btrfs_free_chunk(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 chunk_tree, u64 chunk_objectid, @@ -2456,6 +2533,7 @@ static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 u32 cur; struct btrfs_key key; + lock_chunks(root); array_size = btrfs_super_sys_array_size(super_copy); ptr = super_copy->sys_chunk_array; @@ -2485,79 +2563,95 @@ static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 cur += len; } } + unlock_chunks(root); return ret; } -static int btrfs_relocate_chunk(struct btrfs_root *root, - u64 chunk_tree, u64 chunk_objectid, - u64 chunk_offset) +int btrfs_remove_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 chunk_offset) { struct extent_map_tree *em_tree; - struct btrfs_root *extent_root; - struct btrfs_trans_handle *trans; struct extent_map *em; + struct btrfs_root *extent_root = root->fs_info->extent_root; struct map_lookup *map; - int ret; - int i; + u64 dev_extent_len = 0; + u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + u64 chunk_tree = root->fs_info->chunk_root->objectid; + int i, ret 
= 0; + /* Just in case */ root = root->fs_info->chunk_root; - extent_root = root->fs_info->extent_root; em_tree = &root->fs_info->mapping_tree.map_tree; - ret = btrfs_can_relocate(extent_root, chunk_offset); - if (ret) - return -ENOSPC; - - /* step one, relocate all the extents inside this chunk */ - ret = btrfs_relocate_block_group(extent_root, chunk_offset); - if (ret) - return ret; - - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - btrfs_std_error(root->fs_info, ret); - return ret; - } - - lock_chunks(root); - - /* - * step two, delete the device extents and the - * chunk tree entries - */ read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, chunk_offset, 1); read_unlock(&em_tree->lock); - BUG_ON(!em || em->start > chunk_offset || - em->start + em->len < chunk_offset); + if (!em || em->start > chunk_offset || + em->start + em->len < chunk_offset) { + /* + * This is a logic error, but we don't want to just rely on the + * user having built with ASSERT enabled, so if ASSERT doesn't + * do anything we still error out. + */ + ASSERT(0); + if (em) + free_extent_map(em); + return -EINVAL; + } map = (struct map_lookup *)em->bdev; for (i = 0; i < map->num_stripes; i++) { - ret = btrfs_free_dev_extent(trans, map->stripes[i].dev, - map->stripes[i].physical); - BUG_ON(ret); + struct btrfs_device *device = map->stripes[i].dev; + ret = btrfs_free_dev_extent(trans, device, + map->stripes[i].physical, + &dev_extent_len); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } + + if (device->bytes_used > 0) { + lock_chunks(root); + btrfs_device_set_bytes_used(device, + device->bytes_used - dev_extent_len); + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space += dev_extent_len; + spin_unlock(&root->fs_info->free_chunk_lock); + btrfs_clear_space_info_full(root->fs_info); + unlock_chunks(root); + } if (map->stripes[i].dev) { ret = btrfs_update_device(trans, map->stripes[i].dev); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } } } ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid, chunk_offset); - - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } trace_btrfs_chunk_free(root, map, chunk_offset, em->len); if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } } ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } write_lock(&em_tree->lock); remove_extent_mapping(em_tree, em); @@ -2565,12 +2659,46 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, /* once for the tree */ free_extent_map(em); +out: /* once for us */ free_extent_map(em); + return ret; +} - unlock_chunks(root); +static int btrfs_relocate_chunk(struct btrfs_root *root, + u64 chunk_tree, u64 chunk_objectid, + u64 chunk_offset) +{ + struct btrfs_root *extent_root; + struct btrfs_trans_handle *trans; + int ret; + + root = root->fs_info->chunk_root; + extent_root = root->fs_info->extent_root; + + ret = btrfs_can_relocate(extent_root, chunk_offset); + if (ret) + return -ENOSPC; + + /* step one, relocate all the extents inside this chunk */ + ret = btrfs_relocate_block_group(extent_root, chunk_offset); + if (ret) + return ret; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = 
PTR_ERR(trans); + btrfs_std_error(root->fs_info, ret); + return ret; + } + + /* + * step two, delete the device extents and the + * chunk tree entries + */ + ret = btrfs_remove_chunk(trans, root, chunk_offset); btrfs_end_transaction(trans, root); - return 0; + return ret; } static int btrfs_relocate_sys_chunks(struct btrfs_root *root) @@ -2623,8 +2751,8 @@ again: found_key.offset); if (ret == -ENOSPC) failed++; - else if (ret) - BUG(); + else + BUG_ON(ret); } if (found_key.offset == 0) @@ -3031,11 +3159,12 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) /* step one make some room on all the devices */ devices = &fs_info->fs_devices->devices; list_for_each_entry(device, devices, dev_list) { - old_size = device->total_bytes; + old_size = btrfs_device_get_total_bytes(device); size_to_free = div_factor(old_size, 1); size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); if (!device->writeable || - device->total_bytes - device->bytes_used > size_to_free || + btrfs_device_get_total_bytes(device) - + btrfs_device_get_bytes_used(device) > size_to_free || device->is_tgtdev_for_dev_replace) continue; @@ -3590,8 +3719,6 @@ static int btrfs_uuid_scan_kthread(void *data) max_key.type = BTRFS_ROOT_ITEM_KEY; max_key.offset = (u64)-1; - path->keep_locks = 1; - while (1) { ret = btrfs_search_forward(root, &key, path, 0); if (ret) { @@ -3843,8 +3970,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) struct btrfs_key key; struct btrfs_super_block *super_copy = root->fs_info->super_copy; u64 old_total = btrfs_super_total_bytes(super_copy); - u64 old_size = device->total_bytes; - u64 diff = device->total_bytes - new_size; + u64 old_size = btrfs_device_get_total_bytes(device); + u64 diff = old_size - new_size; if (device->is_tgtdev_for_dev_replace) return -EINVAL; @@ -3857,7 +3984,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) lock_chunks(root); - device->total_bytes = new_size; + btrfs_device_set_total_bytes(device, new_size); if (device->writeable) { device->fs_devices->total_rw_bytes -= diff; spin_lock(&root->fs_info->free_chunk_lock); @@ -3923,7 +4050,7 @@ again: ret = -ENOSPC; lock_chunks(root); - device->total_bytes = old_size; + btrfs_device_set_total_bytes(device, old_size); if (device->writeable) device->fs_devices->total_rw_bytes += diff; spin_lock(&root->fs_info->free_chunk_lock); @@ -3941,18 +4068,17 @@ again: } lock_chunks(root); + btrfs_device_set_disk_total_bytes(device, new_size); + if (list_empty(&device->resized_list)) + list_add_tail(&device->resized_list, + &root->fs_info->fs_devices->resized_devices); - device->disk_total_bytes = new_size; - /* Now btrfs_update_device() will change the on-disk size. */ - ret = btrfs_update_device(trans, device); - if (ret) { - unlock_chunks(root); - btrfs_end_transaction(trans, root); - goto done; - } WARN_ON(diff > old_total); btrfs_set_super_total_bytes(super_copy, old_total - diff); unlock_chunks(root); + + /* Now btrfs_update_device() will change the on-disk size. 
*/ + ret = btrfs_update_device(trans, device); btrfs_end_transaction(trans, root); done: btrfs_free_path(path); @@ -3968,10 +4094,13 @@ static int btrfs_add_system_chunk(struct btrfs_root *root, u32 array_size; u8 *ptr; + lock_chunks(root); array_size = btrfs_super_sys_array_size(super_copy); if (array_size + item_size + sizeof(disk_key) - > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) + > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { + unlock_chunks(root); return -EFBIG; + } ptr = super_copy->sys_chunk_array + array_size; btrfs_cpu_key_to_disk(&disk_key, key); @@ -3980,6 +4109,8 @@ static int btrfs_add_system_chunk(struct btrfs_root *root, memcpy(ptr, chunk, item_size); item_size += sizeof(disk_key); btrfs_set_super_sys_array_size(super_copy, array_size + item_size); + unlock_chunks(root); + return 0; } @@ -4349,6 +4480,16 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (ret) goto error_del_extent; + for (i = 0; i < map->num_stripes; i++) { + num_bytes = map->stripes[i].dev->bytes_used + stripe_size; + btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes); + } + + spin_lock(&extent_root->fs_info->free_chunk_lock); + extent_root->fs_info->free_chunk_space -= (stripe_size * + map->num_stripes); + spin_unlock(&extent_root->fs_info->free_chunk_lock); + free_extent_map(em); check_raid56_incompat_flag(extent_root->fs_info, type); @@ -4420,7 +4561,6 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, device = map->stripes[i].dev; dev_offset = map->stripes[i].physical; - device->bytes_used += stripe_size; ret = btrfs_update_device(trans, device); if (ret) goto out; @@ -4433,11 +4573,6 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, goto out; } - spin_lock(&extent_root->fs_info->free_chunk_lock); - extent_root->fs_info->free_chunk_space -= (stripe_size * - map->num_stripes); - spin_unlock(&extent_root->fs_info->free_chunk_lock); - stripe = &chunk->stripe; for (i = 0; i < map->num_stripes; i++) { device = map->stripes[i].dev; @@ -4517,16 +4652,25 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0); ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset, alloc_profile); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto out; + return ret; +} + +static inline int btrfs_chunk_max_errors(struct map_lookup *map) +{ + int max_errors; + + if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_DUP)) { + max_errors = 1; + } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) { + max_errors = 2; + } else { + max_errors = 0; } - ret = btrfs_add_device(trans, fs_info->chunk_root, device); - if (ret) - btrfs_abort_transaction(trans, root, ret); -out: - return ret; + return max_errors; } int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) @@ -4535,6 +4679,7 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) struct map_lookup *map; struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; int readonly = 0; + int miss_ndevs = 0; int i; read_lock(&map_tree->map_tree.lock); @@ -4543,18 +4688,27 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) if (!em) return 1; - if (btrfs_test_opt(root, DEGRADED)) { - free_extent_map(em); - return 0; - } - map = (struct map_lookup *)em->bdev; for (i = 0; i < map->num_stripes; i++) { + if (map->stripes[i].dev->missing) { + miss_ndevs++; + continue; + } + if (!map->stripes[i].dev->writeable) { readonly = 1; - break; 
+ goto end; } } + + /* + * If the number of missing devices is larger than max errors, + * we cannot write the data into that chunk successfully, so + * set it readonly. + */ + if (miss_ndevs > btrfs_chunk_max_errors(map)) + readonly = 1; +end: free_extent_map(em); return readonly; } @@ -4955,6 +5109,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, num_stripes = min_t(u64, map->num_stripes, stripe_nr_end - stripe_nr_orig); stripe_index = do_div(stripe_nr, map->num_stripes); + if (!(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))) + mirror_num = 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) num_stripes = map->num_stripes; @@ -5058,6 +5214,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, /* We distribute the parity blocks across stripes */ tmp = stripe_nr + stripe_index; stripe_index = do_div(tmp, map->num_stripes); + if (!(rw & (REQ_WRITE | REQ_DISCARD | + REQ_GET_READ_MIRRORS)) && mirror_num <= 1) + mirror_num = 1; } } else { /* @@ -5165,16 +5324,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } } - if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) { - if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_DUP)) { - max_errors = 1; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) { - max_errors = 2; - } - } + if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) + max_errors = btrfs_chunk_max_errors(map); if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) && dev_replace->tgtdev != NULL) { @@ -5557,8 +5708,8 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, name = rcu_dereference(dev->name); pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu " "(%s id %llu), size=%u\n", rw, - (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev, - name->str, dev->devid, bio->bi_size); + (u64)bio->bi_iter.bi_sector, (u_long)dev->bdev->bd_dev, + name->str, dev->devid, bio->bi_iter.bi_size); rcu_read_unlock(); } #endif @@ -5736,10 +5887,10 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, } static struct btrfs_device *add_missing_dev(struct btrfs_root *root, + struct btrfs_fs_devices *fs_devices, u64 devid, u8 *dev_uuid) { struct btrfs_device *device; - struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; device = btrfs_alloc_device(NULL, &devid, dev_uuid); if (IS_ERR(device)) @@ -5800,7 +5951,8 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, else generate_random_uuid(dev->uuid); - btrfs_init_work(&dev->work, pending_bios_fn, NULL, NULL); + btrfs_init_work(&dev->work, btrfs_submit_helper, + pending_bios_fn, NULL, NULL); return dev; } @@ -5875,7 +6027,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, } if (!map->stripes[i].dev) { map->stripes[i].dev = - add_missing_dev(root, devid, uuid); + add_missing_dev(root, root->fs_info->fs_devices, + devid, uuid); if (!map->stripes[i].dev) { free_extent_map(em); return -EIO; @@ -5902,7 +6055,9 @@ static void fill_device_from_item(struct extent_buffer *leaf, device->devid = btrfs_device_id(leaf, dev_item); device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); device->total_bytes = device->disk_total_bytes; + device->commit_total_bytes = device->disk_total_bytes; device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); + device->commit_bytes_used = device->bytes_used; device->type = btrfs_device_type(leaf, dev_item);
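A few hunks above, btrfs_add_system_chunk() now holds the chunk mutex across both the space check and the append, so the check-then-append on sys_chunk_array cannot race another writer. A minimal userspace sketch of that shape; the array size and names are illustrative stand-ins, not the kernel's:

#include <errno.h>
#include <stdio.h>
#include <string.h>

#define ARRAY_MAX 32                    /* stand-in for the fixed array size */

static char array[ARRAY_MAX];
static unsigned int array_size;

/* Check and append in one critical section; the kernel code holds
 * lock_chunks() across both steps so no other writer can race the
 * size check. */
static int add_entry(const void *item, unsigned int item_size)
{
        if (array_size + item_size > ARRAY_MAX)
                return -EFBIG;          /* would overflow the fixed array */
        memcpy(array + array_size, item, item_size);
        array_size += item_size;
        return 0;
}

int main(void)
{
        char item[20] = "chunk";

        printf("%d\n", add_entry(item, sizeof(item)));  /* 0: fits */
        printf("%d\n", add_entry(item, sizeof(item)));  /* -EFBIG: rejected */
        return 0;
}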
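The btrfs_chunk_readonly() rework above reduces to one rule: a chunk stays writable only while its missing devices do not exceed the failure tolerance that btrfs_chunk_max_errors() now computes in a single place. A sketch of the rule, using stand-in flags rather than the real BTRFS_BLOCK_GROUP_* values:

#include <stdio.h>

/* Stand-in profile flags; the real BTRFS_BLOCK_GROUP_* values differ. */
#define BG_RAID1        (1u << 0)
#define BG_RAID10       (1u << 1)
#define BG_RAID5        (1u << 2)
#define BG_RAID6        (1u << 3)
#define BG_DUP          (1u << 4)

static int chunk_max_errors(unsigned int type)
{
        if (type & (BG_RAID1 | BG_RAID10 | BG_RAID5 | BG_DUP))
                return 1;       /* one lost copy or parity stripe is fine */
        if (type & BG_RAID6)
                return 2;       /* double parity tolerates two failures */
        return 0;               /* SINGLE/RAID0: no redundancy at all */
}

static int chunk_readonly(unsigned int type, int missing_devs)
{
        return missing_devs > chunk_max_errors(type);
}

int main(void)
{
        printf("RAID6 with 2 missing: %s\n",
               chunk_readonly(BG_RAID6, 2) ? "readonly" : "writable");
        printf("RAID1 with 2 missing: %s\n",
               chunk_readonly(BG_RAID1, 2) ? "readonly" : "writable");
        return 0;
}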
device->io_align = btrfs_device_io_align(leaf, dev_item); device->io_width = btrfs_device_io_width(leaf, dev_item); @@ -5914,7 +6069,8 @@ static void fill_device_from_item(struct extent_buffer *leaf, read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); } -static int open_seed_devices(struct btrfs_root *root, u8 *fsid) +static struct btrfs_fs_devices *open_seed_devices(struct btrfs_root *root, + u8 *fsid) { struct btrfs_fs_devices *fs_devices; int ret; @@ -5923,49 +6079,56 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid) fs_devices = root->fs_info->fs_devices->seed; while (fs_devices) { - if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) { - ret = 0; - goto out; - } + if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) + return fs_devices; + fs_devices = fs_devices->seed; } fs_devices = find_fsid(fsid); if (!fs_devices) { - ret = -ENOENT; - goto out; + if (!btrfs_test_opt(root, DEGRADED)) + return ERR_PTR(-ENOENT); + + fs_devices = alloc_fs_devices(fsid); + if (IS_ERR(fs_devices)) + return fs_devices; + + fs_devices->seeding = 1; + fs_devices->opened = 1; + return fs_devices; } fs_devices = clone_fs_devices(fs_devices); - if (IS_ERR(fs_devices)) { - ret = PTR_ERR(fs_devices); - goto out; - } + if (IS_ERR(fs_devices)) + return fs_devices; ret = __btrfs_open_devices(fs_devices, FMODE_READ, root->fs_info->bdev_holder); if (ret) { free_fs_devices(fs_devices); + fs_devices = ERR_PTR(ret); goto out; } if (!fs_devices->seeding) { __btrfs_close_devices(fs_devices); free_fs_devices(fs_devices); - ret = -EINVAL; + fs_devices = ERR_PTR(-EINVAL); goto out; } fs_devices->seed = root->fs_info->fs_devices->seed; root->fs_info->fs_devices->seed = fs_devices; out: - return ret; + return fs_devices; } static int read_one_dev(struct btrfs_root *root, struct extent_buffer *leaf, struct btrfs_dev_item *dev_item) { + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; struct btrfs_device *device; u64 devid; int ret; @@ -5979,31 +6142,48 @@ static int read_one_dev(struct btrfs_root *root, BTRFS_UUID_SIZE); if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) { - ret = open_seed_devices(root, fs_uuid); - if (ret && !btrfs_test_opt(root, DEGRADED)) - return ret; + fs_devices = open_seed_devices(root, fs_uuid); + if (IS_ERR(fs_devices)) + return PTR_ERR(fs_devices); } device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid); - if (!device || !device->bdev) { + if (!device) { if (!btrfs_test_opt(root, DEGRADED)) return -EIO; - if (!device) { - btrfs_warn(root->fs_info, "devid %llu missing", devid); - device = add_missing_dev(root, devid, dev_uuid); - if (!device) - return -ENOMEM; - } else if (!device->missing) { + btrfs_warn(root->fs_info, "devid %llu missing", devid); + device = add_missing_dev(root, fs_devices, devid, dev_uuid); + if (!device) + return -ENOMEM; + } else { + if (!device->bdev && !btrfs_test_opt(root, DEGRADED)) + return -EIO; + + if (!device->bdev && !device->missing) { /* * this happens when a device that was properly set up * in the device info lists suddenly goes bad.
* device->bdev is NULL, and so we have to set * device->missing to one here */ - root->fs_info->fs_devices->missing_devices++; + device->fs_devices->missing_devices++; device->missing = 1; } + + /* Move the device to its own fs_devices */ + if (device->fs_devices != fs_devices) { + ASSERT(device->missing); + + list_move(&device->dev_list, &fs_devices->devices); + device->fs_devices->num_devices--; + fs_devices->num_devices++; + + device->fs_devices->missing_devices--; + fs_devices->missing_devices++; + + device->fs_devices = fs_devices; + } } if (device->fs_devices != root->fs_info->fs_devices) { @@ -6319,16 +6499,18 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, struct btrfs_root *dev_root = fs_info->dev_root; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; + int stats_cnt; int ret = 0; mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { - if (!device->dev_stats_valid || !device->dev_stats_dirty) + if (!device->dev_stats_valid || !btrfs_dev_stats_dirty(device)) continue; + stats_cnt = atomic_read(&device->dev_stats_ccnt); ret = update_dev_stat_item(trans, dev_root, device); if (!ret) - device->dev_stats_dirty = 0; + atomic_sub(stats_cnt, &device->dev_stats_ccnt); } mutex_unlock(&fs_devices->device_list_mutex); @@ -6427,3 +6609,51 @@ int btrfs_scratch_superblock(struct btrfs_device *device) return 0; } + +/* + * Update the size of all devices, which is used for writing out the + * super blocks. + */ +void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *curr, *next; + + if (list_empty(&fs_devices->resized_devices)) + return; + + mutex_lock(&fs_devices->device_list_mutex); + lock_chunks(fs_info->dev_root); + list_for_each_entry_safe(curr, next, &fs_devices->resized_devices, + resized_list) { + list_del_init(&curr->resized_list); + curr->commit_total_bytes = curr->disk_total_bytes; + } + unlock_chunks(fs_info->dev_root); + mutex_unlock(&fs_devices->device_list_mutex); +} + +/* Must be invoked during the transaction commit */ +void btrfs_update_commit_device_bytes_used(struct btrfs_root *root, + struct btrfs_transaction *transaction) +{ + struct extent_map *em; + struct map_lookup *map; + struct btrfs_device *dev; + int i; + + if (list_empty(&transaction->pending_chunks)) + return; + + /* In order to kick the device replace finish process */ + lock_chunks(root); + list_for_each_entry(em, &transaction->pending_chunks, list) { + map = (struct map_lookup *)em->bdev; + + for (i = 0; i < map->num_stripes; i++) { + dev = map->stripes[i].dev; + dev->commit_bytes_used = dev->bytes_used; + } + } + unlock_chunks(root); +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 2aaa00c47816..08980fa23039 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -24,6 +24,8 @@ #include <linux/btrfs.h> #include "async-thread.h" +extern struct mutex uuid_mutex; + #define BTRFS_STRIPE_LEN (64 * 1024) struct buffer_head; @@ -32,41 +34,59 @@ struct btrfs_pending_bios { struct bio *tail; }; +/* + * Use sequence counter to get consistent device stat data on + * 32-bit processors. 
+ */ +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#include <linux/seqlock.h> +#define __BTRFS_NEED_DEVICE_DATA_ORDERED +#define btrfs_device_data_ordered_init(device) \ + seqcount_init(&device->data_seqcount) +#else +#define btrfs_device_data_ordered_init(device) do { } while (0) +#endif + struct btrfs_device { struct list_head dev_list; struct list_head dev_alloc_list; struct btrfs_fs_devices *fs_devices; + struct btrfs_root *dev_root; + struct rcu_string *name; + + u64 generation; + + spinlock_t io_lock ____cacheline_aligned; + int running_pending; /* regular prio bios */ struct btrfs_pending_bios pending_bios; /* WRITE_SYNC bios */ struct btrfs_pending_bios pending_sync_bios; - u64 generation; - int running_pending; + struct block_device *bdev; + + /* the mode sent to blkdev_get */ + fmode_t mode; + int writeable; int in_fs_metadata; int missing; int can_discard; int is_tgtdev_for_dev_replace; - spinlock_t io_lock; - /* the mode sent to blkdev_get */ - fmode_t mode; - - struct block_device *bdev; - - - struct rcu_string *name; +#ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED + seqcount_t data_seqcount; +#endif /* the internal btrfs device id */ u64 devid; - /* size of the device */ + /* size of the device in memory */ u64 total_bytes; - /* size of the disk */ + /* size of the device on disk */ u64 disk_total_bytes; /* bytes used */ @@ -83,10 +103,26 @@ struct btrfs_device { /* minimal io size for this device */ u32 sector_size; - /* physical drive uuid (or lvm uuid) */ u8 uuid[BTRFS_UUID_SIZE]; + /* + * size of the device in the current transaction + * + * This value is updated when committing the transaction, + * and is protected by device_list_mutex + */ + u64 commit_total_bytes; + + /* bytes used in the current transaction */ + u64 commit_bytes_used; + /* + * used to manage devices that have been resized + * + * It is protected by chunk_lock. + */ + struct list_head resized_list; + /* for sending down flush barriers */ int nobarriers; struct bio *flush_bio; @@ -107,26 +143,90 @@ struct btrfs_device { struct radix_tree_root reada_zones; struct radix_tree_root reada_extents; - /* disk I/O failure stats. For detailed description refer to * enum btrfs_dev_stat_values in ioctl.h */ int dev_stats_valid; - int dev_stats_dirty; /* counters need to be written to disk */ + + /* Counter to record the change of device stats */ + atomic_t dev_stats_ccnt; atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX]; }; +/* + * If these values are read under their own lock, the following helpers + * are not needed; reading them directly is safe. + */
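The BTRFS_DEVICE_GETSET_FUNCS() macros that follow implement exactly this: on 32-bit SMP a 64-bit load can tear, so readers retry until they observe an even, unchanged sequence while the writer bumps it around each update. Below is a hedged userspace analogue using C11 atomics; it assumes a single writer and deliberately simplifies the memory ordering that the kernel's seqcount_t handles for real (preemption, lockdep, fences):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static atomic_uint seq;         /* odd while a write is in progress */
static uint64_t total_bytes;    /* may tear on 32-bit without this scheme */

static void set_total_bytes(uint64_t v)         /* single writer assumed */
{
        atomic_fetch_add(&seq, 1);      /* sequence goes odd */
        total_bytes = v;                /* plain store; a real seqlock
                                         * orders this with fences */
        atomic_fetch_add(&seq, 1);      /* sequence goes even again */
}

static uint64_t get_total_bytes(void)
{
        unsigned int s;
        uint64_t v;

        do {
                s = atomic_load(&seq);
                v = total_bytes;
        } while ((s & 1) || s != atomic_load(&seq));    /* retry on change */
        return v;
}

int main(void)
{
        set_total_bytes(5ULL << 40);
        printf("%llu\n", (unsigned long long)get_total_bytes());
        return 0;
}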
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#define BTRFS_DEVICE_GETSET_FUNCS(name) \ +static inline u64 \ +btrfs_device_get_##name(const struct btrfs_device *dev) \ +{ \ + u64 size; \ + unsigned int seq; \ + \ + do { \ + seq = read_seqcount_begin(&dev->data_seqcount); \ + size = dev->name; \ + } while (read_seqcount_retry(&dev->data_seqcount, seq)); \ + return size; \ +} \ + \ +static inline void \ +btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \ +{ \ + preempt_disable(); \ + write_seqcount_begin(&dev->data_seqcount); \ + dev->name = size; \ + write_seqcount_end(&dev->data_seqcount); \ + preempt_enable(); \ +} +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT) +#define BTRFS_DEVICE_GETSET_FUNCS(name) \ +static inline u64 \ +btrfs_device_get_##name(const struct btrfs_device *dev) \ +{ \ + u64 size; \ + \ + preempt_disable(); \ + size = dev->name; \ + preempt_enable(); \ + return size; \ +} \ + \ +static inline void \ +btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \ +{ \ + preempt_disable(); \ + dev->name = size; \ + preempt_enable(); \ +} +#else +#define BTRFS_DEVICE_GETSET_FUNCS(name) \ +static inline u64 \ +btrfs_device_get_##name(const struct btrfs_device *dev) \ +{ \ + return dev->name; \ +} \ + \ +static inline void \ +btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \ +{ \ + dev->name = size; \ +} +#endif + +BTRFS_DEVICE_GETSET_FUNCS(total_bytes); +BTRFS_DEVICE_GETSET_FUNCS(disk_total_bytes); +BTRFS_DEVICE_GETSET_FUNCS(bytes_used); + struct btrfs_fs_devices { u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ - /* the device with this id has the most recent copy of the super */ - u64 latest_devid; - u64 latest_trans; u64 num_devices; u64 open_devices; u64 rw_devices; u64 missing_devices; u64 total_rw_bytes; - u64 num_can_discard; u64 total_devices; struct block_device *latest_bdev; @@ -139,6 +239,7 @@ struct btrfs_fs_devices { struct mutex device_list_mutex; struct list_head devices; + struct list_head resized_devices; /* devices not currently being allocated */ struct list_head alloc_list; struct list_head list; @@ -167,8 +268,9 @@ struct btrfs_fs_devices { */ typedef void (btrfs_io_bio_end_io_t) (struct btrfs_io_bio *bio, int err); struct btrfs_io_bio { - unsigned long mirror_num; - unsigned long stripe_index; + unsigned int mirror_num; + unsigned int stripe_index; + u64 logical; u8 *csum; u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; u8 *csum_allocated; @@ -325,6 +427,7 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); int btrfs_init_new_device(struct btrfs_root *root, char *path); int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, + struct btrfs_device *srcdev, struct btrfs_device **device_out); int btrfs_balance(struct btrfs_balance_control *bctl, struct btrfs_ioctl_balance_args *bargs); @@ -360,11 +463,20 @@ unsigned long btrfs_full_stripe_len(struct btrfs_root *root, int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 chunk_offset, u64 chunk_size); +int btrfs_remove_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 chunk_offset); + +static inline int btrfs_dev_stats_dirty(struct btrfs_device *dev) +{ + return atomic_read(&dev->dev_stats_ccnt); +} + static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, int index) { atomic_inc(dev->dev_stat_values + index); - dev->dev_stats_dirty = 1; + smp_mb__before_atomic(); + 
atomic_inc(&dev->dev_stats_ccnt); } static inline int btrfs_dev_stat_read(struct btrfs_device *dev, @@ -379,7 +491,8 @@ static inline int btrfs_dev_stat_read_and_reset(struct btrfs_device *dev, int ret; ret = atomic_xchg(dev->dev_stat_values + index, 0); - dev->dev_stats_dirty = 1; + smp_mb__before_atomic(); + atomic_inc(&dev->dev_stats_ccnt); return ret; } @@ -387,7 +500,8 @@ static inline void btrfs_dev_stat_set(struct btrfs_device *dev, int index, unsigned long val) { atomic_set(dev->dev_stat_values + index, val); - dev->dev_stats_dirty = 1; + smp_mb__before_atomic(); + atomic_inc(&dev->dev_stats_ccnt); } static inline void btrfs_dev_stat_reset(struct btrfs_device *dev, @@ -395,4 +509,8 @@ static inline void btrfs_dev_stat_reset(struct btrfs_device *dev, { btrfs_dev_stat_set(dev, index, 0); } + +void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info); +void btrfs_update_commit_device_bytes_used(struct btrfs_root *root, + struct btrfs_transaction *transaction); #endif diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index ad8328d797ea..dcf20131fbe4 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -237,7 +237,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) * first xattr that we find and walk forward */ key.objectid = btrfs_ino(inode); - btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); + key.type = BTRFS_XATTR_ITEM_KEY; key.offset = 0; path = btrfs_alloc_path(); @@ -273,7 +273,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) /* check to make sure this item is what we want */ if (found_key.objectid != key.objectid) break; - if (btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY) + if (found_key.type != BTRFS_XATTR_ITEM_KEY) break; di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index b67d8fc81277..759fa4e2de8f 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -33,8 +33,7 @@ #include "compression.h" struct workspace { - z_stream inf_strm; - z_stream def_strm; + z_stream strm; char *buf; struct list_head list; }; @@ -43,8 +42,7 @@ static void zlib_free_workspace(struct list_head *ws) { struct workspace *workspace = list_entry(ws, struct workspace, list); - vfree(workspace->def_strm.workspace); - vfree(workspace->inf_strm.workspace); + vfree(workspace->strm.workspace); kfree(workspace->buf); kfree(workspace); } @@ -52,17 +50,17 @@ static void zlib_free_workspace(struct list_head *ws) static struct list_head *zlib_alloc_workspace(void) { struct workspace *workspace; + int workspacesize; workspace = kzalloc(sizeof(*workspace), GFP_NOFS); if (!workspace) return ERR_PTR(-ENOMEM); - workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize( - MAX_WBITS, MAX_MEM_LEVEL)); - workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize()); + workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), + zlib_inflate_workspacesize()); + workspace->strm.workspace = vmalloc(workspacesize); workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS); - if (!workspace->def_strm.workspace || - !workspace->inf_strm.workspace || !workspace->buf) + if (!workspace->strm.workspace || !workspace->buf) goto fail; INIT_LIST_HEAD(&workspace->list); @@ -96,14 +94,14 @@ static int zlib_compress_pages(struct list_head *ws, *total_out = 0; *total_in = 0; - if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { + if (Z_OK != zlib_deflateInit(&workspace->strm, 3)) { printk(KERN_WARNING "BTRFS: deflateInit failed\n"); ret = -EIO; goto out; } - 
workspace->def_strm.total_in = 0; - workspace->def_strm.total_out = 0; + workspace->strm.total_in = 0; + workspace->strm.total_out = 0; in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); data_in = kmap(in_page); @@ -117,25 +115,25 @@ static int zlib_compress_pages(struct list_head *ws, pages[0] = out_page; nr_pages = 1; - workspace->def_strm.next_in = data_in; - workspace->def_strm.next_out = cpage_out; - workspace->def_strm.avail_out = PAGE_CACHE_SIZE; - workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE); + workspace->strm.next_in = data_in; + workspace->strm.next_out = cpage_out; + workspace->strm.avail_out = PAGE_CACHE_SIZE; + workspace->strm.avail_in = min(len, PAGE_CACHE_SIZE); - while (workspace->def_strm.total_in < len) { - ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH); + while (workspace->strm.total_in < len) { + ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH); if (ret != Z_OK) { printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n", ret); - zlib_deflateEnd(&workspace->def_strm); + zlib_deflateEnd(&workspace->strm); ret = -EIO; goto out; } /* we're making it bigger, give up */ - if (workspace->def_strm.total_in > 8192 && - workspace->def_strm.total_in < - workspace->def_strm.total_out) { + if (workspace->strm.total_in > 8192 && + workspace->strm.total_in < + workspace->strm.total_out) { ret = -E2BIG; goto out; } @@ -143,7 +141,7 @@ static int zlib_compress_pages(struct list_head *ws, * before the total_in so we will pull in a new page for * the stream end if required */ - if (workspace->def_strm.avail_out == 0) { + if (workspace->strm.avail_out == 0) { kunmap(out_page); if (nr_pages == nr_dest_pages) { out_page = NULL; @@ -158,19 +156,19 @@ static int zlib_compress_pages(struct list_head *ws, cpage_out = kmap(out_page); pages[nr_pages] = out_page; nr_pages++; - workspace->def_strm.avail_out = PAGE_CACHE_SIZE; - workspace->def_strm.next_out = cpage_out; + workspace->strm.avail_out = PAGE_CACHE_SIZE; + workspace->strm.next_out = cpage_out; } /* we're all done */ - if (workspace->def_strm.total_in >= len) + if (workspace->strm.total_in >= len) break; /* we've read in a full page, get a new one */ - if (workspace->def_strm.avail_in == 0) { - if (workspace->def_strm.total_out > max_out) + if (workspace->strm.avail_in == 0) { + if (workspace->strm.total_out > max_out) break; - bytes_left = len - workspace->def_strm.total_in; + bytes_left = len - workspace->strm.total_in; kunmap(in_page); page_cache_release(in_page); @@ -178,28 +176,28 @@ static int zlib_compress_pages(struct list_head *ws, in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); data_in = kmap(in_page); - workspace->def_strm.avail_in = min(bytes_left, + workspace->strm.avail_in = min(bytes_left, PAGE_CACHE_SIZE); - workspace->def_strm.next_in = data_in; + workspace->strm.next_in = data_in; } } - workspace->def_strm.avail_in = 0; - ret = zlib_deflate(&workspace->def_strm, Z_FINISH); - zlib_deflateEnd(&workspace->def_strm); + workspace->strm.avail_in = 0; + ret = zlib_deflate(&workspace->strm, Z_FINISH); + zlib_deflateEnd(&workspace->strm); if (ret != Z_STREAM_END) { ret = -EIO; goto out; } - if (workspace->def_strm.total_out >= workspace->def_strm.total_in) { + if (workspace->strm.total_out >= workspace->strm.total_in) { ret = -E2BIG; goto out; } ret = 0; - *total_out = workspace->def_strm.total_out; - *total_in = workspace->def_strm.total_in; + *total_out = workspace->strm.total_out; + *total_in = workspace->strm.total_in; out: *out_pages = nr_pages; if (out_page) @@ -225,19 +223,18 @@ 
static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, size_t total_out = 0; unsigned long page_in_index = 0; unsigned long page_out_index = 0; - unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / - PAGE_CACHE_SIZE; + unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_CACHE_SIZE); unsigned long buf_start; unsigned long pg_offset; data_in = kmap(pages_in[page_in_index]); - workspace->inf_strm.next_in = data_in; - workspace->inf_strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE); - workspace->inf_strm.total_in = 0; + workspace->strm.next_in = data_in; + workspace->strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE); + workspace->strm.total_in = 0; - workspace->inf_strm.total_out = 0; - workspace->inf_strm.next_out = workspace->buf; - workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; + workspace->strm.total_out = 0; + workspace->strm.next_out = workspace->buf; + workspace->strm.avail_out = PAGE_CACHE_SIZE; pg_offset = 0; /* If it's deflate, and it's got no preset dictionary, then @@ -247,21 +244,21 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, !(((data_in[0]<<8) + data_in[1]) % 31)) { wbits = -((data_in[0] >> 4) + 8); - workspace->inf_strm.next_in += 2; - workspace->inf_strm.avail_in -= 2; + workspace->strm.next_in += 2; + workspace->strm.avail_in -= 2; } - if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { + if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) { printk(KERN_WARNING "BTRFS: inflateInit failed\n"); return -EIO; } - while (workspace->inf_strm.total_in < srclen) { - ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); + while (workspace->strm.total_in < srclen) { + ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH); if (ret != Z_OK && ret != Z_STREAM_END) break; buf_start = total_out; - total_out = workspace->inf_strm.total_out; + total_out = workspace->strm.total_out; /* we didn't make progress in this inflate call, we're done */ if (buf_start == total_out) @@ -276,10 +273,10 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, goto done; } - workspace->inf_strm.next_out = workspace->buf; - workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; + workspace->strm.next_out = workspace->buf; + workspace->strm.avail_out = PAGE_CACHE_SIZE; - if (workspace->inf_strm.avail_in == 0) { + if (workspace->strm.avail_in == 0) { unsigned long tmp; kunmap(pages_in[page_in_index]); page_in_index++; @@ -288,9 +285,9 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, break; } data_in = kmap(pages_in[page_in_index]); - workspace->inf_strm.next_in = data_in; - tmp = srclen - workspace->inf_strm.total_in; - workspace->inf_strm.avail_in = min(tmp, + workspace->strm.next_in = data_in; + tmp = srclen - workspace->strm.total_in; + workspace->strm.avail_in = min(tmp, PAGE_CACHE_SIZE); } } @@ -299,7 +296,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, else ret = 0; done: - zlib_inflateEnd(&workspace->inf_strm); + zlib_inflateEnd(&workspace->strm); if (data_in) kunmap(pages_in[page_in_index]); return ret; @@ -317,13 +314,13 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in, unsigned long total_out = 0; char *kaddr; - workspace->inf_strm.next_in = data_in; - workspace->inf_strm.avail_in = srclen; - workspace->inf_strm.total_in = 0; + workspace->strm.next_in = data_in; + workspace->strm.avail_in = srclen; + workspace->strm.total_in = 0; - workspace->inf_strm.next_out = workspace->buf; - 
workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; - workspace->inf_strm.total_out = 0; + workspace->strm.next_out = workspace->buf; + workspace->strm.avail_out = PAGE_CACHE_SIZE; + workspace->strm.total_out = 0; /* If it's deflate, and it's got no preset dictionary, then we can tell zlib to skip the adler32 check. */ if (srclen > 2 && !(data_in[1] & PRESET_DICT) && @@ -331,11 +328,11 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in, !(((data_in[0]<<8) + data_in[1]) % 31)) { wbits = -((data_in[0] >> 4) + 8); - workspace->inf_strm.next_in += 2; - workspace->inf_strm.avail_in -= 2; + workspace->strm.next_in += 2; + workspace->strm.avail_in -= 2; } - if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { + if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) { printk(KERN_WARNING "BTRFS: inflateInit failed\n"); return -EIO; } @@ -346,12 +343,12 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in, unsigned long bytes; unsigned long pg_offset = 0; - ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); + ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH); if (ret != Z_OK && ret != Z_STREAM_END) break; buf_start = total_out; - total_out = workspace->inf_strm.total_out; + total_out = workspace->strm.total_out; if (total_out == buf_start) { ret = -EIO; @@ -377,8 +374,8 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in, pg_offset += bytes; bytes_left -= bytes; next: - workspace->inf_strm.next_out = workspace->buf; - workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; + workspace->strm.next_out = workspace->buf; + workspace->strm.avail_out = PAGE_CACHE_SIZE; } if (ret != Z_STREAM_END && bytes_left != 0) @@ -386,7 +383,7 @@ next: else ret = 0; - zlib_inflateEnd(&workspace->inf_strm); + zlib_inflateEnd(&workspace->strm); return ret; } diff --git a/fs/buffer.c b/fs/buffer.c index eba6e4f621ce..44c14a87750e 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -61,16 +61,9 @@ inline void touch_buffer(struct buffer_head *bh) } EXPORT_SYMBOL(touch_buffer); -static int sleep_on_buffer(void *word) -{ - io_schedule(); - return 0; -} - void __lock_buffer(struct buffer_head *bh) { - wait_on_bit_lock(&bh->b_state, BH_Lock, sleep_on_buffer, - TASK_UNINTERRUPTIBLE); + wait_on_bit_lock_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(__lock_buffer); @@ -123,7 +116,7 @@ EXPORT_SYMBOL(buffer_check_dirty_writeback); */ void __wait_on_buffer(struct buffer_head * bh) { - wait_on_bit(&bh->b_state, BH_Lock, sleep_on_buffer, TASK_UNINTERRUPTIBLE); + wait_on_bit_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(__wait_on_buffer); @@ -1029,7 +1022,8 @@ grow_dev_page(struct block_device *bdev, sector_t block, bh = page_buffers(page); if (bh->b_size == size) { end_block = init_page_buffers(page, bdev, - index << sizebits, size); + (sector_t)index << sizebits, + size); goto done; } if (!try_to_free_buffers(page)) @@ -1050,7 +1044,8 @@ grow_dev_page(struct block_device *bdev, sector_t block, */ spin_lock(&inode->i_mapping->private_lock); link_dev_buffers(page, bh); - end_block = init_page_buffers(page, bdev, index << sizebits, size); + end_block = init_page_buffers(page, bdev, (sector_t)index << sizebits, + size); spin_unlock(&inode->i_mapping->private_lock); done: ret = (block < end_block) ? 1 : -ENXIO; @@ -1258,7 +1253,7 @@ static struct buffer_head *__bread_slow(struct buffer_head *bh) * a local interrupt disable for that. 
*/ -#define BH_LRU_SIZE 8 +#define BH_LRU_SIZE 16 struct bh_lru { struct buffer_head *bhs[BH_LRU_SIZE]; @@ -2961,7 +2956,7 @@ static void end_bio_bh_io_sync(struct bio *bio, int err) /* * This allows us to do IO even on the odd last sectors - * of a device, even if the bh block size is some multiple + * of a device, even if the block size is some multiple * of the physical sector size. * * We'll just truncate the bio to the size of the device, @@ -2971,10 +2966,11 @@ static void end_bio_bh_io_sync(struct bio *bio, int err) * errors, this only handles the "we need to be able to * do IO at the final sector" case. */ -static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh) +void guard_bio_eod(int rw, struct bio *bio) { sector_t maxsector; - unsigned bytes; + struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1]; + unsigned truncated_bytes; maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9; if (!maxsector) @@ -2989,23 +2985,20 @@ static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh) return; maxsector -= bio->bi_iter.bi_sector; - bytes = bio->bi_iter.bi_size; - if (likely((bytes >> 9) <= maxsector)) + if (likely((bio->bi_iter.bi_size >> 9) <= maxsector)) return; - /* Uhhuh. We've got a bh that straddles the device size! */ - bytes = maxsector << 9; + /* Uhhuh. We've got a bio that straddles the device size! */ + truncated_bytes = bio->bi_iter.bi_size - (maxsector << 9); /* Truncate the bio.. */ - bio->bi_iter.bi_size = bytes; - bio->bi_io_vec[0].bv_len = bytes; + bio->bi_iter.bi_size -= truncated_bytes; + bvec->bv_len -= truncated_bytes; /* ..and clear the end of the buffer for reads */ if ((rw & RW_MASK) == READ) { - void *kaddr = kmap_atomic(bh->b_page); - memset(kaddr + bh_offset(bh) + bytes, 0, bh->b_size - bytes); - kunmap_atomic(kaddr); - flush_dcache_page(bh->b_page); + zero_user(bvec->bv_page, bvec->bv_offset + bvec->bv_len, + truncated_bytes); } } @@ -3046,7 +3039,7 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags) bio->bi_flags |= bio_flags; /* Take care of bh's that straddle the end of the device */ - guard_bh_eod(rw, bio, bh); + guard_bio_eod(rw, bio); if (buffer_meta(bh)) rw |= REQ_META; diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c index d749731dc0ee..fbb08e97438d 100644 --- a/fs/cachefiles/bind.c +++ b/fs/cachefiles/bind.c @@ -50,18 +50,18 @@ int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args) cache->brun_percent < 100); if (*args) { - pr_err("'bind' command doesn't take an argument"); + pr_err("'bind' command doesn't take an argument\n"); return -EINVAL; } if (!cache->rootdirname) { - pr_err("No cache directory specified"); + pr_err("No cache directory specified\n"); return -EINVAL; } /* don't permit already bound caches to be re-bound */ if (test_bit(CACHEFILES_READY, &cache->flags)) { - pr_err("Cache already bound"); + pr_err("Cache already bound\n"); return -EBUSY; } @@ -248,7 +248,7 @@ error_open_root: kmem_cache_free(cachefiles_object_jar, fsdef); error_root_object: cachefiles_end_secure(cache, saved_cred); - pr_err("Failed to register: %d", ret); + pr_err("Failed to register: %d\n", ret); return ret; } diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c index b078d3081d6c..ce1b115dcc28 100644 --- a/fs/cachefiles/daemon.c +++ b/fs/cachefiles/daemon.c @@ -315,7 +315,7 @@ static unsigned int cachefiles_daemon_poll(struct file *file, static int cachefiles_daemon_range_error(struct cachefiles_cache *cache, char *args) { - pr_err("Free space limits must be in range 
0%%<=stop<cull<run<100%%"); + pr_err("Free space limits must be in range 0%%<=stop<cull<run<100%%\n"); return -EINVAL; } @@ -475,12 +475,12 @@ static int cachefiles_daemon_dir(struct cachefiles_cache *cache, char *args) _enter(",%s", args); if (!*args) { - pr_err("Empty directory specified"); + pr_err("Empty directory specified\n"); return -EINVAL; } if (cache->rootdirname) { - pr_err("Second cache directory specified"); + pr_err("Second cache directory specified\n"); return -EEXIST; } @@ -503,12 +503,12 @@ static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args) _enter(",%s", args); if (!*args) { - pr_err("Empty security context specified"); + pr_err("Empty security context specified\n"); return -EINVAL; } if (cache->secctx) { - pr_err("Second security context specified"); + pr_err("Second security context specified\n"); return -EINVAL; } @@ -531,7 +531,7 @@ static int cachefiles_daemon_tag(struct cachefiles_cache *cache, char *args) _enter(",%s", args); if (!*args) { - pr_err("Empty tag specified"); + pr_err("Empty tag specified\n"); return -EINVAL; } @@ -562,12 +562,12 @@ static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args) goto inval; if (!test_bit(CACHEFILES_READY, &cache->flags)) { - pr_err("cull applied to unready cache"); + pr_err("cull applied to unready cache\n"); return -EIO; } if (test_bit(CACHEFILES_DEAD, &cache->flags)) { - pr_err("cull applied to dead cache"); + pr_err("cull applied to dead cache\n"); return -EIO; } @@ -587,11 +587,11 @@ static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args) notdir: path_put(&path); - pr_err("cull command requires dirfd to be a directory"); + pr_err("cull command requires dirfd to be a directory\n"); return -ENOTDIR; inval: - pr_err("cull command requires dirfd and filename"); + pr_err("cull command requires dirfd and filename\n"); return -EINVAL; } @@ -614,7 +614,7 @@ static int cachefiles_daemon_debug(struct cachefiles_cache *cache, char *args) return 0; inval: - pr_err("debug command requires mask"); + pr_err("debug command requires mask\n"); return -EINVAL; } @@ -634,12 +634,12 @@ static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args) goto inval; if (!test_bit(CACHEFILES_READY, &cache->flags)) { - pr_err("inuse applied to unready cache"); + pr_err("inuse applied to unready cache\n"); return -EIO; } if (test_bit(CACHEFILES_DEAD, &cache->flags)) { - pr_err("inuse applied to dead cache"); + pr_err("inuse applied to dead cache\n"); return -EIO; } @@ -659,11 +659,11 @@ static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args) notdir: path_put(&path); - pr_err("inuse command requires dirfd to be a directory"); + pr_err("inuse command requires dirfd to be a directory\n"); return -ENOTDIR; inval: - pr_err("inuse command requires dirfd and filename"); + pr_err("inuse command requires dirfd and filename\n"); return -EINVAL; } diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 3d50998abf57..8c52472d2efa 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -255,7 +255,7 @@ extern int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, #define cachefiles_io_error(___cache, FMT, ...) 
\ do { \ - pr_err("I/O Error: " FMT, ##__VA_ARGS__); \ + pr_err("I/O Error: " FMT"\n", ##__VA_ARGS__); \ fscache_io_error(&(___cache)->cache); \ set_bit(CACHEFILES_DEAD, &(___cache)->flags); \ } while (0) diff --git a/fs/cachefiles/main.c b/fs/cachefiles/main.c index 180edfb45f66..711f13d8c2de 100644 --- a/fs/cachefiles/main.c +++ b/fs/cachefiles/main.c @@ -84,7 +84,7 @@ error_proc: error_object_jar: misc_deregister(&cachefiles_dev); error_dev: - pr_err("failed to register: %d", ret); + pr_err("failed to register: %d\n", ret); return ret; } diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 5bf2b41e66d3..dad7d9542a24 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -543,7 +543,7 @@ lookup_again: next, next->d_inode, next->d_inode->i_ino); } else if (!S_ISDIR(next->d_inode->i_mode)) { - pr_err("inode %lu is not a directory", + pr_err("inode %lu is not a directory\n", next->d_inode->i_ino); ret = -ENOBUFS; goto error; @@ -574,7 +574,7 @@ lookup_again: } else if (!S_ISDIR(next->d_inode->i_mode) && !S_ISREG(next->d_inode->i_mode) ) { - pr_err("inode %lu is not a file or directory", + pr_err("inode %lu is not a file or directory\n", next->d_inode->i_ino); ret = -ENOBUFS; goto error; @@ -768,7 +768,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, ASSERT(subdir->d_inode); if (!S_ISDIR(subdir->d_inode->i_mode)) { - pr_err("%s is not a directory", dirname); + pr_err("%s is not a directory\n", dirname); ret = -EIO; goto check_error; } @@ -779,7 +779,8 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, !subdir->d_inode->i_op->lookup || !subdir->d_inode->i_op->mkdir || !subdir->d_inode->i_op->create || - !subdir->d_inode->i_op->rename || + (!subdir->d_inode->i_op->rename && + !subdir->d_inode->i_op->rename2) || !subdir->d_inode->i_op->rmdir || !subdir->d_inode->i_op->unlink) goto check_error; @@ -795,13 +796,13 @@ check_error: mkdir_error: mutex_unlock(&dir->d_inode->i_mutex); dput(subdir); - pr_err("mkdir %s failed with error %d", dirname, ret); + pr_err("mkdir %s failed with error %d\n", dirname, ret); return ERR_PTR(ret); lookup_error: mutex_unlock(&dir->d_inode->i_mutex); ret = PTR_ERR(subdir); - pr_err("Lookup %s failed with error %d", dirname, ret); + pr_err("Lookup %s failed with error %d\n", dirname, ret); return ERR_PTR(ret); nomem_d_alloc: @@ -891,7 +892,7 @@ lookup_error: if (ret == -EIO) { cachefiles_io_error(cache, "Lookup failed"); } else if (ret != -ENOMEM) { - pr_err("Internal error: %d", ret); + pr_err("Internal error: %d\n", ret); ret = -EIO; } @@ -950,7 +951,7 @@ error: } if (ret != -ENOMEM) { - pr_err("Internal error: %d", ret); + pr_err("Internal error: %d\n", ret); ret = -EIO; } diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 4b1fb5ca65b8..25e745b8eb1b 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -151,7 +151,6 @@ static void cachefiles_read_copier(struct fscache_operation *_op) struct cachefiles_one_read *monitor; struct cachefiles_object *object; struct fscache_retrieval *op; - struct pagevec pagevec; int error, max; op = container_of(_op, struct fscache_retrieval, op); @@ -160,8 +159,6 @@ static void cachefiles_read_copier(struct fscache_operation *_op) _enter("{ino=%lu}", object->backer->d_inode->i_ino); - pagevec_init(&pagevec, 0); - max = 8; spin_lock_irq(&object->work_lock); @@ -396,7 +393,6 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op, { struct cachefiles_object *object; struct cachefiles_cache *cache; - struct pagevec pagevec; struct 
inode *inode; sector_t block0, block; unsigned shift; @@ -427,8 +423,6 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op, op->op.flags |= FSCACHE_OP_ASYNC; op->op.processor = cachefiles_read_copier; - pagevec_init(&pagevec, 0); - /* we assume the absence or presence of the first block is a good * enough indication for the page as a whole * - TODO: don't use bmap() for this as it is _not_ actually good diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c index 1ad51ffbb275..acbc1f094fb1 100644 --- a/fs/cachefiles/xattr.c +++ b/fs/cachefiles/xattr.c @@ -51,7 +51,7 @@ int cachefiles_check_object_type(struct cachefiles_object *object) } if (ret != -EEXIST) { - pr_err("Can't set xattr on %*.*s [%lu] (err %d)", + pr_err("Can't set xattr on %*.*s [%lu] (err %d)\n", dentry->d_name.len, dentry->d_name.len, dentry->d_name.name, dentry->d_inode->i_ino, -ret); @@ -64,7 +64,7 @@ int cachefiles_check_object_type(struct cachefiles_object *object) if (ret == -ERANGE) goto bad_type_length; - pr_err("Can't read xattr on %*.*s [%lu] (err %d)", + pr_err("Can't read xattr on %*.*s [%lu] (err %d)\n", dentry->d_name.len, dentry->d_name.len, dentry->d_name.name, dentry->d_inode->i_ino, -ret); @@ -85,14 +85,14 @@ error: return ret; bad_type_length: - pr_err("Cache object %lu type xattr length incorrect", + pr_err("Cache object %lu type xattr length incorrect\n", dentry->d_inode->i_ino); ret = -EIO; goto error; bad_type: xtype[2] = 0; - pr_err("Cache object %*.*s [%lu] type %s not %s", + pr_err("Cache object %*.*s [%lu] type %s not %s\n", dentry->d_name.len, dentry->d_name.len, dentry->d_name.name, dentry->d_inode->i_ino, xtype, type); @@ -293,7 +293,7 @@ error: return ret; bad_type_length: - pr_err("Cache object %lu xattr length incorrect", + pr_err("Cache object %lu xattr length incorrect\n", dentry->d_inode->i_ino); ret = -EIO; goto error; diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index 469f2e8657e8..cebf2ebefb55 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -172,14 +172,24 @@ out: int ceph_init_acl(struct dentry *dentry, struct inode *inode, struct inode *dir) { struct posix_acl *default_acl, *acl; + umode_t new_mode = inode->i_mode; int error; - error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + error = posix_acl_create(dir, &new_mode, &default_acl, &acl); if (error) return error; - if (!default_acl && !acl) + if (!default_acl && !acl) { cache_no_acl(inode); + if (new_mode != inode->i_mode) { + struct iattr newattrs = { + .ia_mode = new_mode, + .ia_valid = ATTR_MODE, + }; + error = ceph_setattr(dentry, &newattrs); + } + return error; + } if (default_acl) { error = ceph_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 1fde164b74b5..6d1cd45dca89 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -3277,7 +3277,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode, rel->ino = cpu_to_le64(ceph_ino(inode)); rel->cap_id = cpu_to_le64(cap->cap_id); rel->seq = cpu_to_le32(cap->seq); - rel->issue_seq = cpu_to_le32(cap->issue_seq), + rel->issue_seq = cpu_to_le32(cap->issue_seq); rel->mseq = cpu_to_le32(cap->mseq); rel->caps = cpu_to_le32(cap->implemented); rel->wanted = cpu_to_le32(cap->mds_wanted); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 302085100c28..2eb02f80a0ab 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -423,6 +423,9 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len, (file->f_flags & O_DIRECT) ? 
"O_DIRECT" : ""); + + if (!len) + return 0; /* * flush any page cache pages in this range. this * will make concurrent normal and sync io slow, @@ -470,8 +473,11 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, size_t left = ret; while (left) { - int copy = min_t(size_t, PAGE_SIZE, left); - l = copy_page_to_iter(pages[k++], 0, copy, i); + size_t page_off = off & ~PAGE_MASK; + size_t copy = min_t(size_t, + PAGE_SIZE - page_off, left); + l = copy_page_to_iter(pages[k++], page_off, + copy, i); off += l; left -= l; if (l < copy) @@ -531,7 +537,7 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe) * objects, rollback on failure, etc.) */ static ssize_t -ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from) +ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); @@ -547,7 +553,6 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from) int check_caps = 0; int ret; struct timespec mtime = CURRENT_TIME; - loff_t pos = iocb->ki_pos; size_t count = iov_iter_count(from); if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) @@ -646,7 +651,8 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from) * correct atomic write, we should e.g. take write locks on all * objects, rollback on failure, etc.) */ -static ssize_t ceph_sync_write(struct kiocb *iocb, struct iov_iter *from) +static ssize_t +ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); @@ -663,7 +669,6 @@ static ssize_t ceph_sync_write(struct kiocb *iocb, struct iov_iter *from) int check_caps = 0; int ret; struct timespec mtime = CURRENT_TIME; - loff_t pos = iocb->ki_pos; size_t count = iov_iter_count(from); if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) @@ -918,9 +923,9 @@ retry_snap: /* we might need to revert back to that point */ data = *from; if (file->f_flags & O_DIRECT) - written = ceph_sync_direct_write(iocb, &data); + written = ceph_sync_direct_write(iocb, &data, pos); else - written = ceph_sync_write(iocb, &data); + written = ceph_sync_write(iocb, &data, pos); if (written == -EOLDSNAPC) { dout("aio_write %p %llx.%llx %llu~%u" "got EOLDSNAPC, retrying\n", @@ -1177,6 +1182,9 @@ static long ceph_fallocate(struct file *file, int mode, loff_t endoff = 0; loff_t size; + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 92a2548278fc..bad07c09f91e 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1904,6 +1904,7 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts); if (req->r_got_unsafe) { + void *p; /* * Replay. Do not regenerate message (and rebuild * paths, etc.); just use the original message. 
@@ -1924,8 +1925,13 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, /* remove cap/dentry releases from message */ rhead->num_releases = 0; - msg->hdr.front_len = cpu_to_le32(req->r_request_release_offset); - msg->front.iov_len = req->r_request_release_offset; + + /* time stamp */ + p = msg->front.iov_base + req->r_request_release_offset; + ceph_encode_copy(&p, &req->r_stamp, sizeof(req->r_stamp)); + + msg->front.iov_len = p - msg->front.iov_base; + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); return 0; } @@ -2061,11 +2067,12 @@ static void __wake_requests(struct ceph_mds_client *mdsc, static void kick_requests(struct ceph_mds_client *mdsc, int mds) { struct ceph_mds_request *req; - struct rb_node *p; + struct rb_node *p = rb_first(&mdsc->request_tree); dout("kick_requests mds%d\n", mds); - for (p = rb_first(&mdsc->request_tree); p; p = rb_next(p)) { + while (p) { req = rb_entry(p, struct ceph_mds_request, r_node); + p = rb_next(p); if (req->r_got_unsafe) continue; if (req->r_session && @@ -2248,6 +2255,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) */ if (result == -ESTALE) { dout("got ESTALE on request %llu", req->r_tid); + req->r_resend_mds = -1; if (req->r_direct_mode != USE_AUTH_MDS) { dout("not using auth, setting for that now"); req->r_direct_mode = USE_AUTH_MDS; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 06150fd745ac..f6e12377335c 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -755,7 +755,7 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, goto out; } } else { - root = d_obtain_alias(inode); + root = d_obtain_root(inode); } ceph_init_dentry(root); dout("open_root_inode success, root dentry is %p\n", root); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index c9c2b887381e..12f58d22e017 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -592,12 +592,12 @@ start: xattr_version = ci->i_xattrs.version; spin_unlock(&ci->i_ceph_lock); - xattrs = kcalloc(numattr, sizeof(struct ceph_xattr *), + xattrs = kcalloc(numattr, sizeof(struct ceph_inode_xattr *), GFP_NOFS); err = -ENOMEM; if (!xattrs) goto bad_lock; - memset(xattrs, 0, numattr*sizeof(struct ceph_xattr *)); + for (i = 0; i < numattr; i++) { xattrs[i] = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS); diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index 603f18a65c12..a2172f3f69e3 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -22,6 +22,11 @@ config CIFS support for OS/2 and Windows ME and similar servers is provided as well. + The module also provides optional support for the follow-on + protocols for CIFS including SMB3, which enables + useful performance and security features (see the description + of CONFIG_CIFS_SMB2). + The cifs module provides an advanced network file system client for mounting to CIFS compliant servers. It includes support for DFS (hierarchical name space), secure per-user @@ -121,7 +126,8 @@ config CIFS_ACL depends on CIFS_XATTR && KEYS help Allows fetching CIFS/NTFS ACL from the server. The DACL blob - is handed over to the application/caller. + is handed over to the application/caller. See the man + page for getcifsacl for more information. 
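In the kick_requests() hunk above, the tree cursor is advanced with rb_next() before the request is handled, because kicking a request may remove it from request_tree and invalidate the node the cursor points at. The same grab-the-successor-first pattern is shown below on a plain singly linked list (illustrative only; not the kernel rbtree API):

#include <stdio.h>
#include <stdlib.h>

struct node {
        int id;
        struct node *next;
};

/* Visit every node even though handling a node may destroy it: the
 * successor is saved before the node is touched, as kick_requests()
 * now does with rb_next(). */
static void kick_all(struct node *head)
{
        struct node *n = head;

        while (n) {
                struct node *next = n->next;    /* grab successor first */

                printf("kicking %d\n", n->id);
                free(n);                /* 'n' is gone, 'next' is safe */
                n = next;
        }
}

int main(void)
{
        struct node *head = NULL;

        for (int i = 3; i >= 1; i--) {
                struct node *n = malloc(sizeof(*n));

                n->id = i;
                n->next = head;
                head = n;
        }
        kick_all(head);
        return 0;
}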
config CIFS_DEBUG bool "Enable CIFS debugging routines" @@ -162,7 +168,7 @@ config CIFS_NFSD_EXPORT Allows NFS server to export a CIFS mounted share (nfsd over cifs) config CIFS_SMB2 - bool "SMB2 network file system support" + bool "SMB2 and SMB3 network file system support" depends on CIFS && INET select NLS select KEYS @@ -170,16 +176,21 @@ config CIFS_SMB2 select DNS_RESOLVER help - This enables experimental support for the SMB2 (Server Message Block - version 2) protocol. The SMB2 protocol is the successor to the - popular CIFS and SMB network file sharing protocols. SMB2 is the - native file sharing mechanism for recent versions of Windows - operating systems (since Vista). SMB2 enablement will eventually - allow users better performance, security and features, than would be - possible with cifs. Note that smb2 mount options also are simpler - (compared to cifs) due to protocol improvements. - - Unless you are a developer or tester, say N. + This enables support for the Server Message Block version 2 + family of protocols, including SMB3. SMB3 support is + enabled on mount by specifying "vers=3.0" in the mount + options. These protocols are the successors to the popular + CIFS and SMB network file sharing protocols. SMB3 is the + native file sharing mechanism for recent versions of + Windows (Windows 8 and Windows Server 2012 and later), and + is also well supported by Samba and many other servers. + In general SMB3 enables better performance, security and + features than are possible with CIFS. (Note that when + mounting to Samba, CIFS mounts can provide slightly better + POSIX compatibility than SMB3 mounts, due to the CIFS POSIX + extensions.) SMB2/SMB3 mount options are also slightly + simpler than CIFS mount options due to protocol + improvements. 
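As the revised help text says, the dialect is selected at mount time with "vers=3.0" and similar strings. Internally cifs resolves that option against per-dialect tables of operations and constants; below is a hedged sketch of such a lookup, with invented entries and field names rather than the real smb_version_values structures:

#include <stdio.h>
#include <string.h>

/* Illustrative dialect table; the fields are invented and do not
 * match the real cifs smb_version_operations/smb_version_values. */
struct dialect {
        const char *vers;       /* value of the "vers=" mount option */
        const char *name;
        unsigned int max_write; /* made-up per-dialect constant */
};

static const struct dialect dialects[] = {
        { "1.0", "SMB1/CIFS", 1 },
        { "2.1", "SMB2.1",    32 },
        { "3.0", "SMB3",      128 },
};

static const struct dialect *pick_dialect(const char *vers)
{
        for (size_t i = 0; i < sizeof(dialects) / sizeof(dialects[0]); i++)
                if (strcmp(dialects[i].vers, vers) == 0)
                        return &dialects[i];
        return NULL;            /* unknown "vers=": fail the mount */
}

int main(void)
{
        const struct dialect *d = pick_dialect("3.0");

        if (d)
                printf("negotiating %s (max_write=%u)\n", d->name,
                       d->max_write);
        return d ? 0 : 1;
}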
config CIFS_FSCACHE bool "Provide CIFS client caching support" diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index f3ac4154cbb6..44ec72684df5 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -213,7 +213,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) tcon->nativeFileSystem); } seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x" - "\n\tPathComponentMax: %d Status: 0x%d", + "\n\tPathComponentMax: %d Status: %d", le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics), le32_to_cpu(tcon->fsAttrInfo.Attributes), le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength), diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 888398067420..9d7996e8e793 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -207,6 +207,19 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } +static long cifs_fallocate(struct file *file, int mode, loff_t off, loff_t len) +{ + struct super_block *sb = file->f_path.dentry->d_sb; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + struct TCP_Server_Info *server = tcon->ses->server; + + if (server->ops->fallocate) + return server->ops->fallocate(file, tcon, mode, off, len); + + return -EOPNOTSUPP; +} + static int cifs_permission(struct inode *inode, int mask) { struct cifs_sb_info *cifs_sb; @@ -800,7 +813,8 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int whence) return generic_file_llseek(file, offset, whence); } -static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) +static int +cifs_setlease(struct file *file, long arg, struct file_lock **lease, void **priv) { /* * Note that this is called by vfs setlease with i_lock held to @@ -812,10 +826,11 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) if (!(S_ISREG(inode->i_mode))) return -EINVAL; - /* check if file is oplocked */ - if (((arg == F_RDLCK) && CIFS_CACHE_READ(CIFS_I(inode))) || + /* Check if file is oplocked if this is request for new lease */ + if (arg == F_UNLCK || + ((arg == F_RDLCK) && CIFS_CACHE_READ(CIFS_I(inode))) || ((arg == F_WRLCK) && CIFS_CACHE_WRITE(CIFS_I(inode)))) - return generic_setlease(file, arg, lease); + return generic_setlease(file, arg, lease, priv); else if (tlink_tcon(cfile->tlink)->local_lease && !CIFS_CACHE_READ(CIFS_I(inode))) /* @@ -826,7 +841,7 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) * knows that the file won't be changed on the server by anyone * else. 
*/ - return generic_setlease(file, arg, lease); + return generic_setlease(file, arg, lease, priv); else return -EAGAIN; } @@ -848,7 +863,7 @@ const struct inode_operations cifs_dir_inode_ops = { .link = cifs_hardlink, .mkdir = cifs_mkdir, .rmdir = cifs_rmdir, - .rename = cifs_rename, + .rename2 = cifs_rename2, .permission = cifs_permission, /* revalidate:cifs_revalidate, */ .setattr = cifs_setattr, @@ -908,6 +923,7 @@ const struct file_operations cifs_file_ops = { .unlocked_ioctl = cifs_ioctl, #endif /* CONFIG_CIFS_POSIX */ .setlease = cifs_setlease, + .fallocate = cifs_fallocate, }; const struct file_operations cifs_file_strict_ops = { @@ -927,6 +943,7 @@ const struct file_operations cifs_file_strict_ops = { .unlocked_ioctl = cifs_ioctl, #endif /* CONFIG_CIFS_POSIX */ .setlease = cifs_setlease, + .fallocate = cifs_fallocate, }; const struct file_operations cifs_file_direct_ops = { @@ -947,6 +964,7 @@ const struct file_operations cifs_file_direct_ops = { #endif /* CONFIG_CIFS_POSIX */ .llseek = cifs_llseek, .setlease = cifs_setlease, + .fallocate = cifs_fallocate, }; const struct file_operations cifs_file_nobrl_ops = { @@ -965,6 +983,7 @@ const struct file_operations cifs_file_nobrl_ops = { .unlocked_ioctl = cifs_ioctl, #endif /* CONFIG_CIFS_POSIX */ .setlease = cifs_setlease, + .fallocate = cifs_fallocate, }; const struct file_operations cifs_file_strict_nobrl_ops = { @@ -983,6 +1002,7 @@ const struct file_operations cifs_file_strict_nobrl_ops = { .unlocked_ioctl = cifs_ioctl, #endif /* CONFIG_CIFS_POSIX */ .setlease = cifs_setlease, + .fallocate = cifs_fallocate, }; const struct file_operations cifs_file_direct_nobrl_ops = { @@ -1002,6 +1022,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = { #endif /* CONFIG_CIFS_POSIX */ .llseek = cifs_llseek, .setlease = cifs_setlease, + .fallocate = cifs_fallocate, }; const struct file_operations cifs_dir_ops = { diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 70f178a7c759..002e0c173939 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -68,8 +68,8 @@ extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *); extern int cifs_mknod(struct inode *, struct dentry *, umode_t, dev_t); extern int cifs_mkdir(struct inode *, struct dentry *, umode_t); extern int cifs_rmdir(struct inode *, struct dentry *); -extern int cifs_rename(struct inode *, struct dentry *, struct inode *, - struct dentry *); +extern int cifs_rename2(struct inode *, struct dentry *, struct inode *, + struct dentry *, unsigned int); extern int cifs_revalidate_file_attr(struct file *filp); extern int cifs_revalidate_dentry_attr(struct dentry *); extern int cifs_revalidate_file(struct file *filp); @@ -136,5 +136,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.03" +#define CIFS_VERSION "2.05" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index de6aed8c78e5..25b8392bfdd2 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -70,11 +70,6 @@ #define SERVER_NAME_LENGTH 40 #define SERVER_NAME_LEN_WITH_NULL (SERVER_NAME_LENGTH + 1) -/* used to define string lengths for reversing unicode strings */ -/* (256+1)*2 = 514 */ -/* (max path length + 1 for null) * 2 for unicode */ -#define MAX_NAME 514 - /* SMB echo "timeout" -- FIXME: tunable? 
*/ #define SMB_ECHO_INTERVAL (60 * HZ) @@ -404,6 +399,15 @@ struct smb_version_operations { const struct cifs_fid *, u32 *); int (*set_acl)(struct cifs_ntsd *, __u32, struct inode *, const char *, int); + /* writepages retry size */ + unsigned int (*wp_retry_size)(struct inode *); + /* get mtu credits */ + int (*wait_mtu_credits)(struct TCP_Server_Info *, unsigned int, + unsigned int *, unsigned int *); + /* check if we need to issue closedir */ + bool (*dir_needs_close)(struct cifsFileInfo *); + long (*fallocate)(struct file *, struct cifs_tcon *, int, loff_t, + loff_t); }; struct smb_version_values { @@ -640,6 +644,16 @@ add_credits(struct TCP_Server_Info *server, const unsigned int add, } static inline void +add_credits_and_wake_if(struct TCP_Server_Info *server, const unsigned int add, + const int optype) +{ + if (add) { + server->ops->add_credits(server, add, optype); + wake_up(&server->request_q); + } +} + +static inline void set_credits(struct TCP_Server_Info *server, const int val) { server->ops->set_credits(server, val); @@ -868,6 +882,7 @@ struct cifs_tcon { for this mount even if server would support */ bool local_lease:1; /* check leases (only) on local system not remote */ bool broken_posix_open; /* e.g. Samba server versions < 3.3.2, 3.2.9 */ + bool broken_sparse_sup; /* if server or share does not support sparse */ bool need_reconnect:1; /* connection reset, tid now invalid */ #ifdef CONFIG_CIFS_SMB2 bool print:1; /* set if connection to printer share */ @@ -1044,6 +1059,7 @@ struct cifs_readdata { struct address_space *mapping; __u64 offset; unsigned int bytes; + unsigned int got_bytes; pid_t pid; int result; struct work_struct work; @@ -1053,6 +1069,7 @@ struct cifs_readdata { struct kvec iov; unsigned int pagesz; unsigned int tailsz; + unsigned int credits; unsigned int nr_pages; struct page *pages[]; }; @@ -1073,6 +1090,7 @@ struct cifs_writedata { int result; unsigned int pagesz; unsigned int tailsz; + unsigned int credits; unsigned int nr_pages; struct page *pages[]; }; @@ -1398,6 +1416,7 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param, #define CIFS_OBREAK_OP 0x0100 /* oplock break request */ #define CIFS_NEG_OP 0x0200 /* negotiate request */ #define CIFS_OP_MASK 0x0380 /* mask request type */ +#define CIFS_HAS_CREDITS 0x0400 /* already has credits */ /* Security Flags: indicate type of session setup needed */ #define CIFSSEC_MAY_SIGN 0x00001 diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index 33df36ef9d52..5f9822ac0245 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -2253,6 +2253,29 @@ typedef struct { /* minimum includes first three fields, and empty FS Name */ #define MIN_FS_ATTR_INFO_SIZE 12 + +/* List of FileSystemAttributes - see 2.5.1 of MS-FSCC */ +#define FILE_SUPPORT_INTEGRITY_STREAMS 0x04000000 +#define FILE_SUPPORTS_USN_JOURNAL 0x02000000 +#define FILE_SUPPORTS_OPEN_BY_FILE_ID 0x01000000 +#define FILE_SUPPORTS_EXTENDED_ATTRIBUTES 0x00800000 +#define FILE_SUPPORTS_HARD_LINKS 0x00400000 +#define FILE_SUPPORTS_TRANSACTIONS 0x00200000 +#define FILE_SEQUENTIAL_WRITE_ONCE 0x00100000 +#define FILE_READ_ONLY_VOLUME 0x00080000 +#define FILE_NAMED_STREAMS 0x00040000 +#define FILE_SUPPORTS_ENCRYPTION 0x00020000 +#define FILE_SUPPORTS_OBJECT_IDS 0x00010000 +#define FILE_VOLUME_IS_COMPRESSED 0x00008000 +#define FILE_SUPPORTS_REMOTE_STORAGE 0x00000100 +#define FILE_SUPPORTS_REPARSE_POINTS 0x00000080 +#define FILE_SUPPORTS_SPARSE_FILES 0x00000040 +#define FILE_VOLUME_QUOTAS 0x00000020 +#define FILE_FILE_COMPRESSION 0x00000010 
+#define FILE_PERSISTENT_ACLS 0x00000008 +#define FILE_UNICODE_ON_DISK 0x00000004 +#define FILE_CASE_PRESERVED_NAMES 0x00000002 +#define FILE_CASE_SENSITIVE_SEARCH 0x00000001 typedef struct { __le32 Attributes; __le32 MaxPathNameComponentLength; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index ca7980a1e303..c31ce98c1704 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -36,6 +36,7 @@ extern struct smb_hdr *cifs_buf_get(void); extern void cifs_buf_release(void *); extern struct smb_hdr *cifs_small_buf_get(void); extern void cifs_small_buf_release(void *); +extern void free_rsp_buf(int, void *); extern void cifs_rqst_page_to_kvec(struct smb_rqst *rqst, unsigned int idx, struct kvec *iov); extern int smb_send(struct TCP_Server_Info *, struct smb_hdr *, @@ -89,6 +90,9 @@ extern struct mid_q_entry *cifs_setup_async_request(struct TCP_Server_Info *, struct smb_rqst *); extern int cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server, bool log_error); +extern int cifs_wait_mtu_credits(struct TCP_Server_Info *server, + unsigned int size, unsigned int *num, + unsigned int *credits); extern int SendReceive2(const unsigned int /* xid */ , struct cifs_ses *, struct kvec *, int /* nvec to send */, int * /* type of buf returned */ , const int flags); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 6ce4e0954b98..66f65001a6d8 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -196,10 +196,6 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) if (rc) goto out; - /* - * FIXME: check if wsize needs updated due to negotiated smb buffer - * size shrinking - */ atomic_inc(&tconInfoReconnectCount); /* tell server Unix caps we support */ @@ -1517,7 +1513,6 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) return length; server->total_read += length; - rdata->bytes = length; cifs_dbg(FYI, "total_read=%u buflen=%u remaining=%u\n", server->total_read, buflen, data_len); @@ -1560,12 +1555,18 @@ cifs_readv_callback(struct mid_q_entry *mid) rc); } /* FIXME: should this be counted toward the initiating task? */ - task_io_account_read(rdata->bytes); - cifs_stats_bytes_read(tcon, rdata->bytes); + task_io_account_read(rdata->got_bytes); + cifs_stats_bytes_read(tcon, rdata->got_bytes); break; case MID_REQUEST_SUBMITTED: case MID_RETRY_NEEDED: rdata->result = -EAGAIN; + if (server->sign && rdata->got_bytes) + /* reset bytes number since we can not check a sign */ + rdata->got_bytes = 0; + /* FIXME: should this be counted toward the initiating task? 
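 *
 * [Editorial aside -- not in the patch. On MID_RETRY_NEEDED the bytes that
 *  did arrive are only credited when the connection is unsigned: with
 *  signing enabled the signature of the aborted response can no longer be
 *  verified, so got_bytes is zeroed above and the retry rereads from the
 *  start of the range.]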
*/ + task_io_account_read(rdata->got_bytes); + cifs_stats_bytes_read(tcon, rdata->got_bytes); break; default: rdata->result = -EIO; @@ -1734,10 +1735,7 @@ CIFSSMBRead(const unsigned int xid, struct cifs_io_parms *io_parms, /* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */ if (*buf) { - if (resp_buf_type == CIFS_SMALL_BUFFER) - cifs_small_buf_release(iov[0].iov_base); - else if (resp_buf_type == CIFS_LARGE_BUFFER) - cifs_buf_release(iov[0].iov_base); + free_rsp_buf(resp_buf_type, iov[0].iov_base); } else if (resp_buf_type != CIFS_NO_BUFFER) { /* return buffer to caller to free */ *buf = iov[0].iov_base; @@ -1899,28 +1897,80 @@ cifs_writedata_release(struct kref *refcount) static void cifs_writev_requeue(struct cifs_writedata *wdata) { - int i, rc; + int i, rc = 0; struct inode *inode = wdata->cfile->dentry->d_inode; struct TCP_Server_Info *server; + unsigned int rest_len; - for (i = 0; i < wdata->nr_pages; i++) { - lock_page(wdata->pages[i]); - clear_page_dirty_for_io(wdata->pages[i]); - } - + server = tlink_tcon(wdata->cfile->tlink)->ses->server; + i = 0; + rest_len = wdata->bytes; do { - server = tlink_tcon(wdata->cfile->tlink)->ses->server; - rc = server->ops->async_writev(wdata, cifs_writedata_release); - } while (rc == -EAGAIN); + struct cifs_writedata *wdata2; + unsigned int j, nr_pages, wsize, tailsz, cur_len; + + wsize = server->ops->wp_retry_size(inode); + if (wsize < rest_len) { + nr_pages = wsize / PAGE_CACHE_SIZE; + if (!nr_pages) { + rc = -ENOTSUPP; + break; + } + cur_len = nr_pages * PAGE_CACHE_SIZE; + tailsz = PAGE_CACHE_SIZE; + } else { + nr_pages = DIV_ROUND_UP(rest_len, PAGE_CACHE_SIZE); + cur_len = rest_len; + tailsz = rest_len - (nr_pages - 1) * PAGE_CACHE_SIZE; + } - for (i = 0; i < wdata->nr_pages; i++) { - unlock_page(wdata->pages[i]); - if (rc != 0) { - SetPageError(wdata->pages[i]); - end_page_writeback(wdata->pages[i]); - page_cache_release(wdata->pages[i]); + wdata2 = cifs_writedata_alloc(nr_pages, cifs_writev_complete); + if (!wdata2) { + rc = -ENOMEM; + break; } - } + + for (j = 0; j < nr_pages; j++) { + wdata2->pages[j] = wdata->pages[i + j]; + lock_page(wdata2->pages[j]); + clear_page_dirty_for_io(wdata2->pages[j]); + } + + wdata2->sync_mode = wdata->sync_mode; + wdata2->nr_pages = nr_pages; + wdata2->offset = page_offset(wdata2->pages[0]); + wdata2->pagesz = PAGE_CACHE_SIZE; + wdata2->tailsz = tailsz; + wdata2->bytes = cur_len; + + wdata2->cfile = find_writable_file(CIFS_I(inode), false); + if (!wdata2->cfile) { + cifs_dbg(VFS, "No writable handles for inode\n"); + rc = -EBADF; + break; + } + wdata2->pid = wdata2->cfile->pid; + rc = server->ops->async_writev(wdata2, cifs_writedata_release); + + for (j = 0; j < nr_pages; j++) { + unlock_page(wdata2->pages[j]); + if (rc != 0 && rc != -EAGAIN) { + SetPageError(wdata2->pages[j]); + end_page_writeback(wdata2->pages[j]); + page_cache_release(wdata2->pages[j]); + } + } + + if (rc) { + kref_put(&wdata2->refcount, cifs_writedata_release); + if (rc == -EAGAIN) + continue; + break; + } + + rest_len -= cur_len; + i += nr_pages; + } while (i < wdata->nr_pages); mapping_set_error(inode->i_mapping, rc); kref_put(&wdata->refcount, cifs_writedata_release); @@ -2203,10 +2253,7 @@ CIFSSMBWrite2(const unsigned int xid, struct cifs_io_parms *io_parms, } /* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */ - if (resp_buf_type == CIFS_SMALL_BUFFER) - cifs_small_buf_release(iov[0].iov_base); - else if (resp_buf_type == CIFS_LARGE_BUFFER) - cifs_buf_release(iov[0].iov_base); + 
free_rsp_buf(resp_buf_type, iov[0].iov_base); /* Note: On -EAGAIN error only caller can retry on handle based calls since file handle passed in no longer valid */ @@ -2451,10 +2498,7 @@ plk_err_exit: if (pSMB) cifs_small_buf_release(pSMB); - if (resp_buf_type == CIFS_SMALL_BUFFER) - cifs_small_buf_release(iov[0].iov_base); - else if (resp_buf_type == CIFS_LARGE_BUFFER) - cifs_buf_release(iov[0].iov_base); + free_rsp_buf(resp_buf_type, iov[0].iov_base); /* Note: On -EAGAIN error only caller can retry on handle based calls since file handle passed in no longer valid */ @@ -3838,10 +3882,7 @@ CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid, } } qsec_out: - if (buf_type == CIFS_SMALL_BUFFER) - cifs_small_buf_release(iov[0].iov_base); - else if (buf_type == CIFS_LARGE_BUFFER) - cifs_buf_release(iov[0].iov_base); + free_rsp_buf(buf_type, iov[0].iov_base); /* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */ return rc; } diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 20d75b8ddb26..36ca2045009b 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -557,7 +557,7 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig, try_to_freeze(); if (server_unresponsive(server)) { - total_read = -EAGAIN; + total_read = -ECONNABORTED; break; } @@ -571,7 +571,7 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig, break; } else if (server->tcpStatus == CifsNeedReconnect) { cifs_reconnect(server); - total_read = -EAGAIN; + total_read = -ECONNABORTED; break; } else if (length == -ERESTARTSYS || length == -EAGAIN || @@ -588,7 +588,7 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig, cifs_dbg(FYI, "Received no data or error: expecting %d\n" "got %d", to_read, length); cifs_reconnect(server); - total_read = -EAGAIN; + total_read = -ECONNABORTED; break; } } @@ -786,7 +786,7 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid) cifs_dbg(VFS, "SMB response too long (%u bytes)\n", pdu_length); cifs_reconnect(server); wake_up(&server->response_q); - return -EAGAIN; + return -ECONNABORTED; } /* switch to large buffer if too big for a small one */ @@ -1600,6 +1600,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, tmp_end++; if (!(tmp_end < end && tmp_end[1] == delim)) { /* No it is not. Set the password to NULL */ + kfree(vol->password); vol->password = NULL; break; } @@ -1637,6 +1638,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, options = end; } + kfree(vol->password); /* Now build new password string */ temp_len = strlen(value); vol->password = kzalloc(temp_len+1, GFP_KERNEL); @@ -3934,13 +3936,6 @@ cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb) return tlink_tcon(cifs_sb_master_tlink(cifs_sb)); } -static int -cifs_sb_tcon_pending_wait(void *unused) -{ - schedule(); - return signal_pending(current) ? 
-ERESTARTSYS : 0; -} - /* find and return a tlink with given uid */ static struct tcon_link * tlink_rb_search(struct rb_root *root, kuid_t uid) @@ -4039,11 +4034,10 @@ cifs_sb_tlink(struct cifs_sb_info *cifs_sb) } else { wait_for_construction: ret = wait_on_bit(&tlink->tl_flags, TCON_LINK_PENDING, - cifs_sb_tcon_pending_wait, TASK_INTERRUPTIBLE); if (ret) { cifs_put_tlink(tlink); - return ERR_PTR(ret); + return ERR_PTR(-ERESTARTSYS); } /* if it's good, return it */ diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 3db0c5fd9a11..6cbd9c688cfe 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -497,6 +497,14 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, goto out; } + if (file->f_flags & O_DIRECT && + CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) { + if (CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) + file->f_op = &cifs_file_direct_nobrl_ops; + else + file->f_op = &cifs_file_direct_ops; + } + file_info = cifs_new_fileinfo(&fid, file, tlink, oplock); if (file_info == NULL) { if (server->ops->close) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index e90a1e9aa627..5f29354b072a 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -467,6 +467,14 @@ int cifs_open(struct inode *inode, struct file *file) cifs_dbg(FYI, "inode = 0x%p file flags are 0x%x for %s\n", inode, file->f_flags, full_path); + if (file->f_flags & O_DIRECT && + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) { + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) + file->f_op = &cifs_file_direct_nobrl_ops; + else + file->f_op = &cifs_file_direct_ops; + } + if (server->oplocks) oplock = REQ_OPLOCK; else @@ -762,7 +770,7 @@ int cifs_closedir(struct inode *inode, struct file *file) cifs_dbg(FYI, "Freeing private data in close dir\n"); spin_lock(&cifs_file_list_lock); - if (!cfile->srch_inf.endOfSearch && !cfile->invalidHandle) { + if (server->ops->dir_needs_close(cfile)) { cfile->invalidHandle = true; spin_unlock(&cifs_file_list_lock); if (server->ops->close_dir) @@ -1670,8 +1678,8 @@ cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data, break; } - len = min((size_t)cifs_sb->wsize, - write_size - total_written); + len = min(server->ops->wp_retry_size(dentry->d_inode), + (unsigned int)write_size - total_written); /* iov[0] is reserved for smb header */ iov[1].iov_base = (char *)write_data + total_written; iov[1].iov_len = len; @@ -1878,15 +1886,163 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) return rc; } +static struct cifs_writedata * +wdata_alloc_and_fillpages(pgoff_t tofind, struct address_space *mapping, + pgoff_t end, pgoff_t *index, + unsigned int *found_pages) +{ + unsigned int nr_pages; + struct page **pages; + struct cifs_writedata *wdata; + + wdata = cifs_writedata_alloc((unsigned int)tofind, + cifs_writev_complete); + if (!wdata) + return NULL; + + /* + * find_get_pages_tag seems to return a max of 256 on each + * iteration, so we must call it several times in order to + * fill the array or the wsize is effectively limited to + * 256 * PAGE_CACHE_SIZE. 
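 *
 * [Editorial aside -- not in the patch. The 256 cap is an observed batching
 *  behaviour of find_get_pages_tag(), not a documented contract; hence the
 *  loop below keeps advancing the pages cursor and decrementing tofind
 *  until the array is full, the index passes the end of the range, or no
 *  further dirty pages are tagged.]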
+ */ + *found_pages = 0; + pages = wdata->pages; + do { + nr_pages = find_get_pages_tag(mapping, index, + PAGECACHE_TAG_DIRTY, tofind, + pages); + *found_pages += nr_pages; + tofind -= nr_pages; + pages += nr_pages; + } while (nr_pages && tofind && *index <= end); + + return wdata; +} + +static unsigned int +wdata_prepare_pages(struct cifs_writedata *wdata, unsigned int found_pages, + struct address_space *mapping, + struct writeback_control *wbc, + pgoff_t end, pgoff_t *index, pgoff_t *next, bool *done) +{ + unsigned int nr_pages = 0, i; + struct page *page; + + for (i = 0; i < found_pages; i++) { + page = wdata->pages[i]; + /* + * At this point we hold neither mapping->tree_lock nor + * lock on the page itself: the page may be truncated or + * invalidated (changing page->mapping to NULL), or even + * swizzled back from swapper_space to tmpfs file + * mapping + */ + + if (nr_pages == 0) + lock_page(page); + else if (!trylock_page(page)) + break; + + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + break; + } + + if (!wbc->range_cyclic && page->index > end) { + *done = true; + unlock_page(page); + break; + } + + if (*next && (page->index != *next)) { + /* Not next consecutive page */ + unlock_page(page); + break; + } + + if (wbc->sync_mode != WB_SYNC_NONE) + wait_on_page_writeback(page); + + if (PageWriteback(page) || + !clear_page_dirty_for_io(page)) { + unlock_page(page); + break; + } + + /* + * This actually clears the dirty bit in the radix tree. + * See cifs_writepage() for more commentary. + */ + set_page_writeback(page); + if (page_offset(page) >= i_size_read(mapping->host)) { + *done = true; + unlock_page(page); + end_page_writeback(page); + break; + } + + wdata->pages[i] = page; + *next = page->index + 1; + ++nr_pages; + } + + /* reset index to refind any pages skipped */ + if (nr_pages == 0) + *index = wdata->pages[0]->index + 1; + + /* put any pages we aren't going to use */ + for (i = nr_pages; i < found_pages; i++) { + page_cache_release(wdata->pages[i]); + wdata->pages[i] = NULL; + } + + return nr_pages; +} + +static int +wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages, + struct address_space *mapping, struct writeback_control *wbc) +{ + int rc = 0; + struct TCP_Server_Info *server; + unsigned int i; + + wdata->sync_mode = wbc->sync_mode; + wdata->nr_pages = nr_pages; + wdata->offset = page_offset(wdata->pages[0]); + wdata->pagesz = PAGE_CACHE_SIZE; + wdata->tailsz = min(i_size_read(mapping->host) - + page_offset(wdata->pages[nr_pages - 1]), + (loff_t)PAGE_CACHE_SIZE); + wdata->bytes = ((nr_pages - 1) * PAGE_CACHE_SIZE) + wdata->tailsz; + + if (wdata->cfile != NULL) + cifsFileInfo_put(wdata->cfile); + wdata->cfile = find_writable_file(CIFS_I(mapping->host), false); + if (!wdata->cfile) { + cifs_dbg(VFS, "No writable handles for inode\n"); + rc = -EBADF; + } else { + wdata->pid = wdata->cfile->pid; + server = tlink_tcon(wdata->cfile->tlink)->ses->server; + rc = server->ops->async_writev(wdata, cifs_writedata_release); + } + + for (i = 0; i < nr_pages; ++i) + unlock_page(wdata->pages[i]); + + return rc; +} + static int cifs_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct cifs_sb_info *cifs_sb = CIFS_SB(mapping->host->i_sb); + struct TCP_Server_Info *server; bool done = false, scanned = false, range_whole = false; pgoff_t end, index; struct cifs_writedata *wdata; - struct TCP_Server_Info *server; - struct page *page; int rc = 0; /* @@ -1906,152 +2062,50 @@ static int cifs_writepages(struct address_space 
*mapping, range_whole = true; scanned = true; } + server = cifs_sb_master_tcon(cifs_sb)->ses->server; retry: while (!done && index <= end) { - unsigned int i, nr_pages, found_pages; - pgoff_t next = 0, tofind; - struct page **pages; + unsigned int i, nr_pages, found_pages, wsize, credits; + pgoff_t next = 0, tofind, saved_index = index; - tofind = min((cifs_sb->wsize / PAGE_CACHE_SIZE) - 1, - end - index) + 1; + rc = server->ops->wait_mtu_credits(server, cifs_sb->wsize, + &wsize, &credits); + if (rc) + break; - wdata = cifs_writedata_alloc((unsigned int)tofind, - cifs_writev_complete); + tofind = min((wsize / PAGE_CACHE_SIZE) - 1, end - index) + 1; + + wdata = wdata_alloc_and_fillpages(tofind, mapping, end, &index, + &found_pages); if (!wdata) { rc = -ENOMEM; + add_credits_and_wake_if(server, credits, 0); break; } - /* - * find_get_pages_tag seems to return a max of 256 on each - * iteration, so we must call it several times in order to - * fill the array or the wsize is effectively limited to - * 256 * PAGE_CACHE_SIZE. - */ - found_pages = 0; - pages = wdata->pages; - do { - nr_pages = find_get_pages_tag(mapping, &index, - PAGECACHE_TAG_DIRTY, - tofind, pages); - found_pages += nr_pages; - tofind -= nr_pages; - pages += nr_pages; - } while (nr_pages && tofind && index <= end); - if (found_pages == 0) { kref_put(&wdata->refcount, cifs_writedata_release); + add_credits_and_wake_if(server, credits, 0); break; } - nr_pages = 0; - for (i = 0; i < found_pages; i++) { - page = wdata->pages[i]; - /* - * At this point we hold neither mapping->tree_lock nor - * lock on the page itself: the page may be truncated or - * invalidated (changing page->mapping to NULL), or even - * swizzled back from swapper_space to tmpfs file - * mapping - */ - - if (nr_pages == 0) - lock_page(page); - else if (!trylock_page(page)) - break; - - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - break; - } - - if (!wbc->range_cyclic && page->index > end) { - done = true; - unlock_page(page); - break; - } - - if (next && (page->index != next)) { - /* Not next consecutive page */ - unlock_page(page); - break; - } - - if (wbc->sync_mode != WB_SYNC_NONE) - wait_on_page_writeback(page); - - if (PageWriteback(page) || - !clear_page_dirty_for_io(page)) { - unlock_page(page); - break; - } - - /* - * This actually clears the dirty bit in the radix tree. - * See cifs_writepage() for more commentary. - */ - set_page_writeback(page); - - if (page_offset(page) >= i_size_read(mapping->host)) { - done = true; - unlock_page(page); - end_page_writeback(page); - break; - } - - wdata->pages[i] = page; - next = page->index + 1; - ++nr_pages; - } - - /* reset index to refind any pages skipped */ - if (nr_pages == 0) - index = wdata->pages[0]->index + 1; - - /* put any pages we aren't going to use */ - for (i = nr_pages; i < found_pages; i++) { - page_cache_release(wdata->pages[i]); - wdata->pages[i] = NULL; - } + nr_pages = wdata_prepare_pages(wdata, found_pages, mapping, wbc, + end, &index, &next, &done); /* nothing to write? 
*/ if (nr_pages == 0) { kref_put(&wdata->refcount, cifs_writedata_release); + add_credits_and_wake_if(server, credits, 0); continue; } - wdata->sync_mode = wbc->sync_mode; - wdata->nr_pages = nr_pages; - wdata->offset = page_offset(wdata->pages[0]); - wdata->pagesz = PAGE_CACHE_SIZE; - wdata->tailsz = - min(i_size_read(mapping->host) - - page_offset(wdata->pages[nr_pages - 1]), - (loff_t)PAGE_CACHE_SIZE); - wdata->bytes = ((nr_pages - 1) * PAGE_CACHE_SIZE) + - wdata->tailsz; - - do { - if (wdata->cfile != NULL) - cifsFileInfo_put(wdata->cfile); - wdata->cfile = find_writable_file(CIFS_I(mapping->host), - false); - if (!wdata->cfile) { - cifs_dbg(VFS, "No writable handles for inode\n"); - rc = -EBADF; - break; - } - wdata->pid = wdata->cfile->pid; - server = tlink_tcon(wdata->cfile->tlink)->ses->server; - rc = server->ops->async_writev(wdata, - cifs_writedata_release); - } while (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN); + wdata->credits = credits; - for (i = 0; i < nr_pages; ++i) - unlock_page(wdata->pages[i]); + rc = wdata_send_pages(wdata, nr_pages, mapping, wbc); /* send failure -- clean up the mess */ if (rc != 0) { + add_credits_and_wake_if(server, wdata->credits, 0); for (i = 0; i < nr_pages; ++i) { if (rc == -EAGAIN) redirty_page_for_writepage(wbc, @@ -2066,6 +2120,11 @@ retry: } kref_put(&wdata->refcount, cifs_writedata_release); + if (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN) { + index = saved_index; + continue; + } + wbc->nr_to_write -= nr_pages; if (wbc->nr_to_write <= 0) done = true; @@ -2362,123 +2421,109 @@ cifs_uncached_writev_complete(struct work_struct *work) kref_put(&wdata->refcount, cifs_uncached_writedata_release); } -/* attempt to send write to server, retry on any -EAGAIN errors */ static int -cifs_uncached_retry_writev(struct cifs_writedata *wdata) +wdata_fill_from_iovec(struct cifs_writedata *wdata, struct iov_iter *from, + size_t *len, unsigned long *num_pages) { - int rc; - struct TCP_Server_Info *server; + size_t save_len, copied, bytes, cur_len = *len; + unsigned long i, nr_pages = *num_pages; - server = tlink_tcon(wdata->cfile->tlink)->ses->server; + save_len = cur_len; + for (i = 0; i < nr_pages; i++) { + bytes = min_t(const size_t, cur_len, PAGE_SIZE); + copied = copy_page_from_iter(wdata->pages[i], 0, bytes, from); + cur_len -= copied; + /* + * If we didn't copy as much as we expected, then that + * may mean we trod into an unmapped area. Stop copying + * at that point. On the next pass through the big + * loop, we'll likely end up getting a zero-length + * write and bailing out of it. + */ + if (copied < bytes) + break; + } + cur_len = save_len - cur_len; + *len = cur_len; - do { - if (wdata->cfile->invalidHandle) { - rc = cifs_reopen_file(wdata->cfile, false); - if (rc != 0) - continue; - } - rc = server->ops->async_writev(wdata, - cifs_uncached_writedata_release); - } while (rc == -EAGAIN); + /* + * If we have no data to send, then that probably means that + * the copy above failed altogether. That's most likely because + * the address in the iovec was bogus. Return -EFAULT and let + * the caller free anything we allocated and bail out. + */ + if (!cur_len) + return -EFAULT; - return rc; + /* + * i + 1 now represents the number of pages we actually used in + * the copy phase above. 
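 *
 * [Editorial aside -- not in the patch. copy_page_from_iter() returns the
 *  number of bytes it managed to copy; a short return means the iterator
 *  hit an unreadable user page, so the fill loop stops early and reports
 *  only the pages actually filled back through *num_pages, letting the
 *  caller trim the allocation instead of failing the whole write.]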
+ */ + *num_pages = i + 1; + return 0; } -static ssize_t -cifs_iovec_write(struct file *file, struct iov_iter *from, loff_t *poffset) +static int +cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, + struct cifsFileInfo *open_file, + struct cifs_sb_info *cifs_sb, struct list_head *wdata_list) { - unsigned long nr_pages, i; - size_t bytes, copied, len, cur_len; - ssize_t total_written = 0; - loff_t offset; - struct cifsFileInfo *open_file; - struct cifs_tcon *tcon; - struct cifs_sb_info *cifs_sb; - struct cifs_writedata *wdata, *tmp; - struct list_head wdata_list; - int rc; + int rc = 0; + size_t cur_len; + unsigned long nr_pages, num_pages, i; + struct cifs_writedata *wdata; + struct iov_iter saved_from; + loff_t saved_offset = offset; pid_t pid; - - len = iov_iter_count(from); - rc = generic_write_checks(file, poffset, &len, 0); - if (rc) - return rc; - - if (!len) - return 0; - - iov_iter_truncate(from, len); - - INIT_LIST_HEAD(&wdata_list); - cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); - open_file = file->private_data; - tcon = tlink_tcon(open_file->tlink); - - if (!tcon->ses->server->ops->async_writev) - return -ENOSYS; - - offset = *poffset; + struct TCP_Server_Info *server; if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) pid = open_file->pid; else pid = current->tgid; + server = tlink_tcon(open_file->tlink)->ses->server; + memcpy(&saved_from, from, sizeof(struct iov_iter)); + do { - size_t save_len; + unsigned int wsize, credits; + + rc = server->ops->wait_mtu_credits(server, cifs_sb->wsize, + &wsize, &credits); + if (rc) + break; - nr_pages = get_numpages(cifs_sb->wsize, len, &cur_len); + nr_pages = get_numpages(wsize, len, &cur_len); wdata = cifs_writedata_alloc(nr_pages, cifs_uncached_writev_complete); if (!wdata) { rc = -ENOMEM; + add_credits_and_wake_if(server, credits, 0); break; } rc = cifs_write_allocate_pages(wdata->pages, nr_pages); if (rc) { kfree(wdata); + add_credits_and_wake_if(server, credits, 0); break; } - save_len = cur_len; - for (i = 0; i < nr_pages; i++) { - bytes = min_t(size_t, cur_len, PAGE_SIZE); - copied = copy_page_from_iter(wdata->pages[i], 0, bytes, - from); - cur_len -= copied; - /* - * If we didn't copy as much as we expected, then that - * may mean we trod into an unmapped area. Stop copying - * at that point. On the next pass through the big - * loop, we'll likely end up getting a zero-length - * write and bailing out of it. - */ - if (copied < bytes) - break; - } - cur_len = save_len - cur_len; - - /* - * If we have no data to send, then that probably means that - * the copy above failed altogether. That's most likely because - * the address in the iovec was bogus. Set the rc to -EFAULT, - * free anything we allocated and bail out. - */ - if (!cur_len) { + num_pages = nr_pages; + rc = wdata_fill_from_iovec(wdata, from, &cur_len, &num_pages); + if (rc) { for (i = 0; i < nr_pages; i++) put_page(wdata->pages[i]); kfree(wdata); - rc = -EFAULT; + add_credits_and_wake_if(server, credits, 0); break; } /* - * i + 1 now represents the number of pages we actually used in - * the copy phase above. Bring nr_pages down to that, and free - * any pages that we didn't use. + * Bring nr_pages down to the number of pages we actually used, + * and free any pages that we didn't use. 
*/ - for ( ; nr_pages > i + 1; nr_pages--) + for ( ; nr_pages > num_pages; nr_pages--) put_page(wdata->pages[nr_pages - 1]); wdata->sync_mode = WB_SYNC_ALL; @@ -2489,18 +2534,69 @@ cifs_iovec_write(struct file *file, struct iov_iter *from, loff_t *poffset) wdata->bytes = cur_len; wdata->pagesz = PAGE_SIZE; wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE); - rc = cifs_uncached_retry_writev(wdata); + wdata->credits = credits; + + if (!wdata->cfile->invalidHandle || + !cifs_reopen_file(wdata->cfile, false)) + rc = server->ops->async_writev(wdata, + cifs_uncached_writedata_release); if (rc) { + add_credits_and_wake_if(server, wdata->credits, 0); kref_put(&wdata->refcount, cifs_uncached_writedata_release); + if (rc == -EAGAIN) { + memcpy(from, &saved_from, + sizeof(struct iov_iter)); + iov_iter_advance(from, offset - saved_offset); + continue; + } break; } - list_add_tail(&wdata->list, &wdata_list); + list_add_tail(&wdata->list, wdata_list); offset += cur_len; len -= cur_len; } while (len > 0); + return rc; +} + +static ssize_t +cifs_iovec_write(struct file *file, struct iov_iter *from, loff_t *poffset) +{ + size_t len; + ssize_t total_written = 0; + struct cifsFileInfo *open_file; + struct cifs_tcon *tcon; + struct cifs_sb_info *cifs_sb; + struct cifs_writedata *wdata, *tmp; + struct list_head wdata_list; + struct iov_iter saved_from; + int rc; + + len = iov_iter_count(from); + rc = generic_write_checks(file, poffset, &len, 0); + if (rc) + return rc; + + if (!len) + return 0; + + iov_iter_truncate(from, len); + + INIT_LIST_HEAD(&wdata_list); + cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + open_file = file->private_data; + tcon = tlink_tcon(open_file->tlink); + + if (!tcon->ses->server->ops->async_writev) + return -ENOSYS; + + memcpy(&saved_from, from, sizeof(struct iov_iter)); + + rc = cifs_write_from_iter(*poffset, len, from, open_file, cifs_sb, + &wdata_list); + /* * If at least one write was successfully sent, then discard any rc * value from the later writes. 
If the other write succeeds, then @@ -2529,7 +2625,25 @@ restart_loop: /* resend call if it's a retryable error */ if (rc == -EAGAIN) { - rc = cifs_uncached_retry_writev(wdata); + struct list_head tmp_list; + struct iov_iter tmp_from; + + INIT_LIST_HEAD(&tmp_list); + list_del_init(&wdata->list); + + memcpy(&tmp_from, &saved_from, + sizeof(struct iov_iter)); + iov_iter_advance(&tmp_from, + wdata->offset - *poffset); + + rc = cifs_write_from_iter(wdata->offset, + wdata->bytes, &tmp_from, + open_file, cifs_sb, &tmp_list); + + list_splice(&tmp_list, &wdata_list); + + kref_put(&wdata->refcount, + cifs_uncached_writedata_release); goto restart_loop; } } @@ -2722,26 +2836,6 @@ cifs_uncached_readdata_release(struct kref *refcount) cifs_readdata_release(refcount); } -static int -cifs_retry_async_readv(struct cifs_readdata *rdata) -{ - int rc; - struct TCP_Server_Info *server; - - server = tlink_tcon(rdata->cfile->tlink)->ses->server; - - do { - if (rdata->cfile->invalidHandle) { - rc = cifs_reopen_file(rdata->cfile, true); - if (rc != 0) - continue; - } - rc = server->ops->async_readv(rdata); - } while (rc == -EAGAIN); - - return rc; -} - /** * cifs_readdata_to_iov - copy data from pages in response to an iovec * @rdata: the readdata response with list of pages holding data @@ -2754,7 +2848,7 @@ cifs_retry_async_readv(struct cifs_readdata *rdata) static int cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter) { - size_t remaining = rdata->bytes; + size_t remaining = rdata->got_bytes; unsigned int i; for (i = 0; i < rdata->nr_pages; i++) { @@ -2782,11 +2876,12 @@ static int cifs_uncached_read_into_pages(struct TCP_Server_Info *server, struct cifs_readdata *rdata, unsigned int len) { - int total_read = 0, result = 0; + int result = 0; unsigned int i; unsigned int nr_pages = rdata->nr_pages; struct kvec iov; + rdata->got_bytes = 0; rdata->tailsz = PAGE_SIZE; for (i = 0; i < nr_pages; i++) { struct page *page = rdata->pages[i]; @@ -2820,55 +2915,45 @@ cifs_uncached_read_into_pages(struct TCP_Server_Info *server, if (result < 0) break; - total_read += result; + rdata->got_bytes += result; } - return total_read > 0 ? total_read : result; + return rdata->got_bytes > 0 && result != -ECONNABORTED ? 
+ rdata->got_bytes : result; } -ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to) +static int +cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, + struct cifs_sb_info *cifs_sb, struct list_head *rdata_list) { - struct file *file = iocb->ki_filp; - ssize_t rc; - size_t len, cur_len; - ssize_t total_read = 0; - loff_t offset = iocb->ki_pos; - unsigned int npages; - struct cifs_sb_info *cifs_sb; - struct cifs_tcon *tcon; - struct cifsFileInfo *open_file; - struct cifs_readdata *rdata, *tmp; - struct list_head rdata_list; + struct cifs_readdata *rdata; + unsigned int npages, rsize, credits; + size_t cur_len; + int rc; pid_t pid; + struct TCP_Server_Info *server; - len = iov_iter_count(to); - if (!len) - return 0; - - INIT_LIST_HEAD(&rdata_list); - cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); - open_file = file->private_data; - tcon = tlink_tcon(open_file->tlink); - - if (!tcon->ses->server->ops->async_readv) - return -ENOSYS; + server = tlink_tcon(open_file->tlink)->ses->server; if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) pid = open_file->pid; else pid = current->tgid; - if ((file->f_flags & O_ACCMODE) == O_WRONLY) - cifs_dbg(FYI, "attempting read on write only file instance\n"); - do { - cur_len = min_t(const size_t, len - total_read, cifs_sb->rsize); + rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize, + &rsize, &credits); + if (rc) + break; + + cur_len = min_t(const size_t, len, rsize); npages = DIV_ROUND_UP(cur_len, PAGE_SIZE); /* allocate a readdata struct */ rdata = cifs_readdata_alloc(npages, cifs_uncached_readv_complete); if (!rdata) { + add_credits_and_wake_if(server, credits, 0); rc = -ENOMEM; break; } @@ -2884,44 +2969,113 @@ ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to) rdata->pid = pid; rdata->pagesz = PAGE_SIZE; rdata->read_into_pages = cifs_uncached_read_into_pages; + rdata->credits = credits; - rc = cifs_retry_async_readv(rdata); + if (!rdata->cfile->invalidHandle || + !cifs_reopen_file(rdata->cfile, true)) + rc = server->ops->async_readv(rdata); error: if (rc) { + add_credits_and_wake_if(server, rdata->credits, 0); kref_put(&rdata->refcount, cifs_uncached_readdata_release); + if (rc == -EAGAIN) + continue; break; } - list_add_tail(&rdata->list, &rdata_list); + list_add_tail(&rdata->list, rdata_list); offset += cur_len; len -= cur_len; } while (len > 0); + return rc; +} + +ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to) +{ + struct file *file = iocb->ki_filp; + ssize_t rc; + size_t len; + ssize_t total_read = 0; + loff_t offset = iocb->ki_pos; + struct cifs_sb_info *cifs_sb; + struct cifs_tcon *tcon; + struct cifsFileInfo *open_file; + struct cifs_readdata *rdata, *tmp; + struct list_head rdata_list; + + len = iov_iter_count(to); + if (!len) + return 0; + + INIT_LIST_HEAD(&rdata_list); + cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + open_file = file->private_data; + tcon = tlink_tcon(open_file->tlink); + + if (!tcon->ses->server->ops->async_readv) + return -ENOSYS; + + if ((file->f_flags & O_ACCMODE) == O_WRONLY) + cifs_dbg(FYI, "attempting read on write only file instance\n"); + + rc = cifs_send_async_read(offset, len, open_file, cifs_sb, &rdata_list); + /* if at least one read request send succeeded, then reset rc */ if (!list_empty(&rdata_list)) rc = 0; len = iov_iter_count(to); /* the loop below should proceed in the order of increasing offsets */ +again: list_for_each_entry_safe(rdata, tmp, &rdata_list, list) { - again: if (!rc) { /* FIXME: freezable sleep too? 
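 *
 * [Editorial aside -- not in the patch. The rewritten loop below survives a
 *  reconnect in mid-read: whatever arrived before the drop (got_bytes) is
 *  copied out to the user buffer first, and only the remainder is resent
 *  through cifs_send_async_read(), so partially received data is never
 *  thrown away.]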
*/ rc = wait_for_completion_killable(&rdata->done); if (rc) rc = -EINTR; - else if (rdata->result) { - rc = rdata->result; + else if (rdata->result == -EAGAIN) { /* resend call if it's a retryable error */ - if (rc == -EAGAIN) { - rc = cifs_retry_async_readv(rdata); - goto again; + struct list_head tmp_list; + unsigned int got_bytes = rdata->got_bytes; + + list_del_init(&rdata->list); + INIT_LIST_HEAD(&tmp_list); + + /* + * Got a part of data and then reconnect has + * happened -- fill the buffer and continue + * reading. + */ + if (got_bytes && got_bytes < rdata->bytes) { + rc = cifs_readdata_to_iov(rdata, to); + if (rc) { + kref_put(&rdata->refcount, + cifs_uncached_readdata_release); + continue; + } } - } else { + + rc = cifs_send_async_read( + rdata->offset + got_bytes, + rdata->bytes - got_bytes, + rdata->cfile, cifs_sb, + &tmp_list); + + list_splice(&tmp_list, &rdata_list); + + kref_put(&rdata->refcount, + cifs_uncached_readdata_release); + goto again; + } else if (rdata->result) + rc = rdata->result; + else rc = cifs_readdata_to_iov(rdata, to); - } + /* if there was a short read -- discard anything left */ + if (rdata->got_bytes && rdata->got_bytes < rdata->bytes) + rc = -ENODATA; } list_del_init(&rdata->list); kref_put(&rdata->refcount, cifs_uncached_readdata_release); @@ -3030,18 +3184,19 @@ cifs_read(struct file *file, char *read_data, size_t read_size, loff_t *offset) for (total_read = 0, cur_offset = read_data; read_size > total_read; total_read += bytes_read, cur_offset += bytes_read) { - current_read_size = min_t(uint, read_size - total_read, rsize); - /* - * For windows me and 9x we do not want to request more than it - * negotiated since it will refuse the read then. - */ - if ((tcon->ses) && !(tcon->ses->capabilities & + do { + current_read_size = min_t(uint, read_size - total_read, + rsize); + /* + * For windows me and 9x we do not want to request more + * than it negotiated since it will refuse the read + * then. 
+ */ + if ((tcon->ses) && !(tcon->ses->capabilities & tcon->ses->server->vals->cap_large_files)) { - current_read_size = min_t(uint, current_read_size, - CIFSMaxBufSize); - } - rc = -EAGAIN; - while (rc == -EAGAIN) { + current_read_size = min_t(uint, + current_read_size, CIFSMaxBufSize); + } if (open_file->invalidHandle) { rc = cifs_reopen_file(open_file, true); if (rc != 0) @@ -3054,7 +3209,8 @@ cifs_read(struct file *file, char *read_data, size_t read_size, loff_t *offset) rc = server->ops->sync_read(xid, open_file, &io_parms, &bytes_read, &cur_offset, &buf_type); - } + } while (rc == -EAGAIN); + if (rc || (bytes_read == 0)) { if (total_read) { break; @@ -3133,25 +3289,30 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma) static void cifs_readv_complete(struct work_struct *work) { - unsigned int i; + unsigned int i, got_bytes; struct cifs_readdata *rdata = container_of(work, struct cifs_readdata, work); + got_bytes = rdata->got_bytes; for (i = 0; i < rdata->nr_pages; i++) { struct page *page = rdata->pages[i]; lru_cache_add_file(page); - if (rdata->result == 0) { + if (rdata->result == 0 || + (rdata->result == -EAGAIN && got_bytes)) { flush_dcache_page(page); SetPageUptodate(page); } unlock_page(page); - if (rdata->result == 0) + if (rdata->result == 0 || + (rdata->result == -EAGAIN && got_bytes)) cifs_readpage_to_fscache(rdata->mapping->host, page); + got_bytes -= min_t(unsigned int, PAGE_CACHE_SIZE, got_bytes); + page_cache_release(page); rdata->pages[i] = NULL; } @@ -3162,7 +3323,7 @@ static int cifs_readpages_read_into_pages(struct TCP_Server_Info *server, struct cifs_readdata *rdata, unsigned int len) { - int total_read = 0, result = 0; + int result = 0; unsigned int i; u64 eof; pgoff_t eof_index; @@ -3174,6 +3335,7 @@ cifs_readpages_read_into_pages(struct TCP_Server_Info *server, eof_index = eof ? (eof - 1) >> PAGE_CACHE_SHIFT : 0; cifs_dbg(FYI, "eof=%llu eof_index=%lu\n", eof, eof_index); + rdata->got_bytes = 0; rdata->tailsz = PAGE_CACHE_SIZE; for (i = 0; i < nr_pages; i++) { struct page *page = rdata->pages[i]; @@ -3228,10 +3390,70 @@ cifs_readpages_read_into_pages(struct TCP_Server_Info *server, if (result < 0) break; - total_read += result; + rdata->got_bytes += result; + } + + return rdata->got_bytes > 0 && result != -ECONNABORTED ? + rdata->got_bytes : result; +} + +static int +readpages_get_pages(struct address_space *mapping, struct list_head *page_list, + unsigned int rsize, struct list_head *tmplist, + unsigned int *nr_pages, loff_t *offset, unsigned int *bytes) +{ + struct page *page, *tpage; + unsigned int expected_index; + int rc; + + INIT_LIST_HEAD(tmplist); + + page = list_entry(page_list->prev, struct page, lru); + + /* + * Lock the page and put it in the cache. Since no one else + * should have access to this page, we're safe to simply set + * PG_locked without checking it first. + */ + __set_page_locked(page); + rc = add_to_page_cache_locked(page, mapping, + page->index, GFP_KERNEL); + + /* give up if we can't stick it in the cache */ + if (rc) { + __clear_page_locked(page); + return rc; } - return total_read > 0 ? total_read : result; + /* move first page to the tmplist */ + *offset = (loff_t)page->index << PAGE_CACHE_SHIFT; + *bytes = PAGE_CACHE_SIZE; + *nr_pages = 1; + list_move_tail(&page->lru, tmplist); + + /* now try and add more pages onto the request */ + expected_index = page->index + 1; + list_for_each_entry_safe_reverse(page, tpage, page_list, lru) { + /* discontinuity ? 
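 *
 * [Editorial aside -- not in the patch. One wire read covers exactly one
 *  contiguous byte range, so page gathering stops at the first index gap,
 *  or as soon as another page would push the request past rsize; pages
 *  left on page_list are picked up by the next iteration of the caller's
 *  loop.]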
*/ + if (page->index != expected_index) + break; + + /* would this page push the read over the rsize? */ + if (*bytes + PAGE_CACHE_SIZE > rsize) + break; + + __set_page_locked(page); + if (add_to_page_cache_locked(page, mapping, page->index, + GFP_KERNEL)) { + __clear_page_locked(page); + break; + } + list_move_tail(&page->lru, tmplist); + (*bytes) += PAGE_CACHE_SIZE; + expected_index++; + (*nr_pages)++; + } + return rc; } static int cifs_readpages(struct file *file, struct address_space *mapping, @@ -3241,19 +3463,10 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, struct list_head tmplist; struct cifsFileInfo *open_file = file->private_data; struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); - unsigned int rsize = cifs_sb->rsize; + struct TCP_Server_Info *server; pid_t pid; /* - * Give up immediately if rsize is too small to read an entire page. - * The VFS will fall back to readpage. We should never reach this - * point however since we set ra_pages to 0 when the rsize is smaller - * than a cache page. - */ - if (unlikely(rsize < PAGE_CACHE_SIZE)) - return 0; - - /* * Reads as many pages as possible from fscache. Returns -ENOBUFS * immediately if the cookie is negative * @@ -3271,7 +3484,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, pid = current->tgid; rc = 0; - INIT_LIST_HEAD(&tmplist); + server = tlink_tcon(open_file->tlink)->ses->server; cifs_dbg(FYI, "%s: file=%p mapping=%p num_pages=%u\n", __func__, file, mapping, num_pages); @@ -3288,58 +3501,35 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, * the rdata->pages, then we want them in increasing order. */ while (!list_empty(page_list)) { - unsigned int i; - unsigned int bytes = PAGE_CACHE_SIZE; - unsigned int expected_index; - unsigned int nr_pages = 1; + unsigned int i, nr_pages, bytes, rsize; loff_t offset; struct page *page, *tpage; struct cifs_readdata *rdata; + unsigned credits; - page = list_entry(page_list->prev, struct page, lru); + rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize, + &rsize, &credits); + if (rc) + break; /* - * Lock the page and put it in the cache. Since no one else - * should have access to this page, we're safe to simply set - * PG_locked without checking it first. + * Give up immediately if rsize is too small to read an entire + * page. The VFS will fall back to readpage. We should never + * reach this point however since we set ra_pages to 0 when the + * rsize is smaller than a cache page. */ - __set_page_locked(page); - rc = add_to_page_cache_locked(page, mapping, - page->index, GFP_KERNEL); + if (unlikely(rsize < PAGE_CACHE_SIZE)) { + add_credits_and_wake_if(server, credits, 0); + return 0; + } - /* give up if we can't stick it in the cache */ + rc = readpages_get_pages(mapping, page_list, rsize, &tmplist, + &nr_pages, &offset, &bytes); if (rc) { - __clear_page_locked(page); + add_credits_and_wake_if(server, credits, 0); break; } - /* move first page to the tmplist */ - offset = (loff_t)page->index << PAGE_CACHE_SHIFT; - list_move_tail(&page->lru, &tmplist); - - /* now try and add more pages onto the request */ - expected_index = page->index + 1; - list_for_each_entry_safe_reverse(page, tpage, page_list, lru) { - /* discontinuity ? */ - if (page->index != expected_index) - break; - - /* would this page push the read over the rsize? 
*/ - if (bytes + PAGE_CACHE_SIZE > rsize) - break; - - __set_page_locked(page); - if (add_to_page_cache_locked(page, mapping, - page->index, GFP_KERNEL)) { - __clear_page_locked(page); - break; - } - list_move_tail(&page->lru, &tmplist); - bytes += PAGE_CACHE_SIZE; - expected_index++; - nr_pages++; - } - rdata = cifs_readdata_alloc(nr_pages, cifs_readv_complete); if (!rdata) { /* best to give up if we're out of mem */ @@ -3350,6 +3540,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, page_cache_release(page); } rc = -ENOMEM; + add_credits_and_wake_if(server, credits, 0); break; } @@ -3360,20 +3551,25 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, rdata->pid = pid; rdata->pagesz = PAGE_CACHE_SIZE; rdata->read_into_pages = cifs_readpages_read_into_pages; + rdata->credits = credits; list_for_each_entry_safe(page, tpage, &tmplist, lru) { list_del(&page->lru); rdata->pages[rdata->nr_pages++] = page; } - rc = cifs_retry_async_readv(rdata); - if (rc != 0) { + if (!rdata->cfile->invalidHandle || + !cifs_reopen_file(rdata->cfile, true)) + rc = server->ops->async_readv(rdata); + if (rc) { + add_credits_and_wake_if(server, rdata->credits, 0); for (i = 0; i < rdata->nr_pages; i++) { page = rdata->pages[i]; lru_cache_add_file(page); unlock_page(page); page_cache_release(page); } + /* Fallback to the readpage in error/reconnect cases */ kref_put(&rdata->refcount, cifs_readdata_release); break; } @@ -3618,13 +3814,6 @@ static int cifs_launder_page(struct page *page) return rc; } -static int -cifs_pending_writers_wait(void *unused) -{ - schedule(); - return 0; -} - void cifs_oplock_break(struct work_struct *work) { struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo, @@ -3636,7 +3825,7 @@ void cifs_oplock_break(struct work_struct *work) int rc = 0; wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS, - cifs_pending_writers_wait, TASK_UNINTERRUPTIBLE); + TASK_UNINTERRUPTIBLE); server->ops->downgrade_oplock(server, cinode, test_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags)); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index a174605f6afa..7899a40465b3 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1627,8 +1627,9 @@ do_rename_exit: } int -cifs_rename(struct inode *source_dir, struct dentry *source_dentry, - struct inode *target_dir, struct dentry *target_dentry) +cifs_rename2(struct inode *source_dir, struct dentry *source_dentry, + struct inode *target_dir, struct dentry *target_dentry, + unsigned int flags) { char *from_name = NULL; char *to_name = NULL; @@ -1640,6 +1641,9 @@ cifs_rename(struct inode *source_dir, struct dentry *source_dentry, unsigned int xid; int rc, tmprc; + if (flags & ~RENAME_NOREPLACE) + return -EINVAL; + cifs_sb = CIFS_SB(source_dir->i_sb); tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) @@ -1667,6 +1671,12 @@ cifs_rename(struct inode *source_dir, struct dentry *source_dentry, rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry, to_name); + /* + * No-replace is the natural behavior for CIFS, so skip unlink hacks. + */ + if (flags & RENAME_NOREPLACE) + goto cifs_rename_exit; + if (rc == -EEXIST && tcon->unix_ext) { /* * Are src and dst hardlinks of same inode? 
We can only tell @@ -1710,13 +1720,22 @@ cifs_rename(struct inode *source_dir, struct dentry *source_dentry, unlink_target: /* Try unlinking the target dentry if it's not negative */ if (target_dentry->d_inode && (rc == -EACCES || rc == -EEXIST)) { - tmprc = cifs_unlink(target_dir, target_dentry); + if (d_is_dir(target_dentry)) + tmprc = cifs_rmdir(target_dir, target_dentry); + else + tmprc = cifs_unlink(target_dir, target_dentry); if (tmprc) goto cifs_rename_exit; rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry, to_name); } + /* force revalidate to go get info when needed */ + CIFS_I(source_dir)->time = CIFS_I(target_dir)->time = 0; + + source_dir->i_ctime = source_dir->i_mtime = target_dir->i_ctime = + target_dir->i_mtime = current_fs_time(source_dir->i_sb); + cifs_rename_exit: kfree(info_buf_source); kfree(from_name); @@ -1780,7 +1799,7 @@ cifs_invalidate_mapping(struct inode *inode) * @word: long word containing the bit lock */ static int -cifs_wait_bit_killable(void *word) +cifs_wait_bit_killable(struct wait_bit_key *key) { if (fatal_signal_pending(current)) return -ERESTARTSYS; @@ -1794,8 +1813,8 @@ cifs_revalidate_mapping(struct inode *inode) int rc; unsigned long *flags = &CIFS_I(inode)->flags; - rc = wait_on_bit_lock(flags, CIFS_INO_LOCK, cifs_wait_bit_killable, - TASK_KILLABLE); + rc = wait_on_bit_lock_action(flags, CIFS_INO_LOCK, cifs_wait_bit_killable, + TASK_KILLABLE); if (rc) return rc; diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 68559fd557fb..5657416d3483 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -213,8 +213,12 @@ create_mf_symlink(const unsigned int xid, struct cifs_tcon *tcon, if (rc) goto out; - rc = tcon->ses->server->ops->create_mf_symlink(xid, tcon, cifs_sb, - fromName, buf, &bytes_written); + if (tcon->ses->server->ops->create_mf_symlink) + rc = tcon->ses->server->ops->create_mf_symlink(xid, tcon, + cifs_sb, fromName, buf, &bytes_written); + else + rc = -EOPNOTSUPP; + if (rc) goto out; @@ -339,9 +343,11 @@ cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, if (rc) return rc; - if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) + if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) { + rc = -ENOENT; /* it's not a symlink */ goto out; + } io_parms.netfid = fid.netfid; io_parms.pid = current->tgid; diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 3b0c62e622da..b7415d596dbd 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -226,6 +226,15 @@ cifs_small_buf_release(void *buf_to_free) return; } +void +free_rsp_buf(int resp_buftype, void *rsp) +{ + if (resp_buftype == CIFS_SMALL_BUFFER) + cifs_small_buf_release(rsp); + else if (resp_buftype == CIFS_LARGE_BUFFER) + cifs_buf_release(rsp); +} + /* NB: MID can not be set if treeCon not passed in, in that case it is responsbility of caller to set the mid */ void @@ -414,7 +423,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) return true; } if (pSMBr->hdr.Status.CifsError) { - cifs_dbg(FYI, "notify err 0x%d\n", + cifs_dbg(FYI, "notify err 0x%x\n", pSMBr->hdr.Status.CifsError); return true; } @@ -441,7 +450,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) if (pSMB->hdr.WordCount != 8) return false; - cifs_dbg(FYI, "oplock type 0x%d level 0x%d\n", + cifs_dbg(FYI, "oplock type 0x%x level 0x%x\n", pSMB->LockType, pSMB->OplockLevel); if (!(pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE)) return false; @@ -565,13 +574,6 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock) cinode->oplock = 0; } 
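/*
 * [Editorial note -- not part of the patch. The wait-callback removal just
 *  below, and the matching ones earlier in connect.c and file.c, all track
 *  one kernel API change: wait_on_bit() and wait_on_bit_lock() now supply a
 *  default schedule()-based action and take the task state directly, so the
 *  boilerplate callbacks go away:
 *
 *	before:	wait_on_bit(&flags, BIT, my_schedule_cb, TASK_KILLABLE);
 *	after:	wait_on_bit(&flags, BIT, TASK_KILLABLE);
 *
 *  (BIT and my_schedule_cb are illustrative names.) Callers that keep a
 *  custom action, like cifs_wait_bit_killable() in inode.c above, move to
 *  the *_action variants, whose callback now receives a
 *  struct wait_bit_key * instead of a void *word.]
 */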
-static int -cifs_oplock_break_wait(void *unused) -{ - schedule(); - return signal_pending(current) ? -ERESTARTSYS : 0; -} - /* * We wait for oplock breaks to be processed before we attempt to perform * writes. @@ -582,7 +584,7 @@ int cifs_get_writer(struct cifsInodeInfo *cinode) start: rc = wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_OPLOCK_BREAK, - cifs_oplock_break_wait, TASK_KILLABLE); + TASK_KILLABLE); if (rc) return rc; diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index 6834b9c3bec1..b333ff60781d 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -925,11 +925,23 @@ cifs_NTtimeToUnix(__le64 ntutc) /* BB what about the timezone? BB */ /* Subtract the NTFS time offset, then convert to 1s intervals. */ - u64 t; + s64 t = le64_to_cpu(ntutc) - NTFS_TIME_OFFSET; + + /* + * Unfortunately can not use normal 64 bit division on 32 bit arch, but + * the alternative, do_div, does not work with negative numbers so have + * to special case them + */ + if (t < 0) { + t = -t; + ts.tv_nsec = (long)(do_div(t, 10000000) * 100); + ts.tv_nsec = -ts.tv_nsec; + ts.tv_sec = -t; + } else { + ts.tv_nsec = (long)do_div(t, 10000000) * 100; + ts.tv_sec = t; + } - t = le64_to_cpu(ntutc) - NTFS_TIME_OFFSET; - ts.tv_nsec = do_div(t, 10000000) * 100; - ts.tv_sec = t; return ts; } diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index b15862e0f68c..b334a89d6a66 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -593,11 +593,11 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos, /* close and restart search */ cifs_dbg(FYI, "search backing up - close and restart search\n"); spin_lock(&cifs_file_list_lock); - if (!cfile->srch_inf.endOfSearch && !cfile->invalidHandle) { + if (server->ops->dir_needs_close(cfile)) { cfile->invalidHandle = true; spin_unlock(&cifs_file_list_lock); - if (server->ops->close) - server->ops->close(xid, tcon, &cfile->fid); + if (server->ops->close_dir) + server->ops->close_dir(xid, tcon, &cfile->fid); } else spin_unlock(&cifs_file_list_lock); if (cfile->srch_inf.ntwrk_buf_start) { diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index e87387dbf39f..57db63ff88da 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -243,10 +243,11 @@ static void decode_ascii_ssetup(char **pbcc_area, __u16 bleft, kfree(ses->serverOS); ses->serverOS = kzalloc(len + 1, GFP_KERNEL); - if (ses->serverOS) + if (ses->serverOS) { strncpy(ses->serverOS, bcc_ptr, len); - if (strncmp(ses->serverOS, "OS/2", 4) == 0) - cifs_dbg(FYI, "OS/2 server\n"); + if (strncmp(ses->serverOS, "OS/2", 4) == 0) + cifs_dbg(FYI, "OS/2 server\n"); + } bcc_ptr += len + 1; bleft -= len + 1; @@ -520,382 +521,551 @@ select_sectype(struct TCP_Server_Info *server, enum securityEnum requested) } } -int -CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses, - const struct nls_table *nls_cp) +struct sess_data { + unsigned int xid; + struct cifs_ses *ses; + struct nls_table *nls_cp; + void (*func)(struct sess_data *); + int result; + + /* we will send the SMB in three pieces: + * a fixed length beginning part, an optional + * SPNEGO blob (which can be zero length), and a + * last part which will include the strings + * and rest of bcc area. 
This allows us to avoid + * a large buffer 17K allocation + */ + int buf0_type; + struct kvec iov[3]; +}; + +static int +sess_alloc_buffer(struct sess_data *sess_data, int wct) { - int rc = 0; - int wct; + int rc; + struct cifs_ses *ses = sess_data->ses; struct smb_hdr *smb_buf; - char *bcc_ptr; - char *str_area; - SESSION_SETUP_ANDX *pSMB; - __u32 capabilities; - __u16 count; - int resp_buf_type; - struct kvec iov[3]; - enum securityEnum type; - __u16 action, bytes_remaining; - struct key *spnego_key = NULL; - __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */ - u16 blob_len; - char *ntlmsspblob = NULL; - if (ses == NULL) { - WARN(1, "%s: ses == NULL!", __func__); - return -EINVAL; - } + rc = small_smb_init_no_tc(SMB_COM_SESSION_SETUP_ANDX, wct, ses, + (void **)&smb_buf); - type = select_sectype(ses->server, ses->sectype); - cifs_dbg(FYI, "sess setup type %d\n", type); - if (type == Unspecified) { - cifs_dbg(VFS, - "Unable to select appropriate authentication method!"); - return -EINVAL; + if (rc) + return rc; + + sess_data->iov[0].iov_base = (char *)smb_buf; + sess_data->iov[0].iov_len = be32_to_cpu(smb_buf->smb_buf_length) + 4; + /* + * This variable will be used to clear the buffer + * allocated above in case of any error in the calling function. + */ + sess_data->buf0_type = CIFS_SMALL_BUFFER; + + /* 2000 big enough to fit max user, domain, NOS name etc. */ + sess_data->iov[2].iov_base = kmalloc(2000, GFP_KERNEL); + if (!sess_data->iov[2].iov_base) { + rc = -ENOMEM; + goto out_free_smb_buf; } - if (type == RawNTLMSSP) { - /* if memory allocation is successful, caller of this function - * frees it. - */ - ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL); - if (!ses->ntlmssp) - return -ENOMEM; - ses->ntlmssp->sesskey_per_smbsess = false; + return 0; +out_free_smb_buf: + kfree(smb_buf); + sess_data->iov[0].iov_base = NULL; + sess_data->iov[0].iov_len = 0; + sess_data->buf0_type = CIFS_NO_BUFFER; + return rc; +} + +static void +sess_free_buffer(struct sess_data *sess_data) +{ + + free_rsp_buf(sess_data->buf0_type, sess_data->iov[0].iov_base); + sess_data->buf0_type = CIFS_NO_BUFFER; + kfree(sess_data->iov[2].iov_base); +} + +static int +sess_establish_session(struct sess_data *sess_data) +{ + struct cifs_ses *ses = sess_data->ses; + + mutex_lock(&ses->server->srv_mutex); + if (!ses->server->session_estab) { + if (ses->server->sign) { + ses->server->session_key.response = + kmemdup(ses->auth_key.response, + ses->auth_key.len, GFP_KERNEL); + if (!ses->server->session_key.response) { + mutex_unlock(&ses->server->srv_mutex); + return -ENOMEM; + } + ses->server->session_key.len = + ses->auth_key.len; + } + ses->server->sequence_number = 0x2; + ses->server->session_estab = true; } + mutex_unlock(&ses->server->srv_mutex); -ssetup_ntlmssp_authenticate: - if (phase == NtLmChallenge) - phase = NtLmAuthenticate; /* if ntlmssp, now final phase */ + cifs_dbg(FYI, "CIFS session established successfully\n"); + spin_lock(&GlobalMid_Lock); + ses->status = CifsGood; + ses->need_reconnect = false; + spin_unlock(&GlobalMid_Lock); - if (type == LANMAN) { -#ifndef CONFIG_CIFS_WEAK_PW_HASH - /* LANMAN and plaintext are less secure and off by default. - So we make this explicitly be turned on in kconfig (in the - build) and turned on at runtime (changed from the default) - in proc/fs/cifs or via mount parm. Unfortunately this is - needed for old Win (e.g. 
Win95), some obscure NAS and OS/2 */ - return -EOPNOTSUPP; -#endif - wct = 10; /* lanman 2 style sessionsetup */ - } else if ((type == NTLM) || (type == NTLMv2)) { - /* For NTLMv2 failures eventually may need to retry NTLM */ - wct = 13; /* old style NTLM sessionsetup */ - } else /* same size: negotiate or auth, NTLMSSP or extended security */ - wct = 12; + return 0; +} - rc = small_smb_init_no_tc(SMB_COM_SESSION_SETUP_ANDX, wct, ses, - (void **)&smb_buf); - if (rc) - return rc; +static int +sess_sendreceive(struct sess_data *sess_data) +{ + int rc; + struct smb_hdr *smb_buf = (struct smb_hdr *) sess_data->iov[0].iov_base; + __u16 count; + + count = sess_data->iov[1].iov_len + sess_data->iov[2].iov_len; + smb_buf->smb_buf_length = + cpu_to_be32(be32_to_cpu(smb_buf->smb_buf_length) + count); + put_bcc(count, smb_buf); + + rc = SendReceive2(sess_data->xid, sess_data->ses, + sess_data->iov, 3 /* num_iovecs */, + &sess_data->buf0_type, + CIFS_LOG_ERROR); - pSMB = (SESSION_SETUP_ANDX *)smb_buf; + return rc; +} +/* + * LANMAN and plaintext are less secure and off by default. + * So we make this explicitly be turned on in kconfig (in the + * build) and turned on at runtime (changed from the default) + * in proc/fs/cifs or via mount parm. Unfortunately this is + * needed for old Win (e.g. Win95), some obscure NAS and OS/2 + */ +#ifdef CONFIG_CIFS_WEAK_PW_HASH +static void +sess_auth_lanman(struct sess_data *sess_data) +{ + int rc = 0; + struct smb_hdr *smb_buf; + SESSION_SETUP_ANDX *pSMB; + char *bcc_ptr; + struct cifs_ses *ses = sess_data->ses; + char lnm_session_key[CIFS_AUTH_RESP_SIZE]; + __u32 capabilities; + __u16 bytes_remaining; + + /* lanman 2 style sessionsetup */ + /* wct = 10 */ + rc = sess_alloc_buffer(sess_data, 10); + if (rc) + goto out; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + bcc_ptr = sess_data->iov[2].iov_base; capabilities = cifs_ssetup_hdr(ses, pSMB); - /* we will send the SMB in three pieces: - a fixed length beginning part, an optional - SPNEGO blob (which can be zero length), and a - last part which will include the strings - and rest of bcc area. This allows us to avoid - a large buffer 17K allocation */ - iov[0].iov_base = (char *)pSMB; - iov[0].iov_len = be32_to_cpu(smb_buf->smb_buf_length) + 4; - - /* setting this here allows the code at the end of the function - to free the request buffer if there's an error */ - resp_buf_type = CIFS_SMALL_BUFFER; + pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE; - /* 2000 big enough to fit max user, domain, NOS name etc. */ - str_area = kmalloc(2000, GFP_KERNEL); - if (str_area == NULL) { - rc = -ENOMEM; - goto ssetup_exit; - } - bcc_ptr = str_area; + /* no capabilities flags in old lanman negotiation */ + pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE); - iov[1].iov_base = NULL; - iov[1].iov_len = 0; + /* Calculate hash with password and copy into bcc_ptr. + * Encryption Key (stored as in cryptkey) gets used if the + * security mode bit in Negottiate Protocol response states + * to use challenge/response method (i.e. Password bit is 1). + */ + rc = calc_lanman_hash(ses->password, ses->server->cryptkey, + ses->server->sec_mode & SECMODE_PW_ENCRYPT ? + true : false, lnm_session_key); - if (type == LANMAN) { -#ifdef CONFIG_CIFS_WEAK_PW_HASH - char lnm_session_key[CIFS_AUTH_RESP_SIZE]; + memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE); + bcc_ptr += CIFS_AUTH_RESP_SIZE; + + /* + * can not sign if LANMAN negotiated so no need + * to calculate signing key? 
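/*
 * A minimal model of the unwind pattern sess_alloc_buffer() above uses:
 * the function owns two allocations, and failure of the second releases
 * the first through a labelled exit, so callers only ever see
 * all-or-nothing.  Sizes and names here are illustrative.
 */
#include <stdlib.h>

struct two_bufs { void *smb; void *strings; };

static int alloc_two(struct two_bufs *b)
{
	b->smb = malloc(256);		/* stands in for the small SMB buf */
	if (!b->smb)
		return -1;

	b->strings = malloc(2000);	/* user/domain/NOS string area */
	if (!b->strings)
		goto out_free_smb;
	return 0;

out_free_smb:
	free(b->smb);
	b->smb = NULL;
	return -1;
}

int main(void)
{
	struct two_bufs b;

	if (alloc_two(&b) == 0) {
		free(b.strings);
		free(b.smb);
	}
	return 0;
}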
but what if server + * changed to do higher than lanman dialect and + * we reconnected would we ever calc signing_key? + */ - pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE; + cifs_dbg(FYI, "Negotiating LANMAN setting up strings\n"); + /* Unicode not allowed for LANMAN dialects */ + ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp); - /* no capabilities flags in old lanman negotiation */ + sess_data->iov[2].iov_len = (long) bcc_ptr - + (long) sess_data->iov[2].iov_base; + + rc = sess_sendreceive(sess_data); + if (rc) + goto out; - pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE); + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base; - /* Calculate hash with password and copy into bcc_ptr. - * Encryption Key (stored as in cryptkey) gets used if the - * security mode bit in Negottiate Protocol response states - * to use challenge/response method (i.e. Password bit is 1). - */ + /* lanman response has a word count of 3 */ + if (smb_buf->WordCount != 3) { + rc = -EIO; + cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount); + goto out; + } - rc = calc_lanman_hash(ses->password, ses->server->cryptkey, - ses->server->sec_mode & SECMODE_PW_ENCRYPT ? - true : false, lnm_session_key); + if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN) + cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */ - memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE); - bcc_ptr += CIFS_AUTH_RESP_SIZE; + ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */ + cifs_dbg(FYI, "UID = %llu\n", ses->Suid); - /* can not sign if LANMAN negotiated so no need - to calculate signing key? but what if server - changed to do higher than lanman dialect and - we reconnected would we ever calc signing_key? 
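/*
 * The response parsers in this patch repeatedly apply one alignment
 * rule before decoding UTF-16 strings: the string area must start at an
 * even offset from the frame start, so an odd bcc pointer is advanced
 * past one pad byte.  A standalone sketch of just that rule:
 */
#include <assert.h>
#include <stdint.h>

static const uint8_t *align_string_area(const uint8_t *frame,
					const uint8_t *bcc,
					uint16_t *bytes_remaining)
{
	if (((uintptr_t)bcc - (uintptr_t)frame) % 2) {
		++bcc;			/* skip the pad byte */
		--*bytes_remaining;	/* the pad byte is consumed */
	}
	return bcc;
}

int main(void)
{
	uint8_t frame[64];
	uint16_t rem = 10;

	/* odd offset 33 is bumped to 34 and one byte is charged */
	assert(align_string_area(frame, frame + 33, &rem) == frame + 34);
	assert(rem == 9);
	return 0;
}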
*/ + bytes_remaining = get_bcc(smb_buf); + bcc_ptr = pByteArea(smb_buf); + + /* BB check if Unicode and decode strings */ + if (bytes_remaining == 0) { + /* no string area to decode, do nothing */ + } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { + /* unicode string area must be word-aligned */ + if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { + ++bcc_ptr; + --bytes_remaining; + } + decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } else { + decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } + + rc = sess_establish_session(sess_data); +out: + sess_data->result = rc; + sess_data->func = NULL; + sess_free_buffer(sess_data); +} - cifs_dbg(FYI, "Negotiating LANMAN setting up strings\n"); - /* Unicode not allowed for LANMAN dialects */ - ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); #endif - } else if (type == NTLM) { - pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); - pSMB->req_no_secext.CaseInsensitivePasswordLength = + +static void +sess_auth_ntlm(struct sess_data *sess_data) +{ + int rc = 0; + struct smb_hdr *smb_buf; + SESSION_SETUP_ANDX *pSMB; + char *bcc_ptr; + struct cifs_ses *ses = sess_data->ses; + __u32 capabilities; + __u16 bytes_remaining; + + /* old style NTLM sessionsetup */ + /* wct = 13 */ + rc = sess_alloc_buffer(sess_data, 13); + if (rc) + goto out; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + bcc_ptr = sess_data->iov[2].iov_base; + capabilities = cifs_ssetup_hdr(ses, pSMB); + + pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); + pSMB->req_no_secext.CaseInsensitivePasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE); - pSMB->req_no_secext.CaseSensitivePasswordLength = + pSMB->req_no_secext.CaseSensitivePasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE); - /* calculate ntlm response and session key */ - rc = setup_ntlm_response(ses, nls_cp); - if (rc) { - cifs_dbg(VFS, "Error %d during NTLM authentication\n", + /* calculate ntlm response and session key */ + rc = setup_ntlm_response(ses, sess_data->nls_cp); + if (rc) { + cifs_dbg(VFS, "Error %d during NTLM authentication\n", rc); - goto ssetup_exit; - } + goto out; + } - /* copy ntlm response */ - memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, - CIFS_AUTH_RESP_SIZE); - bcc_ptr += CIFS_AUTH_RESP_SIZE; - memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, - CIFS_AUTH_RESP_SIZE); - bcc_ptr += CIFS_AUTH_RESP_SIZE; - - if (ses->capabilities & CAP_UNICODE) { - /* unicode strings must be word aligned */ - if (iov[0].iov_len % 2) { - *bcc_ptr = 0; - bcc_ptr++; - } - unicode_ssetup_strings(&bcc_ptr, ses, nls_cp); - } else - ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); - } else if (type == NTLMv2) { - pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); - - /* LM2 password would be here if we supported it */ - pSMB->req_no_secext.CaseInsensitivePasswordLength = 0; - - /* calculate nlmv2 response and session key */ - rc = setup_ntlmv2_rsp(ses, nls_cp); - if (rc) { - cifs_dbg(VFS, "Error %d during NTLMv2 authentication\n", - rc); - goto ssetup_exit; + /* copy ntlm response */ + memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, + CIFS_AUTH_RESP_SIZE); + bcc_ptr += CIFS_AUTH_RESP_SIZE; + memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, + CIFS_AUTH_RESP_SIZE); + bcc_ptr += CIFS_AUTH_RESP_SIZE; + + if (ses->capabilities & CAP_UNICODE) { + /* unicode strings must be word aligned */ + if (sess_data->iov[0].iov_len % 2) { + *bcc_ptr = 0; + bcc_ptr++; } - memcpy(bcc_ptr, 
ses->auth_key.response + CIFS_SESS_KEY_SIZE, - ses->auth_key.len - CIFS_SESS_KEY_SIZE); - bcc_ptr += ses->auth_key.len - CIFS_SESS_KEY_SIZE; - - /* set case sensitive password length after tilen may get - * assigned, tilen is 0 otherwise. - */ - pSMB->req_no_secext.CaseSensitivePasswordLength = - cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE); + unicode_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp); + } else { + ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp); + } - if (ses->capabilities & CAP_UNICODE) { - if (iov[0].iov_len % 2) { - *bcc_ptr = 0; - bcc_ptr++; - } - unicode_ssetup_strings(&bcc_ptr, ses, nls_cp); - } else - ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); - } else if (type == Kerberos) { -#ifdef CONFIG_CIFS_UPCALL - struct cifs_spnego_msg *msg; - spnego_key = cifs_get_spnego_key(ses); - if (IS_ERR(spnego_key)) { - rc = PTR_ERR(spnego_key); - spnego_key = NULL; - goto ssetup_exit; - } + sess_data->iov[2].iov_len = (long) bcc_ptr - + (long) sess_data->iov[2].iov_base; - msg = spnego_key->payload.data; - /* check version field to make sure that cifs.upcall is - sending us a response in an expected form */ - if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) { - cifs_dbg(VFS, "incorrect version of cifs.upcall " - "expected %d but got %d)", - CIFS_SPNEGO_UPCALL_VERSION, msg->version); - rc = -EKEYREJECTED; - goto ssetup_exit; - } + rc = sess_sendreceive(sess_data); + if (rc) + goto out; - ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len, - GFP_KERNEL); - if (!ses->auth_key.response) { - cifs_dbg(VFS, - "Kerberos can't allocate (%u bytes) memory", - msg->sesskey_len); - rc = -ENOMEM; - goto ssetup_exit; - } - ses->auth_key.len = msg->sesskey_len; - - pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; - capabilities |= CAP_EXTENDED_SECURITY; - pSMB->req.Capabilities = cpu_to_le32(capabilities); - iov[1].iov_base = msg->data + msg->sesskey_len; - iov[1].iov_len = msg->secblob_len; - pSMB->req.SecurityBlobLength = cpu_to_le16(iov[1].iov_len); - - if (ses->capabilities & CAP_UNICODE) { - /* unicode strings must be word aligned */ - if ((iov[0].iov_len + iov[1].iov_len) % 2) { - *bcc_ptr = 0; - bcc_ptr++; - } - unicode_oslm_strings(&bcc_ptr, nls_cp); - unicode_domain_string(&bcc_ptr, ses, nls_cp); - } else - /* BB: is this right? */ - ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); -#else /* ! CONFIG_CIFS_UPCALL */ - cifs_dbg(VFS, "Kerberos negotiated but upcall support disabled!\n"); - rc = -ENOSYS; - goto ssetup_exit; -#endif /* CONFIG_CIFS_UPCALL */ - } else if (type == RawNTLMSSP) { - if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) { - cifs_dbg(VFS, "NTLMSSP requires Unicode support\n"); - rc = -ENOSYS; - goto ssetup_exit; - } + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base; - cifs_dbg(FYI, "ntlmssp session setup phase %d\n", phase); - pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; - capabilities |= CAP_EXTENDED_SECURITY; - pSMB->req.Capabilities |= cpu_to_le32(capabilities); - switch(phase) { - case NtLmNegotiate: - build_ntlmssp_negotiate_blob( - pSMB->req.SecurityBlob, ses); - iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE); - iov[1].iov_base = pSMB->req.SecurityBlob; - pSMB->req.SecurityBlobLength = - cpu_to_le16(sizeof(NEGOTIATE_MESSAGE)); - break; - case NtLmAuthenticate: - /* - * 5 is an empirical value, large enough to hold - * authenticate message plus max 10 of av paris, - * domain, user, workstation names, flags, etc. 
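/*
 * A sketch of the defensive pattern in the Kerberos path above: the
 * SPNEGO response comes from a userspace helper (cifs.upcall), so its
 * version field is checked before the session key is copied out.  The
 * struct layout and version number here are illustrative, not the real
 * cifs_spnego_msg definition.
 */
#include <stdlib.h>
#include <string.h>

#define UPCALL_VERSION 2	/* assumed protocol version */

struct upcall_msg {
	unsigned int version;
	unsigned int sesskey_len;
	unsigned char data[];	/* session key, then security blob */
};

/* returns a caller-owned copy of the session key, or NULL if rejected */
static unsigned char *extract_session_key(const struct upcall_msg *msg)
{
	unsigned char *key;

	if (msg->version != UPCALL_VERSION)
		return NULL;		/* kernel returns -EKEYREJECTED */

	key = malloc(msg->sesskey_len);	/* kmemdup() equivalent */
	if (key)
		memcpy(key, msg->data, msg->sesskey_len);
	return key;
}

int main(void)
{
	struct upcall_msg *m = calloc(1, sizeof(*m) + 8);

	if (!m)
		return 1;
	m->version = UPCALL_VERSION;
	m->sesskey_len = 8;
	free(extract_session_key(m));
	free(m);
	return 0;
}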
- */ - ntlmsspblob = kzalloc( - 5*sizeof(struct _AUTHENTICATE_MESSAGE), - GFP_KERNEL); - if (!ntlmsspblob) { - rc = -ENOMEM; - goto ssetup_exit; - } + if (smb_buf->WordCount != 3) { + rc = -EIO; + cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount); + goto out; + } - rc = build_ntlmssp_auth_blob(ntlmsspblob, - &blob_len, ses, nls_cp); - if (rc) - goto ssetup_exit; - iov[1].iov_len = blob_len; - iov[1].iov_base = ntlmsspblob; - pSMB->req.SecurityBlobLength = cpu_to_le16(blob_len); - /* - * Make sure that we tell the server that we are using - * the uid that it just gave us back on the response - * (challenge) - */ - smb_buf->Uid = ses->Suid; - break; - default: - cifs_dbg(VFS, "invalid phase %d\n", phase); - rc = -ENOSYS; - goto ssetup_exit; + if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN) + cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */ + + ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */ + cifs_dbg(FYI, "UID = %llu\n", ses->Suid); + + bytes_remaining = get_bcc(smb_buf); + bcc_ptr = pByteArea(smb_buf); + + /* BB check if Unicode and decode strings */ + if (bytes_remaining == 0) { + /* no string area to decode, do nothing */ + } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { + /* unicode string area must be word-aligned */ + if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { + ++bcc_ptr; + --bytes_remaining; } - /* unicode strings must be word aligned */ - if ((iov[0].iov_len + iov[1].iov_len) % 2) { + decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } else { + decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } + + rc = sess_establish_session(sess_data); +out: + sess_data->result = rc; + sess_data->func = NULL; + sess_free_buffer(sess_data); + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; +} + +static void +sess_auth_ntlmv2(struct sess_data *sess_data) +{ + int rc = 0; + struct smb_hdr *smb_buf; + SESSION_SETUP_ANDX *pSMB; + char *bcc_ptr; + struct cifs_ses *ses = sess_data->ses; + __u32 capabilities; + __u16 bytes_remaining; + + /* old style NTLM sessionsetup */ + /* wct = 13 */ + rc = sess_alloc_buffer(sess_data, 13); + if (rc) + goto out; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + bcc_ptr = sess_data->iov[2].iov_base; + capabilities = cifs_ssetup_hdr(ses, pSMB); + + pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); + + /* LM2 password would be here if we supported it */ + pSMB->req_no_secext.CaseInsensitivePasswordLength = 0; + + /* calculate nlmv2 response and session key */ + rc = setup_ntlmv2_rsp(ses, sess_data->nls_cp); + if (rc) { + cifs_dbg(VFS, "Error %d during NTLMv2 authentication\n", rc); + goto out; + } + + memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, + ses->auth_key.len - CIFS_SESS_KEY_SIZE); + bcc_ptr += ses->auth_key.len - CIFS_SESS_KEY_SIZE; + + /* set case sensitive password length after tilen may get + * assigned, tilen is 0 otherwise. 
+ */ + pSMB->req_no_secext.CaseSensitivePasswordLength = + cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE); + + if (ses->capabilities & CAP_UNICODE) { + if (sess_data->iov[0].iov_len % 2) { *bcc_ptr = 0; bcc_ptr++; } - unicode_oslm_strings(&bcc_ptr, nls_cp); + unicode_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp); } else { - cifs_dbg(VFS, "secType %d not supported!\n", type); - rc = -ENOSYS; - goto ssetup_exit; + ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp); } - iov[2].iov_base = str_area; - iov[2].iov_len = (long) bcc_ptr - (long) str_area; - count = iov[1].iov_len + iov[2].iov_len; - smb_buf->smb_buf_length = - cpu_to_be32(be32_to_cpu(smb_buf->smb_buf_length) + count); + sess_data->iov[2].iov_len = (long) bcc_ptr - + (long) sess_data->iov[2].iov_base; - put_bcc(count, smb_buf); + rc = sess_sendreceive(sess_data); + if (rc) + goto out; - rc = SendReceive2(xid, ses, iov, 3 /* num_iovecs */, &resp_buf_type, - CIFS_LOG_ERROR); - /* SMB request buf freed in SendReceive2 */ + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base; - pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base; - smb_buf = (struct smb_hdr *)iov[0].iov_base; + if (smb_buf->WordCount != 3) { + rc = -EIO; + cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount); + goto out; + } - if ((type == RawNTLMSSP) && (resp_buf_type != CIFS_NO_BUFFER) && - (smb_buf->Status.CifsError == - cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))) { - if (phase != NtLmNegotiate) { - cifs_dbg(VFS, "Unexpected more processing error\n"); - goto ssetup_exit; + if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN) + cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */ + + ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */ + cifs_dbg(FYI, "UID = %llu\n", ses->Suid); + + bytes_remaining = get_bcc(smb_buf); + bcc_ptr = pByteArea(smb_buf); + + /* BB check if Unicode and decode strings */ + if (bytes_remaining == 0) { + /* no string area to decode, do nothing */ + } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { + /* unicode string area must be word-aligned */ + if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { + ++bcc_ptr; + --bytes_remaining; + } + decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } else { + decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } + + rc = sess_establish_session(sess_data); +out: + sess_data->result = rc; + sess_data->func = NULL; + sess_free_buffer(sess_data); + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; +} + +#ifdef CONFIG_CIFS_UPCALL +static void +sess_auth_kerberos(struct sess_data *sess_data) +{ + int rc = 0; + struct smb_hdr *smb_buf; + SESSION_SETUP_ANDX *pSMB; + char *bcc_ptr; + struct cifs_ses *ses = sess_data->ses; + __u32 capabilities; + __u16 bytes_remaining; + struct key *spnego_key = NULL; + struct cifs_spnego_msg *msg; + u16 blob_len; + + /* extended security */ + /* wct = 12 */ + rc = sess_alloc_buffer(sess_data, 12); + if (rc) + goto out; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + bcc_ptr = sess_data->iov[2].iov_base; + capabilities = cifs_ssetup_hdr(ses, pSMB); + + spnego_key = cifs_get_spnego_key(ses); + if (IS_ERR(spnego_key)) { + rc = PTR_ERR(spnego_key); + spnego_key = NULL; + goto out; + } + + msg = spnego_key->payload.data; + /* + * check version field to make sure that cifs.upcall is + * sending us a response in an expected form + */ + if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) { + cifs_dbg(VFS, + 
"incorrect version of cifs.upcall (expected %d but got %d)", + CIFS_SPNEGO_UPCALL_VERSION, msg->version); + rc = -EKEYREJECTED; + goto out_put_spnego_key; + } + + ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len, + GFP_KERNEL); + if (!ses->auth_key.response) { + cifs_dbg(VFS, "Kerberos can't allocate (%u bytes) memory", + msg->sesskey_len); + rc = -ENOMEM; + goto out_put_spnego_key; + } + ses->auth_key.len = msg->sesskey_len; + + pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; + capabilities |= CAP_EXTENDED_SECURITY; + pSMB->req.Capabilities = cpu_to_le32(capabilities); + sess_data->iov[1].iov_base = msg->data + msg->sesskey_len; + sess_data->iov[1].iov_len = msg->secblob_len; + pSMB->req.SecurityBlobLength = cpu_to_le16(sess_data->iov[1].iov_len); + + if (ses->capabilities & CAP_UNICODE) { + /* unicode strings must be word aligned */ + if ((sess_data->iov[0].iov_len + + sess_data->iov[1].iov_len) % 2) { + *bcc_ptr = 0; + bcc_ptr++; } - /* NTLMSSP Negotiate sent now processing challenge (response) */ - phase = NtLmChallenge; /* process ntlmssp challenge */ - rc = 0; /* MORE_PROC rc is not an error here, but expected */ + unicode_oslm_strings(&bcc_ptr, sess_data->nls_cp); + unicode_domain_string(&bcc_ptr, ses, sess_data->nls_cp); + } else { + /* BB: is this right? */ + ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp); } + + sess_data->iov[2].iov_len = (long) bcc_ptr - + (long) sess_data->iov[2].iov_base; + + rc = sess_sendreceive(sess_data); if (rc) - goto ssetup_exit; + goto out_put_spnego_key; - if ((smb_buf->WordCount != 3) && (smb_buf->WordCount != 4)) { + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base; + + if (smb_buf->WordCount != 4) { rc = -EIO; cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount); - goto ssetup_exit; + goto out_put_spnego_key; } - action = le16_to_cpu(pSMB->resp.Action); - if (action & GUEST_LOGIN) + + if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN) cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? 
*/ + ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */ cifs_dbg(FYI, "UID = %llu\n", ses->Suid); - /* response can have either 3 or 4 word count - Samba sends 3 */ - /* and lanman response is 3 */ + bytes_remaining = get_bcc(smb_buf); bcc_ptr = pByteArea(smb_buf); - if (smb_buf->WordCount == 4) { - blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength); - if (blob_len > bytes_remaining) { - cifs_dbg(VFS, "bad security blob length %d\n", - blob_len); - rc = -EINVAL; - goto ssetup_exit; - } - if (phase == NtLmChallenge) { - rc = decode_ntlmssp_challenge(bcc_ptr, blob_len, ses); - /* now goto beginning for ntlmssp authenticate phase */ - if (rc) - goto ssetup_exit; - } - bcc_ptr += blob_len; - bytes_remaining -= blob_len; + blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength); + if (blob_len > bytes_remaining) { + cifs_dbg(VFS, "bad security blob length %d\n", + blob_len); + rc = -EINVAL; + goto out_put_spnego_key; } + bcc_ptr += blob_len; + bytes_remaining -= blob_len; /* BB check if Unicode and decode strings */ if (bytes_remaining == 0) { @@ -906,60 +1076,362 @@ ssetup_ntlmssp_authenticate: ++bcc_ptr; --bytes_remaining; } - decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp); + decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); } else { - decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp); + decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); } -ssetup_exit: - if (spnego_key) { - key_invalidate(spnego_key); - key_put(spnego_key); + rc = sess_establish_session(sess_data); +out_put_spnego_key: + key_invalidate(spnego_key); + key_put(spnego_key); +out: + sess_data->result = rc; + sess_data->func = NULL; + sess_free_buffer(sess_data); + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; +} + +#endif /* ! CONFIG_CIFS_UPCALL */ + +/* + * The required kvec buffers have to be allocated before calling this + * function. 
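/*
 * Every response parser in this patch validates the advertised
 * SecurityBlobLength against the bytes actually left in the bcc area
 * before walking past the blob.  The check, isolated:
 */
#include <assert.h>
#include <errno.h>
#include <stdint.h>

static int check_blob_len(uint16_t blob_len, uint16_t bytes_remaining)
{
	if (blob_len > bytes_remaining)
		return -EINVAL;		/* claimed blob overruns the frame */
	return 0;
}

int main(void)
{
	assert(check_blob_len(16, 32) == 0);
	assert(check_blob_len(64, 32) == -EINVAL);
	return 0;
}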
+ */ +static int +_sess_auth_rawntlmssp_assemble_req(struct sess_data *sess_data) +{ + struct smb_hdr *smb_buf; + SESSION_SETUP_ANDX *pSMB; + struct cifs_ses *ses = sess_data->ses; + __u32 capabilities; + char *bcc_ptr; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)pSMB; + + capabilities = cifs_ssetup_hdr(ses, pSMB); + if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) { + cifs_dbg(VFS, "NTLMSSP requires Unicode support\n"); + return -ENOSYS; } - kfree(str_area); - kfree(ntlmsspblob); - ntlmsspblob = NULL; - if (resp_buf_type == CIFS_SMALL_BUFFER) { - cifs_dbg(FYI, "ssetup freeing small buf %p\n", iov[0].iov_base); - cifs_small_buf_release(iov[0].iov_base); - } else if (resp_buf_type == CIFS_LARGE_BUFFER) - cifs_buf_release(iov[0].iov_base); - /* if ntlmssp, and negotiate succeeded, proceed to authenticate phase */ - if ((phase == NtLmChallenge) && (rc == 0)) - goto ssetup_ntlmssp_authenticate; + pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; + capabilities |= CAP_EXTENDED_SECURITY; + pSMB->req.Capabilities |= cpu_to_le32(capabilities); + + bcc_ptr = sess_data->iov[2].iov_base; + /* unicode strings must be word aligned */ + if ((sess_data->iov[0].iov_len + sess_data->iov[1].iov_len) % 2) { + *bcc_ptr = 0; + bcc_ptr++; + } + unicode_oslm_strings(&bcc_ptr, sess_data->nls_cp); + + sess_data->iov[2].iov_len = (long) bcc_ptr - + (long) sess_data->iov[2].iov_base; + + return 0; +} + +static void +sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data); + +static void +sess_auth_rawntlmssp_negotiate(struct sess_data *sess_data) +{ + int rc; + struct smb_hdr *smb_buf; + SESSION_SETUP_ANDX *pSMB; + struct cifs_ses *ses = sess_data->ses; + __u16 bytes_remaining; + char *bcc_ptr; + u16 blob_len; + + cifs_dbg(FYI, "rawntlmssp session setup negotiate phase\n"); + + /* + * if memory allocation is successful, caller of this function + * frees it. 
+ */ + ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL); + if (!ses->ntlmssp) { + rc = -ENOMEM; + goto out; + } + ses->ntlmssp->sesskey_per_smbsess = false; + + /* wct = 12 */ + rc = sess_alloc_buffer(sess_data, 12); + if (rc) + goto out; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + + /* Build security blob before we assemble the request */ + build_ntlmssp_negotiate_blob(pSMB->req.SecurityBlob, ses); + sess_data->iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE); + sess_data->iov[1].iov_base = pSMB->req.SecurityBlob; + pSMB->req.SecurityBlobLength = cpu_to_le16(sizeof(NEGOTIATE_MESSAGE)); + + rc = _sess_auth_rawntlmssp_assemble_req(sess_data); + if (rc) + goto out; + + rc = sess_sendreceive(sess_data); + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base; + + /* If true, rc here is expected and not an error */ + if (sess_data->buf0_type != CIFS_NO_BUFFER && + smb_buf->Status.CifsError == + cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED)) + rc = 0; + + if (rc) + goto out; + + cifs_dbg(FYI, "rawntlmssp session setup challenge phase\n"); + + if (smb_buf->WordCount != 4) { + rc = -EIO; + cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount); + goto out; + } + + ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */ + cifs_dbg(FYI, "UID = %llu\n", ses->Suid); + + bytes_remaining = get_bcc(smb_buf); + bcc_ptr = pByteArea(smb_buf); + + blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength); + if (blob_len > bytes_remaining) { + cifs_dbg(VFS, "bad security blob length %d\n", + blob_len); + rc = -EINVAL; + goto out; + } + + rc = decode_ntlmssp_challenge(bcc_ptr, blob_len, ses); +out: + sess_free_buffer(sess_data); if (!rc) { - mutex_lock(&ses->server->srv_mutex); - if (!ses->server->session_estab) { - if (ses->server->sign) { - ses->server->session_key.response = - kmemdup(ses->auth_key.response, - ses->auth_key.len, GFP_KERNEL); - if (!ses->server->session_key.response) { - rc = -ENOMEM; - mutex_unlock(&ses->server->srv_mutex); - goto keycp_exit; - } - ses->server->session_key.len = - ses->auth_key.len; - } - ses->server->sequence_number = 0x2; - ses->server->session_estab = true; - } - mutex_unlock(&ses->server->srv_mutex); + sess_data->func = sess_auth_rawntlmssp_authenticate; + return; + } + + /* Else error. Cleanup */ + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; + kfree(ses->ntlmssp); + ses->ntlmssp = NULL; + + sess_data->func = NULL; + sess_data->result = rc; +} + +static void +sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data) +{ + int rc; + struct smb_hdr *smb_buf; + SESSION_SETUP_ANDX *pSMB; + struct cifs_ses *ses = sess_data->ses; + __u16 bytes_remaining; + char *bcc_ptr; + char *ntlmsspblob = NULL; + u16 blob_len; + + cifs_dbg(FYI, "rawntlmssp session setup authenticate phase\n"); - cifs_dbg(FYI, "CIFS session established successfully\n"); - spin_lock(&GlobalMid_Lock); - ses->status = CifsGood; - ses->need_reconnect = false; - spin_unlock(&GlobalMid_Lock); + /* wct = 12 */ + rc = sess_alloc_buffer(sess_data, 12); + if (rc) + goto out; + + /* Build security blob before we assemble the request */ + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)pSMB; + /* + * 5 is an empirical value, large enough to hold + * authenticate message plus max 10 of av paris, + * domain, user, workstation names, flags, etc. 
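/*
 * The refactor in this file replaces one monolithic CIFS_SessSetup()
 * with a small state machine: each stage stores the next stage in
 * sess_data->func (as the negotiate stage does just above), and the
 * driver loop in CIFS_SessSetup() later in the patch runs stages until
 * one clears func.  A freestanding model of that control flow:
 */
#include <stdio.h>

struct sess_state {
	void (*func)(struct sess_state *);
	int result;
};

static void stage_authenticate(struct sess_state *s)
{
	puts("authenticate phase");
	s->result = 0;
	s->func = NULL;			/* terminal stage stops the loop */
}

static void stage_negotiate(struct sess_state *s)
{
	puts("negotiate phase");
	s->func = stage_authenticate;	/* chain to the next phase */
}

int main(void)
{
	struct sess_state s = { .func = stage_negotiate, .result = -1 };

	while (s.func)			/* mirrors while (sess_data->func) */
		s.func(&s);
	return s.result;
}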
+ */ + ntlmsspblob = kzalloc(5*sizeof(struct _AUTHENTICATE_MESSAGE), + GFP_KERNEL); + if (!ntlmsspblob) { + rc = -ENOMEM; + goto out; } -keycp_exit: + rc = build_ntlmssp_auth_blob(ntlmsspblob, + &blob_len, ses, sess_data->nls_cp); + if (rc) + goto out_free_ntlmsspblob; + sess_data->iov[1].iov_len = blob_len; + sess_data->iov[1].iov_base = ntlmsspblob; + pSMB->req.SecurityBlobLength = cpu_to_le16(blob_len); + /* + * Make sure that we tell the server that we are using + * the uid that it just gave us back on the response + * (challenge) + */ + smb_buf->Uid = ses->Suid; + + rc = _sess_auth_rawntlmssp_assemble_req(sess_data); + if (rc) + goto out_free_ntlmsspblob; + + rc = sess_sendreceive(sess_data); + if (rc) + goto out_free_ntlmsspblob; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base; + if (smb_buf->WordCount != 4) { + rc = -EIO; + cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount); + goto out_free_ntlmsspblob; + } + + if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN) + cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */ + + bytes_remaining = get_bcc(smb_buf); + bcc_ptr = pByteArea(smb_buf); + blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength); + if (blob_len > bytes_remaining) { + cifs_dbg(VFS, "bad security blob length %d\n", + blob_len); + rc = -EINVAL; + goto out_free_ntlmsspblob; + } + bcc_ptr += blob_len; + bytes_remaining -= blob_len; + + + /* BB check if Unicode and decode strings */ + if (bytes_remaining == 0) { + /* no string area to decode, do nothing */ + } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { + /* unicode string area must be word-aligned */ + if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { + ++bcc_ptr; + --bytes_remaining; + } + decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } else { + decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } + +out_free_ntlmsspblob: + kfree(ntlmsspblob); +out: + sess_free_buffer(sess_data); + + if (!rc) + rc = sess_establish_session(sess_data); + + /* Cleanup */ kfree(ses->auth_key.response); ses->auth_key.response = NULL; kfree(ses->ntlmssp); + ses->ntlmssp = NULL; + + sess_data->func = NULL; + sess_data->result = rc; +} + +static int select_sec(struct cifs_ses *ses, struct sess_data *sess_data) +{ + int type; + + type = select_sectype(ses->server, ses->sectype); + cifs_dbg(FYI, "sess setup type %d\n", type); + if (type == Unspecified) { + cifs_dbg(VFS, + "Unable to select appropriate authentication method!"); + return -EINVAL; + } + + switch (type) { + case LANMAN: + /* LANMAN and plaintext are less secure and off by default. + * So we make this explicitly be turned on in kconfig (in the + * build) and turned on at runtime (changed from the default) + * in proc/fs/cifs or via mount parm. Unfortunately this is + * needed for old Win (e.g. 
Win95), some obscure NAS and OS/2 */ +#ifdef CONFIG_CIFS_WEAK_PW_HASH + sess_data->func = sess_auth_lanman; + break; +#else + return -EOPNOTSUPP; +#endif + case NTLM: + sess_data->func = sess_auth_ntlm; + break; + case NTLMv2: + sess_data->func = sess_auth_ntlmv2; + break; + case Kerberos: +#ifdef CONFIG_CIFS_UPCALL + sess_data->func = sess_auth_kerberos; + break; +#else + cifs_dbg(VFS, "Kerberos negotiated but upcall support disabled!\n"); + return -ENOSYS; + break; +#endif /* CONFIG_CIFS_UPCALL */ + case RawNTLMSSP: + sess_data->func = sess_auth_rawntlmssp_negotiate; + break; + default: + cifs_dbg(VFS, "secType %d not supported!\n", type); + return -ENOSYS; + } + + return 0; +} + +int CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses, + const struct nls_table *nls_cp) +{ + int rc = 0; + struct sess_data *sess_data; + + if (ses == NULL) { + WARN(1, "%s: ses == NULL!", __func__); + return -EINVAL; + } + + sess_data = kzalloc(sizeof(struct sess_data), GFP_KERNEL); + if (!sess_data) + return -ENOMEM; + + rc = select_sec(ses, sess_data); + if (rc) + goto out; + + sess_data->xid = xid; + sess_data->ses = ses; + sess_data->buf0_type = CIFS_NO_BUFFER; + sess_data->nls_cp = (struct nls_table *) nls_cp; + + while (sess_data->func) + sess_data->func(sess_data); + + /* Store result before we free sess_data */ + rc = sess_data->result; +out: + kfree(sess_data); return rc; } diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index d1fdfa848703..52131d8cb4d5 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -586,7 +586,7 @@ cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, tmprc = CIFS_open(xid, &oparms, &oplock, NULL); if (tmprc == -EOPNOTSUPP) *symlink = true; - else + else if (tmprc == 0) CIFSSMBClose(xid, tcon, fid.netfid); } @@ -1009,6 +1009,18 @@ cifs_is_read_op(__u32 oplock) return oplock == OPLOCK_READ; } +static unsigned int +cifs_wp_retry_size(struct inode *inode) +{ + return CIFS_SB(inode->i_sb)->wsize; +} + +static bool +cifs_dir_needs_close(struct cifsFileInfo *cfile) +{ + return !cfile->srch_inf.endOfSearch && !cfile->invalidHandle; +} + struct smb_version_operations smb1_operations = { .send_cancel = send_nt_cancel, .compare_fids = cifs_compare_fids, @@ -1019,6 +1031,7 @@ struct smb_version_operations smb1_operations = { .set_credits = cifs_set_credits, .get_credits_field = cifs_get_credits_field, .get_credits = cifs_get_credits, + .wait_mtu_credits = cifs_wait_mtu_credits, .get_next_mid = cifs_get_next_mid, .read_data_offset = cifs_read_data_offset, .read_data_length = cifs_read_data_length, @@ -1078,6 +1091,8 @@ struct smb_version_operations smb1_operations = { .query_mf_symlink = cifs_query_mf_symlink, .create_mf_symlink = cifs_create_mf_symlink, .is_read_op = cifs_is_read_op, + .wp_retry_size = cifs_wp_retry_size, + .dir_needs_close = cifs_dir_needs_close, #ifdef CONFIG_CIFS_XATTR .query_all_EAs = CIFSSMBQAllEAs, .set_EA = CIFSSMBSetEA, diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index 3f17b4550831..45992944e238 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -50,7 +50,7 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, goto out; } - smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + MAX_NAME * 2, + smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2, GFP_KERNEL); if (smb2_data == NULL) { rc = -ENOMEM; diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 84c012a6aba0..899bbc86f73e 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -91,7 +91,7 @@ 
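/*
 * The dir_needs_close() hook added above shows the dispatch style used
 * throughout this patch: per-dialect behaviour lives in a table of
 * function pointers, so a new rule means one new field plus one small
 * implementation per protocol version.  A reduced model with the two
 * rules visible in the diff (SMB1 also checks end-of-search):
 */
#include <stdbool.h>
#include <stdio.h>

struct dir_handle { bool invalid_handle; bool end_of_search; };

struct version_ops {
	const char *name;
	bool (*dir_needs_close)(const struct dir_handle *);
};

static bool smb1_dir_needs_close(const struct dir_handle *h)
{
	return !h->end_of_search && !h->invalid_handle;
}

static bool smb2_dir_needs_close(const struct dir_handle *h)
{
	return !h->invalid_handle;
}

static const struct version_ops ops[] = {
	{ "smb1", smb1_dir_needs_close },
	{ "smb2", smb2_dir_needs_close },
};

int main(void)
{
	struct dir_handle h = { .invalid_handle = false,
				.end_of_search = true };

	for (unsigned int i = 0; i < 2; i++)
		printf("%s: needs close = %d\n", ops[i].name,
		       ops[i].dir_needs_close(&h));
	return 0;
}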
smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon, case SMB2_OP_SET_EOF: tmprc = SMB2_set_eof(xid, tcon, fid.persistent_fid, fid.volatile_fid, current->tgid, - (__le64 *)data); + (__le64 *)data, false); break; case SMB2_OP_SET_INFO: tmprc = SMB2_set_info(xid, tcon, fid.persistent_fid, @@ -131,7 +131,7 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, *adjust_tz = false; *symlink = false; - smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + MAX_NAME * 2, + smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2, GFP_KERNEL); if (smb2_data == NULL) return -ENOMEM; diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c index 94bd4fbb13d3..8257a5a97cc0 100644 --- a/fs/cifs/smb2maperror.c +++ b/fs/cifs/smb2maperror.c @@ -214,7 +214,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = { {STATUS_BREAKPOINT, -EIO, "STATUS_BREAKPOINT"}, {STATUS_SINGLE_STEP, -EIO, "STATUS_SINGLE_STEP"}, {STATUS_BUFFER_OVERFLOW, -EIO, "STATUS_BUFFER_OVERFLOW"}, - {STATUS_NO_MORE_FILES, -EIO, "STATUS_NO_MORE_FILES"}, + {STATUS_NO_MORE_FILES, -ENODATA, "STATUS_NO_MORE_FILES"}, {STATUS_WAKE_SYSTEM_DEBUGGER, -EIO, "STATUS_WAKE_SYSTEM_DEBUGGER"}, {STATUS_HANDLES_CLOSED, -EIO, "STATUS_HANDLES_CLOSED"}, {STATUS_NO_INHERITANCE, -EIO, "STATUS_NO_INHERITANCE"}, @@ -256,6 +256,8 @@ static const struct status_to_posix_error smb2_error_map_table[] = { {STATUS_DLL_MIGHT_BE_INCOMPATIBLE, -EIO, "STATUS_DLL_MIGHT_BE_INCOMPATIBLE"}, {STATUS_STOPPED_ON_SYMLINK, -EOPNOTSUPP, "STATUS_STOPPED_ON_SYMLINK"}, + {STATUS_IO_REPARSE_TAG_NOT_HANDLED, -EOPNOTSUPP, + "STATUS_REPARSE_NOT_HANDLED"}, {STATUS_DEVICE_REQUIRES_CLEANING, -EIO, "STATUS_DEVICE_REQUIRES_CLEANING"}, {STATUS_DEVICE_DOOR_OPEN, -EIO, "STATUS_DEVICE_DOOR_OPEN"}, @@ -298,7 +300,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = { {STATUS_INVALID_PARAMETER, -EINVAL, "STATUS_INVALID_PARAMETER"}, {STATUS_NO_SUCH_DEVICE, -ENODEV, "STATUS_NO_SUCH_DEVICE"}, {STATUS_NO_SUCH_FILE, -ENOENT, "STATUS_NO_SUCH_FILE"}, - {STATUS_INVALID_DEVICE_REQUEST, -EIO, "STATUS_INVALID_DEVICE_REQUEST"}, + {STATUS_INVALID_DEVICE_REQUEST, -EOPNOTSUPP, "STATUS_INVALID_DEVICE_REQUEST"}, {STATUS_END_OF_FILE, -ENODATA, "STATUS_END_OF_FILE"}, {STATUS_WRONG_VOLUME, -EIO, "STATUS_WRONG_VOLUME"}, {STATUS_NO_MEDIA_IN_DEVICE, -EIO, "STATUS_NO_MEDIA_IN_DEVICE"}, @@ -605,7 +607,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = { {STATUS_MAPPED_FILE_SIZE_ZERO, -EIO, "STATUS_MAPPED_FILE_SIZE_ZERO"}, {STATUS_TOO_MANY_OPENED_FILES, -EMFILE, "STATUS_TOO_MANY_OPENED_FILES"}, {STATUS_CANCELLED, -EIO, "STATUS_CANCELLED"}, - {STATUS_CANNOT_DELETE, -EIO, "STATUS_CANNOT_DELETE"}, + {STATUS_CANNOT_DELETE, -EACCES, "STATUS_CANNOT_DELETE"}, {STATUS_INVALID_COMPUTER_NAME, -EIO, "STATUS_INVALID_COMPUTER_NAME"}, {STATUS_FILE_DELETED, -EIO, "STATUS_FILE_DELETED"}, {STATUS_SPECIAL_ACCOUNT, -EIO, "STATUS_SPECIAL_ACCOUNT"}, diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index b8021fde987d..4aa7a0f07d6e 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -178,9 +178,24 @@ smb2_check_message(char *buf, unsigned int length) /* Windows 7 server returns 24 bytes more */ if (clc_len + 20 == len && command == SMB2_OPLOCK_BREAK_HE) return 0; - /* server can return one byte more */ + /* server can return one byte more due to implied bcc[0] */ if (clc_len == 4 + len + 1) return 0; + + /* + * MacOS server pads after SMB2.1 write response with 3 bytes + * of junk. 
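/*
 * The length checks smb2_check_message() accumulates, reduced to a
 * predicate.  This is a loose model of the acceptance rules only (the
 * exact kernel arithmetic folds the RFC1001 header into clc_len); the
 * point is that a few known-benign mismatches, including trailing
 * server padding, are tolerated instead of failing the frame.
 */
#include <assert.h>

static int frame_len_acceptable(unsigned int wire_len, unsigned int calc_len)
{
	if (calc_len == wire_len)
		return 1;
	if (calc_len + 20 == wire_len)	/* Windows 7 oplock-break extra */
		return 1;
	if (calc_len == wire_len + 1)	/* implied bcc[0] off-by-one */
		return 1;
	if (calc_len < wire_len)	/* e.g. MacOS pads write responses */
		return 1;		/* logged once, then accepted */
	return 0;
}

int main(void)
{
	assert(frame_len_acceptable(100, 100));
	assert(frame_len_acceptable(103, 100));	/* 3 bytes of junk pad */
	assert(!frame_len_acceptable(100, 120));
	return 0;
}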
Other servers match RFC1001 len to actual + * SMB2/SMB3 frame length (header + smb2 response specific data) + * Log the server error (once), but allow it and continue + * since the frame is parseable. + */ + if (clc_len < 4 /* RFC1001 header size */ + len) { + printk_once(KERN_WARNING + "SMB2 server sent bad RFC1001 len %d not %d\n", + len, clc_len - 4); + return 0; + } + return 1; } return 0; @@ -437,7 +452,7 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp, continue; cifs_dbg(FYI, "found in the open list\n"); - cifs_dbg(FYI, "lease key match, lease break 0x%d\n", + cifs_dbg(FYI, "lease key match, lease break 0x%x\n", le32_to_cpu(rsp->NewLeaseState)); server->ops->set_oplock_level(cinode, lease_state, 0, NULL); @@ -467,7 +482,7 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp, } cifs_dbg(FYI, "found in the pending open list\n"); - cifs_dbg(FYI, "lease key match, lease break 0x%d\n", + cifs_dbg(FYI, "lease key match, lease break 0x%x\n", le32_to_cpu(rsp->NewLeaseState)); open->oplock = lease_state; @@ -546,7 +561,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) return false; } - cifs_dbg(FYI, "oplock level 0x%d\n", rsp->OplockLevel); + cifs_dbg(FYI, "oplock level 0x%x\n", rsp->OplockLevel); /* look up tcon based on tid & uid */ spin_lock(&cifs_tcp_ses_lock); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 787844bde384..f522193b7184 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -19,6 +19,7 @@ #include <linux/pagemap.h> #include <linux/vfs.h> +#include <linux/falloc.h> #include "cifsglob.h" #include "smb2pdu.h" #include "smb2proto.h" @@ -112,6 +113,53 @@ smb2_get_credits(struct mid_q_entry *mid) return le16_to_cpu(((struct smb2_hdr *)mid->resp_buf)->CreditRequest); } +static int +smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, + unsigned int *num, unsigned int *credits) +{ + int rc = 0; + unsigned int scredits; + + spin_lock(&server->req_lock); + while (1) { + if (server->credits <= 0) { + spin_unlock(&server->req_lock); + cifs_num_waiters_inc(server); + rc = wait_event_killable(server->request_q, + has_credits(server, &server->credits)); + cifs_num_waiters_dec(server); + if (rc) + return rc; + spin_lock(&server->req_lock); + } else { + if (server->tcpStatus == CifsExiting) { + spin_unlock(&server->req_lock); + return -ENOENT; + } + + scredits = server->credits; + /* can deadlock with reopen */ + if (scredits == 1) { + *num = SMB2_MAX_BUFFER_SIZE; + *credits = 0; + break; + } + + /* leave one credit for a possible reopen */ + scredits--; + *num = min_t(unsigned int, size, + scredits * SMB2_MAX_BUFFER_SIZE); + + *credits = DIV_ROUND_UP(*num, SMB2_MAX_BUFFER_SIZE); + server->credits -= *credits; + server->in_flight++; + break; + } + } + spin_unlock(&server->req_lock); + return rc; +} + static __u64 smb2_get_next_mid(struct TCP_Server_Info *server) { @@ -182,8 +230,9 @@ smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) /* start with specified wsize, or default */ wsize = volume_info->wsize ? 
volume_info->wsize : CIFS_DEFAULT_IOSIZE; wsize = min_t(unsigned int, wsize, server->max_write); - /* set it to the maximum buffer size value we can send with 1 credit */ - wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE); + + if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) + wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE); return wsize; } @@ -197,8 +246,9 @@ smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) /* start with specified rsize, or default */ rsize = volume_info->rsize ? volume_info->rsize : CIFS_DEFAULT_IOSIZE; rsize = min_t(unsigned int, rsize, server->max_read); - /* set it to the maximum buffer size value we can send with 1 credit */ - rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE); + + if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) + rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE); return rsize; } @@ -339,7 +389,7 @@ smb2_query_file_info(const unsigned int xid, struct cifs_tcon *tcon, int rc; struct smb2_file_all_info *smb2_data; - smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + MAX_NAME * 2, + smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2, GFP_KERNEL); if (smb2_data == NULL) return -ENOMEM; @@ -681,13 +731,74 @@ smb2_sync_write(const unsigned int xid, struct cifsFileInfo *cfile, return SMB2_write(xid, parms, written, iov, nr_segs); } +/* Set or clear the SPARSE_FILE attribute based on value passed in setsparse */ +static bool smb2_set_sparse(const unsigned int xid, struct cifs_tcon *tcon, + struct cifsFileInfo *cfile, struct inode *inode, __u8 setsparse) +{ + struct cifsInodeInfo *cifsi; + int rc; + + cifsi = CIFS_I(inode); + + /* if file already sparse don't bother setting sparse again */ + if ((cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) && setsparse) + return true; /* already sparse */ + + if (!(cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) && !setsparse) + return true; /* already not sparse */ + + /* + * Can't check for sparse support on share the usual way via the + * FS attribute info (FILE_SUPPORTS_SPARSE_FILES) on the share + * since Samba server doesn't set the flag on the share, yet + * supports the set sparse FSCTL and returns sparse correctly + * in the file attributes. If we fail setting sparse though we + * mark that server does not support sparse files for this share + * to avoid repeatedly sending the unsupported fsctl to server + * if the file is repeatedly extended. + */ + if (tcon->broken_sparse_sup) + return false; + + rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, FSCTL_SET_SPARSE, + true /* is_fctl */, &setsparse, 1, NULL, NULL); + if (rc) { + tcon->broken_sparse_sup = true; + cifs_dbg(FYI, "set sparse rc = %d\n", rc); + return false; + } + + if (setsparse) + cifsi->cifsAttrs |= FILE_ATTRIBUTE_SPARSE_FILE; + else + cifsi->cifsAttrs &= (~FILE_ATTRIBUTE_SPARSE_FILE); + + return true; +} + static int smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon, struct cifsFileInfo *cfile, __u64 size, bool set_alloc) { __le64 eof = cpu_to_le64(size); + struct inode *inode; + + /* + * If extending file more than one page make sparse. 
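/*
 * The write-size negotiation above, reduced to its arithmetic: the
 * requested wsize is capped by the server's max_write, and only forced
 * down to the single-credit buffer size when the server did not
 * advertise large-MTU support.  Constants and the capability bit here
 * are illustrative.
 */
#include <assert.h>

#define MAX_BUFFER_SIZE	(64u * 1024)	/* one credit's worth */
#define CAP_LARGE_MTU	0x4u		/* assumed flag value */

static unsigned int negotiate_wsize(unsigned int requested,
				    unsigned int max_write,
				    unsigned int caps)
{
	unsigned int wsize = requested < max_write ? requested : max_write;

	if (!(caps & CAP_LARGE_MTU))
		wsize = wsize < MAX_BUFFER_SIZE ? wsize : MAX_BUFFER_SIZE;
	return wsize;
}

int main(void)
{
	/* large-MTU server: a 1MiB request survives negotiation */
	assert(negotiate_wsize(1u << 20, 8u << 20, CAP_LARGE_MTU) == 1u << 20);
	/* legacy server: capped to one credit */
	assert(negotiate_wsize(1u << 20, 8u << 20, 0) == MAX_BUFFER_SIZE);
	return 0;
}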
Many Linux fs + * make files sparse by default when extending via ftruncate + */ + inode = cfile->dentry->d_inode; + + if (!set_alloc && (size > inode->i_size + 8192)) { + __u8 set_sparse = 1; + + /* whether set sparse succeeds or not, extend the file */ + smb2_set_sparse(xid, tcon, cfile, inode, set_sparse); + } + return SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, - cfile->fid.volatile_fid, cfile->pid, &eof); + cfile->fid.volatile_fid, cfile->pid, &eof, false); } static int @@ -904,6 +1015,105 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, return rc; } +static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, + loff_t offset, loff_t len, bool keep_size) +{ + struct inode *inode; + struct cifsInodeInfo *cifsi; + struct cifsFileInfo *cfile = file->private_data; + struct file_zero_data_information fsctl_buf; + long rc; + unsigned int xid; + + xid = get_xid(); + + inode = cfile->dentry->d_inode; + cifsi = CIFS_I(inode); + + /* if file not oplocked can't be sure whether asking to extend size */ + if (!CIFS_CACHE_READ(cifsi)) + if (keep_size == false) + return -EOPNOTSUPP; + + /* + * Must check if file sparse since fallocate -z (zero range) assumes + * non-sparse allocation + */ + if (!(cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE)) + return -EOPNOTSUPP; + + /* + * need to make sure we are not asked to extend the file since the SMB3 + * fsctl does not change the file size. In the future we could change + * this to zero the first part of the range then set the file size + * which for a non sparse file would zero the newly extended range + */ + if (keep_size == false) + if (i_size_read(inode) < offset + len) + return -EOPNOTSUPP; + + cifs_dbg(FYI, "offset %lld len %lld", offset, len); + + fsctl_buf.FileOffset = cpu_to_le64(offset); + fsctl_buf.BeyondFinalZero = cpu_to_le64(offset + len); + + rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA, + true /* is_fctl */, (char *)&fsctl_buf, + sizeof(struct file_zero_data_information), NULL, NULL); + free_xid(xid); + return rc; +} + +static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, + loff_t offset, loff_t len) +{ + struct inode *inode; + struct cifsInodeInfo *cifsi; + struct cifsFileInfo *cfile = file->private_data; + struct file_zero_data_information fsctl_buf; + long rc; + unsigned int xid; + __u8 set_sparse = 1; + + xid = get_xid(); + + inode = cfile->dentry->d_inode; + cifsi = CIFS_I(inode); + + /* Need to make file sparse, if not already, before freeing range. 
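/*
 * This patch wires fallocate(2) through to SMB3: hole punching maps to
 * FSCTL_SET_ZERO_DATA on a file made sparse first, and zero-range is
 * handled with or without extending depending on FALLOC_FL_KEEP_SIZE
 * (see smb3_fallocate() just below).  The mode dispatch, modelled alone
 * with Linux-only headers; the return codes stand in for the real
 * handlers:
 */
#include <errno.h>
#include <stdio.h>
#include <linux/falloc.h>

static long dispatch_fallocate(int mode)
{
	if (mode & FALLOC_FL_PUNCH_HOLE)
		return 1;	/* would call the punch-hole path */
	if (mode & FALLOC_FL_ZERO_RANGE)
		/* keep_size variant when FALLOC_FL_KEEP_SIZE is set */
		return (mode & FALLOC_FL_KEEP_SIZE) ? 2 : 3;
	return -EOPNOTSUPP;	/* everything else is unsupported */
}

int main(void)
{
	printf("%ld %ld %ld\n",
	       dispatch_fallocate(FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE),
	       dispatch_fallocate(FALLOC_FL_ZERO_RANGE),
	       dispatch_fallocate(0));
	return 0;
}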
*/ + /* Consider adding equivalent for compressed since it could also work */ + if (!smb2_set_sparse(xid, tcon, cfile, inode, set_sparse)) + return -EOPNOTSUPP; + + cifs_dbg(FYI, "offset %lld len %lld", offset, len); + + fsctl_buf.FileOffset = cpu_to_le64(offset); + fsctl_buf.BeyondFinalZero = cpu_to_le64(offset + len); + + rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA, + true /* is_fctl */, (char *)&fsctl_buf, + sizeof(struct file_zero_data_information), NULL, NULL); + free_xid(xid); + return rc; +} + +static long smb3_fallocate(struct file *file, struct cifs_tcon *tcon, int mode, + loff_t off, loff_t len) +{ + /* KEEP_SIZE already checked for by do_fallocate */ + if (mode & FALLOC_FL_PUNCH_HOLE) + return smb3_punch_hole(file, tcon, off, len); + else if (mode & FALLOC_FL_ZERO_RANGE) { + if (mode & FALLOC_FL_KEEP_SIZE) + return smb3_zero_range(file, tcon, off, len, true); + return smb3_zero_range(file, tcon, off, len, false); + } + + return -EOPNOTSUPP; +} + static void smb2_downgrade_oplock(struct TCP_Server_Info *server, struct cifsInodeInfo *cinode, bool set_level2) @@ -1104,6 +1314,19 @@ smb3_parse_lease_buf(void *buf, unsigned int *epoch) return le32_to_cpu(lc->lcontext.LeaseState); } +static unsigned int +smb2_wp_retry_size(struct inode *inode) +{ + return min_t(unsigned int, CIFS_SB(inode->i_sb)->wsize, + SMB2_MAX_BUFFER_SIZE); +} + +static bool +smb2_dir_needs_close(struct cifsFileInfo *cfile) +{ + return !cfile->invalidHandle; +} + struct smb_version_operations smb20_operations = { .compare_fids = smb2_compare_fids, .setup_request = smb2_setup_request, @@ -1113,6 +1336,7 @@ struct smb_version_operations smb20_operations = { .set_credits = smb2_set_credits, .get_credits_field = smb2_get_credits_field, .get_credits = smb2_get_credits, + .wait_mtu_credits = cifs_wait_mtu_credits, .get_next_mid = smb2_get_next_mid, .read_data_offset = smb2_read_data_offset, .read_data_length = smb2_read_data_length, @@ -1177,6 +1401,8 @@ struct smb_version_operations smb20_operations = { .create_lease_buf = smb2_create_lease_buf, .parse_lease_buf = smb2_parse_lease_buf, .clone_range = smb2_clone_range, + .wp_retry_size = smb2_wp_retry_size, + .dir_needs_close = smb2_dir_needs_close, }; struct smb_version_operations smb21_operations = { @@ -1188,6 +1414,7 @@ struct smb_version_operations smb21_operations = { .set_credits = smb2_set_credits, .get_credits_field = smb2_get_credits_field, .get_credits = smb2_get_credits, + .wait_mtu_credits = smb2_wait_mtu_credits, .get_next_mid = smb2_get_next_mid, .read_data_offset = smb2_read_data_offset, .read_data_length = smb2_read_data_length, @@ -1252,6 +1479,8 @@ struct smb_version_operations smb21_operations = { .create_lease_buf = smb2_create_lease_buf, .parse_lease_buf = smb2_parse_lease_buf, .clone_range = smb2_clone_range, + .wp_retry_size = smb2_wp_retry_size, + .dir_needs_close = smb2_dir_needs_close, }; struct smb_version_operations smb30_operations = { @@ -1263,6 +1492,7 @@ struct smb_version_operations smb30_operations = { .set_credits = smb2_set_credits, .get_credits_field = smb2_get_credits_field, .get_credits = smb2_get_credits, + .wait_mtu_credits = smb2_wait_mtu_credits, .get_next_mid = smb2_get_next_mid, .read_data_offset = smb2_read_data_offset, .read_data_length = smb2_read_data_length, @@ -1330,6 +1560,9 @@ struct smb_version_operations smb30_operations = { .parse_lease_buf = smb3_parse_lease_buf, .clone_range = smb2_clone_range, .validate_negotiate = smb3_validate_negotiate, + 
.wp_retry_size = smb2_wp_retry_size, + .dir_needs_close = smb2_dir_needs_close, + .fallocate = smb3_fallocate, }; struct smb_version_values smb20_values = { diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index b0b260dbb19d..74b3a6684383 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -108,7 +108,6 @@ smb2_hdr_assemble(struct smb2_hdr *hdr, __le16 smb2_cmd /* command */ , if (!tcon) goto out; - /* BB FIXME when we do write > 64K add +1 for every 64K in req or rsp */ /* GLOBAL_CAP_LARGE_MTU will only be set if dialect > SMB2.02 */ /* See sections 2.2.4 and 3.2.4.1.5 of MS-SMB2 */ if ((tcon->ses) && @@ -245,10 +244,6 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon) if (rc) goto out; atomic_inc(&tconInfoReconnectCount); - /* - * BB FIXME add code to check if wsize needs update due to negotiated - * smb buffer size shrinking. - */ out: /* * Check if handle based operation so we know whether we can continue @@ -309,16 +304,6 @@ small_smb2_init(__le16 smb2_command, struct cifs_tcon *tcon, return rc; } -static void -free_rsp_buf(int resp_buftype, void *rsp) -{ - if (resp_buftype == CIFS_SMALL_BUFFER) - cifs_small_buf_release(rsp); - else if (resp_buftype == CIFS_LARGE_BUFFER) - cifs_buf_release(rsp); -} - - /* * * SMB2 Worker functions follow: @@ -545,7 +530,7 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, struct smb2_sess_setup_rsp *rsp = NULL; struct kvec iov[2]; int rc = 0; - int resp_buftype; + int resp_buftype = CIFS_NO_BUFFER; __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */ struct TCP_Server_Info *server = ses->server; u16 blob_length = 0; @@ -922,7 +907,8 @@ tcon_exit: tcon_error_exit: if (rsp->hdr.Status == STATUS_BAD_NETWORK_NAME) { cifs_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree); - tcon->bad_network_name = true; + if (tcon) + tcon->bad_network_name = true; } goto tcon_exit; } @@ -1239,7 +1225,9 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, cifs_dbg(FYI, "SMB2 IOCTL\n"); - *out_data = NULL; + if (out_data != NULL) + *out_data = NULL; + /* zero out returned data len, in case of error */ if (plen) *plen = 0; @@ -1415,8 +1403,7 @@ SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, rsp = (struct smb2_close_rsp *)iov[0].iov_base; if (rc != 0) { - if (tcon) - cifs_stats_fail_inc(tcon, SMB2_CLOSE_HE); + cifs_stats_fail_inc(tcon, SMB2_CLOSE_HE); goto close_exit; } @@ -1545,7 +1532,7 @@ SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon, { return query_info(xid, tcon, persistent_fid, volatile_fid, FILE_ALL_INFORMATION, - sizeof(struct smb2_file_all_info) + MAX_NAME * 2, + sizeof(struct smb2_file_all_info) + PATH_MAX * 2, sizeof(struct smb2_file_all_info), data); } @@ -1738,12 +1725,18 @@ smb2_readv_callback(struct mid_q_entry *mid) rc); } /* FIXME: should this be counted toward the initiating task? */ - task_io_account_read(rdata->bytes); - cifs_stats_bytes_read(tcon, rdata->bytes); + task_io_account_read(rdata->got_bytes); + cifs_stats_bytes_read(tcon, rdata->got_bytes); break; case MID_REQUEST_SUBMITTED: case MID_RETRY_NEEDED: rdata->result = -EAGAIN; + if (server->sign && rdata->got_bytes) + /* reset bytes number since we can not check a sign */ + rdata->got_bytes = 0; + /* FIXME: should this be counted toward the initiating task? 
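/*
 * The async read/write hunks below charge multi-credit requests at one
 * credit per 64KiB and hand any surplus reserved credits back to the
 * server pool before sending.  The charge arithmetic in isolation:
 */
#include <assert.h>

#define MAX_BUFFER_SIZE (64u * 1024)	/* bytes covered by one credit */

static unsigned int credit_charge(unsigned int bytes)
{
	/* DIV_ROUND_UP(bytes, MAX_BUFFER_SIZE) */
	return (bytes + MAX_BUFFER_SIZE - 1) / MAX_BUFFER_SIZE;
}

int main(void)
{
	unsigned int reserved = 16;	/* credits held for this I/O */
	unsigned int charge = credit_charge(1000000);	/* ~1MB request */

	assert(credit_charge(1) == 1);
	assert(charge == 16);
	/* surplus (reserved - charge) is returned and waiters are woken */
	assert(reserved - charge == 0);
	return 0;
}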
*/ + task_io_account_read(rdata->got_bytes); + cifs_stats_bytes_read(tcon, rdata->got_bytes); break; default: if (rdata->result != -ENODATA) @@ -1762,11 +1755,12 @@ smb2_readv_callback(struct mid_q_entry *mid) int smb2_async_readv(struct cifs_readdata *rdata) { - int rc; + int rc, flags = 0; struct smb2_hdr *buf; struct cifs_io_parms io_parms; struct smb_rqst rqst = { .rq_iov = &rdata->iov, .rq_nvec = 1 }; + struct TCP_Server_Info *server; cifs_dbg(FYI, "%s: offset=%llu bytes=%u\n", __func__, rdata->offset, rdata->bytes); @@ -1777,18 +1771,41 @@ smb2_async_readv(struct cifs_readdata *rdata) io_parms.persistent_fid = rdata->cfile->fid.persistent_fid; io_parms.volatile_fid = rdata->cfile->fid.volatile_fid; io_parms.pid = rdata->pid; + + server = io_parms.tcon->ses->server; + rc = smb2_new_read_req(&rdata->iov, &io_parms, 0, 0); - if (rc) + if (rc) { + if (rc == -EAGAIN && rdata->credits) { + /* credits was reset by reconnect */ + rdata->credits = 0; + /* reduce in_flight value since we won't send the req */ + spin_lock(&server->req_lock); + server->in_flight--; + spin_unlock(&server->req_lock); + } return rc; + } buf = (struct smb2_hdr *)rdata->iov.iov_base; /* 4 for rfc1002 length field */ rdata->iov.iov_len = get_rfc1002_length(rdata->iov.iov_base) + 4; + if (rdata->credits) { + buf->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes, + SMB2_MAX_BUFFER_SIZE)); + spin_lock(&server->req_lock); + server->credits += rdata->credits - + le16_to_cpu(buf->CreditCharge); + spin_unlock(&server->req_lock); + wake_up(&server->request_q); + flags = CIFS_HAS_CREDITS; + } + kref_get(&rdata->refcount); rc = cifs_call_async(io_parms.tcon->ses->server, &rqst, cifs_readv_receive, smb2_readv_callback, - rdata, 0); + rdata, flags); if (rc) { kref_put(&rdata->refcount, cifs_readdata_release); cifs_stats_fail_inc(io_parms.tcon, SMB2_READ_HE); @@ -1906,15 +1923,25 @@ int smb2_async_writev(struct cifs_writedata *wdata, void (*release)(struct kref *kref)) { - int rc = -EACCES; + int rc = -EACCES, flags = 0; struct smb2_write_req *req = NULL; struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); + struct TCP_Server_Info *server = tcon->ses->server; struct kvec iov; struct smb_rqst rqst; rc = small_smb2_init(SMB2_WRITE, tcon, (void **) &req); - if (rc) + if (rc) { + if (rc == -EAGAIN && wdata->credits) { + /* credits was reset by reconnect */ + wdata->credits = 0; + /* reduce in_flight value since we won't send the req */ + spin_lock(&server->req_lock); + server->in_flight--; + spin_unlock(&server->req_lock); + } goto async_writev_out; + } req->hdr.ProcessId = cpu_to_le32(wdata->cfile->pid); @@ -1947,9 +1974,20 @@ smb2_async_writev(struct cifs_writedata *wdata, inc_rfc1001_len(&req->hdr, wdata->bytes - 1 /* Buffer */); + if (wdata->credits) { + req->hdr.CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->bytes, + SMB2_MAX_BUFFER_SIZE)); + spin_lock(&server->req_lock); + server->credits += wdata->credits - + le16_to_cpu(req->hdr.CreditCharge); + spin_unlock(&server->req_lock); + wake_up(&server->request_q); + flags = CIFS_HAS_CREDITS; + } + kref_get(&wdata->refcount); - rc = cifs_call_async(tcon->ses->server, &rqst, NULL, - smb2_writev_callback, wdata, 0); + rc = cifs_call_async(server, &rqst, NULL, smb2_writev_callback, wdata, + flags); if (rc) { kref_put(&wdata->refcount, release); @@ -2141,6 +2179,10 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, rsp = (struct smb2_query_directory_rsp *)iov[0].iov_base; if (rc) { + if (rc == -ENODATA && rsp->hdr.Status == STATUS_NO_MORE_FILES) { + 
srch_inf->endOfSearch = true; + rc = 0; + } cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE); goto qdir_exit; } @@ -2178,11 +2220,6 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, else cifs_dbg(VFS, "illegal search buffer type\n"); - if (rsp->hdr.Status == STATUS_NO_MORE_FILES) - srch_inf->endOfSearch = 1; - else - srch_inf->endOfSearch = 0; - return rc; qdir_exit: @@ -2325,7 +2362,7 @@ SMB2_set_hardlink(const unsigned int xid, struct cifs_tcon *tcon, int SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, - u64 volatile_fid, u32 pid, __le64 *eof) + u64 volatile_fid, u32 pid, __le64 *eof, bool is_falloc) { struct smb2_file_eof_info info; void *data; @@ -2336,8 +2373,12 @@ SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, data = &info; size = sizeof(struct smb2_file_eof_info); - return send_set_info(xid, tcon, persistent_fid, volatile_fid, pid, - FILE_END_OF_FILE_INFORMATION, 1, &data, &size); + if (is_falloc) + return send_set_info(xid, tcon, persistent_fid, volatile_fid, + pid, FILE_ALLOCATION_INFORMATION, 1, &data, &size); + else + return send_set_info(xid, tcon, persistent_fid, volatile_fid, + pid, FILE_END_OF_FILE_INFORMATION, 1, &data, &size); } int diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 69f3595d3952..fbe486c285a9 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -573,6 +573,12 @@ struct copychunk_ioctl { __u32 Reserved2; } __packed; +/* this goes in the ioctl buffer when doing FSCTL_SET_ZERO_DATA */ +struct file_zero_data_information { + __le64 FileOffset; + __le64 BeyondFinalZero; +} __packed; + struct copychunk_ioctl_rsp { __le32 ChunksWritten; __le32 ChunkBytesWritten; diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 0ce48db20a65..67e8ce8055de 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -139,7 +139,7 @@ extern int SMB2_set_hardlink(const unsigned int xid, struct cifs_tcon *tcon, __le16 *target_file); extern int SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, u32 pid, - __le64 *eof); + __le64 *eof, bool is_fallocate); extern int SMB2_set_info(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, FILE_BASIC_INFO *buf); diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index 59c748ce872f..5111e7272db6 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -466,7 +466,12 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) static inline void smb2_seq_num_into_buf(struct TCP_Server_Info *server, struct smb2_hdr *hdr) { + unsigned int i, num = le16_to_cpu(hdr->CreditCharge); + hdr->MessageId = get_next_mid64(server); + /* skip message numbers according to CreditCharge field */ + for (i = 1; i < num; i++) + get_next_mid(server); } static struct mid_q_entry * diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h index 0e538b5c9622..83efa59535be 100644 --- a/fs/cifs/smbfsctl.h +++ b/fs/cifs/smbfsctl.h @@ -63,7 +63,7 @@ #define FSCTL_SET_OBJECT_ID_EXTENDED 0x000900BC /* BB add struct */ #define FSCTL_CREATE_OR_GET_OBJECT_ID 0x000900C0 /* BB add struct */ #define FSCTL_SET_SPARSE 0x000900C4 /* BB add struct */ -#define FSCTL_SET_ZERO_DATA 0x000900C8 /* BB add struct */ +#define FSCTL_SET_ZERO_DATA 0x000980C8 #define FSCTL_SET_ENCRYPTION 0x000900D7 /* BB add struct */ #define FSCTL_ENCRYPTION_FSCTL_IO 0x000900DB /* BB add struct */ #define FSCTL_WRITE_RAW_ENCRYPTED 0x000900DF /* BB add struct */ diff --git 
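Two small pieces above fit together for punch-hole support behind the new .fallocate = smb3_fallocate hook: FSCTL_SET_ZERO_DATA gets its correct code (0x000980C8 carries the FILE_WRITE_DATA access bits the old 0x000900C8 lacked) and struct file_zero_data_information carries the range. A sketch of how a punch-hole helper might issue it, assuming the SMB2_ioctl() signature declared in this tree's smb2proto.h:

	/* Hypothetical punch-hole path: zero, and on a sparse file
	 * deallocate, the byte range [offset, offset + len). */
	struct file_zero_data_information fsctl_buf;

	fsctl_buf.FileOffset = cpu_to_le64(offset);
	fsctl_buf.BeyondFinalZero = cpu_to_le64(offset + len);

	rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
			cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA,
			true /* is_fsctl */, (char *)&fsctl_buf,
			sizeof(fsctl_buf), NULL, NULL);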
a/fs/cifs/transport.c b/fs/cifs/transport.c index 18cd5650a5fc..9d087f4e7d4e 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -448,6 +448,15 @@ wait_for_free_request(struct TCP_Server_Info *server, const int timeout, return wait_for_free_credits(server, timeout, val); } +int +cifs_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, + unsigned int *num, unsigned int *credits) +{ + *num = size; + *credits = 0; + return 0; +} + static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf, struct mid_q_entry **ppmidQ) { @@ -531,20 +540,23 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst, { int rc, timeout, optype; struct mid_q_entry *mid; + unsigned int credits = 0; timeout = flags & CIFS_TIMEOUT_MASK; optype = flags & CIFS_OP_MASK; - rc = wait_for_free_request(server, timeout, optype); - if (rc) - return rc; + if ((flags & CIFS_HAS_CREDITS) == 0) { + rc = wait_for_free_request(server, timeout, optype); + if (rc) + return rc; + credits = 1; + } mutex_lock(&server->srv_mutex); mid = server->ops->setup_async_request(server, rqst); if (IS_ERR(mid)) { mutex_unlock(&server->srv_mutex); - add_credits(server, 1, optype); - wake_up(&server->request_q); + add_credits_and_wake_if(server, credits, optype); return PTR_ERR(mid); } @@ -572,8 +584,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst, return 0; cifs_delete_mid(mid); - add_credits(server, 1, optype); - wake_up(&server->request_q); + add_credits_and_wake_if(server, credits, optype); return rc; } diff --git a/fs/coda/cache.c b/fs/coda/cache.c index 1da168c61d35..278f8fdeb9ef 100644 --- a/fs/coda/cache.c +++ b/fs/coda/cache.c @@ -13,7 +13,7 @@ #include <linux/fs.h> #include <linux/stat.h> #include <linux/errno.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/string.h> #include <linux/list.h> #include <linux/sched.h> diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c index 2849f41e72a2..1326d38960db 100644 --- a/fs/coda/coda_linux.c +++ b/fs/coda/coda_linux.c @@ -13,7 +13,7 @@ #include <linux/fs.h> #include <linux/stat.h> #include <linux/errno.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/string.h> #include <linux/coda.h> diff --git a/fs/coda/dir.c b/fs/coda/dir.c index cd8a63238b11..9c3dedc000d1 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -19,8 +19,7 @@ #include <linux/string.h> #include <linux/spinlock.h> #include <linux/namei.h> - -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/coda.h> #include <linux/coda_psdev.h> diff --git a/fs/coda/file.c b/fs/coda/file.c index 9e83b7790212..d244d743a232 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -18,7 +18,7 @@ #include <linux/spinlock.h> #include <linux/string.h> #include <linux/slab.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/coda.h> #include <linux/coda_psdev.h> diff --git a/fs/coda/inode.c b/fs/coda/inode.c index fe3afb2de880..b945410bfcd5 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -21,9 +21,7 @@ #include <linux/vfs.h> #include <linux/slab.h> #include <linux/pid_namespace.h> - -#include <asm/uaccess.h> - +#include <linux/uaccess.h> #include <linux/fs.h> #include <linux/vmalloc.h> diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c index 3f5de96bbb58..4326d172fc27 100644 --- a/fs/coda/pioctl.c +++ b/fs/coda/pioctl.c @@ -16,7 +16,7 @@ #include <linux/string.h> #include <linux/namei.h> #include <linux/module.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/coda.h> #include 
<linux/coda_psdev.h> diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index 5c1e4242368b..822629126e89 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -40,7 +40,7 @@ #include <linux/pid_namespace.h> #include <asm/io.h> #include <asm/poll.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/coda.h> #include <linux/coda_psdev.h> diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index 21fcf8dcb9cd..5bb6e27298a4 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -27,7 +27,7 @@ #include <linux/string.h> #include <linux/slab.h> #include <linux/mutex.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/vmalloc.h> #include <linux/vfs.h> diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index e82289047272..afec6450450f 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -59,7 +59,7 @@ #include <linux/gfp.h> #include <net/bluetooth/bluetooth.h> -#include <net/bluetooth/hci.h> +#include <net/bluetooth/hci_sock.h> #include <net/bluetooth/rfcomm.h> #include <linux/capi.h> diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index ddcfe590b8a8..355c522f3585 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -11,6 +11,8 @@ * The actual compression is based on zlib, see the other files. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/fs.h> #include <linux/pagemap.h> @@ -21,7 +23,7 @@ #include <linux/vfs.h> #include <linux/mutex.h> #include <uapi/linux/cramfs_fs.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "internal.h" @@ -153,7 +155,7 @@ static struct inode *get_cramfs_inode(struct super_block *sb, static unsigned char read_buffers[READ_BUFFERS][BUFFER_SIZE]; static unsigned buffer_blocknr[READ_BUFFERS]; -static struct super_block * buffer_dev[READ_BUFFERS]; +static struct super_block *buffer_dev[READ_BUFFERS]; static int next_buffer; /* @@ -205,6 +207,7 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i for (i = 0; i < BLKS_PER_BUF; i++) { struct page *page = pages[i]; + if (page) { wait_on_page_locked(page); if (!PageUptodate(page)) { @@ -223,6 +226,7 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i data = read_buffers[buffer]; for (i = 0; i < BLKS_PER_BUF; i++) { struct page *page = pages[i]; + if (page) { memcpy(data, kmap(page), PAGE_CACHE_SIZE); kunmap(page); @@ -237,6 +241,7 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i static void cramfs_kill_sb(struct super_block *sb) { struct cramfs_sb_info *sbi = CRAMFS_SB(sb); + kill_block_super(sb); kfree(sbi); } @@ -277,7 +282,7 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent) /* check for wrong endianness */ if (super.magic == CRAMFS_MAGIC_WEND) { if (!silent) - printk(KERN_ERR "cramfs: wrong endianness\n"); + pr_err("wrong endianness\n"); return -EINVAL; } @@ -287,22 +292,22 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent) mutex_unlock(&read_mutex); if (super.magic != CRAMFS_MAGIC) { if (super.magic == CRAMFS_MAGIC_WEND && !silent) - printk(KERN_ERR "cramfs: wrong endianness\n"); + pr_err("wrong endianness\n"); else if (!silent) - printk(KERN_ERR "cramfs: wrong magic\n"); + pr_err("wrong magic\n"); return -EINVAL; } } /* get feature flags first */ if (super.flags & ~CRAMFS_SUPPORTED_FLAGS) { - printk(KERN_ERR "cramfs: unsupported filesystem features\n"); + pr_err("unsupported filesystem features\n"); return -EINVAL; } /* Check that the root inode is in a sane state */ 
if (!S_ISDIR(super.root.mode)) { - printk(KERN_ERR "cramfs: root is not a directory\n"); + pr_err("root is not a directory\n"); return -EINVAL; } /* correct strange, hard-coded permissions of mkcramfs */ @@ -310,23 +315,23 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent) root_offset = super.root.offset << 2; if (super.flags & CRAMFS_FLAG_FSID_VERSION_2) { - sbi->size=super.size; - sbi->blocks=super.fsid.blocks; - sbi->files=super.fsid.files; + sbi->size = super.size; + sbi->blocks = super.fsid.blocks; + sbi->files = super.fsid.files; } else { - sbi->size=1<<28; - sbi->blocks=0; - sbi->files=0; + sbi->size = 1<<28; + sbi->blocks = 0; + sbi->files = 0; } - sbi->magic=super.magic; - sbi->flags=super.flags; + sbi->magic = super.magic; + sbi->flags = super.flags; if (root_offset == 0) - printk(KERN_INFO "cramfs: empty filesystem"); + pr_info("empty filesystem"); else if (!(super.flags & CRAMFS_FLAG_SHIFTED_ROOT_OFFSET) && ((root_offset != sizeof(struct cramfs_super)) && (root_offset != 512 + sizeof(struct cramfs_super)))) { - printk(KERN_ERR "cramfs: bad root offset %lu\n", root_offset); + pr_err("bad root offset %lu\n", root_offset); return -EINVAL; } @@ -425,7 +430,7 @@ static int cramfs_readdir(struct file *file, struct dir_context *ctx) /* * Lookup and fill in the inode data.. */ -static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) +static struct dentry *cramfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { unsigned int offset = 0; struct inode *inode = NULL; @@ -483,7 +488,7 @@ out: return NULL; } -static int cramfs_readpage(struct file *file, struct page * page) +static int cramfs_readpage(struct file *file, struct page *page) { struct inode *inode = page->mapping->host; u32 maxblock; @@ -511,7 +516,7 @@ static int cramfs_readpage(struct file *file, struct page * page) if (compr_len == 0) ; /* hole */ else if (unlikely(compr_len > (PAGE_CACHE_SIZE << 1))) { - pr_err("cramfs: bad compressed blocksize %u\n", + pr_err("bad compressed blocksize %u\n", compr_len); goto err; } else { diff --git a/fs/cramfs/uncompress.c b/fs/cramfs/uncompress.c index 1760c1b84d97..ec4f1d4fdad0 100644 --- a/fs/cramfs/uncompress.c +++ b/fs/cramfs/uncompress.c @@ -15,6 +15,8 @@ * then is used by multiple filesystems. 
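The cramfs conversion above relies on the pr_fmt mechanism: defining pr_fmt before the first #include makes every pr_err()/pr_info()/pr_warn() in the file prepend the module name, which is why the literal "cramfs: " prefixes could be dropped from the message strings. The pattern in isolation:

	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt	/* before the includes */

	#include <linux/kernel.h>

	pr_err("wrong magic\n");
	/* emits printk(KERN_ERR "cramfs: wrong magic\n") in this module */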
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/errno.h> #include <linux/vmalloc.h> @@ -37,7 +39,7 @@ int cramfs_uncompress_block(void *dst, int dstlen, void *src, int srclen) err = zlib_inflateReset(&stream); if (err != Z_OK) { - printk("zlib_inflateReset error %d\n", err); + pr_err("zlib_inflateReset error %d\n", err); zlib_inflateEnd(&stream); zlib_inflateInit(&stream); } @@ -48,8 +50,8 @@ int cramfs_uncompress_block(void *dst, int dstlen, void *src, int srclen) return stream.total_out; err: - printk("Error %d while decompressing!\n", err); - printk("%p(%d)->%p(%d)\n", src, srclen, dst, dstlen); + pr_err("Error %d while decompressing!\n", err); + pr_err("%p(%d)->%p(%d)\n", src, srclen, dst, dstlen); return -EIO; } @@ -57,7 +59,7 @@ int cramfs_uncompress_init(void) { if (!initialized++) { stream.workspace = vmalloc(zlib_inflate_workspacesize()); - if ( !stream.workspace ) { + if (!stream.workspace) { initialized = 0; return -ENOMEM; } diff --git a/fs/dcache.c b/fs/dcache.c index 06f65857a855..cb25a1a5e307 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -106,8 +106,7 @@ static inline struct hlist_bl_head *d_hash(const struct dentry *parent, unsigned int hash) { hash += (unsigned long) parent / L1_CACHE_BYTES; - hash = hash + (hash >> d_hash_shift); - return dentry_hashtable + (hash & d_hash_mask); + return dentry_hashtable + hash_32(hash, d_hash_shift); } /* Statistics gathering. */ @@ -731,8 +730,6 @@ EXPORT_SYMBOL(dget_parent); /** * d_find_alias - grab a hashed alias of inode * @inode: inode in question - * @want_discon: flag, used by d_splice_alias, to request - * that only a DISCONNECTED alias be returned. * * If inode has a hashed alias, or is a directory and has any alias, * acquire the reference to alias and return it. Otherwise return NULL. @@ -741,10 +738,9 @@ EXPORT_SYMBOL(dget_parent); * of a filesystem. * * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer - * any other hashed alias over that one unless @want_discon is set, - * in which case only return an IS_ROOT, DCACHE_DISCONNECTED alias. + * any other hashed alias over that one. */ -static struct dentry *__d_find_alias(struct inode *inode, int want_discon) +static struct dentry *__d_find_alias(struct inode *inode) { struct dentry *alias, *discon_alias; @@ -756,7 +752,7 @@ again: if (IS_ROOT(alias) && (alias->d_flags & DCACHE_DISCONNECTED)) { discon_alias = alias; - } else if (!want_discon) { + } else { __dget_dlock(alias); spin_unlock(&alias->d_lock); return alias; @@ -768,12 +764,9 @@ again: alias = discon_alias; spin_lock(&alias->d_lock); if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) { - if (IS_ROOT(alias) && - (alias->d_flags & DCACHE_DISCONNECTED)) { - __dget_dlock(alias); - spin_unlock(&alias->d_lock); - return alias; - } + __dget_dlock(alias); + spin_unlock(&alias->d_lock); + return alias; } spin_unlock(&alias->d_lock); goto again; @@ -787,7 +780,7 @@ struct dentry *d_find_alias(struct inode *inode) if (!hlist_empty(&inode->i_dentry)) { spin_lock(&inode->i_lock); - de = __d_find_alias(inode, 0); + de = __d_find_alias(inode); spin_unlock(&inode->i_lock); } return de; @@ -1781,25 +1774,7 @@ struct dentry *d_find_any_alias(struct inode *inode) } EXPORT_SYMBOL(d_find_any_alias); -/** - * d_obtain_alias - find or allocate a dentry for a given inode - * @inode: inode to allocate the dentry for - * - * Obtain a dentry for an inode resulting from NFS filehandle conversion or - * similar open by handle operations. 
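The d_hash() change above swaps the old fold-and-mask scheme ("hash + (hash >> d_hash_shift)" then "& d_hash_mask") for hash_32(), a multiplicative hash that keeps the top d_hash_shift bits; the high bits of such a product are the well-mixed ones, so the parent-pointer contribution spreads across buckets better. A sketch of the idea, assuming the golden-ratio prime the kernel used around this time (the exact constant has changed across releases):

	/* modeled on hash_32(): bucket index with 'bits' bits,
	 * taken from the top of the product */
	static inline u32 hash_32_sketch(u32 val, unsigned int bits)
	{
		return (val * 0x9e370001u) >> (32 - bits);
	}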
The returned dentry may be anonymous, - * or may have a full name (if the inode was already in the cache). - * - * When called on a directory inode, we must ensure that the inode only ever - * has one dentry. If a dentry is found, that is returned instead of - * allocating a new one. - * - * On successful return, the reference to the inode has been transferred - * to the dentry. In case of an error the reference on the inode is released. - * To make it easier to use in export operations a %NULL or IS_ERR inode may - * be passed in and will be the error will be propagate to the return value, - * with a %NULL @inode replaced by ERR_PTR(-ESTALE). - */ -struct dentry *d_obtain_alias(struct inode *inode) +static struct dentry *__d_obtain_alias(struct inode *inode, int disconnected) { static const struct qstr anonstring = QSTR_INIT("/", 1); struct dentry *tmp; @@ -1830,7 +1805,10 @@ struct dentry *d_obtain_alias(struct inode *inode) } /* attach a disconnected dentry */ - add_flags = d_flags_for_inode(inode) | DCACHE_DISCONNECTED; + add_flags = d_flags_for_inode(inode); + + if (disconnected) + add_flags |= DCACHE_DISCONNECTED; spin_lock(&tmp->d_lock); tmp->d_inode = inode; @@ -1851,59 +1829,51 @@ struct dentry *d_obtain_alias(struct inode *inode) iput(inode); return res; } -EXPORT_SYMBOL(d_obtain_alias); /** - * d_splice_alias - splice a disconnected dentry into the tree if one exists - * @inode: the inode which may have a disconnected dentry - * @dentry: a negative dentry which we want to point to the inode. - * - * If inode is a directory and has a 'disconnected' dentry (i.e. IS_ROOT and - * DCACHE_DISCONNECTED), then d_move that in place of the given dentry - * and return it, else simply d_add the inode to the dentry and return NULL. + * d_obtain_alias - find or allocate a DISCONNECTED dentry for a given inode + * @inode: inode to allocate the dentry for * - * This is needed in the lookup routine of any filesystem that is exportable - * (via knfsd) so that we can build dcache paths to directories effectively. + * Obtain a dentry for an inode resulting from NFS filehandle conversion or + * similar open by handle operations. The returned dentry may be anonymous, + * or may have a full name (if the inode was already in the cache). * - * If a dentry was found and moved, then it is returned. Otherwise NULL - * is returned. This matches the expected return value of ->lookup. + * When called on a directory inode, we must ensure that the inode only ever + * has one dentry. If a dentry is found, that is returned instead of + * allocating a new one. * - * Cluster filesystems may call this function with a negative, hashed dentry. - * In that case, we know that the inode will be a regular file, and also this - * will only occur during atomic_open. So we need to check for the dentry - * being already hashed only in the final case. + * On successful return, the reference to the inode has been transferred + * to the dentry. In case of an error the reference on the inode is released. + * To make it easier to use in export operations a %NULL or IS_ERR inode may + * be passed in and the error will be propagated to the return value, + * with a %NULL @inode replaced by ERR_PTR(-ESTALE). 
*/ -struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) +struct dentry *d_obtain_alias(struct inode *inode) { - struct dentry *new = NULL; - - if (IS_ERR(inode)) - return ERR_CAST(inode); + return __d_obtain_alias(inode, 1); +} +EXPORT_SYMBOL(d_obtain_alias); - if (inode && S_ISDIR(inode->i_mode)) { - spin_lock(&inode->i_lock); - new = __d_find_alias(inode, 1); - if (new) { - BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED)); - spin_unlock(&inode->i_lock); - security_d_instantiate(new, inode); - d_move(new, dentry); - iput(inode); - } else { - /* already taking inode->i_lock, so d_add() by hand */ - __d_instantiate(dentry, inode); - spin_unlock(&inode->i_lock); - security_d_instantiate(dentry, inode); - d_rehash(dentry); - } - } else { - d_instantiate(dentry, inode); - if (d_unhashed(dentry)) - d_rehash(dentry); - } - return new; +/** + * d_obtain_root - find or allocate a dentry for a given inode + * @inode: inode to allocate the dentry for + * + * Obtain an IS_ROOT dentry for the root of a filesystem. + * + * We must ensure that directory inodes only ever have one dentry. If a + * dentry is found, that is returned instead of allocating a new one. + * + * On successful return, the reference to the inode has been transferred + * to the dentry. In case of an error the reference on the inode is + * released. A %NULL or IS_ERR inode may be passed in and will be the + * error will be propagate to the return value, with a %NULL @inode + * replaced by ERR_PTR(-ESTALE). + */ +struct dentry *d_obtain_root(struct inode *inode) +{ + return __d_obtain_alias(inode, 0); } -EXPORT_SYMBOL(d_splice_alias); +EXPORT_SYMBOL(d_obtain_root); /** * d_add_ci - lookup or allocate new dentry with case-exact name @@ -2402,7 +2372,8 @@ void dentry_update_name_case(struct dentry *dentry, struct qstr *name) } EXPORT_SYMBOL(dentry_update_name_case); -static void switch_names(struct dentry *dentry, struct dentry *target) +static void switch_names(struct dentry *dentry, struct dentry *target, + bool exchange) { if (dname_external(target)) { if (dname_external(dentry)) { @@ -2436,13 +2407,19 @@ static void switch_names(struct dentry *dentry, struct dentry *target) */ unsigned int i; BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long))); + if (!exchange) { + memcpy(dentry->d_iname, target->d_name.name, + target->d_name.len + 1); + dentry->d_name.hash_len = target->d_name.hash_len; + return; + } for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) { swap(((long *) &dentry->d_iname)[i], ((long *) &target->d_iname)[i]); } } } - swap(dentry->d_name.len, target->d_name.len); + swap(dentry->d_name.hash_len, target->d_name.hash_len); } static void dentry_lock_for_move(struct dentry *dentry, struct dentry *target) @@ -2472,25 +2449,29 @@ static void dentry_lock_for_move(struct dentry *dentry, struct dentry *target) } } -static void dentry_unlock_parents_for_move(struct dentry *dentry, - struct dentry *target) +static void dentry_unlock_for_move(struct dentry *dentry, struct dentry *target) { if (target->d_parent != dentry->d_parent) spin_unlock(&dentry->d_parent->d_lock); if (target->d_parent != target) spin_unlock(&target->d_parent->d_lock); + spin_unlock(&target->d_lock); + spin_unlock(&dentry->d_lock); } /* * When switching names, the actual string doesn't strictly have to * be preserved in the target - because we're dropping the target * anyway. As such, we can just do a simple memcpy() to copy over - * the new name before we switch. 
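With d_obtain_alias() and the new d_obtain_root() reduced to thin wrappers around __d_obtain_alias(), the only behavioral difference is the DCACHE_DISCONNECTED flag: NFS-style filehandle lookups get an anonymous alias that may later be connected into the tree, while a filesystem root stays a plain IS_ROOT dentry. A hypothetical fill_super-style caller of the new helper (the foofs names are made up):

	struct inode *root = foofs_root_iget(sb);	/* hypothetical lookup */
	struct dentry *d;

	d = d_obtain_root(root);	/* consumes the inode ref, even on error */
	if (IS_ERR(d))
		return PTR_ERR(d);
	sb->s_root = d;
	return 0;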
- * - * Note that we have to be a lot more careful about getting the hash - * switched - we have to switch the hash value properly even if it - * then no longer matches the actual (corrupted) string of the target. - * The hash value has to match the hash queue that the dentry is on.. + * the new name before we switch, unless we are going to rehash + * it. Note that if we *do* unhash the target, we are not allowed + * to rehash it without giving it a new name/hash key - whether + * we swap or overwrite the names here, resulting name won't match + * the reality in filesystem; it's only there for d_path() purposes. + * Note that all of this is happening under rename_lock, so the + * any hash lookup seeing it in the middle of manipulations will + * be discarded anyway. So we do not care what happens to the hash + * key in that case. */ /* * __d_move - move a dentry @@ -2536,36 +2517,30 @@ static void __d_move(struct dentry *dentry, struct dentry *target, d_hash(dentry->d_parent, dentry->d_name.hash)); } - list_del(&dentry->d_u.d_child); - list_del(&target->d_u.d_child); - /* Switch the names.. */ - switch_names(dentry, target); - swap(dentry->d_name.hash, target->d_name.hash); + switch_names(dentry, target, exchange); - /* ... and switch the parents */ + /* ... and switch them in the tree */ if (IS_ROOT(dentry)) { + /* splicing a tree */ dentry->d_parent = target->d_parent; target->d_parent = target; - INIT_LIST_HEAD(&target->d_u.d_child); + list_del_init(&target->d_u.d_child); + list_move(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs); } else { + /* swapping two dentries */ swap(dentry->d_parent, target->d_parent); - - /* And add them back to the (new) parent lists */ - list_add(&target->d_u.d_child, &target->d_parent->d_subdirs); + list_move(&target->d_u.d_child, &target->d_parent->d_subdirs); + list_move(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs); + if (exchange) + fsnotify_d_move(target); + fsnotify_d_move(dentry); } - list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs); - write_seqcount_end(&target->d_seq); write_seqcount_end(&dentry->d_seq); - dentry_unlock_parents_for_move(dentry, target); - if (exchange) - fsnotify_d_move(target); - spin_unlock(&target->d_lock); - fsnotify_d_move(dentry); - spin_unlock(&dentry->d_lock); + dentry_unlock_for_move(dentry, target); } /* @@ -2663,38 +2638,71 @@ out_err: return ret; } -/* - * Prepare an anonymous dentry for life in the superblock's dentry tree as a - * named dentry in place of the dentry to be replaced. - * returns with anon->d_lock held! +/** + * d_splice_alias - splice a disconnected dentry into the tree if one exists + * @inode: the inode which may have a disconnected dentry + * @dentry: a negative dentry which we want to point to the inode. + * + * If inode is a directory and has an IS_ROOT alias, then d_move that in + * place of the given dentry and return it, else simply d_add the inode + * to the dentry and return NULL. + * + * If a non-IS_ROOT directory is found, the filesystem is corrupt, and + * we should error out: directories can't have multiple aliases. + * + * This is needed in the lookup routine of any filesystem that is exportable + * (via knfsd) so that we can build dcache paths to directories effectively. + * + * If a dentry was found and moved, then it is returned. Otherwise NULL + * is returned. This matches the expected return value of ->lookup. + * + * Cluster filesystems may call this function with a negative, hashed dentry. 
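switch_names() can now move the hash and the length with a single swap because struct qstr packs them into one 64-bit hash_len word. An illustration of the real layout (little-endian field order shown):

	struct qstr_sketch {			/* mirrors struct qstr */
		union {
			struct {
				u32 hash;	/* full_name_hash() value */
				u32 len;	/* name length in bytes */
			};
			u64 hash_len;		/* both fields, one word */
		};
		const unsigned char *name;
	};

	swap(a->d_name.hash_len, b->d_name.hash_len);	/* hash and len stay paired */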
+ * In that case, we know that the inode will be a regular file, and also this + * will only occur during atomic_open. So we need to check for the dentry + * being already hashed only in the final case. */ -static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon) +struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) { - struct dentry *dparent; - - dentry_lock_for_move(anon, dentry); - - write_seqcount_begin(&dentry->d_seq); - write_seqcount_begin_nested(&anon->d_seq, DENTRY_D_LOCK_NESTED); - - dparent = dentry->d_parent; - - switch_names(dentry, anon); - swap(dentry->d_name.hash, anon->d_name.hash); - - dentry->d_parent = dentry; - list_del_init(&dentry->d_u.d_child); - anon->d_parent = dparent; - list_move(&anon->d_u.d_child, &dparent->d_subdirs); - - write_seqcount_end(&dentry->d_seq); - write_seqcount_end(&anon->d_seq); + struct dentry *new = NULL; - dentry_unlock_parents_for_move(anon, dentry); - spin_unlock(&dentry->d_lock); + if (IS_ERR(inode)) + return ERR_CAST(inode); - /* anon->d_lock still locked, returns locked */ + if (inode && S_ISDIR(inode->i_mode)) { + spin_lock(&inode->i_lock); + new = __d_find_any_alias(inode); + if (new) { + if (!IS_ROOT(new)) { + spin_unlock(&inode->i_lock); + dput(new); + return ERR_PTR(-EIO); + } + if (d_ancestor(new, dentry)) { + spin_unlock(&inode->i_lock); + dput(new); + return ERR_PTR(-EIO); + } + write_seqlock(&rename_lock); + __d_move(new, dentry, false); + write_sequnlock(&rename_lock); + spin_unlock(&inode->i_lock); + security_d_instantiate(new, inode); + iput(inode); + } else { + /* already taking inode->i_lock, so d_add() by hand */ + __d_instantiate(dentry, inode); + spin_unlock(&inode->i_lock); + security_d_instantiate(dentry, inode); + d_rehash(dentry); + } + } else { + d_instantiate(dentry, inode); + if (d_unhashed(dentry)) + d_rehash(dentry); + } + return new; } +EXPORT_SYMBOL(d_splice_alias); /** * d_materialise_unique - introduce an inode into the tree @@ -2724,7 +2732,7 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode) struct dentry *alias; /* Does an aliased dentry already exist? */ - alias = __d_find_alias(inode, 0); + alias = __d_find_alias(inode); if (alias) { actual = alias; write_seqlock(&rename_lock); @@ -2736,9 +2744,8 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode) } else if (IS_ROOT(alias)) { /* Is this an anonymous mountpoint that we * could splice into our tree? */ - __d_materialise_dentry(dentry, alias); + __d_move(alias, dentry, false); write_sequnlock(&rename_lock); - __d_drop(alias); goto found; } else { /* Nope, but we must(!) 
avoid directory @@ -2764,13 +2771,9 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode) actual = __d_instantiate_unique(dentry, inode); if (!actual) actual = dentry; - else - BUG_ON(!d_unhashed(actual)); - spin_lock(&actual->d_lock); + d_rehash(actual); found: - _d_rehash(actual); - spin_unlock(&actual->d_lock); spin_unlock(&inode->i_lock); out_nolock: if (actual == dentry) { diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 63146295153b..76c08c2beb2f 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -451,7 +451,7 @@ static ssize_t read_file_bool(struct file *file, char __user *user_buf, { char buf[3]; u32 *val = file->private_data; - + if (*val) buf[0] = 'Y'; else diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 8c41b52da358..1e3b99d3db0d 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -66,7 +66,7 @@ static struct inode *debugfs_get_inode(struct super_block *sb, umode_t mode, dev break; } } - return inode; + return inode; } /* SMP-safe */ @@ -317,7 +317,7 @@ static struct dentry *__create_file(const char *name, umode_t mode, goto exit; /* If the parent is not specified, we create it in the root. - * We need the root dentry to do this, which is in the super + * We need the root dentry to do this, which is in the super * block. A pointer to that is in the struct vfsmount that we * have around. */ @@ -330,7 +330,7 @@ static struct dentry *__create_file(const char *name, umode_t mode, switch (mode & S_IFMT) { case S_IFDIR: error = debugfs_mkdir(parent->d_inode, dentry, mode); - + break; case S_IFLNK: error = debugfs_link(parent->d_inode, dentry, mode, @@ -534,7 +534,7 @@ EXPORT_SYMBOL_GPL(debugfs_remove); */ void debugfs_remove_recursive(struct dentry *dentry) { - struct dentry *child, *next, *parent; + struct dentry *child, *parent; if (IS_ERR_OR_NULL(dentry)) return; @@ -546,30 +546,49 @@ void debugfs_remove_recursive(struct dentry *dentry) parent = dentry; down: mutex_lock(&parent->d_inode->i_mutex); - list_for_each_entry_safe(child, next, &parent->d_subdirs, d_u.d_child) { + loop: + /* + * The parent->d_subdirs is protected by the d_lock. Outside that + * lock, the child can be unlinked and set to be freed which can + * use the d_u.d_child as the rcu head and corrupt this list. + */ + spin_lock(&parent->d_lock); + list_for_each_entry(child, &parent->d_subdirs, d_u.d_child) { if (!debugfs_positive(child)) continue; /* perhaps simple_empty(child) makes more sense */ if (!list_empty(&child->d_subdirs)) { + spin_unlock(&parent->d_lock); mutex_unlock(&parent->d_inode->i_mutex); parent = child; goto down; } - up: + + spin_unlock(&parent->d_lock); + if (!__debugfs_remove(child, parent)) simple_release_fs(&debugfs_mount, &debugfs_mount_count); + + /* + * The parent->d_lock protects agaist child from unlinking + * from d_subdirs. When releasing the parent->d_lock we can + * no longer trust that the next pointer is valid. + * Restart the loop. We'll skip this one with the + * debugfs_positive() check. 
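The debugfs_remove_recursive() rework above trades list_for_each_entry_safe() for a restart loop: d_lock is held only while scanning d_subdirs, and as soon as a child is acted on the scan restarts from the list head, since a concurrently unlinked child may have had its d_u.d_child reused as an RCU head. Reduced to its control flow (remove_one() is a hypothetical stand-in):

	loop:
		spin_lock(&parent->d_lock);
		list_for_each_entry(child, &parent->d_subdirs, d_u.d_child) {
			if (!debugfs_positive(child))
				continue;	/* skips entries already removed */
			spin_unlock(&parent->d_lock);
			remove_one(child);
			goto loop;		/* next pointer no longer trusted */
		}
		spin_unlock(&parent->d_lock);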
+ */ + goto loop; } + spin_unlock(&parent->d_lock); mutex_unlock(&parent->d_inode->i_mutex); child = parent; parent = parent->d_parent; mutex_lock(&parent->d_inode->i_mutex); - if (child != dentry) { - next = list_next_entry(child, d_u.d_child); - goto up; - } + if (child != dentry) + /* go up */ + goto loop; if (!__debugfs_remove(child, parent)) simple_release_fs(&debugfs_mount, &debugfs_mount_count); diff --git a/fs/direct-io.c b/fs/direct-io.c index 17e39b047de5..e181b6b2e297 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -158,7 +158,7 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio) { ssize_t ret; - ret = iov_iter_get_pages(sdio->iter, dio->pages, DIO_PAGES * PAGE_SIZE, + ret = iov_iter_get_pages(sdio->iter, dio->pages, LONG_MAX, DIO_PAGES, &sdio->from); if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) { diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index 8d77ba7b1756..1323c568e362 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -718,16 +718,11 @@ static const struct file_operations waiters_fops = { void dlm_delete_debug_file(struct dlm_ls *ls) { - if (ls->ls_debug_rsb_dentry) - debugfs_remove(ls->ls_debug_rsb_dentry); - if (ls->ls_debug_waiters_dentry) - debugfs_remove(ls->ls_debug_waiters_dentry); - if (ls->ls_debug_locks_dentry) - debugfs_remove(ls->ls_debug_locks_dentry); - if (ls->ls_debug_all_dentry) - debugfs_remove(ls->ls_debug_all_dentry); - if (ls->ls_debug_toss_dentry) - debugfs_remove(ls->ls_debug_toss_dentry); + debugfs_remove(ls->ls_debug_rsb_dentry); + debugfs_remove(ls->ls_debug_waiters_dentry); + debugfs_remove(ls->ls_debug_locks_dentry); + debugfs_remove(ls->ls_debug_all_dentry); + debugfs_remove(ls->ls_debug_toss_dentry); } int dlm_create_debug_file(struct dlm_ls *ls) diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index f704458ea5f5..e0ab3a93eeff 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -30,7 +30,7 @@ struct plock_op { struct plock_xop { struct plock_op xop; - void *callback; + int (*callback)(struct file_lock *fl, int result); void *fl; void *file; struct file_lock flc; @@ -190,7 +190,7 @@ static int dlm_plock_callback(struct plock_op *op) struct file *file; struct file_lock *fl; struct file_lock *flc; - int (*notify)(void *, void *, int) = NULL; + int (*notify)(struct file_lock *fl, int result) = NULL; struct plock_xop *xop = (struct plock_xop *)op; int rv = 0; @@ -209,7 +209,7 @@ static int dlm_plock_callback(struct plock_op *op) notify = xop->callback; if (op->info.rv) { - notify(fl, NULL, op->info.rv); + notify(fl, op->info.rv); goto out; } @@ -228,7 +228,7 @@ static int dlm_plock_callback(struct plock_op *op) (unsigned long long)op->info.number, file, fl); } - rv = notify(fl, NULL, 0); + rv = notify(fl, 0); if (rv) { /* XXX: We need to cancel the fs lock here: */ log_print("dlm_plock_callback: lock granted after lock request " diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index db0fad3269c0..b4b6ab9873ae 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -229,8 +229,8 @@ static int ecryptfs_open(struct inode *inode, struct file *file) if (rc) { printk(KERN_ERR "%s: Error attempting to initialize " "the lower file for the dentry with name " - "[%s]; rc = [%d]\n", __func__, - ecryptfs_dentry->d_name.name, rc); + "[%pd]; rc = [%d]\n", __func__, + ecryptfs_dentry, rc); goto out_free; } if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_ACCMODE) diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index d4a9431ec73c..1686dc2da9fd 100644 --- a/fs/ecryptfs/inode.c +++ 
b/fs/ecryptfs/inode.c @@ -53,9 +53,7 @@ static void unlock_dir(struct dentry *dir) static int ecryptfs_inode_test(struct inode *inode, void *lower_inode) { - if (ecryptfs_inode_to_lower(inode) == (struct inode *)lower_inode) - return 1; - return 0; + return ecryptfs_inode_to_lower(inode) == lower_inode; } static int ecryptfs_inode_set(struct inode *inode, void *opaque) @@ -192,12 +190,6 @@ ecryptfs_do_create(struct inode *directory_inode, lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); lower_dir_dentry = lock_parent(lower_dentry); - if (IS_ERR(lower_dir_dentry)) { - ecryptfs_printk(KERN_ERR, "Error locking directory of " - "dentry\n"); - inode = ERR_CAST(lower_dir_dentry); - goto out; - } rc = vfs_create(lower_dir_dentry->d_inode, lower_dentry, mode, true); if (rc) { printk(KERN_ERR "%s: Failure to create dentry in lower fs; " @@ -215,7 +207,6 @@ ecryptfs_do_create(struct inode *directory_inode, fsstack_copy_inode_size(directory_inode, lower_dir_dentry->d_inode); out_lock: unlock_dir(lower_dir_dentry); -out: return inode; } @@ -250,8 +241,8 @@ int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry, if (rc) { printk(KERN_ERR "%s: Error attempting to initialize " "the lower file for the dentry with name " - "[%s]; rc = [%d]\n", __func__, - ecryptfs_dentry->d_name.name, rc); + "[%pd]; rc = [%d]\n", __func__, + ecryptfs_dentry, rc); goto out; } rc = ecryptfs_write_metadata(ecryptfs_dentry, ecryptfs_inode); @@ -313,8 +304,8 @@ static int ecryptfs_i_size_read(struct dentry *dentry, struct inode *inode) if (rc) { printk(KERN_ERR "%s: Error attempting to initialize " "the lower file for the dentry with name " - "[%s]; rc = [%d]\n", __func__, - dentry->d_name.name, rc); + "[%pd]; rc = [%d]\n", __func__, + dentry, rc); return rc; } @@ -418,8 +409,8 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, if (IS_ERR(lower_dentry)) { rc = PTR_ERR(lower_dentry); ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " - "[%d] on lower_dentry = [%s]\n", __func__, rc, - ecryptfs_dentry->d_name.name); + "[%d] on lower_dentry = [%pd]\n", __func__, rc, + ecryptfs_dentry); goto out; } if (lower_dentry->d_inode) @@ -1039,7 +1030,7 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, } rc = vfs_setxattr(lower_dentry, name, value, size, flags); - if (!rc) + if (!rc && dentry->d_inode) fsstack_copy_attr_all(dentry->d_inode, lower_dentry->d_inode); out: return rc; diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 4725a07f003c..635e8e16a5b7 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -26,7 +26,6 @@ */ #include <linux/string.h> -#include <linux/syscalls.h> #include <linux/pagemap.h> #include <linux/key.h> #include <linux/random.h> @@ -1846,7 +1845,6 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat, "(Tag 11 not allowed by itself)\n"); rc = -EIO; goto out_wipe_list; - break; default: ecryptfs_printk(KERN_DEBUG, "No packet at offset [%zd] " "of the file header; hex value of " diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index e57380e5f6bd..286f10b0363b 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -434,8 +434,7 @@ void ecryptfs_release_messaging(void) mutex_lock(&ecryptfs_msg_ctx_lists_mux); for (i = 0; i < ecryptfs_message_buf_len; i++) { mutex_lock(&ecryptfs_msg_ctx_arr[i].mux); - if (ecryptfs_msg_ctx_arr[i].msg) - kfree(ecryptfs_msg_ctx_arr[i].msg); + kfree(ecryptfs_msg_ctx_arr[i].msg); mutex_unlock(&ecryptfs_msg_ctx_arr[i].mux); } 
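The ecryptfs message changes use the %pd printk extension: instead of dereferencing dentry->d_name.name, which can race with a concurrent rename, vsnprintf() is handed the dentry itself and renders the name under the proper locking. Side by side:

	printk(KERN_ERR "open failed on [%s]\n", dentry->d_name.name); /* old, racy */
	printk(KERN_ERR "open failed on [%pd]\n", dentry);             /* new */
	/* %pD does the same for a struct file * */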
kfree(ecryptfs_msg_ctx_arr); diff --git a/fs/efs/namei.c b/fs/efs/namei.c index 356c044e2cd3..bbee8f063dfa 100644 --- a/fs/efs/namei.c +++ b/fs/efs/namei.c @@ -12,7 +12,8 @@ #include "efs.h" -static efs_ino_t efs_find_entry(struct inode *inode, const char *name, int len) { +static efs_ino_t efs_find_entry(struct inode *inode, const char *name, int len) +{ struct buffer_head *bh; int slot, namelen; @@ -40,10 +41,10 @@ static efs_ino_t efs_find_entry(struct inode *inode, const char *name, int len) if (be16_to_cpu(dirblock->magic) != EFS_DIRBLK_MAGIC) { pr_err("%s(): invalid directory block\n", __func__); brelse(bh); - return(0); + return 0; } - for(slot = 0; slot < dirblock->slots; slot++) { + for (slot = 0; slot < dirblock->slots; slot++) { dirslot = (struct efs_dentry *) (((char *) bh->b_data) + EFS_SLOTAT(dirblock, slot)); namelen = dirslot->namelen; @@ -52,12 +53,12 @@ static efs_ino_t efs_find_entry(struct inode *inode, const char *name, int len) if ((namelen == len) && (!memcmp(name, nameptr, len))) { inodenum = be32_to_cpu(dirslot->inode); brelse(bh); - return(inodenum); + return inodenum; } } brelse(bh); } - return(0); + return 0; } struct dentry *efs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index b10b48c2a7af..7bcfff900f05 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1852,7 +1852,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, goto error_tgt_fput; /* Check if EPOLLWAKEUP is allowed */ - ep_take_care_of_epollwakeup(&epds); + if (ep_op_has_event(op)) + ep_take_care_of_epollwakeup(&epds); /* * We have to check that the file structure underneath the file descriptor diff --git a/fs/exec.c b/fs/exec.c index ab1f1200ce5d..a2b42a98c743 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -368,10 +368,6 @@ static int bprm_mm_init(struct linux_binprm *bprm) if (!mm) goto err; - err = init_new_context(current, mm); - if (err) - goto err; - err = __bprm_mm_init(bprm); if (err) goto err; diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c index 7f20f25c232c..84529b8a331b 100644 --- a/fs/exofs/ore_raid.c +++ b/fs/exofs/ore_raid.c @@ -116,7 +116,7 @@ static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width, num_a1pa = min_t(unsigned, PAGE_SIZE / sizeof__a1pa, pages_in_unit - i); - __a1pa = kzalloc(num_a1pa * sizeof__a1pa, GFP_KERNEL); + __a1pa = kcalloc(num_a1pa, sizeof__a1pa, GFP_KERNEL); if (unlikely(!__a1pa)) { ORE_DBGMSG("!! 
Failed to _alloc_1p_arrays=%d\n", num_a1pa); diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 3750031cfa2f..170dc41e8bf4 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -161,7 +161,7 @@ static struct kmem_cache * ext2_inode_cachep; static struct inode *ext2_alloc_inode(struct super_block *sb) { struct ext2_inode_info *ei; - ei = (struct ext2_inode_info *)kmem_cache_alloc(ext2_inode_cachep, GFP_KERNEL); + ei = kmem_cache_alloc(ext2_inode_cachep, GFP_KERNEL); if (!ei) return NULL; ei->i_block_alloc_info = NULL; @@ -1067,14 +1067,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) ext2_rsv_window_add(sb, &sbi->s_rsv_window_head); err = percpu_counter_init(&sbi->s_freeblocks_counter, - ext2_count_free_blocks(sb)); + ext2_count_free_blocks(sb), GFP_KERNEL); if (!err) { err = percpu_counter_init(&sbi->s_freeinodes_counter, - ext2_count_free_inodes(sb)); + ext2_count_free_inodes(sb), GFP_KERNEL); } if (!err) { err = percpu_counter_init(&sbi->s_dirs_counter, - ext2_count_dirs(sb)); + ext2_count_dirs(sb), GFP_KERNEL); } if (err) { ext2_msg(sb, KERN_ERR, "error: insufficient memory"); diff --git a/fs/ext3/ext3.h b/fs/ext3/ext3.h index e85ff15a060e..fc3cdcf24aed 100644 --- a/fs/ext3/ext3.h +++ b/fs/ext3/ext3.h @@ -237,6 +237,8 @@ struct ext3_new_group_data { #define EXT3_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION #define EXT3_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION +/* Number of supported quota types */ +#define EXT3_MAXQUOTAS 2 /* * Mount options @@ -248,7 +250,7 @@ struct ext3_mount_options { unsigned long s_commit_interval; #ifdef CONFIG_QUOTA int s_jquota_fmt; - char *s_qf_names[MAXQUOTAS]; + char *s_qf_names[EXT3_MAXQUOTAS]; #endif }; @@ -669,7 +671,7 @@ struct ext3_sb_info { unsigned long s_commit_interval; struct block_device *journal_bdev; #ifdef CONFIG_QUOTA - char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ + char *s_qf_names[EXT3_MAXQUOTAS]; /* Names of quota files with journalled quota */ int s_jquota_fmt; /* Format of quota to use */ #endif }; @@ -1183,9 +1185,9 @@ extern const struct inode_operations ext3_fast_symlink_inode_operations; #define EXT3_QUOTA_INIT_BLOCKS(sb) 0 #define EXT3_QUOTA_DEL_BLOCKS(sb) 0 #endif -#define EXT3_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT3_QUOTA_TRANS_BLOCKS(sb)) -#define EXT3_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT3_QUOTA_INIT_BLOCKS(sb)) -#define EXT3_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT3_QUOTA_DEL_BLOCKS(sb)) +#define EXT3_MAXQUOTAS_TRANS_BLOCKS(sb) (EXT3_MAXQUOTAS*EXT3_QUOTA_TRANS_BLOCKS(sb)) +#define EXT3_MAXQUOTAS_INIT_BLOCKS(sb) (EXT3_MAXQUOTAS*EXT3_QUOTA_INIT_BLOCKS(sb)) +#define EXT3_MAXQUOTAS_DEL_BLOCKS(sb) (EXT3_MAXQUOTAS*EXT3_QUOTA_DEL_BLOCKS(sb)) int ext3_mark_iloc_dirty(handle_t *handle, diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 08cdfe5461e3..7015db0bafd1 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -441,7 +441,7 @@ static void ext3_put_super (struct super_block * sb) percpu_counter_destroy(&sbi->s_dirs_counter); brelse(sbi->s_sbh); #ifdef CONFIG_QUOTA - for (i = 0; i < MAXQUOTAS; i++) + for (i = 0; i < EXT3_MAXQUOTAS; i++) kfree(sbi->s_qf_names[i]); #endif @@ -1555,7 +1555,7 @@ static void ext3_orphan_cleanup (struct super_block * sb, /* Needed for iput() to work correctly and not trash data */ sb->s_flags |= MS_ACTIVE; /* Turn on quotas so that they are updated correctly */ - for (i = 0; i < MAXQUOTAS; i++) { + for (i = 0; i < EXT3_MAXQUOTAS; i++) { if (EXT3_SB(sb)->s_qf_names[i]) { int ret = ext3_quota_on_mount(sb, i); if (ret < 0) @@ -1606,7 +1606,7 
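The ext2 and ext3 hunks here are part of a tree-wide API change: percpu_counter_init() now takes an explicit gfp_t for its per-cpu allocation instead of assuming GFP_KERNEL, so callers in atomic or reclaim-sensitive contexts can say so. Mount-time filesystem code simply passes GFP_KERNEL; a sketch of the resulting pattern:

	err = percpu_counter_init(&sbi->s_freeblocks_counter,
				  ext2_count_free_blocks(sb), GFP_KERNEL);
	if (err)
		goto failed_mount;	/* error path as in the hunks above */
	...
	percpu_counter_destroy(&sbi->s_freeblocks_counter); /* unmount/error */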
@@ static void ext3_orphan_cleanup (struct super_block * sb, PLURAL(nr_truncates)); #ifdef CONFIG_QUOTA /* Turn quotas off */ - for (i = 0; i < MAXQUOTAS; i++) { + for (i = 0; i < EXT3_MAXQUOTAS; i++) { if (sb_dqopt(sb)->files[i]) dquot_quota_off(sb, i); } @@ -2039,14 +2039,14 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) goto failed_mount2; } err = percpu_counter_init(&sbi->s_freeblocks_counter, - ext3_count_free_blocks(sb)); + ext3_count_free_blocks(sb), GFP_KERNEL); if (!err) { err = percpu_counter_init(&sbi->s_freeinodes_counter, - ext3_count_free_inodes(sb)); + ext3_count_free_inodes(sb), GFP_KERNEL); } if (!err) { err = percpu_counter_init(&sbi->s_dirs_counter, - ext3_count_dirs(sb)); + ext3_count_dirs(sb), GFP_KERNEL); } if (err) { ext3_msg(sb, KERN_ERR, "error: insufficient memory"); @@ -2139,7 +2139,7 @@ failed_mount2: kfree(sbi->s_group_desc); failed_mount: #ifdef CONFIG_QUOTA - for (i = 0; i < MAXQUOTAS; i++) + for (i = 0; i < EXT3_MAXQUOTAS; i++) kfree(sbi->s_qf_names[i]); #endif ext3_blkdev_remove(sbi); @@ -2659,7 +2659,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) old_opts.s_commit_interval = sbi->s_commit_interval; #ifdef CONFIG_QUOTA old_opts.s_jquota_fmt = sbi->s_jquota_fmt; - for (i = 0; i < MAXQUOTAS; i++) + for (i = 0; i < EXT3_MAXQUOTAS; i++) if (sbi->s_qf_names[i]) { old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i], GFP_KERNEL); @@ -2763,7 +2763,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) } #ifdef CONFIG_QUOTA /* Release old quota file names */ - for (i = 0; i < MAXQUOTAS; i++) + for (i = 0; i < EXT3_MAXQUOTAS; i++) kfree(old_opts.s_qf_names[i]); #endif if (enable_quota) @@ -2777,7 +2777,7 @@ restore_opts: sbi->s_commit_interval = old_opts.s_commit_interval; #ifdef CONFIG_QUOTA sbi->s_jquota_fmt = old_opts.s_jquota_fmt; - for (i = 0; i < MAXQUOTAS; i++) { + for (i = 0; i < EXT3_MAXQUOTAS; i++) { kfree(sbi->s_qf_names[i]); sbi->s_qf_names[i] = old_opts.s_qf_names[i]; } @@ -2828,8 +2828,9 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf) */ overhead += ngroups * (2 + sbi->s_itb_per_group); - /* Add the journal blocks as well */ - overhead += sbi->s_journal->j_maxlen; + /* Add the internal journal blocks as well */ + if (sbi->s_journal && !sbi->journal_bdev) + overhead += sbi->s_journal->j_maxlen; sbi->s_overhead_last = overhead; smp_wmb(); diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index fca382037ddd..581ef40fbe90 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -639,7 +639,6 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, if (!(*errp) && ext4_test_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED)) { spin_lock(&EXT4_I(inode)->i_block_reservation_lock); - EXT4_I(inode)->i_allocated_meta_blocks += ar.len; spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); dquot_alloc_block_nofail(inode, EXT4_C2B(EXT4_SB(inode->i_sb), ar.len)); diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index ef1bed66c14f..0bb3f9ea0832 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -571,6 +571,31 @@ static int ext4_release_dir(struct inode *inode, struct file *filp) return 0; } +int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf, + int buf_size) +{ + struct ext4_dir_entry_2 *de; + int nlen, rlen; + unsigned int offset = 0; + char *top; + + de = (struct ext4_dir_entry_2 *)buf; + top = buf + buf_size; + while ((char *) de < top) { + if (ext4_check_dir_entry(dir, NULL, de, bh, + buf, buf_size, offset)) + return 
-EIO; + nlen = EXT4_DIR_REC_LEN(de->name_len); + rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); + de = (struct ext4_dir_entry_2 *)((char *)de + rlen); + offset += rlen; + } + if ((char *) de > top) + return -EIO; + + return 0; +} + const struct file_operations ext4_dir_operations = { .llseek = ext4_dir_llseek, .read = generic_read_dir, diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 7cc5a0e23688..b0c225cdb52c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -591,7 +591,6 @@ enum { #define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 #define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 -#define EXT4_FREE_BLOCKS_RESERVE 0x0040 /* * ioctl commands @@ -1826,7 +1825,7 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) /* * Special error return code only used by dx_probe() and its callers. */ -#define ERR_BAD_DX_DIR -75000 +#define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1)) /* * Timeout and state flag for lazy initialization inode thread. @@ -2029,6 +2028,8 @@ static inline unsigned char get_dtype(struct super_block *sb, int filetype) return ext4_filetype_table[filetype]; } +extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, + void *buf, int buf_size); /* fsync.c */ extern int ext4_sync_file(struct file *, loff_t, loff_t, int); @@ -2144,8 +2145,8 @@ extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks); extern void ext4_ind_truncate(handle_t *, struct inode *inode); -extern int ext4_free_hole_blocks(handle_t *handle, struct inode *inode, - ext4_lblk_t first, ext4_lblk_t stop); +extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode, + ext4_lblk_t start, ext4_lblk_t end); /* ioctl.c */ extern long ext4_ioctl(struct file *, unsigned int, unsigned long); @@ -2453,6 +2454,22 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) up_write(&EXT4_I(inode)->i_data_sem); } +/* Update i_size, i_disksize. 
Requires i_mutex to avoid races with truncate */ +static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize) +{ + int changed = 0; + + if (newsize > inode->i_size) { + i_size_write(inode, newsize); + changed = 1; + } + if (newsize > EXT4_I(inode)->i_disksize) { + ext4_update_i_disksize(inode, newsize); + changed |= 2; + } + return changed; +} + struct ext4_group_info { unsigned long bb_state; struct rb_root bb_free_root; @@ -2560,7 +2577,6 @@ extern const struct file_operations ext4_file_operations; extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); /* inline.c */ -extern int ext4_has_inline_data(struct inode *inode); extern int ext4_get_max_inline_size(struct inode *inode); extern int ext4_find_inline_data_nolock(struct inode *inode); extern int ext4_init_inline_data(handle_t *handle, struct inode *inode, @@ -2626,6 +2642,12 @@ extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline); extern int ext4_convert_inline_data(struct inode *inode); +static inline int ext4_has_inline_data(struct inode *inode) +{ + return ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) && + EXT4_I(inode)->i_inline_off; +} + /* namei.c */ extern const struct inode_operations ext4_dir_inode_operations; extern const struct inode_operations ext4_special_inode_operations; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 4da228a0e6d0..74292a71b384 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -161,6 +161,8 @@ int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle, struct inode *inode, struct ext4_ext_path *path) { int err; + + WARN_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem)); if (path->p_bh) { ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh)); /* path points to block */ @@ -1808,8 +1810,7 @@ static void ext4_ext_try_to_merge_up(handle_t *handle, brelse(path[1].p_bh); ext4_free_blocks(handle, inode, NULL, blk, 1, - EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET | - EXT4_FREE_BLOCKS_RESERVE); + EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); } /* @@ -3253,7 +3254,7 @@ out: fix_extent_len: ex->ee_len = orig_ex.ee_len; - ext4_ext_dirty(handle, inode, path + depth); + ext4_ext_dirty(handle, inode, path + path->p_depth); return err; } @@ -4664,7 +4665,8 @@ retry: } static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, - ext4_lblk_t len, int flags, int mode) + ext4_lblk_t len, loff_t new_size, + int flags, int mode) { struct inode *inode = file_inode(file); handle_t *handle; @@ -4673,8 +4675,10 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, int retries = 0; struct ext4_map_blocks map; unsigned int credits; + loff_t epos; map.m_lblk = offset; + map.m_len = len; /* * Don't normalize the request if it can fit in one extent so * that it doesn't get unnecessarily split into multiple @@ -4689,9 +4693,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, credits = ext4_chunk_trans_blocks(inode, len); retry: - while (ret >= 0 && ret < len) { - map.m_lblk = map.m_lblk + ret; - map.m_len = len = len - ret; + while (ret >= 0 && len) { handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); if (IS_ERR(handle)) { @@ -4708,6 +4710,21 @@ retry: ret2 = ext4_journal_stop(handle); break; } + map.m_lblk += ret; + map.m_len = len = len - ret; + epos = (loff_t)map.m_lblk << inode->i_blkbits; + inode->i_ctime = ext4_current_time(inode); + if (new_size) { + if (epos > new_size) + epos = new_size; + if (ext4_update_inode_size(inode, epos) & 
0x1) + inode->i_mtime = inode->i_ctime; + } else { + if (epos > inode->i_size) + ext4_set_inode_flag(inode, + EXT4_INODE_EOFBLOCKS); + } + ext4_mark_inode_dirty(handle, inode); ret2 = ext4_journal_stop(handle); if (ret2) break; @@ -4730,7 +4747,8 @@ static long ext4_zero_range(struct file *file, loff_t offset, loff_t new_size = 0; int ret = 0; int flags; - int partial; + int credits; + int partial_begin, partial_end; loff_t start, end; ext4_lblk_t lblk; struct address_space *mapping = inode->i_mapping; @@ -4770,7 +4788,8 @@ static long ext4_zero_range(struct file *file, loff_t offset, if (start < offset || end > offset + len) return -EINVAL; - partial = (offset + len) & ((1 << blkbits) - 1); + partial_begin = offset & ((1 << blkbits) - 1); + partial_end = (offset + len) & ((1 << blkbits) - 1); lblk = start >> blkbits; max_blocks = (end >> blkbits); @@ -4804,7 +4823,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, * If we have a partial block after EOF we have to allocate * the entire block. */ - if (partial) + if (partial_end) max_blocks += 1; } @@ -4812,6 +4831,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, /* Now release the pages and zero block aligned part of pages*/ truncate_pagecache_range(inode, start, end - 1); + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); /* Wait all existing dio workers, newcomers will block on i_mutex */ ext4_inode_block_unlocked_dio(inode); @@ -4824,13 +4844,22 @@ static long ext4_zero_range(struct file *file, loff_t offset, if (ret) goto out_dio; - ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, - mode); + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, + flags, mode); if (ret) goto out_dio; } + if (!partial_begin && !partial_end) + goto out_dio; - handle = ext4_journal_start(inode, EXT4_HT_MISC, 4); + /* + * In worst case we have to writeout two nonadjacent unwritten + * blocks and update the inode + */ + credits = (2 * ext4_ext_index_trans_blocks(inode, 2)) + 1; + if (ext4_should_journal_data(inode)) + credits += 2; + handle = ext4_journal_start(inode, EXT4_HT_MISC, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_std_error(inode->i_sb, ret); @@ -4838,12 +4867,8 @@ static long ext4_zero_range(struct file *file, loff_t offset, } inode->i_mtime = inode->i_ctime = ext4_current_time(inode); - if (new_size) { - if (new_size > i_size_read(inode)) - i_size_write(inode, new_size); - if (new_size > EXT4_I(inode)->i_disksize) - ext4_update_i_disksize(inode, new_size); + ext4_update_inode_size(inode, new_size); } else { /* * Mark that we allocate beyond EOF so the subsequent truncate @@ -4852,7 +4877,6 @@ static long ext4_zero_range(struct file *file, loff_t offset, if ((offset + len) > i_size_read(inode)) ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); } - ext4_mark_inode_dirty(handle, inode); /* Zero out partial block at the edges of the range */ @@ -4879,13 +4903,11 @@ out_mutex: long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); - handle_t *handle; loff_t new_size = 0; unsigned int max_blocks; int ret = 0; int flags; ext4_lblk_t lblk; - struct timespec tv; unsigned int blkbits = inode->i_blkbits; /* Return error if mode is not supported */ @@ -4936,36 +4958,15 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) goto out; } - ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, mode); + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, + flags, mode); if (ret) 
goto out; - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); - if (IS_ERR(handle)) - goto out; - - tv = inode->i_ctime = ext4_current_time(inode); - - if (new_size) { - if (new_size > i_size_read(inode)) { - i_size_write(inode, new_size); - inode->i_mtime = tv; - } - if (new_size > EXT4_I(inode)->i_disksize) - ext4_update_i_disksize(inode, new_size); - } else { - /* - * Mark that we allocate beyond EOF so the subsequent truncate - * can proceed even if the new size is the same as i_size. - */ - if ((offset + len) > i_size_read(inode)) - ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); + if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) { + ret = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal, + EXT4_I(inode)->i_sync_tid); } - ext4_mark_inode_dirty(handle, inode); - if (file->f_flags & O_SYNC) - ext4_handle_sync(handle); - - ext4_journal_stop(handle); out: mutex_unlock(&inode->i_mutex); trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); @@ -5403,16 +5404,13 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) int ret; /* Collapse range works only on fs block size aligned offsets. */ - if (offset & (EXT4_BLOCK_SIZE(sb) - 1) || - len & (EXT4_BLOCK_SIZE(sb) - 1)) + if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) || + len & (EXT4_CLUSTER_SIZE(sb) - 1)) return -EINVAL; if (!S_ISREG(inode->i_mode)) return -EINVAL; - if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) - return -EOPNOTSUPP; - trace_ext4_collapse_range(inode, offset, len); punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 8695f70af1ef..aca7b24a4432 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -200,10 +200,6 @@ static const struct vm_operations_struct ext4_file_vm_ops = { static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) { - struct address_space *mapping = file->f_mapping; - - if (!mapping->a_ops->readpage) - return -ENOEXEC; file_accessed(file); vma->vm_ops = &ext4_file_vm_ops; return 0; diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index fd69da194826..e75f840000a0 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -1295,97 +1295,220 @@ do_indirects: } } -static int free_hole_blocks(handle_t *handle, struct inode *inode, - struct buffer_head *parent_bh, __le32 *i_data, - int level, ext4_lblk_t first, - ext4_lblk_t count, int max) +/** + * ext4_ind_remove_space - remove space from the range + * @handle: JBD handle for this transaction + * @inode: inode we are dealing with + * @start: First block to remove + * @end: One block after the last block to remove (exclusive) + * + * Free the blocks in the defined range (end is exclusive endpoint of + * range). This is used by ext4_punch_hole(). 
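+ *
+ * (Illustrative, assuming a 4 KiB block size: ext4_block_to_path()
+ * reports a path depth of 1 for direct blocks, i.e. logical block
+ * numbers below 12, a depth of 2 for singly-indirect blocks up to
+ * 12 + 1024, and so on. Punching blocks [4, 500) therefore yields
+ * n == 1 and n2 == 2 below and takes the "different levels" branch:
+ * the direct slots are freed first, then the partially covered
+ * indirect block at the end of the range is trimmed.)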
+ */ +int ext4_ind_remove_space(handle_t *handle, struct inode *inode, + ext4_lblk_t start, ext4_lblk_t end) { - struct buffer_head *bh = NULL; + struct ext4_inode_info *ei = EXT4_I(inode); + __le32 *i_data = ei->i_data; int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); - int ret = 0; - int i, inc; - ext4_lblk_t offset; - __le32 blk; - - inc = 1 << ((EXT4_BLOCK_SIZE_BITS(inode->i_sb) - 2) * level); - for (i = 0, offset = 0; i < max; i++, i_data++, offset += inc) { - if (offset >= count + first) - break; - if (*i_data == 0 || (offset + inc) <= first) - continue; - blk = *i_data; - if (level > 0) { - ext4_lblk_t first2; - ext4_lblk_t count2; + ext4_lblk_t offsets[4], offsets2[4]; + Indirect chain[4], chain2[4]; + Indirect *partial, *partial2; + ext4_lblk_t max_block; + __le32 nr = 0, nr2 = 0; + int n = 0, n2 = 0; + unsigned blocksize = inode->i_sb->s_blocksize; - bh = sb_bread(inode->i_sb, le32_to_cpu(blk)); - if (!bh) { - EXT4_ERROR_INODE_BLOCK(inode, le32_to_cpu(blk), - "Read failure"); - return -EIO; - } - if (first > offset) { - first2 = first - offset; - count2 = count; + max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) + >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); + if (end >= max_block) + end = max_block; + if ((start >= end) || (start > max_block)) + return 0; + + n = ext4_block_to_path(inode, start, offsets, NULL); + n2 = ext4_block_to_path(inode, end, offsets2, NULL); + + BUG_ON(n > n2); + + if ((n == 1) && (n == n2)) { + /* We're punching only within the direct block range */ + ext4_free_data(handle, inode, NULL, i_data + offsets[0], + i_data + offsets2[0]); + return 0; + } else if (n2 > n) { + /* + * Start and end are on different levels, so we're going to + * free the partial block at the start and the partial block + * at the end of the range. If there are levels in between, + * the do_indirects label will take care of them. + */ + + if (n == 1) { + /* + * Start is at the direct block level, free + * everything to the end of the level. + */ + ext4_free_data(handle, inode, NULL, i_data + offsets[0], + i_data + EXT4_NDIR_BLOCKS); + goto end_range; + } + + + partial = ext4_find_shared(inode, n, offsets, chain, &nr); + if (nr) { + if (partial == chain) { + /* Shared branch grows from the inode */ + ext4_free_branches(handle, inode, NULL, + &nr, &nr+1, (chain+n-1) - partial); + *partial->p = 0; } else { - first2 = 0; - count2 = count - (offset - first); + /* Shared branch grows from an indirect block */ + BUFFER_TRACE(partial->bh, "get_write_access"); + ext4_free_branches(handle, inode, partial->bh, + partial->p, + partial->p+1, (chain+n-1) - partial); } - ret = free_hole_blocks(handle, inode, bh, - (__le32 *)bh->b_data, level - 1, - first2, count2, - inode->i_sb->s_blocksize >> 2); - if (ret) { - brelse(bh); - goto err; + } + + /* + * Clear the ends of indirect blocks on the shared branch + * at the start of the range + */ + while (partial > chain) { + ext4_free_branches(handle, inode, partial->bh, + partial->p + 1, + (__le32 *)partial->bh->b_data+addr_per_block, + (chain+n-1) - partial); + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + partial--; + } + +end_range: + partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); + if (nr2) { + if (partial2 == chain2) { + /* + * Remember, end is exclusive so here we're at + * the start of the next level we're not going + * to free. Everything was covered by the start + * of the range. 
+ */ + return 0; + } else { + /* Shared branch grows from an indirect block */ + partial2--; } + } else { + /* + * ext4_find_shared returns Indirect structure which + * points to the last element which should not be + * removed by truncate. But this is end of the range + * in punch_hole so we need to point to the next element + */ + partial2->p++; } - if (level == 0 || - (bh && all_zeroes((__le32 *)bh->b_data, - (__le32 *)bh->b_data + addr_per_block))) { - ext4_free_data(handle, inode, parent_bh, - i_data, i_data + 1); + + /* + * Clear the ends of indirect blocks on the shared branch + * at the end of the range + */ + while (partial2 > chain2) { + ext4_free_branches(handle, inode, partial2->bh, + (__le32 *)partial2->bh->b_data, + partial2->p, + (chain2+n2-1) - partial2); + BUFFER_TRACE(partial2->bh, "call brelse"); + brelse(partial2->bh); + partial2--; } - brelse(bh); - bh = NULL; + goto do_indirects; } -err: - return ret; -} - -int ext4_free_hole_blocks(handle_t *handle, struct inode *inode, - ext4_lblk_t first, ext4_lblk_t stop) -{ - int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); - int level, ret = 0; - int num = EXT4_NDIR_BLOCKS; - ext4_lblk_t count, max = EXT4_NDIR_BLOCKS; - __le32 *i_data = EXT4_I(inode)->i_data; - - count = stop - first; - for (level = 0; level < 4; level++, max *= addr_per_block) { - if (first < max) { - ret = free_hole_blocks(handle, inode, NULL, i_data, - level, first, count, num); - if (ret) - goto err; - if (count > max - first) - count -= max - first; - else - break; - first = 0; - } else { - first -= max; + /* Punch happened within the same level (n == n2) */ + partial = ext4_find_shared(inode, n, offsets, chain, &nr); + partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); + /* + * ext4_find_shared returns Indirect structure which + * points to the last element which should not be + * removed by truncate. 
But this is end of the range + * in punch_hole so we need to point to the next element + */ + partial2->p++; + while ((partial > chain) || (partial2 > chain2)) { + /* We're at the same block, so we're almost finished */ + if ((partial->bh && partial2->bh) && + (partial->bh->b_blocknr == partial2->bh->b_blocknr)) { + if ((partial > chain) && (partial2 > chain2)) { + ext4_free_branches(handle, inode, partial->bh, + partial->p + 1, + partial2->p, + (chain+n-1) - partial); + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + BUFFER_TRACE(partial2->bh, "call brelse"); + brelse(partial2->bh); + } + return 0; } - i_data += num; - if (level == 0) { - num = 1; - max = 1; + /* + * Clear the ends of indirect blocks on the shared branch + * at the start of the range + */ + if (partial > chain) { + ext4_free_branches(handle, inode, partial->bh, + partial->p + 1, + (__le32 *)partial->bh->b_data+addr_per_block, + (chain+n-1) - partial); + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + partial--; + } + /* + * Clear the ends of indirect blocks on the shared branch + * at the end of the range + */ + if (partial2 > chain2) { + ext4_free_branches(handle, inode, partial2->bh, + (__le32 *)partial2->bh->b_data, + partial2->p, + (chain2+n-1) - partial2); + BUFFER_TRACE(partial2->bh, "call brelse"); + brelse(partial2->bh); + partial2--; } } -err: - return ret; +do_indirects: + /* Kill the remaining (whole) subtrees */ + switch (offsets[0]) { + default: + if (++n >= n2) + return 0; + nr = i_data[EXT4_IND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); + i_data[EXT4_IND_BLOCK] = 0; + } + case EXT4_IND_BLOCK: + if (++n >= n2) + return 0; + nr = i_data[EXT4_DIND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); + i_data[EXT4_DIND_BLOCK] = 0; + } + case EXT4_DIND_BLOCK: + if (++n >= n2) + return 0; + nr = i_data[EXT4_TIND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); + i_data[EXT4_TIND_BLOCK] = 0; + } + case EXT4_TIND_BLOCK: + ; + } + return 0; } - diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 645205d8ada6..bea662bd0ca6 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -120,12 +120,6 @@ int ext4_get_max_inline_size(struct inode *inode) return max_inline_size + EXT4_MIN_INLINE_DATA_SIZE; } -int ext4_has_inline_data(struct inode *inode) -{ - return ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) && - EXT4_I(inode)->i_inline_off; -} - /* * this function does not take xattr_sem, which is OK because it is * currently only used in a code path coming form ext4_iget, before @@ -1178,6 +1172,18 @@ static int ext4_convert_inline_data_nolock(handle_t *handle, if (error < 0) goto out; + /* + * Make sure the inline directory entries pass checks before we try to + * convert them, so that we avoid touching stuff that needs fsck. 
+ */ + if (S_ISDIR(inode->i_mode)) { + error = ext4_check_all_de(inode, iloc->bh, + buf + EXT4_INLINE_DOTDOT_SIZE, + inline_size - EXT4_INLINE_DOTDOT_SIZE); + if (error) + goto out; + } + error = ext4_destroy_inline_data_nolock(handle, inode); if (error) goto out; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8a064734e6eb..3aa26e9117c4 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -325,18 +325,6 @@ qsize_t *ext4_get_reserved_space(struct inode *inode) #endif /* - * Calculate the number of metadata blocks need to reserve - * to allocate a block located at @lblock - */ -static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) -{ - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - return ext4_ext_calc_metadata_amount(inode, lblock); - - return ext4_ind_calc_metadata_amount(inode, lblock); -} - -/* * Called with i_data_sem down, which is important since we can call * ext4_discard_preallocations() from here. */ @@ -357,35 +345,10 @@ void ext4_da_update_reserve_space(struct inode *inode, used = ei->i_reserved_data_blocks; } - if (unlikely(ei->i_allocated_meta_blocks > ei->i_reserved_meta_blocks)) { - ext4_warning(inode->i_sb, "ino %lu, allocated %d " - "with only %d reserved metadata blocks " - "(releasing %d blocks with reserved %d data blocks)", - inode->i_ino, ei->i_allocated_meta_blocks, - ei->i_reserved_meta_blocks, used, - ei->i_reserved_data_blocks); - WARN_ON(1); - ei->i_allocated_meta_blocks = ei->i_reserved_meta_blocks; - } - /* Update per-inode reservations */ ei->i_reserved_data_blocks -= used; - ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; - percpu_counter_sub(&sbi->s_dirtyclusters_counter, - used + ei->i_allocated_meta_blocks); - ei->i_allocated_meta_blocks = 0; + percpu_counter_sub(&sbi->s_dirtyclusters_counter, used); - if (ei->i_reserved_data_blocks == 0) { - /* - * We can release all of the reserved metadata blocks - * only when we have written all of the delayed - * allocation blocks. - */ - percpu_counter_sub(&sbi->s_dirtyclusters_counter, - ei->i_reserved_meta_blocks); - ei->i_reserved_meta_blocks = 0; - ei->i_da_metadata_calc_len = 0; - } spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); /* Update quota subsystem for data blocks */ @@ -1092,27 +1055,11 @@ static int ext4_write_end(struct file *file, } else copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); - /* - * No need to use i_size_read() here, the i_size - * cannot change under us because we hole i_mutex. - * - * But it's important to update i_size while still holding page lock: + * it's important to update i_size while still holding page lock: * page writeout could otherwise come in and zero beyond i_size. */ - if (pos + copied > inode->i_size) { - i_size_write(inode, pos + copied); - i_size_changed = 1; - } - - if (pos + copied > EXT4_I(inode)->i_disksize) { - /* We need to mark inode dirty even if - * new_i_size is less that inode->i_size - * but greater than i_disksize. 
(hint delalloc) - */ - ext4_update_i_disksize(inode, (pos + copied)); - i_size_changed = 1; - } + i_size_changed = ext4_update_inode_size(inode, pos + copied); unlock_page(page); page_cache_release(page); @@ -1160,7 +1107,7 @@ static int ext4_journalled_write_end(struct file *file, int ret = 0, ret2; int partial = 0; unsigned from, to; - loff_t new_i_size; + int size_changed = 0; trace_ext4_journalled_write_end(inode, pos, len, copied); from = pos & (PAGE_CACHE_SIZE - 1); @@ -1183,20 +1130,18 @@ static int ext4_journalled_write_end(struct file *file, if (!partial) SetPageUptodate(page); } - new_i_size = pos + copied; - if (new_i_size > inode->i_size) - i_size_write(inode, pos+copied); + size_changed = ext4_update_inode_size(inode, pos + copied); ext4_set_inode_state(inode, EXT4_STATE_JDATA); EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; - if (new_i_size > EXT4_I(inode)->i_disksize) { - ext4_update_i_disksize(inode, new_i_size); + unlock_page(page); + page_cache_release(page); + + if (size_changed) { ret2 = ext4_mark_inode_dirty(handle, inode); if (!ret) ret = ret2; } - unlock_page(page); - page_cache_release(page); if (pos + len > inode->i_size && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside @@ -1222,49 +1167,6 @@ static int ext4_journalled_write_end(struct file *file, } /* - * Reserve a metadata for a single block located at lblock - */ -static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock) -{ - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - struct ext4_inode_info *ei = EXT4_I(inode); - unsigned int md_needed; - ext4_lblk_t save_last_lblock; - int save_len; - - /* - * recalculate the amount of metadata blocks to reserve - * in order to allocate nrblocks - * worse case is one extent per block - */ - spin_lock(&ei->i_block_reservation_lock); - /* - * ext4_calc_metadata_amount() has side effects, which we have - * to be prepared undo if we fail to claim space. - */ - save_len = ei->i_da_metadata_calc_len; - save_last_lblock = ei->i_da_metadata_calc_last_lblock; - md_needed = EXT4_NUM_B2C(sbi, - ext4_calc_metadata_amount(inode, lblock)); - trace_ext4_da_reserve_space(inode, md_needed); - - /* - * We do still charge estimated metadata to the sb though; - * we cannot afford to run out of free blocks. - */ - if (ext4_claim_free_clusters(sbi, md_needed, 0)) { - ei->i_da_metadata_calc_len = save_len; - ei->i_da_metadata_calc_last_lblock = save_last_lblock; - spin_unlock(&ei->i_block_reservation_lock); - return -ENOSPC; - } - ei->i_reserved_meta_blocks += md_needed; - spin_unlock(&ei->i_block_reservation_lock); - - return 0; /* success */ -} - -/* * Reserve a single cluster located at lblock */ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) @@ -1273,8 +1175,6 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) struct ext4_inode_info *ei = EXT4_I(inode); unsigned int md_needed; int ret; - ext4_lblk_t save_last_lblock; - int save_len; /* * We will charge metadata quota at writeout time; this saves @@ -1295,25 +1195,15 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) * ext4_calc_metadata_amount() has side effects, which we have * to be prepared undo if we fail to claim space. 
*/ - save_len = ei->i_da_metadata_calc_len; - save_last_lblock = ei->i_da_metadata_calc_last_lblock; - md_needed = EXT4_NUM_B2C(sbi, - ext4_calc_metadata_amount(inode, lblock)); - trace_ext4_da_reserve_space(inode, md_needed); + md_needed = 0; + trace_ext4_da_reserve_space(inode, 0); - /* - * We do still charge estimated metadata to the sb though; - * we cannot afford to run out of free blocks. - */ - if (ext4_claim_free_clusters(sbi, md_needed + 1, 0)) { - ei->i_da_metadata_calc_len = save_len; - ei->i_da_metadata_calc_last_lblock = save_last_lblock; + if (ext4_claim_free_clusters(sbi, 1, 0)) { spin_unlock(&ei->i_block_reservation_lock); dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); return -ENOSPC; } ei->i_reserved_data_blocks++; - ei->i_reserved_meta_blocks += md_needed; spin_unlock(&ei->i_block_reservation_lock); return 0; /* success */ @@ -1346,20 +1236,6 @@ static void ext4_da_release_space(struct inode *inode, int to_free) } ei->i_reserved_data_blocks -= to_free; - if (ei->i_reserved_data_blocks == 0) { - /* - * We can release all of the reserved metadata blocks - * only when we have written all of the delayed - * allocation blocks. - * Note that in case of bigalloc, i_reserved_meta_blocks, - * i_reserved_data_blocks, etc. refer to number of clusters. - */ - percpu_counter_sub(&sbi->s_dirtyclusters_counter, - ei->i_reserved_meta_blocks); - ei->i_reserved_meta_blocks = 0; - ei->i_da_metadata_calc_len = 0; - } - /* update fs dirty data blocks counter */ percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free); @@ -1500,10 +1376,6 @@ static void ext4_print_free_blocks(struct inode *inode) ext4_msg(sb, KERN_CRIT, "Block reservation details"); ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u", ei->i_reserved_data_blocks); - ext4_msg(sb, KERN_CRIT, "i_reserved_meta_blocks=%u", - ei->i_reserved_meta_blocks); - ext4_msg(sb, KERN_CRIT, "i_allocated_meta_blocks=%u", - ei->i_allocated_meta_blocks); return; } @@ -1620,13 +1492,6 @@ add_delayed: retval = ret; goto out_unlock; } - } else { - ret = ext4_da_reserve_metadata(inode, iblock); - if (ret) { - /* not enough space to reserve */ - retval = ret; - goto out_unlock; - } } ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, @@ -2212,6 +2077,7 @@ static int mpage_map_and_submit_extent(handle_t *handle, struct ext4_map_blocks *map = &mpd->map; int err; loff_t disksize; + int progress = 0; mpd->io_submit.io_end->offset = ((loff_t)map->m_lblk) << inode->i_blkbits; @@ -2228,8 +2094,11 @@ static int mpage_map_and_submit_extent(handle_t *handle, * is non-zero, a commit should free up blocks. */ if ((err == -ENOMEM) || - (err == -ENOSPC && ext4_count_free_clusters(sb))) + (err == -ENOSPC && ext4_count_free_clusters(sb))) { + if (progress) + goto update_disksize; return err; + } ext4_msg(sb, KERN_CRIT, "Delayed block allocation failed for " "inode %lu at logical offset %llu with" @@ -2246,15 +2115,17 @@ static int mpage_map_and_submit_extent(handle_t *handle, *give_up_on_write = true; return err; } + progress = 1; /* * Update buffer state, submit mapped pages, and get us new * extent to map */ err = mpage_map_and_submit_buffers(mpd); if (err < 0) - return err; + goto update_disksize; } while (map->m_len); +update_disksize: /* * Update on-disk size after IO is submitted. Races with * truncate are avoided by checking i_size under i_data_sem. 
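The ext4_update_inode_size() helper added to ext4.h above centralizes the i_size/i_disksize bookkeeping that recurs throughout this series. It returns a small bitmask rather than a boolean: bit 0x1 is set when the in-core i_size grew and bit 0x2 when i_disksize grew, which is why ext4_alloc_file_blocks() above tests the 0x1 bit before touching i_mtime. A minimal sketch of the calling convention (simplified from the write_end paths, not verbatim kernel code; the caller holds i_mutex and the page lock):

    /* grow i_size and i_disksize together; nonzero means something changed */
    if (ext4_update_inode_size(inode, pos + copied))
            ext4_mark_inode_dirty(handle, inode);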
@@ -2843,8 +2714,7 @@ int ext4_alloc_da_blocks(struct inode *inode) { trace_ext4_alloc_da_blocks(inode); - if (!EXT4_I(inode)->i_reserved_data_blocks && - !EXT4_I(inode)->i_reserved_meta_blocks) + if (!EXT4_I(inode)->i_reserved_data_blocks) return 0; /* @@ -3624,7 +3494,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) ret = ext4_ext_remove_space(inode, first_block, stop_block - 1); else - ret = ext4_free_hole_blocks(handle, inode, first_block, + ret = ext4_ind_remove_space(handle, inode, first_block, stop_block); up_write(&EXT4_I(inode)->i_data_sem); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 2dcb936be90e..8b0f9ef517d6 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1412,6 +1412,8 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, int last = first + count - 1; struct super_block *sb = e4b->bd_sb; + if (WARN_ON(count == 0)) + return; BUG_ON(last >= (sb->s_blocksize << 3)); assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); /* Don't bother if the block group is corrupt. */ @@ -3075,8 +3077,9 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, (23 - bsbits)) << 23; size = 8 * 1024 * 1024; } else { - start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits; - size = ac->ac_o_ex.fe_len << bsbits; + start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits; + size = (loff_t) EXT4_C2B(EXT4_SB(ac->ac_sb), + ac->ac_o_ex.fe_len) << bsbits; } size = size >> bsbits; start = start_off >> bsbits; @@ -3216,8 +3219,30 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) { struct ext4_prealloc_space *pa = ac->ac_pa; + struct ext4_buddy e4b; + int err; - if (pa && pa->pa_type == MB_INODE_PA) + if (pa == NULL) { + if (ac->ac_f_ex.fe_len == 0) + return; + err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b); + if (err) { + /* + * This should never happen since we pin the + * pages in the ext4_allocation_context so + * ext4_mb_load_buddy() should never fail. 
+ */ + WARN(1, "mb_load_buddy failed (%d)", err); + return; + } + ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group); + mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start, + ac->ac_f_ex.fe_len); + ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group); + ext4_mb_unload_buddy(&e4b); + return; + } + if (pa->pa_type == MB_INODE_PA) pa->pa_free += ac->ac_b_ex.fe_len; } @@ -4627,7 +4652,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, struct buffer_head *gd_bh; ext4_group_t block_group; struct ext4_sb_info *sbi; - struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_buddy e4b; unsigned int count_clusters; int err = 0; @@ -4838,19 +4862,7 @@ do_more: &sbi->s_flex_groups[flex_group].free_clusters); } - if (flags & EXT4_FREE_BLOCKS_RESERVE && ei->i_reserved_data_blocks) { - percpu_counter_add(&sbi->s_dirtyclusters_counter, - count_clusters); - spin_lock(&ei->i_block_reservation_lock); - if (flags & EXT4_FREE_BLOCKS_METADATA) - ei->i_reserved_meta_blocks += count_clusters; - else - ei->i_reserved_data_blocks += count_clusters; - spin_unlock(&ei->i_block_reservation_lock); - if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) - dquot_reclaim_block(inode, - EXT4_C2B(sbi, count_clusters)); - } else if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) + if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters); diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index ec092437d3e0..d3567f27bae7 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -39,6 +39,8 @@ static int finish_range(handle_t *handle, struct inode *inode, newext.ee_block = cpu_to_le32(lb->first_block); newext.ee_len = cpu_to_le16(lb->last_block - lb->first_block + 1); ext4_ext_store_pblock(&newext, lb->first_pblock); + /* Locking only for convenience since we are operating on a temp inode */ + down_write(&EXT4_I(inode)->i_data_sem); path = ext4_ext_find_extent(inode, lb->first_block, NULL, 0); if (IS_ERR(path)) { @@ -61,7 +63,9 @@ static int finish_range(handle_t *handle, struct inode *inode, */ if (needed && ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS)) { + up_write((&EXT4_I(inode)->i_data_sem)); retval = ext4_journal_restart(handle, needed); + down_write((&EXT4_I(inode)->i_data_sem)); if (retval) goto err_out; } else if (needed) { @@ -70,13 +74,16 @@ static int finish_range(handle_t *handle, struct inode *inode, /* * IF not able to extend the journal restart the journal */ + up_write((&EXT4_I(inode)->i_data_sem)); retval = ext4_journal_restart(handle, needed); + down_write((&EXT4_I(inode)->i_data_sem)); if (retval) goto err_out; } } retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0); err_out: + up_write((&EXT4_I(inode)->i_data_sem)); if (path) { ext4_ext_drop_refs(path); kfree(path); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 2484c7ec6a72..671a74b14fd7 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -1013,10 +1013,11 @@ data_copy: *err = -EBUSY; goto unlock_pages; } - + ext4_double_down_write_data_sem(orig_inode, donor_inode); replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, orig_blk_offset, block_len_in_page, err); + ext4_double_up_write_data_sem(orig_inode, donor_inode); if (*err) { if (replaced_count) { block_len_in_page = replaced_count; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 3520ab8a6639..603e4ebbd0ac 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1227,7 +1227,7 @@ static struct 
buffer_head * ext4_find_entry (struct inode *dir, buffer */ int num = 0; ext4_lblk_t nblocks; - int i, err; + int i, err = 0; int namelen; *res_dir = NULL; @@ -1264,7 +1264,11 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, * return. Otherwise, fall back to doing a search the * old fashioned way. */ - if (bh || (err != ERR_BAD_DX_DIR)) + if (err == -ENOENT) + return NULL; + if (err && err != ERR_BAD_DX_DIR) + return ERR_PTR(err); + if (bh) return bh; dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " "falling back\n")); @@ -1295,6 +1299,11 @@ restart: } num++; bh = ext4_getblk(NULL, dir, b++, 0, &err); + if (unlikely(err)) { + if (ra_max == 0) + return ERR_PTR(err); + break; + } bh_use[ra_max] = bh; if (bh) ll_rw_block(READ | REQ_META | REQ_PRIO, @@ -1417,6 +1426,8 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi return ERR_PTR(-ENAMETOOLONG); bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); + if (IS_ERR(bh)) + return (struct dentry *) bh; inode = NULL; if (bh) { __u32 ino = le32_to_cpu(de->inode); @@ -1450,6 +1461,8 @@ struct dentry *ext4_get_parent(struct dentry *child) struct buffer_head *bh; bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL); + if (IS_ERR(bh)) + return (struct dentry *) bh; if (!bh) return ERR_PTR(-ENOENT); ino = le32_to_cpu(de->inode); @@ -2727,6 +2740,8 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) retval = -ENOENT; bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); + if (IS_ERR(bh)) + return PTR_ERR(bh); if (!bh) goto end_rmdir; @@ -2794,6 +2809,8 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) retval = -ENOENT; bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); + if (IS_ERR(bh)) + return PTR_ERR(bh); if (!bh) goto end_unlink; @@ -3121,6 +3138,8 @@ static int ext4_find_delete_entry(handle_t *handle, struct inode *dir, struct ext4_dir_entry_2 *de; bh = ext4_find_entry(dir, d_name, &de, NULL); + if (IS_ERR(bh)) + return PTR_ERR(bh); if (bh) { retval = ext4_delete_entry(handle, dir, de, bh); brelse(bh); @@ -3128,7 +3147,8 @@ static int ext4_find_delete_entry(handle_t *handle, struct inode *dir, return retval; } -static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent) +static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent, + int force_reread) { int retval; /* @@ -3140,7 +3160,8 @@ static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent) if (le32_to_cpu(ent->de->inode) != ent->inode->i_ino || ent->de->name_len != ent->dentry->d_name.len || strncmp(ent->de->name, ent->dentry->d_name.name, - ent->de->name_len)) { + ent->de->name_len) || + force_reread) { retval = ext4_find_delete_entry(handle, ent->dir, &ent->dentry->d_name); } else { @@ -3191,6 +3212,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, .dentry = new_dentry, .inode = new_dentry->d_inode, }; + int force_reread; int retval; dquot_initialize(old.dir); @@ -3202,6 +3224,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, dquot_initialize(new.inode); old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL); + if (IS_ERR(old.bh)) + return PTR_ERR(old.bh); /* * Check for inode number is _not_ due to possible IO errors. 
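With this series, ext4_find_entry() reports hard failures as an ERR_PTR() value instead of collapsing them into the same NULL that means "no such name" (a bad dx dir still falls back to the linear search, and -ENOENT still maps to NULL), so each caller above now distinguishes three outcomes. The resulting convention is roughly this (schematic sketch, not verbatim kernel code; the label is hypothetical):

    bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
    if (IS_ERR(bh))
            return PTR_ERR(bh);     /* hard error, e.g. buffer read failure */
    if (!bh)
            goto not_found;         /* name simply does not exist */
    /* success: 'de' points at the entry inside buffer 'bh' */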
* We might rmdir the source, keep it as pwd of some process @@ -3214,6 +3238,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, &new.de, &new.inlined); + if (IS_ERR(new.bh)) { + retval = PTR_ERR(new.bh); + new.bh = NULL; + goto end_rename; + } if (new.bh) { if (!new.inode) { brelse(new.bh); @@ -3246,6 +3275,15 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, if (retval) goto end_rename; } + /* + * If we're renaming a file within an inline_data dir and adding or + * setting the new dirent causes a conversion from inline_data to + * extents/blockmap, we need to force the dirent delete code to + * re-read the directory, or else we end up trying to delete a dirent + * from what is now the extent tree root (or a block map). + */ + force_reread = (new.dir->i_ino == old.dir->i_ino && + ext4_test_inode_flag(new.dir, EXT4_INODE_INLINE_DATA)); if (!new.bh) { retval = ext4_add_entry(handle, new.dentry, old.inode); if (retval) @@ -3256,6 +3294,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, if (retval) goto end_rename; } + if (force_reread) + force_reread = !ext4_test_inode_flag(new.dir, + EXT4_INODE_INLINE_DATA); /* * Like most other Unix systems, set the ctime for inodes on a @@ -3267,7 +3308,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, /* * ok, that's it */ - ext4_rename_delete(handle, &old); + ext4_rename_delete(handle, &old, force_reread); if (new.inode) { ext4_dec_count(handle, new.inode); @@ -3330,6 +3371,8 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, &old.inlined); + if (IS_ERR(old.bh)) + return PTR_ERR(old.bh); /* * Check for inode number is _not_ due to possible IO errors. 
* We might rmdir the source, keep it as pwd of some process @@ -3342,6 +3385,11 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, &new.de, &new.inlined); + if (IS_ERR(new.bh)) { + retval = PTR_ERR(new.bh); + new.bh = NULL; + goto end_rename; + } /* RENAME_EXCHANGE case: old *and* new must both exist */ if (!new.bh || le32_to_cpu(new.de->inode) != new.inode->i_ino) @@ -3455,7 +3503,6 @@ const struct inode_operations ext4_dir_inode_operations = { .rmdir = ext4_rmdir, .mknod = ext4_mknod, .tmpfile = ext4_tmpfile, - .rename = ext4_rename, .rename2 = ext4_rename2, .setattr = ext4_setattr, .setxattr = generic_setxattr, diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index bb0e80f03e2e..1e43b905ff98 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -575,6 +575,7 @@ handle_bb: bh = bclean(handle, sb, block); if (IS_ERR(bh)) { err = PTR_ERR(bh); + bh = NULL; goto out; } overhead = ext4_group_overhead_blocks(sb, group); @@ -603,6 +604,7 @@ handle_ib: bh = bclean(handle, sb, block); if (IS_ERR(bh)) { err = PTR_ERR(bh); + bh = NULL; goto out; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6df7bc611dbd..05c159218bc2 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2142,10 +2142,6 @@ static int ext4_check_descriptors(struct super_block *sb, } if (NULL != first_not_zeroed) *first_not_zeroed = grp; - - ext4_free_blocks_count_set(sbi->s_es, - EXT4_C2B(sbi, ext4_count_free_clusters(sb))); - sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb)); return 1; } @@ -3185,9 +3181,9 @@ static int set_journal_csum_feature_set(struct super_block *sb) if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { - /* journal checksum v2 */ + /* journal checksum v3 */ compat = 0; - incompat = JBD2_FEATURE_INCOMPAT_CSUM_V2; + incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3; } else { /* journal checksum v1 */ compat = JBD2_FEATURE_COMPAT_CHECKSUM; @@ -3209,6 +3205,7 @@ static int set_journal_csum_feature_set(struct super_block *sb) jbd2_journal_clear_features(sbi->s_journal, JBD2_FEATURE_COMPAT_CHECKSUM, 0, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | + JBD2_FEATURE_INCOMPAT_CSUM_V3 | JBD2_FEATURE_INCOMPAT_CSUM_V2); } @@ -3883,13 +3880,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); goto failed_mount2; } - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) - if (!ext4_fill_flex_info(sb)) { - ext4_msg(sb, KERN_ERR, - "unable to initialize " - "flex_bg meta info!"); - goto failed_mount2; - } sbi->s_gdb_count = db_count; get_random_bytes(&sbi->s_next_generation, sizeof(u32)); @@ -3902,22 +3892,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) /* Register extent status tree shrinker */ ext4_es_register_shrinker(sbi); - err = percpu_counter_init(&sbi->s_freeclusters_counter, - ext4_count_free_clusters(sb)); - if (!err) { - err = percpu_counter_init(&sbi->s_freeinodes_counter, - ext4_count_free_inodes(sb)); - } - if (!err) { - err = percpu_counter_init(&sbi->s_dirs_counter, - ext4_count_dirs(sb)); - } - if (!err) { - err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0); - } - if (!err) { - err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0); - } + err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0, GFP_KERNEL); if (err) { ext4_msg(sb, KERN_ERR, "insufficient memory"); goto failed_mount3; @@ -4022,18 +3997,6 @@ static int ext4_fill_super(struct super_block *sb, 
void *data, int silent) sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; - /* - * The journal may have updated the bg summary counts, so we - * need to update the global counters. - */ - percpu_counter_set(&sbi->s_freeclusters_counter, - ext4_count_free_clusters(sb)); - percpu_counter_set(&sbi->s_freeinodes_counter, - ext4_count_free_inodes(sb)); - percpu_counter_set(&sbi->s_dirs_counter, - ext4_count_dirs(sb)); - percpu_counter_set(&sbi->s_dirtyclusters_counter, 0); - no_journal: if (ext4_mballoc_ready) { sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id); @@ -4141,6 +4104,36 @@ no_journal: goto failed_mount5; } + block = ext4_count_free_clusters(sb); + ext4_free_blocks_count_set(sbi->s_es, + EXT4_C2B(sbi, block)); + err = percpu_counter_init(&sbi->s_freeclusters_counter, block, + GFP_KERNEL); + if (!err) { + unsigned long freei = ext4_count_free_inodes(sb); + sbi->s_es->s_free_inodes_count = cpu_to_le32(freei); + err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, + GFP_KERNEL); + } + if (!err) + err = percpu_counter_init(&sbi->s_dirs_counter, + ext4_count_dirs(sb), GFP_KERNEL); + if (!err) + err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, + GFP_KERNEL); + if (err) { + ext4_msg(sb, KERN_ERR, "insufficient memory"); + goto failed_mount6; + } + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) + if (!ext4_fill_flex_info(sb)) { + ext4_msg(sb, KERN_ERR, + "unable to initialize " + "flex_bg meta info!"); + goto failed_mount6; + } + err = ext4_register_li_request(sb, first_not_zeroed); if (err) goto failed_mount6; @@ -4215,6 +4208,12 @@ failed_mount7: ext4_unregister_li_request(sb); failed_mount6: ext4_mb_release(sb); + if (sbi->s_flex_groups) + ext4_kvfree(sbi->s_flex_groups); + percpu_counter_destroy(&sbi->s_freeclusters_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); + percpu_counter_destroy(&sbi->s_dirtyclusters_counter); failed_mount5: ext4_ext_release(sb); ext4_release_system_zone(sb); @@ -4233,12 +4232,6 @@ failed_mount_wq: failed_mount3: ext4_es_unregister_shrinker(sbi); del_timer_sync(&sbi->s_err_report); - if (sbi->s_flex_groups) - ext4_kvfree(sbi->s_flex_groups); - percpu_counter_destroy(&sbi->s_freeclusters_counter); - percpu_counter_destroy(&sbi->s_freeinodes_counter); - percpu_counter_destroy(&sbi->s_dirs_counter); - percpu_counter_destroy(&sbi->s_dirtyclusters_counter); percpu_counter_destroy(&sbi->s_extent_cache_cnt); if (sbi->s_mmp_tsk) kthread_stop(sbi->s_mmp_tsk); @@ -4556,11 +4549,13 @@ static int ext4_commit_super(struct super_block *sb, int sync) else es->s_kbytes_written = cpu_to_le64(EXT4_SB(sb)->s_kbytes_written); - ext4_free_blocks_count_set(es, + if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeclusters_counter)) + ext4_free_blocks_count_set(es, EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive( &EXT4_SB(sb)->s_freeclusters_counter))); - es->s_free_inodes_count = - cpu_to_le32(percpu_counter_sum_positive( + if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeinodes_counter)) + es->s_free_inodes_count = + cpu_to_le32(percpu_counter_sum_positive( &EXT4_SB(sb)->s_freeinodes_counter)); BUFFER_TRACE(sbh, "marking dirty"); ext4_superblock_csum_set(sb); diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 214fe1054fce..736a348509f7 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -23,7 +23,7 @@ config F2FS_STAT_FS mounted as f2fs. Each file shows the whole f2fs information. 
/sys/kernel/debug/f2fs/status includes: - - major file system information managed by f2fs currently + - major filesystem information managed by f2fs currently - average SIT information about whole segments - current memory footprint consumed by f2fs. @@ -68,6 +68,6 @@ config F2FS_CHECK_FS bool "F2FS consistency checking feature" depends on F2FS_FS help - Enables BUG_ONs which check the file system consistency in runtime. + Enables BUG_ONs which check the filesystem consistency in runtime. If you want to improve the performance, say N. diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index dbe2141d10ad..83b9b5a8d112 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -203,12 +203,6 @@ static int __f2fs_set_acl(struct inode *inode, int type, size_t size = 0; int error; - if (acl) { - error = posix_acl_valid(acl); - if (error < 0) - return error; - } - switch (type) { case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 0b4710c1d370..dd10a031c052 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -22,7 +22,7 @@ #include "segment.h" #include <trace/events/f2fs.h> -static struct kmem_cache *orphan_entry_slab; +static struct kmem_cache *ino_entry_slab; static struct kmem_cache *inode_entry_slab; /* @@ -72,7 +72,22 @@ out: return page; } -static inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type) +struct page *get_meta_page_ra(struct f2fs_sb_info *sbi, pgoff_t index) +{ + bool readahead = false; + struct page *page; + + page = find_get_page(META_MAPPING(sbi), index); + if (!page || (page && !PageUptodate(page))) + readahead = true; + f2fs_put_page(page, 0); + + if (readahead) + ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR); + return get_meta_page(sbi, index); +} + +static inline block_t get_max_meta_blks(struct f2fs_sb_info *sbi, int type) { switch (type) { case META_NAT: @@ -82,6 +97,8 @@ static inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type) case META_SSA: case META_CP: return 0; + case META_POR: + return MAX_BLKADDR(sbi); default: BUG(); } @@ -90,12 +107,12 @@ static inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type) /* * Readahead CP/NAT/SIT/SSA pages */ -int ra_meta_pages(struct f2fs_sb_info *sbi, int start, int nrpages, int type) +int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type) { block_t prev_blk_addr = 0; struct page *page; - int blkno = start; - int max_blks = get_max_meta_blks(sbi, type); + block_t blkno = start; + block_t max_blks = get_max_meta_blks(sbi, type); struct f2fs_io_info fio = { .type = META, @@ -125,7 +142,11 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, int start, int nrpages, int type) break; case META_SSA: case META_CP: - /* get ssa/cp block addr */ + case META_POR: + if (unlikely(blkno >= max_blks)) + goto out; + if (unlikely(blkno < SEG0_BLKADDR(sbi))) + goto out; blk_addr = blkno; break; default: @@ -151,8 +172,7 @@ out: static int f2fs_write_meta_page(struct page *page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_P_SB(page); trace_f2fs_writepage(page, META); @@ -160,14 +180,11 @@ static int f2fs_write_meta_page(struct page *page, goto redirty_out; if (wbc->for_reclaim) goto redirty_out; - - /* Should not write any meta pages, if any IO error was occurred */ - if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG))) - goto no_write; + if (unlikely(f2fs_cp_error(sbi))) + 
goto redirty_out; f2fs_wait_on_page_writeback(page, META); write_meta_page(sbi, page); -no_write: dec_page_count(sbi, F2FS_DIRTY_META); unlock_page(page); return 0; @@ -180,7 +197,7 @@ redirty_out: static int f2fs_write_meta_pages(struct address_space *mapping, struct writeback_control *wbc) { - struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); + struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); long diff, written; trace_f2fs_writepages(mapping->host, wbc, META); @@ -262,15 +279,12 @@ continue_unlock: static int f2fs_set_meta_page_dirty(struct page *page) { - struct address_space *mapping = page->mapping; - struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); - trace_f2fs_set_page_dirty(page, META); SetPageUptodate(page); if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); - inc_page_count(sbi, F2FS_DIRTY_META); + inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META); return 1; } return 0; @@ -282,78 +296,126 @@ const struct address_space_operations f2fs_meta_aops = { .set_page_dirty = f2fs_set_meta_page_dirty, }; +static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) +{ + struct ino_entry *e; +retry: + spin_lock(&sbi->ino_lock[type]); + + e = radix_tree_lookup(&sbi->ino_root[type], ino); + if (!e) { + e = kmem_cache_alloc(ino_entry_slab, GFP_ATOMIC); + if (!e) { + spin_unlock(&sbi->ino_lock[type]); + goto retry; + } + if (radix_tree_insert(&sbi->ino_root[type], ino, e)) { + spin_unlock(&sbi->ino_lock[type]); + kmem_cache_free(ino_entry_slab, e); + goto retry; + } + memset(e, 0, sizeof(struct ino_entry)); + e->ino = ino; + + list_add_tail(&e->list, &sbi->ino_list[type]); + } + spin_unlock(&sbi->ino_lock[type]); +} + +static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) +{ + struct ino_entry *e; + + spin_lock(&sbi->ino_lock[type]); + e = radix_tree_lookup(&sbi->ino_root[type], ino); + if (e) { + list_del(&e->list); + radix_tree_delete(&sbi->ino_root[type], ino); + if (type == ORPHAN_INO) + sbi->n_orphans--; + spin_unlock(&sbi->ino_lock[type]); + kmem_cache_free(ino_entry_slab, e); + return; + } + spin_unlock(&sbi->ino_lock[type]); +} + +void add_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type) +{ + /* add new dirty ino entry into list */ + __add_ino_entry(sbi, ino, type); +} + +void remove_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type) +{ + /* remove dirty ino entry from list */ + __remove_ino_entry(sbi, ino, type); +} + +/* mode should be APPEND_INO or UPDATE_INO */ +bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode) +{ + struct ino_entry *e; + spin_lock(&sbi->ino_lock[mode]); + e = radix_tree_lookup(&sbi->ino_root[mode], ino); + spin_unlock(&sbi->ino_lock[mode]); + return e ? 
true : false; +} + +void release_dirty_inode(struct f2fs_sb_info *sbi) +{ + struct ino_entry *e, *tmp; + int i; + + for (i = APPEND_INO; i <= UPDATE_INO; i++) { + spin_lock(&sbi->ino_lock[i]); + list_for_each_entry_safe(e, tmp, &sbi->ino_list[i], list) { + list_del(&e->list); + radix_tree_delete(&sbi->ino_root[i], e->ino); + kmem_cache_free(ino_entry_slab, e); + } + spin_unlock(&sbi->ino_lock[i]); + } +} + int acquire_orphan_inode(struct f2fs_sb_info *sbi) { int err = 0; - spin_lock(&sbi->orphan_inode_lock); + spin_lock(&sbi->ino_lock[ORPHAN_INO]); if (unlikely(sbi->n_orphans >= sbi->max_orphans)) err = -ENOSPC; else sbi->n_orphans++; - spin_unlock(&sbi->orphan_inode_lock); + spin_unlock(&sbi->ino_lock[ORPHAN_INO]); return err; } void release_orphan_inode(struct f2fs_sb_info *sbi) { - spin_lock(&sbi->orphan_inode_lock); - f2fs_bug_on(sbi->n_orphans == 0); + spin_lock(&sbi->ino_lock[ORPHAN_INO]); + f2fs_bug_on(sbi, sbi->n_orphans == 0); sbi->n_orphans--; - spin_unlock(&sbi->orphan_inode_lock); + spin_unlock(&sbi->ino_lock[ORPHAN_INO]); } void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { - struct list_head *head; - struct orphan_inode_entry *new, *orphan; - - new = f2fs_kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC); - new->ino = ino; - - spin_lock(&sbi->orphan_inode_lock); - head = &sbi->orphan_inode_list; - list_for_each_entry(orphan, head, list) { - if (orphan->ino == ino) { - spin_unlock(&sbi->orphan_inode_lock); - kmem_cache_free(orphan_entry_slab, new); - return; - } - - if (orphan->ino > ino) - break; - } - - /* add new orphan entry into list which is sorted by inode number */ - list_add_tail(&new->list, &orphan->list); - spin_unlock(&sbi->orphan_inode_lock); + /* add new orphan ino entry into list */ + __add_ino_entry(sbi, ino, ORPHAN_INO); } void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { - struct list_head *head; - struct orphan_inode_entry *orphan; - - spin_lock(&sbi->orphan_inode_lock); - head = &sbi->orphan_inode_list; - list_for_each_entry(orphan, head, list) { - if (orphan->ino == ino) { - list_del(&orphan->list); - f2fs_bug_on(sbi->n_orphans == 0); - sbi->n_orphans--; - spin_unlock(&sbi->orphan_inode_lock); - kmem_cache_free(orphan_entry_slab, orphan); - return; - } - } - spin_unlock(&sbi->orphan_inode_lock); + /* remove orphan entry from orphan list */ + __remove_ino_entry(sbi, ino, ORPHAN_INO); } static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { struct inode *inode = f2fs_iget(sbi->sb, ino); - f2fs_bug_on(IS_ERR(inode)); + f2fs_bug_on(sbi, IS_ERR(inode)); clear_nlink(inode); /* truncate all the data during iput */ @@ -398,23 +460,23 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) struct f2fs_orphan_block *orphan_blk = NULL; unsigned int nentries = 0; unsigned short index; - unsigned short orphan_blocks = (unsigned short)((sbi->n_orphans + - (F2FS_ORPHANS_PER_BLOCK - 1)) / F2FS_ORPHANS_PER_BLOCK); + unsigned short orphan_blocks = + (unsigned short)GET_ORPHAN_BLOCKS(sbi->n_orphans); struct page *page = NULL; - struct orphan_inode_entry *orphan = NULL; + struct ino_entry *orphan = NULL; for (index = 0; index < orphan_blocks; index++) grab_meta_page(sbi, start_blk + index); index = 1; - spin_lock(&sbi->orphan_inode_lock); - head = &sbi->orphan_inode_list; + spin_lock(&sbi->ino_lock[ORPHAN_INO]); + head = &sbi->ino_list[ORPHAN_INO]; /* loop for each orphan inode entry and write them in Jornal block */ list_for_each_entry(orphan, head, list) { if (!page) { page = find_get_page(META_MAPPING(sbi), 
start_blk++); - f2fs_bug_on(!page); + f2fs_bug_on(sbi, !page); orphan_blk = (struct f2fs_orphan_block *)page_address(page); memset(orphan_blk, 0, sizeof(*orphan_blk)); @@ -448,7 +510,7 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) f2fs_put_page(page, 1); } - spin_unlock(&sbi->orphan_inode_lock); + spin_unlock(&sbi->ino_lock[ORPHAN_INO]); } static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, @@ -574,7 +636,7 @@ fail_no_cp: static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); if (is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) return -EEXIST; @@ -586,32 +648,38 @@ static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new) return 0; } -void set_dirty_dir_page(struct inode *inode, struct page *page) +void update_dirty_page(struct inode *inode, struct page *page) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dir_inode_entry *new; int ret = 0; - if (!S_ISDIR(inode->i_mode)) + if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode)) return; + if (!S_ISDIR(inode->i_mode)) { + inode_inc_dirty_pages(inode); + goto out; + } + new = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); new->inode = inode; INIT_LIST_HEAD(&new->list); spin_lock(&sbi->dir_inode_lock); ret = __add_dirty_inode(inode, new); - inode_inc_dirty_dents(inode); - SetPagePrivate(page); + inode_inc_dirty_pages(inode); spin_unlock(&sbi->dir_inode_lock); if (ret) kmem_cache_free(inode_entry_slab, new); +out: + SetPagePrivate(page); } void add_dirty_dir_inode(struct inode *inode) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dir_inode_entry *new = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); int ret = 0; @@ -629,14 +697,14 @@ void add_dirty_dir_inode(struct inode *inode) void remove_dirty_dir_inode(struct inode *inode) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dir_inode_entry *entry; if (!S_ISDIR(inode->i_mode)) return; spin_lock(&sbi->dir_inode_lock); - if (get_dirty_dents(inode) || + if (get_dirty_pages(inode) || !is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) { spin_unlock(&sbi->dir_inode_lock); return; @@ -689,7 +757,7 @@ retry: /* * Freeze all the FS-operations for checkpoint. */ -static void block_operations(struct f2fs_sb_info *sbi) +static int block_operations(struct f2fs_sb_info *sbi) { struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, @@ -697,6 +765,7 @@ static void block_operations(struct f2fs_sb_info *sbi) .for_reclaim = 0, }; struct blk_plug plug; + int err = 0; blk_start_plug(&plug); @@ -706,27 +775,38 @@ retry_flush_dents: if (get_pages(sbi, F2FS_DIRTY_DENTS)) { f2fs_unlock_all(sbi); sync_dirty_dir_inodes(sbi); + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; + goto out; + } goto retry_flush_dents; } /* - * POR: we should ensure that there is no dirty node pages + * POR: we should ensure that there are no dirty node pages * until finishing nat/sit flush. 
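block_operations() now returns an error instead of retrying forever: each flush pass rechecks f2fs_cp_error(), so write_checkpoint() can abandon a checkpoint with -EIO once the filesystem has been marked broken. The node-flush retry that follows boils down to roughly this (condensed sketch; the f2fs_lock_all()/f2fs_unlock_all() handling around the dentry pass is elided):

    for (;;) {
            down_write(&sbi->node_write);
            if (!get_pages(sbi, F2FS_DIRTY_NODES))
                    break;                  /* clean: leave with node_write held */
            up_write(&sbi->node_write);
            sync_node_pages(sbi, 0, &wbc);
            if (unlikely(f2fs_cp_error(sbi)))
                    return -EIO;            /* the real code also unlocks the fs here */
    }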
*/ retry_flush_nodes: - mutex_lock(&sbi->node_write); + down_write(&sbi->node_write); if (get_pages(sbi, F2FS_DIRTY_NODES)) { - mutex_unlock(&sbi->node_write); + up_write(&sbi->node_write); sync_node_pages(sbi, 0, &wbc); + if (unlikely(f2fs_cp_error(sbi))) { + f2fs_unlock_all(sbi); + err = -EIO; + goto out; + } goto retry_flush_nodes; } +out: blk_finish_plug(&plug); + return err; } static void unblock_operations(struct f2fs_sb_info *sbi) { - mutex_unlock(&sbi->node_write); + up_write(&sbi->node_write); f2fs_unlock_all(sbi); } @@ -745,10 +825,12 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) finish_wait(&sbi->cp_wait, &wait); } -static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) +static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); - nid_t last_nid = 0; + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); + struct f2fs_nm_info *nm_i = NM_I(sbi); + nid_t last_nid = nm_i->next_scan_nid; block_t start_blk; struct page *cp_page; unsigned int data_sum_blocks, orphan_blocks; @@ -761,11 +843,14 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) * This avoids to conduct wrong roll-forward operations and uses * metapages, so should be called prior to sync_meta_pages below. */ - discard_next_dnode(sbi); + discard_next_dnode(sbi, NEXT_FREE_BLKADDR(sbi, curseg)); /* Flush all the NAT/SIT pages */ - while (get_pages(sbi, F2FS_DIRTY_META)) + while (get_pages(sbi, F2FS_DIRTY_META)) { sync_meta_pages(sbi, META, LONG_MAX); + if (unlikely(f2fs_cp_error(sbi))) + return; + } next_free_nid(sbi, &last_nid); @@ -776,7 +861,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi)); ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi)); ckpt->free_segment_count = cpu_to_le32(free_segments(sbi)); - for (i = 0; i < 3; i++) { + for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { ckpt->cur_node_segno[i] = cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_NODE)); ckpt->cur_node_blkoff[i] = @@ -784,7 +869,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) ckpt->alloc_type[i + CURSEG_HOT_NODE] = curseg_alloc_type(sbi, i + CURSEG_HOT_NODE); } - for (i = 0; i < 3; i++) { + for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) { ckpt->cur_data_segno[i] = cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_DATA)); ckpt->cur_data_blkoff[i] = @@ -799,24 +884,23 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) /* 2 cp + n data seg summary + orphan inode blocks */ data_sum_blocks = npages_for_summary_flush(sbi); - if (data_sum_blocks < 3) + if (data_sum_blocks < NR_CURSEG_DATA_TYPE) set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); else clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); - orphan_blocks = (sbi->n_orphans + F2FS_ORPHANS_PER_BLOCK - 1) - / F2FS_ORPHANS_PER_BLOCK; + orphan_blocks = GET_ORPHAN_BLOCKS(sbi->n_orphans); ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks + orphan_blocks); - if (is_umount) { + if (cpc->reason == CP_UMOUNT) { set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); - ckpt->cp_pack_total_block_count = cpu_to_le32(2 + + ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS+ cp_payload_blks + data_sum_blocks + orphan_blocks + NR_CURSEG_NODE_TYPE); } else { clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); - ckpt->cp_pack_total_block_count = cpu_to_le32(2 + + ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS + cp_payload_blks + data_sum_blocks + orphan_blocks); } @@ -826,6 +910,9 @@ static 
void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) else clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); + if (sbi->need_fsck) + set_ckpt_flags(ckpt, CP_FSCK_FLAG); + /* update SIT/NAT bitmap */ get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP)); get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP)); @@ -860,7 +947,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) write_data_summaries(sbi, start_blk); start_blk += data_sum_blocks; - if (is_umount) { + if (cpc->reason == CP_UMOUNT) { write_node_summaries(sbi, start_blk); start_blk += NR_CURSEG_NODE_TYPE; } @@ -875,6 +962,9 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) /* wait for previous submitted node/meta pages writeback */ wait_on_all_pages_writeback(sbi); + if (unlikely(f2fs_cp_error(sbi))) + return; + filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LONG_MAX); filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX); @@ -885,26 +975,35 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) /* Here, we only have one bio having CP pack */ sync_meta_pages(sbi, META_FLUSH, LONG_MAX); - if (unlikely(!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG))) { - clear_prefree_segments(sbi); - F2FS_RESET_SB_DIRT(sbi); - } + release_dirty_inode(sbi); + + if (unlikely(f2fs_cp_error(sbi))) + return; + + clear_prefree_segments(sbi); + F2FS_RESET_SB_DIRT(sbi); } /* - * We guarantee that this checkpoint procedure should not fail. + * We guarantee that this checkpoint procedure will not fail. */ -void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) +void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned long long ckpt_ver; - trace_f2fs_write_checkpoint(sbi->sb, is_umount, "start block_ops"); + trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops"); mutex_lock(&sbi->cp_mutex); - block_operations(sbi); - trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish block_ops"); + if (!sbi->s_dirty && cpc->reason != CP_DISCARD) + goto out; + if (unlikely(f2fs_cp_error(sbi))) + goto out; + if (block_operations(sbi)) + goto out; + + trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops"); f2fs_submit_merged_bio(sbi, DATA, WRITE); f2fs_submit_merged_bio(sbi, NODE, WRITE); @@ -920,43 +1019,49 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) /* write cached NAT/SIT entries to NAT/SIT area */ flush_nat_entries(sbi); - flush_sit_entries(sbi); + flush_sit_entries(sbi, cpc); /* unlock all the fs_lock[] in do_checkpoint() */ - do_checkpoint(sbi, is_umount); + do_checkpoint(sbi, cpc); unblock_operations(sbi); - mutex_unlock(&sbi->cp_mutex); - stat_inc_cp_count(sbi->stat_info); - trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish checkpoint"); +out: + mutex_unlock(&sbi->cp_mutex); + trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint"); } -void init_orphan_info(struct f2fs_sb_info *sbi) +void init_ino_entry_info(struct f2fs_sb_info *sbi) { - spin_lock_init(&sbi->orphan_inode_lock); - INIT_LIST_HEAD(&sbi->orphan_inode_list); - sbi->n_orphans = 0; + int i; + + for (i = 0; i < MAX_INO_ENTRY; i++) { + INIT_RADIX_TREE(&sbi->ino_root[i], GFP_ATOMIC); + spin_lock_init(&sbi->ino_lock[i]); + INIT_LIST_HEAD(&sbi->ino_list[i]); + } + /* * considering 512 blocks in a segment 8 blocks are needed for cp * and log segment summaries. 
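(Worked out with the numbers in this comment, assuming the constants in this series: 512 blocks per segment minus F2FS_CP_PACKS (2) minus NR_CURSEG_TYPE (6) leaves 504 blocks for orphan entries, and at F2FS_ORPHANS_PER_BLOCK == 1020 entries per block that caps max_orphans at 504 * 1020 = 514,080, the "1020*504" figure below.)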
Remaining blocks are used to keep * orphan entries with the limitation one reserved segment * for cp pack we can have max 1020*504 orphan entries */ - sbi->max_orphans = (sbi->blocks_per_seg - 2 - NR_CURSEG_TYPE) - * F2FS_ORPHANS_PER_BLOCK; + sbi->n_orphans = 0; + sbi->max_orphans = (sbi->blocks_per_seg - F2FS_CP_PACKS - + NR_CURSEG_TYPE) * F2FS_ORPHANS_PER_BLOCK; } int __init create_checkpoint_caches(void) { - orphan_entry_slab = f2fs_kmem_cache_create("f2fs_orphan_entry", - sizeof(struct orphan_inode_entry)); - if (!orphan_entry_slab) + ino_entry_slab = f2fs_kmem_cache_create("f2fs_ino_entry", + sizeof(struct ino_entry)); + if (!ino_entry_slab) return -ENOMEM; inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry", sizeof(struct dir_inode_entry)); if (!inode_entry_slab) { - kmem_cache_destroy(orphan_entry_slab); + kmem_cache_destroy(ino_entry_slab); return -ENOMEM; } return 0; @@ -964,6 +1069,6 @@ int __init create_checkpoint_caches(void) void destroy_checkpoint_caches(void) { - kmem_cache_destroy(orphan_entry_slab); + kmem_cache_destroy(ino_entry_slab); kmem_cache_destroy(inode_entry_slab); } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f8cf619edb5f..8e58c4cc2cb9 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -53,7 +53,7 @@ static void f2fs_write_end_io(struct bio *bio, int err) struct page *page = bvec->bv_page; if (unlikely(err)) { - SetPageError(page); + set_page_dirty(page); set_bit(AS_EIO, &page->mapping->flags); f2fs_stop_checkpoint(sbi); } @@ -85,7 +85,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, bio = bio_alloc(GFP_NOIO, npages); bio->bi_bdev = sbi->sb->s_bdev; - bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); + bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr); bio->bi_end_io = is_read ? 
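Stepping back to the do_checkpoint() hunks above: cp_pack_total_block_count is just a sum of small pieces, and only CP_UMOUNT appends the node summaries. A sketch with assumed values (no extra cp payload, one orphan block, three data summary blocks, i.e. the non-compact case); the names mirror the diff but the numbers are illustrative only:

#include <stdio.h>

int main(void)
{
	unsigned int cp_packs = 2;		/* F2FS_CP_PACKS */
	unsigned int cp_payload_blks = 0;	/* assumed: no sit/nat payload */
	unsigned int orphan_blocks = 1;		/* assumed: a few orphans */
	unsigned int data_sum_blocks = 3;	/* NR_CURSEG_DATA_TYPE, non-compact */
	unsigned int node_sums = 3;		/* NR_CURSEG_NODE_TYPE */

	unsigned int regular = cp_packs + cp_payload_blks +
			       data_sum_blocks + orphan_blocks;
	printf("regular checkpoint: %u blocks\n", regular);		/* 6 */
	printf("umount checkpoint:  %u blocks\n", regular + node_sums);	/* 9 */
	return 0;
}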
f2fs_read_end_io : f2fs_write_end_io; bio->bi_private = sbi; @@ -139,7 +139,10 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, /* change META to META_FLUSH in the checkpoint procedure */ if (type >= META_FLUSH) { io->fio.type = META_FLUSH; - io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO; + if (test_opt(sbi, NOBARRIER)) + io->fio.rw = WRITE_FLUSH | REQ_META | REQ_PRIO; + else + io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO; } __submit_merged_bio(io); up_write(&io->io_rwsem); @@ -190,7 +193,7 @@ void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page, __submit_merged_bio(io); alloc_new: if (io->bio == NULL) { - int bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); + int bio_blocks = MAX_BIO_BLOCKS(sbi); io->bio = __bio_alloc(sbi, blk_addr, bio_blocks, is_read); io->fio = *fio; @@ -233,7 +236,7 @@ static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr) int reserve_new_block(struct dnode_of_data *dn) { - struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) return -EPERM; @@ -255,7 +258,7 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) int err; /* if inode_page exists, index should be zero */ - f2fs_bug_on(!need_put && index); + f2fs_bug_on(F2FS_I_SB(dn->inode), !need_put && index); err = get_dnode_of_data(dn, index, ALLOC_NODE); if (err) @@ -318,7 +321,7 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn) block_t start_blkaddr, end_blkaddr; int need_update = true; - f2fs_bug_on(blk_addr == NEW_ADDR); + f2fs_bug_on(F2FS_I_SB(dn->inode), blk_addr == NEW_ADDR); fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + dn->ofs_in_node; @@ -393,7 +396,6 @@ end_update: struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; struct page *page; @@ -426,7 +428,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) return page; } - err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, + err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, dn.data_blkaddr, sync ? 
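Back up to the __allocate_data_block() hunk above: the new i_size update rounds EOF up to the end of the page that just received a block, which direct I/O relies on when it allocates past the old end of file. Worked numbers, assuming 4 KiB pages (PAGE_CACHE_SHIFT == 12):

#include <stdio.h>

int main(void)
{
	unsigned long page_shift = 12;	/* assumed 4 KiB page size */
	unsigned long fofs = 5;		/* file page index of the new block */
	unsigned long i_size = 8192;	/* old EOF: two pages */

	unsigned long new_end = (fofs + 1) << page_shift;
	if (i_size < new_end)		/* the i_size_write() in the hunk above */
		i_size = new_end;
	printf("i_size = %lu\n", i_size);	/* 24576: now covers page 5 */
	return 0;
}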
READ_SYNC : READA); if (err) return ERR_PTR(err); @@ -448,7 +450,6 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) */ struct page *get_lock_data_page(struct inode *inode, pgoff_t index) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; struct page *page; @@ -487,7 +488,8 @@ repeat: return page; } - err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, READ_SYNC); + err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, + dn.data_blkaddr, READ_SYNC); if (err) return ERR_PTR(err); @@ -514,7 +516,6 @@ repeat: struct page *get_new_data_page(struct inode *inode, struct page *ipage, pgoff_t index, bool new_i_size) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct address_space *mapping = inode->i_mapping; struct page *page; struct dnode_of_data dn; @@ -538,8 +539,8 @@ repeat: zero_user_segment(page, 0, PAGE_CACHE_SIZE); SetPageUptodate(page); } else { - err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, - READ_SYNC); + err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, + dn.data_blkaddr, READ_SYNC); if (err) goto put_err; @@ -570,10 +571,12 @@ put_err: static int __allocate_data_block(struct dnode_of_data *dn) { - struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + struct f2fs_inode_info *fi = F2FS_I(dn->inode); struct f2fs_summary sum; block_t new_blkaddr; struct node_info ni; + pgoff_t fofs; int type; if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) @@ -596,6 +599,12 @@ static int __allocate_data_block(struct dnode_of_data *dn) update_extent_cache(new_blkaddr, dn); clear_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT); + /* update i_size */ + fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + + dn->ofs_in_node; + if (i_size_read(dn->inode) < ((fofs + 1) << PAGE_CACHE_SHIFT)) + i_size_write(dn->inode, ((fofs + 1) << PAGE_CACHE_SHIFT)); + dn->data_blkaddr = new_blkaddr; return 0; } @@ -611,7 +620,6 @@ static int __allocate_data_block(struct dnode_of_data *dn) static int __get_data_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create, bool fiemap) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); unsigned int blkbits = inode->i_sb->s_blocksize_bits; unsigned maxblocks = bh_result->b_size >> blkbits; struct dnode_of_data dn; @@ -626,8 +634,10 @@ static int __get_data_block(struct inode *inode, sector_t iblock, if (check_extent_cache(inode, pgofs, bh_result)) goto out; - if (create) - f2fs_lock_op(sbi); + if (create) { + f2fs_balance_fs(F2FS_I_SB(inode)); + f2fs_lock_op(F2FS_I_SB(inode)); + } /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); @@ -686,7 +696,7 @@ get_next: allocated = true; blkaddr = dn.data_blkaddr; } - /* Give more consecutive addresses for the read ahead */ + /* Give more consecutive addresses for the readahead */ if (blkaddr == (bh_result->b_blocknr + ofs)) { ofs++; dn.ofs_in_node++; @@ -702,7 +712,7 @@ put_out: f2fs_put_dnode(&dn); unlock_out: if (create) - f2fs_unlock_op(sbi); + f2fs_unlock_op(F2FS_I_SB(inode)); out: trace_f2fs_get_data_block(inode, iblock, bh_result, err); return err; @@ -734,7 +744,7 @@ static int f2fs_read_data_page(struct file *file, struct page *page) trace_f2fs_readpage(page, DATA); - /* If the file has inline data, try to read it directlly */ + /* If the file has inline data, try to read it directly */ if (f2fs_has_inline_data(inode)) ret = f2fs_read_inline_data(inode, page); else @@ 
-784,9 +794,11 @@ int do_write_data_page(struct page *page, struct f2fs_io_info *fio) !is_cold_data(page) && need_inplace_update(inode))) { rewrite_data_page(page, old_blkaddr, fio); + set_inode_flag(F2FS_I(inode), FI_UPDATE_WRITE); } else { write_data_page(page, &dn, &new_blkaddr, fio); update_extent_cache(new_blkaddr, &dn); + set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE); } out_writepage: f2fs_put_dnode(&dn); @@ -797,7 +809,7 @@ static int f2fs_write_data_page(struct page *page, struct writeback_control *wbc) { struct inode *inode = page->mapping->host; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); loff_t i_size = i_size_read(inode); const pgoff_t end_index = ((unsigned long long) i_size) >> PAGE_CACHE_SHIFT; @@ -829,10 +841,19 @@ write: /* Dentry blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode)) { + if (unlikely(f2fs_cp_error(sbi))) + goto redirty_out; err = do_write_data_page(page, &fio); goto done; } + /* we should bypass data pages so that the kworker jobs can proceed */ + if (unlikely(f2fs_cp_error(sbi))) { + SetPageError(page); + unlock_page(page); + goto out; + } + if (!wbc->for_reclaim) need_balance_fs = true; else if (has_not_enough_free_secs(sbi, 0)) @@ -850,7 +871,7 @@ done: clear_cold_data(page); out: - inode_dec_dirty_dents(inode); + inode_dec_dirty_pages(inode); unlock_page(page); if (need_balance_fs) f2fs_balance_fs(sbi); @@ -876,7 +897,7 @@ static int f2fs_write_data_pages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); bool locked = false; int ret; long diff; @@ -888,7 +909,7 @@ static int f2fs_write_data_pages(struct address_space *mapping, return 0; if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE && - get_dirty_dents(inode) < nr_pages_to_skip(sbi, DATA) && + get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) && available_free_memory(sbi, DIRTY_DENTS)) goto skip_write; @@ -910,16 +931,26 @@ static int f2fs_write_data_pages(struct address_space *mapping, return ret; skip_write: - wbc->pages_skipped += get_dirty_dents(inode); + wbc->pages_skipped += get_dirty_pages(inode); return 0; } +static void f2fs_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > inode->i_size) { + truncate_pagecache(inode, inode->i_size); + truncate_blocks(inode, inode->i_size, true); + } +} + static int f2fs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { struct inode *inode = mapping->host; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *page; pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT; struct dnode_of_data dn; @@ -929,13 +960,15 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, f2fs_balance_fs(sbi); repeat: - err = f2fs_convert_inline_data(inode, pos + len); + err = f2fs_convert_inline_data(inode, pos + len, NULL); if (err) - return err; + goto fail; page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) - return -ENOMEM; + if (!page) { + err = -ENOMEM; + goto fail; + } /* to avoid latency during memory pressure */ unlock_page(page); @@ -949,10 +982,9 @@ repeat: set_new_dnode(&dn, inode, NULL, NULL, 0); err = f2fs_reserve_block(&dn, index); f2fs_unlock_op(sbi); - if (err) { f2fs_put_page(page, 
0); - return err; + goto fail; } inline_data: lock_page(page); @@ -982,19 +1014,20 @@ inline_data: err = f2fs_read_inline_data(inode, page); if (err) { page_cache_release(page); - return err; + goto fail; } } else { err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, READ_SYNC); if (err) - return err; + goto fail; } lock_page(page); if (unlikely(!PageUptodate(page))) { f2fs_put_page(page, 1); - return -EIO; + err = -EIO; + goto fail; } if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); @@ -1005,6 +1038,9 @@ out: SetPageUptodate(page); clear_cold_data(page); return 0; +fail: + f2fs_write_failed(mapping, pos + len); + return err; } static int f2fs_write_end(struct file *file, @@ -1016,8 +1052,10 @@ static int f2fs_write_end(struct file *file, trace_f2fs_write_end(inode, pos, len, copied); - SetPageUptodate(page); - set_page_dirty(page); + if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode)) + register_inmem_page(inode, page); + else + set_page_dirty(page); if (pos + copied > i_size_read(inode)) { i_size_write(inode, pos + copied); @@ -1050,7 +1088,10 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + size_t count = iov_iter_count(iter); + int err; /* Let buffer I/O handle the inline data case. */ if (f2fs_has_inline_data(inode)) @@ -1059,19 +1100,27 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, if (check_direct_IO(inode, rw, iter, offset)) return 0; - /* clear fsync mark to recover these blocks */ - fsync_mark_clear(F2FS_SB(inode->i_sb), inode->i_ino); + trace_f2fs_direct_IO_enter(inode, offset, count, rw); + + err = blockdev_direct_IO(rw, iocb, inode, iter, offset, get_data_block); + if (err < 0 && (rw & WRITE)) + f2fs_write_failed(mapping, offset + count); + + trace_f2fs_direct_IO_exit(inode, offset, count, rw, err); - return blockdev_direct_IO(rw, iocb, inode, iter, offset, - get_data_block); + return err; } static void f2fs_invalidate_data_page(struct page *page, unsigned int offset, unsigned int length) { struct inode *inode = page->mapping->host; + + if (offset % PAGE_CACHE_SIZE || length != PAGE_CACHE_SIZE) + return; + if (PageDirty(page)) - inode_dec_dirty_dents(inode); + inode_dec_dirty_pages(inode); ClearPagePrivate(page); } @@ -1093,7 +1142,7 @@ static int f2fs_set_data_page_dirty(struct page *page) if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); - set_dirty_dir_page(inode, page); + update_dirty_page(inode, page); return 1; } return 0; diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index b52c12cf5873..0a91ab813a9e 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -32,7 +32,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) struct f2fs_stat_info *si = F2FS_STAT(sbi); int i; - /* valid check of the segment numbers */ + /* validation check of the segment numbers */ si->hit_ext = sbi->read_hit_ext; si->total_ext = sbi->total_hit_ext; si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES); @@ -93,7 +93,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi) total_vblocks = 0; blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg); hblks_per_sec = blks_per_sec / 2; - for (segno = 0; segno < TOTAL_SEGS(sbi); segno += sbi->segs_per_sec) { + for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); dist = abs(vblocks - 
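Returning to the data.c hunks above: f2fs_write_failed() is the new common error path for write_begin and direct I/O, rolling both the page cache and the block mappings back to i_size so a failed extending write leaves no half-allocated tail. A userspace analog of the same rollback idea, with ftruncate() standing in for the truncate_pagecache()/truncate_blocks() pair (the file name is a placeholder):

#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

static int write_with_rollback(int fd, const void *buf, size_t len, off_t pos)
{
	struct stat st;

	if (fstat(fd, &st))
		return -1;
	if (pwrite(fd, buf, len, pos) == (ssize_t)len)
		return 0;
	/* the write failed: if it reached past the old EOF, shrink the
	 * file back so no speculative tail is left behind */
	if (pos + (off_t)len > st.st_size)
		ftruncate(fd, st.st_size);
	return -1;
}

int main(void)
{
	int fd = open("demo.bin", O_RDWR | O_CREAT, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	write_with_rollback(fd, "data", 4, 1 << 20);
	close(fd);
	return 0;
}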
hblks_per_sec); bimodal += dist * dist; @@ -103,7 +103,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi) ndirty++; } } - dist = TOTAL_SECS(sbi) * hblks_per_sec * hblks_per_sec / 100; + dist = MAIN_SECS(sbi) * hblks_per_sec * hblks_per_sec / 100; si->bimodal = bimodal / dist; if (si->dirty_count) si->avg_vblocks = total_vblocks / ndirty; @@ -131,17 +131,17 @@ static void update_mem_info(struct f2fs_sb_info *sbi) /* build sit */ si->base_mem += sizeof(struct sit_info); - si->base_mem += TOTAL_SEGS(sbi) * sizeof(struct seg_entry); - si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi)); - si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * TOTAL_SEGS(sbi); + si->base_mem += MAIN_SEGS(sbi) * sizeof(struct seg_entry); + si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi)); + si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); if (sbi->segs_per_sec > 1) - si->base_mem += TOTAL_SECS(sbi) * sizeof(struct sec_entry); + si->base_mem += MAIN_SECS(sbi) * sizeof(struct sec_entry); si->base_mem += __bitmap_size(sbi, SIT_BITMAP); /* build free segmap */ si->base_mem += sizeof(struct free_segmap_info); - si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi)); - si->base_mem += f2fs_bitmap_size(TOTAL_SECS(sbi)); + si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi)); + si->base_mem += f2fs_bitmap_size(MAIN_SECS(sbi)); /* build curseg */ si->base_mem += sizeof(struct curseg_info) * NR_CURSEG_TYPE; @@ -149,10 +149,10 @@ static void update_mem_info(struct f2fs_sb_info *sbi) /* build dirty segmap */ si->base_mem += sizeof(struct dirty_seglist_info); - si->base_mem += NR_DIRTY_TYPE * f2fs_bitmap_size(TOTAL_SEGS(sbi)); - si->base_mem += f2fs_bitmap_size(TOTAL_SECS(sbi)); + si->base_mem += NR_DIRTY_TYPE * f2fs_bitmap_size(MAIN_SEGS(sbi)); + si->base_mem += f2fs_bitmap_size(MAIN_SECS(sbi)); - /* buld nm */ + /* build nm */ si->base_mem += sizeof(struct f2fs_nm_info); si->base_mem += __bitmap_size(sbi, NAT_BITMAP); @@ -167,7 +167,7 @@ get_cache: si->cache_mem += npages << PAGE_CACHE_SHIFT; npages = META_MAPPING(sbi)->nrpages; si->cache_mem += npages << PAGE_CACHE_SHIFT; - si->cache_mem += sbi->n_orphans * sizeof(struct orphan_inode_entry); + si->cache_mem += sbi->n_orphans * sizeof(struct ino_entry); si->cache_mem += sbi->n_dirty_dirs * sizeof(struct dir_inode_entry); } @@ -345,21 +345,14 @@ void __init f2fs_create_root_stats(void) f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL); if (!f2fs_debugfs_root) - goto bail; + return; file = debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, NULL, &stat_fops); - if (!file) - goto free_debugfs_dir; - - return; - -free_debugfs_dir: - debugfs_remove(f2fs_debugfs_root); - -bail: - f2fs_debugfs_root = NULL; - return; + if (!file) { + debugfs_remove(f2fs_debugfs_root); + f2fs_debugfs_root = NULL; + } } void f2fs_destroy_root_stats(void) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index a4addd72ebbd..b54f87149c09 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -77,8 +77,8 @@ static unsigned long dir_block_index(unsigned int level, return bidx; } -static bool early_match_name(const char *name, size_t namelen, - f2fs_hash_t namehash, struct f2fs_dir_entry *de) +static bool early_match_name(size_t namelen, f2fs_hash_t namehash, + struct f2fs_dir_entry *de) { if (le16_to_cpu(de->name_len) != namelen) return false; @@ -90,7 +90,7 @@ static bool early_match_name(const char *name, size_t namelen, } static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, - const char *name, size_t namelen, int *max_slots, + struct qstr *name, int *max_slots, f2fs_hash_t namehash, struct 
page **res_page) { struct f2fs_dir_entry *de; @@ -109,9 +109,10 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, continue; } de = &dentry_blk->dentry[bit_pos]; - if (early_match_name(name, namelen, namehash, de)) { + if (early_match_name(name->len, namehash, de)) { if (!memcmp(dentry_blk->filename[bit_pos], - name, namelen)) { + name->name, + name->len)) { *res_page = dentry_page; goto found; } @@ -120,6 +121,13 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, *max_slots = max_len; max_len = 0; } + + /* + * For the most part, it should be a bug when name_len is zero. + * We stop here for figuring out where the bugs has occurred. + */ + f2fs_bug_on(F2FS_P_SB(dentry_page), !de->name_len); + bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); } @@ -132,10 +140,10 @@ found: } static struct f2fs_dir_entry *find_in_level(struct inode *dir, - unsigned int level, const char *name, size_t namelen, + unsigned int level, struct qstr *name, f2fs_hash_t namehash, struct page **res_page) { - int s = GET_DENTRY_SLOTS(namelen); + int s = GET_DENTRY_SLOTS(name->len); unsigned int nbucket, nblock; unsigned int bidx, end_block; struct page *dentry_page; @@ -143,7 +151,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, bool room = false; int max_slots = 0; - f2fs_bug_on(level > MAX_DIR_HASH_DEPTH); + f2fs_bug_on(F2FS_I_SB(dir), level > MAX_DIR_HASH_DEPTH); nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); nblock = bucket_blocks(level); @@ -160,8 +168,8 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, continue; } - de = find_in_block(dentry_page, name, namelen, - &max_slots, namehash, res_page); + de = find_in_block(dentry_page, name, &max_slots, + namehash, res_page); if (de) break; @@ -187,8 +195,6 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, struct qstr *child, struct page **res_page) { - const char *name = child->name; - size_t namelen = child->len; unsigned long npages = dir_blocks(dir); struct f2fs_dir_entry *de = NULL; f2fs_hash_t name_hash; @@ -200,12 +206,11 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, *res_page = NULL; - name_hash = f2fs_dentry_hash(name, namelen); + name_hash = f2fs_dentry_hash(child); max_depth = F2FS_I(dir)->i_current_depth; for (level = 0; level < max_depth; level++) { - de = find_in_level(dir, level, name, - namelen, name_hash, res_page); + de = find_in_level(dir, level, child, name_hash, res_page); if (de) break; } @@ -279,10 +284,9 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage) int update_dent_inode(struct inode *inode, const struct qstr *name) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct page *page; - page = get_node_page(sbi, inode->i_ino); + page = get_node_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(page)) return PTR_ERR(page); @@ -298,14 +302,13 @@ static int make_empty_dir(struct inode *inode, struct page *dentry_page; struct f2fs_dentry_block *dentry_blk; struct f2fs_dir_entry *de; - void *kaddr; dentry_page = get_new_data_page(inode, page, 0, true); if (IS_ERR(dentry_page)) return PTR_ERR(dentry_page); - kaddr = kmap_atomic(dentry_page); - dentry_blk = (struct f2fs_dentry_block *)kaddr; + + dentry_blk = kmap_atomic(dentry_page); de = &dentry_blk->dentry[0]; de->name_len = cpu_to_le16(1); @@ -323,7 +326,7 @@ static int make_empty_dir(struct inode *inode, test_and_set_bit_le(0, &dentry_blk->dentry_bitmap); test_and_set_bit_le(1, 
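A note on the find_in_block()/early_match_name() reshuffle above: entries are rejected on length first, then on the precomputed hash, and memcmp() runs only on near-hits. A self-contained sketch of that ordering (the hash is a toy stand-in, not f2fs_dentry_hash()):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct disk_dentry {
	unsigned short name_len;
	unsigned int hash;
	const char *name;
};

static unsigned int toy_hash(const char *s, size_t n)
{
	unsigned int h = 0;

	while (n--)
		h = h * 31 + (unsigned char)*s++;
	return h;
}

static bool entry_matches(const struct disk_dentry *de,
			  const char *name, size_t len, unsigned int hash)
{
	if (de->name_len != len)	/* cheapest rejection first */
		return false;
	if (de->hash != hash)		/* then the precomputed hash */
		return false;
	return !memcmp(de->name, name, len);	/* memcmp only on near-hits */
}

int main(void)
{
	struct disk_dentry de = { 3, toy_hash("foo", 3), "foo" };

	printf("%d\n", entry_matches(&de, "foo", 3, toy_hash("foo", 3)));
	return 0;
}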
&dentry_blk->dentry_bitmap); - kunmap_atomic(kaddr); + kunmap_atomic(dentry_blk); set_page_dirty(dentry_page); f2fs_put_page(dentry_page, 1); @@ -337,7 +340,7 @@ static struct page *init_inode_metadata(struct inode *inode, int err; if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { - page = new_inode_page(inode, name); + page = new_inode_page(inode); if (IS_ERR(page)) return page; @@ -355,14 +358,15 @@ static struct page *init_inode_metadata(struct inode *inode, if (err) goto put_error; } else { - page = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino); + page = get_node_page(F2FS_I_SB(dir), inode->i_ino); if (IS_ERR(page)) return page; set_cold_node(inode, page); } - init_dent_inode(name, page); + if (name) + init_dent_inode(name, page); /* * This file should be checkpointed during fsync. @@ -370,6 +374,12 @@ static struct page *init_inode_metadata(struct inode *inode, */ if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) { file_lost_pino(inode); + /* + * If link the tmpfile to alias through linkat path, + * we should remove this inode from orphan list. + */ + if (inode->i_nlink == 0) + remove_orphan_inode(F2FS_I_SB(dir), inode->i_ino); inc_nlink(inode); } return page; @@ -379,7 +389,7 @@ put_error: error: /* once the failed inode becomes a bad inode, i_mode is S_IFREG */ truncate_inode_pages(&inode->i_data, 0); - truncate_blocks(inode, 0); + truncate_blocks(inode, 0, false); remove_dirty_dir_inode(inode); remove_inode_page(inode); return ERR_PTR(err); @@ -453,7 +463,7 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, int err = 0; int i; - dentry_hash = f2fs_dentry_hash(name->name, name->len); + dentry_hash = f2fs_dentry_hash(name); level = 0; current_depth = F2FS_I(dir)->i_current_depth; if (F2FS_I(dir)->chash == dentry_hash) { @@ -529,8 +539,29 @@ fail: return err; } +int f2fs_do_tmpfile(struct inode *inode, struct inode *dir) +{ + struct page *page; + int err = 0; + + down_write(&F2FS_I(inode)->i_sem); + page = init_inode_metadata(inode, dir, NULL); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto fail; + } + /* we don't need to mark_inode_dirty now */ + update_inode(inode, page); + f2fs_put_page(page, 1); + + clear_inode_flag(F2FS_I(inode), FI_NEW_INODE); +fail: + up_write(&F2FS_I(inode)->i_sem); + return err; +} + /* - * It only removes the dentry from the dentry page,corresponding name + * It only removes the dentry from the dentry page, corresponding name * entry in name page does not need to be touched during deletion. 
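The new f2fs_do_tmpfile() above, together with the linkat handling added to init_inode_metadata(), covers unnamed inodes: an O_TMPFILE inode stays on the orphan list until it gains a name, at which point the hunk above drops it from that list. From userspace the sequence looks like this (the mount point is a placeholder, and a kernel carrying this series is assumed):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/f2fs", O_TMPFILE | O_RDWR, 0600);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* while unnamed, f2fs keeps the inode on its orphan list */
	if (write(fd, "scratch", 7) != 7)
		perror("write");

	char self[64];
	snprintf(self, sizeof(self), "/proc/self/fd/%d", fd);
	/* naming the file is the linkat path the hunk above handles:
	 * the inode leaves the orphan list and gets a directory entry */
	if (linkat(AT_FDCWD, self, AT_FDCWD, "/mnt/f2fs/visible",
		   AT_SYMLINK_FOLLOW))
		perror("linkat");
	close(fd);
	return 0;
}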
*/ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, @@ -538,17 +569,15 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, { struct f2fs_dentry_block *dentry_blk; unsigned int bit_pos; - struct address_space *mapping = page->mapping; - struct inode *dir = mapping->host; + struct inode *dir = page->mapping->host; int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); - void *kaddr = page_address(page); int i; lock_page(page); f2fs_wait_on_page_writeback(page, DATA); - dentry_blk = (struct f2fs_dentry_block *)kaddr; - bit_pos = dentry - (struct f2fs_dir_entry *)dentry_blk->dentry; + dentry_blk = page_address(page); + bit_pos = dentry - dentry_blk->dentry; for (i = 0; i < slots; i++) test_and_clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); @@ -562,7 +591,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, dir->i_ctime = dir->i_mtime = CURRENT_TIME; if (inode) { - struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); down_write(&F2FS_I(inode)->i_sem); @@ -589,7 +618,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, truncate_hole(dir, page->index, page->index + 1); clear_page_dirty_for_io(page); ClearPageUptodate(page); - inode_dec_dirty_dents(dir); + inode_dec_dirty_pages(dir); } f2fs_put_page(page, 1); } @@ -603,7 +632,6 @@ bool f2fs_empty_dir(struct inode *dir) unsigned long nblock = dir_blocks(dir); for (bidx = 0; bidx < nblock; bidx++) { - void *kaddr; dentry_page = get_lock_data_page(dir, bidx); if (IS_ERR(dentry_page)) { if (PTR_ERR(dentry_page) == -ENOENT) @@ -612,8 +640,8 @@ bool f2fs_empty_dir(struct inode *dir) return false; } - kaddr = kmap_atomic(dentry_page); - dentry_blk = (struct f2fs_dentry_block *)kaddr; + + dentry_blk = kmap_atomic(dentry_page); if (bidx == 0) bit_pos = 2; else @@ -621,7 +649,7 @@ bool f2fs_empty_dir(struct inode *dir) bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, NR_DENTRY_IN_BLOCK, bit_pos); - kunmap_atomic(kaddr); + kunmap_atomic(dentry_blk); f2fs_put_page(dentry_page, 1); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 58df97e174d0..8171e80b2ee9 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -21,10 +21,16 @@ #include <linux/sched.h> #ifdef CONFIG_F2FS_CHECK_FS -#define f2fs_bug_on(condition) BUG_ON(condition) +#define f2fs_bug_on(sbi, condition) BUG_ON(condition) #define f2fs_down_write(x, y) down_write_nest_lock(x, y) #else -#define f2fs_bug_on(condition) +#define f2fs_bug_on(sbi, condition) \ + do { \ + if (unlikely(condition)) { \ + WARN_ON(1); \ + sbi->need_fsck = true; \ + } \ + } while (0) #define f2fs_down_write(x, y) down_write(x) #endif @@ -41,6 +47,7 @@ #define F2FS_MOUNT_INLINE_XATTR 0x00000080 #define F2FS_MOUNT_INLINE_DATA 0x00000100 #define F2FS_MOUNT_FLUSH_MERGE 0x00000200 +#define F2FS_MOUNT_NOBARRIER 0x00000400 #define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) @@ -89,6 +96,20 @@ enum { SIT_BITMAP }; +enum { + CP_UMOUNT, + CP_SYNC, + CP_DISCARD, +}; + +struct cp_control { + int reason; + __u64 trim_start; + __u64 trim_end; + __u64 trim_minlen; + __u64 trimmed; +}; + /* * For CP/NAT/SIT/SSA readahead */ @@ -96,11 +117,19 @@ enum { META_CP, META_NAT, META_SIT, - META_SSA + META_SSA, + META_POR, +}; + +/* for the list of ino */ +enum { + ORPHAN_INO, /* for orphan ino list */ + APPEND_INO, /* for append ino list */ + UPDATE_INO, /* for update ino list */ + MAX_INO_ENTRY, /* max. 
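The reworked f2fs_bug_on() at the top of this f2fs.h hunk trades a crash for a warning plus a latched need_fsck flag, which do_checkpoint() then persists as CP_FSCK_FLAG. A runnable userspace sketch of that soft-assert pattern (names mirror the diff; the superblock struct is reduced to the one flag):

#include <stdio.h>

struct sb_info {
	int need_fsck;
};

/* analog of the non-CONFIG_F2FS_CHECK_FS f2fs_bug_on() above: warn and
 * latch a flag instead of crashing; the next checkpoint records it */
#define soft_bug_on(sbi, cond)						\
	do {								\
		if (cond) {						\
			fprintf(stderr, "WARN: %s:%d: %s\n",		\
				__FILE__, __LINE__, #cond);		\
			(sbi)->need_fsck = 1;				\
		}							\
	} while (0)

int main(void)
{
	struct sb_info sbi = { 0 };
	int count = -1;

	soft_bug_on(&sbi, count < 0);	/* would have been a BUG_ON() */
	printf("need_fsck = %d\n", sbi.need_fsck);
	return 0;
}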
list */ }; -/* for the list of orphan inodes */ -struct orphan_inode_entry { +struct ino_entry { struct list_head list; /* list head */ nid_t ino; /* inode number */ }; @@ -122,7 +151,9 @@ struct discard_entry { struct fsync_inode_entry { struct list_head list; /* list head */ struct inode *inode; /* vfs inode pointer */ - block_t blkaddr; /* block address locating the last inode */ + block_t blkaddr; /* block address locating the last fsync */ + block_t last_dentry; /* block address locating the last dentry */ + block_t last_inode; /* block address locating the last inode */ }; #define nats_in_cursum(sum) (le16_to_cpu(sum->n_nats)) @@ -133,6 +164,9 @@ struct fsync_inode_entry { #define sit_in_journal(sum, i) (sum->sit_j.entries[i].se) #define segno_in_journal(sum, i) (sum->sit_j.entries[i].segno) +#define MAX_NAT_JENTRIES(sum) (NAT_JOURNAL_ENTRIES - nats_in_cursum(sum)) +#define MAX_SIT_JENTRIES(sum) (SIT_JOURNAL_ENTRIES - sits_in_cursum(sum)) + static inline int update_nats_in_cursum(struct f2fs_summary_block *rs, int i) { int before = nats_in_cursum(rs); @@ -147,11 +181,24 @@ static inline int update_sits_in_cursum(struct f2fs_summary_block *rs, int i) return before; } +static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size, + int type) +{ + if (type == NAT_JOURNAL) + return size <= MAX_NAT_JENTRIES(sum); + return size <= MAX_SIT_JENTRIES(sum); +} + /* * ioctl commands */ -#define F2FS_IOC_GETFLAGS FS_IOC_GETFLAGS -#define F2FS_IOC_SETFLAGS FS_IOC_SETFLAGS +#define F2FS_IOC_GETFLAGS FS_IOC_GETFLAGS +#define F2FS_IOC_SETFLAGS FS_IOC_SETFLAGS + +#define F2FS_IOCTL_MAGIC 0xf5 +#define F2FS_IOC_START_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 1) +#define F2FS_IOC_COMMIT_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 2) +#define F2FS_IOC_START_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 3) #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* @@ -214,13 +261,16 @@ struct f2fs_inode_info { /* Use below internally in f2fs*/ unsigned long flags; /* use to pass per-file flags */ struct rw_semaphore i_sem; /* protect fi info */ - atomic_t dirty_dents; /* # of dirty dentry pages */ + atomic_t dirty_pages; /* # of dirty pages */ f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ nid_t i_xattr_nid; /* node id that contains xattrs */ unsigned long long xattr_ver; /* cp version of xattr modification */ struct extent_info ext; /* in-memory extent cache entry */ struct dir_inode_entry *dirty_dir; /* the pointer of dirty dir */ + + struct list_head inmem_pages; /* inmemory pages managed by f2fs */ + struct mutex inmem_lock; /* lock for inmemory pages */ }; static inline void get_extent_info(struct extent_info *ext, @@ -252,10 +302,11 @@ struct f2fs_nm_info { /* NAT cache management */ struct radix_tree_root nat_root;/* root of the nat entry cache */ + struct radix_tree_root nat_set_root;/* root of the nat set cache */ rwlock_t nat_tree_lock; /* protect nat_tree_lock */ - unsigned int nat_cnt; /* the # of cached nat entries */ struct list_head nat_entries; /* cached nat entry list (clean) */ - struct list_head dirty_nat_entries; /* cached nat entry list (dirty) */ + unsigned int nat_cnt; /* the # of cached nat entries */ + unsigned int dirty_nat_cnt; /* total num of nat entries in set */ /* free node ids management */ struct radix_tree_root free_nid_root;/* root of the free_nid cache */ @@ -322,18 +373,16 @@ enum { }; struct flush_cmd { - struct flush_cmd *next; struct completion wait; + struct llist_node llnode; int ret; }; struct 
flush_cmd_control { struct task_struct *f2fs_issue_flush; /* flush thread */ wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */ - struct flush_cmd *issue_list; /* list for command issue */ - struct flush_cmd *dispatch_list; /* list for command dispatch */ - spinlock_t issue_lock; /* for issue list lock */ - struct flush_cmd *issue_tail; /* list tail of issue list */ + struct llist_head issue_list; /* list for command issue */ + struct llist_node *dispatch_list; /* list for command dispatch */ }; struct f2fs_sm_info { @@ -359,8 +408,11 @@ struct f2fs_sm_info { int nr_discards; /* # of discards in the list */ int max_discards; /* max. discards to be issued */ + struct list_head sit_entry_set; /* sit entry set list */ + unsigned int ipu_policy; /* in-place-update policy */ unsigned int min_ipu_util; /* in-place-update threshold */ + unsigned int min_fsync_blocks; /* threshold for fsync */ /* for flush command control */ struct flush_cmd_control *cmd_control_info; @@ -385,7 +437,7 @@ enum count_type { }; /* - * The below are the page types of bios used in submti_bio(). + * The below are the page types of bios used in submit_bio(). * The available types are: * DATA User data pages. It operates as async mode. * NODE Node pages. It operates as async mode. @@ -424,6 +476,7 @@ struct f2fs_sb_info { struct buffer_head *raw_super_buf; /* buffer head of raw sb */ struct f2fs_super_block *raw_super; /* raw super block pointer */ int s_dirty; /* dirty flag for checkpoint */ + bool need_fsck; /* need fsck.f2fs to fix */ /* for node-related operations */ struct f2fs_nm_info *nm_info; /* node manager */ @@ -442,14 +495,17 @@ struct f2fs_sb_info { struct inode *meta_inode; /* cache meta blocks */ struct mutex cp_mutex; /* checkpoint procedure lock */ struct rw_semaphore cp_rwsem; /* blocking FS operations */ - struct mutex node_write; /* locking node writes */ + struct rw_semaphore node_write; /* locking node writes */ struct mutex writepages; /* mutex for writepages() */ bool por_doing; /* recovery is doing or not */ wait_queue_head_t cp_wait; - /* for orphan inode management */ - struct list_head orphan_inode_list; /* orphan inode list */ - spinlock_t orphan_inode_lock; /* for orphan inode list */ + /* for inode management */ + struct radix_tree_root ino_root[MAX_INO_ENTRY]; /* ino entry array */ + spinlock_t ino_lock[MAX_INO_ENTRY]; /* for ino entry lock */ + struct list_head ino_list[MAX_INO_ENTRY]; /* inode list head */ + + /* for orphan inode, use 0'th array */ unsigned int n_orphans; /* # of orphan inodes */ unsigned int max_orphans; /* max orphan inodes */ @@ -457,7 +513,7 @@ struct f2fs_sb_info { struct list_head dir_inode_list; /* dir inode list */ spinlock_t dir_inode_lock; /* for dir inode list lock */ - /* basic file system units */ + /* basic filesystem units */ unsigned int log_sectors_per_block; /* log2 sectors per block */ unsigned int log_blocksize; /* log2 block size */ unsigned int blocksize; /* block size */ @@ -526,6 +582,21 @@ static inline struct f2fs_sb_info *F2FS_SB(struct super_block *sb) return sb->s_fs_info; } +static inline struct f2fs_sb_info *F2FS_I_SB(struct inode *inode) +{ + return F2FS_SB(inode->i_sb); +} + +static inline struct f2fs_sb_info *F2FS_M_SB(struct address_space *mapping) +{ + return F2FS_I_SB(mapping->host); +} + +static inline struct f2fs_sb_info *F2FS_P_SB(struct page *page) +{ + return F2FS_M_SB(page->mapping); +} + static inline struct f2fs_super_block *F2FS_RAW_SUPER(struct f2fs_sb_info *sbi) { return (struct f2fs_super_block 
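The flush_cmd/flush_cmd_control rework above replaces the spinlock-guarded, hand-rolled issue list with a lock-free llist: producers llist_add() themselves and sleep on their completion, while the flush thread detaches the whole backlog with one llist_del_all(), issues a single device flush for the batch, and completes every waiter. A runnable, single-threaded sketch of just the push/detach mechanics in C11 atomics (the kernel's llist and completion primitives are assumed, not copied):

#include <stdatomic.h>
#include <stdio.h>

struct flush_cmd {
	struct flush_cmd *next;
	int ret;
};

static _Atomic(struct flush_cmd *) issue_list;

static void issue_flush(struct flush_cmd *cmd)
{
	cmd->next = atomic_load(&issue_list);
	/* retry loop plays the role of llist_add(): push onto the stack */
	while (!atomic_compare_exchange_weak(&issue_list, &cmd->next, cmd))
		;
}

static void dispatch_all(void)
{
	/* llist_del_all() analog: detach the whole pending batch at once */
	struct flush_cmd *batch = atomic_exchange(&issue_list, NULL);
	int ret = 0;	/* one device flush would serve the entire batch */

	for (; batch; batch = batch->next)
		batch->ret = ret;	/* complete(&cmd->wait) analog */
}

int main(void)
{
	struct flush_cmd a = { 0 }, b = { 0 };

	issue_flush(&a);
	issue_flush(&b);
	dispatch_all();
	printf("a.ret=%d b.ret=%d\n", a.ret, b.ret);
	return 0;
}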
*)(sbi->raw_super); @@ -690,8 +761,8 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, blkcnt_t count) { spin_lock(&sbi->stat_lock); - f2fs_bug_on(sbi->total_valid_block_count < (block_t) count); - f2fs_bug_on(inode->i_blocks < count); + f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count); + f2fs_bug_on(sbi, inode->i_blocks < count); inode->i_blocks -= count; sbi->total_valid_block_count -= (block_t)count; spin_unlock(&sbi->stat_lock); @@ -703,10 +774,11 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) F2FS_SET_SB_DIRT(sbi); } -static inline void inode_inc_dirty_dents(struct inode *inode) +static inline void inode_inc_dirty_pages(struct inode *inode) { - inc_page_count(F2FS_SB(inode->i_sb), F2FS_DIRTY_DENTS); - atomic_inc(&F2FS_I(inode)->dirty_dents); + atomic_inc(&F2FS_I(inode)->dirty_pages); + if (S_ISDIR(inode->i_mode)) + inc_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS); } static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) @@ -714,13 +786,15 @@ static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) atomic_dec(&sbi->nr_pages[count_type]); } -static inline void inode_dec_dirty_dents(struct inode *inode) +static inline void inode_dec_dirty_pages(struct inode *inode) { - if (!S_ISDIR(inode->i_mode)) + if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode)) return; - dec_page_count(F2FS_SB(inode->i_sb), F2FS_DIRTY_DENTS); - atomic_dec(&F2FS_I(inode)->dirty_dents); + atomic_dec(&F2FS_I(inode)->dirty_pages); + + if (S_ISDIR(inode->i_mode)) + dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS); } static inline int get_pages(struct f2fs_sb_info *sbi, int count_type) @@ -728,9 +802,9 @@ static inline int get_pages(struct f2fs_sb_info *sbi, int count_type) return atomic_read(&sbi->nr_pages[count_type]); } -static inline int get_dirty_dents(struct inode *inode) +static inline int get_dirty_pages(struct inode *inode) { - return atomic_read(&F2FS_I(inode)->dirty_dents); + return atomic_read(&F2FS_I(inode)->dirty_pages); } static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) @@ -768,7 +842,7 @@ static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) if (flag == NAT_BITMAP) return &ckpt->sit_nat_version_bitmap; else - return ((unsigned char *)ckpt + F2FS_BLKSIZE); + return (unsigned char *)ckpt + F2FS_BLKSIZE; } else { offset = (flag == NAT_BITMAP) ? 
le32_to_cpu(ckpt->sit_ver_bitmap_bytesize) : 0; @@ -786,7 +860,7 @@ static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi) /* * odd numbered checkpoint should at cp segment 0 - * and even segent must be at cp segment 1 + * and even segment must be at cp segment 1 */ if (!(ckpt_version & 1)) start_addr += sbi->blocks_per_seg; @@ -835,9 +909,9 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, { spin_lock(&sbi->stat_lock); - f2fs_bug_on(!sbi->total_valid_block_count); - f2fs_bug_on(!sbi->total_valid_node_count); - f2fs_bug_on(!inode->i_blocks); + f2fs_bug_on(sbi, !sbi->total_valid_block_count); + f2fs_bug_on(sbi, !sbi->total_valid_node_count); + f2fs_bug_on(sbi, !inode->i_blocks); inode->i_blocks--; sbi->total_valid_node_count--; @@ -854,7 +928,7 @@ static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi) static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi) { spin_lock(&sbi->stat_lock); - f2fs_bug_on(sbi->total_valid_inode_count == sbi->total_node_count); + f2fs_bug_on(sbi, sbi->total_valid_inode_count == sbi->total_node_count); sbi->total_valid_inode_count++; spin_unlock(&sbi->stat_lock); } @@ -862,7 +936,7 @@ static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi) static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi) { spin_lock(&sbi->stat_lock); - f2fs_bug_on(!sbi->total_valid_inode_count); + f2fs_bug_on(sbi, !sbi->total_valid_inode_count); sbi->total_valid_inode_count--; spin_unlock(&sbi->stat_lock); } @@ -878,7 +952,7 @@ static inline void f2fs_put_page(struct page *page, int unlock) return; if (unlock) { - f2fs_bug_on(!PageLocked(page)); + f2fs_bug_on(F2FS_P_SB(page), !PageLocked(page)); unlock_page(page); } page_cache_release(page); @@ -983,11 +1057,17 @@ enum { FI_NO_EXTENT, /* not to use the extent cache */ FI_INLINE_XATTR, /* used for inline xattr */ FI_INLINE_DATA, /* used for inline data*/ + FI_APPEND_WRITE, /* inode has appended data */ + FI_UPDATE_WRITE, /* inode has in-place-update data */ + FI_NEED_IPU, /* used for ipu per file */ + FI_ATOMIC_FILE, /* indicate atomic file */ + FI_VOLATILE_FILE, /* indicate volatile file */ }; static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) { - set_bit(flag, &fi->flags); + if (!test_bit(flag, &fi->flags)) + set_bit(flag, &fi->flags); } static inline int is_inode_flag_set(struct f2fs_inode_info *fi, int flag) @@ -997,7 +1077,8 @@ static inline int is_inode_flag_set(struct f2fs_inode_info *fi, int flag) static inline void clear_inode_flag(struct f2fs_inode_info *fi, int flag) { - clear_bit(flag, &fi->flags); + if (test_bit(flag, &fi->flags)) + clear_bit(flag, &fi->flags); } static inline void set_acl_inode(struct f2fs_inode_info *fi, umode_t mode) @@ -1067,6 +1148,16 @@ static inline int f2fs_has_inline_data(struct inode *inode) return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DATA); } +static inline bool f2fs_is_atomic_file(struct inode *inode) +{ + return is_inode_flag_set(F2FS_I(inode), FI_ATOMIC_FILE); +} + +static inline bool f2fs_is_volatile_file(struct inode *inode) +{ + return is_inode_flag_set(F2FS_I(inode), FI_VOLATILE_FILE); +} + static inline void *inline_data_addr(struct page *page) { struct f2fs_inode *ri = F2FS_INODE(page); @@ -1078,6 +1169,11 @@ static inline int f2fs_readonly(struct super_block *sb) return sb->s_flags & MS_RDONLY; } +static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi) +{ + return is_set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); +} + static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi) { 
set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); @@ -1099,7 +1195,7 @@ static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi) */ int f2fs_sync_file(struct file *, loff_t, loff_t, int); void truncate_data_blocks(struct dnode_of_data *); -int truncate_blocks(struct inode *, u64); +int truncate_blocks(struct inode *, u64, bool); void f2fs_truncate(struct inode *); int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *); int f2fs_setattr(struct dentry *, struct iattr *); @@ -1118,6 +1214,7 @@ void update_inode(struct inode *, struct page *); void update_inode_page(struct inode *); int f2fs_write_inode(struct inode *, struct writeback_control *); void f2fs_evict_inode(struct inode *); +void handle_failed_inode(struct inode *); /* * namei.c @@ -1136,6 +1233,7 @@ void f2fs_set_link(struct inode *, struct f2fs_dir_entry *, int update_dent_inode(struct inode *, const struct qstr *); int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *); void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *); +int f2fs_do_tmpfile(struct inode *, struct inode *); int f2fs_make_empty(struct inode *, struct inode *); bool f2fs_empty_dir(struct inode *); @@ -1155,7 +1253,7 @@ void f2fs_msg(struct super_block *, const char *, const char *, ...); /* * hash.c */ -f2fs_hash_t f2fs_dentry_hash(const char *, size_t); +f2fs_hash_t f2fs_dentry_hash(const struct qstr *); /* * node.c @@ -1164,16 +1262,16 @@ struct dnode_of_data; struct node_info; bool available_free_memory(struct f2fs_sb_info *, int); -int is_checkpointed_node(struct f2fs_sb_info *, nid_t); -bool fsync_mark_done(struct f2fs_sb_info *, nid_t); -void fsync_mark_clear(struct f2fs_sb_info *, nid_t); +bool is_checkpointed_node(struct f2fs_sb_info *, nid_t); +bool has_fsynced_inode(struct f2fs_sb_info *, nid_t); +bool need_inode_block_update(struct f2fs_sb_info *, nid_t); void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *); int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); int truncate_inode_blocks(struct inode *, pgoff_t); int truncate_xattr_node(struct inode *, struct page *); int wait_on_node_pages_writeback(struct f2fs_sb_info *, nid_t); void remove_inode_page(struct inode *); -struct page *new_inode_page(struct inode *, const struct qstr *); +struct page *new_inode_page(struct inode *); struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *); void ra_node_page(struct f2fs_sb_info *, nid_t); struct page *get_node_page(struct f2fs_sb_info *, pgoff_t); @@ -1183,9 +1281,8 @@ int sync_node_pages(struct f2fs_sb_info *, nid_t, struct writeback_control *); bool alloc_nid(struct f2fs_sb_info *, nid_t *); void alloc_nid_done(struct f2fs_sb_info *, nid_t); void alloc_nid_failed(struct f2fs_sb_info *, nid_t); -void recover_node_page(struct f2fs_sb_info *, struct page *, - struct f2fs_summary *, struct node_info *, block_t); -bool recover_xattr_data(struct inode *, struct page *, block_t); +void recover_inline_xattr(struct inode *, struct page *); +void recover_xattr_data(struct inode *, struct page *, block_t); int recover_inode_page(struct f2fs_sb_info *, struct page *); int restore_node_summary(struct f2fs_sb_info *, unsigned int, struct f2fs_summary_block *); @@ -1198,6 +1295,8 @@ void destroy_node_manager_caches(void); /* * segment.c */ +void register_inmem_page(struct inode *, struct page *); +void commit_inmem_pages(struct inode *, bool); void f2fs_balance_fs(struct f2fs_sb_info *); void f2fs_balance_fs_bg(struct f2fs_sb_info *); int f2fs_issue_flush(struct 
f2fs_sb_info *); @@ -1206,9 +1305,11 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *); void invalidate_blocks(struct f2fs_sb_info *, block_t); void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); void clear_prefree_segments(struct f2fs_sb_info *); -void discard_next_dnode(struct f2fs_sb_info *); +void release_discard_addrs(struct f2fs_sb_info *); +void discard_next_dnode(struct f2fs_sb_info *, block_t); int npages_for_summary_flush(struct f2fs_sb_info *); void allocate_new_segments(struct f2fs_sb_info *); +int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *); struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); void write_meta_page(struct f2fs_sb_info *, struct page *); void write_node_page(struct f2fs_sb_info *, struct page *, @@ -1218,8 +1319,6 @@ void write_data_page(struct page *, struct dnode_of_data *, block_t *, void rewrite_data_page(struct page *, block_t, struct f2fs_io_info *); void recover_data_page(struct f2fs_sb_info *, struct page *, struct f2fs_summary *, block_t, block_t); -void rewrite_node_page(struct f2fs_sb_info *, struct page *, - struct f2fs_summary *, block_t, block_t); void allocate_data_block(struct f2fs_sb_info *, struct page *, block_t, block_t *, struct f2fs_summary *, int); void f2fs_wait_on_page_writeback(struct page *, enum page_type); @@ -1227,7 +1326,7 @@ void write_data_summaries(struct f2fs_sb_info *, block_t); void write_node_summaries(struct f2fs_sb_info *, block_t); int lookup_journal_in_cursum(struct f2fs_summary_block *, int, unsigned int, int); -void flush_sit_entries(struct f2fs_sb_info *); +void flush_sit_entries(struct f2fs_sb_info *, struct cp_control *); int build_segment_manager(struct f2fs_sb_info *); void destroy_segment_manager(struct f2fs_sb_info *); int __init create_segment_manager_caches(void); @@ -1238,20 +1337,25 @@ void destroy_segment_manager_caches(void); */ struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); -int ra_meta_pages(struct f2fs_sb_info *, int, int, int); +struct page *get_meta_page_ra(struct f2fs_sb_info *, pgoff_t); +int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int); long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); +void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type); +void remove_dirty_inode(struct f2fs_sb_info *, nid_t, int type); +void release_dirty_inode(struct f2fs_sb_info *); +bool exist_written_data(struct f2fs_sb_info *, nid_t, int); int acquire_orphan_inode(struct f2fs_sb_info *); void release_orphan_inode(struct f2fs_sb_info *); void add_orphan_inode(struct f2fs_sb_info *, nid_t); void remove_orphan_inode(struct f2fs_sb_info *, nid_t); void recover_orphan_inodes(struct f2fs_sb_info *); int get_valid_checkpoint(struct f2fs_sb_info *); -void set_dirty_dir_page(struct inode *, struct page *); +void update_dirty_page(struct inode *, struct page *); void add_dirty_dir_inode(struct inode *); void remove_dirty_dir_inode(struct inode *); void sync_dirty_dir_inodes(struct f2fs_sb_info *); -void write_checkpoint(struct f2fs_sb_info *, bool); -void init_orphan_info(struct f2fs_sb_info *); +void write_checkpoint(struct f2fs_sb_info *, struct cp_control *); +void init_ino_entry_info(struct f2fs_sb_info *); int __init create_checkpoint_caches(void); void destroy_checkpoint_caches(void); @@ -1295,7 +1399,6 @@ bool space_for_roll_forward(struct f2fs_sb_info *); struct f2fs_stat_info { struct list_head stat_list; struct f2fs_sb_info *sbi; - struct mutex stat_lock; int 
all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs; int main_area_segs, main_area_sections, main_area_zones; int hit_ext, total_ext; @@ -1335,12 +1438,12 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) #define stat_inc_inline_inode(inode) \ do { \ if (f2fs_has_inline_data(inode)) \ - ((F2FS_SB(inode->i_sb))->inline_inode++); \ + ((F2FS_I_SB(inode))->inline_inode++); \ } while (0) #define stat_dec_inline_inode(inode) \ do { \ if (f2fs_has_inline_data(inode)) \ - ((F2FS_SB(inode->i_sb))->inline_inode--); \ + ((F2FS_I_SB(inode))->inline_inode--); \ } while (0) #define stat_inc_seg_type(sbi, curseg) \ @@ -1417,8 +1520,8 @@ extern const struct inode_operations f2fs_special_inode_operations; */ bool f2fs_may_inline(struct inode *); int f2fs_read_inline_data(struct inode *, struct page *); -int f2fs_convert_inline_data(struct inode *, pgoff_t); +int f2fs_convert_inline_data(struct inode *, pgoff_t, struct page *); int f2fs_write_inline_data(struct inode *, struct page *, unsigned int); void truncate_inline_data(struct inode *, u64); -int recover_inline_data(struct inode *, struct page *); +bool recover_inline_data(struct inode *, struct page *); #endif diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7d8b96275092..8e68bb64f835 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -33,7 +33,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, { struct page *page = vmf->page; struct inode *inode = file_inode(vma->vm_file); - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; int err; @@ -41,6 +41,11 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, sb_start_pagefault(inode->i_sb); + /* force to convert with normal data indices */ + err = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1, page); + if (err) + goto out; + /* block allocation */ f2fs_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); @@ -110,11 +115,31 @@ static int get_parent_ino(struct inode *inode, nid_t *pino) return 1; } +static inline bool need_do_checkpoint(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + bool need_cp = false; + + if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1) + need_cp = true; + else if (file_wrong_pino(inode)) + need_cp = true; + else if (!space_for_roll_forward(sbi)) + need_cp = true; + else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino)) + need_cp = true; + else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi))) + need_cp = true; + + return need_cp; +} + int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; struct f2fs_inode_info *fi = F2FS_I(inode); - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + nid_t ino = inode->i_ino; int ret = 0; bool need_cp = false; struct writeback_control wbc = { @@ -127,32 +152,47 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) return 0; trace_f2fs_sync_file_enter(inode); + + /* if fdatasync is triggered, let's do in-place-update */ + if (get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks) + set_inode_flag(fi, FI_NEED_IPU); ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + clear_inode_flag(fi, FI_NEED_IPU); + if (ret) { trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); return ret; } + /* + * if there is no written data, don't waste time to write recovery info. 
+ */ + if (!is_inode_flag_set(fi, FI_APPEND_WRITE) && + !exist_written_data(sbi, ino, APPEND_INO)) { + struct page *i = find_get_page(NODE_MAPPING(sbi), ino); + + /* but we still need to catch any pending inode updates */ + if ((i && PageDirty(i)) || need_inode_block_update(sbi, ino)) { + f2fs_put_page(i, 0); + goto go_write; + } + f2fs_put_page(i, 0); + + if (is_inode_flag_set(fi, FI_UPDATE_WRITE) || + exist_written_data(sbi, ino, UPDATE_INO)) + goto flush_out; + goto out; + } +go_write: /* guarantee free sections for fsync */ f2fs_balance_fs(sbi); - down_read(&fi->i_sem); - /* * Both of fdatasync() and fsync() are able to be recovered from * sudden-power-off. */ - if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1) - need_cp = true; - else if (file_wrong_pino(inode)) - need_cp = true; - else if (!space_for_roll_forward(sbi)) - need_cp = true; - else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino)) - need_cp = true; - else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi))) - need_cp = true; - + down_read(&fi->i_sem); + need_cp = need_do_checkpoint(inode); up_read(&fi->i_sem); if (need_cp) { @@ -176,19 +216,28 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) up_write(&fi->i_sem); } } else { - /* if there is no written node page, write its inode page */ - while (!sync_node_pages(sbi, inode->i_ino, &wbc)) { - if (fsync_mark_done(sbi, inode->i_ino)) - goto out; +sync_nodes: + sync_node_pages(sbi, ino, &wbc); + + if (need_inode_block_update(sbi, ino)) { mark_inode_dirty_sync(inode); ret = f2fs_write_inode(inode, NULL); if (ret) goto out; + goto sync_nodes; } - ret = wait_on_node_pages_writeback(sbi, inode->i_ino); + + ret = wait_on_node_pages_writeback(sbi, ino); if (ret) goto out; - ret = f2fs_issue_flush(F2FS_SB(inode->i_sb)); + + /* once recovery info is written, don't need to track this */ + remove_dirty_inode(sbi, ino, APPEND_INO); + clear_inode_flag(fi, FI_APPEND_WRITE); +flush_out: + remove_dirty_inode(sbi, ino, UPDATE_INO); + clear_inode_flag(fi, FI_UPDATE_WRITE); + ret = f2fs_issue_flush(F2FS_I_SB(inode)); } out: trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); @@ -206,8 +255,9 @@ static pgoff_t __get_first_dirty_index(struct address_space *mapping, /* find first dirty page index */ pagevec_init(&pvec, 0); - nr_pages = pagevec_lookup_tag(&pvec, mapping, &pgofs, PAGECACHE_TAG_DIRTY, 1); - pgofs = nr_pages ? pvec.pages[0]->index: LONG_MAX; + nr_pages = pagevec_lookup_tag(&pvec, mapping, &pgofs, + PAGECACHE_TAG_DIRTY, 1); + pgofs = nr_pages ? pvec.pages[0]->index : LONG_MAX; pagevec_release(&pvec); return pgofs; } @@ -262,7 +312,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) if (err && err != -ENOENT) { goto fail; } else if (err == -ENOENT) { - /* direct node is not exist */ + /* direct node does not exist */ if (whence == SEEK_DATA) { pgofs = PGOFS_OF_NEXT_DNODE(pgofs, F2FS_I(inode)); @@ -272,8 +322,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) } } - end_offset = IS_INODE(dn.node_page) ? 
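Summing up the f2fs_sync_file() rewrite above: the FI_APPEND_WRITE / FI_UPDATE_WRITE flags set by do_write_data_page() let fsync skip work it used to do unconditionally. Stripped of the ino-list lookups and the dirty-inode-page check, the decision reduces to this sketch:

#include <stdbool.h>
#include <stdio.h>

enum fsync_action {
	SKIP_ALL,	/* nothing written since last checkpoint: goto out */
	FLUSH_ONLY,	/* in-place updates only: goto flush_out */
	WRITE_NODES,	/* appended data: write recovery info, then flush */
};

static enum fsync_action fsync_action(bool append_write, bool update_write)
{
	if (append_write)
		return WRITE_NODES;
	if (update_write)
		return FLUSH_ONLY;
	return SKIP_ALL;
}

int main(void)
{
	printf("append: %d\n", fsync_action(true, false));
	printf("ipu:    %d\n", fsync_action(false, true));
	printf("clean:  %d\n", fsync_action(false, false));
	return 0;
}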
- ADDRS_PER_INODE(F2FS_I(inode)) : ADDRS_PER_BLOCK; + end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); /* find data/hole in dnode block */ for (; dn.ofs_in_node < end_offset; @@ -315,6 +364,8 @@ static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence) maxbytes, i_size_read(inode)); case SEEK_DATA: case SEEK_HOLE: + if (offset < 0) + return -ENXIO; return f2fs_seek_block(file, offset, whence); } @@ -331,7 +382,7 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) int truncate_data_blocks_range(struct dnode_of_data *dn, int count) { int nr_free = 0, ofs = dn->ofs_in_node; - struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct f2fs_node *raw_node; __le32 *addr; @@ -380,19 +431,21 @@ static void truncate_partial_data_page(struct inode *inode, u64 from) return; lock_page(page); - if (unlikely(page->mapping != inode->i_mapping)) { - f2fs_put_page(page, 1); - return; - } + if (unlikely(!PageUptodate(page) || + page->mapping != inode->i_mapping)) + goto out; + f2fs_wait_on_page_writeback(page, DATA); zero_user(page, offset, PAGE_CACHE_SIZE - offset); set_page_dirty(page); + +out: f2fs_put_page(page, 1); } -int truncate_blocks(struct inode *inode, u64 from) +int truncate_blocks(struct inode *inode, u64 from, bool lock) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); unsigned int blocksize = inode->i_sb->s_blocksize; struct dnode_of_data dn; pgoff_t free_from; @@ -406,14 +459,16 @@ int truncate_blocks(struct inode *inode, u64 from) free_from = (pgoff_t) ((from + blocksize - 1) >> (sbi->log_blocksize)); - f2fs_lock_op(sbi); + if (lock) + f2fs_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE); if (err) { if (err == -ENOENT) goto free_next; - f2fs_unlock_op(sbi); + if (lock) + f2fs_unlock_op(sbi); trace_f2fs_truncate_blocks_exit(inode, err); return err; } @@ -421,7 +476,7 @@ int truncate_blocks(struct inode *inode, u64 from) count = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); count -= dn.ofs_in_node; - f2fs_bug_on(count < 0); + f2fs_bug_on(sbi, count < 0); if (dn.ofs_in_node || IS_INODE(dn.node_page)) { truncate_data_blocks_range(&dn, count); @@ -431,7 +486,8 @@ int truncate_blocks(struct inode *inode, u64 from) f2fs_put_dnode(&dn); free_next: err = truncate_inode_blocks(inode, free_from); - f2fs_unlock_op(sbi); + if (lock) + f2fs_unlock_op(sbi); done: /* lastly zero out the first data page */ truncate_partial_data_page(inode, from); @@ -448,7 +504,7 @@ void f2fs_truncate(struct inode *inode) trace_f2fs_truncate(inode); - if (!truncate_blocks(inode, i_size_read(inode))) { + if (!truncate_blocks(inode, i_size_read(inode), true)) { inode->i_mtime = inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); } @@ -504,15 +560,22 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (err) return err; - if ((attr->ia_valid & ATTR_SIZE) && - attr->ia_size != i_size_read(inode)) { - err = f2fs_convert_inline_data(inode, attr->ia_size); + if (attr->ia_valid & ATTR_SIZE) { + err = f2fs_convert_inline_data(inode, attr->ia_size, NULL); if (err) return err; - truncate_setsize(inode, attr->ia_size); - f2fs_truncate(inode); - f2fs_balance_fs(F2FS_SB(inode->i_sb)); + if (attr->ia_size != i_size_read(inode)) { + truncate_setsize(inode, attr->ia_size); + f2fs_truncate(inode); + f2fs_balance_fs(F2FS_I_SB(inode)); + } else { + /* + * giving a chance to truncate blocks past EOF which + * are 
fallocated with FALLOC_FL_KEEP_SIZE. + */ + f2fs_truncate(inode); + } } __setattr_copy(inode, attr); @@ -546,7 +609,7 @@ const struct inode_operations f2fs_file_inode_operations = { static void fill_zero(struct inode *inode, pgoff_t index, loff_t start, loff_t len) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *page; if (!len) @@ -595,7 +658,14 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) loff_t off_start, off_end; int ret = 0; - ret = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1); + if (!S_ISREG(inode->i_mode)) + return -EOPNOTSUPP; + + /* skip punching hole beyond i_size */ + if (offset >= inode->i_size) + return ret; + + ret = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1, NULL); if (ret) return ret; @@ -618,7 +688,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) if (pg_start < pg_end) { struct address_space *mapping = inode->i_mapping; loff_t blk_start, blk_end; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); f2fs_balance_fs(sbi); @@ -639,17 +709,19 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) static int expand_inode_data(struct inode *inode, loff_t offset, loff_t len, int mode) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); pgoff_t index, pg_start, pg_end; loff_t new_size = i_size_read(inode); loff_t off_start, off_end; int ret = 0; + f2fs_balance_fs(sbi); + ret = inode_newsize_ok(inode, (len + offset)); if (ret) return ret; - ret = f2fs_convert_inline_data(inode, offset + len); + ret = f2fs_convert_inline_data(inode, offset + len, NULL); if (ret) return ret; @@ -733,61 +805,157 @@ static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags) return flags & F2FS_OTHER_FLMASK; } -long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +static int f2fs_ioc_getflags(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_inode_info *fi = F2FS_I(inode); - unsigned int flags; + unsigned int flags = fi->i_flags & FS_FL_USER_VISIBLE; + return put_user(flags, (int __user *)arg); +} + +static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_inode_info *fi = F2FS_I(inode); + unsigned int flags = fi->i_flags & FS_FL_USER_VISIBLE; + unsigned int oldflags; int ret; - switch (cmd) { - case F2FS_IOC_GETFLAGS: - flags = fi->i_flags & FS_FL_USER_VISIBLE; - return put_user(flags, (int __user *) arg); - case F2FS_IOC_SETFLAGS: - { - unsigned int oldflags; + ret = mnt_want_write_file(filp); + if (ret) + return ret; - ret = mnt_want_write_file(filp); - if (ret) - return ret; + if (!inode_owner_or_capable(inode)) { + ret = -EACCES; + goto out; + } - if (!inode_owner_or_capable(inode)) { - ret = -EACCES; - goto out; - } + if (get_user(flags, (int __user *)arg)) { + ret = -EFAULT; + goto out; + } + + flags = f2fs_mask_flags(inode->i_mode, flags); + + mutex_lock(&inode->i_mutex); - if (get_user(flags, (int __user *) arg)) { - ret = -EFAULT; + oldflags = fi->i_flags; + + if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { + if (!capable(CAP_LINUX_IMMUTABLE)) { + mutex_unlock(&inode->i_mutex); + ret = -EPERM; goto out; } + } + + flags = flags & FS_FL_USER_MODIFIABLE; + flags |= oldflags & ~FS_FL_USER_MODIFIABLE; + fi->i_flags = flags; + mutex_unlock(&inode->i_mutex); - flags = f2fs_mask_flags(inode->i_mode, flags); + 
f2fs_set_inode_flags(inode); + inode->i_ctime = CURRENT_TIME; + mark_inode_dirty(inode); +out: + mnt_drop_write_file(filp); + return ret; +} - mutex_lock(&inode->i_mutex); +static int f2fs_ioc_start_atomic_write(struct file *filp) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - oldflags = fi->i_flags; + if (!inode_owner_or_capable(inode)) + return -EACCES; - if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { - if (!capable(CAP_LINUX_IMMUTABLE)) { - mutex_unlock(&inode->i_mutex); - ret = -EPERM; - goto out; - } - } + f2fs_balance_fs(sbi); - flags = flags & FS_FL_USER_MODIFIABLE; - flags |= oldflags & ~FS_FL_USER_MODIFIABLE; - fi->i_flags = flags; - mutex_unlock(&inode->i_mutex); + set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); - f2fs_set_inode_flags(inode); - inode->i_ctime = CURRENT_TIME; - mark_inode_dirty(inode); -out: - mnt_drop_write_file(filp); + return f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1, NULL); +} + +static int f2fs_ioc_commit_atomic_write(struct file *filp) +{ + struct inode *inode = file_inode(filp); + int ret; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + if (f2fs_is_volatile_file(inode)) + return 0; + + ret = mnt_want_write_file(filp); + if (ret) return ret; - } + + if (f2fs_is_atomic_file(inode)) + commit_inmem_pages(inode, false); + + ret = f2fs_sync_file(filp, 0, LONG_MAX, 0); + mnt_drop_write_file(filp); + return ret; +} + +static int f2fs_ioc_start_volatile_write(struct file *filp) +{ + struct inode *inode = file_inode(filp); + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); + return 0; +} + +static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct super_block *sb = inode->i_sb; + struct request_queue *q = bdev_get_queue(sb->s_bdev); + struct fstrim_range range; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + + if (copy_from_user(&range, (struct fstrim_range __user *)arg, + sizeof(range))) + return -EFAULT; + + range.minlen = max((unsigned int)range.minlen, + q->limits.discard_granularity); + ret = f2fs_trim_fs(F2FS_SB(sb), &range); + if (ret < 0) + return ret; + + if (copy_to_user((struct fstrim_range __user *)arg, &range, + sizeof(range))) + return -EFAULT; + return 0; +} + +long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case F2FS_IOC_GETFLAGS: + return f2fs_ioc_getflags(filp, arg); + case F2FS_IOC_SETFLAGS: + return f2fs_ioc_setflags(filp, arg); + case F2FS_IOC_START_ATOMIC_WRITE: + return f2fs_ioc_start_atomic_write(filp); + case F2FS_IOC_COMMIT_ATOMIC_WRITE: + return f2fs_ioc_commit_atomic_write(filp); + case F2FS_IOC_START_VOLATILE_WRITE: + return f2fs_ioc_start_volatile_write(filp); + case FITRIM: + return f2fs_ioc_fitrim(filp, arg); default: return -ENOTTY; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index b90dbe55403a..2a8f4acdb86b 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -58,7 +58,7 @@ static int gc_thread_func(void *data) * 3. IO subsystem is idle by checking the # of requests in * bdev's request list. * - * Note) We have to avoid triggering GCs too much frequently. + * Note) We have to avoid triggering GCs frequently. * Because it is possible that some segments can be * invalidated soon after by user update or deletion. * So, I'd like to wait some time to collect dirty segments. 
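For context, the trigger policy the gc_thread_func() comment above describes reduces to a few cheap checks before each background pass. Below is a minimal sketch of that decision flow, not taken from the patch itself: bg_gc_should_run(), is_idle() and the doubling wait_ms backoff are names assumed for illustration, while f2fs_cp_error() and has_enough_invalid_blocks() do appear elsewhere in this series.

/* Illustrative sketch only -- the checks are described above, but this
 * helper is not part of the patch; is_idle() and the backoff are assumed. */
static bool bg_gc_should_run(struct f2fs_sb_info *sbi, long *wait_ms)
{
	/* never run background GC while a checkpoint error is pending */
	if (f2fs_cp_error(sbi))
		return false;

	/* a pass is only worthwhile once enough blocks have been invalidated */
	if (!has_enough_invalid_blocks(sbi))
		return false;

	/*
	 * Back off while the device is busy: segments dirtied right now may
	 * be invalidated again by user updates or deletions, so waiting
	 * longer tends to collect better victims.
	 */
	if (!is_idle(sbi)) {
		*wait_ms *= 2;	/* assumed exponential backoff */
		return false;
	}
	return true;
}

In the real gc_thread_func() these checks are interleaved with its sleep loop rather than grouped in one helper; the sketch only makes the policy explicit.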
@@ -186,7 +186,6 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi, static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - unsigned int hint = 0; unsigned int secno; /* @@ -194,11 +193,9 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) * selected by background GC before. * Those segments guarantee they have small valid blocks. */ -next: - secno = find_next_bit(dirty_i->victim_secmap, TOTAL_SECS(sbi), hint++); - if (secno < TOTAL_SECS(sbi)) { + for_each_set_bit(secno, dirty_i->victim_secmap, MAIN_SECS(sbi)) { if (sec_usage_check(sbi, secno)) - goto next; + continue; clear_bit(secno, dirty_i->victim_secmap); return secno * sbi->segs_per_sec; } @@ -225,7 +222,7 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) u = (vblocks * 100) >> sbi->log_blocks_per_seg; - /* Handle if the system time is changed by user */ + /* Handle the case where the system time has been changed by the user */ if (mtime < sit_i->min_mtime) sit_i->min_mtime = mtime; if (mtime > sit_i->max_mtime) @@ -266,14 +263,14 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, unsigned int secno, max_cost; int nsearched = 0; + mutex_lock(&dirty_i->seglist_lock); + p.alloc_mode = alloc_mode; select_policy(sbi, gc_type, type, &p); p.min_segno = NULL_SEGNO; p.min_cost = max_cost = get_max_cost(sbi, &p); - mutex_lock(&dirty_i->seglist_lock); - if (p.alloc_mode == LFS && gc_type == FG_GC) { p.min_segno = check_bg_victims(sbi); if (p.min_segno != NULL_SEGNO) @@ -284,9 +281,8 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, unsigned long cost; unsigned int segno; - segno = find_next_bit(p.dirty_segmap, - TOTAL_SEGS(sbi), p.offset); - if (segno >= TOTAL_SEGS(sbi)) { + segno = find_next_bit(p.dirty_segmap, MAIN_SEGS(sbi), p.offset); + if (segno >= MAIN_SEGS(sbi)) { if (sbi->last_victim[p.gc_mode]) { sbi->last_victim[p.gc_mode] = 0; p.offset = 0; @@ -426,6 +422,12 @@ next_step: if (IS_ERR(node_page)) continue; + /* block may become invalid during get_node_page */ + if (check_valid_map(sbi, segno, off) == 0) { + f2fs_put_page(node_page, 1); + continue; + } + /* set page dirty and write it */ if (gc_type == FG_GC) { f2fs_wait_on_page_writeback(node_page, NODE); @@ -534,7 +536,7 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type) f2fs_wait_on_page_writeback(page, DATA); if (clear_page_dirty_for_io(page)) - inode_dec_dirty_dents(inode); + inode_dec_dirty_pages(inode); set_cold_data(page); do_write_data_page(page, &fio); clear_cold_data(page); @@ -596,7 +598,7 @@ next_step: if (phase == 2) { inode = f2fs_iget(sb, dni.ino); - if (IS_ERR(inode)) + if (IS_ERR(inode) || is_bad_inode(inode)) continue; start_bidx = start_bidx_of_node(nofs, F2FS_I(inode)); @@ -691,17 +693,20 @@ int f2fs_gc(struct f2fs_sb_info *sbi) int gc_type = BG_GC; int nfree = 0; int ret = -1; + struct cp_control cpc = { + .reason = CP_SYNC, + }; INIT_LIST_HEAD(&ilist); gc_more: if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) goto stop; - if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG))) + if (unlikely(f2fs_cp_error(sbi))) goto stop; if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) { gc_type = FG_GC; - write_checkpoint(sbi, false); + write_checkpoint(sbi, &cpc); } if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE)) @@ -726,7 +731,7 @@ gc_more: goto gc_more; if (gc_type == FG_GC) - write_checkpoint(sbi, false); + write_checkpoint(sbi, &cpc); stop: mutex_unlock(&sbi->gc_mutex); diff --git 
a/fs/f2fs/gc.h b/fs/f2fs/gc.h index 5d5eb6047bf4..16f0b2b22999 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -91,7 +91,7 @@ static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi) block_t invalid_user_blocks = sbi->user_block_count - written_block_count(sbi); /* - * Background GC is triggered with the following condition. + * Background GC is triggered with the following conditions. * 1. There are a number of invalid blocks. * 2. There is not enough free space. */ diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index 6eb8d269b53b..a844fcfb9a8d 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -42,7 +42,8 @@ static void TEA_transform(unsigned int buf[4], unsigned int const in[]) buf[1] += b1; } -static void str2hashbuf(const char *msg, size_t len, unsigned int *buf, int num) +static void str2hashbuf(const unsigned char *msg, size_t len, + unsigned int *buf, int num) { unsigned pad, val; int i; @@ -69,12 +70,14 @@ static void str2hashbuf(const char *msg, size_t len, unsigned int *buf, int num) *buf++ = pad; } -f2fs_hash_t f2fs_dentry_hash(const char *name, size_t len) +f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info) { __u32 hash; f2fs_hash_t f2fs_hash; - const char *p; + const unsigned char *p; __u32 in[8], buf[4]; + const unsigned char *name = name_info->name; + size_t len = name_info->len; if ((len <= 2) && (name[0] == '.') && (name[1] == '.' || name[1] == '\0')) diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 1bba5228c197..88036fd75797 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -15,11 +15,13 @@ bool f2fs_may_inline(struct inode *inode) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); block_t nr_blocks; loff_t i_size; - if (!test_opt(sbi, INLINE_DATA)) + if (!test_opt(F2FS_I_SB(inode), INLINE_DATA)) + return false; + + if (f2fs_is_atomic_file(inode)) return false; nr_blocks = F2FS_I(inode)->i_xattr_nid ? 
3 : 2; @@ -35,7 +37,6 @@ bool f2fs_may_inline(struct inode *inode) int f2fs_read_inline_data(struct inode *inode, struct page *page) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct page *ipage; void *src_addr, *dst_addr; @@ -44,7 +45,7 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page) goto out; } - ipage = get_node_page(sbi, inode->i_ino); + ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) { unlock_page(page); return PTR_ERR(ipage); @@ -68,12 +69,12 @@ out: static int __f2fs_convert_inline_data(struct inode *inode, struct page *page) { - int err; + int err = 0; struct page *ipage; struct dnode_of_data dn; void *src_addr, *dst_addr; block_t new_blk_addr; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_io_info fio = { .type = DATA, .rw = WRITE_SYNC | REQ_PRIO, @@ -86,6 +87,10 @@ static int __f2fs_convert_inline_data(struct inode *inode, struct page *page) goto out; } + /* someone else converted inline_data already */ + if (!f2fs_has_inline_data(inode)) + goto out; + /* * i_addr[0] is not used for inline data, * so reserving new block will not destroy inline data @@ -124,9 +129,10 @@ out: return err; } -int f2fs_convert_inline_data(struct inode *inode, pgoff_t to_size) +int f2fs_convert_inline_data(struct inode *inode, pgoff_t to_size, + struct page *page) { - struct page *page; + struct page *new_page = page; int err; if (!f2fs_has_inline_data(inode)) @@ -134,17 +140,20 @@ int f2fs_convert_inline_data(struct inode *inode, pgoff_t to_size) else if (to_size <= MAX_INLINE_DATA) return 0; - page = grab_cache_page(inode->i_mapping, 0); - if (!page) - return -ENOMEM; + if (!page || page->index != 0) { + new_page = grab_cache_page(inode->i_mapping, 0); + if (!new_page) + return -ENOMEM; + } - err = __f2fs_convert_inline_data(inode, page); - f2fs_put_page(page, 1); + err = __f2fs_convert_inline_data(inode, new_page); + if (!page || page->index != 0) + f2fs_put_page(new_page, 1); return err; } int f2fs_write_inline_data(struct inode *inode, - struct page *page, unsigned size) + struct page *page, unsigned size) { void *src_addr, *dst_addr; struct page *ipage; @@ -172,6 +181,7 @@ int f2fs_write_inline_data(struct inode *inode, stat_inc_inline_inode(inode); } + set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE); sync_inode_page(&dn); f2fs_put_dnode(&dn); @@ -180,13 +190,12 @@ int f2fs_write_inline_data(struct inode *inode, void truncate_inline_data(struct inode *inode, u64 from) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct page *ipage; if (from >= MAX_INLINE_DATA) return; - ipage = get_node_page(sbi, inode->i_ino); + ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) return; @@ -198,9 +207,9 @@ void truncate_inline_data(struct inode *inode, u64 from) f2fs_put_page(ipage, 1); } -int recover_inline_data(struct inode *inode, struct page *npage) +bool recover_inline_data(struct inode *inode, struct page *npage) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode *ri = NULL; void *src_addr, *dst_addr; struct page *ipage; @@ -217,10 +226,10 @@ int recover_inline_data(struct inode *inode, struct page *npage) ri = F2FS_INODE(npage); if (f2fs_has_inline_data(inode) && - ri && ri->i_inline & F2FS_INLINE_DATA) { + ri && (ri->i_inline & F2FS_INLINE_DATA)) { process_inline: ipage = get_node_page(sbi, inode->i_ino); - f2fs_bug_on(IS_ERR(ipage)); + f2fs_bug_on(sbi, IS_ERR(ipage)); 
f2fs_wait_on_page_writeback(ipage, NODE); @@ -229,22 +238,22 @@ process_inline: memcpy(dst_addr, src_addr, MAX_INLINE_DATA); update_inode(inode, ipage); f2fs_put_page(ipage, 1); - return -1; + return true; } if (f2fs_has_inline_data(inode)) { ipage = get_node_page(sbi, inode->i_ino); - f2fs_bug_on(IS_ERR(ipage)); + f2fs_bug_on(sbi, IS_ERR(ipage)); f2fs_wait_on_page_writeback(ipage, NODE); zero_user_segment(ipage, INLINE_DATA_OFFSET, INLINE_DATA_OFFSET + MAX_INLINE_DATA); clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA); update_inode(inode, ipage); f2fs_put_page(ipage, 1); - } else if (ri && ri->i_inline & F2FS_INLINE_DATA) { - truncate_blocks(inode, 0); + } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) { + truncate_blocks(inode, 0, false); set_inode_flag(F2FS_I(inode), FI_INLINE_DATA); goto process_inline; } - return 0; + return false; } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 2cf6962f6cc8..0deead4505e7 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -69,7 +69,7 @@ static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri) static int do_read_inode(struct inode *inode) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); struct page *node_page; struct f2fs_inode *ri; @@ -218,7 +218,7 @@ void update_inode(struct inode *inode, struct page *node_page) void update_inode_page(struct inode *inode) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *node_page; retry: node_page = get_node_page(sbi, inode->i_ino); @@ -238,7 +238,7 @@ retry: int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); if (inode->i_ino == F2FS_NODE_INO(sbi) || inode->i_ino == F2FS_META_INO(sbi)) @@ -266,16 +266,21 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) */ void f2fs_evict_inode(struct inode *inode) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + nid_t xnid = F2FS_I(inode)->i_xattr_nid; + + /* any remaining atomic pages should be discarded */ + if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode)) + commit_inmem_pages(inode, true); trace_f2fs_evict_inode(inode); truncate_inode_pages_final(&inode->i_data); if (inode->i_ino == F2FS_NODE_INO(sbi) || inode->i_ino == F2FS_META_INO(sbi)) - goto no_delete; + goto out_clear; - f2fs_bug_on(get_dirty_dents(inode)); + f2fs_bug_on(sbi, get_dirty_pages(inode)); remove_dirty_dir_inode(inode); if (inode->i_nlink || is_bad_inode(inode)) @@ -295,6 +300,36 @@ void f2fs_evict_inode(struct inode *inode) sb_end_intwrite(inode->i_sb); no_delete: - clear_inode(inode); invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino); + if (xnid) + invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid); + if (is_inode_flag_set(F2FS_I(inode), FI_APPEND_WRITE)) + add_dirty_inode(sbi, inode->i_ino, APPEND_INO); + if (is_inode_flag_set(F2FS_I(inode), FI_UPDATE_WRITE)) + add_dirty_inode(sbi, inode->i_ino, UPDATE_INO); +out_clear: + clear_inode(inode); +} + +/* caller should call f2fs_lock_op() */ +void handle_failed_inode(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + clear_nlink(inode); + make_bad_inode(inode); + unlock_new_inode(inode); + + i_size_write(inode, 0); + if (F2FS_HAS_BLOCKS(inode)) + f2fs_truncate(inode); + + remove_inode_page(inode); + 
stat_dec_inline_inode(inode); + + alloc_nid_failed(sbi, inode->i_ino); + f2fs_unlock_op(sbi); + + /* iput will drop the inode object */ + iput(inode); } diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index a6bdddc33ce2..0d2526e5aa11 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -13,6 +13,7 @@ #include <linux/pagemap.h> #include <linux/sched.h> #include <linux/ctype.h> +#include <linux/dcache.h> #include "f2fs.h" #include "node.h" @@ -22,14 +23,13 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) { - struct super_block *sb = dir->i_sb; - struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); nid_t ino; struct inode *inode; bool nid_free = false; int err; - inode = new_inode(sb); + inode = new_inode(dir->i_sb); if (!inode) return ERR_PTR(-ENOMEM); @@ -102,8 +102,7 @@ static inline void set_cold_files(struct f2fs_sb_info *sbi, struct inode *inode, static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { - struct super_block *sb = dir->i_sb; - struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; nid_t ino = 0; int err; @@ -124,9 +123,9 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); - f2fs_unlock_op(sbi); if (err) goto out; + f2fs_unlock_op(sbi); alloc_nid_done(sbi, ino); @@ -134,11 +133,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, unlock_new_inode(inode); return 0; out: - clear_nlink(inode); - unlock_new_inode(inode); - make_bad_inode(inode); - iput(inode); - alloc_nid_failed(sbi, ino); + handle_failed_inode(inode); return err; } @@ -146,8 +141,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct inode *inode = old_dentry->d_inode; - struct super_block *sb = dir->i_sb; - struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); int err; f2fs_balance_fs(sbi); @@ -158,15 +152,16 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, set_inode_flag(F2FS_I(inode), FI_INC_LINK); f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); - f2fs_unlock_op(sbi); if (err) goto out; + f2fs_unlock_op(sbi); d_instantiate(dentry, inode); return 0; out: clear_inode_flag(F2FS_I(inode), FI_INC_LINK); iput(inode); + f2fs_unlock_op(sbi); return err; } @@ -207,8 +202,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, static int f2fs_unlink(struct inode *dir, struct dentry *dentry) { - struct super_block *sb = dir->i_sb; - struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode = dentry->d_inode; struct f2fs_dir_entry *de; struct page *page; @@ -232,7 +226,7 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) f2fs_delete_entry(de, page, inode); f2fs_unlock_op(sbi); - /* In order to evict this inode, we set it dirty */ + /* In order to evict this inode, we set it dirty */ mark_inode_dirty(inode); fail: trace_f2fs_unlink_exit(inode, err); @@ -242,8 +236,7 @@ fail: static int f2fs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { - struct super_block *sb = dir->i_sb; - struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; size_t symlen = strlen(symname) + 1; int err; @@ -259,9 +252,9 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, f2fs_lock_op(sbi); err = 
f2fs_add_link(dentry, inode); - f2fs_unlock_op(sbi); if (err) goto out; + f2fs_unlock_op(sbi); err = page_symlink(inode, symname, symlen); alloc_nid_done(sbi, inode->i_ino); @@ -270,17 +263,13 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, unlock_new_inode(inode); return err; out: - clear_nlink(inode); - unlock_new_inode(inode); - make_bad_inode(inode); - iput(inode); - alloc_nid_failed(sbi, inode->i_ino); + handle_failed_inode(inode); return err; } static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { - struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; int err; @@ -298,9 +287,9 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) set_inode_flag(F2FS_I(inode), FI_INC_LINK); f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); - f2fs_unlock_op(sbi); if (err) goto out_fail; + f2fs_unlock_op(sbi); alloc_nid_done(sbi, inode->i_ino); @@ -311,11 +300,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) out_fail: clear_inode_flag(F2FS_I(inode), FI_INC_LINK); - clear_nlink(inode); - unlock_new_inode(inode); - make_bad_inode(inode); - iput(inode); - alloc_nid_failed(sbi, inode->i_ino); + handle_failed_inode(inode); return err; } @@ -330,8 +315,7 @@ static int f2fs_rmdir(struct inode *dir, struct dentry *dentry) static int f2fs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { - struct super_block *sb = dir->i_sb; - struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; int err = 0; @@ -349,28 +333,23 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); - f2fs_unlock_op(sbi); if (err) goto out; + f2fs_unlock_op(sbi); alloc_nid_done(sbi, inode->i_ino); d_instantiate(dentry, inode); unlock_new_inode(inode); return 0; out: - clear_nlink(inode); - unlock_new_inode(inode); - make_bad_inode(inode); - iput(inode); - alloc_nid_failed(sbi, inode->i_ino); + handle_failed_inode(inode); return err; } static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { - struct super_block *sb = old_dir->i_sb; - struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir); struct inode *old_inode = old_dentry->d_inode; struct inode *new_inode = new_dentry->d_inode; struct page *old_dir_page; @@ -393,8 +372,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_old; } - f2fs_lock_op(sbi); - if (new_inode) { err = -ENOTEMPTY; @@ -407,6 +384,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (!new_entry) goto out_dir; + f2fs_lock_op(sbi); + err = acquire_orphan_inode(sbi); if (err) goto put_out_dir; @@ -435,9 +414,13 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, update_inode_page(old_inode); update_inode_page(new_inode); } else { + f2fs_lock_op(sbi); + err = f2fs_add_link(new_dentry, old_inode); - if (err) + if (err) { + f2fs_unlock_op(sbi); goto out_dir; + } if (old_dir_entry) { inc_nlink(new_dir); @@ -472,6 +455,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, return 0; put_out_dir: + f2fs_unlock_op(sbi); kunmap(new_page); f2fs_put_page(new_page, 0); out_dir: @@ -479,7 +463,150 @@ out_dir: kunmap(old_dir_page); f2fs_put_page(old_dir_page, 0); } +out_old: + kunmap(old_page); + f2fs_put_page(old_page, 
0); +out: + return err; +} + +static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir); + struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = new_dentry->d_inode; + struct page *old_dir_page, *new_dir_page; + struct page *old_page, *new_page; + struct f2fs_dir_entry *old_dir_entry = NULL, *new_dir_entry = NULL; + struct f2fs_dir_entry *old_entry, *new_entry; + int old_nlink = 0, new_nlink = 0; + int err = -ENOENT; + + f2fs_balance_fs(sbi); + + old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); + if (!old_entry) + goto out; + + new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name, &new_page); + if (!new_entry) + goto out_old; + + /* prepare for updating ".." directory entry info later */ + if (old_dir != new_dir) { + if (S_ISDIR(old_inode->i_mode)) { + err = -EIO; + old_dir_entry = f2fs_parent_dir(old_inode, + &old_dir_page); + if (!old_dir_entry) + goto out_new; + } + + if (S_ISDIR(new_inode->i_mode)) { + err = -EIO; + new_dir_entry = f2fs_parent_dir(new_inode, + &new_dir_page); + if (!new_dir_entry) + goto out_old_dir; + } + } + + /* + * If we cross-rename a file and a directory that are not in the + * same directory, we will increase the nlink of the file's parent + * later, so we should check the upper boundary of its nlink. + */ + if ((!old_dir_entry || !new_dir_entry) && + old_dir_entry != new_dir_entry) { + old_nlink = old_dir_entry ? -1 : 1; + new_nlink = -old_nlink; + err = -EMLINK; + if ((old_nlink > 0 && old_inode->i_nlink >= F2FS_LINK_MAX) || + (new_nlink > 0 && new_inode->i_nlink >= F2FS_LINK_MAX)) + goto out_new_dir; + } + + f2fs_lock_op(sbi); + + err = update_dent_inode(old_inode, &new_dentry->d_name); + if (err) + goto out_unlock; + + err = update_dent_inode(new_inode, &old_dentry->d_name); + if (err) + goto out_undo; + + /* update ".." directory entry info of old dentry */ + if (old_dir_entry) + f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); + + /* update ".." 
directory entry info of new dentry */ + if (new_dir_entry) + f2fs_set_link(new_inode, new_dir_entry, new_dir_page, old_dir); + + /* update directory entry info of old dir inode */ + f2fs_set_link(old_dir, old_entry, old_page, new_inode); + + down_write(&F2FS_I(old_inode)->i_sem); + file_lost_pino(old_inode); + up_write(&F2FS_I(old_inode)->i_sem); + + update_inode_page(old_inode); + + old_dir->i_ctime = CURRENT_TIME; + if (old_nlink) { + down_write(&F2FS_I(old_dir)->i_sem); + if (old_nlink < 0) + drop_nlink(old_dir); + else + inc_nlink(old_dir); + up_write(&F2FS_I(old_dir)->i_sem); + } + mark_inode_dirty(old_dir); + update_inode_page(old_dir); + + /* update directory entry info of new dir inode */ + f2fs_set_link(new_dir, new_entry, new_page, old_inode); + + down_write(&F2FS_I(new_inode)->i_sem); + file_lost_pino(new_inode); + up_write(&F2FS_I(new_inode)->i_sem); + + update_inode_page(new_inode); + + new_dir->i_ctime = CURRENT_TIME; + if (new_nlink) { + down_write(&F2FS_I(new_dir)->i_sem); + if (new_nlink < 0) + drop_nlink(new_dir); + else + inc_nlink(new_dir); + up_write(&F2FS_I(new_dir)->i_sem); + } + mark_inode_dirty(new_dir); + update_inode_page(new_dir); + f2fs_unlock_op(sbi); + return 0; +out_undo: + /* Even so, we may still fail to recover the name info of f2fs_inode here */ + update_dent_inode(old_inode, &old_dentry->d_name); +out_unlock: + f2fs_unlock_op(sbi); +out_new_dir: + if (new_dir_entry) { + kunmap(new_dir_page); + f2fs_put_page(new_dir_page, 0); + } +out_old_dir: + if (old_dir_entry) { + kunmap(old_dir_page); + f2fs_put_page(old_dir_page, 0); + } +out_new: + kunmap(new_page); + f2fs_put_page(new_page, 0); out_old: kunmap(old_page); f2fs_put_page(old_page, 0); @@ -487,6 +614,66 @@ out: return err; } +static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + return -EINVAL; + + if (flags & RENAME_EXCHANGE) { + return f2fs_cross_rename(old_dir, old_dentry, + new_dir, new_dentry); + } + /* + * VFS has already handled the case where the new dentry exists, + * so here we just deal with "RENAME_NOREPLACE" as a regular rename. + */ + return f2fs_rename(old_dir, old_dentry, new_dir, new_dentry); +} + +static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct inode *inode; + int err; + + inode = f2fs_new_inode(dir, mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &f2fs_file_inode_operations; + inode->i_fop = &f2fs_file_operations; + inode->i_mapping->a_ops = &f2fs_dblock_aops; + + f2fs_lock_op(sbi); + err = acquire_orphan_inode(sbi); + if (err) + goto out; + + err = f2fs_do_tmpfile(inode, dir); + if (err) + goto release_out; + + /* + * add this non-linked tmpfile to the orphan list, so that we can + * remove all unused data of the tmpfile after an abnormal power-off. 
+ */ + add_orphan_inode(sbi, inode->i_ino); + f2fs_unlock_op(sbi); + + alloc_nid_done(sbi, inode->i_ino); + d_tmpfile(dentry, inode); + unlock_new_inode(inode); + return 0; + +release_out: + release_orphan_inode(sbi); +out: + handle_failed_inode(inode); + return err; +} + const struct inode_operations f2fs_dir_inode_operations = { .create = f2fs_create, .lookup = f2fs_lookup, @@ -496,7 +683,8 @@ const struct inode_operations f2fs_dir_inode_operations = { .mkdir = f2fs_mkdir, .rmdir = f2fs_rmdir, .mknod = f2fs_mknod, - .rename = f2fs_rename, + .rename2 = f2fs_rename2, + .tmpfile = f2fs_tmpfile, .getattr = f2fs_getattr, .setattr = f2fs_setattr, .get_acl = f2fs_get_acl, diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 4b697ccc9b0c..44b8afef43d9 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -25,6 +25,7 @@ static struct kmem_cache *nat_entry_slab; static struct kmem_cache *free_nid_slab; +static struct kmem_cache *nat_entry_set_slab; bool available_free_memory(struct f2fs_sb_info *sbi, int type) { @@ -53,7 +54,6 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) static void clear_node_page_dirty(struct page *page) { struct address_space *mapping = page->mapping; - struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); unsigned int long flags; if (PageDirty(page)) { @@ -64,7 +64,7 @@ static void clear_node_page_dirty(struct page *page) spin_unlock_irqrestore(&mapping->tree_lock, flags); clear_page_dirty_for_io(page); - dec_page_count(sbi, F2FS_DIRTY_NODES); + dec_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES); } ClearPageUptodate(page); } @@ -90,12 +90,8 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) /* get current nat block page with lock */ src_page = get_meta_page(sbi, src_off); - - /* Dirty src_page means that it is already the new target NAT page. 
*/ - if (PageDirty(src_page)) - return src_page; - dst_page = grab_meta_page(sbi, dst_off); + f2fs_bug_on(sbi, PageDirty(src_page)); src_addr = page_address(src_page); dst_addr = page_address(dst_page); @@ -127,44 +123,99 @@ static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e) kmem_cache_free(nat_entry_slab, e); } -int is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) +static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, + struct nat_entry *ne) +{ + nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid); + struct nat_entry_set *head; + + if (get_nat_flag(ne, IS_DIRTY)) + return; +retry: + head = radix_tree_lookup(&nm_i->nat_set_root, set); + if (!head) { + head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC); + + INIT_LIST_HEAD(&head->entry_list); + INIT_LIST_HEAD(&head->set_list); + head->set = set; + head->entry_cnt = 0; + + if (radix_tree_insert(&nm_i->nat_set_root, set, head)) { + cond_resched(); + goto retry; + } + } + list_move_tail(&ne->list, &head->entry_list); + nm_i->dirty_nat_cnt++; + head->entry_cnt++; + set_nat_flag(ne, IS_DIRTY, true); +} + +static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i, + struct nat_entry *ne) +{ + nid_t set = ne->ni.nid / NAT_ENTRY_PER_BLOCK; + struct nat_entry_set *head; + + head = radix_tree_lookup(&nm_i->nat_set_root, set); + if (head) { + list_move_tail(&ne->list, &nm_i->nat_entries); + set_nat_flag(ne, IS_DIRTY, false); + head->entry_cnt--; + nm_i->dirty_nat_cnt--; + } +} + +static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i, + nid_t start, unsigned int nr, struct nat_entry_set **ep) +{ + return radix_tree_gang_lookup(&nm_i->nat_set_root, (void **)ep, + start, nr); +} + +bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; - int is_cp = 1; + bool is_cp = true; read_lock(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); - if (e && !e->checkpointed) - is_cp = 0; + if (e && !get_nat_flag(e, IS_CHECKPOINTED)) + is_cp = false; read_unlock(&nm_i->nat_tree_lock); return is_cp; } -bool fsync_mark_done(struct f2fs_sb_info *sbi, nid_t nid) +bool has_fsynced_inode(struct f2fs_sb_info *sbi, nid_t ino) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; - bool fsync_done = false; + bool fsynced = false; read_lock(&nm_i->nat_tree_lock); - e = __lookup_nat_cache(nm_i, nid); - if (e) - fsync_done = e->fsync_done; + e = __lookup_nat_cache(nm_i, ino); + if (e && get_nat_flag(e, HAS_FSYNCED_INODE)) + fsynced = true; read_unlock(&nm_i->nat_tree_lock); - return fsync_done; + return fsynced; } -void fsync_mark_clear(struct f2fs_sb_info *sbi, nid_t nid) +bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; + bool need_update = true; - write_lock(&nm_i->nat_tree_lock); - e = __lookup_nat_cache(nm_i, nid); - if (e) - e->fsync_done = false; - write_unlock(&nm_i->nat_tree_lock); + read_lock(&nm_i->nat_tree_lock); + e = __lookup_nat_cache(nm_i, ino); + if (e && get_nat_flag(e, HAS_LAST_FSYNC) && + (get_nat_flag(e, IS_CHECKPOINTED) || + get_nat_flag(e, HAS_FSYNCED_INODE))) + need_update = false; + read_unlock(&nm_i->nat_tree_lock); + return need_update; } static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid) @@ -180,7 +231,7 @@ static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid) } memset(new, 0, sizeof(struct nat_entry)); nat_set_nid(new, nid); - new->checkpointed = true; + nat_reset_flag(new); 
list_add_tail(&new->list, &nm_i->nat_entries); nm_i->nat_cnt++; return new; @@ -219,7 +270,7 @@ retry: goto retry; } e->ni = *ni; - f2fs_bug_on(ni->blk_addr == NEW_ADDR); + f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR); } else if (new_blkaddr == NEW_ADDR) { /* * when nid is reallocated, @@ -227,20 +278,20 @@ retry: * So, reinitialize it with new information. */ e->ni = *ni; - f2fs_bug_on(ni->blk_addr != NULL_ADDR); + f2fs_bug_on(sbi, ni->blk_addr != NULL_ADDR); } /* sanity check */ - f2fs_bug_on(nat_get_blkaddr(e) != ni->blk_addr); - f2fs_bug_on(nat_get_blkaddr(e) == NULL_ADDR && + f2fs_bug_on(sbi, nat_get_blkaddr(e) != ni->blk_addr); + f2fs_bug_on(sbi, nat_get_blkaddr(e) == NULL_ADDR && new_blkaddr == NULL_ADDR); - f2fs_bug_on(nat_get_blkaddr(e) == NEW_ADDR && + f2fs_bug_on(sbi, nat_get_blkaddr(e) == NEW_ADDR && new_blkaddr == NEW_ADDR); - f2fs_bug_on(nat_get_blkaddr(e) != NEW_ADDR && + f2fs_bug_on(sbi, nat_get_blkaddr(e) != NEW_ADDR && nat_get_blkaddr(e) != NULL_ADDR && new_blkaddr == NEW_ADDR); - /* increament version no as node is removed */ + /* increment version no as node is removed */ if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) { unsigned char version = nat_get_version(e); nat_set_version(e, inc_node_version(version)); @@ -248,12 +299,17 @@ retry: /* change address */ nat_set_blkaddr(e, new_blkaddr); + if (new_blkaddr == NEW_ADDR || new_blkaddr == NULL_ADDR) + set_nat_flag(e, IS_CHECKPOINTED, false); __set_nat_cache_dirty(nm_i, e); /* update fsync_mark if its inode nat entry is still alive */ e = __lookup_nat_cache(nm_i, ni->ino); - if (e) - e->fsync_done = fsync_done; + if (e) { + if (fsync_done && ni->nid == ni->ino) + set_nat_flag(e, HAS_FSYNCED_INODE, true); + set_nat_flag(e, HAS_LAST_FSYNC, fsync_done); + } write_unlock(&nm_i->nat_tree_lock); } @@ -277,7 +333,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) } /* - * This function returns always success + * This function always returns success */ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) { @@ -414,7 +470,7 @@ got: */ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) { - struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct page *npage[4]; struct page *parent; int offset[4]; @@ -507,15 +563,15 @@ release_out: static void truncate_node(struct dnode_of_data *dn) { - struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct node_info ni; get_node_info(sbi, dn->nid, &ni); if (dn->inode->i_blocks == 0) { - f2fs_bug_on(ni.blk_addr != NULL_ADDR); + f2fs_bug_on(sbi, ni.blk_addr != NULL_ADDR); goto invalidate; } - f2fs_bug_on(ni.blk_addr == NULL_ADDR); + f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR); /* Deallocate node address */ invalidate_blocks(sbi, ni.blk_addr); @@ -543,14 +599,13 @@ invalidate: static int truncate_dnode(struct dnode_of_data *dn) { - struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); struct page *page; if (dn->nid == 0) return 1; /* get direct node */ - page = get_node_page(sbi, dn->nid); + page = get_node_page(F2FS_I_SB(dn->inode), dn->nid); if (IS_ERR(page) && PTR_ERR(page) == -ENOENT) return 1; else if (IS_ERR(page)) @@ -567,7 +622,6 @@ static int truncate_dnode(struct dnode_of_data *dn) static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, int ofs, int depth) { - struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); struct dnode_of_data rdn = *dn; struct page *page; struct f2fs_node *rn; @@ -581,7 
+635,7 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, trace_f2fs_truncate_nodes_enter(dn->inode, dn->nid, dn->data_blkaddr); - page = get_node_page(sbi, dn->nid); + page = get_node_page(F2FS_I_SB(dn->inode), dn->nid); if (IS_ERR(page)) { trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(page)); return PTR_ERR(page); @@ -639,7 +693,6 @@ out_err: static int truncate_partial_nodes(struct dnode_of_data *dn, struct f2fs_inode *ri, int *offset, int depth) { - struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); struct page *pages[2]; nid_t nid[3]; nid_t child_nid; @@ -653,8 +706,8 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, /* get indirect nodes in the path */ for (i = 0; i < idx + 1; i++) { - /* refernece count'll be increased */ - pages[i] = get_node_page(sbi, nid[i]); + /* reference count'll be increased */ + pages[i] = get_node_page(F2FS_I_SB(dn->inode), nid[i]); if (IS_ERR(pages[i])) { err = PTR_ERR(pages[i]); idx = i - 1; @@ -699,7 +752,7 @@ fail: */ int truncate_inode_blocks(struct inode *inode, pgoff_t from) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int err = 0, cont = 1; int level, offset[4], noffset[4]; unsigned int nofs = 0; @@ -795,7 +848,7 @@ fail: int truncate_xattr_node(struct inode *inode, struct page *page) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t nid = F2FS_I(inode)->i_xattr_nid; struct dnode_of_data dn; struct page *npage; @@ -826,26 +879,31 @@ int truncate_xattr_node(struct inode *inode, struct page *page) */ void remove_inode_page(struct inode *inode) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct page *page; - nid_t ino = inode->i_ino; struct dnode_of_data dn; - page = get_node_page(sbi, ino); - if (IS_ERR(page)) + set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino); + if (get_dnode_of_data(&dn, 0, LOOKUP_NODE)) return; - if (truncate_xattr_node(inode, page)) { - f2fs_put_page(page, 1); + if (truncate_xattr_node(inode, dn.inode_page)) { + f2fs_put_dnode(&dn); return; } - /* 0 is possible, after f2fs_new_inode() is failed */ - f2fs_bug_on(inode->i_blocks != 0 && inode->i_blocks != 1); - set_new_dnode(&dn, inode, page, page, ino); + + /* remove potential inline_data blocks */ + if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) + truncate_data_blocks_range(&dn, 1); + + /* 0 is possible, after f2fs_new_inode() has failed */ + f2fs_bug_on(F2FS_I_SB(inode), + inode->i_blocks != 0 && inode->i_blocks != 1); + + /* will put inode & node pages */ truncate_node(&dn); } -struct page *new_inode_page(struct inode *inode, const struct qstr *name) +struct page *new_inode_page(struct inode *inode) { struct dnode_of_data dn; @@ -859,7 +917,7 @@ struct page *new_inode_page(struct inode *inode, const struct qstr *name) struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs, struct page *ipage) { - struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct node_info old_ni, new_ni; struct page *page; int err; @@ -879,7 +937,7 @@ struct page *new_node_page(struct dnode_of_data *dn, get_node_info(sbi, dn->nid, &old_ni); /* Reinitialize old_ni with new node page */ - f2fs_bug_on(old_ni.blk_addr != NULL_ADDR); + f2fs_bug_on(sbi, old_ni.blk_addr != NULL_ADDR); new_ni = old_ni; new_ni.ino = dn->inode->i_ino; set_node_addr(sbi, &new_ni, NEW_ADDR, false); @@ -917,7 +975,7 @@ fail: */ static int read_node_page(struct page 
*page, int rw) { - struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); + struct f2fs_sb_info *sbi = F2FS_P_SB(page); struct node_info ni; get_node_info(sbi, page->index, &ni); @@ -993,7 +1051,7 @@ got_it: */ struct page *get_node_page_ra(struct page *parent, int start) { - struct f2fs_sb_info *sbi = F2FS_SB(parent->mapping->host->i_sb); + struct f2fs_sb_info *sbi = F2FS_P_SB(parent); struct blk_plug plug; struct page *page; int err, i, end; @@ -1123,17 +1181,24 @@ continue_unlock: /* called by fsync() */ if (ino && IS_DNODE(page)) { - int mark = !is_checkpointed_node(sbi, ino); set_fsync_mark(page, 1); - if (IS_INODE(page)) - set_dentry_mark(page, mark); + if (IS_INODE(page)) { + if (!is_checkpointed_node(sbi, ino) && + !has_fsynced_inode(sbi, ino)) + set_dentry_mark(page, 1); + else + set_dentry_mark(page, 0); + } nwritten++; } else { set_fsync_mark(page, 0); set_dentry_mark(page, 0); } - NODE_MAPPING(sbi)->a_ops->writepage(page, wbc); - wrote++; + + if (NODE_MAPPING(sbi)->a_ops->writepage(page, wbc)) + unlock_page(page); + else + wrote++; if (--wbc->nr_to_write == 0) break; @@ -1202,7 +1267,7 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) static int f2fs_write_node_page(struct page *page, struct writeback_control *wbc) { - struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); + struct f2fs_sb_info *sbi = F2FS_P_SB(page); nid_t nid; block_t new_addr; struct node_info ni; @@ -1215,12 +1280,14 @@ static int f2fs_write_node_page(struct page *page, if (unlikely(sbi->por_doing)) goto redirty_out; + if (unlikely(f2fs_cp_error(sbi))) + goto redirty_out; f2fs_wait_on_page_writeback(page, NODE); /* get old block addr of this node page */ nid = nid_of_node(page); - f2fs_bug_on(page->index != nid); + f2fs_bug_on(sbi, page->index != nid); get_node_info(sbi, nid, &ni); @@ -1234,12 +1301,12 @@ static int f2fs_write_node_page(struct page *page, if (wbc->for_reclaim) goto redirty_out; - mutex_lock(&sbi->node_write); + down_read(&sbi->node_write); set_page_writeback(page); write_node_page(sbi, page, &fio, nid, ni.blk_addr, &new_addr); set_node_addr(sbi, &ni, new_addr, is_fsync_dnode(page)); dec_page_count(sbi, F2FS_DIRTY_NODES); - mutex_unlock(&sbi->node_write); + up_read(&sbi->node_write); unlock_page(page); return 0; @@ -1251,7 +1318,7 @@ redirty_out: static int f2fs_write_node_pages(struct address_space *mapping, struct writeback_control *wbc) { - struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); + struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); long diff; trace_f2fs_writepages(mapping->host, wbc, NODE); @@ -1276,15 +1343,12 @@ skip_write: static int f2fs_set_node_page_dirty(struct page *page) { - struct address_space *mapping = page->mapping; - struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); - trace_f2fs_set_page_dirty(page, NODE); SetPageUptodate(page); if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); - inc_page_count(sbi, F2FS_DIRTY_NODES); + inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); SetPagePrivate(page); return 1; } @@ -1295,9 +1359,8 @@ static void f2fs_invalidate_node_page(struct page *page, unsigned int offset, unsigned int length) { struct inode *inode = page->mapping->host; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); if (PageDirty(page)) - dec_page_count(sbi, F2FS_DIRTY_NODES); + dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_NODES); ClearPagePrivate(page); } @@ -1350,7 +1413,8 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) read_lock(&nm_i->nat_tree_lock); ne = 
__lookup_nat_cache(nm_i, nid); if (ne && - (!ne->checkpointed || nat_get_blkaddr(ne) != NULL_ADDR)) + (!get_nat_flag(ne, IS_CHECKPOINTED) || + nat_get_blkaddr(ne) != NULL_ADDR)) allocated = true; read_unlock(&nm_i->nat_tree_lock); if (allocated) @@ -1407,7 +1471,7 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, break; blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); - f2fs_bug_on(blk_addr == NEW_ADDR); + f2fs_bug_on(sbi, blk_addr == NEW_ADDR); if (blk_addr == NULL_ADDR) { if (add_free_nid(sbi, start_nid, true) < 0) break; @@ -1477,12 +1541,12 @@ retry: /* We should not use stale free nids created by build_free_nids */ if (nm_i->fcnt && !on_build_free_nids(nm_i)) { - f2fs_bug_on(list_empty(&nm_i->free_nid_list)); + f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list)); list_for_each_entry(i, &nm_i->free_nid_list, list) if (i->state == NID_NEW) break; - f2fs_bug_on(i->state != NID_NEW); + f2fs_bug_on(sbi, i->state != NID_NEW); *nid = i->nid; i->state = NID_ALLOC; nm_i->fcnt--; @@ -1508,7 +1572,7 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) spin_lock(&nm_i->free_nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); - f2fs_bug_on(!i || i->state != NID_ALLOC); + f2fs_bug_on(sbi, !i || i->state != NID_ALLOC); __del_from_free_nid_list(nm_i, i); spin_unlock(&nm_i->free_nid_list_lock); @@ -1529,7 +1593,7 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) spin_lock(&nm_i->free_nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); - f2fs_bug_on(!i || i->state != NID_ALLOC); + f2fs_bug_on(sbi, !i || i->state != NID_ALLOC); if (!available_free_memory(sbi, FREE_NIDS)) { __del_from_free_nid_list(nm_i, i); need_free = true; @@ -1543,35 +1607,21 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) kmem_cache_free(free_nid_slab, i); } -void recover_node_page(struct f2fs_sb_info *sbi, struct page *page, - struct f2fs_summary *sum, struct node_info *ni, - block_t new_blkaddr) -{ - rewrite_node_page(sbi, page, sum, ni->blk_addr, new_blkaddr); - set_node_addr(sbi, ni, new_blkaddr, false); - clear_node_page_dirty(page); -} - -static void recover_inline_xattr(struct inode *inode, struct page *page) +void recover_inline_xattr(struct inode *inode, struct page *page) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); void *src_addr, *dst_addr; size_t inline_size; struct page *ipage; struct f2fs_inode *ri; - if (!f2fs_has_inline_xattr(inode)) - return; - - if (!IS_INODE(page)) - return; + ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino); + f2fs_bug_on(F2FS_I_SB(inode), IS_ERR(ipage)); ri = F2FS_INODE(page); - if (!(ri->i_inline & F2FS_INLINE_XATTR)) - return; - - ipage = get_node_page(sbi, inode->i_ino); - f2fs_bug_on(IS_ERR(ipage)); + if (!(ri->i_inline & F2FS_INLINE_XATTR)) { + clear_inode_flag(F2FS_I(inode), FI_INLINE_XATTR); + goto update_inode; + } dst_addr = inline_xattr_addr(ipage); src_addr = inline_xattr_addr(page); @@ -1579,30 +1629,25 @@ static void recover_inline_xattr(struct inode *inode, struct page *page) f2fs_wait_on_page_writeback(ipage, NODE); memcpy(dst_addr, src_addr, inline_size); - +update_inode: update_inode(inode, ipage); f2fs_put_page(ipage, 1); } -bool recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) +void recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid; nid_t new_xnid = nid_of_node(page); struct node_info ni; - recover_inline_xattr(inode, 
page); - - if (!f2fs_has_xattr_block(ofs_of_node(page))) - return false; - /* 1: invalidate the previous xattr nid */ if (!prev_xnid) goto recover_xnid; /* Deallocate node address */ get_node_info(sbi, prev_xnid, &ni); - f2fs_bug_on(ni.blk_addr == NULL_ADDR); + f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR); invalidate_blocks(sbi, ni.blk_addr); dec_valid_node_count(sbi, inode); set_node_addr(sbi, &ni, NULL_ADDR, false); @@ -1610,7 +1655,7 @@ bool recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) recover_xnid: /* 2: allocate new xattr nid */ if (unlikely(!inc_valid_node_count(sbi, inode))) - f2fs_bug_on(1); + f2fs_bug_on(sbi, 1); remove_free_nid(NM_I(sbi), new_xnid); get_node_info(sbi, new_xnid, &ni); @@ -1623,7 +1668,6 @@ recover_xnid: set_node_addr(sbi, &ni, blkaddr, false); update_inode_page(inode); - return true; } int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) @@ -1642,7 +1686,7 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) if (!ipage) return -ENOMEM; - /* Should not use this inode from free nid list */ + /* Should not use this inode from free nid list */ remove_free_nid(NM_I(sbi), ino); SetPageUptodate(ipage); @@ -1656,6 +1700,7 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) dst->i_blocks = cpu_to_le64(1); dst->i_links = cpu_to_le32(1); dst->i_xattr_nid = 0; + dst->i_inline = src->i_inline & F2FS_INLINE_XATTR; new_ni = old_ni; new_ni.ino = ino; @@ -1664,13 +1709,14 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) WARN_ON(1); set_node_addr(sbi, &new_ni, NEW_ADDR, false); inc_valid_inode_count(sbi); + set_page_dirty(ipage); f2fs_put_page(ipage, 1); return 0; } /* * ra_sum_pages() merge contiguous pages into one bio and submit. - * these pre-readed pages are alloced in bd_inode's mapping tree. + * these pre-read pages are allocated in bd_inode's mapping tree. 
*/ static int ra_sum_pages(struct f2fs_sb_info *sbi, struct page **pages, int start, int nrpages) @@ -1702,7 +1748,7 @@ int restore_node_summary(struct f2fs_sb_info *sbi, struct f2fs_summary *sum_entry; struct inode *inode = sbi->sb->s_bdev->bd_inode; block_t addr; - int bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); + int bio_blocks = MAX_BIO_BLOCKS(sbi); struct page *pages[bio_blocks]; int i, idx, last_offset, nrpages, err = 0; @@ -1714,7 +1760,7 @@ int restore_node_summary(struct f2fs_sb_info *sbi, for (i = 0; !err && i < last_offset; i += nrpages, addr += nrpages) { nrpages = min(last_offset - i, bio_blocks); - /* read ahead node pages */ + /* readahead node pages */ nrpages = ra_sum_pages(sbi, pages, addr, nrpages); if (!nrpages) return -ENOMEM; @@ -1744,7 +1790,7 @@ skip: return err; } -static bool flush_nats_in_journal(struct f2fs_sb_info *sbi) +static void remove_nats_in_journal(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -1752,12 +1798,6 @@ static bool flush_nats_in_journal(struct f2fs_sb_info *sbi) int i; mutex_lock(&curseg->curseg_mutex); - - if (nats_in_cursum(sum) < NAT_JOURNAL_ENTRIES) { - mutex_unlock(&curseg->curseg_mutex); - return false; - } - for (i = 0; i < nats_in_cursum(sum); i++) { struct nat_entry *ne; struct f2fs_nat_entry raw_ne; @@ -1767,107 +1807,147 @@ static bool flush_nats_in_journal(struct f2fs_sb_info *sbi) retry: write_lock(&nm_i->nat_tree_lock); ne = __lookup_nat_cache(nm_i, nid); - if (ne) { - __set_nat_cache_dirty(nm_i, ne); - write_unlock(&nm_i->nat_tree_lock); - continue; - } + if (ne) + goto found; + ne = grab_nat_entry(nm_i, nid); if (!ne) { write_unlock(&nm_i->nat_tree_lock); goto retry; } node_info_from_raw_nat(&ne->ni, &raw_ne); +found: __set_nat_cache_dirty(nm_i, ne); write_unlock(&nm_i->nat_tree_lock); } update_nats_in_cursum(sum, -i); mutex_unlock(&curseg->curseg_mutex); - return true; } -/* - * This function is called during the checkpointing process. - */ -void flush_nat_entries(struct f2fs_sb_info *sbi) +static void __adjust_nat_entry_set(struct nat_entry_set *nes, + struct list_head *head, int max) +{ + struct nat_entry_set *cur; + + if (nes->entry_cnt >= max) + goto add_out; + + list_for_each_entry(cur, head, set_list) { + if (cur->entry_cnt >= nes->entry_cnt) { + list_add(&nes->set_list, cur->set_list.prev); + return; + } + } +add_out: + list_add_tail(&nes->set_list, head); +} + +static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, + struct nat_entry_set *set) { - struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); struct f2fs_summary_block *sum = curseg->sum_blk; + nid_t start_nid = set->set * NAT_ENTRY_PER_BLOCK; + bool to_journal = true; + struct f2fs_nat_block *nat_blk; struct nat_entry *ne, *cur; struct page *page = NULL; - struct f2fs_nat_block *nat_blk = NULL; - nid_t start_nid = 0, end_nid = 0; - bool flushed; - flushed = flush_nats_in_journal(sbi); + /* + * there are two steps to flush nat entries: + * #1, flush nat entries to journal in current hot data summary block. + * #2, flush nat entries to nat page. 
+ */ + if (!__has_cursum_space(sum, set->entry_cnt, NAT_JOURNAL)) + to_journal = false; - if (!flushed) + if (to_journal) { mutex_lock(&curseg->curseg_mutex); + } else { + page = get_next_nat_page(sbi, start_nid); + nat_blk = page_address(page); + f2fs_bug_on(sbi, !nat_blk); + } - /* 1) flush dirty nat caches */ - list_for_each_entry_safe(ne, cur, &nm_i->dirty_nat_entries, list) { - nid_t nid; - struct f2fs_nat_entry raw_ne; - int offset = -1; + /* flush dirty nats in nat entry set */ + list_for_each_entry_safe(ne, cur, &set->entry_list, list) { + struct f2fs_nat_entry *raw_ne; + nid_t nid = nat_get_nid(ne); + int offset; if (nat_get_blkaddr(ne) == NEW_ADDR) continue; - nid = nat_get_nid(ne); + if (to_journal) { + offset = lookup_journal_in_cursum(sum, + NAT_JOURNAL, nid, 1); + f2fs_bug_on(sbi, offset < 0); + raw_ne = &nat_in_journal(sum, offset); + nid_in_journal(sum, offset) = cpu_to_le32(nid); + } else { + raw_ne = &nat_blk->entries[nid - start_nid]; + } + raw_nat_from_node_info(raw_ne, &ne->ni); - if (flushed) - goto to_nat_page; + write_lock(&NM_I(sbi)->nat_tree_lock); + nat_reset_flag(ne); + __clear_nat_cache_dirty(NM_I(sbi), ne); + write_unlock(&NM_I(sbi)->nat_tree_lock); - /* if there is room for nat enries in curseg->sumpage */ - offset = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 1); - if (offset >= 0) { - raw_ne = nat_in_journal(sum, offset); - goto flush_now; - } -to_nat_page: - if (!page || (start_nid > nid || nid > end_nid)) { - if (page) { - f2fs_put_page(page, 1); - page = NULL; - } - start_nid = START_NID(nid); - end_nid = start_nid + NAT_ENTRY_PER_BLOCK - 1; + if (nat_get_blkaddr(ne) == NULL_ADDR) + add_free_nid(sbi, nid, false); + } - /* - * get nat block with dirty flag, increased reference - * count, mapped and lock - */ - page = get_next_nat_page(sbi, start_nid); - nat_blk = page_address(page); - } + if (to_journal) + mutex_unlock(&curseg->curseg_mutex); + else + f2fs_put_page(page, 1); - f2fs_bug_on(!nat_blk); - raw_ne = nat_blk->entries[nid - start_nid]; -flush_now: - raw_nat_from_node_info(&raw_ne, &ne->ni); + if (!set->entry_cnt) { + radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); + kmem_cache_free(nat_entry_set_slab, set); + } +} - if (offset < 0) { - nat_blk->entries[nid - start_nid] = raw_ne; - } else { - nat_in_journal(sum, offset) = raw_ne; - nid_in_journal(sum, offset) = cpu_to_le32(nid); - } +/* + * This function is called during the checkpointing process. + */ +void flush_nat_entries(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); + struct f2fs_summary_block *sum = curseg->sum_blk; + struct nat_entry_set *setvec[NATVEC_SIZE]; + struct nat_entry_set *set, *tmp; + unsigned int found; + nid_t set_idx = 0; + LIST_HEAD(sets); + + /* + * if there is not enough space in the journal to store dirty nat + * entries, remove all entries from the journal and merge them + * into the nat entry set. 
+ */ + if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL)) + remove_nats_in_journal(sbi); + + if (!nm_i->dirty_nat_cnt) + return; - if (nat_get_blkaddr(ne) == NULL_ADDR && - add_free_nid(sbi, nid, false) <= 0) { - write_lock(&nm_i->nat_tree_lock); - __del_from_nat_cache(nm_i, ne); - write_unlock(&nm_i->nat_tree_lock); - } else { - write_lock(&nm_i->nat_tree_lock); - __clear_nat_cache_dirty(nm_i, ne); - write_unlock(&nm_i->nat_tree_lock); - } + while ((found = __gang_lookup_nat_set(nm_i, + set_idx, NATVEC_SIZE, setvec))) { + unsigned idx; + set_idx = setvec[found - 1]->set + 1; + for (idx = 0; idx < found; idx++) + __adjust_nat_entry_set(setvec[idx], &sets, + MAX_NAT_JENTRIES(sum)); } - if (!flushed) - mutex_unlock(&curseg->curseg_mutex); - f2fs_put_page(page, 1); + + /* flush dirty nats in nat entry set */ + list_for_each_entry_safe(set, tmp, &sets, set_list) + __flush_nat_entry_set(sbi, set); + + f2fs_bug_on(sbi, nm_i->dirty_nat_cnt); } static int init_node_manager(struct f2fs_sb_info *sbi) @@ -1886,7 +1966,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi) nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks; /* not used nids: 0, node, meta, (and root counted as valid node) */ - nm_i->available_nids = nm_i->max_nid - 3; + nm_i->available_nids = nm_i->max_nid - F2FS_RESERVED_NODE_NUM; nm_i->fcnt = 0; nm_i->nat_cnt = 0; nm_i->ram_thresh = DEF_RAM_THRESHOLD; @@ -1894,8 +1974,8 @@ static int init_node_manager(struct f2fs_sb_info *sbi) INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); INIT_LIST_HEAD(&nm_i->free_nid_list); INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC); + INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_ATOMIC); INIT_LIST_HEAD(&nm_i->nat_entries); - INIT_LIST_HEAD(&nm_i->dirty_nat_entries); mutex_init(&nm_i->build_lock); spin_lock_init(&nm_i->free_nid_list_lock); @@ -1944,14 +2024,14 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) /* destroy free nid list */ spin_lock(&nm_i->free_nid_list_lock); list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) { - f2fs_bug_on(i->state == NID_ALLOC); + f2fs_bug_on(sbi, i->state == NID_ALLOC); __del_from_free_nid_list(nm_i, i); nm_i->fcnt--; spin_unlock(&nm_i->free_nid_list_lock); kmem_cache_free(free_nid_slab, i); spin_lock(&nm_i->free_nid_list_lock); } - f2fs_bug_on(nm_i->fcnt); + f2fs_bug_on(sbi, nm_i->fcnt); spin_unlock(&nm_i->free_nid_list_lock); /* destroy nat cache */ @@ -1963,7 +2043,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) for (idx = 0; idx < found; idx++) __del_from_nat_cache(nm_i, natvec[idx]); } - f2fs_bug_on(nm_i->nat_cnt); + f2fs_bug_on(sbi, nm_i->nat_cnt); write_unlock(&nm_i->nat_tree_lock); kfree(nm_i->nat_bitmap); @@ -1976,19 +2056,30 @@ int __init create_node_manager_caches(void) nat_entry_slab = f2fs_kmem_cache_create("nat_entry", sizeof(struct nat_entry)); if (!nat_entry_slab) - return -ENOMEM; + goto fail; free_nid_slab = f2fs_kmem_cache_create("free_nid", sizeof(struct free_nid)); - if (!free_nid_slab) { - kmem_cache_destroy(nat_entry_slab); - return -ENOMEM; - } + if (!free_nid_slab) + goto destory_nat_entry; + + nat_entry_set_slab = f2fs_kmem_cache_create("nat_entry_set", + sizeof(struct nat_entry_set)); + if (!nat_entry_set_slab) + goto destory_free_nid; return 0; + +destory_free_nid: + kmem_cache_destroy(free_nid_slab); +destory_nat_entry: + kmem_cache_destroy(nat_entry_slab); +fail: + return -ENOMEM; } void destroy_node_manager_caches(void) { + kmem_cache_destroy(nat_entry_set_slab); kmem_cache_destroy(free_nid_slab); kmem_cache_destroy(nat_entry_slab); } diff --git 
a/fs/f2fs/node.h b/fs/f2fs/node.h index 7281112cd1c8..8d5e6e0dd840 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -39,10 +39,16 @@ struct node_info { unsigned char version; /* version of the node */ }; +enum { + IS_CHECKPOINTED, /* is it checkpointed before? */ + HAS_FSYNCED_INODE, /* is the inode fsynced before? */ + HAS_LAST_FSYNC, /* has the latest node fsync mark? */ + IS_DIRTY, /* this nat entry is dirty? */ +}; + struct nat_entry { struct list_head list; /* for clean or dirty nat list */ - bool checkpointed; /* whether it is checkpointed or not */ - bool fsync_done; /* whether the latest node has fsync mark */ + unsigned char flag; /* for node information bits */ struct node_info ni; /* in-memory node information */ }; @@ -55,18 +61,32 @@ struct nat_entry { #define nat_get_version(nat) (nat->ni.version) #define nat_set_version(nat, v) (nat->ni.version = v) -#define __set_nat_cache_dirty(nm_i, ne) \ - do { \ - ne->checkpointed = false; \ - list_move_tail(&ne->list, &nm_i->dirty_nat_entries); \ - } while (0) -#define __clear_nat_cache_dirty(nm_i, ne) \ - do { \ - ne->checkpointed = true; \ - list_move_tail(&ne->list, &nm_i->nat_entries); \ - } while (0) #define inc_node_version(version) (++version) +static inline void set_nat_flag(struct nat_entry *ne, + unsigned int type, bool set) +{ + unsigned char mask = 0x01 << type; + if (set) + ne->flag |= mask; + else + ne->flag &= ~mask; +} + +static inline bool get_nat_flag(struct nat_entry *ne, unsigned int type) +{ + unsigned char mask = 0x01 << type; + return ne->flag & mask; +} + +static inline void nat_reset_flag(struct nat_entry *ne) +{ + /* these states can be set only after checkpoint was done */ + set_nat_flag(ne, IS_CHECKPOINTED, true); + set_nat_flag(ne, HAS_FSYNCED_INODE, false); + set_nat_flag(ne, HAS_LAST_FSYNC, true); +} + static inline void node_info_from_raw_nat(struct node_info *ni, struct f2fs_nat_entry *raw_ne) { @@ -89,6 +109,13 @@ enum mem_type { DIRTY_DENTS /* indicates dirty dentry pages */ }; +struct nat_entry_set { + struct list_head set_list; /* link with other nat sets */ + struct list_head entry_list; /* link with dirty nat entries */ + nid_t set; /* set number*/ + unsigned int entry_cnt; /* the # of nat entries in set */ +}; + /* * For free nid mangement */ @@ -103,18 +130,19 @@ struct free_nid { int state; /* in use or not: NID_NEW or NID_ALLOC */ }; -static inline int next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) +static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *fnid; - if (nm_i->fcnt <= 0) - return -1; spin_lock(&nm_i->free_nid_list_lock); + if (nm_i->fcnt <= 0) { + spin_unlock(&nm_i->free_nid_list_lock); + return; + } fnid = list_entry(nm_i->free_nid_list.next, struct free_nid, list); *nid = fnid->nid; spin_unlock(&nm_i->free_nid_list_lock); - return 0; } /* @@ -190,8 +218,7 @@ static inline void copy_node_footer(struct page *dst, struct page *src) static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr) { - struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); - struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); struct f2fs_node *rn = F2FS_NODE(page); rn->footer.cp_ver = ckpt->checkpoint_ver; diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index a112368a4a86..ebd013225788 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -14,6 +14,37 @@ #include "node.h" #include "segment.h" +/* + * Roll forward recovery scenarios. 
+ * + * [Term] F: fsync_mark, D: dentry_mark + * + * 1. inode(x) | CP | inode(x) | dnode(F) + * -> Update the latest inode(x). + * + * 2. inode(x) | CP | inode(F) | dnode(F) + * -> No problem. + * + * 3. inode(x) | CP | dnode(F) | inode(x) + * -> Recover to the latest dnode(F), and drop the last inode(x) + * + * 4. inode(x) | CP | dnode(F) | inode(F) + * -> No problem. + * + * 5. CP | inode(x) | dnode(F) + * -> The inode(DF) was missing. Should drop this dnode(F). + * + * 6. CP | inode(DF) | dnode(F) + * -> No problem. + * + * 7. CP | dnode(F) | inode(DF) + * -> If f2fs_iget fails, then goto next to find inode(DF). + * + * 8. CP | dnode(F) | inode(x) + * -> If f2fs_iget fails, then goto next to find inode(DF). + * But it will fail due to no inode(DF). + */ + static struct kmem_cache *fsync_entry_slab; bool space_for_roll_forward(struct f2fs_sb_info *sbi) @@ -36,7 +67,7 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, return NULL; } -static int recover_dentry(struct page *ipage, struct inode *inode) +static int recover_dentry(struct inode *inode, struct page *ipage) { struct f2fs_inode *raw_inode = F2FS_INODE(ipage); nid_t pino = le32_to_cpu(raw_inode->i_pino); @@ -62,8 +93,10 @@ static int recover_dentry(struct page *ipage, struct inode *inode) } retry: de = f2fs_find_entry(dir, &name, &page); - if (de && inode->i_ino == le32_to_cpu(de->ino)) + if (de && inode->i_ino == le32_to_cpu(de->ino)) { + clear_inode_flag(F2FS_I(inode), FI_INC_LINK); goto out_unmap_put; + } if (de) { einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino)); if (IS_ERR(einode)) { @@ -73,7 +106,7 @@ retry: err = -EEXIST; goto out_unmap_put; } - err = acquire_orphan_inode(F2FS_SB(inode->i_sb)); + err = acquire_orphan_inode(F2FS_I_SB(inode)); if (err) { iput(einode); goto out_unmap_put; @@ -108,35 +141,28 @@ out: return err; } -static int recover_inode(struct inode *inode, struct page *node_page) +static void recover_inode(struct inode *inode, struct page *page) { - struct f2fs_inode *raw_inode = F2FS_INODE(node_page); - - if (!IS_INODE(node_page)) - return 0; - - inode->i_mode = le16_to_cpu(raw_inode->i_mode); - i_size_write(inode, le64_to_cpu(raw_inode->i_size)); - inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime); - inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime); - inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime); - inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); - inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec); - inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); + struct f2fs_inode *raw = F2FS_INODE(page); - if (is_dent_dnode(node_page)) - return recover_dentry(node_page, inode); + inode->i_mode = le16_to_cpu(raw->i_mode); + i_size_write(inode, le64_to_cpu(raw->i_size)); + inode->i_atime.tv_sec = le64_to_cpu(raw->i_mtime); + inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime); + inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime); + inode->i_atime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec); + inode->i_ctime.tv_nsec = le32_to_cpu(raw->i_ctime_nsec); + inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec); f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode: ino = %x, name = %s", - ino_of_node(node_page), raw_inode->i_name); - return 0; + ino_of_node(page), F2FS_INODE(page)->i_name); } static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) { unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); struct curseg_info *curseg; - struct page *page; + struct page *page = NULL; block_t blkaddr; int err = 0; @@ 
-144,20 +170,13 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); - /* read node page */ - page = alloc_page(GFP_F2FS_ZERO); - if (!page) - return -ENOMEM; - lock_page(page); - while (1) { struct fsync_inode_entry *entry; - err = f2fs_submit_page_bio(sbi, page, blkaddr, READ_SYNC); - if (err) - return err; + if (blkaddr < MAIN_BLKADDR(sbi) || blkaddr >= MAX_BLKADDR(sbi)) + return 0; - lock_page(page); + page = get_meta_page_ra(sbi, blkaddr); if (cp_ver != cpver_of_node(page)) break; @@ -178,33 +197,38 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) } /* add this fsync inode to the list */ - entry = kmem_cache_alloc(fsync_entry_slab, GFP_NOFS); + entry = kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO); if (!entry) { err = -ENOMEM; break; } - + /* + * CP | dnode(F) | inode(DF) + * For this case, we should not give up now. + */ entry->inode = f2fs_iget(sbi->sb, ino_of_node(page)); if (IS_ERR(entry->inode)) { err = PTR_ERR(entry->inode); kmem_cache_free(fsync_entry_slab, entry); + if (err == -ENOENT) + goto next; break; } list_add_tail(&entry->list, head); } entry->blkaddr = blkaddr; - err = recover_inode(entry->inode, page); - if (err && err != -ENOENT) - break; + if (IS_INODE(page)) { + entry->last_inode = blkaddr; + if (is_dent_dnode(page)) + entry->last_dentry = blkaddr; + } next: /* check next segment */ blkaddr = next_blkaddr_of_node(page); + f2fs_put_page(page, 1); } - - unlock_page(page); - __free_pages(page, 0); - + f2fs_put_page(page, 1); return err; } @@ -277,16 +301,30 @@ got_it: ino = ino_of_node(node_page); f2fs_put_page(node_page, 1); - /* Deallocate previous index in the node page */ - inode = f2fs_iget(sbi->sb, ino); - if (IS_ERR(inode)) - return PTR_ERR(inode); + if (ino != dn->inode->i_ino) { + /* Deallocate previous index in the node page */ + inode = f2fs_iget(sbi->sb, ino); + if (IS_ERR(inode)) + return PTR_ERR(inode); + } else { + inode = dn->inode; + } bidx = start_bidx_of_node(offset, F2FS_I(inode)) + - le16_to_cpu(sum.ofs_in_node); + le16_to_cpu(sum.ofs_in_node); - truncate_hole(inode, bidx, bidx + 1); - iput(inode); + if (ino != dn->inode->i_ino) { + truncate_hole(inode, bidx, bidx + 1); + iput(inode); + } else { + struct dnode_of_data tdn; + set_new_dnode(&tdn, inode, dn->inode_page, NULL, 0); + if (get_dnode_of_data(&tdn, bidx, LOOKUP_NODE)) + return 0; + if (tdn.data_blkaddr != NULL_ADDR) + truncate_data_blocks_range(&tdn, 1); + f2fs_put_page(tdn.node_page, 1); + } return 0; } @@ -300,12 +338,19 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, struct node_info ni; int err = 0, recovered = 0; - if (recover_inline_data(inode, page)) + /* step 1: recover xattr */ + if (IS_INODE(page)) { + recover_inline_xattr(inode, page); + } else if (f2fs_has_xattr_block(ofs_of_node(page))) { + recover_xattr_data(inode, page, blkaddr); goto out; + } - if (recover_xattr_data(inode, page, blkaddr)) + /* step 2: recover inline data */ + if (recover_inline_data(inode, page)) goto out; + /* step 3: recover data indices */ start = start_bidx_of_node(ofs_of_node(page), fi); end = start + ADDRS_PER_PAGE(page, fi); @@ -322,8 +367,8 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, f2fs_wait_on_page_writeback(dn.node_page, NODE); get_node_info(sbi, dn.nid, &ni); - f2fs_bug_on(ni.ino != ino_of_node(page)); - f2fs_bug_on(ofs_of_node(dn.node_page) != ofs_of_node(page)); + f2fs_bug_on(sbi, 
ni.ino != ino_of_node(page)); + f2fs_bug_on(sbi, ofs_of_node(dn.node_page) != ofs_of_node(page)); for (; start < end; start++) { block_t src, dest; @@ -335,7 +380,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, if (src == NULL_ADDR) { err = reserve_new_block(&dn); /* We should not get -ENOSPC */ - f2fs_bug_on(err); + f2fs_bug_on(sbi, err); } /* Check the previous node page having this index */ @@ -362,8 +407,6 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, fill_node_footer(dn.node_page, dn.nid, ni.ino, ofs_of_node(page), false); set_page_dirty(dn.node_page); - - recover_node_page(sbi, dn.node_page, &sum, &ni, blkaddr); err: f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); @@ -379,7 +422,7 @@ static int recover_data(struct f2fs_sb_info *sbi, { unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); struct curseg_info *curseg; - struct page *page; + struct page *page = NULL; int err = 0; block_t blkaddr; @@ -387,32 +430,41 @@ static int recover_data(struct f2fs_sb_info *sbi, curseg = CURSEG_I(sbi, type); blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); - /* read node page */ - page = alloc_page(GFP_F2FS_ZERO); - if (!page) - return -ENOMEM; - - lock_page(page); - while (1) { struct fsync_inode_entry *entry; - err = f2fs_submit_page_bio(sbi, page, blkaddr, READ_SYNC); - if (err) - return err; + if (blkaddr < MAIN_BLKADDR(sbi) || blkaddr >= MAX_BLKADDR(sbi)) + break; - lock_page(page); + page = get_meta_page_ra(sbi, blkaddr); - if (cp_ver != cpver_of_node(page)) + if (cp_ver != cpver_of_node(page)) { + f2fs_put_page(page, 1); break; + } entry = get_fsync_inode(head, ino_of_node(page)); if (!entry) goto next; - + /* + * inode(x) | CP | inode(x) | dnode(F) + * In this case, we can lose the latest inode(x). + * So, call recover_inode for the inode update. 
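The scenario comments drive a two-pass design: find_fsync_dnodes() records, per inode, the block address of the newest inode block (last_inode) and of the newest dentry-marked inode block (last_dentry); recover_data() then replays the node chain and fires recover_inode()/recover_dentry() exactly when it passes those addresses again. A compressed userspace sketch of both passes, with hypothetical names and printf in place of the real recovery calls:

```c
#include <stdio.h>
#include <stdbool.h>

typedef unsigned block_t;

/* per-inode record built in pass 1 (find_fsync_dnodes) */
struct fsync_entry {
	unsigned ino;
	block_t last_inode;	/* newest node block that is an inode */
	block_t last_dentry;	/* newest inode block with a dentry mark */
};

/* pass 1: remember where the newest inode/dentry blocks live */
static void mark_node(struct fsync_entry *e, block_t blkaddr,
		      bool is_inode, bool dent_mark)
{
	if (is_inode) {
		e->last_inode = blkaddr;
		if (dent_mark)
			e->last_dentry = blkaddr;
	}
}

/* pass 2: replay one node block (recover_data's per-block decisions) */
static void replay_node(const struct fsync_entry *e, block_t blkaddr)
{
	if (e->last_inode == blkaddr)
		printf("blk %u: recover_inode(ino %u)\n", blkaddr, e->ino);
	if (e->last_dentry == blkaddr)
		printf("blk %u: recover_dentry(ino %u)\n", blkaddr, e->ino);
	printf("blk %u: do_recover_data(ino %u)\n", blkaddr, e->ino);
}

int main(void)
{
	/* scenario 1: inode(x) | CP | inode(x) | dnode(F) */
	struct fsync_entry e = { .ino = 7 };

	mark_node(&e, 100, true, false);	/* plain inode update */
	mark_node(&e, 101, false, false);	/* dnode with fsync mark */
	replay_node(&e, 100);	/* latest inode(x) is recovered here */
	replay_node(&e, 101);	/* data indices recovered here */
	return 0;
}
```

Recording addresses in pass 1 and acting on them in pass 2 is what keeps scenario 1 safe: the inode update is applied from the newest inode block even though only the dnode carried the fsync mark.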
+ */ + if (entry->last_inode == blkaddr) + recover_inode(entry->inode, page); + if (entry->last_dentry == blkaddr) { + err = recover_dentry(entry->inode, page); + if (err) { + f2fs_put_page(page, 1); + break; + } + } err = do_recover_data(sbi, entry->inode, page, blkaddr); - if (err) + if (err) { + f2fs_put_page(page, 1); break; + } if (entry->blkaddr == blkaddr) { iput(entry->inode); @@ -422,11 +474,8 @@ static int recover_data(struct f2fs_sb_info *sbi, next: /* check next segment */ blkaddr = next_blkaddr_of_node(page); + f2fs_put_page(page, 1); } - - unlock_page(page); - __free_pages(page, 0); - if (!err) allocate_new_segments(sbi); return err; @@ -434,7 +483,9 @@ next: int recover_fsync_data(struct f2fs_sb_info *sbi) { + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); struct list_head inode_list; + block_t blkaddr; int err; bool need_writecp = false; @@ -447,6 +498,12 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) /* step #1: find fsynced inode numbers */ sbi->por_doing = true; + + /* prevent checkpoint */ + mutex_lock(&sbi->cp_mutex); + + blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); + err = find_fsync_dnodes(sbi, &inode_list); if (err) goto out; @@ -458,12 +515,38 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) /* step #2: recover data */ err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE); - f2fs_bug_on(!list_empty(&inode_list)); + if (!err) + f2fs_bug_on(sbi, !list_empty(&inode_list)); out: destroy_fsync_dnodes(&inode_list); kmem_cache_destroy(fsync_entry_slab); + + /* truncate meta pages to be used by the recovery */ + truncate_inode_pages_range(META_MAPPING(sbi), + MAIN_BLKADDR(sbi) << PAGE_CACHE_SHIFT, -1); + + if (err) { + truncate_inode_pages_final(NODE_MAPPING(sbi)); + truncate_inode_pages_final(META_MAPPING(sbi)); + } + sbi->por_doing = false; - if (!err && need_writecp) - write_checkpoint(sbi, false); + if (err) { + discard_next_dnode(sbi, blkaddr); + + /* Flush all the NAT/SIT pages */ + while (get_pages(sbi, F2FS_DIRTY_META)) + sync_meta_pages(sbi, META, LONG_MAX); + set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); + mutex_unlock(&sbi->cp_mutex); + } else if (need_writecp) { + struct cp_control cpc = { + .reason = CP_SYNC, + }; + mutex_unlock(&sbi->cp_mutex); + write_checkpoint(sbi, &cpc); + } else { + mutex_unlock(&sbi->cp_mutex); + } return err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index d04613df710a..923cb76fdc46 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -25,6 +25,8 @@ #define __reverse_ffz(x) __reverse_ffs(~(x)) static struct kmem_cache *discard_entry_slab; +static struct kmem_cache *sit_entry_set_slab; +static struct kmem_cache *inmem_entry_slab; /* * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since @@ -62,7 +64,7 @@ static inline unsigned long __reverse_ffs(unsigned long word) } /* - * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c becasue + * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c because * f2fs_set_bit makes MSB and LSB reversed in a byte. 
* Example: * LSB <--> MSB @@ -172,6 +174,60 @@ found_middle: return result + __reverse_ffz(tmp); } +void register_inmem_page(struct inode *inode, struct page *page) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + struct inmem_pages *new; + + new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS); + + /* add atomic page indices to the list */ + new->page = page; + INIT_LIST_HEAD(&new->list); + + /* increase reference count with clean state */ + mutex_lock(&fi->inmem_lock); + get_page(page); + list_add_tail(&new->list, &fi->inmem_pages); + mutex_unlock(&fi->inmem_lock); +} + +void commit_inmem_pages(struct inode *inode, bool abort) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct inmem_pages *cur, *tmp; + bool submit_bio = false; + struct f2fs_io_info fio = { + .type = DATA, + .rw = WRITE_SYNC, + }; + + f2fs_balance_fs(sbi); + f2fs_lock_op(sbi); + + mutex_lock(&fi->inmem_lock); + list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) { + lock_page(cur->page); + if (!abort && cur->page->mapping == inode->i_mapping) { + f2fs_wait_on_page_writeback(cur->page, DATA); + if (clear_page_dirty_for_io(cur->page)) + inode_dec_dirty_pages(inode); + do_write_data_page(cur->page, &fio); + submit_bio = true; + } + f2fs_put_page(cur->page, 1); + list_del(&cur->list); + kmem_cache_free(inmem_entry_slab, cur); + } + if (submit_bio) + f2fs_submit_merged_bio(sbi, DATA, WRITE); + mutex_unlock(&fi->inmem_lock); + + filemap_fdatawait_range(inode->i_mapping, 0, LLONG_MAX); + f2fs_unlock_op(sbi); +} + /* * This function balances dirty node and dentry pages. * In addition, it controls garbage collection. @@ -205,24 +261,20 @@ repeat: if (kthread_should_stop()) return 0; - spin_lock(&fcc->issue_lock); - if (fcc->issue_list) { - fcc->dispatch_list = fcc->issue_list; - fcc->issue_list = fcc->issue_tail = NULL; - } - spin_unlock(&fcc->issue_lock); - - if (fcc->dispatch_list) { + if (!llist_empty(&fcc->issue_list)) { struct bio *bio = bio_alloc(GFP_NOIO, 0); struct flush_cmd *cmd, *next; int ret; + fcc->dispatch_list = llist_del_all(&fcc->issue_list); + fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list); + bio->bi_bdev = sbi->sb->s_bdev; ret = submit_bio_wait(WRITE_FLUSH, bio); - for (cmd = fcc->dispatch_list; cmd; cmd = next) { + llist_for_each_entry_safe(cmd, next, + fcc->dispatch_list, llnode) { cmd->ret = ret; - next = cmd->next; complete(&cmd->wait); } bio_put(bio); @@ -230,7 +282,7 @@ repeat: } wait_event_interruptible(*q, - kthread_should_stop() || fcc->issue_list); + kthread_should_stop() || !llist_empty(&fcc->issue_list)); goto repeat; } @@ -239,19 +291,18 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; struct flush_cmd cmd; + trace_f2fs_issue_flush(sbi->sb, test_opt(sbi, NOBARRIER), + test_opt(sbi, FLUSH_MERGE)); + + if (test_opt(sbi, NOBARRIER)) + return 0; + if (!test_opt(sbi, FLUSH_MERGE)) return blkdev_issue_flush(sbi->sb->s_bdev, GFP_KERNEL, NULL); init_completion(&cmd.wait); - cmd.next = NULL; - spin_lock(&fcc->issue_lock); - if (fcc->issue_list) - fcc->issue_tail->next = &cmd; - else - fcc->issue_list = &cmd; - fcc->issue_tail = &cmd; - spin_unlock(&fcc->issue_lock); + llist_add(&cmd.llnode, &fcc->issue_list); if (!fcc->dispatch_list) wake_up(&fcc->flush_wait_queue); @@ -270,15 +321,15 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL); if (!fcc) return -ENOMEM; - spin_lock_init(&fcc->issue_lock); 
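The flush-merge rework above replaces the spinlock-protected issue_list/issue_tail pair with a lock-free llist: producers llist_add() onto a singly linked head, and the flush thread detaches the whole chain with llist_del_all() and then llist_reverse_order()s it, because pushes link newest-first. A self-contained C11 model of that pattern — the push/pop primitives are the concurrency-safe part; main() drives them from one thread only for brevity:

```c
#include <stdatomic.h>
#include <stdio.h>

/* one pending flush request, like struct flush_cmd above */
struct cmd {
	struct cmd *next;
	int ret;
};

static _Atomic(struct cmd *) issue_list = NULL;

/* producer side (f2fs_issue_flush): lock-free push, like llist_add() */
static void push(struct cmd *c)
{
	c->next = atomic_load(&issue_list);
	while (!atomic_compare_exchange_weak(&issue_list, &c->next, c))
		;	/* c->next was refreshed by the failed CAS; retry */
}

/*
 * consumer side (issue_flush_thread): take the entire list in one
 * atomic exchange (llist_del_all), then reverse it to restore FIFO
 * order (llist_reverse_order), since pushes build it newest-first.
 */
static struct cmd *pop_all_fifo(void)
{
	struct cmd *c = atomic_exchange(&issue_list, NULL);
	struct cmd *fifo = NULL;

	while (c) {
		struct cmd *next = c->next;
		c->next = fifo;
		fifo = c;
		c = next;
	}
	return fifo;
}

int main(void)
{
	struct cmd a = {0}, b = {0}, c = {0};

	push(&a); push(&b); push(&c);
	for (struct cmd *i = pop_all_fifo(); i; i = i->next)
		printf("dispatch %p\n", (void *)i);	/* a, b, c in push order */
	return 0;
}
```

Detaching the whole chain with a single exchange is also what sidesteps the ABA hazard that plagues lock-free stacks when nodes are popped one at a time — one reason the llist API only offers del_all-style consumption.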
init_waitqueue_head(&fcc->flush_wait_queue); - sbi->sm_info->cmd_control_info = fcc; + init_llist_head(&fcc->issue_list); + SM_I(sbi)->cmd_control_info = fcc; fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(fcc->f2fs_issue_flush)) { err = PTR_ERR(fcc->f2fs_issue_flush); kfree(fcc); - sbi->sm_info->cmd_control_info = NULL; + SM_I(sbi)->cmd_control_info = NULL; return err; } @@ -287,13 +338,12 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) void destroy_flush_cmd_control(struct f2fs_sb_info *sbi) { - struct flush_cmd_control *fcc = - sbi->sm_info->cmd_control_info; + struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; if (fcc && fcc->f2fs_issue_flush) kthread_stop(fcc->f2fs_issue_flush); kfree(fcc); - sbi->sm_info->cmd_control_info = NULL; + SM_I(sbi)->cmd_control_info = NULL; } static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, @@ -312,6 +362,10 @@ static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, struct seg_entry *sentry = get_seg_entry(sbi, segno); enum dirty_type t = sentry->type; + if (unlikely(t >= DIRTY)) { + f2fs_bug_on(sbi, 1); + return; + } if (!test_and_set_bit(segno, dirty_i->dirty_segmap[t])) dirty_i->nr_dirty[t]++; } @@ -371,17 +425,14 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) static int f2fs_issue_discard(struct f2fs_sb_info *sbi, block_t blkstart, block_t blklen) { - sector_t start = SECTOR_FROM_BLOCK(sbi, blkstart); - sector_t len = SECTOR_FROM_BLOCK(sbi, blklen); + sector_t start = SECTOR_FROM_BLOCK(blkstart); + sector_t len = SECTOR_FROM_BLOCK(blklen); trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0); } -void discard_next_dnode(struct f2fs_sb_info *sbi) +void discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr) { - struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); - block_t blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); - if (f2fs_issue_discard(sbi, blkaddr, 1)) { struct page *page = grab_meta_page(sbi, blkaddr); /* zero-filled page */ @@ -390,22 +441,48 @@ void discard_next_dnode(struct f2fs_sb_info *sbi) } } -static void add_discard_addrs(struct f2fs_sb_info *sbi, - unsigned int segno, struct seg_entry *se) +static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct list_head *head = &SM_I(sbi)->discard_list; struct discard_entry *new; int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); int max_blocks = sbi->blocks_per_seg; + struct seg_entry *se = get_seg_entry(sbi, cpc->trim_start); unsigned long *cur_map = (unsigned long *)se->cur_valid_map; unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; unsigned long dmap[entries]; unsigned int start = 0, end = -1; + bool force = (cpc->reason == CP_DISCARD); int i; - if (!test_opt(sbi, DISCARD)) + if (!force && !test_opt(sbi, DISCARD)) return; + if (force && !se->valid_blocks) { + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + /* + * if this segment is registered in the prefree list, then + * we should skip adding a discard candidate, and let the + * checkpoint do that later. 
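On the CP_DISCARD (FITRIM) path, add_discard_addrs() — in the hunk above and continuing below — special-cases a completely free segment: if it is already prefree, the discard is left to the checkpoint; otherwise the whole segment is queued as a single discard entry; only partially valid segments fall through to the bitmap scan. In the first two cases cpc->trimmed is still credited with a full segment. A small decision-table sketch of that logic, with made-up enum names for illustration:

```c
#include <stdio.h>
#include <stdbool.h>

enum trim_action {
	TRIM_SKIP_PREFREE,	/* checkpoint will discard it anyway */
	TRIM_WHOLE_SEG,		/* queue one segment-sized discard entry */
	TRIM_SCAN_BITMAP,	/* diff cur/ckpt bitmaps for sub-ranges */
};

/* classification made per segment when cpc->reason == CP_DISCARD */
static enum trim_action classify(unsigned valid_blocks, bool in_prefree)
{
	if (valid_blocks == 0)
		return in_prefree ? TRIM_SKIP_PREFREE : TRIM_WHOLE_SEG;
	return TRIM_SCAN_BITMAP;
}

int main(void)
{
	printf("%d\n", classify(0, true));	/* 0: skip, count as trimmed */
	printf("%d\n", classify(0, false));	/* 1: one big discard entry */
	printf("%d\n", classify(37, false));	/* 2: scan valid-block bitmap */
	return 0;
}
```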
+ */ + mutex_lock(&dirty_i->seglist_lock); + if (test_bit(cpc->trim_start, dirty_i->dirty_segmap[PRE])) { + mutex_unlock(&dirty_i->seglist_lock); + cpc->trimmed += sbi->blocks_per_seg; + return; + } + mutex_unlock(&dirty_i->seglist_lock); + + new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS); + INIT_LIST_HEAD(&new->list); + new->blkaddr = START_BLOCK(sbi, cpc->trim_start); + new->len = sbi->blocks_per_seg; + list_add_tail(&new->list, head); + SM_I(sbi)->nr_discards += sbi->blocks_per_seg; + cpc->trimmed += sbi->blocks_per_seg; + return; + } + /* zero block will be discarded through the prefree list */ if (!se->valid_blocks || se->valid_blocks == max_blocks) return; @@ -414,40 +491,50 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, for (i = 0; i < entries; i++) dmap[i] = (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i]; - while (SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) { + while (force || SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) { start = __find_rev_next_bit(dmap, max_blocks, end + 1); if (start >= max_blocks) break; end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1); + if (end - start < cpc->trim_minlen) + continue; + new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS); INIT_LIST_HEAD(&new->list); - new->blkaddr = START_BLOCK(sbi, segno) + start; + new->blkaddr = START_BLOCK(sbi, cpc->trim_start) + start; new->len = end - start; + cpc->trimmed += end - start; list_add_tail(&new->list, head); SM_I(sbi)->nr_discards += end - start; } } +void release_discard_addrs(struct f2fs_sb_info *sbi) +{ + struct list_head *head = &(SM_I(sbi)->discard_list); + struct discard_entry *entry, *this; + + /* drop caches */ + list_for_each_entry_safe(entry, this, head, list) { + list_del(&entry->list); + kmem_cache_free(discard_entry_slab, entry); + } +} + /* * Should call clear_prefree_segments after checkpoint is done. 
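set_prefree_as_free_segments(), which follows, collapses the old open-coded find_next_bit() loop into for_each_set_bit(); the two walks are equivalent, as this minimal userspace rendition of the macro shows (the bitmap helpers are simplified sketches, not the kernel implementations):

```c
#include <stdio.h>
#include <limits.h>

#define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

/* naive find_next_bit: first set bit at or after 'start', else 'size' */
static unsigned find_next_bit(const unsigned long *map,
			      unsigned size, unsigned start)
{
	for (unsigned i = start; i < size; i++)
		if (map[i / BITS_PER_LONG] & (1UL << (i % BITS_PER_LONG)))
			return i;
	return size;
}

/* same shape as the kernel macro the diff switches to */
#define for_each_set_bit(bit, map, size)				\
	for ((bit) = find_next_bit((map), (size), 0);			\
	     (bit) < (size);						\
	     (bit) = find_next_bit((map), (size), (bit) + 1))

int main(void)
{
	unsigned long prefree[4] = { 0 };	/* room for 128 bits */
	unsigned set[] = { 3, 70 }, segno;

	for (unsigned i = 0; i < sizeof(set) / sizeof(set[0]); i++)
		prefree[set[i] / BITS_PER_LONG] |= 1UL << (set[i] % BITS_PER_LONG);

	for_each_set_bit(segno, prefree, 128)
		printf("free segment %u\n", segno);	/* 3, then 70 */
	return 0;
}
```

The macro form removes the manual break/increment bookkeeping, which is exactly the boilerplate the old while(1) loop carried.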
*/ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - unsigned int segno = -1; - unsigned int total_segs = TOTAL_SEGS(sbi); + unsigned int segno; mutex_lock(&dirty_i->seglist_lock); - while (1) { - segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs, - segno + 1); - if (segno >= total_segs) - break; + for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi)) __set_test_and_free(sbi, segno); - } mutex_unlock(&dirty_i->seglist_lock); } @@ -457,17 +544,17 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi) struct discard_entry *entry, *this; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; - unsigned int total_segs = TOTAL_SEGS(sbi); unsigned int start = 0, end = -1; mutex_lock(&dirty_i->seglist_lock); while (1) { int i; - start = find_next_bit(prefree_map, total_segs, end + 1); - if (start >= total_segs) + start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1); + if (start >= MAIN_SEGS(sbi)) break; - end = find_next_zero_bit(prefree_map, total_segs, start + 1); + end = find_next_zero_bit(prefree_map, MAIN_SEGS(sbi), + start + 1); for (i = start; i < end; i++) clear_bit(i, prefree_map); @@ -491,11 +578,16 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi) } } -static void __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) +static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) { struct sit_info *sit_i = SIT_I(sbi); - if (!__test_and_set_bit(segno, sit_i->dirty_sentries_bitmap)) + + if (!__test_and_set_bit(segno, sit_i->dirty_sentries_bitmap)) { sit_i->dirty_sentries++; + return false; + } + + return true; } static void __set_sit_entry_type(struct f2fs_sb_info *sbi, int type, @@ -519,7 +611,7 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) new_vblocks = se->valid_blocks + del; offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); - f2fs_bug_on((new_vblocks >> (sizeof(unsigned short) << 3) || + f2fs_bug_on(sbi, (new_vblocks >> (sizeof(unsigned short) << 3) || (new_vblocks > sbi->blocks_per_seg))); se->valid_blocks = new_vblocks; @@ -529,10 +621,10 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) /* Update valid block bitmap */ if (del > 0) { if (f2fs_set_bit(offset, se->cur_valid_map)) - BUG(); + f2fs_bug_on(sbi, 1); } else { if (!f2fs_clear_bit(offset, se->cur_valid_map)) - BUG(); + f2fs_bug_on(sbi, 1); } if (!f2fs_test_bit(offset, se->ckpt_valid_map)) se->ckpt_valid_blocks += del; @@ -561,7 +653,7 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) unsigned int segno = GET_SEGNO(sbi, addr); struct sit_info *sit_i = SIT_I(sbi); - f2fs_bug_on(addr == NULL_ADDR); + f2fs_bug_on(sbi, addr == NULL_ADDR); if (addr == NEW_ADDR) return; @@ -637,7 +729,7 @@ static int is_next_segment_free(struct f2fs_sb_info *sbi, int type) unsigned int segno = curseg->segno + 1; struct free_segmap_info *free_i = FREE_I(sbi); - if (segno < TOTAL_SEGS(sbi) && segno % sbi->segs_per_sec) + if (segno < MAIN_SEGS(sbi) && segno % sbi->segs_per_sec) return !test_bit(segno, free_i->free_segmap); return 0; } @@ -651,7 +743,7 @@ static void get_new_segment(struct f2fs_sb_info *sbi, { struct free_segmap_info *free_i = FREE_I(sbi); unsigned int segno, secno, zoneno; - unsigned int total_zones = TOTAL_SECS(sbi) / sbi->secs_per_zone; + unsigned int total_zones = MAIN_SECS(sbi) / sbi->secs_per_zone; unsigned int hint = *newseg / sbi->segs_per_sec; 
unsigned int old_zoneno = GET_ZONENO_FROM_SEGNO(sbi, *newseg); unsigned int left_start = hint; @@ -663,18 +755,18 @@ static void get_new_segment(struct f2fs_sb_info *sbi, if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) { segno = find_next_zero_bit(free_i->free_segmap, - TOTAL_SEGS(sbi), *newseg + 1); + MAIN_SEGS(sbi), *newseg + 1); if (segno - *newseg < sbi->segs_per_sec - (*newseg % sbi->segs_per_sec)) goto got_it; } find_other_zone: - secno = find_next_zero_bit(free_i->free_secmap, TOTAL_SECS(sbi), hint); - if (secno >= TOTAL_SECS(sbi)) { + secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint); + if (secno >= MAIN_SECS(sbi)) { if (dir == ALLOC_RIGHT) { secno = find_next_zero_bit(free_i->free_secmap, - TOTAL_SECS(sbi), 0); - f2fs_bug_on(secno >= TOTAL_SECS(sbi)); + MAIN_SECS(sbi), 0); + f2fs_bug_on(sbi, secno >= MAIN_SECS(sbi)); } else { go_left = 1; left_start = hint - 1; @@ -689,8 +781,8 @@ find_other_zone: continue; } left_start = find_next_zero_bit(free_i->free_secmap, - TOTAL_SECS(sbi), 0); - f2fs_bug_on(left_start >= TOTAL_SECS(sbi)); + MAIN_SECS(sbi), 0); + f2fs_bug_on(sbi, left_start >= MAIN_SECS(sbi)); break; } secno = left_start; @@ -729,7 +821,7 @@ skip_left: } got_it: /* set it as dirty segment in free segmap */ - f2fs_bug_on(test_bit(segno, free_i->free_segmap)); + f2fs_bug_on(sbi, test_bit(segno, free_i->free_segmap)); __set_inuse(sbi, segno); *newseg = segno; write_unlock(&free_i->segmap_lock); @@ -811,7 +903,7 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi, } /* - * This function always allocates a used segment (from dirty seglist) by SSR + * This function always allocates a used segment(from dirty seglist) by SSR * manner, so it should recover the existing segment information of valid blocks */ static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse) @@ -901,6 +993,37 @@ static const struct segment_allocation default_salloc_ops = { .allocate_segment = allocate_segment_by_default, }; +int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) +{ + __u64 start = range->start >> sbi->log_blocksize; + __u64 end = start + (range->len >> sbi->log_blocksize) - 1; + unsigned int start_segno, end_segno; + struct cp_control cpc; + + if (range->minlen > SEGMENT_SIZE(sbi) || start >= MAX_BLKADDR(sbi) || + range->len < sbi->blocksize) + return -EINVAL; + + if (end <= MAIN_BLKADDR(sbi)) + goto out; + + /* start/end segment number in main_area */ + start_segno = (start <= MAIN_BLKADDR(sbi)) ? 0 : GET_SEGNO(sbi, start); + end_segno = (end >= MAX_BLKADDR(sbi)) ? 
MAIN_SEGS(sbi) - 1 : + GET_SEGNO(sbi, end); + cpc.reason = CP_DISCARD; + cpc.trim_start = start_segno; + cpc.trim_end = end_segno; + cpc.trim_minlen = range->minlen >> sbi->log_blocksize; + cpc.trimmed = 0; + + /* do checkpoint to issue discard commands safely */ + write_checkpoint(sbi, &cpc); +out: + range->len = cpc.trimmed << sbi->log_blocksize; + return 0; +} + static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -956,15 +1079,15 @@ static int __get_segment_type_6(struct page *page, enum page_type p_type) static int __get_segment_type(struct page *page, enum page_type p_type) { - struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); - switch (sbi->active_logs) { + switch (F2FS_P_SB(page)->active_logs) { case 2: return __get_segment_type_2(page, p_type); case 4: return __get_segment_type_4(page, p_type); } /* NR_CURSEG_TYPE(6) logs by default */ - f2fs_bug_on(sbi->active_logs != NR_CURSEG_TYPE); + f2fs_bug_on(F2FS_P_SB(page), + F2FS_P_SB(page)->active_logs != NR_CURSEG_TYPE); return __get_segment_type_6(page, p_type); } @@ -974,14 +1097,12 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg; - unsigned int old_cursegno; curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); - old_cursegno = curseg->segno; /* * __add_sum_entry should be resided under the curseg_mutex @@ -1002,7 +1123,6 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, * since SSR needs latest valid block information. */ refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr); - locate_dirty_segment(sbi, old_cursegno); mutex_unlock(&sit_i->sentry_lock); @@ -1047,11 +1167,11 @@ void write_node_page(struct f2fs_sb_info *sbi, struct page *page, void write_data_page(struct page *page, struct dnode_of_data *dn, block_t *new_blkaddr, struct f2fs_io_info *fio) { - struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct f2fs_summary sum; struct node_info ni; - f2fs_bug_on(dn->data_blkaddr == NULL_ADDR); + f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR); get_node_info(sbi, dn->nid, &ni); set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); @@ -1061,9 +1181,7 @@ void write_data_page(struct page *page, struct dnode_of_data *dn, void rewrite_data_page(struct page *page, block_t old_blkaddr, struct f2fs_io_info *fio) { - struct inode *inode = page->mapping->host; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - f2fs_submit_page_mbio(sbi, page, old_blkaddr, fio); + f2fs_submit_page_mbio(F2FS_P_SB(page), page, old_blkaddr, fio); } void recover_data_page(struct f2fs_sb_info *sbi, @@ -1109,55 +1227,6 @@ void recover_data_page(struct f2fs_sb_info *sbi, mutex_unlock(&curseg->curseg_mutex); } -void rewrite_node_page(struct f2fs_sb_info *sbi, - struct page *page, struct f2fs_summary *sum, - block_t old_blkaddr, block_t new_blkaddr) -{ - struct sit_info *sit_i = SIT_I(sbi); - int type = CURSEG_WARM_NODE; - struct curseg_info *curseg; - unsigned int segno, old_cursegno; - block_t next_blkaddr = next_blkaddr_of_node(page); - unsigned int next_segno = GET_SEGNO(sbi, next_blkaddr); - struct f2fs_io_info fio = { - .type = NODE, - .rw = WRITE_SYNC, - }; - - curseg = CURSEG_I(sbi, type); - - mutex_lock(&curseg->curseg_mutex); - mutex_lock(&sit_i->sentry_lock); - - segno = GET_SEGNO(sbi, new_blkaddr); - old_cursegno = curseg->segno; - - /* change the 
current segment */ - if (segno != curseg->segno) { - curseg->next_segno = segno; - change_curseg(sbi, type, true); - } - curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); - __add_sum_entry(sbi, type, sum); - - /* change the current log to the next block addr in advance */ - if (next_segno != segno) { - curseg->next_segno = next_segno; - change_curseg(sbi, type, true); - } - curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, next_blkaddr); - - /* rewrite node page */ - set_page_writeback(page); - f2fs_submit_page_mbio(sbi, page, new_blkaddr, &fio); - f2fs_submit_merged_bio(sbi, NODE, WRITE); - refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); - locate_dirty_segment(sbi, old_cursegno); - - mutex_unlock(&sit_i->sentry_lock); - mutex_unlock(&curseg->curseg_mutex); -} - static inline bool is_merged_page(struct f2fs_sb_info *sbi, struct page *page, enum page_type type) { @@ -1185,8 +1254,9 @@ out: void f2fs_wait_on_page_writeback(struct page *page, enum page_type type) { - struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); if (PageWriteback(page)) { + struct f2fs_sb_info *sbi = F2FS_P_SB(page); + if (is_merged_page(sbi, page, type)) f2fs_submit_merged_bio(sbi, type, WRITE); wait_on_page_writeback(page); @@ -1455,7 +1525,7 @@ static struct page *get_current_sit_page(struct f2fs_sb_info *sbi, unsigned int segno) { struct sit_info *sit_i = SIT_I(sbi); - unsigned int offset = SIT_BLOCK_OFFSET(sit_i, segno); + unsigned int offset = SIT_BLOCK_OFFSET(segno); block_t blk_addr = sit_i->sit_base_addr + offset; check_seg_range(sbi, segno); @@ -1481,7 +1551,7 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi, /* get current sit block page without lock */ src_page = get_meta_page(sbi, src_off); dst_page = grab_meta_page(sbi, dst_off); - f2fs_bug_on(PageDirty(src_page)); + f2fs_bug_on(sbi, PageDirty(src_page)); src_addr = page_address(src_page); dst_addr = page_address(dst_page); @@ -1495,101 +1565,192 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi, return dst_page; } -static bool flush_sits_in_journal(struct f2fs_sb_info *sbi) +static struct sit_entry_set *grab_sit_entry_set(void) +{ + struct sit_entry_set *ses = + f2fs_kmem_cache_alloc(sit_entry_set_slab, GFP_ATOMIC); + + ses->entry_cnt = 0; + INIT_LIST_HEAD(&ses->set_list); + return ses; +} + +static void release_sit_entry_set(struct sit_entry_set *ses) +{ + list_del(&ses->set_list); + kmem_cache_free(sit_entry_set_slab, ses); +} + +static void adjust_sit_entry_set(struct sit_entry_set *ses, + struct list_head *head) +{ + struct sit_entry_set *next = ses; + + if (list_is_last(&ses->set_list, head)) + return; + + list_for_each_entry_continue(next, head, set_list) + if (ses->entry_cnt <= next->entry_cnt) + break; + + list_move_tail(&ses->set_list, &next->set_list); +} + +static void add_sit_entry(unsigned int segno, struct list_head *head) +{ + struct sit_entry_set *ses; + unsigned int start_segno = START_SEGNO(segno); + + list_for_each_entry(ses, head, set_list) { + if (ses->start_segno == start_segno) { + ses->entry_cnt++; + adjust_sit_entry_set(ses, head); + return; + } + } + + ses = grab_sit_entry_set(); + + ses->start_segno = start_segno; + ses->entry_cnt++; + list_add(&ses->set_list, head); +} + +static void add_sits_in_set(struct f2fs_sb_info *sbi) +{ + struct f2fs_sm_info *sm_info = SM_I(sbi); + struct list_head *set_list = &sm_info->sit_entry_set; + unsigned long *bitmap = SIT_I(sbi)->dirty_sentries_bitmap; + unsigned int segno; + + for_each_set_bit(segno, bitmap, MAIN_SEGS(sbi)) + 
add_sit_entry(segno, set_list); +} + +static void remove_sits_in_journal(struct f2fs_sb_info *sbi) { struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); struct f2fs_summary_block *sum = curseg->sum_blk; int i; - /* - * If the journal area in the current summary is full of sit entries, - * all the sit entries will be flushed. Otherwise the sit entries - * are not able to replace with newly hot sit entries. - */ - if (sits_in_cursum(sum) >= SIT_JOURNAL_ENTRIES) { - for (i = sits_in_cursum(sum) - 1; i >= 0; i--) { - unsigned int segno; - segno = le32_to_cpu(segno_in_journal(sum, i)); - __mark_sit_entry_dirty(sbi, segno); - } - update_sits_in_cursum(sum, -sits_in_cursum(sum)); - return true; + for (i = sits_in_cursum(sum) - 1; i >= 0; i--) { + unsigned int segno; + bool dirtied; + + segno = le32_to_cpu(segno_in_journal(sum, i)); + dirtied = __mark_sit_entry_dirty(sbi, segno); + + if (!dirtied) + add_sit_entry(segno, &SM_I(sbi)->sit_entry_set); } - return false; + update_sits_in_cursum(sum, -sits_in_cursum(sum)); } /* * CP calls this function, which flushes SIT entries including sit_journal, * and moves prefree segs to free segs. */ -void flush_sit_entries(struct f2fs_sb_info *sbi) +void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct sit_info *sit_i = SIT_I(sbi); unsigned long *bitmap = sit_i->dirty_sentries_bitmap; struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); struct f2fs_summary_block *sum = curseg->sum_blk; - unsigned long nsegs = TOTAL_SEGS(sbi); - struct page *page = NULL; - struct f2fs_sit_block *raw_sit = NULL; - unsigned int start = 0, end = 0; - unsigned int segno = -1; - bool flushed; + struct sit_entry_set *ses, *tmp; + struct list_head *head = &SM_I(sbi)->sit_entry_set; + bool to_journal = true; + struct seg_entry *se; mutex_lock(&curseg->curseg_mutex); mutex_lock(&sit_i->sentry_lock); /* - * "flushed" indicates whether sit entries in journal are flushed - * to the SIT area or not. + * add and account sit entries of dirty bitmap in sit entry + * set temporarily + */ + add_sits_in_set(sbi); + + /* + * if there are no enough space in journal to store dirty sit + * entries, remove all entries from journal and add and account + * them in sit entry set. */ - flushed = flush_sits_in_journal(sbi); + if (!__has_cursum_space(sum, sit_i->dirty_sentries, SIT_JOURNAL)) + remove_sits_in_journal(sbi); - while ((segno = find_next_bit(bitmap, nsegs, segno + 1)) < nsegs) { - struct seg_entry *se = get_seg_entry(sbi, segno); - int sit_offset, offset; + if (!sit_i->dirty_sentries) + goto out; - sit_offset = SIT_ENTRY_OFFSET(sit_i, segno); + /* + * there are two steps to flush sit entries: + * #1, flush sit entries to journal in current cold data summary block. + * #2, flush sit entries to sit page. 
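Step #1 versus #2 is decided per set, and the sets themselves are built by add_sits_in_set() so that each set covers exactly one on-disk SIT block: START_SEGNO() rounds a segment number down to its block's first entry, so every member of a set can be flushed with one page write (or, if the set is small enough, packed into the journal). A tiny sketch of that grouping — SIT_ENTRY_PER_BLOCK is hard-coded here for illustration; the real value derives from the block size and on-disk entry size:

```c
#include <stdio.h>

#define SIT_ENTRY_PER_BLOCK 55	/* illustrative constant */

/* START_SEGNO(): first segment covered by the SIT block holding segno */
static unsigned start_segno(unsigned segno)
{
	return (segno / SIT_ENTRY_PER_BLOCK) * SIT_ENTRY_PER_BLOCK;
}

int main(void)
{
	/* dirty segments scattered across the SIT area */
	unsigned dirty[] = { 3, 17, 54, 55, 56, 200 };
	unsigned n = sizeof(dirty) / sizeof(dirty[0]);

	/* group them the way add_sits_in_set() does: one set per SIT block */
	for (unsigned i = 0; i < n; i++)
		printf("segno %3u -> set starting at %3u\n",
		       dirty[i], start_segno(dirty[i]));
	/* 3, 17, 54 share the set at 0; 55, 56 the set at 55; 200 -> 165 */
	return 0;
}
```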
+ */ + list_for_each_entry_safe(ses, tmp, head, set_list) { + struct page *page; + struct f2fs_sit_block *raw_sit = NULL; + unsigned int start_segno = ses->start_segno; + unsigned int end = min(start_segno + SIT_ENTRY_PER_BLOCK, + (unsigned long)MAIN_SEGS(sbi)); + unsigned int segno = start_segno; + + if (to_journal && + !__has_cursum_space(sum, ses->entry_cnt, SIT_JOURNAL)) + to_journal = false; + + if (!to_journal) { + page = get_next_sit_page(sbi, start_segno); + raw_sit = page_address(page); + } - /* add discard candidates */ - if (SM_I(sbi)->nr_discards < SM_I(sbi)->max_discards) - add_discard_addrs(sbi, segno, se); + /* flush dirty sit entries in region of current sit set */ + for_each_set_bit_from(segno, bitmap, end) { + int offset, sit_offset; - if (flushed) - goto to_sit_page; + se = get_seg_entry(sbi, segno); - offset = lookup_journal_in_cursum(sum, SIT_JOURNAL, segno, 1); - if (offset >= 0) { - segno_in_journal(sum, offset) = cpu_to_le32(segno); - seg_info_to_raw_sit(se, &sit_in_journal(sum, offset)); - goto flush_done; - } -to_sit_page: - if (!page || (start > segno) || (segno > end)) { - if (page) { - f2fs_put_page(page, 1); - page = NULL; + /* add discard candidates */ + if (SM_I(sbi)->nr_discards < SM_I(sbi)->max_discards) { + cpc->trim_start = segno; + add_discard_addrs(sbi, cpc); } - start = START_SEGNO(sit_i, segno); - end = start + SIT_ENTRY_PER_BLOCK - 1; + if (to_journal) { + offset = lookup_journal_in_cursum(sum, + SIT_JOURNAL, segno, 1); + f2fs_bug_on(sbi, offset < 0); + segno_in_journal(sum, offset) = + cpu_to_le32(segno); + seg_info_to_raw_sit(se, + &sit_in_journal(sum, offset)); + } else { + sit_offset = SIT_ENTRY_OFFSET(sit_i, segno); + seg_info_to_raw_sit(se, + &raw_sit->entries[sit_offset]); + } - /* read sit block that will be updated */ - page = get_next_sit_page(sbi, start); - raw_sit = page_address(page); + __clear_bit(segno, bitmap); + sit_i->dirty_sentries--; + ses->entry_cnt--; } - /* udpate entry in SIT block */ - seg_info_to_raw_sit(se, &raw_sit->entries[sit_offset]); -flush_done: - __clear_bit(segno, bitmap); - sit_i->dirty_sentries--; + if (!to_journal) + f2fs_put_page(page, 1); + + f2fs_bug_on(sbi, ses->entry_cnt); + release_sit_entry_set(ses); + } + + f2fs_bug_on(sbi, !list_empty(head)); + f2fs_bug_on(sbi, sit_i->dirty_sentries); +out: + if (cpc->reason == CP_DISCARD) { + for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) + add_discard_addrs(sbi, cpc); } mutex_unlock(&sit_i->sentry_lock); mutex_unlock(&curseg->curseg_mutex); - /* writeout last modified SIT block */ - f2fs_put_page(page, 1); - set_prefree_as_free_segments(sbi); } @@ -1609,16 +1770,16 @@ static int build_sit_info(struct f2fs_sb_info *sbi) SM_I(sbi)->sit_info = sit_i; - sit_i->sentries = vzalloc(TOTAL_SEGS(sbi) * sizeof(struct seg_entry)); + sit_i->sentries = vzalloc(MAIN_SEGS(sbi) * sizeof(struct seg_entry)); if (!sit_i->sentries) return -ENOMEM; - bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi)); + bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); sit_i->dirty_sentries_bitmap = kzalloc(bitmap_size, GFP_KERNEL); if (!sit_i->dirty_sentries_bitmap) return -ENOMEM; - for (start = 0; start < TOTAL_SEGS(sbi); start++) { + for (start = 0; start < MAIN_SEGS(sbi); start++) { sit_i->sentries[start].cur_valid_map = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); sit_i->sentries[start].ckpt_valid_map @@ -1629,7 +1790,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) } if (sbi->segs_per_sec > 1) { - sit_i->sec_entries = vzalloc(TOTAL_SECS(sbi) * + sit_i->sec_entries = 
vzalloc(MAIN_SECS(sbi) * sizeof(struct sec_entry)); if (!sit_i->sec_entries) return -ENOMEM; @@ -1664,7 +1825,6 @@ static int build_sit_info(struct f2fs_sb_info *sbi) static int build_free_segmap(struct f2fs_sb_info *sbi) { - struct f2fs_sm_info *sm_info = SM_I(sbi); struct free_segmap_info *free_i; unsigned int bitmap_size, sec_bitmap_size; @@ -1675,12 +1835,12 @@ static int build_free_segmap(struct f2fs_sb_info *sbi) SM_I(sbi)->free_info = free_i; - bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi)); + bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); free_i->free_segmap = kmalloc(bitmap_size, GFP_KERNEL); if (!free_i->free_segmap) return -ENOMEM; - sec_bitmap_size = f2fs_bitmap_size(TOTAL_SECS(sbi)); + sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); free_i->free_secmap = kmalloc(sec_bitmap_size, GFP_KERNEL); if (!free_i->free_secmap) return -ENOMEM; @@ -1690,8 +1850,7 @@ static int build_free_segmap(struct f2fs_sb_info *sbi) memset(free_i->free_secmap, 0xff, sec_bitmap_size); /* init free segmap information */ - free_i->start_segno = - (unsigned int) GET_SEGNO_FROM_SEG0(sbi, sm_info->main_blkaddr); + free_i->start_segno = GET_SEGNO_FROM_SEG0(sbi, MAIN_BLKADDR(sbi)); free_i->free_segments = 0; free_i->free_sections = 0; rwlock_init(&free_i->segmap_lock); @@ -1703,7 +1862,7 @@ static int build_curseg(struct f2fs_sb_info *sbi) struct curseg_info *array; int i; - array = kzalloc(sizeof(*array) * NR_CURSEG_TYPE, GFP_KERNEL); + array = kcalloc(NR_CURSEG_TYPE, sizeof(*array), GFP_KERNEL); if (!array) return -ENOMEM; @@ -1728,7 +1887,7 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) int sit_blk_cnt = SIT_BLK_CNT(sbi); unsigned int i, start, end; unsigned int readed, start_blk = 0; - int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); + int nrpages = MAX_BIO_BLOCKS(sbi); do { readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT); @@ -1736,7 +1895,7 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) start = start_blk * sit_i->sents_per_block; end = (start_blk + readed) * sit_i->sents_per_block; - for (; start < end && start < TOTAL_SEGS(sbi); start++) { + for (; start < end && start < MAIN_SEGS(sbi); start++) { struct seg_entry *se = &sit_i->sentries[start]; struct f2fs_sit_block *sit_blk; struct f2fs_sit_entry sit; @@ -1774,7 +1933,7 @@ static void init_free_segmap(struct f2fs_sb_info *sbi) unsigned int start; int type; - for (start = 0; start < TOTAL_SEGS(sbi); start++) { + for (start = 0; start < MAIN_SEGS(sbi); start++) { struct seg_entry *sentry = get_seg_entry(sbi, start); if (!sentry->valid_blocks) __set_free(sbi, start); @@ -1791,18 +1950,22 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int segno = 0, offset = 0, total_segs = TOTAL_SEGS(sbi); + unsigned int segno = 0, offset = 0; unsigned short valid_blocks; while (1) { /* find dirty segment based on free segmap */ - segno = find_next_inuse(free_i, total_segs, offset); - if (segno >= total_segs) + segno = find_next_inuse(free_i, MAIN_SEGS(sbi), offset); + if (segno >= MAIN_SEGS(sbi)) break; offset = segno + 1; valid_blocks = get_valid_blocks(sbi, segno, 0); - if (valid_blocks >= sbi->blocks_per_seg || !valid_blocks) + if (valid_blocks == sbi->blocks_per_seg || !valid_blocks) + continue; + if (valid_blocks > sbi->blocks_per_seg) { + f2fs_bug_on(sbi, 1); continue; + } mutex_lock(&dirty_i->seglist_lock); __locate_dirty_segment(sbi, segno, DIRTY); mutex_unlock(&dirty_i->seglist_lock); @@ -1812,7 
+1975,7 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) static int init_victim_secmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - unsigned int bitmap_size = f2fs_bitmap_size(TOTAL_SECS(sbi)); + unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); dirty_i->victim_secmap = kzalloc(bitmap_size, GFP_KERNEL); if (!dirty_i->victim_secmap) @@ -1833,7 +1996,7 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi) SM_I(sbi)->dirty_info = dirty_i; mutex_init(&dirty_i->seglist_lock); - bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi)); + bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); for (i = 0; i < NR_DIRTY_TYPE; i++) { dirty_i->dirty_segmap[i] = kzalloc(bitmap_size, GFP_KERNEL); @@ -1857,7 +2020,7 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi) sit_i->min_mtime = LLONG_MAX; - for (segno = 0; segno < TOTAL_SEGS(sbi); segno += sbi->segs_per_sec) { + for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { unsigned int i; unsigned long long mtime = 0; @@ -1895,13 +2058,16 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); sm_info->rec_prefree_segments = sm_info->main_segments * DEF_RECLAIM_PREFREE_SEGMENTS / 100; - sm_info->ipu_policy = F2FS_IPU_DISABLE; + sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC; sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; + sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; INIT_LIST_HEAD(&sm_info->discard_list); sm_info->nr_discards = 0; sm_info->max_discards = 0; + INIT_LIST_HEAD(&sm_info->sit_entry_set); + if (test_opt(sbi, FLUSH_MERGE) && !f2fs_readonly(sbi->sb)) { err = create_flush_cmd_control(sbi); if (err) @@ -1997,7 +2163,7 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi) return; if (sit_i->sentries) { - for (start = 0; start < TOTAL_SEGS(sbi); start++) { + for (start = 0; start < MAIN_SEGS(sbi); start++) { kfree(sit_i->sentries[start].cur_valid_map); kfree(sit_i->sentries[start].ckpt_valid_map); } @@ -2031,11 +2197,30 @@ int __init create_segment_manager_caches(void) discard_entry_slab = f2fs_kmem_cache_create("discard_entry", sizeof(struct discard_entry)); if (!discard_entry_slab) - return -ENOMEM; + goto fail; + + sit_entry_set_slab = f2fs_kmem_cache_create("sit_entry_set", + sizeof(struct nat_entry_set)); + if (!sit_entry_set_slab) + goto destory_discard_entry; + + inmem_entry_slab = f2fs_kmem_cache_create("inmem_page_entry", + sizeof(struct inmem_pages)); + if (!inmem_entry_slab) + goto destroy_sit_entry_set; return 0; + +destroy_sit_entry_set: + kmem_cache_destroy(sit_entry_set_slab); +destory_discard_entry: + kmem_cache_destroy(discard_entry_slab); +fail: + return -ENOMEM; } void destroy_segment_manager_caches(void) { + kmem_cache_destroy(sit_entry_set_slab); kmem_cache_destroy(discard_entry_slab); + kmem_cache_destroy(inmem_entry_slab); } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 7091204680f4..2495bec1c621 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -45,16 +45,26 @@ (secno == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ sbi->segs_per_sec)) \ -#define START_BLOCK(sbi, segno) \ - (SM_I(sbi)->seg0_blkaddr + \ +#define MAIN_BLKADDR(sbi) (SM_I(sbi)->main_blkaddr) +#define SEG0_BLKADDR(sbi) (SM_I(sbi)->seg0_blkaddr) + +#define MAIN_SEGS(sbi) (SM_I(sbi)->main_segments) +#define MAIN_SECS(sbi) (sbi->total_sections) + +#define TOTAL_SEGS(sbi) (SM_I(sbi)->segment_count) +#define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << sbi->log_blocks_per_seg) + +#define MAX_BLKADDR(sbi) (SEG0_BLKADDR(sbi) + 
TOTAL_BLKS(sbi)) +#define SEGMENT_SIZE(sbi) (1ULL << (sbi->log_blocksize + \ + sbi->log_blocks_per_seg)) + +#define START_BLOCK(sbi, segno) (SEG0_BLKADDR(sbi) + \ (GET_R2L_SEGNO(FREE_I(sbi), segno) << sbi->log_blocks_per_seg)) + #define NEXT_FREE_BLKADDR(sbi, curseg) \ (START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff) -#define MAIN_BASE_BLOCK(sbi) (SM_I(sbi)->main_blkaddr) - -#define GET_SEGOFF_FROM_SEG0(sbi, blk_addr) \ - ((blk_addr) - SM_I(sbi)->seg0_blkaddr) +#define GET_SEGOFF_FROM_SEG0(sbi, blk_addr) ((blk_addr) - SEG0_BLKADDR(sbi)) #define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \ (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg) #define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \ @@ -77,23 +87,21 @@ #define SIT_ENTRY_OFFSET(sit_i, segno) \ (segno % sit_i->sents_per_block) -#define SIT_BLOCK_OFFSET(sit_i, segno) \ +#define SIT_BLOCK_OFFSET(segno) \ (segno / SIT_ENTRY_PER_BLOCK) -#define START_SEGNO(sit_i, segno) \ - (SIT_BLOCK_OFFSET(sit_i, segno) * SIT_ENTRY_PER_BLOCK) +#define START_SEGNO(segno) \ + (SIT_BLOCK_OFFSET(segno) * SIT_ENTRY_PER_BLOCK) #define SIT_BLK_CNT(sbi) \ - ((TOTAL_SEGS(sbi) + SIT_ENTRY_PER_BLOCK - 1) / SIT_ENTRY_PER_BLOCK) + ((MAIN_SEGS(sbi) + SIT_ENTRY_PER_BLOCK - 1) / SIT_ENTRY_PER_BLOCK) #define f2fs_bitmap_size(nr) \ (BITS_TO_LONGS(nr) * sizeof(unsigned long)) -#define TOTAL_SEGS(sbi) (SM_I(sbi)->main_segments) -#define TOTAL_SECS(sbi) (sbi->total_sections) -#define SECTOR_FROM_BLOCK(sbi, blk_addr) \ - (((sector_t)blk_addr) << (sbi)->log_sectors_per_block) -#define SECTOR_TO_BLOCK(sbi, sectors) \ - (sectors >> (sbi)->log_sectors_per_block) -#define MAX_BIO_BLOCKS(max_hw_blocks) \ - (min((int)max_hw_blocks, BIO_MAX_PAGES)) +#define SECTOR_FROM_BLOCK(blk_addr) \ + (((sector_t)blk_addr) << F2FS_LOG_SECTORS_PER_BLOCK) +#define SECTOR_TO_BLOCK(sectors) \ + (sectors >> F2FS_LOG_SECTORS_PER_BLOCK) +#define MAX_BIO_BLOCKS(sbi) \ + ((int)min((int)max_hw_blocks(sbi), BIO_MAX_PAGES)) /* * indicate a block allocation direction: RIGHT and LEFT. 
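The reworked segment.h macros above reduce all address translation to shifts against a couple of anchors (SEG0_BLKADDR, MAIN_BLKADDR), with SECTOR_FROM_BLOCK/SECTOR_TO_BLOCK now keyed off a fixed log2 sectors-per-block instead of a per-superblock field. A standalone sketch of the arithmetic with invented geometry — the constants below are sample values, and START_BLOCK() is simplified by dropping the R2L segment remapping:

```c
#include <stdio.h>

typedef unsigned long long block_t;
typedef unsigned long long sector_t;

/* made-up geometry: 4KiB blocks, 512B sectors, 512 blocks per segment */
#define LOG_SECTORS_PER_BLOCK	3	/* 4096 / 512 = 8 sectors per block */
#define LOG_BLOCKS_PER_SEG	9	/* 512 blocks per segment */

#define SEG0_BLKADDR	1024ULL		/* pretend start of segment 0 */
#define TOTAL_SEGS	4096ULL

#define TOTAL_BLKS	(TOTAL_SEGS << LOG_BLOCKS_PER_SEG)
#define MAX_BLKADDR	(SEG0_BLKADDR + TOTAL_BLKS)

#define START_BLOCK(segno)	(SEG0_BLKADDR + \
				 ((block_t)(segno) << LOG_BLOCKS_PER_SEG))
#define SECTOR_FROM_BLOCK(blk)	((sector_t)(blk) << LOG_SECTORS_PER_BLOCK)
#define SECTOR_TO_BLOCK(sec)	((sec) >> LOG_SECTORS_PER_BLOCK)

int main(void)
{
	block_t blk = START_BLOCK(3);

	printf("segment 3 starts at block %llu\n", blk);	/* 2560 */
	printf("... which is sector %llu\n", SECTOR_FROM_BLOCK(blk)); /* 20480 */
	printf("valid block range: [%llu, %llu)\n",
	       SEG0_BLKADDR, MAX_BLKADDR);
	printf("round trip: %llu\n", SECTOR_TO_BLOCK(SECTOR_FROM_BLOCK(blk)));
	return 0;
}
```

Everything being a power-of-two shift is also what makes checks like the new verify_block_addr() bounds test (SEG0_BLKADDR <= addr < MAX_BLKADDR) cheap enough to run on every access.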
@@ -167,6 +175,11 @@ struct segment_allocation { void (*allocate_segment)(struct f2fs_sb_info *, int, bool); }; +struct inmem_pages { + struct list_head list; + struct page *page; +}; + struct sit_info { const struct segment_allocation *s_ops; @@ -237,6 +250,12 @@ struct curseg_info { unsigned int next_segno; /* preallocated segment */ }; +struct sit_entry_set { + struct list_head set_list; /* link with all sit sets */ + unsigned int start_segno; /* start segno of sits in set */ + unsigned int entry_cnt; /* the # of sit entries in set */ +}; + /* * inline functions */ @@ -316,7 +335,7 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno) clear_bit(segno, free_i->free_segmap); free_i->free_segments++; - next = find_next_bit(free_i->free_segmap, TOTAL_SEGS(sbi), start_segno); + next = find_next_bit(free_i->free_segmap, MAIN_SEGS(sbi), start_segno); if (next >= start_segno + sbi->segs_per_sec) { clear_bit(secno, free_i->free_secmap); free_i->free_sections++; @@ -347,8 +366,8 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi, if (test_and_clear_bit(segno, free_i->free_segmap)) { free_i->free_segments++; - next = find_next_bit(free_i->free_segmap, TOTAL_SEGS(sbi), - start_segno); + next = find_next_bit(free_i->free_segmap, + start_segno + sbi->segs_per_sec, start_segno); if (next >= start_segno + sbi->segs_per_sec) { if (test_and_clear_bit(secno, free_i->free_secmap)) free_i->free_sections++; @@ -430,8 +449,10 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi) static inline bool need_SSR(struct f2fs_sb_info *sbi) { - return (prefree_segments(sbi) / sbi->segs_per_sec) - + free_sections(sbi) < overprovision_sections(sbi); + int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); + int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); + return free_sections(sbi) <= (node_secs + 2 * dent_secs + + reserved_sections(sbi) + 1); } static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) @@ -466,44 +487,47 @@ static inline int utilization(struct f2fs_sb_info *sbi) * F2FS_IPU_UTIL - if FS utilization is over threashold, * F2FS_IPU_SSR_UTIL - if SSR mode is activated and FS utilization is over * threashold, + * F2FS_IPU_FSYNC - activated in fsync path only for high performance flash + * storages. IPU will be triggered only if the # of dirty + * pages over min_fsync_blocks. * F2FS_IPUT_DISABLE - disable IPU. 
(=default option) */ #define DEF_MIN_IPU_UTIL 70 +#define DEF_MIN_FSYNC_BLOCKS 8 enum { F2FS_IPU_FORCE, F2FS_IPU_SSR, F2FS_IPU_UTIL, F2FS_IPU_SSR_UTIL, - F2FS_IPU_DISABLE, + F2FS_IPU_FSYNC, }; static inline bool need_inplace_update(struct inode *inode) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + unsigned int policy = SM_I(sbi)->ipu_policy; /* IPU can be done only for the user data */ - if (S_ISDIR(inode->i_mode)) + if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode)) return false; - switch (SM_I(sbi)->ipu_policy) { - case F2FS_IPU_FORCE: + if (policy & (0x1 << F2FS_IPU_FORCE)) return true; - case F2FS_IPU_SSR: - if (need_SSR(sbi)) - return true; - break; - case F2FS_IPU_UTIL: - if (utilization(sbi) > SM_I(sbi)->min_ipu_util) - return true; - break; - case F2FS_IPU_SSR_UTIL: - if (need_SSR(sbi) && utilization(sbi) > SM_I(sbi)->min_ipu_util) - return true; - break; - case F2FS_IPU_DISABLE: - break; - } + if (policy & (0x1 << F2FS_IPU_SSR) && need_SSR(sbi)) + return true; + if (policy & (0x1 << F2FS_IPU_UTIL) && + utilization(sbi) > SM_I(sbi)->min_ipu_util) + return true; + if (policy & (0x1 << F2FS_IPU_SSR_UTIL) && need_SSR(sbi) && + utilization(sbi) > SM_I(sbi)->min_ipu_util) + return true; + + /* this is only set during fdatasync */ + if (policy & (0x1 << F2FS_IPU_FSYNC) && + is_inode_flag_set(F2FS_I(inode), FI_NEED_IPU)) + return true; + return false; } @@ -530,28 +554,21 @@ static inline unsigned short curseg_blkoff(struct f2fs_sb_info *sbi, int type) #ifdef CONFIG_F2FS_CHECK_FS static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno) { - unsigned int end_segno = SM_I(sbi)->segment_count - 1; - BUG_ON(segno > end_segno); + BUG_ON(segno > TOTAL_SEGS(sbi) - 1); } static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr) { - struct f2fs_sm_info *sm_info = SM_I(sbi); - block_t total_blks = sm_info->segment_count << sbi->log_blocks_per_seg; - block_t start_addr = sm_info->seg0_blkaddr; - block_t end_addr = start_addr + total_blks - 1; - BUG_ON(blk_addr < start_addr); - BUG_ON(blk_addr > end_addr); + BUG_ON(blk_addr < SEG0_BLKADDR(sbi)); + BUG_ON(blk_addr >= MAX_BLKADDR(sbi)); } /* - * Summary block is always treated as invalid block + * Summary block is always treated as an invalid block */ static inline void check_block_count(struct f2fs_sb_info *sbi, int segno, struct f2fs_sit_entry *raw_sit) { - struct f2fs_sm_info *sm_info = SM_I(sbi); - unsigned int end_segno = sm_info->segment_count - 1; bool is_valid = test_bit_le(0, raw_sit->valid_map) ? 
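The need_inplace_update() rewrite above is worth a second look: ipu_policy changes from a single enum value selected with a switch into a bitmask, so several IPU triggers can be active at once and "disabled" is simply a policy of zero (which is why F2FS_IPU_DISABLE leaves the enum). A compilable sketch of the pattern, with invented predicates standing in for need_SSR() and utilization():

#include <stdbool.h>
#include <stdio.h>

enum { POL_FORCE, POL_SSR, POL_UTIL };  /* bit positions, mirroring the enum above */

/* Illustrative stand-ins for the real predicates. */
static bool need_ssr(void)    { return true; }
static int  utilization(void) { return 85;   }

static bool need_inplace_update(unsigned policy, int min_util)
{
    if (policy & (1u << POL_FORCE))
        return true;
    if ((policy & (1u << POL_SSR)) && need_ssr())
        return true;
    if ((policy & (1u << POL_UTIL)) && utilization() > min_util)
        return true;
    return false;  /* policy == 0 means IPU disabled */
}

int main(void)
{
    /* Two triggers at once: the old switch could not express this. */
    unsigned policy = (1u << POL_SSR) | (1u << POL_UTIL);
    printf("IPU? %d\n", need_inplace_update(policy, 70));
    return 0;
}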
true : false; int valid_blocks = 0; int cur_pos = 0, next_pos; @@ -560,7 +577,7 @@ static inline void check_block_count(struct f2fs_sb_info *sbi, BUG_ON(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg); /* check boundary of a given segment number */ - BUG_ON(segno > end_segno); + BUG_ON(segno > TOTAL_SEGS(sbi) - 1); /* check bitmap with valid block count */ do { @@ -579,16 +596,39 @@ static inline void check_block_count(struct f2fs_sb_info *sbi, BUG_ON(GET_SIT_VBLOCKS(raw_sit) != valid_blocks); } #else -#define check_seg_range(sbi, segno) -#define verify_block_addr(sbi, blk_addr) -#define check_block_count(sbi, segno, raw_sit) +static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno) +{ + if (segno > TOTAL_SEGS(sbi) - 1) + sbi->need_fsck = true; +} + +static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr) +{ + if (blk_addr < SEG0_BLKADDR(sbi) || blk_addr >= MAX_BLKADDR(sbi)) + sbi->need_fsck = true; +} + +/* + * Summary block is always treated as an invalid block + */ +static inline void check_block_count(struct f2fs_sb_info *sbi, + int segno, struct f2fs_sit_entry *raw_sit) +{ + /* check segment usage */ + if (GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg) + sbi->need_fsck = true; + + /* check boundary of a given segment number */ + if (segno > TOTAL_SEGS(sbi) - 1) + sbi->need_fsck = true; +} #endif static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi, unsigned int start) { struct sit_info *sit_i = SIT_I(sbi); - unsigned int offset = SIT_BLOCK_OFFSET(sit_i, start); + unsigned int offset = SIT_BLOCK_OFFSET(start); block_t blk_addr = sit_i->sit_base_addr + offset; check_seg_range(sbi, start); @@ -615,7 +655,7 @@ static inline pgoff_t next_sit_addr(struct f2fs_sb_info *sbi, static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start) { - unsigned int block_off = SIT_BLOCK_OFFSET(sit_i, start); + unsigned int block_off = SIT_BLOCK_OFFSET(start); if (f2fs_test_bit(block_off, sit_i->sit_bitmap)) f2fs_clear_bit(block_off, sit_i->sit_bitmap); @@ -662,7 +702,7 @@ static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi) { struct block_device *bdev = sbi->sb->s_bdev; struct request_queue *q = bdev_get_queue(bdev); - return SECTOR_TO_BLOCK(sbi, queue_max_sectors(q)); + return SECTOR_TO_BLOCK(queue_max_sectors(q)); } /* @@ -679,7 +719,7 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) else if (type == NODE) return 3 * sbi->blocks_per_seg; else if (type == META) - return MAX_BIO_BLOCKS(max_hw_blocks(sbi)); + return MAX_BIO_BLOCKS(sbi); else return 0; } @@ -702,7 +742,7 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, else if (type == NODE) desired = 3 * max_hw_blocks(sbi); else - desired = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); + desired = MAX_BIO_BLOCKS(sbi); wbc->nr_to_write = desired; return desired - nr_to_write; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8f96d9372ade..41d6f700f4ee 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -52,6 +52,7 @@ enum { Opt_inline_xattr, Opt_inline_data, Opt_flush_merge, + Opt_nobarrier, Opt_err, }; @@ -69,6 +70,7 @@ static match_table_t f2fs_tokens = { {Opt_inline_xattr, "inline_xattr"}, {Opt_inline_data, "inline_data"}, {Opt_flush_merge, "flush_merge"}, + {Opt_nobarrier, "nobarrier"}, {Opt_err, NULL}, }; @@ -188,6 +190,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, 
ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); @@ -202,6 +205,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(max_small_discards), ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), + ATTR_LIST(min_fsync_blocks), ATTR_LIST(max_victim_search), ATTR_LIST(dir_level), ATTR_LIST(ram_thresh), @@ -339,6 +343,9 @@ static int parse_options(struct super_block *sb, char *options) case Opt_flush_merge: set_opt(sbi, FLUSH_MERGE); break; + case Opt_nobarrier: + set_opt(sbi, NOBARRIER); + break; default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -361,11 +368,13 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) /* Initialize f2fs-specific inode info */ fi->vfs_inode.i_version = 1; - atomic_set(&fi->dirty_dents, 0); + atomic_set(&fi->dirty_pages, 0); fi->i_current_depth = 1; fi->i_advise = 0; rwlock_init(&fi->ext.ext_lock); init_rwsem(&fi->i_sem); + INIT_LIST_HEAD(&fi->inmem_pages); + mutex_init(&fi->inmem_lock); set_inode_flag(fi, FI_NEW_INODE); @@ -427,8 +436,19 @@ static void f2fs_put_super(struct super_block *sb) stop_gc_thread(sbi); /* We don't need to do checkpoint when it's clean */ - if (sbi->s_dirty && get_pages(sbi, F2FS_DIRTY_NODES)) - write_checkpoint(sbi, true); + if (sbi->s_dirty) { + struct cp_control cpc = { + .reason = CP_UMOUNT, + }; + write_checkpoint(sbi, &cpc); + } + + /* + * normally superblock is clean, so we need to release this. + * In addition, EIO will skip do checkpoint, we need this as well. 
+ */ + release_dirty_inode(sbi); + release_discard_addrs(sbi); iput(sbi->node_inode); iput(sbi->meta_inode); @@ -452,12 +472,12 @@ int f2fs_sync_fs(struct super_block *sb, int sync) trace_f2fs_sync_fs(sb, sync); - if (!sbi->s_dirty && !get_pages(sbi, F2FS_DIRTY_NODES)) - return 0; - if (sync) { + struct cp_control cpc = { + .reason = CP_SYNC, + }; mutex_lock(&sbi->gc_mutex); - write_checkpoint(sbi, false); + write_checkpoint(sbi, &cpc); mutex_unlock(&sbi->gc_mutex); } else { f2fs_balance_fs(sbi); @@ -500,8 +520,8 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bfree = buf->f_blocks - valid_user_blocks(sbi) - ovp_count; buf->f_bavail = user_block_count - valid_user_blocks(sbi); - buf->f_files = sbi->total_node_count; - buf->f_ffree = sbi->total_node_count - valid_inode_count(sbi); + buf->f_files = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; + buf->f_ffree = buf->f_files - valid_inode_count(sbi); buf->f_namelen = F2FS_NAME_LEN; buf->f_fsid.val[0] = (u32)id; @@ -544,6 +564,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",inline_data"); if (!f2fs_readonly(sbi->sb) && test_opt(sbi, FLUSH_MERGE)) seq_puts(seq, ",flush_merge"); + if (test_opt(sbi, NOBARRIER)) + seq_puts(seq, ",nobarrier"); seq_printf(seq, ",active_logs=%u", sbi->active_logs); return 0; @@ -606,6 +628,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) org_mount_opt = sbi->mount_opt; active_logs = sbi->active_logs; + sbi->mount_opt.opt = 0; + sbi->active_logs = NR_CURSEG_TYPE; + /* parse mount options */ err = parse_options(sb, data); if (err) @@ -615,7 +640,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * Previous and new state of filesystem is RO, * so skip checking GC and FLUSH_MERGE conditions. 
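Threaded through the f2fs_put_super()/f2fs_sync_fs() hunks is an API shift: write_checkpoint() no longer takes a bare bool but a struct cp_control carrying a reason code (CP_UMOUNT here, CP_SYNC in the sync path). Trading a boolean parameter for a small control struct makes call sites self-describing and leaves room for new fields without another signature change. A hedged userspace sketch of the idea; the names mirror the hunks, and the remark about future fields is an assumption:

#include <stdio.h>

enum cp_reason { CP_UMOUNT, CP_SYNC };  /* reason codes modeled on the hunks */

struct cp_control {
    enum cp_reason reason;
    /* more fields (e.g. a range to trim) could be added here later
     * without touching every write_checkpoint() caller again */
};

static void write_checkpoint(struct cp_control *cpc)
{
    printf("checkpoint, reason=%s\n",
           cpc->reason == CP_UMOUNT ? "umount" : "sync");
}

int main(void)
{
    struct cp_control cpc = { .reason = CP_SYNC };
    write_checkpoint(&cpc);  /* reads better than write_checkpoint(sbi, false) */
    return 0;
}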
*/ - if ((sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) + if (f2fs_readonly(sb) && (*flags & MS_RDONLY)) goto skip; /* @@ -642,8 +667,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) */ if ((*flags & MS_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { destroy_flush_cmd_control(sbi); - } else if (test_opt(sbi, FLUSH_MERGE) && - !sbi->sm_info->cmd_control_info) { + } else if (test_opt(sbi, FLUSH_MERGE) && !SM_I(sbi)->cmd_control_info) { err = create_flush_cmd_control(sbi); if (err) goto restore_gc; @@ -657,7 +681,7 @@ restore_gc: if (need_restart_gc) { if (start_gc_thread(sbi)) f2fs_msg(sbi->sb, KERN_WARNING, - "background gc thread is stop"); + "background gc thread has stopped"); } else if (need_stop_gc) { stop_gc_thread(sbi); } @@ -777,14 +801,22 @@ static int sanity_check_raw_super(struct super_block *sb, return 1; } - if (le32_to_cpu(raw_super->log_sectorsize) != - F2FS_LOG_SECTOR_SIZE) { - f2fs_msg(sb, KERN_INFO, "Invalid log sectorsize"); + /* Currently, support 512/1024/2048/4096 bytes sector size */ + if (le32_to_cpu(raw_super->log_sectorsize) > + F2FS_MAX_LOG_SECTOR_SIZE || + le32_to_cpu(raw_super->log_sectorsize) < + F2FS_MIN_LOG_SECTOR_SIZE) { + f2fs_msg(sb, KERN_INFO, "Invalid log sectorsize (%u)", + le32_to_cpu(raw_super->log_sectorsize)); return 1; } - if (le32_to_cpu(raw_super->log_sectors_per_block) != - F2FS_LOG_SECTORS_PER_BLOCK) { - f2fs_msg(sb, KERN_INFO, "Invalid log sectors per block"); + if (le32_to_cpu(raw_super->log_sectors_per_block) + + le32_to_cpu(raw_super->log_sectorsize) != + F2FS_MAX_LOG_SECTOR_SIZE) { + f2fs_msg(sb, KERN_INFO, + "Invalid log sectors per block(%u) log sectorsize(%u)", + le32_to_cpu(raw_super->log_sectors_per_block), + le32_to_cpu(raw_super->log_sectorsize)); return 1; } return 0; @@ -806,7 +838,7 @@ static int sanity_check_ckpt(struct f2fs_sb_info *sbi) if (unlikely(fsmeta >= total)) return 1; - if (unlikely(is_set_ckpt_flags(ckpt, CP_ERROR_FLAG))) { + if (unlikely(f2fs_cp_error(sbi))) { f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); return 1; } @@ -840,6 +872,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) atomic_set(&sbi->nr_pages[i], 0); sbi->dir_level = DEF_DIR_LEVEL; + sbi->need_fsck = false; } /* @@ -893,8 +926,10 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) struct buffer_head *raw_super_buf; struct inode *root; long err = -EINVAL; + bool retry = true; int i; +try_onemore: /* allocate memory for f2fs-specific super block info */ sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL); if (!sbi) @@ -947,7 +982,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) mutex_init(&sbi->gc_mutex); mutex_init(&sbi->writepages); mutex_init(&sbi->cp_mutex); - mutex_init(&sbi->node_write); + init_rwsem(&sbi->node_write); sbi->por_doing = false; spin_lock_init(&sbi->stat_lock); @@ -997,7 +1032,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) INIT_LIST_HEAD(&sbi->dir_inode_list); spin_lock_init(&sbi->dir_inode_lock); - init_orphan_info(sbi); + init_ino_entry_info(sbi); /* setup f2fs internal modules */ err = build_segment_manager(sbi); @@ -1034,8 +1069,9 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) goto free_node_inode; } if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { + iput(root); err = -EINVAL; - goto free_root_inode; + goto free_node_inode; } sb->s_root = d_make_root(root); /* allocate root dentry */ @@ -1070,19 +1106,24 @@ static int f2fs_fill_super(struct 
super_block *sb, void *data, int silent) if (err) goto free_proc; + if (!retry) + sbi->need_fsck = true; + /* recover fsynced data */ if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { err = recover_fsync_data(sbi); - if (err) + if (err) { f2fs_msg(sb, KERN_ERR, "Cannot recover all fsync data errno=%ld", err); + goto free_kobj; + } } /* * If filesystem is not mounted as read-only then * do start the gc_thread. */ - if (!(sb->s_flags & MS_RDONLY)) { + if (!f2fs_readonly(sb)) { /* After POR, we can run background GC thread.*/ err = start_gc_thread(sbi); if (err) @@ -1116,6 +1157,13 @@ free_sb_buf: brelse(raw_super_buf); free_sbi: kfree(sbi); + + /* give only one another chance */ + if (retry) { + retry = 0; + shrink_dcache_sb(sb); + goto try_onemore; + } return err; } diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 8bea941ee309..deca8728117b 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -266,7 +266,7 @@ static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int index, static void *read_all_xattrs(struct inode *inode, struct page *ipage) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_xattr_header *header; size_t size = PAGE_SIZE, inline_size = 0; void *txattr_addr; @@ -325,7 +325,7 @@ fail: static inline int write_all_xattrs(struct inode *inode, __u32 hsize, void *txattr_addr, struct page *ipage) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); size_t inline_size = 0; void *xattr_addr; struct page *xpage; @@ -373,7 +373,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, alloc_nid_failed(sbi, new_nid); return PTR_ERR(xpage); } - f2fs_bug_on(new_nid); + f2fs_bug_on(sbi, new_nid); f2fs_wait_on_page_writeback(xpage, NODE); } else { struct dnode_of_data dn; @@ -528,7 +528,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, int free; /* * If value is NULL, it is remove operation. - * In case of update operation, we caculate free. + * In case of update operation, we calculate free. 
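The f2fs_fill_super() changes above add a retry-once pattern: every failure path funnels to common cleanup, and if this was the first attempt the function clears retry, shrinks the dcache, and jumps back to try_onemore, with need_fsck set so the second pass behaves more conservatively. A minimal sketch of that control flow, with the actual mount work stubbed out:

#include <stdbool.h>
#include <stdio.h>

static int attempts;

static int do_fill_super(bool conservative)
{
    /* Stub: the first, optimistic attempt hits a transient error. */
    printf("attempt %d (%s)\n", ++attempts,
           conservative ? "conservative" : "optimistic");
    return attempts == 1 ? -1 : 0;
}

static int fill_super(void)
{
    bool retry = true;
    int err;
try_onemore:
    err = do_fill_super(!retry);  /* second pass runs in conservative mode */
    if (err) {
        /* ...teardown of everything allocated so far would go here... */
        if (retry) {
            retry = false;        /* give only one more chance */
            goto try_onemore;
        }
    }
    return err;
}

int main(void)
{
    int err = fill_super();
    printf("mount -> %d after %d attempt(s)\n", err, attempts);
    return 0;
}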
*/ free = MIN_OFFSET(inode) - ((char *)last - (char *)base_addr); if (found) @@ -596,7 +596,7 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, const void *value, size_t size, struct page *ipage, int flags) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int err; /* this case is only from init_inode_metadata */ diff --git a/fs/fcntl.c b/fs/fcntl.c index 72c82f69b01b..99d440a4a6ba 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -21,6 +21,7 @@ #include <linux/rcupdate.h> #include <linux/pid_namespace.h> #include <linux/user_namespace.h> +#include <linux/shmem_fs.h> #include <asm/poll.h> #include <asm/siginfo.h> @@ -97,26 +98,19 @@ static void f_modown(struct file *filp, struct pid *pid, enum pid_type type, write_unlock_irq(&filp->f_owner.lock); } -int __f_setown(struct file *filp, struct pid *pid, enum pid_type type, +void __f_setown(struct file *filp, struct pid *pid, enum pid_type type, int force) { - int err; - - err = security_file_set_fowner(filp); - if (err) - return err; - + security_file_set_fowner(filp); f_modown(filp, pid, type, force); - return 0; } EXPORT_SYMBOL(__f_setown); -int f_setown(struct file *filp, unsigned long arg, int force) +void f_setown(struct file *filp, unsigned long arg, int force) { enum pid_type type; struct pid *pid; int who = arg; - int result; type = PIDTYPE_PID; if (who < 0) { type = PIDTYPE_PGID; @@ -124,9 +118,8 @@ int f_setown(struct file *filp, unsigned long arg, int force) } rcu_read_lock(); pid = find_vpid(who); - result = __f_setown(filp, pid, type, force); + __f_setown(filp, pid, type, force); rcu_read_unlock(); - return result; } EXPORT_SYMBOL(f_setown); @@ -180,7 +173,7 @@ static int f_setown_ex(struct file *filp, unsigned long arg) if (owner.pid && !pid) ret = -ESRCH; else - ret = __f_setown(filp, pid, type, 1); + __f_setown(filp, pid, type, 1); rcu_read_unlock(); return ret; @@ -301,7 +294,8 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, force_successful_syscall_return(); break; case F_SETOWN: - err = f_setown(filp, arg, 1); + f_setown(filp, arg, 1); + err = 0; break; case F_GETOWN_EX: err = f_getown_ex(filp, arg); @@ -336,6 +330,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, case F_GETPIPE_SZ: err = pipe_fcntl(filp, cmd, arg); break; + case F_ADD_SEALS: + case F_GET_SEALS: + err = shmem_fcntl(filp, cmd, arg); + break; default: break; } diff --git a/fs/file_table.c b/fs/file_table.c index 385bfd31512a..0bab12b20460 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -331,5 +331,5 @@ void __init files_init(unsigned long mempages) n = (mempages * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = max_t(unsigned long, n, NR_FILE); - percpu_counter_init(&nr_files, 0); + percpu_counter_init(&nr_files, 0, GFP_KERNEL); } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index be568b7311d6..ef9bef118342 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -342,7 +342,8 @@ static void __inode_wait_for_writeback(struct inode *inode) wqh = bit_waitqueue(&inode->i_state, __I_SYNC); while (inode->i_state & I_SYNC) { spin_unlock(&inode->i_lock); - __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); + __wait_on_bit(wqh, &wq, bit_wait, + TASK_UNINTERRUPTIBLE); spin_lock(&inode->i_lock); } } diff --git a/fs/fs_pin.c b/fs/fs_pin.c new file mode 100644 index 000000000000..9368236ca100 --- /dev/null +++ b/fs/fs_pin.c @@ -0,0 +1,78 @@ +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/fs_pin.h> +#include "internal.h" 
+#include "mount.h" + +static void pin_free_rcu(struct rcu_head *head) +{ + kfree(container_of(head, struct fs_pin, rcu)); +} + +static DEFINE_SPINLOCK(pin_lock); + +void pin_put(struct fs_pin *p) +{ + if (atomic_long_dec_and_test(&p->count)) + call_rcu(&p->rcu, pin_free_rcu); +} + +void pin_remove(struct fs_pin *pin) +{ + spin_lock(&pin_lock); + hlist_del(&pin->m_list); + hlist_del(&pin->s_list); + spin_unlock(&pin_lock); +} + +void pin_insert(struct fs_pin *pin, struct vfsmount *m) +{ + spin_lock(&pin_lock); + hlist_add_head(&pin->s_list, &m->mnt_sb->s_pins); + hlist_add_head(&pin->m_list, &real_mount(m)->mnt_pins); + spin_unlock(&pin_lock); +} + +void mnt_pin_kill(struct mount *m) +{ + while (1) { + struct hlist_node *p; + struct fs_pin *pin; + rcu_read_lock(); + p = ACCESS_ONCE(m->mnt_pins.first); + if (!p) { + rcu_read_unlock(); + break; + } + pin = hlist_entry(p, struct fs_pin, m_list); + if (!atomic_long_inc_not_zero(&pin->count)) { + rcu_read_unlock(); + cpu_relax(); + continue; + } + rcu_read_unlock(); + pin->kill(pin); + } +} + +void sb_pin_kill(struct super_block *sb) +{ + while (1) { + struct hlist_node *p; + struct fs_pin *pin; + rcu_read_lock(); + p = ACCESS_ONCE(sb->s_pins.first); + if (!p) { + rcu_read_unlock(); + break; + } + pin = hlist_entry(p, struct fs_pin, s_list); + if (!atomic_long_inc_not_zero(&pin->count)) { + rcu_read_unlock(); + cpu_relax(); + continue; + } + rcu_read_unlock(); + pin->kill(pin); + } +} diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index aec01be91b0a..89acec742e0b 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -160,7 +160,7 @@ void __fscache_enable_cookie(struct fscache_cookie *cookie, _enter("%p", cookie); wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK, - fscache_wait_bit, TASK_UNINTERRUPTIBLE); + TASK_UNINTERRUPTIBLE); if (test_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags)) goto out_unlock; @@ -255,7 +255,7 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie) if (!fscache_defer_lookup) { _debug("non-deferred lookup %p", &cookie->flags); wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP, - fscache_wait_bit, TASK_UNINTERRUPTIBLE); + TASK_UNINTERRUPTIBLE); _debug("complete"); if (test_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags)) goto unavailable; @@ -463,7 +463,6 @@ void __fscache_wait_on_invalidate(struct fscache_cookie *cookie) _enter("%p", cookie); wait_on_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING, - fscache_wait_bit_interruptible, TASK_UNINTERRUPTIBLE); _leave(""); @@ -525,7 +524,7 @@ void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate) } wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK, - fscache_wait_bit, TASK_UNINTERRUPTIBLE); + TASK_UNINTERRUPTIBLE); if (!test_and_clear_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags)) goto out_unlock_enable; diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index bc6c08fcfddd..7872a62ef30c 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -97,8 +97,6 @@ static inline bool fscache_object_congested(void) return workqueue_congested(WORK_CPU_UNBOUND, fscache_object_wq); } -extern int fscache_wait_bit(void *); -extern int fscache_wait_bit_interruptible(void *); extern int fscache_wait_atomic_t(atomic_t *); /* diff --git a/fs/fscache/main.c b/fs/fscache/main.c index 63f868e869b9..b39d487ccfb0 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@ -67,7 +67,7 @@ static int fscache_max_active_sysctl(struct ctl_table *table, int write, return ret; } -struct ctl_table 
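The kill loops in the new fs_pin.c above use a classic RCU idiom: peek at the first list entry under rcu_read_lock(), try to pin it with atomic_long_inc_not_zero(), and if the entry is already dying (its count has reached zero) drop the lock, cpu_relax(), and retry until the list drains. The sketch below isolates just the inc-not-zero step with C11 atomics; the RCU read section and list walking are deliberately omitted, so read it as an illustration of the try-acquire, not of the whole loop:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct pin { atomic_long count; };

/* Take a reference only if the object is still live (count > 0).
 * Mirrors atomic_long_inc_not_zero(): an object whose count already
 * hit zero is being freed and must not be resurrected. */
static bool pin_tryget(struct pin *p)
{
    long c = atomic_load(&p->count);
    while (c != 0) {
        if (atomic_compare_exchange_weak(&p->count, &c, c + 1))
            return true;  /* got a reference */
        /* c was reloaded by the failed CAS; loop and retry */
    }
    return false;         /* already on its way out; caller should retry/skip */
}

int main(void)
{
    struct pin live = { 1 }, dying = { 0 };
    printf("live:  %d\n", pin_tryget(&live));   /* 1 */
    printf("dying: %d\n", pin_tryget(&dying));  /* 0 */
    return 0;
}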
fscache_sysctls[] = { +static struct ctl_table fscache_sysctls[] = { { .procname = "object_max_active", .data = &fscache_object_max_active, @@ -87,7 +87,7 @@ struct ctl_table fscache_sysctls[] = { {} }; -struct ctl_table fscache_sysctls_root[] = { +static struct ctl_table fscache_sysctls_root[] = { { .procname = "fscache", .mode = 0555, @@ -197,24 +197,6 @@ static void __exit fscache_exit(void) module_exit(fscache_exit); /* - * wait_on_bit() sleep function for uninterruptible waiting - */ -int fscache_wait_bit(void *flags) -{ - schedule(); - return 0; -} - -/* - * wait_on_bit() sleep function for interruptible waiting - */ -int fscache_wait_bit_interruptible(void *flags) -{ - schedule(); - return signal_pending(current); -} - -/* * wait_on_atomic_t() sleep function for uninterruptible waiting */ int fscache_wait_atomic_t(atomic_t *p) diff --git a/fs/fscache/object.c b/fs/fscache/object.c index d3b4539f1651..da032daf0e0d 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -982,6 +982,7 @@ nomem: submit_op_failed: clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags); spin_unlock(&cookie->lock); + fscache_unuse_cookie(object); kfree(op); _leave(" [EIO]"); return transit_to(KILL_OBJECT); diff --git a/fs/fscache/page.c b/fs/fscache/page.c index ed70714503fa..de33b3fccca6 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -44,6 +44,19 @@ void __fscache_wait_on_page_write(struct fscache_cookie *cookie, struct page *pa EXPORT_SYMBOL(__fscache_wait_on_page_write); /* + * wait for a page to finish being written to the cache. Put a timeout here + * since we might be called recursively via parent fs. + */ +static +bool release_page_wait_timeout(struct fscache_cookie *cookie, struct page *page) +{ + wait_queue_head_t *wq = bit_waitqueue(&cookie->flags, 0); + + return wait_event_timeout(*wq, !__fscache_check_page_write(cookie, page), + HZ); +} + +/* * decide whether a page can be released, possibly by cancelling a store to it * - we're allowed to sleep if __GFP_WAIT is flagged */ @@ -115,7 +128,10 @@ page_busy: } fscache_stat(&fscache_n_store_vmscan_wait); - __fscache_wait_on_page_write(cookie, page); + if (!release_page_wait_timeout(cookie, page)) + _debug("fscache writeout timeout page: %p{%lx}", + page, page->index); + gfp &= ~__GFP_WAIT; goto try_again; } @@ -182,7 +198,7 @@ int __fscache_attr_changed(struct fscache_cookie *cookie) { struct fscache_operation *op; struct fscache_object *object; - bool wake_cookie; + bool wake_cookie = false; _enter("%p", cookie); @@ -212,15 +228,16 @@ int __fscache_attr_changed(struct fscache_cookie *cookie) __fscache_use_cookie(cookie); if (fscache_submit_exclusive_op(object, op) < 0) - goto nobufs; + goto nobufs_dec; spin_unlock(&cookie->lock); fscache_stat(&fscache_n_attr_changed_ok); fscache_put_operation(op); _leave(" = 0"); return 0; -nobufs: +nobufs_dec: wake_cookie = __fscache_unuse_cookie(cookie); +nobufs: spin_unlock(&cookie->lock); kfree(op); if (wake_cookie) @@ -298,7 +315,6 @@ int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie) jif = jiffies; if (wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP, - fscache_wait_bit_interruptible, TASK_INTERRUPTIBLE) != 0) { fscache_stat(&fscache_n_retrievals_intr); _leave(" = -ERESTARTSYS"); @@ -342,7 +358,6 @@ int fscache_wait_for_operation_activation(struct fscache_object *object, if (stat_op_waits) fscache_stat(stat_op_waits); if (wait_on_bit(&op->flags, FSCACHE_OP_WAITING, - fscache_wait_bit_interruptible, TASK_INTERRUPTIBLE) != 0) { ret = fscache_cancel_op(op, do_cancel); if (ret == 
0) @@ -351,7 +366,7 @@ int fscache_wait_for_operation_activation(struct fscache_object *object, /* it's been removed from the pending queue by another party, * so we should get to run shortly */ wait_on_bit(&op->flags, FSCACHE_OP_WAITING, - fscache_wait_bit, TASK_UNINTERRUPTIBLE); + TASK_UNINTERRUPTIBLE); } _debug("<<< GO"); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 0c6048247a34..de1d84af9f7c 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -845,12 +845,6 @@ static int fuse_rename2(struct inode *olddir, struct dentry *oldent, return err; } -static int fuse_rename(struct inode *olddir, struct dentry *oldent, - struct inode *newdir, struct dentry *newent) -{ - return fuse_rename2(olddir, oldent, newdir, newent, 0); -} - static int fuse_link(struct dentry *entry, struct inode *newdir, struct dentry *newent) { @@ -2024,7 +2018,6 @@ static const struct inode_operations fuse_dir_inode_operations = { .symlink = fuse_symlink, .unlink = fuse_unlink, .rmdir = fuse_rmdir, - .rename = fuse_rename, .rename2 = fuse_rename2, .link = fuse_link, .setattr = fuse_setattr, diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 40ac2628ddcf..caa8d95b24e8 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1303,10 +1303,11 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii, while (nbytes < *nbytesp && req->num_pages < req->max_pages) { unsigned npages; size_t start; - unsigned n = req->max_pages - req->num_pages; ssize_t ret = iov_iter_get_pages(ii, &req->pages[req->num_pages], - n * PAGE_SIZE, &start); + *nbytesp - nbytes, + req->max_pages - req->num_pages, + &start); if (ret < 0) return ret; diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index e6ee5b6e8d99..f0b945ab853e 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -359,7 +359,7 @@ static inline void release_metapath(struct metapath *mp) * Returns: The length of the extent (minimum of one block) */ -static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, unsigned limit, int *eob) +static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, size_t limit, int *eob) { const __be64 *end = (start + len); const __be64 *first = ptr; @@ -449,7 +449,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, struct buffer_head *bh_map, struct metapath *mp, const unsigned int sheight, const unsigned int height, - const unsigned int maxlen) + const size_t maxlen) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); @@ -483,7 +483,8 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, } else { /* Need to allocate indirect blocks */ ptrs_per_blk = height > 1 ? 
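Most of the fscache churn here (and the gfs2 churn below) is fallout from one 3.17 tree-wide cleanup: wait_on_bit() and wait_on_bit_lock() dropped their per-caller action callback and now take the task state directly, with a shared default (bit_wait) doing the schedule(). Every helper deleted in these hunks, fscache_wait_bit(), fscache_wait_bit_interruptible(), and their gfs2 cousins, was an identical schedule-and-return stub, which is exactly the case where a default at the API level beats N copies at the call sites. A toy illustration of that refactoring shape; the kernel's scheduling is represented only by comments, since it cannot be reproduced in a few lines:

#include <stdio.h>

typedef int (*wait_action)(void);

/* Before: every subsystem supplied its own trivial action... */
static int fscache_style_wait(void) { /* schedule(); */ return 0; }
static int gfs2_style_wait(void)    { /* schedule(); */ return 0; }

/* After: one shared default; callers pass only what actually varies. */
static int default_wait(void) { /* schedule(); */ return 0; }

static int wait_on_bit_old(wait_action act) { return act(); }
static int wait_on_bit_new(void)            { return default_wait(); }

int main(void)
{
    /* The old form forced boilerplate callbacks on every caller... */
    printf("old form -> %d, %d\n",
           wait_on_bit_old(fscache_style_wait),
           wait_on_bit_old(gfs2_style_wait));
    /* ...the new form does not. */
    printf("new form -> %d\n", wait_on_bit_new());
    return 0;
}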
sdp->sd_inptrs : sdp->sd_diptrs; - dblks = min(maxlen, ptrs_per_blk - mp->mp_list[end_of_metadata]); + dblks = min(maxlen, (size_t)(ptrs_per_blk - + mp->mp_list[end_of_metadata])); if (height == ip->i_height) { /* Writing into existing tree, extend tree down */ iblks = height - sheight; @@ -605,7 +606,7 @@ int gfs2_block_map(struct inode *inode, sector_t lblock, struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); unsigned int bsize = sdp->sd_sb.sb_bsize; - const unsigned int maxlen = bh_map->b_size >> inode->i_blkbits; + const size_t maxlen = bh_map->b_size >> inode->i_blkbits; const u64 *arr = sdp->sd_heightsize; __be64 *ptr; u64 size; diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 1a349f9a9685..5d4261ff5d23 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -2100,8 +2100,13 @@ int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name, } if (IS_ERR(dent)) return PTR_ERR(dent); - da->bh = bh; - da->dent = dent; + + if (da->save_loc) { + da->bh = bh; + da->dent = dent; + } else { + brelse(bh); + } return 0; } diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h index 126c65dda028..e1b309c24dab 100644 --- a/fs/gfs2/dir.h +++ b/fs/gfs2/dir.h @@ -23,6 +23,7 @@ struct gfs2_diradd { unsigned nr_blocks; struct gfs2_dirent *dent; struct buffer_head *bh; + int save_loc; }; extern struct inode *gfs2_dir_search(struct inode *dir, diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 26b3f952e6b1..80dd44dca028 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -26,6 +26,7 @@ #include <linux/dlm.h> #include <linux/dlm_plock.h> #include <linux/aio.h> +#include <linux/delay.h> #include "gfs2.h" #include "incore.h" @@ -913,26 +914,6 @@ out_uninit: #ifdef CONFIG_GFS2_FS_LOCKING_DLM /** - * gfs2_setlease - acquire/release a file lease - * @file: the file pointer - * @arg: lease type - * @fl: file lock - * - * We don't currently have a way to enforce a lease across the whole - * cluster; until we do, disable leases (by just returning -EINVAL), - * unless the administrator has requested purely local locking. - * - * Locking: called under i_lock - * - * Returns: errno - */ - -static int gfs2_setlease(struct file *file, long arg, struct file_lock **fl) -{ - return -EINVAL; -} - -/** * gfs2_lock - acquire/release a posix lock on a file * @file: the file pointer * @cmd: either modify or retrieve lock state, possibly wait @@ -979,9 +960,10 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) unsigned int state; int flags; int error = 0; + int sleeptime; state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED; - flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT; + flags = (IS_SETLKW(cmd) ? 
0 : LM_FLAG_TRY_1CB) | GL_EXACT; mutex_lock(&fp->f_fl_mutex); @@ -1001,7 +983,14 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) gfs2_holder_init(gl, state, flags, fl_gh); gfs2_glock_put(gl); } - error = gfs2_glock_nq(fl_gh); + for (sleeptime = 1; sleeptime <= 4; sleeptime <<= 1) { + error = gfs2_glock_nq(fl_gh); + if (error != GLR_TRYFAILED) + break; + fl_gh->gh_flags = LM_FLAG_TRY | GL_EXACT; + fl_gh->gh_error = 0; + msleep(sleeptime); + } if (error) { gfs2_holder_uninit(fl_gh); if (error == GLR_TRYFAILED) @@ -1024,7 +1013,7 @@ static void do_unflock(struct file *file, struct file_lock *fl) mutex_lock(&fp->f_fl_mutex); flock_lock_file_wait(file, fl); if (fl_gh->gh_gl) { - gfs2_glock_dq_wait(fl_gh); + gfs2_glock_dq(fl_gh); gfs2_holder_uninit(fl_gh); } mutex_unlock(&fp->f_fl_mutex); @@ -1069,7 +1058,7 @@ const struct file_operations gfs2_file_fops = { .flock = gfs2_flock, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, - .setlease = gfs2_setlease, + .setlease = simple_nosetlease, .fallocate = gfs2_fallocate, }; diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index ee4e04fe60fc..8f0c19d1d943 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -811,7 +811,7 @@ void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, { INIT_LIST_HEAD(&gh->gh_list); gh->gh_gl = gl; - gh->gh_ip = (unsigned long)__builtin_return_address(0); + gh->gh_ip = _RET_IP_; gh->gh_owner_pid = get_pid(task_pid(current)); gh->gh_state = state; gh->gh_flags = flags; @@ -835,7 +835,7 @@ void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder * gh->gh_state = state; gh->gh_flags = flags; gh->gh_iflags = 0; - gh->gh_ip = (unsigned long)__builtin_return_address(0); + gh->gh_ip = _RET_IP_; if (gh->gh_owner_pid) put_pid(gh->gh_owner_pid); gh->gh_owner_pid = get_pid(task_pid(current)); @@ -856,27 +856,6 @@ void gfs2_holder_uninit(struct gfs2_holder *gh) } /** - * gfs2_glock_holder_wait - * @word: unused - * - * This function and gfs2_glock_demote_wait both show up in the WCHAN - * field. Thus I've separated these otherwise identical functions in - * order to be more informative to the user. - */ - -static int gfs2_glock_holder_wait(void *word) -{ - schedule(); - return 0; -} - -static int gfs2_glock_demote_wait(void *word) -{ - schedule(); - return 0; -} - -/** * gfs2_glock_wait - wait on a glock acquisition * @gh: the glock holder * @@ -888,7 +867,7 @@ int gfs2_glock_wait(struct gfs2_holder *gh) unsigned long time1 = jiffies; might_sleep(); - wait_on_bit(&gh->gh_iflags, HIF_WAIT, gfs2_glock_holder_wait, TASK_UNINTERRUPTIBLE); + wait_on_bit(&gh->gh_iflags, HIF_WAIT, TASK_UNINTERRUPTIBLE); if (time_after(jiffies, time1 + HZ)) /* have we waited > a second? */ /* Lengthen the minimum hold time. 
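The do_flock() rework above turns one blocking lock request into a bounded retry loop: ask for the glock with LM_FLAG_TRY_1CB, and on GLR_TRYFAILED sleep 1, 2, then 4 ms before trying again, giving the node that holds the glock a chance to demote it instead of the cluster deadlocking on a flock. The same bounded exponential backoff, with the lock attempt stubbed, as a runnable sketch:

#include <stdio.h>
#include <unistd.h>

#define TRYFAILED 1

static int tries;

static int try_lock(void)
{
    return ++tries < 3 ? TRYFAILED : 0;  /* stub: succeed on the 3rd attempt */
}

static int lock_with_backoff(void)
{
    int err, sleeptime;

    /* 1, 2, 4 ms: cheap, bounded backoff before giving up. */
    for (sleeptime = 1; sleeptime <= 4; sleeptime <<= 1) {
        err = try_lock();
        if (err != TRYFAILED)
            break;
        usleep(sleeptime * 1000);  /* msleep() analogue */
    }
    return err;
}

int main(void)
{
    int err = lock_with_backoff();
    printf("lock -> %d after %d tries\n", err, tries);
    return 0;
}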
*/ gh->gh_gl->gl_hold_time = min(gh->gh_gl->gl_hold_time + @@ -1128,7 +1107,7 @@ void gfs2_glock_dq_wait(struct gfs2_holder *gh) struct gfs2_glock *gl = gh->gh_gl; gfs2_glock_dq(gh); might_sleep(); - wait_on_bit(&gl->gl_flags, GLF_DEMOTE, gfs2_glock_demote_wait, TASK_UNINTERRUPTIBLE); + wait_on_bit(&gl->gl_flags, GLF_DEMOTE, TASK_UNINTERRUPTIBLE); } /** diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 2ffc67dce87f..1cc0bba6313f 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -93,7 +93,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl) * tr->alloced is not set since the transaction structure is * on the stack */ tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes, sizeof(u64)); - tr.tr_ip = (unsigned long)__builtin_return_address(0); + tr.tr_ip = _RET_IP_; sb_start_intwrite(sdp->sd_vfs); if (gfs2_log_reserve(sdp, tr.tr_reserved) < 0) { sb_end_intwrite(sdp->sd_vfs); diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 67d310c9ada3..39e7e9959b74 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -262,6 +262,9 @@ struct gfs2_holder { unsigned long gh_ip; }; +/* Number of quota types we support */ +#define GFS2_MAXQUOTAS 2 + /* Resource group multi-block reservation, in order of appearance: Step 1. Function prepares to write, allocates a mb, sets the size hint. @@ -282,8 +285,8 @@ struct gfs2_blkreserv { u64 rs_inum; /* Inode number for reservation */ /* ancillary quota stuff */ - struct gfs2_quota_data *rs_qa_qd[2 * MAXQUOTAS]; - struct gfs2_holder rs_qa_qd_ghs[2 * MAXQUOTAS]; + struct gfs2_quota_data *rs_qa_qd[2 * GFS2_MAXQUOTAS]; + struct gfs2_holder rs_qa_qd_ghs[2 * GFS2_MAXQUOTAS]; unsigned int rs_qa_qd_num; }; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index e62e59477884..fcf42eadb69c 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -600,7 +600,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, int error, free_vfs_inode = 0; u32 aflags = 0; unsigned blocks = 1; - struct gfs2_diradd da = { .bh = NULL, }; + struct gfs2_diradd da = { .bh = NULL, .save_loc = 1, }; if (!name->len || name->len > GFS2_FNAMESIZE) return -ENAMETOOLONG; @@ -626,8 +626,10 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (!IS_ERR(inode)) { d = d_splice_alias(inode, dentry); error = PTR_ERR(d); - if (IS_ERR(d)) + if (IS_ERR(d)) { + inode = ERR_CAST(d); goto fail_gunlock; + } error = 0; if (file) { if (S_ISREG(inode->i_mode)) { @@ -670,6 +672,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; gfs2_set_inode_blocks(inode, 1); munge_mode_uid_gid(dip, inode); + check_and_update_goal(dip); ip->i_goal = dip->i_goal; ip->i_diskflags = 0; ip->i_eattr = 0; @@ -840,8 +843,10 @@ static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry, int error; inode = gfs2_lookupi(dir, &dentry->d_name, 0); - if (!inode) + if (inode == NULL) { + d_add(dentry, NULL); return NULL; + } if (IS_ERR(inode)) return ERR_CAST(inode); @@ -854,7 +859,6 @@ static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry, d = d_splice_alias(inode, dentry); if (IS_ERR(d)) { - iput(inode); gfs2_glock_dq_uninit(&gh); return d; } @@ -896,7 +900,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder ghs[2]; struct buffer_head *dibh; - struct gfs2_diradd da = { .bh = NULL, }; + struct gfs2_diradd da = { .bh = NULL, .save_loc = 1, }; int error; if (S_ISDIR(inode->i_mode)) @@ 
-1334,7 +1338,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, struct gfs2_rgrpd *nrgd; unsigned int num_gh; int dir_rename = 0; - struct gfs2_diradd da = { .nr_blocks = 0, }; + struct gfs2_diradd da = { .nr_blocks = 0, .save_loc = 0, }; unsigned int x; int error; diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 4fafea1c9ecf..641383a9c1bb 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -936,12 +936,6 @@ fail: return error; } -static int dlm_recovery_wait(void *word) -{ - schedule(); - return 0; -} - static int control_first_done(struct gfs2_sbd *sdp) { struct lm_lockstruct *ls = &sdp->sd_lockstruct; @@ -976,7 +970,7 @@ restart: fs_info(sdp, "control_first_done wait gen %u\n", start_gen); wait_on_bit(&ls->ls_recover_flags, DFL_DLM_RECOVERY, - dlm_recovery_wait, TASK_UNINTERRUPTIBLE); + TASK_UNINTERRUPTIBLE); goto restart; } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index bc564c0d6d16..d3eae244076e 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1024,20 +1024,13 @@ void gfs2_lm_unmount(struct gfs2_sbd *sdp) lm->lm_unmount(sdp); } -static int gfs2_journalid_wait(void *word) -{ - if (signal_pending(current)) - return -EINTR; - schedule(); - return 0; -} - static int wait_on_journal(struct gfs2_sbd *sdp) { if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) return 0; - return wait_on_bit(&sdp->sd_flags, SDF_NOJOURNALID, gfs2_journalid_wait, TASK_INTERRUPTIBLE); + return wait_on_bit(&sdp->sd_flags, SDF_NOJOURNALID, TASK_INTERRUPTIBLE) + ? -EINTR : 0; } void gfs2_online_uevent(struct gfs2_sbd *sdp) diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 94555d4c5698..573bd3b758fa 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -591,12 +591,6 @@ done: wake_up_bit(&jd->jd_flags, JDF_RECOVERY); } -static int gfs2_recovery_wait(void *word) -{ - schedule(); - return 0; -} - int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait) { int rv; @@ -609,7 +603,7 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait) BUG_ON(!rv); if (wait) - wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait, + wait_on_bit(&jd->jd_flags, JDF_RECOVERY, TASK_UNINTERRUPTIBLE); return wait ? 
jd->jd_recover_error : 0; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index f4cb9c0d6bbd..7474c413ffd1 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -577,6 +577,13 @@ struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd) return rgd; } +void check_and_update_goal(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + if (!ip->i_goal || gfs2_blk2rgrpd(sdp, ip->i_goal, 1) == NULL) + ip->i_goal = ip->i_no_addr; +} + void gfs2_free_clones(struct gfs2_rgrpd *rgd) { int x; @@ -1910,6 +1917,7 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *a } else if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) { rs->rs_rbm.rgd = begin = ip->i_rgd; } else { + check_and_update_goal(ip); rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1); } if (S_ISDIR(ip->i_inode.i_mode) && (ap->aflags & GFS2_AF_ORLOV)) @@ -2089,7 +2097,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, u32 blen, unsigned char new_state) { struct gfs2_rbm rbm; - struct gfs2_bitmap *bi; + struct gfs2_bitmap *bi, *bi_prev = NULL; rbm.rgd = gfs2_blk2rgrpd(sdp, bstart, 1); if (!rbm.rgd) { @@ -2098,18 +2106,22 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, return NULL; } + gfs2_rbm_from_block(&rbm, bstart); while (blen--) { - gfs2_rbm_from_block(&rbm, bstart); bi = rbm_bi(&rbm); - bstart++; - if (!bi->bi_clone) { - bi->bi_clone = kmalloc(bi->bi_bh->b_size, - GFP_NOFS | __GFP_NOFAIL); - memcpy(bi->bi_clone + bi->bi_offset, - bi->bi_bh->b_data + bi->bi_offset, bi->bi_len); + if (bi != bi_prev) { + if (!bi->bi_clone) { + bi->bi_clone = kmalloc(bi->bi_bh->b_size, + GFP_NOFS | __GFP_NOFAIL); + memcpy(bi->bi_clone + bi->bi_offset, + bi->bi_bh->b_data + bi->bi_offset, + bi->bi_len); + } + gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh); + bi_prev = bi; } - gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh); gfs2_setbit(&rbm, false, new_state); + gfs2_rbm_incr(&rbm); } return rbm.rgd; diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 463ab2e95d1c..5d8f085f7ade 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -80,4 +80,5 @@ static inline bool gfs2_rs_active(struct gfs2_blkreserv *rs) return rs && !RB_EMPTY_NODE(&rs->rs_node); } +extern void check_and_update_goal(struct gfs2_inode *ip); #endif /* __RGRP_DOT_H__ */ diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 1319b5c4ec68..a346f56c4c6d 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -864,12 +864,6 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) return error; } -static int gfs2_umount_recovery_wait(void *word) -{ - schedule(); - return 0; -} - /** * gfs2_put_super - Unmount the filesystem * @sb: The VFS superblock @@ -894,7 +888,7 @@ restart: continue; spin_unlock(&sdp->sd_jindex_spin); wait_on_bit(&jd->jd_flags, JDF_RECOVERY, - gfs2_umount_recovery_wait, TASK_UNINTERRUPTIBLE); + TASK_UNINTERRUPTIBLE); goto restart; } spin_unlock(&sdp->sd_jindex_spin); @@ -1300,7 +1294,7 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root) int val; if (is_ancestor(root, sdp->sd_master_dir)) - seq_printf(s, ",meta"); + seq_puts(s, ",meta"); if (args->ar_lockproto[0]) seq_printf(s, ",lockproto=%s", args->ar_lockproto); if (args->ar_locktable[0]) @@ -1308,13 +1302,13 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root) if (args->ar_hostdata[0]) seq_printf(s, ",hostdata=%s", args->ar_hostdata); if (args->ar_spectator) - seq_printf(s, ",spectator"); + seq_puts(s, ",spectator"); if (args->ar_localflocks) - seq_printf(s, ",localflocks"); + 
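The rgblk_free() rework above is a batching optimization: the walk converts the start block to a bitmap position once and then just increments it, and the expensive step (cloning the bitmap buffer and adding it to the transaction) now runs only when bi differs from bi_prev, i.e. once per buffer_head rather than once per freed block. The shape generalizes to any sorted iteration over items grouped in containers; a sketch in invented page/record terms:

#include <stdio.h>

#define RECS_PER_PAGE 4

static void prepare_page(int page)
{
    /* Stands in for the clone + journal step: expensive, once per page. */
    printf("prepare page %d\n", page);
}

int main(void)
{
    int prev_page = -1;

    /* Records 5..13 are sorted, so page boundaries are crossed in order:
     * the expensive step runs 3 times instead of 9. */
    for (int rec = 5; rec < 14; rec++) {
        int page = rec / RECS_PER_PAGE;
        if (page != prev_page) {
            prepare_page(page);
            prev_page = page;
        }
        /* cheap per-record work (the gfs2_setbit() analogue) goes here */
    }
    return 0;
}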
seq_puts(s, ",localflocks"); if (args->ar_debug) - seq_printf(s, ",debug"); + seq_puts(s, ",debug"); if (args->ar_posix_acl) - seq_printf(s, ",acl"); + seq_puts(s, ",acl"); if (args->ar_quota != GFS2_QUOTA_DEFAULT) { char *state; switch (args->ar_quota) { @@ -1334,7 +1328,7 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",quota=%s", state); } if (args->ar_suiddir) - seq_printf(s, ",suiddir"); + seq_puts(s, ",suiddir"); if (args->ar_data != GFS2_DATA_DEFAULT) { char *state; switch (args->ar_data) { @@ -1351,7 +1345,7 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",data=%s", state); } if (args->ar_discard) - seq_printf(s, ",discard"); + seq_puts(s, ",discard"); val = sdp->sd_tune.gt_logd_secs; if (val != 30) seq_printf(s, ",commit=%d", val); @@ -1382,11 +1376,11 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",errors=%s", state); } if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) - seq_printf(s, ",nobarrier"); + seq_puts(s, ",nobarrier"); if (test_bit(SDF_DEMOTE, &sdp->sd_flags)) - seq_printf(s, ",demote_interface_used"); + seq_puts(s, ",demote_interface_used"); if (args->ar_rgrplvb) - seq_printf(s, ",rgrplvb"); + seq_puts(s, ",rgrplvb"); return 0; } diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 0546ab4e28e8..42bfd3361979 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -44,7 +44,7 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, if (!tr) return -ENOMEM; - tr->tr_ip = (unsigned long)__builtin_return_address(0); + tr->tr_ip = _RET_IP_; tr->tr_blocks = blocks; tr->tr_revokes = revokes; tr->tr_reserved = 1; diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h index 9c88da0e855a..4fcd40d6f308 100644 --- a/fs/hostfs/hostfs.h +++ b/fs/hostfs/hostfs.h @@ -89,6 +89,7 @@ extern int do_mknod(const char *file, int mode, unsigned int major, extern int link_file(const char *from, const char *to); extern int hostfs_do_readlink(char *file, char *buf, int size); extern int rename_file(char *from, char *to); +extern int rename2_file(char *from, char *to, unsigned int flags); extern int do_statfs(char *root, long *bsize_out, long long *blocks_out, long long *bfree_out, long long *bavail_out, long long *files_out, long long *ffree_out, diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index bb529f3b7f2b..fd62cae0fdcb 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -741,21 +741,31 @@ static int hostfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, return err; } -static int hostfs_rename(struct inode *from_ino, struct dentry *from, - struct inode *to_ino, struct dentry *to) +static int hostfs_rename2(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { - char *from_name, *to_name; + char *old_name, *new_name; int err; - if ((from_name = dentry_name(from)) == NULL) + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + return -EINVAL; + + old_name = dentry_name(old_dentry); + if (old_name == NULL) return -ENOMEM; - if ((to_name = dentry_name(to)) == NULL) { - __putname(from_name); + new_name = dentry_name(new_dentry); + if (new_name == NULL) { + __putname(old_name); return -ENOMEM; } - err = rename_file(from_name, to_name); - __putname(from_name); - __putname(to_name); + if (!flags) + err = rename_file(old_name, new_name); + else + err = rename2_file(old_name, new_name, flags); + + __putname(old_name); + __putname(new_name); return err; } @@ 
-867,7 +877,7 @@ static const struct inode_operations hostfs_dir_iops = { .mkdir = hostfs_mkdir, .rmdir = hostfs_rmdir, .mknod = hostfs_mknod, - .rename = hostfs_rename, + .rename2 = hostfs_rename2, .permission = hostfs_permission, .setattr = hostfs_setattr, }; diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c index 67838f3aa20a..9765dab95cbd 100644 --- a/fs/hostfs/hostfs_user.c +++ b/fs/hostfs/hostfs_user.c @@ -14,6 +14,7 @@ #include <sys/time.h> #include <sys/types.h> #include <sys/vfs.h> +#include <sys/syscall.h> #include "hostfs.h" #include <utime.h> @@ -360,6 +361,33 @@ int rename_file(char *from, char *to) return 0; } +int rename2_file(char *from, char *to, unsigned int flags) +{ + int err; + +#ifndef SYS_renameat2 +# ifdef __x86_64__ +# define SYS_renameat2 316 +# endif +# ifdef __i386__ +# define SYS_renameat2 353 +# endif +#endif + +#ifdef SYS_renameat2 + err = syscall(SYS_renameat2, AT_FDCWD, from, AT_FDCWD, to, flags); + if (err < 0) { + if (errno != ENOSYS) + return -errno; + else + return -EINVAL; + } + return 0; +#else + return -EINVAL; +#endif +} + int do_statfs(char *root, long *bsize_out, long long *blocks_out, long long *bfree_out, long long *bavail_out, long long *files_out, long long *ffree_out, diff --git a/fs/hpfs/dnode.c b/fs/hpfs/dnode.c index f36fc010fccb..2923a7bd82ac 100644 --- a/fs/hpfs/dnode.c +++ b/fs/hpfs/dnode.c @@ -545,12 +545,13 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno) struct dnode *d1; struct quad_buffer_head qbh1; if (hpfs_sb(i->i_sb)->sb_chk) - if (up != i->i_ino) { - hpfs_error(i->i_sb, - "bad pointer to fnode, dnode %08x, pointing to %08x, should be %08lx", - dno, up, (unsigned long)i->i_ino); - return; - } + if (up != i->i_ino) { + hpfs_error(i->i_sb, + "bad pointer to fnode, dnode %08x, pointing to %08x, should be %08lx", + dno, up, + (unsigned long)i->i_ino); + return; + } if ((d1 = hpfs_map_dnode(i->i_sb, down, &qbh1))) { d1->up = cpu_to_le32(up); d1->root_dnode = 1; @@ -1061,8 +1062,8 @@ struct hpfs_dirent *map_fnode_dirent(struct super_block *s, fnode_secno fno, hpfs_brelse4(qbh); if (hpfs_sb(s)->sb_chk) if (hpfs_stop_cycles(s, dno, &c1, &c2, "map_fnode_dirent #1")) { - kfree(name2); - return NULL; + kfree(name2); + return NULL; } goto go_down; } diff --git a/fs/inode.c b/fs/inode.c index 6eecb7ff0b9a..26753ba7b6d6 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -165,6 +165,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) mapping->a_ops = &empty_aops; mapping->host = inode; mapping->flags = 0; + atomic_set(&mapping->i_mmap_writable, 0); mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); mapping->private_data = NULL; mapping->backing_dev_info = &default_backing_dev_info; @@ -1695,13 +1696,6 @@ int inode_needs_sync(struct inode *inode) } EXPORT_SYMBOL(inode_needs_sync); -int inode_wait(void *word) -{ - schedule(); - return 0; -} -EXPORT_SYMBOL(inode_wait); - /* * If we try to find an inode in the inode hash while it is being * deleted, we have to wait until the filesystem completes its diff --git a/fs/internal.h b/fs/internal.h index 465742407466..b2623200107b 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -35,6 +35,11 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait) #endif /* + * buffer.c + */ +extern void guard_bio_eod(int rw, struct bio *bio); + +/* * char_dev.c */ extern void __init chrdev_init(void); @@ -131,7 +136,6 @@ extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan, /* * read_write.c */ -extern ssize_t __kernel_write(struct 
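rename2_file() above is a textbook example of calling a syscall the host libc does not wrap yet: define SYS_renameat2 by hand for the architectures you care about (316 on x86_64 and 353 on i386, per the hunk), go through syscall(2), and translate ENOSYS for older kernels. A standalone userspace version, runnable on Linux; the RENAME_NOREPLACE value is an assumption taken from linux/fs.h, not from this diff:

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>        /* AT_FDCWD */
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef SYS_renameat2
# ifdef __x86_64__
#  define SYS_renameat2 316   /* from the x86_64 syscall table */
# endif
# ifdef __i386__
#  define SYS_renameat2 353   /* from the i386 syscall table */
# endif
#endif

#ifndef RENAME_NOREPLACE
# define RENAME_NOREPLACE (1 << 0)  /* assumed; defined by newer linux/fs.h */
#endif

int main(int argc, char **argv)
{
    if (argc != 3) {
        fprintf(stderr, "usage: %s FROM TO\n", argv[0]);
        return 2;
    }
#ifdef SYS_renameat2
    /* NOREPLACE makes the rename fail instead of clobbering TO. */
    if (syscall(SYS_renameat2, AT_FDCWD, argv[1], AT_FDCWD, argv[2],
                RENAME_NOREPLACE) < 0) {
        perror(errno == ENOSYS ? "kernel lacks renameat2" : "renameat2");
        return 1;
    }
    return 0;
#else
    fprintf(stderr, "no syscall number known for this architecture\n");
    return 1;
#endif
}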
file *, const char *, size_t, loff_t *); extern int rw_verify_area(int, struct file *, const loff_t *, size_t); /* @@ -144,3 +148,9 @@ extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, * pipe.c */ extern const struct file_operations pipefifo_fops; + +/* + * fs_pin.c + */ +extern void sb_pin_kill(struct super_block *sb); +extern void mnt_pin_kill(struct mount *m); diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c index 592e5115a561..f311bf084015 100644 --- a/fs/isofs/compress.c +++ b/fs/isofs/compress.c @@ -158,8 +158,8 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, "zisofs: zisofs_inflate returned" " %d, inode = %lu," " page idx = %d, bh idx = %d," - " avail_in = %d," - " avail_out = %d\n", + " avail_in = %ld," + " avail_out = %ld\n", zerr, inode->i_ino, curpage, curbh, stream.avail_in, stream.avail_out); diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 4556ce1af5b0..5ddaf8625d3b 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -61,7 +61,7 @@ static void isofs_put_super(struct super_block *sb) return; } -static int isofs_read_inode(struct inode *); +static int isofs_read_inode(struct inode *, int relocated); static int isofs_statfs (struct dentry *, struct kstatfs *); static struct kmem_cache *isofs_inode_cachep; @@ -1259,7 +1259,7 @@ out_toomany: goto out; } -static int isofs_read_inode(struct inode *inode) +static int isofs_read_inode(struct inode *inode, int relocated) { struct super_block *sb = inode->i_sb; struct isofs_sb_info *sbi = ISOFS_SB(sb); @@ -1404,7 +1404,7 @@ static int isofs_read_inode(struct inode *inode) */ if (!high_sierra) { - parse_rock_ridge_inode(de, inode); + parse_rock_ridge_inode(de, inode, relocated); /* if we want uid/gid set, override the rock ridge setting */ if (sbi->s_uid_set) inode->i_uid = sbi->s_uid; @@ -1483,9 +1483,10 @@ static int isofs_iget5_set(struct inode *ino, void *data) * offset that point to the underlying meta-data for the inode. 
The * code below is otherwise similar to the iget() code in * include/linux/fs.h */ -struct inode *isofs_iget(struct super_block *sb, - unsigned long block, - unsigned long offset) +struct inode *__isofs_iget(struct super_block *sb, + unsigned long block, + unsigned long offset, + int relocated) { unsigned long hashval; struct inode *inode; @@ -1507,7 +1508,7 @@ struct inode *isofs_iget(struct super_block *sb, return ERR_PTR(-ENOMEM); if (inode->i_state & I_NEW) { - ret = isofs_read_inode(inode); + ret = isofs_read_inode(inode, relocated); if (ret < 0) { iget_failed(inode); inode = ERR_PTR(ret); diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h index 99167238518d..0ac4c1f73fbd 100644 --- a/fs/isofs/isofs.h +++ b/fs/isofs/isofs.h @@ -107,7 +107,7 @@ extern int iso_date(char *, int); struct inode; /* To make gcc happy */ -extern int parse_rock_ridge_inode(struct iso_directory_record *, struct inode *); +extern int parse_rock_ridge_inode(struct iso_directory_record *, struct inode *, int relocated); extern int get_rock_ridge_filename(struct iso_directory_record *, char *, struct inode *); extern int isofs_name_translate(struct iso_directory_record *, char *, struct inode *); @@ -118,9 +118,24 @@ extern struct dentry *isofs_lookup(struct inode *, struct dentry *, unsigned int extern struct buffer_head *isofs_bread(struct inode *, sector_t); extern int isofs_get_blocks(struct inode *, sector_t, struct buffer_head **, unsigned long); -extern struct inode *isofs_iget(struct super_block *sb, - unsigned long block, - unsigned long offset); +struct inode *__isofs_iget(struct super_block *sb, + unsigned long block, + unsigned long offset, + int relocated); + +static inline struct inode *isofs_iget(struct super_block *sb, + unsigned long block, + unsigned long offset) +{ + return __isofs_iget(sb, block, offset, 0); +} + +static inline struct inode *isofs_iget_reloc(struct super_block *sb, + unsigned long block, + unsigned long offset) +{ + return __isofs_iget(sb, block, offset, 1); +} /* Because the inode number is no longer relevant to finding the * underlying meta-data for an inode, we are free to choose a more diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c index c0bf42472e40..f488bbae541a 100644 --- a/fs/isofs/rock.c +++ b/fs/isofs/rock.c @@ -288,12 +288,16 @@ eio: goto out; } +#define RR_REGARD_XA 1 +#define RR_RELOC_DE 2 + static int parse_rock_ridge_inode_internal(struct iso_directory_record *de, - struct inode *inode, int regard_xa) + struct inode *inode, int flags) { int symlink_len = 0; int cnt, sig; + unsigned int reloc_block; struct inode *reloc; struct rock_ridge *rr; int rootflag; @@ -305,7 +309,7 @@ parse_rock_ridge_inode_internal(struct iso_directory_record *de, init_rock_state(&rs, inode); setup_rock_ridge(de, inode, &rs); - if (regard_xa) { + if (flags & RR_REGARD_XA) { rs.chr += 14; rs.len -= 14; if (rs.len < 0) @@ -485,12 +489,22 @@ repeat: "relocated directory\n"); goto out; case SIG('C', 'L'): - ISOFS_I(inode)->i_first_extent = - isonum_733(rr->u.CL.location); - reloc = - isofs_iget(inode->i_sb, - ISOFS_I(inode)->i_first_extent, - 0); + if (flags & RR_RELOC_DE) { + printk(KERN_ERR + "ISOFS: Recursive directory relocation " + "is not supported\n"); + goto eio; + } + reloc_block = isonum_733(rr->u.CL.location); + if (reloc_block == ISOFS_I(inode)->i_iget5_block && + ISOFS_I(inode)->i_iget5_offset == 0) { + printk(KERN_ERR + "ISOFS: Directory relocation points to " + "itself\n"); + goto eio; + } + ISOFS_I(inode)->i_first_extent = reloc_block; + reloc = isofs_iget_reloc(inode->i_sb, 
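The CL (child link) handling above defends against two forms of corrupted Rock Ridge metadata before following a directory relocation: a relocation that points at the inode's own location (the self-link check against i_iget5_block) and a relocation reached through another relocation (the RR_RELOC_DE flag threaded in via isofs_iget_reloc()), which caps the chain at one hop. That defensive shape, a self-link check plus a depth limit when following redirects, in a generic sketch with invented table data:

#include <stdio.h>

#define NO_RELOC -1

/* reloc[i] == j means "entry i is relocated to entry j"; invented data. */
static int reloc[] = { NO_RELOC, 3, 2, NO_RELOC };

static int resolve(int idx, int via_reloc)
{
    int target = reloc[idx];

    if (target == NO_RELOC)
        return idx;                  /* ordinary entry */
    if (via_reloc) {
        fprintf(stderr, "recursive relocation is not supported\n");
        return -1;                   /* the RR_RELOC_DE case */
    }
    if (target == idx) {
        fprintf(stderr, "relocation points to itself\n");
        return -1;
    }
    return resolve(target, 1);       /* follow exactly one hop */
}

int main(void)
{
    printf("entry 0 -> %d\n", resolve(0, 0));  /* 0: no relocation */
    printf("entry 1 -> %d\n", resolve(1, 0));  /* 3: one legitimate hop */
    printf("entry 2 -> %d\n", resolve(2, 0));  /* -1: self-link rejected */
    return 0;
}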
reloc_block, 0); if (IS_ERR(reloc)) { ret = PTR_ERR(reloc); goto out; @@ -637,9 +651,11 @@ static char *get_symlink_chunk(char *rpnt, struct rock_ridge *rr, char *plimit) return rpnt; } -int parse_rock_ridge_inode(struct iso_directory_record *de, struct inode *inode) +int parse_rock_ridge_inode(struct iso_directory_record *de, struct inode *inode, + int relocated) { - int result = parse_rock_ridge_inode_internal(de, inode, 0); + int flags = relocated ? RR_RELOC_DE : 0; + int result = parse_rock_ridge_inode_internal(de, inode, flags); /* * if rockridge flag was reset and we didn't look for attributes @@ -647,7 +663,8 @@ int parse_rock_ridge_inode(struct iso_directory_record *de, struct inode *inode) */ if ((ISOFS_SB(inode->i_sb)->s_rock_offset == -1) && (ISOFS_SB(inode->i_sb)->s_rock == 2)) { - result = parse_rock_ridge_inode_internal(de, inode, 14); + result = parse_rock_ridge_inode_internal(de, inode, + flags | RR_REGARD_XA); } return result; } diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 6fac74349856..b73e0215baa7 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -97,7 +97,7 @@ static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh) struct commit_header *h; __u32 csum; - if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + if (!jbd2_journal_has_csum_v2or3(j)) return; h = (struct commit_header *)(bh->b_data); @@ -313,11 +313,11 @@ static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) return checksum; } -static void write_tag_block(int tag_bytes, journal_block_tag_t *tag, +static void write_tag_block(journal_t *j, journal_block_tag_t *tag, unsigned long long block) { tag->t_blocknr = cpu_to_be32(block & (u32)~0); - if (tag_bytes > JBD2_TAG_SIZE32) + if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_64BIT)) tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1); } @@ -327,7 +327,7 @@ static void jbd2_descr_block_csum_set(journal_t *j, struct jbd2_journal_block_tail *tail; __u32 csum; - if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + if (!jbd2_journal_has_csum_v2or3(j)) return; tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize - @@ -340,12 +340,13 @@ static void jbd2_descr_block_csum_set(journal_t *j, static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, struct buffer_head *bh, __u32 sequence) { + journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag; struct page *page = bh->b_page; __u8 *addr; __u32 csum32; __be32 seq; - if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + if (!jbd2_journal_has_csum_v2or3(j)) return; seq = cpu_to_be32(sequence); @@ -355,8 +356,10 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, bh->b_size); kunmap_atomic(addr); - /* We only have space to store the lower 16 bits of the crc32c. 
-	/* We only have space to store the lower 16 bits of the crc32c. */
-	tag->t_checksum = cpu_to_be16(csum32);
+	if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V3))
+		tag3->t_checksum = cpu_to_be32(csum32);
+	else
+		tag->t_checksum = cpu_to_be16(csum32);
 }
 
 /*
  * jbd2_journal_commit_transaction
@@ -396,7 +399,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	LIST_HEAD(io_bufs);
 	LIST_HEAD(log_bufs);
 
-	if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+	if (jbd2_journal_has_csum_v2or3(journal))
 		csum_size = sizeof(struct jbd2_journal_block_tail);
 
 	/*
@@ -690,7 +693,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 				tag_flag |= JBD2_FLAG_SAME_UUID;
 
 			tag = (journal_block_tag_t *) tagp;
-			write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
+			write_tag_block(journal, tag, jh2bh(jh)->b_blocknr);
 			tag->t_flags = cpu_to_be16(tag_flag);
 			jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
 						commit_transaction->t_tid);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 67b8e303946c..19d74d86d99c 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -124,7 +124,7 @@ EXPORT_SYMBOL(__jbd2_debug);
 /* Checksumming functions */
 static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
 {
-	if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+	if (!jbd2_journal_has_csum_v2or3(j))
 		return 1;
 
 	return sb->s_checksum_type == JBD2_CRC32C_CHKSUM;
@@ -145,7 +145,7 @@ static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb)
 
 static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb)
 {
-	if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+	if (!jbd2_journal_has_csum_v2or3(j))
 		return 1;
 
 	return sb->s_checksum == jbd2_superblock_csum(j, sb);
@@ -153,7 +153,7 @@ static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb)
 
 static void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb)
 {
-	if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+	if (!jbd2_journal_has_csum_v2or3(j))
 		return;
 
 	sb->s_checksum = jbd2_superblock_csum(j, sb);
@@ -1522,21 +1522,29 @@ static int journal_get_superblock(journal_t *journal)
 		goto out;
 	}
 
-	if (JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM) &&
-	    JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) {
+	if (jbd2_journal_has_csum_v2or3(journal) &&
+	    JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM)) {
 		/* Can't have checksum v1 and v2 on at the same time! */
 		printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2 "
 		       "at the same time!\n");
 		goto out;
 	}
 
+	if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2) &&
+	    JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
+		/* Can't have checksum v2 and v3 at the same time! */
+		printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 "
+		       "at the same time!\n");
+		goto out;
+	}
+
 	if (!jbd2_verify_csum_type(journal, sb)) {
 		printk(KERN_ERR "JBD2: Unknown checksum type\n");
 		goto out;
 	}
 
 	/* Load the checksum driver */
-	if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) {
+	if (jbd2_journal_has_csum_v2or3(journal)) {
 		journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
 		if (IS_ERR(journal->j_chksum_driver)) {
 			printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n");
@@ -1553,7 +1561,7 @@ static int journal_get_superblock(journal_t *journal)
 	}
 
 	/* Precompute checksum seed for all metadata */
-	if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+	if (jbd2_journal_has_csum_v2or3(journal))
 		journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
 						   sizeof(sb->s_uuid));
 
@@ -1813,8 +1821,14 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
 	if (!jbd2_journal_check_available_features(journal, compat, ro,
 						   incompat))
 		return 0;
 
-	/* Asking for checksumming v2 and v1?  Only give them v2. */
-	if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V2 &&
+	/* If enabling v2 checksums, turn on v3 instead */
+	if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V2) {
+		incompat &= ~JBD2_FEATURE_INCOMPAT_CSUM_V2;
+		incompat |= JBD2_FEATURE_INCOMPAT_CSUM_V3;
+	}
+
+	/* Asking for checksumming v3 and v1?  Only give them v3. */
+	if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V3 &&
 	    compat & JBD2_FEATURE_COMPAT_CHECKSUM)
 		compat &= ~JBD2_FEATURE_COMPAT_CHECKSUM;
 
@@ -1823,8 +1837,8 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
 
 	sb = journal->j_superblock;
 
-	/* If enabling v2 checksums, update superblock */
-	if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V2)) {
+	/* If enabling v3 checksums, update superblock */
+	if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
 		sb->s_checksum_type = JBD2_CRC32C_CHKSUM;
 		sb->s_feature_compat &=
 			~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM);
@@ -1842,8 +1856,7 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
 		}
 
 		/* Precompute checksum seed for all metadata */
-		if (JBD2_HAS_INCOMPAT_FEATURE(journal,
-					      JBD2_FEATURE_INCOMPAT_CSUM_V2))
+		if (jbd2_journal_has_csum_v2or3(journal))
 			journal->j_csum_seed = jbd2_chksum(journal, ~0,
 							   sb->s_uuid,
 							   sizeof(sb->s_uuid));
@@ -1852,7 +1865,8 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
 	/* If enabling v1 checksums, downgrade superblock */
 	if (COMPAT_FEATURE_ON(JBD2_FEATURE_COMPAT_CHECKSUM))
 		sb->s_feature_incompat &=
-			~cpu_to_be32(JBD2_FEATURE_INCOMPAT_CSUM_V2);
+			~cpu_to_be32(JBD2_FEATURE_INCOMPAT_CSUM_V2 |
+				     JBD2_FEATURE_INCOMPAT_CSUM_V3);
 
 	sb->s_feature_compat    |= cpu_to_be32(compat);
 	sb->s_feature_ro_compat |= cpu_to_be32(ro);
@@ -2165,16 +2179,20 @@ int jbd2_journal_blocks_per_page(struct inode *inode)
  */
 size_t journal_tag_bytes(journal_t *journal)
 {
-	journal_block_tag_t tag;
-	size_t x = 0;
+	size_t sz;
+
+	if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3))
+		return sizeof(journal_block_tag3_t);
+
+	sz = sizeof(journal_block_tag_t);
 
 	if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
-		x += sizeof(tag.t_checksum);
+		sz += sizeof(__u16);
 
 	if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
-		return x + JBD2_TAG_SIZE64;
+		return sz;
 	else
-		return x + JBD2_TAG_SIZE32;
+		return sz - sizeof(__u32);
 }
 
 /*
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 3b6bb19d60b1..9b329b55ffe3 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
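Before the recovery-side hunks, it helps to pin down exactly what journal_tag_bytes() now returns. A userspace re-derivation of the size logic follows; the 12- and 16-byte constants are assumptions taken from this era's journal_block_tag_t and journal_block_tag3_t layouts in include/linux/jbd2.h, which are outside this diff:

#include <stdint.h>
#include <stdio.h>

enum { F_CSUM_V2 = 1, F_CSUM_V3 = 2, F_64BIT = 4 };	/* hypothetical flag bits */

static size_t tag_bytes(unsigned int features)
{
	size_t sz;

	if (features & F_CSUM_V3)
		return 16;			/* sizeof(journal_block_tag3_t) */
	sz = 12;				/* sizeof(journal_block_tag_t) */
	if (features & F_CSUM_V2)
		sz += sizeof(uint16_t);		/* room for the 16-bit checksum */
	/* without 64-bit block numbers, t_blocknr_high is not stored on disk */
	return (features & F_64BIT) ? sz : sz - sizeof(uint32_t);
}

int main(void)
{
	printf("%zu %zu %zu %zu %zu\n",		/* prints: 8 12 10 14 16 */
	       tag_bytes(0), tag_bytes(F_64BIT), tag_bytes(F_CSUM_V2),
	       tag_bytes(F_CSUM_V2 | F_64BIT), tag_bytes(F_CSUM_V3));
	return 0;
}

Note the v2 quirk the kernel function preserves: a v2 tag without INCOMPAT_64BIT is 10 bytes, which is why the old "x + JBD2_TAG_SIZE32" arithmetic could not simply be extended once v3's fixed 16-byte tag entered the picture.
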
@@ -181,7 +181,7 @@ static int jbd2_descr_block_csum_verify(journal_t *j,
 	__be32 provided;
 	__u32 calculated;
 
-	if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+	if (!jbd2_journal_has_csum_v2or3(j))
 		return 1;
 
 	tail = (struct jbd2_journal_block_tail *)(buf + j->j_blocksize -
@@ -205,7 +205,7 @@ static int count_tags(journal_t *journal, struct buffer_head *bh)
 	int			nr = 0, size = journal->j_blocksize;
 	int			tag_bytes = journal_tag_bytes(journal);
 
-	if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+	if (jbd2_journal_has_csum_v2or3(journal))
 		size -= sizeof(struct jbd2_journal_block_tail);
 
 	tagp = &bh->b_data[sizeof(journal_header_t)];
@@ -338,10 +338,11 @@ int jbd2_journal_skip_recovery(journal_t *journal)
 	return err;
 }
 
-static inline unsigned long long read_tag_block(int tag_bytes, journal_block_tag_t *tag)
+static inline unsigned long long read_tag_block(journal_t *journal,
+						journal_block_tag_t *tag)
 {
 	unsigned long long block = be32_to_cpu(tag->t_blocknr);
-	if (tag_bytes > JBD2_TAG_SIZE32)
+	if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
 		block |= (u64)be32_to_cpu(tag->t_blocknr_high) << 32;
 	return block;
 }
@@ -384,7 +385,7 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf)
 	__be32 provided;
 	__u32 calculated;
 
-	if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+	if (!jbd2_journal_has_csum_v2or3(j))
 		return 1;
 
 	h = buf;
@@ -399,17 +400,21 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf)
 static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
 				      void *buf, __u32 sequence)
 {
+	journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
 	__u32 csum32;
 	__be32 seq;
 
-	if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+	if (!jbd2_journal_has_csum_v2or3(j))
 		return 1;
 
 	seq = cpu_to_be32(sequence);
 	csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
 	csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize);
 
-	return tag->t_checksum == cpu_to_be16(csum32);
+	if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V3))
+		return tag3->t_checksum == cpu_to_be32(csum32);
+	else
+		return tag->t_checksum == cpu_to_be16(csum32);
 }
 
 static int do_one_pass(journal_t *journal,
@@ -426,6 +431,7 @@ static int do_one_pass(journal_t *journal,
 	int			tag_bytes = journal_tag_bytes(journal);
 	__u32			crc32_sum = ~0; /* Transactional Checksums */
 	int			descr_csum_size = 0;
+	int			block_error = 0;
 
 	/*
 	 * First thing is to establish what we expect to find in the log
@@ -512,8 +518,7 @@ static int do_one_pass(journal_t *journal,
 		switch(blocktype) {
 		case JBD2_DESCRIPTOR_BLOCK:
 			/* Verify checksum first */
-			if (JBD2_HAS_INCOMPAT_FEATURE(journal,
-						      JBD2_FEATURE_INCOMPAT_CSUM_V2))
+			if (jbd2_journal_has_csum_v2or3(journal))
 				descr_csum_size =
 					sizeof(struct jbd2_journal_block_tail);
 			if (descr_csum_size > 0 &&
@@ -574,7 +579,7 @@ static int do_one_pass(journal_t *journal,
 					unsigned long long blocknr;
 
 					J_ASSERT(obh != NULL);
-					blocknr = read_tag_block(tag_bytes,
+					blocknr = read_tag_block(journal,
 								 tag);
 
 					/* If the block has been
@@ -598,7 +603,8 @@ static int do_one_pass(journal_t *journal,
 						       "checksum recovering "
 						       "block %llu in log\n",
 						       blocknr);
-						continue;
+						block_error = 1;
+						goto skip_write;
 					}
 
 					/* Find a buffer for the new
@@ -797,7 +803,8 @@ static int do_one_pass(journal_t *journal,
 			success = -EIO;
 		}
 	}
-
+	if (block_error && success == 0)
+		success = -EIO;
 	return success;
 
 failed:
@@ -811,7 +818,7 @@ static int jbd2_revoke_block_csum_verify(journal_t *j,
 	__be32 provided;
__u32 calculated; - if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + if (!jbd2_journal_has_csum_v2or3(j)) return 1; tail = (struct jbd2_journal_revoke_tail *)(buf + j->j_blocksize - diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 198c9c10276d..d5e95a175c92 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -91,8 +91,8 @@ #include <linux/list.h> #include <linux/init.h> #include <linux/bio.h> -#endif #include <linux/log2.h> +#endif static struct kmem_cache *jbd2_revoke_record_cache; static struct kmem_cache *jbd2_revoke_table_cache; @@ -597,7 +597,7 @@ static void write_one_revoke_record(journal_t *journal, offset = *offsetp; /* Do we need to leave space at the end for a checksum? */ - if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + if (jbd2_journal_has_csum_v2or3(journal)) csum_size = sizeof(struct jbd2_journal_revoke_tail); /* Make sure we have a descriptor with space left for the record */ @@ -644,7 +644,7 @@ static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh) struct jbd2_journal_revoke_tail *tail; __u32 csum; - if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + if (!jbd2_journal_has_csum_v2or3(j)) return; tail = (struct jbd2_journal_revoke_tail *)(bh->b_data + j->j_blocksize - diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 6f0f590cc5a3..5f09370c90a8 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -763,12 +763,6 @@ static void warn_dirty_buffer(struct buffer_head *bh) bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); } -static int sleep_on_shadow_bh(void *word) -{ - io_schedule(); - return 0; -} - /* * If the buffer is already part of the current transaction, then there * is nothing we need to do. If it is already part of a prior @@ -906,8 +900,8 @@ repeat: if (buffer_shadow(bh)) { JBUFFER_TRACE(jh, "on shadow: sleep"); jbd_unlock_bh_state(bh); - wait_on_bit(&bh->b_state, BH_Shadow, - sleep_on_shadow_bh, TASK_UNINTERRUPTIBLE); + wait_on_bit_io(&bh->b_state, BH_Shadow, + TASK_UNINTERRUPTIBLE); goto repeat; } diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 009ec0b5993d..2f7a3c090489 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -202,8 +202,7 @@ struct posix_acl *jffs2_get_acl(struct inode *inode, int type) } else { acl = ERR_PTR(rc); } - if (value) - kfree(value); + kfree(value); if (!IS_ERR(acl)) set_cached_acl(inode, type, acl); return acl; diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c index 0b9a1e44e833..5698dae5d92d 100644 --- a/fs/jffs2/compr_zlib.c +++ b/fs/jffs2/compr_zlib.c @@ -94,11 +94,12 @@ static int jffs2_zlib_compress(unsigned char *data_in, while (def_strm.total_out < *dstlen - STREAM_END_SPACE && def_strm.total_in < *sourcelen) { def_strm.avail_out = *dstlen - (def_strm.total_out + STREAM_END_SPACE); - def_strm.avail_in = min((unsigned)(*sourcelen-def_strm.total_in), def_strm.avail_out); - jffs2_dbg(1, "calling deflate with avail_in %d, avail_out %d\n", + def_strm.avail_in = min_t(unsigned long, + (*sourcelen-def_strm.total_in), def_strm.avail_out); + jffs2_dbg(1, "calling deflate with avail_in %ld, avail_out %ld\n", def_strm.avail_in, def_strm.avail_out); ret = zlib_deflate(&def_strm, Z_PARTIAL_FLUSH); - jffs2_dbg(1, "deflate returned with avail_in %d, avail_out %d, total_in %ld, total_out %ld\n", + jffs2_dbg(1, "deflate returned with avail_in %ld, avail_out %ld, total_in %ld, total_out %ld\n", def_strm.avail_in, def_strm.avail_out, def_strm.total_in, def_strm.total_out); if (ret != Z_OK) { diff --git 
a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c index ad0f2e2a1700..d72817ac51f6 100644 --- a/fs/jffs2/xattr.c +++ b/fs/jffs2/xattr.c @@ -756,8 +756,7 @@ void jffs2_clear_xattr_subsystem(struct jffs2_sb_info *c) for (i=0; i < XATTRINDEX_HASHSIZE; i++) { list_for_each_entry_safe(xd, _xd, &c->xattrindex[i], xindex) { list_del(&xd->xindex); - if (xd->xname) - kfree(xd->xname); + kfree(xd->xname); jffs2_free_xattr_datum(xd); } } diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index d895b4b7b661..4429d6d9217f 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -896,7 +896,7 @@ const struct file_operations kernfs_file_fops = { * @ops: kernfs operations for the file * @priv: private data for the file * @ns: optional namespace tag of the file - * @static_name: don't copy file name + * @name_is_static: don't copy file name * @key: lockdep key for the file's active_ref, %NULL to disable lockdep * * Returns the created node on success, ERR_PTR() value on error. diff --git a/fs/libfs.c b/fs/libfs.c index 88e3e00e2eca..171d2846f2a3 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1075,3 +1075,21 @@ struct inode *alloc_anon_inode(struct super_block *s) return inode; } EXPORT_SYMBOL(alloc_anon_inode); + +/** + * simple_nosetlease - generic helper for prohibiting leases + * @filp: file pointer + * @arg: type of lease to obtain + * @flp: new lease supplied for insertion + * @priv: private data for lm_setup operation + * + * Generic helper for filesystems that do not wish to allow leases to be set. + * All arguments are ignored and it just returns -EINVAL. + */ +int +simple_nosetlease(struct file *filp, long arg, struct file_lock **flp, + void **priv) +{ + return -EINVAL; +} +EXPORT_SYMBOL(simple_nosetlease); diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile index ca58d64374ca..9b320cc2a8cf 100644 --- a/fs/lockd/Makefile +++ b/fs/lockd/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_LOCKD) += lockd.o lockd-objs-y := clntlock.o clntproc.o clntxdr.o host.o svc.o svclock.o \ - svcshare.o svcproc.o svcsubs.o mon.o xdr.o grace.o + svcshare.o svcproc.o svcsubs.o mon.o xdr.o lockd-objs-$(CONFIG_LOCKD_V4) += clnt4xdr.o xdr4.o svc4proc.o +lockd-objs-$(CONFIG_PROC_FS) += procfs.o lockd-objs := $(lockd-objs-y) diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 1812f026960c..9106f42c472c 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -159,6 +159,12 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res, msg.rpc_proc = &clnt->cl_procinfo[proc]; status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN); + if (status == -ECONNREFUSED) { + dprintk("lockd: NSM upcall RPC failed, status=%d, forcing rebind\n", + status); + rpc_force_rebind(clnt); + status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN); + } if (status < 0) dprintk("lockd: NSM upcall RPC failed, status=%d\n", status); @@ -306,11 +312,9 @@ static struct nsm_handle *nsm_lookup_priv(const struct nsm_private *priv) static void nsm_init_private(struct nsm_handle *nsm) { u64 *p = (u64 *)&nsm->sm_priv.data; - struct timespec ts; s64 ns; - ktime_get_ts(&ts); - ns = timespec_to_ns(&ts); + ns = ktime_get_ns(); put_unaligned(ns, p); put_unaligned((unsigned long)nsm, p + 1); } diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h index 5010b55628b4..097bfa3adb1c 100644 --- a/fs/lockd/netns.h +++ b/fs/lockd/netns.h @@ -11,7 +11,6 @@ struct lockd_net { struct delayed_work grace_period_end; struct lock_manager lockd_manager; - struct list_head grace_list; spinlock_t nsm_clnt_lock; unsigned int nsm_users; diff --git a/fs/lockd/procfs.c b/fs/lockd/procfs.c new file mode 
100644 index 000000000000..2a0a98480e39 --- /dev/null +++ b/fs/lockd/procfs.c @@ -0,0 +1,92 @@ +/* + * Procfs support for lockd + * + * Copyright (c) 2014 Jeff Layton <jlayton@primarydata.com> + */ + +#include <linux/fs.h> +#include <linux/proc_fs.h> +#include <linux/module.h> +#include <linux/nsproxy.h> +#include <net/net_namespace.h> + +#include "netns.h" +#include "procfs.h" + +/* + * We only allow strings that start with 'Y', 'y', or '1'. + */ +static ssize_t +nlm_end_grace_write(struct file *file, const char __user *buf, size_t size, + loff_t *pos) +{ + char *data; + struct lockd_net *ln = net_generic(current->nsproxy->net_ns, + lockd_net_id); + + if (size < 1) + return -EINVAL; + + data = simple_transaction_get(file, buf, size); + if (IS_ERR(data)) + return PTR_ERR(data); + + switch(data[0]) { + case 'Y': + case 'y': + case '1': + locks_end_grace(&ln->lockd_manager); + break; + default: + return -EINVAL; + } + + return size; +} + +static ssize_t +nlm_end_grace_read(struct file *file, char __user *buf, size_t size, + loff_t *pos) +{ + struct lockd_net *ln = net_generic(current->nsproxy->net_ns, + lockd_net_id); + char resp[3]; + + resp[0] = list_empty(&ln->lockd_manager.list) ? 'Y' : 'N'; + resp[1] = '\n'; + resp[2] = '\0'; + + return simple_read_from_buffer(buf, size, pos, resp, sizeof(resp)); +} + +static const struct file_operations lockd_end_grace_operations = { + .write = nlm_end_grace_write, + .read = nlm_end_grace_read, + .llseek = default_llseek, + .release = simple_transaction_release, + .owner = THIS_MODULE, +}; + +int __init +lockd_create_procfs(void) +{ + struct proc_dir_entry *entry; + + entry = proc_mkdir("fs/lockd", NULL); + if (!entry) + return -ENOMEM; + entry = proc_create("nlm_end_grace", S_IRUGO|S_IWUSR, entry, + &lockd_end_grace_operations); + if (!entry) { + remove_proc_entry("fs/lockd", NULL); + return -ENOMEM; + } + return 0; +} + +void __exit +lockd_remove_procfs(void) +{ + remove_proc_entry("fs/lockd/nlm_end_grace", NULL); + remove_proc_entry("fs/lockd", NULL); +} diff --git a/fs/lockd/procfs.h b/fs/lockd/procfs.h new file mode 100644 index 000000000000..2257a1311027 --- /dev/null +++ b/fs/lockd/procfs.h @@ -0,0 +1,28 @@ +/* + * Procfs support for lockd + * + * Copyright (c) 2014 Jeff Layton <jlayton@primarydata.com> + */ +#ifndef _LOCKD_PROCFS_H +#define _LOCKD_PROCFS_H + +#include <linux/kconfig.h> + +#if IS_ENABLED(CONFIG_PROC_FS) +int lockd_create_procfs(void); +void lockd_remove_procfs(void); +#else +static inline int +lockd_create_procfs(void) +{ + return 0; +} + +static inline void +lockd_remove_procfs(void) +{ + return; +} +#endif /* IS_ENABLED(CONFIG_PROC_FS) */ + +#endif /* _LOCKD_PROCFS_H */ diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 8f27c93f8d2e..d1bb7ecfd201 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -36,6 +36,7 @@ #include <linux/nfs.h> #include "netns.h" +#include "procfs.h" #define NLMDBG_FACILITY NLMDBG_SVC #define LOCKD_BUFSIZE (1024 + NLMSVC_XDRSIZE) @@ -253,13 +254,11 @@ static int lockd_up_net(struct svc_serv *serv, struct net *net) error = make_socks(serv, net); if (error < 0) - goto err_socks; + goto err_bind; set_grace_period(net); dprintk("lockd_up_net: per-net data created; net=%p\n", net); return 0; -err_socks: - svc_rpcb_cleanup(serv, net); err_bind: ln->nlmsvc_users--; return error; @@ -306,13 +305,16 @@ static int lockd_start_svc(struct svc_serv *serv) svc_sock_update_bufs(serv); serv->sv_maxconn = nlm_max_connections; - nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, "%s", serv->sv_name); + nlmsvc_task = 
kthread_create(lockd, nlmsvc_rqst, "%s", serv->sv_name); if (IS_ERR(nlmsvc_task)) { error = PTR_ERR(nlmsvc_task); printk(KERN_WARNING "lockd_up: kthread_run failed, error=%d\n", error); goto out_task; } + nlmsvc_rqst->rq_task = nlmsvc_task; + wake_up_process(nlmsvc_task); + dprintk("lockd_up: service started\n"); return 0; @@ -583,7 +585,7 @@ static int lockd_init_net(struct net *net) struct lockd_net *ln = net_generic(net, lockd_net_id); INIT_DELAYED_WORK(&ln->grace_period_end, grace_ender); - INIT_LIST_HEAD(&ln->grace_list); + INIT_LIST_HEAD(&ln->lockd_manager.list); spin_lock_init(&ln->nsm_clnt_lock); return 0; } @@ -617,8 +619,15 @@ static int __init init_nlm(void) err = register_pernet_subsys(&lockd_net_ops); if (err) goto err_pernet; + + err = lockd_create_procfs(); + if (err) + goto err_procfs; + return 0; +err_procfs: + unregister_pernet_subsys(&lockd_net_ops); err_pernet: #ifdef CONFIG_SYSCTL unregister_sysctl_table(nlm_sysctl_table); @@ -631,6 +640,7 @@ static void __exit exit_nlm(void) { /* FIXME: delete all NLM clients */ nlm_shutdown_hosts(); + lockd_remove_procfs(); unregister_pernet_subsys(&lockd_net_ops); #ifdef CONFIG_SYSCTL unregister_sysctl_table(nlm_sysctl_table); diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index ab798a88ec1d..13db95f54176 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -245,7 +245,6 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_host *host, block->b_daemon = rqstp->rq_server; block->b_host = host; block->b_file = file; - block->b_fl = NULL; file->f_count++; /* Add to file's list of blocks */ @@ -295,7 +294,6 @@ static void nlmsvc_free_block(struct kref *kref) nlmsvc_freegrantargs(block->b_call); nlmsvc_release_call(block->b_call); nlm_release_file(block->b_file); - kfree(block->b_fl); kfree(block); } @@ -508,7 +506,6 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, struct nlm_host *host, struct nlm_lock *lock, struct nlm_lock *conflock, struct nlm_cookie *cookie) { - struct nlm_block *block = NULL; int error; __be32 ret; @@ -519,63 +516,26 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end); - /* Get existing block (in case client is busy-waiting) */ - block = nlmsvc_lookup_block(file, lock); - - if (block == NULL) { - struct file_lock *conf = kzalloc(sizeof(*conf), GFP_KERNEL); - - if (conf == NULL) - return nlm_granted; - block = nlmsvc_create_block(rqstp, host, file, lock, cookie); - if (block == NULL) { - kfree(conf); - return nlm_granted; - } - block->b_fl = conf; - } - if (block->b_flags & B_QUEUED) { - dprintk("lockd: nlmsvc_testlock deferred block %p flags %d fl %p\n", - block, block->b_flags, block->b_fl); - if (block->b_flags & B_TIMED_OUT) { - nlmsvc_unlink_block(block); - ret = nlm_lck_denied; - goto out; - } - if (block->b_flags & B_GOT_CALLBACK) { - nlmsvc_unlink_block(block); - if (block->b_fl != NULL - && block->b_fl->fl_type != F_UNLCK) { - lock->fl = *block->b_fl; - goto conf_lock; - } else { - ret = nlm_granted; - goto out; - } - } - ret = nlm_drop_reply; - goto out; - } - if (locks_in_grace(SVC_NET(rqstp))) { ret = nlm_lck_denied_grace_period; goto out; } + error = vfs_test_lock(file->f_file, &lock->fl); - if (error == FILE_LOCK_DEFERRED) { - ret = nlmsvc_defer_lock_rqst(rqstp, block); - goto out; - } if (error) { + /* We can't currently deal with deferred test requests */ + if (error == FILE_LOCK_DEFERRED) + WARN_ON_ONCE(1); + ret = nlm_lck_denied_nolocks; goto out; } + if (lock->fl.fl_type == F_UNLCK) { ret = 
	nlm_granted;
 		goto out;
 	}
 
-conf_lock:
 	dprintk("lockd: conflicting lock(ty=%d, %Ld-%Ld)\n",
 		lock->fl.fl_type, (long long)lock->fl.fl_start,
 		(long long)lock->fl.fl_end);
@@ -586,10 +546,9 @@ conf_lock:
 	conflock->fl.fl_type = lock->fl.fl_type;
 	conflock->fl.fl_start = lock->fl.fl_start;
 	conflock->fl.fl_end = lock->fl.fl_end;
+	locks_release_private(&lock->fl);
 	ret = nlm_lck_denied;
 out:
-	if (block)
-		nlmsvc_release_block(block);
 	return ret;
 }
 
@@ -660,29 +619,22 @@ nlmsvc_cancel_blocked(struct net *net, struct nlm_file *file, struct nlm_lock *l
  * This is a callback from the filesystem for VFS file lock requests.
  * It will be used if lm_grant is defined and the filesystem can not
  * respond to the request immediately.
- * For GETLK request it will copy the reply to the nlm_block.
  * For SETLK or SETLKW request it will get the local posix lock.
  * In all cases it will move the block to the head of nlm_blocked q where
  * nlmsvc_retry_blocked() can send back a reply for SETLKW or revisit the
  * deferred rpc for GETLK and SETLK.
  */
 static void
-nlmsvc_update_deferred_block(struct nlm_block *block, struct file_lock *conf,
-			     int result)
+nlmsvc_update_deferred_block(struct nlm_block *block, int result)
 {
 	block->b_flags |= B_GOT_CALLBACK;
 	if (result == 0)
 		block->b_granted = 1;
 	else
 		block->b_flags |= B_TIMED_OUT;
-	if (conf) {
-		if (block->b_fl)
-			__locks_copy_lock(block->b_fl, conf);
-	}
 }
 
-static int nlmsvc_grant_deferred(struct file_lock *fl, struct file_lock *conf,
-				 int result)
+static int nlmsvc_grant_deferred(struct file_lock *fl, int result)
 {
 	struct nlm_block *block;
 	int rc = -ENOENT;
@@ -697,7 +649,7 @@ static int nlmsvc_grant_deferred(struct file_lock *fl, struct file_lock *conf,
 				rc = -ENOLCK;
 				break;
 			}
-			nlmsvc_update_deferred_block(block, conf, result);
+			nlmsvc_update_deferred_block(block, result);
 		} else if (result == 0)
 			block->b_granted = 1;
 
diff --git a/fs/locks.c b/fs/locks.c
index 717fbc404e6b..735b8d3fa78c 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -230,8 +230,12 @@ void locks_release_private(struct file_lock *fl)
 			fl->fl_ops->fl_release_private(fl);
 		fl->fl_ops = NULL;
 	}
-	fl->fl_lmops = NULL;
+	if (fl->fl_lmops) {
+		if (fl->fl_lmops->lm_put_owner)
+			fl->fl_lmops->lm_put_owner(fl);
+		fl->fl_lmops = NULL;
+	}
 }
 EXPORT_SYMBOL_GPL(locks_release_private);
 
@@ -247,6 +251,18 @@ void locks_free_lock(struct file_lock *fl)
 }
 EXPORT_SYMBOL(locks_free_lock);
 
+static void
+locks_dispose_list(struct list_head *dispose)
+{
+	struct file_lock *fl;
+
+	while (!list_empty(dispose)) {
+		fl = list_first_entry(dispose, struct file_lock, fl_block);
+		list_del_init(&fl->fl_block);
+		locks_free_lock(fl);
+	}
+}
+
 void locks_init_lock(struct file_lock *fl)
 {
 	memset(fl, 0, sizeof(struct file_lock));
@@ -255,21 +271,10 @@ void locks_init_lock(struct file_lock *fl)
 
 EXPORT_SYMBOL(locks_init_lock);
 
-static void locks_copy_private(struct file_lock *new, struct file_lock *fl)
-{
-	if (fl->fl_ops) {
-		if (fl->fl_ops->fl_copy_lock)
-			fl->fl_ops->fl_copy_lock(new, fl);
-		new->fl_ops = fl->fl_ops;
-	}
-	if (fl->fl_lmops)
-		new->fl_lmops = fl->fl_lmops;
-}
-
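The locks_copy_private() helper deleted above shared fl_ops/fl_lmops pointers with no lifetime management, which is what made conflicting-lock copies fragile. Its replacement below pairs every owner reference taken at copy time (lm_get_owner) with a drop in locks_release_private() (lm_put_owner). A sketch of the callback pair a lock manager would now provide; get_ref() and put_ref() are hypothetical stand-ins for whatever refcounting the owner object supports:

static void example_lm_get_owner(struct file_lock *new, struct file_lock *fl)
{
	/* invoked from locks_copy_conflock(): the copy shares the owner */
	new->fl_owner = get_ref(fl->fl_owner);	/* hypothetical helper */
}

static void example_lm_put_owner(struct file_lock *fl)
{
	/* invoked from locks_release_private() when the copy is torn down */
	put_ref(fl->fl_owner);			/* hypothetical helper */
	fl->fl_owner = NULL;
}

static const struct lock_manager_operations example_lm_ops = {
	.lm_get_owner	= example_lm_get_owner,
	.lm_put_owner	= example_lm_put_owner,
};
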
 /*
  * Initialize a new lock from an existing file_lock structure.
  */
-void __locks_copy_lock(struct file_lock *new, const struct file_lock *fl)
+void locks_copy_conflock(struct file_lock *new, struct file_lock *fl)
 {
 	new->fl_owner = fl->fl_owner;
 	new->fl_pid = fl->fl_pid;
@@ -278,21 +283,30 @@ void __locks_copy_lock(struct file_lock *new, const struct file_lock *fl)
 	new->fl_type = fl->fl_type;
 	new->fl_start = fl->fl_start;
 	new->fl_end = fl->fl_end;
+	new->fl_lmops = fl->fl_lmops;
 	new->fl_ops = NULL;
-	new->fl_lmops = NULL;
+
+	if (fl->fl_lmops) {
+		if (fl->fl_lmops->lm_get_owner)
+			fl->fl_lmops->lm_get_owner(new, fl);
+	}
 }
-EXPORT_SYMBOL(__locks_copy_lock);
+EXPORT_SYMBOL(locks_copy_conflock);
 
 void locks_copy_lock(struct file_lock *new, struct file_lock *fl)
 {
-	locks_release_private(new);
+	/* "new" must be a freshly-initialized lock */
+	WARN_ON_ONCE(new->fl_ops);
+
+	locks_copy_conflock(new, fl);
 
-	__locks_copy_lock(new, fl);
 	new->fl_file = fl->fl_file;
 	new->fl_ops = fl->fl_ops;
-	new->fl_lmops = fl->fl_lmops;
 
-	locks_copy_private(new, fl);
+	if (fl->fl_ops) {
+		if (fl->fl_ops->fl_copy_lock)
+			fl->fl_ops->fl_copy_lock(new, fl);
+	}
 }
 EXPORT_SYMBOL(locks_copy_lock);
 
@@ -312,27 +326,27 @@ static inline int flock_translate_cmd(int cmd) {
 }
 
 /* Fill in a file_lock structure with an appropriate FLOCK lock. */
-static int flock_make_lock(struct file *filp, struct file_lock **lock,
-		unsigned int cmd)
+static struct file_lock *
+flock_make_lock(struct file *filp, unsigned int cmd)
 {
 	struct file_lock *fl;
 	int type = flock_translate_cmd(cmd);
+
 	if (type < 0)
-		return type;
+		return ERR_PTR(type);
 
 	fl = locks_alloc_lock();
 	if (fl == NULL)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 
 	fl->fl_file = filp;
-	fl->fl_owner = (fl_owner_t)filp;
+	fl->fl_owner = filp;
 	fl->fl_pid = current->tgid;
 	fl->fl_flags = FL_FLOCK;
 	fl->fl_type = type;
 	fl->fl_end = OFFSET_MAX;
 
-	*lock = fl;
-	return 0;
+	return fl;
 }
 
 static int assign_type(struct file_lock *fl, long type)
@@ -413,14 +427,34 @@ static int flock_to_posix_lock(struct file *filp, struct file_lock *fl,
 }
 
 /* default lease lock manager operations */
-static void lease_break_callback(struct file_lock *fl)
+static bool
+lease_break_callback(struct file_lock *fl)
 {
 	kill_fasync(&fl->fl_fasync, SIGIO, POLL_MSG);
+	return false;
+}
+
+static void
+lease_setup(struct file_lock *fl, void **priv)
+{
+	struct file *filp = fl->fl_file;
+	struct fasync_struct *fa = *priv;
+
+	/*
+	 * fasync_insert_entry() returns the old entry if any. If there was no
+	 * old entry, then it used "priv" and inserted it into the fasync list.
+	 * Clear the pointer to indicate that it shouldn't be freed.
+	 */
+	if (!fasync_insert_entry(fa->fa_fd, filp, &fl->fl_fasync, fa))
+		*priv = NULL;
+
+	__f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
 }
 
 static const struct lock_manager_operations lease_manager_ops = {
 	.lm_break = lease_break_callback,
 	.lm_change = lease_modify,
+	.lm_setup = lease_setup,
 };
 
 /*
@@ -431,7 +465,7 @@ static int lease_init(struct file *filp, long type, struct file_lock *fl)
 	if (assign_type(fl, type) != 0)
 		return -EINVAL;
 
-	fl->fl_owner = (fl_owner_t)current->files;
+	fl->fl_owner = filp;
 	fl->fl_pid = current->tgid;
 
 	fl->fl_file = filp;
@@ -650,12 +684,16 @@ static void locks_unlink_lock(struct file_lock **thisfl_p)
  *
  * Must be called with i_lock held!
  */
-static void locks_delete_lock(struct file_lock **thisfl_p)
+static void locks_delete_lock(struct file_lock **thisfl_p,
+			      struct list_head *dispose)
 {
 	struct file_lock *fl = *thisfl_p;
 
 	locks_unlink_lock(thisfl_p);
-	locks_free_lock(fl);
+	if (dispose)
+		list_add(&fl->fl_block, dispose);
+	else
+		locks_free_lock(fl);
 }
 
 /* Determine if lock sys_fl blocks lock caller_fl. Common functionality
@@ -718,7 +756,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
 			break;
 	}
 	if (cfl) {
-		__locks_copy_lock(fl, cfl);
+		locks_copy_conflock(fl, cfl);
 		if (cfl->fl_nspid)
 			fl->fl_pid = pid_vnr(cfl->fl_nspid);
 	} else
@@ -811,6 +849,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
 	struct inode * inode = file_inode(filp);
 	int error = 0;
 	int found = 0;
+	LIST_HEAD(dispose);
 
 	if (!(request->fl_flags & FL_ACCESS) && (request->fl_type != F_UNLCK)) {
 		new_fl = locks_alloc_lock();
@@ -833,7 +872,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
 		if (request->fl_type == fl->fl_type)
 			goto out;
 		found = 1;
-		locks_delete_lock(before);
+		locks_delete_lock(before, &dispose);
 		break;
 	}
 
@@ -880,6 +919,7 @@ out:
 	spin_unlock(&inode->i_lock);
 	if (new_fl)
 		locks_free_lock(new_fl);
+	locks_dispose_list(&dispose);
 	return error;
 }
 
@@ -893,6 +933,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 	struct file_lock **before;
 	int error;
 	bool added = false;
+	LIST_HEAD(dispose);
 
 	/*
 	 * We may need two file_lock structures for this operation,
@@ -921,7 +962,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 			if (!posix_locks_conflict(request, fl))
 				continue;
 			if (conflock)
-				__locks_copy_lock(conflock, fl);
+				locks_copy_conflock(conflock, fl);
 			error = -EAGAIN;
 			if (!(request->fl_flags & FL_SLEEP))
 				goto out;
@@ -988,7 +1029,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 			else
 				request->fl_end = fl->fl_end;
 			if (added) {
-				locks_delete_lock(before);
+				locks_delete_lock(before, &dispose);
 				continue;
 			}
 			request = fl;
@@ -1018,21 +1059,24 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 			 * one (This may happen several times).
 			 */
 			if (added) {
-				locks_delete_lock(before);
+				locks_delete_lock(before, &dispose);
 				continue;
 			}
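Every locks_delete_lock() call in these hunks now takes a dispose list: victims are unlinked while inode->i_lock is held, but only freed by locks_dispose_list() after the spinlock is dropped, since freeing can trigger work (fasync teardown, the lm_put_owner callback above) that should not run under a spinlock. A sketch of how the pair composes; both helpers are static to fs/locks.c, and locks_remove_file() further down follows exactly this shape:

static void example_purge(struct inode *inode, struct file *filp)
{
	struct file_lock **before, *fl;
	LIST_HEAD(dispose);

	spin_lock(&inode->i_lock);		/* nothing is freed in here */
	for (before = &inode->i_flock; (fl = *before) != NULL; ) {
		if (fl->fl_file == filp)
			locks_delete_lock(before, &dispose);	/* park it */
		else
			before = &fl->fl_next;
	}
	spin_unlock(&inode->i_lock);
	locks_dispose_list(&dispose);		/* the actual frees happen here */
}
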
-			/* Replace the old lock with the new one.
-			 * Wake up anybody waiting for the old one,
-			 * as the change in lock type might satisfy
-			 * their needs.
+			/*
+			 * Replace the old lock with new_fl, and
+			 * remove the old one. It's safe to do the
+			 * insert here since we know that we won't be
+			 * using new_fl later, and that the lock is
+			 * just replacing an existing lock.
 			 */
-			locks_wake_up_blocks(fl);
-			fl->fl_start = request->fl_start;
-			fl->fl_end = request->fl_end;
-			fl->fl_type = request->fl_type;
-			locks_release_private(fl);
-			locks_copy_private(fl, request);
-			request = fl;
+			error = -ENOLCK;
+			if (!new_fl)
+				goto out;
+			locks_copy_lock(new_fl, request);
+			request = new_fl;
+			new_fl = NULL;
+			locks_delete_lock(before, &dispose);
+			locks_insert_lock(before, request);
 			added = true;
 		}
 	}
@@ -1093,6 +1137,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 	locks_free_lock(new_fl);
 	if (new_fl2)
 		locks_free_lock(new_fl2);
+	locks_dispose_list(&dispose);
 	return error;
 }
 
@@ -1155,7 +1200,6 @@ EXPORT_SYMBOL(posix_lock_file_wait);
 int locks_mandatory_locked(struct file *file)
 {
 	struct inode *inode = file_inode(file);
-	fl_owner_t owner = current->files;
 	struct file_lock *fl;
 
 	/*
@@ -1165,7 +1209,8 @@ int locks_mandatory_locked(struct file *file)
 	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
 		if (!IS_POSIX(fl))
 			continue;
-		if (fl->fl_owner != owner && fl->fl_owner != (fl_owner_t)file)
+		if (fl->fl_owner != current->files &&
+		    fl->fl_owner != file)
 			break;
 	}
 	spin_unlock(&inode->i_lock);
@@ -1205,7 +1250,7 @@ int locks_mandatory_area(int read_write, struct inode *inode,
 	for (;;) {
 		if (filp) {
-			fl.fl_owner = (fl_owner_t)filp;
+			fl.fl_owner = filp;
 			fl.fl_flags &= ~FL_SLEEP;
 			error = __posix_lock_file(inode, &fl, NULL);
 			if (!error)
@@ -1249,7 +1294,7 @@ static void lease_clear_pending(struct file_lock *fl, int arg)
 }
 
 /* We already had a lease on this file; just change its type */
-int lease_modify(struct file_lock **before, int arg)
+int lease_modify(struct file_lock **before, int arg, struct list_head *dispose)
 {
 	struct file_lock *fl = *before;
 	int error = assign_type(fl, arg);
@@ -1268,11 +1313,10 @@ int lease_modify(struct file_lock **before, int arg)
 			printk(KERN_ERR "locks_delete_lock: fasync == %p\n",
 							fl->fl_fasync);
 			fl->fl_fasync = NULL;
 		}
-		locks_delete_lock(before);
+		locks_delete_lock(before, dispose);
 	}
 	return 0;
 }
-
 EXPORT_SYMBOL(lease_modify);
 
 static bool past_time(unsigned long then)
@@ -1283,18 +1327,20 @@ static bool past_time(unsigned long then)
 	return time_after(jiffies, then);
 }
 
-static void time_out_leases(struct inode *inode)
+static void time_out_leases(struct inode *inode, struct list_head *dispose)
 {
 	struct file_lock **before;
 	struct file_lock *fl;
 
+	lockdep_assert_held(&inode->i_lock);
+
 	before = &inode->i_flock;
 	while ((fl = *before) && IS_LEASE(fl) && lease_breaking(fl)) {
 		trace_time_out_leases(inode, fl);
 		if (past_time(fl->fl_downgrade_time))
-			lease_modify(before, F_RDLCK);
+			lease_modify(before, F_RDLCK, dispose);
 		if (past_time(fl->fl_break_time))
-			lease_modify(before, F_UNLCK);
+			lease_modify(before, F_UNLCK, dispose);
 		if (fl == *before)	/* lease_modify may have freed fl */
 			before = &fl->fl_next;
 	}
@@ -1307,6 +1353,20 @@ static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker)
 	return locks_conflict(breaker, lease);
 }
 
+static bool
+any_leases_conflict(struct inode *inode, struct file_lock *breaker)
+{
+	struct file_lock *fl;
+
+	lockdep_assert_held(&inode->i_lock);
+
+	for (fl = inode->i_flock ; fl && IS_LEASE(fl); fl = fl->fl_next) {
+		if (leases_conflict(fl, breaker))
+			return true;
+	}
+	return false;
+}
+
 /**
  *	__break_lease	-	revoke all outstanding leases on file
  *	@inode: the inode of the file to return
@@ -1323,12 +1383,11 @@ static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker)
 int __break_lease(struct inode *inode, unsigned int mode,
unsigned int type) { int error = 0; - struct file_lock *new_fl, *flock; - struct file_lock *fl; + struct file_lock *new_fl; + struct file_lock *fl, **before; unsigned long break_time; - int i_have_this_lease = 0; - bool lease_conflict = false; int want_write = (mode & O_ACCMODE) != O_RDONLY; + LIST_HEAD(dispose); new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK); if (IS_ERR(new_fl)) @@ -1337,20 +1396,9 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) spin_lock(&inode->i_lock); - time_out_leases(inode); - - flock = inode->i_flock; - if ((flock == NULL) || !IS_LEASE(flock)) - goto out; + time_out_leases(inode, &dispose); - for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next) { - if (leases_conflict(fl, new_fl)) { - lease_conflict = true; - if (fl->fl_owner == current->files) - i_have_this_lease = 1; - } - } - if (!lease_conflict) + if (!any_leases_conflict(inode, new_fl)) goto out; break_time = 0; @@ -1360,7 +1408,9 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) break_time++; /* so that 0 means no break time */ } - for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next) { + for (before = &inode->i_flock; + ((fl = *before) != NULL) && IS_LEASE(fl); + before = &fl->fl_next) { if (!leases_conflict(fl, new_fl)) continue; if (want_write) { @@ -1369,51 +1419,56 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) fl->fl_flags |= FL_UNLOCK_PENDING; fl->fl_break_time = break_time; } else { - if (lease_breaking(flock)) + if (lease_breaking(inode->i_flock)) continue; fl->fl_flags |= FL_DOWNGRADE_PENDING; fl->fl_downgrade_time = break_time; } - fl->fl_lmops->lm_break(fl); + if (fl->fl_lmops->lm_break(fl)) + locks_delete_lock(before, &dispose); } - if (i_have_this_lease || (mode & O_NONBLOCK)) { + fl = inode->i_flock; + if (!fl || !IS_LEASE(fl)) + goto out; + + if (mode & O_NONBLOCK) { trace_break_lease_noblock(inode, new_fl); error = -EWOULDBLOCK; goto out; } restart: - break_time = flock->fl_break_time; + break_time = inode->i_flock->fl_break_time; if (break_time != 0) break_time -= jiffies; if (break_time == 0) break_time++; - locks_insert_block(flock, new_fl); + locks_insert_block(inode->i_flock, new_fl); trace_break_lease_block(inode, new_fl); spin_unlock(&inode->i_lock); + locks_dispose_list(&dispose); error = wait_event_interruptible_timeout(new_fl->fl_wait, !new_fl->fl_next, break_time); spin_lock(&inode->i_lock); trace_break_lease_unblock(inode, new_fl); locks_delete_block(new_fl); if (error >= 0) { - if (error == 0) - time_out_leases(inode); /* * Wait for the next conflicting lease that has not been * broken yet */ - for (flock = inode->i_flock; flock && IS_LEASE(flock); - flock = flock->fl_next) { - if (leases_conflict(new_fl, flock)) - goto restart; - } + if (error == 0) + time_out_leases(inode, &dispose); + if (any_leases_conflict(inode, new_fl)) + goto restart; + error = 0; } out: spin_unlock(&inode->i_lock); + locks_dispose_list(&dispose); locks_free_lock(new_fl); return error; } @@ -1431,8 +1486,18 @@ EXPORT_SYMBOL(__break_lease); */ void lease_get_mtime(struct inode *inode, struct timespec *time) { - struct file_lock *flock = inode->i_flock; - if (flock && IS_LEASE(flock) && (flock->fl_type == F_WRLCK)) + bool has_lease = false; + struct file_lock *flock; + + if (inode->i_flock) { + spin_lock(&inode->i_lock); + flock = inode->i_flock; + if (flock && IS_LEASE(flock) && (flock->fl_type == F_WRLCK)) + has_lease = true; + spin_unlock(&inode->i_lock); + } + + if (has_lease) *time = 
current_fs_time(inode->i_sb); else *time = inode->i_mtime; @@ -1468,9 +1533,10 @@ int fcntl_getlease(struct file *filp) struct file_lock *fl; struct inode *inode = file_inode(filp); int type = F_UNLCK; + LIST_HEAD(dispose); spin_lock(&inode->i_lock); - time_out_leases(file_inode(filp)); + time_out_leases(file_inode(filp), &dispose); for (fl = file_inode(filp)->i_flock; fl && IS_LEASE(fl); fl = fl->fl_next) { if (fl->fl_file == filp) { @@ -1479,6 +1545,7 @@ int fcntl_getlease(struct file *filp) } } spin_unlock(&inode->i_lock); + locks_dispose_list(&dispose); return type; } @@ -1508,13 +1575,15 @@ check_conflicting_open(const struct dentry *dentry, const long arg) return ret; } -static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp) +static int +generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv) { struct file_lock *fl, **before, **my_before = NULL, *lease; struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; bool is_deleg = (*flp)->fl_flags & FL_DELEG; int error; + LIST_HEAD(dispose); lease = *flp; trace_generic_add_lease(inode, lease); @@ -1537,6 +1606,8 @@ static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp return -EINVAL; } + spin_lock(&inode->i_lock); + time_out_leases(inode, &dispose); error = check_conflicting_open(dentry, arg); if (error) goto out; @@ -1572,10 +1643,11 @@ static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp } if (my_before != NULL) { - error = lease->fl_lmops->lm_change(my_before, arg); - if (!error) - *flp = *my_before; - goto out; + lease = *my_before; + error = lease->fl_lmops->lm_change(my_before, arg, &dispose); + if (error) + goto out; + goto out_setup; } error = -EINVAL; @@ -1595,43 +1667,61 @@ static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp smp_mb(); error = check_conflicting_open(dentry, arg); if (error) - locks_unlink_lock(flp); + goto out_unlink; + +out_setup: + if (lease->fl_lmops->lm_setup) + lease->fl_lmops->lm_setup(lease, priv); out: + spin_unlock(&inode->i_lock); + locks_dispose_list(&dispose); if (is_deleg) mutex_unlock(&inode->i_mutex); + if (!error && !my_before) + *flp = NULL; return error; +out_unlink: + locks_unlink_lock(before); + goto out; } -static int generic_delete_lease(struct file *filp, struct file_lock **flp) +static int generic_delete_lease(struct file *filp) { + int error = -EAGAIN; struct file_lock *fl, **before; struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; + LIST_HEAD(dispose); - trace_generic_delete_lease(inode, *flp); - + spin_lock(&inode->i_lock); + time_out_leases(inode, &dispose); for (before = &inode->i_flock; ((fl = *before) != NULL) && IS_LEASE(fl); before = &fl->fl_next) { - if (fl->fl_file != filp) - continue; - return (*flp)->fl_lmops->lm_change(before, F_UNLCK); + if (fl->fl_file == filp) + break; } - return -EAGAIN; + trace_generic_delete_lease(inode, fl); + if (fl) + error = fl->fl_lmops->lm_change(before, F_UNLCK, &dispose); + spin_unlock(&inode->i_lock); + locks_dispose_list(&dispose); + return error; } /** * generic_setlease - sets a lease on an open file - * @filp: file pointer - * @arg: type of lease to obtain - * @flp: input - file_lock to use, output - file_lock inserted + * @filp: file pointer + * @arg: type of lease to obtain + * @flp: input - file_lock to use, output - file_lock inserted + * @priv: private data for lm_setup (may be NULL if lm_setup + * doesn't require it) * * The (input) 
flp->fl_lmops->lm_break function is required
 * by break_lease().
- *
- * Called with inode->i_lock held.
  */
-int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
+int generic_setlease(struct file *filp, long arg, struct file_lock **flp,
+			void **priv)
 {
 	struct dentry *dentry = filp->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
@@ -1645,83 +1735,52 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
 	if (error)
 		return error;
 
-	time_out_leases(inode);
-
-	BUG_ON(!(*flp)->fl_lmops->lm_break);
-
 	switch (arg) {
 	case F_UNLCK:
-		return generic_delete_lease(filp, flp);
+		return generic_delete_lease(filp);
 	case F_RDLCK:
 	case F_WRLCK:
-		return generic_add_lease(filp, arg, flp);
+		if (!(*flp)->fl_lmops->lm_break) {
+			WARN_ON_ONCE(1);
+			return -ENOLCK;
+		}
+		return generic_add_lease(filp, arg, flp, priv);
 	default:
 		return -EINVAL;
 	}
 }
 EXPORT_SYMBOL(generic_setlease);
 
-static int __vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
-{
-	if (filp->f_op->setlease)
-		return filp->f_op->setlease(filp, arg, lease);
-	else
-		return generic_setlease(filp, arg, lease);
-}
-
 /**
- * vfs_setlease        -       sets a lease on an open file
- * @filp: file pointer
- * @arg: type of lease to obtain
- * @lease: file_lock to use
- *
- * Call this to establish a lease on the file.
- * The (*lease)->fl_lmops->lm_break operation must be set; if not,
- * break_lease will oops!
- *
- * This will call the filesystem's setlease file method, if
- * defined.  Note that there is no getlease method; instead, the
- * filesystem setlease method should call back to setlease() to
- * add a lease to the inode's lease list, where fcntl_getlease() can
- * find it.  Since fcntl_getlease() only reports whether the current
- * task holds a lease, a cluster filesystem need only do this for
- * leases held by processes on this node.
- *
- * There is also no break_lease method; filesystems that
- * handle their own leases should break leases themselves from the
- * filesystem's open, create, and (on truncate) setattr methods.
- *
- * Warning: the only current setlease methods exist only to disable
- * leases in certain cases.  More vfs changes may be required to
- * allow a full filesystem lease implementation.
+ * vfs_setlease        -       sets a lease on an open file
+ * @filp:	file pointer
+ * @arg:	type of lease to obtain
+ * @lease:	file_lock to use when adding a lease
+ * @priv:	private info for lm_setup when adding a lease (may be
+ *		NULL if lm_setup doesn't require it)
+ *
+ * Call this to establish a lease on the file. The "lease" argument is not
+ * used for F_UNLCK requests and may be NULL. For commands that set or alter
+ * an existing lease, the (*lease)->fl_lmops->lm_break operation must be set;
+ * if not, this function will return -ENOLCK (and generate a scary-looking
+ * stack trace).
+ *
+ * The "priv" pointer is passed directly to the lm_setup function as-is. It
+ * may be NULL if the lm_setup operation doesn't require it.
  */
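A sketch of what an in-kernel caller looks like under this contract, modeled loosely on how nfsd wires up delegations; the example_* names are invented and error handling is trimmed:

static bool example_lm_break(struct file_lock *fl)
{
	/* tell the owner to give the lease up; keep fl on the list for now */
	return false;
}

static const struct lock_manager_operations example_lm_ops = {
	.lm_break	= example_lm_break,
	.lm_change	= lease_modify,
};

static int example_take_read_lease(struct file *filp)
{
	struct file_lock *fl = locks_alloc_lock();
	int error;

	if (!fl)
		return -ENOMEM;
	fl->fl_lmops = &example_lm_ops;
	fl->fl_flags = FL_LEASE;
	fl->fl_type = F_RDLCK;
	fl->fl_end = OFFSET_MAX;
	fl->fl_owner = filp;
	fl->fl_pid = current->tgid;
	fl->fl_file = filp;

	/* priv is NULL here: only the fcntl()/fasync path needs lm_setup */
	error = vfs_setlease(filp, F_RDLCK, &fl, NULL);
	if (fl)
		locks_free_lock(fl);	/* not consumed, so still ours to free */
	return error;
}
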
-
-int vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
+int
+vfs_setlease(struct file *filp, long arg, struct file_lock **lease, void **priv)
 {
-	struct inode *inode = file_inode(filp);
-	int error;
-
-	spin_lock(&inode->i_lock);
-	error = __vfs_setlease(filp, arg, lease);
-	spin_unlock(&inode->i_lock);
-
-	return error;
+	if (filp->f_op->setlease)
+		return filp->f_op->setlease(filp, arg, lease, priv);
+	else
+		return generic_setlease(filp, arg, lease, priv);
 }
 EXPORT_SYMBOL_GPL(vfs_setlease);
 
-static int do_fcntl_delete_lease(struct file *filp)
-{
-	struct file_lock fl, *flp = &fl;
-
-	lease_init(filp, F_UNLCK, flp);
-
-	return vfs_setlease(filp, F_UNLCK, &flp);
-}
-
 static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
 {
-	struct file_lock *fl, *ret;
-	struct inode *inode = file_inode(filp);
+	struct file_lock *fl;
 	struct fasync_struct *new;
 	int error;
 
@@ -1734,30 +1793,11 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
 		locks_free_lock(fl);
 		return -ENOMEM;
 	}
-	ret = fl;
-	spin_lock(&inode->i_lock);
-	error = __vfs_setlease(filp, arg, &ret);
-	if (error) {
-		spin_unlock(&inode->i_lock);
-		locks_free_lock(fl);
-		goto out_free_fasync;
-	}
-	if (ret != fl)
-		locks_free_lock(fl);
-
-	/*
-	 * fasync_insert_entry() returns the old entry if any.
-	 * If there was no old entry, then it used 'new' and
-	 * inserted it into the fasync list. Clear new so that
-	 * we don't release it here.
-	 */
-	if (!fasync_insert_entry(fd, filp, &ret->fl_fasync, new))
-		new = NULL;
-
-	error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
-	spin_unlock(&inode->i_lock);
+	new->fa_fd = fd;
 
-out_free_fasync:
+	error = vfs_setlease(filp, arg, &fl, (void **)&new);
+	if (fl)
+		locks_free_lock(fl);
 	if (new)
 		fasync_free(new);
 	return error;
@@ -1776,7 +1816,7 @@ out_free_fasync:
 int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
 {
 	if (arg == F_UNLCK)
-		return do_fcntl_delete_lease(filp);
+		return vfs_setlease(filp, F_UNLCK, NULL, NULL);
 	return do_fcntl_add_lease(fd, filp, arg);
 }
 
@@ -1845,9 +1885,12 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
 	    !(f.file->f_mode & (FMODE_READ|FMODE_WRITE)))
 		goto out_putf;
 
-	error = flock_make_lock(f.file, &lock, cmd);
-	if (error)
+	lock = flock_make_lock(f.file, cmd);
+	if (IS_ERR(lock)) {
+		error = PTR_ERR(lock);
 		goto out_putf;
+	}
+
 	if (can_sleep)
 		lock->fl_flags |= FL_SLEEP;
 
@@ -1948,7 +1991,7 @@ int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock __user *l)
 		cmd = F_GETLK;
 		file_lock.fl_flags |= FL_OFDLCK;
-		file_lock.fl_owner = (fl_owner_t)filp;
+		file_lock.fl_owner = filp;
 	}
 
 	error = vfs_test_lock(filp, &file_lock);
@@ -1959,11 +2002,13 @@ int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock __user *l)
 	if (file_lock.fl_type != F_UNLCK) {
 		error = posix_lock_to_flock(&flock, &file_lock);
 		if (error)
-			goto out;
+			goto rel_priv;
 	}
 	error = -EFAULT;
 	if (!copy_to_user(l, &flock, sizeof(flock)))
 		error = 0;
+rel_priv:
+	locks_release_private(&file_lock);
 out:
 	return error;
 }
@@ -2103,7 +2148,7 @@ again:
 		cmd = F_SETLK;
 		file_lock->fl_flags |= FL_OFDLCK;
-		file_lock->fl_owner = (fl_owner_t)filp;
+		file_lock->fl_owner = filp;
 		break;
 	case F_OFD_SETLKW:
 		error = -EINVAL;
@@ -2112,7 +2157,7 @@ again:
 		cmd = F_SETLKW;
 		file_lock->fl_flags |= FL_OFDLCK;
-		file_lock->fl_owner = (fl_owner_t)filp;
+		file_lock->fl_owner = filp;
 		/* Fallthrough */
 	case F_SETLKW:
 		file_lock->fl_flags |= FL_SLEEP;
@@ -2170,7 +2215,7 @@ int fcntl_getlk64(struct file *filp, unsigned int cmd, struct
flock64 __user *l) cmd = F_GETLK64; file_lock.fl_flags |= FL_OFDLCK; - file_lock.fl_owner = (fl_owner_t)filp; + file_lock.fl_owner = filp; } error = vfs_test_lock(filp, &file_lock); @@ -2184,7 +2229,8 @@ int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 __user *l) error = -EFAULT; if (!copy_to_user(l, &flock, sizeof(flock))) error = 0; - + + locks_release_private(&file_lock); out: return error; } @@ -2242,7 +2288,7 @@ again: cmd = F_SETLK64; file_lock->fl_flags |= FL_OFDLCK; - file_lock->fl_owner = (fl_owner_t)filp; + file_lock->fl_owner = filp; break; case F_OFD_SETLKW: error = -EINVAL; @@ -2251,7 +2297,7 @@ again: cmd = F_SETLKW64; file_lock->fl_flags |= FL_OFDLCK; - file_lock->fl_owner = (fl_owner_t)filp; + file_lock->fl_owner = filp; /* Fallthrough */ case F_SETLKW64: file_lock->fl_flags |= FL_SLEEP; @@ -2320,15 +2366,16 @@ void locks_remove_file(struct file *filp) struct inode * inode = file_inode(filp); struct file_lock *fl; struct file_lock **before; + LIST_HEAD(dispose); if (!inode->i_flock) return; - locks_remove_posix(filp, (fl_owner_t)filp); + locks_remove_posix(filp, filp); if (filp->f_op->flock) { struct file_lock fl = { - .fl_owner = (fl_owner_t)filp, + .fl_owner = filp, .fl_pid = current->tgid, .fl_file = filp, .fl_flags = FL_FLOCK, @@ -2346,7 +2393,7 @@ void locks_remove_file(struct file *filp) while ((fl = *before) != NULL) { if (fl->fl_file == filp) { if (IS_LEASE(fl)) { - lease_modify(before, F_UNLCK); + lease_modify(before, F_UNLCK, &dispose); continue; } @@ -2365,12 +2412,13 @@ void locks_remove_file(struct file *filp) fl->fl_type, fl->fl_flags, fl->fl_start, fl->fl_end); - locks_delete_lock(before); + locks_delete_lock(before, &dispose); continue; } before = &fl->fl_next; } spin_unlock(&inode->i_lock); + locks_dispose_list(&dispose); } /** @@ -2452,7 +2500,11 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, seq_puts(f, "FLOCK ADVISORY "); } } else if (IS_LEASE(fl)) { - seq_puts(f, "LEASE "); + if (fl->fl_flags & FL_DELEG) + seq_puts(f, "DELEG "); + else + seq_puts(f, "LEASE "); + if (lease_breaking(fl)) seq_puts(f, "BREAKING "); else if (fl->fl_file) @@ -2565,86 +2617,6 @@ static int __init proc_locks_init(void) module_init(proc_locks_init); #endif -/** - * lock_may_read - checks that the region is free of locks - * @inode: the inode that is being read - * @start: the first byte to read - * @len: the number of bytes to read - * - * Emulates Windows locking requirements. Whole-file - * mandatory locks (share modes) can prohibit a read and - * byte-range POSIX locks can prohibit a read if they overlap. - * - * N.B. this function is only ever called - * from knfsd and ownership of locks is never checked. - */ -int lock_may_read(struct inode *inode, loff_t start, unsigned long len) -{ - struct file_lock *fl; - int result = 1; - - spin_lock(&inode->i_lock); - for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { - if (IS_POSIX(fl)) { - if (fl->fl_type == F_RDLCK) - continue; - if ((fl->fl_end < start) || (fl->fl_start > (start + len))) - continue; - } else if (IS_FLOCK(fl)) { - if (!(fl->fl_type & LOCK_MAND)) - continue; - if (fl->fl_type & LOCK_READ) - continue; - } else - continue; - result = 0; - break; - } - spin_unlock(&inode->i_lock); - return result; -} - -EXPORT_SYMBOL(lock_may_read); - -/** - * lock_may_write - checks that the region is free of locks - * @inode: the inode that is being written - * @start: the first byte to write - * @len: the number of bytes to write - * - * Emulates Windows locking requirements. 
Whole-file - * mandatory locks (share modes) can prohibit a write and - * byte-range POSIX locks can prohibit a write if they overlap. - * - * N.B. this function is only ever called - * from knfsd and ownership of locks is never checked. - */ -int lock_may_write(struct inode *inode, loff_t start, unsigned long len) -{ - struct file_lock *fl; - int result = 1; - - spin_lock(&inode->i_lock); - for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { - if (IS_POSIX(fl)) { - if ((fl->fl_end < start) || (fl->fl_start > (start + len))) - continue; - } else if (IS_FLOCK(fl)) { - if (!(fl->fl_type & LOCK_MAND)) - continue; - if (fl->fl_type & LOCK_WRITE) - continue; - } else - continue; - result = 0; - break; - } - spin_unlock(&inode->i_lock); - return result; -} - -EXPORT_SYMBOL(lock_may_write); - static int __init filelock_init(void) { int i; diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c index 48140315f627..380d86e1ab45 100644 --- a/fs/logfs/readwrite.c +++ b/fs/logfs/readwrite.c @@ -1019,11 +1019,11 @@ static int __logfs_is_valid_block(struct inode *inode, u64 bix, u64 ofs) /** * logfs_is_valid_block - check whether this block is still valid * - * @sb - superblock - * @ofs - block physical offset - * @ino - block inode number - * @bix - block index - * @level - block level + * @sb: superblock + * @ofs: block physical offset + * @ino: block inode number + * @bix: block index + * @gc_level: block level * * Returns 0 if the block is invalid, 1 if it is valid and 2 if it will * become invalid once the journal is written. @@ -2226,10 +2226,9 @@ void btree_write_block(struct logfs_block *block) * * @inode: parent inode (ifile or directory) * @buf: object to write (inode or dentry) - * @n: object size - * @_pos: object number (file position in blocks/objects) + * @count: object size + * @bix: block index * @flags: write flags - * @lock: 0 if write lock is already taken, 1 otherwise * @shadow_tree: shadow below this inode * * FIXME: All caller of this put a 200-300 byte variable on the stack, diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c index 4bc50dac8e97..742942a983be 100644 --- a/fs/minix/bitmap.c +++ b/fs/minix/bitmap.c @@ -96,7 +96,7 @@ int minix_new_block(struct inode * inode) unsigned long minix_count_free_blocks(struct super_block *sb) { struct minix_sb_info *sbi = minix_sb(sb); - u32 bits = sbi->s_nzones - (sbi->s_firstdatazone + 1); + u32 bits = sbi->s_nzones - sbi->s_firstdatazone + 1; return (count_free(sbi->s_zmap, sb->s_blocksize, bits) << sbi->s_log_zone_size); diff --git a/fs/minix/inode.c b/fs/minix/inode.c index f007a3355570..3f57af196a7d 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -267,12 +267,12 @@ static int minix_fill_super(struct super_block *s, void *data, int silent) block = minix_blocks_needed(sbi->s_ninodes, s->s_blocksize); if (sbi->s_imap_blocks < block) { printk("MINIX-fs: file system does not have enough " - "imap blocks allocated. Refusing to mount\n"); + "imap blocks allocated. 
Refusing to mount.\n"); goto out_no_bitmap; } block = minix_blocks_needed( - (sbi->s_nzones - (sbi->s_firstdatazone + 1)), + (sbi->s_nzones - sbi->s_firstdatazone + 1), s->s_blocksize); if (sbi->s_zmap_blocks < block) { printk("MINIX-fs: file system does not have enough " diff --git a/fs/mount.h b/fs/mount.h index d55297f2fa05..6740a6215529 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -55,7 +55,7 @@ struct mount { int mnt_id; /* mount identifier */ int mnt_group_id; /* peer group identifier */ int mnt_expiry_mark; /* true if marked for expiry */ - int mnt_pinned; + struct hlist_head mnt_pins; struct path mnt_ex_mountpoint; }; diff --git a/fs/mpage.c b/fs/mpage.c index 5f9ed622274f..3e79220babac 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -28,6 +28,7 @@ #include <linux/backing-dev.h> #include <linux/pagevec.h> #include <linux/cleancache.h> +#include "internal.h" /* * I/O completion handler for multipage BIOs. @@ -57,6 +58,7 @@ static void mpage_end_io(struct bio *bio, int err) static struct bio *mpage_bio_submit(int rw, struct bio *bio) { bio->bi_end_io = mpage_end_io; + guard_bio_eod(rw, bio); submit_bio(rw, bio); return NULL; } diff --git a/fs/namei.c b/fs/namei.c index 8ae644c1150f..3ddb044f3702 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -34,6 +34,7 @@ #include <linux/device_cgroup.h> #include <linux/fs_struct.h> #include <linux/posix_acl.h> +#include <linux/hash.h> #include <asm/uaccess.h> #include "internal.h" @@ -643,24 +644,22 @@ static int complete_walk(struct nameidata *nd) static __always_inline void set_root(struct nameidata *nd) { - if (!nd->root.mnt) - get_fs_root(current->fs, &nd->root); + get_fs_root(current->fs, &nd->root); } static int link_path_walk(const char *, struct nameidata *); -static __always_inline void set_root_rcu(struct nameidata *nd) +static __always_inline unsigned set_root_rcu(struct nameidata *nd) { - if (!nd->root.mnt) { - struct fs_struct *fs = current->fs; - unsigned seq; + struct fs_struct *fs = current->fs; + unsigned seq, res; - do { - seq = read_seqcount_begin(&fs->seq); - nd->root = fs->root; - nd->seq = __read_seqcount_begin(&nd->root.dentry->d_seq); - } while (read_seqcount_retry(&fs->seq, seq)); - } + do { + seq = read_seqcount_begin(&fs->seq); + nd->root = fs->root; + res = __read_seqcount_begin(&nd->root.dentry->d_seq); + } while (read_seqcount_retry(&fs->seq, seq)); + return res; } static void path_put_conditional(struct path *path, struct nameidata *nd) @@ -860,7 +859,8 @@ follow_link(struct path *link, struct nameidata *nd, void **p) return PTR_ERR(s); } if (*s == '/') { - set_root(nd); + if (!nd->root.mnt) + set_root(nd); path_put(&nd->path); nd->path = nd->root; path_get(&nd->root); @@ -1091,10 +1091,10 @@ int follow_down_one(struct path *path) } EXPORT_SYMBOL(follow_down_one); -static inline bool managed_dentry_might_block(struct dentry *dentry) +static inline int managed_dentry_rcu(struct dentry *dentry) { - return (dentry->d_flags & DCACHE_MANAGE_TRANSIT && - dentry->d_op->d_manage(dentry, true) < 0); + return (dentry->d_flags & DCACHE_MANAGE_TRANSIT) ? + dentry->d_op->d_manage(dentry, true) : 0; } /* @@ -1110,11 +1110,18 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, * Don't forget we might have a non-mountpoint managed dentry * that wants to block transit. 
*/ - if (unlikely(managed_dentry_might_block(path->dentry))) + switch (managed_dentry_rcu(path->dentry)) { + case -ECHILD: + default: return false; + case -EISDIR: + return true; + case 0: + break; + } if (!d_mountpoint(path->dentry)) - return true; + return !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT); mounted = __lookup_mnt(path->mnt, path->dentry); if (!mounted) @@ -1130,12 +1137,15 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, */ *inode = path->dentry->d_inode; } - return read_seqretry(&mount_lock, nd->m_seq); + return !read_seqretry(&mount_lock, nd->m_seq) && + !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT); } static int follow_dotdot_rcu(struct nameidata *nd) { - set_root_rcu(nd); + struct inode *inode = nd->inode; + if (!nd->root.mnt) + set_root_rcu(nd); while (1) { if (nd->path.dentry == nd->root.dentry && @@ -1147,6 +1157,7 @@ static int follow_dotdot_rcu(struct nameidata *nd) struct dentry *parent = old->d_parent; unsigned seq; + inode = parent->d_inode; seq = read_seqcount_begin(&parent->d_seq); if (read_seqcount_retry(&old->d_seq, nd->seq)) goto failed; @@ -1156,6 +1167,7 @@ static int follow_dotdot_rcu(struct nameidata *nd) } if (!follow_up_rcu(&nd->path)) break; + inode = nd->path.dentry->d_inode; nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); } while (d_mountpoint(nd->path.dentry)) { @@ -1165,11 +1177,12 @@ static int follow_dotdot_rcu(struct nameidata *nd) break; nd->path.mnt = &mounted->mnt; nd->path.dentry = mounted->mnt.mnt_root; + inode = nd->path.dentry->d_inode; nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); - if (!read_seqretry(&mount_lock, nd->m_seq)) + if (read_seqretry(&mount_lock, nd->m_seq)) goto failed; } - nd->inode = nd->path.dentry->d_inode; + nd->inode = inode; return 0; failed: @@ -1248,7 +1261,8 @@ static void follow_mount(struct path *path) static void follow_dotdot(struct nameidata *nd) { - set_root(nd); + if (!nd->root.mnt) + set_root(nd); while(1) { struct dentry *old = nd->path.dentry; @@ -1402,11 +1416,8 @@ static int lookup_fast(struct nameidata *nd, } path->mnt = mnt; path->dentry = dentry; - if (unlikely(!__follow_mount_rcu(nd, path, inode))) - goto unlazy; - if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT)) - goto unlazy; - return 0; + if (likely(__follow_mount_rcu(nd, path, inode))) + return 0; unlazy: if (unlazy_walk(nd, dentry)) return -ECHILD; @@ -1629,8 +1640,7 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd) static inline unsigned int fold_hash(unsigned long hash) { - hash += hash >> (8*sizeof(int)); - return hash; + return hash_64(hash, 32); } #else /* 32-bit case */ @@ -1664,9 +1674,9 @@ EXPORT_SYMBOL(full_name_hash); /* * Calculate the length and hash of the path component, and - * return the length of the component; + * return the "hash_len" as the result. */ -static inline unsigned long hash_name(const char *name, unsigned int *hashp) +static inline u64 hash_name(const char *name) { unsigned long a, b, adata, bdata, mask, hash, len; const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; @@ -1686,9 +1696,8 @@ static inline unsigned long hash_name(const char *name, unsigned int *hashp) mask = create_zero_mask(adata | bdata); hash += a & zero_bytemask(mask); - *hashp = fold_hash(hash); - - return len + find_zero(mask); + len += find_zero(mask); + return hashlen_create(fold_hash(hash), len); } #else @@ -1706,7 +1715,7 @@ EXPORT_SYMBOL(full_name_hash); * We know there's a real path component here of at least * one character. 
*/ -static inline unsigned long hash_name(const char *name, unsigned int *hashp) +static inline u64 hash_name(const char *name) { unsigned long hash = init_name_hash(); unsigned long len = 0, c; @@ -1717,8 +1726,7 @@ static inline unsigned long hash_name(const char *name, unsigned int *hashp) hash = partial_name_hash(c, hash); c = (unsigned char)name[len]; } while (c && c != '/'); - *hashp = end_name_hash(hash); - return len; + return hashlen_create(end_name_hash(hash), len); } #endif @@ -1743,20 +1751,17 @@ static int link_path_walk(const char *name, struct nameidata *nd) /* At this point we know we have a real path component. */ for(;;) { - struct qstr this; - long len; + u64 hash_len; int type; err = may_lookup(nd); if (err) break; - len = hash_name(name, &this.hash); - this.name = name; - this.len = len; + hash_len = hash_name(name); type = LAST_NORM; - if (name[0] == '.') switch (len) { + if (name[0] == '.') switch (hashlen_len(hash_len)) { case 2: if (name[1] == '.') { type = LAST_DOTDOT; @@ -1770,29 +1775,32 @@ static int link_path_walk(const char *name, struct nameidata *nd) struct dentry *parent = nd->path.dentry; nd->flags &= ~LOOKUP_JUMPED; if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { + struct qstr this = { { .hash_len = hash_len }, .name = name }; err = parent->d_op->d_hash(parent, &this); if (err < 0) break; + hash_len = this.hash_len; + name = this.name; } } - nd->last = this; + nd->last.hash_len = hash_len; + nd->last.name = name; nd->last_type = type; - if (!name[len]) + name += hashlen_len(hash_len); + if (!*name) return 0; /* * If it wasn't NUL, we know it was '/'. Skip that * slash, and continue until no more slashes. */ do { - len++; - } while (unlikely(name[len] == '/')); - if (!name[len]) + name++; + } while (unlikely(*name == '/')); + if (!*name) return 0; - name += len; - err = walk_component(nd, &next, LOOKUP_FOLLOW); if (err < 0) return err; @@ -1847,7 +1855,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, if (*name=='/') { if (flags & LOOKUP_RCU) { rcu_read_lock(); - set_root_rcu(nd); + nd->seq = set_root_rcu(nd); } else { set_root(nd); path_get(&nd->root); @@ -1898,7 +1906,14 @@ static int path_init(int dfd, const char *name, unsigned int flags, } nd->inode = nd->path.dentry->d_inode; - return 0; + if (!(flags & LOOKUP_RCU)) + return 0; + if (likely(!read_seqcount_retry(&nd->path.dentry->d_seq, nd->seq))) + return 0; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; + rcu_read_unlock(); + return -ECHILD; } static inline int lookup_last(struct nameidata *nd, struct path *path) @@ -4019,7 +4034,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname * The worst of all namespace operations - renaming directory. "Perverted" * doesn't even start to describe it. Somebody in UCB had a heck of a trip... * Problems: - * a) we can get into loop creation. Check is done in is_subdir(). + * a) we can get into loop creation. * b) race potential - two innocent renames can create a loop together. * That's where 4.4 screws up. Current fix: serialization on * sb->s_vfs_rename_mutex. 
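
Both hash_name() variants now return the hash and the component length packed into a single u64. The hashlen_* helpers in include/linux/dcache.h put the length in the high 32 bits and the hash in the low 32 bits, so callers such as link_path_walk() carry (and can compare) both in one value. The round trip, as a sketch:

	u64 hash_len = hashlen_create(hash, len);	/* [ len | hash ] */

	unsigned int name_len  = hashlen_len(hash_len);		/* hash_len >> 32 */
	unsigned int name_hash = hashlen_hash(hash_len);	/* (u32)hash_len  */

This packing is also why nd->last.hash_len can be assigned directly from the hash_name() return value in the hunk above.
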
We might be more accurate, but that's another @@ -4075,7 +4090,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (error) return error; - if (!old_dir->i_op->rename) + if (!old_dir->i_op->rename && !old_dir->i_op->rename2) return -EPERM; if (flags && !old_dir->i_op->rename2) @@ -4134,10 +4149,11 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (error) goto out; } - if (!flags) { + if (!old_dir->i_op->rename2) { error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); } else { + WARN_ON(old_dir->i_op->rename != NULL); error = old_dir->i_op->rename2(old_dir, old_dentry, new_dir, new_dentry, flags); } diff --git a/fs/namespace.c b/fs/namespace.c index 182bc41cd887..ef42d9bee212 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -16,7 +16,6 @@ #include <linux/namei.h> #include <linux/security.h> #include <linux/idr.h> -#include <linux/acct.h> /* acct_auto_close_mnt */ #include <linux/init.h> /* init_rootfs */ #include <linux/fs_struct.h> /* get_fs_root et.al. */ #include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */ @@ -779,6 +778,20 @@ static void attach_mnt(struct mount *mnt, list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); } +static void attach_shadowed(struct mount *mnt, + struct mount *parent, + struct mount *shadows) +{ + if (shadows) { + hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash); + list_add(&mnt->mnt_child, &shadows->mnt_child); + } else { + hlist_add_head_rcu(&mnt->mnt_hash, + m_hash(&parent->mnt, mnt->mnt_mountpoint)); + list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); + } +} + /* * vfsmount lock must be held for write */ @@ -797,12 +810,7 @@ static void commit_tree(struct mount *mnt, struct mount *shadows) list_splice(&head, n->list.prev); - if (shadows) - hlist_add_after_rcu(&shadows->mnt_hash, &mnt->mnt_hash); - else - hlist_add_head_rcu(&mnt->mnt_hash, - m_hash(&parent->mnt, mnt->mnt_mountpoint)); - list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); + attach_shadowed(mnt, parent, shadows); touch_mnt_namespace(n); } @@ -890,8 +898,21 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED); /* Don't allow unprivileged users to change mount flags */ - if ((flag & CL_UNPRIVILEGED) && (mnt->mnt.mnt_flags & MNT_READONLY)) - mnt->mnt.mnt_flags |= MNT_LOCK_READONLY; + if (flag & CL_UNPRIVILEGED) { + mnt->mnt.mnt_flags |= MNT_LOCK_ATIME; + + if (mnt->mnt.mnt_flags & MNT_READONLY) + mnt->mnt.mnt_flags |= MNT_LOCK_READONLY; + + if (mnt->mnt.mnt_flags & MNT_NODEV) + mnt->mnt.mnt_flags |= MNT_LOCK_NODEV; + + if (mnt->mnt.mnt_flags & MNT_NOSUID) + mnt->mnt.mnt_flags |= MNT_LOCK_NOSUID; + + if (mnt->mnt.mnt_flags & MNT_NOEXEC) + mnt->mnt.mnt_flags |= MNT_LOCK_NOEXEC; + } /* Don't allow unprivileged users to reveal what is under a mount */ if ((flag & CL_UNPRIVILEGED) && list_empty(&old->mnt_expire)) @@ -938,7 +959,6 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, static void mntput_no_expire(struct mount *mnt) { -put_again: rcu_read_lock(); mnt_add_count(mnt, -1); if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */ @@ -951,14 +971,6 @@ put_again: unlock_mount_hash(); return; } - if (unlikely(mnt->mnt_pinned)) { - mnt_add_count(mnt, mnt->mnt_pinned + 1); - mnt->mnt_pinned = 0; - rcu_read_unlock(); - unlock_mount_hash(); - acct_auto_close_mnt(&mnt->mnt); - goto put_again; - } if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) { rcu_read_unlock(); unlock_mount_hash(); @@ -981,6 +993,8 @@ 
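
After this hunk a filesystem provides either ->rename or ->rename2, never both (hence the WARN_ON): once ->rename2 exists it is called even for flags == 0. A hypothetical ->rename2 showing the expected shape (example_plain_rename() stands in for the filesystem's real rename logic):

	static int example_rename2(struct inode *old_dir, struct dentry *old_dentry,
				   struct inode *new_dir, struct dentry *new_dentry,
				   unsigned int flags)
	{
		if (flags & ~RENAME_NOREPLACE)
			return -EINVAL;		/* reject unsupported flags */

		/* For RENAME_NOREPLACE the VFS has already verified that
		 * the target does not exist, so the plain path suffices. */
		return example_plain_rename(old_dir, old_dentry,
					    new_dir, new_dentry);
	}
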
put_again: * so mnt_get_writers() below is safe. */ WARN_ON(mnt_get_writers(mnt)); + if (unlikely(mnt->mnt_pins.first)) + mnt_pin_kill(mnt); fsnotify_vfsmount_delete(&mnt->mnt); dput(mnt->mnt.mnt_root); deactivate_super(mnt->mnt.mnt_sb); @@ -1008,25 +1022,15 @@ struct vfsmount *mntget(struct vfsmount *mnt) } EXPORT_SYMBOL(mntget); -void mnt_pin(struct vfsmount *mnt) -{ - lock_mount_hash(); - real_mount(mnt)->mnt_pinned++; - unlock_mount_hash(); -} -EXPORT_SYMBOL(mnt_pin); - -void mnt_unpin(struct vfsmount *m) +struct vfsmount *mnt_clone_internal(struct path *path) { - struct mount *mnt = real_mount(m); - lock_mount_hash(); - if (mnt->mnt_pinned) { - mnt_add_count(mnt, 1); - mnt->mnt_pinned--; - } - unlock_mount_hash(); + struct mount *p; + p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE); + if (IS_ERR(p)) + return ERR_CAST(p); + p->mnt.mnt_flags |= MNT_INTERNAL; + return &p->mnt; } -EXPORT_SYMBOL(mnt_unpin); static inline void mangle(struct seq_file *m, const char *s) { @@ -1213,6 +1217,11 @@ static void namespace_unlock(void) head.first->pprev = &head.first; INIT_HLIST_HEAD(&unmounted); + /* undo decrements we'd done in umount_tree() */ + hlist_for_each_entry(mnt, &head, mnt_hash) + if (mnt->mnt_ex_mountpoint.mnt) + mntget(mnt->mnt_ex_mountpoint.mnt); + up_write(&namespace_sem); synchronize_rcu(); @@ -1249,6 +1258,9 @@ void umount_tree(struct mount *mnt, int how) hlist_add_head(&p->mnt_hash, &tmp_list); } + hlist_for_each_entry(p, &tmp_list, mnt_hash) + list_del_init(&p->mnt_child); + if (how) propagate_umount(&tmp_list); @@ -1259,9 +1271,9 @@ void umount_tree(struct mount *mnt, int how) p->mnt_ns = NULL; if (how < 2) p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; - list_del_init(&p->mnt_child); if (mnt_has_parent(p)) { put_mountpoint(p->mnt_mp); + mnt_add_count(p->mnt_parent, -1); /* move the reference to mountpoint into ->mnt_ex_mountpoint */ p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint; p->mnt_ex_mountpoint.mnt = &p->mnt_parent->mnt; @@ -1492,6 +1504,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, continue; for (s = r; s; s = next_mnt(s, r)) { + struct mount *t = NULL; if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(s)) { s = skip_mnt_tree(s); @@ -1513,7 +1526,14 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, goto out; lock_mount_hash(); list_add_tail(&q->mnt_list, &res->mnt_list); - attach_mnt(q, parent, p->mnt_mp); + mnt_set_mountpoint(parent, p->mnt_mp, q); + if (!list_empty(&parent->mnt_mounts)) { + t = list_last_entry(&parent->mnt_mounts, + struct mount, mnt_child); + if (t->mnt_mp != p->mnt_mp) + t = NULL; + } + attach_shadowed(q, parent, t); unlock_mount_hash(); } } @@ -1896,9 +1916,6 @@ static int change_mount_flags(struct vfsmount *mnt, int ms_flags) if (readonly_request == __mnt_is_readonly(mnt)) return 0; - if (mnt->mnt_flags & MNT_LOCK_READONLY) - return -EPERM; - if (readonly_request) error = mnt_make_readonly(real_mount(mnt)); else @@ -1924,6 +1941,33 @@ static int do_remount(struct path *path, int flags, int mnt_flags, if (path->dentry != path->mnt->mnt_root) return -EINVAL; + /* Don't allow changing of locked mnt flags. + * + * No locks need to be held here while testing the various + * MNT_LOCK flags because those flags can never be cleared + * once they are set. 
+ */ + if ((mnt->mnt.mnt_flags & MNT_LOCK_READONLY) && + !(mnt_flags & MNT_READONLY)) { + return -EPERM; + } + if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) && + !(mnt_flags & MNT_NODEV)) { + return -EPERM; + } + if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) && + !(mnt_flags & MNT_NOSUID)) { + return -EPERM; + } + if ((mnt->mnt.mnt_flags & MNT_LOCK_NOEXEC) && + !(mnt_flags & MNT_NOEXEC)) { + return -EPERM; + } + if ((mnt->mnt.mnt_flags & MNT_LOCK_ATIME) && + ((mnt->mnt.mnt_flags & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK))) { + return -EPERM; + } + err = security_sb_remount(sb, data); if (err) return err; @@ -1937,7 +1981,7 @@ static int do_remount(struct path *path, int flags, int mnt_flags, err = do_remount_sb(sb, flags, data, 0); if (!err) { lock_mount_hash(); - mnt_flags |= mnt->mnt.mnt_flags & MNT_PROPAGATION_MASK; + mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK; mnt->mnt.mnt_flags = mnt_flags; touch_mnt_namespace(mnt->mnt_ns); unlock_mount_hash(); @@ -2122,7 +2166,7 @@ static int do_new_mount(struct path *path, const char *fstype, int flags, */ if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) { flags |= MS_NODEV; - mnt_flags |= MNT_NODEV; + mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV; } } @@ -2436,6 +2480,14 @@ long do_mount(const char *dev_name, const char *dir_name, if (flags & MS_RDONLY) mnt_flags |= MNT_READONLY; + /* The default atime for remount is preservation */ + if ((flags & MS_REMOUNT) && + ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME | + MS_STRICTATIME)) == 0)) { + mnt_flags &= ~MNT_ATIME_MASK; + mnt_flags |= path.mnt->mnt_flags & MNT_ATIME_MASK; + } + flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN | MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | MS_STRICTATIME); @@ -2972,13 +3024,13 @@ static void *mntns_get(struct task_struct *task) struct mnt_namespace *ns = NULL; struct nsproxy *nsproxy; - rcu_read_lock(); - nsproxy = task_nsproxy(task); + task_lock(task); + nsproxy = task->nsproxy; if (nsproxy) { ns = nsproxy->mnt_ns; get_mnt_ns(ns); } - rcu_read_unlock(); + task_unlock(task); return ns; } diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile index d5815505c020..3ca14c36d08b 100644 --- a/fs/nfs/blocklayout/Makefile +++ b/fs/nfs/blocklayout/Makefile @@ -2,4 +2,5 @@ # Makefile for the pNFS block layout driver kernel module # obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o -blocklayoutdriver-objs := blocklayout.o extents.o blocklayoutdev.o blocklayoutdm.o + +blocklayoutdriver-y += blocklayout.o dev.o extent_tree.o rpc_pipefs.o diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 9b431f44fad9..5228f201d3d5 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -35,7 +35,6 @@ #include <linux/mount.h> #include <linux/namei.h> #include <linux/bio.h> /* struct bio */ -#include <linux/buffer_head.h> /* various write calls */ #include <linux/prefetch.h> #include <linux/pagevec.h> @@ -50,40 +49,16 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>"); MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver"); -static void print_page(struct page *page) +static bool is_hole(struct pnfs_block_extent *be) { - dprintk("PRINTPAGE page %p\n", page); - dprintk(" PagePrivate %d\n", PagePrivate(page)); - dprintk(" PageUptodate %d\n", PageUptodate(page)); - dprintk(" PageError %d\n", PageError(page)); - dprintk(" PageDirty %d\n", PageDirty(page)); - dprintk(" PageReferenced %d\n", PageReferenced(page)); - dprintk(" PageLocked %d\n", 
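
The do_mount() hunk changes remount defaults: a remount request that passes none of the atime flags now inherits the mount's current atime mode instead of resetting it to the kernel default. From user space (plain mount(2); only the helper name is invented):

	#include <sys/mount.h>

	static int remount_ro(const char *target)
	{
		/* No MS_*ATIME flag given: the kernel keeps whatever atime
		 * mode (noatime, relatime, strictatime, ...) is already set. */
		return mount(NULL, target, NULL, MS_REMOUNT | MS_RDONLY, NULL);
	}

Together with the MNT_LOCK_* checks above, this is what lets an ordinary remount succeed without accidentally trying to change a locked atime setting and being refused with -EPERM.
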
PageLocked(page)); - dprintk(" PageWriteback %d\n", PageWriteback(page)); - dprintk(" PageMappedToDisk %d\n", PageMappedToDisk(page)); - dprintk("\n"); -} - -/* Given the be associated with isect, determine if page data needs to be - * initialized. - */ -static int is_hole(struct pnfs_block_extent *be, sector_t isect) -{ - if (be->be_state == PNFS_BLOCK_NONE_DATA) - return 1; - else if (be->be_state != PNFS_BLOCK_INVALID_DATA) - return 0; - else - return !bl_is_sector_init(be->be_inval, isect); -} - -/* Given the be associated with isect, determine if page data can be - * written to disk. - */ -static int is_writable(struct pnfs_block_extent *be, sector_t isect) -{ - return (be->be_state == PNFS_BLOCK_READWRITE_DATA || - be->be_state == PNFS_BLOCK_INVALID_DATA); + switch (be->be_state) { + case PNFS_BLOCK_NONE_DATA: + return true; + case PNFS_BLOCK_INVALID_DATA: + return be->be_tag ? false : true; + default: + return false; + } } /* The data we are handed might be spread across several bios. We need @@ -91,9 +66,8 @@ static int is_writable(struct pnfs_block_extent *be, sector_t isect) */ struct parallel_io { struct kref refcnt; - void (*pnfs_callback) (void *data, int num_se); + void (*pnfs_callback) (void *data); void *data; - int bse_count; }; static inline struct parallel_io *alloc_parallel(void *data) @@ -104,7 +78,6 @@ static inline struct parallel_io *alloc_parallel(void *data) if (rv) { rv->data = data; kref_init(&rv->refcnt); - rv->bse_count = 0; } return rv; } @@ -119,7 +92,7 @@ static void destroy_parallel(struct kref *kref) struct parallel_io *p = container_of(kref, struct parallel_io, refcnt); dprintk("%s enter\n", __func__); - p->pnfs_callback(p->data, p->bse_count); + p->pnfs_callback(p->data); kfree(p); } @@ -141,10 +114,9 @@ bl_submit_bio(int rw, struct bio *bio) return NULL; } -static struct bio *bl_alloc_init_bio(int npg, sector_t isect, - struct pnfs_block_extent *be, - void (*end_io)(struct bio *, int err), - struct parallel_io *par) +static struct bio * +bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector, + void (*end_io)(struct bio *, int err), struct parallel_io *par) { struct bio *bio; @@ -156,67 +128,73 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect, } if (bio) { - bio->bi_iter.bi_sector = isect - be->be_f_offset + - be->be_v_offset; - bio->bi_bdev = be->be_mdev; + bio->bi_iter.bi_sector = disk_sector; + bio->bi_bdev = bdev; bio->bi_end_io = end_io; bio->bi_private = par; } return bio; } -static struct bio *do_add_page_to_bio(struct bio *bio, int npg, int rw, - sector_t isect, struct page *page, - struct pnfs_block_extent *be, - void (*end_io)(struct bio *, int err), - struct parallel_io *par, - unsigned int offset, int len) +static struct bio * +do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect, + struct page *page, struct pnfs_block_dev_map *map, + struct pnfs_block_extent *be, + void (*end_io)(struct bio *, int err), + struct parallel_io *par, unsigned int offset, int *len) { - isect = isect + (offset >> SECTOR_SHIFT); + struct pnfs_block_dev *dev = + container_of(be->be_device, struct pnfs_block_dev, node); + u64 disk_addr, end; + dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__, - npg, rw, (unsigned long long)isect, offset, len); + npg, rw, (unsigned long long)isect, offset, *len); + + /* translate to device offset */ + isect += be->be_v_offset; + isect -= be->be_f_offset; + + /* translate to physical disk offset */ + disk_addr = (u64)isect << SECTOR_SHIFT; + if (disk_addr < 
map->start || disk_addr >= map->start + map->len) { + if (!dev->map(dev, disk_addr, map)) + return ERR_PTR(-EIO); + bio = bl_submit_bio(rw, bio); + } + disk_addr += map->disk_offset; + disk_addr -= map->start; + + /* limit length to what the device mapping allows */ + end = disk_addr + *len; + if (end >= map->start + map->len) + *len = map->start + map->len - disk_addr; + retry: if (!bio) { - bio = bl_alloc_init_bio(npg, isect, be, end_io, par); + bio = bl_alloc_init_bio(npg, map->bdev, + disk_addr >> SECTOR_SHIFT, end_io, par); if (!bio) return ERR_PTR(-ENOMEM); } - if (bio_add_page(bio, page, len, offset) < len) { + if (bio_add_page(bio, page, *len, offset) < *len) { bio = bl_submit_bio(rw, bio); goto retry; } return bio; } -static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw, - sector_t isect, struct page *page, - struct pnfs_block_extent *be, - void (*end_io)(struct bio *, int err), - struct parallel_io *par) -{ - return do_add_page_to_bio(bio, npg, rw, isect, page, be, - end_io, par, 0, PAGE_CACHE_SIZE); -} - -/* This is basically copied from mpage_end_io_read */ static void bl_end_io_read(struct bio *bio, int err) { struct parallel_io *par = bio->bi_private; - struct bio_vec *bvec; - int i; - - if (!err) - bio_for_each_segment_all(bvec, bio, i) - SetPageUptodate(bvec->bv_page); if (err) { - struct nfs_pgio_data *rdata = par->data; - struct nfs_pgio_header *header = rdata->header; + struct nfs_pgio_header *header = par->data; if (!header->pnfs_error) header->pnfs_error = -EIO; pnfs_set_lo_fail(header->lseg); } + bio_put(bio); put_parallel(par); } @@ -224,104 +202,96 @@ static void bl_end_io_read(struct bio *bio, int err) static void bl_read_cleanup(struct work_struct *work) { struct rpc_task *task; - struct nfs_pgio_data *rdata; + struct nfs_pgio_header *hdr; dprintk("%s enter\n", __func__); task = container_of(work, struct rpc_task, u.tk_work); - rdata = container_of(task, struct nfs_pgio_data, task); - pnfs_ld_read_done(rdata); + hdr = container_of(task, struct nfs_pgio_header, task); + pnfs_ld_read_done(hdr); } static void -bl_end_par_io_read(void *data, int unused) +bl_end_par_io_read(void *data) { - struct nfs_pgio_data *rdata = data; + struct nfs_pgio_header *hdr = data; - rdata->task.tk_status = rdata->header->pnfs_error; - INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup); - schedule_work(&rdata->task.u.tk_work); + hdr->task.tk_status = hdr->pnfs_error; + INIT_WORK(&hdr->task.u.tk_work, bl_read_cleanup); + schedule_work(&hdr->task.u.tk_work); } static enum pnfs_try_status -bl_read_pagelist(struct nfs_pgio_data *rdata) +bl_read_pagelist(struct nfs_pgio_header *header) { - struct nfs_pgio_header *header = rdata->header; - int i, hole; + struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg); + struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 }; struct bio *bio = NULL; - struct pnfs_block_extent *be = NULL, *cow_read = NULL; + struct pnfs_block_extent be; sector_t isect, extent_length = 0; struct parallel_io *par; - loff_t f_offset = rdata->args.offset; - size_t bytes_left = rdata->args.count; + loff_t f_offset = header->args.offset; + size_t bytes_left = header->args.count; unsigned int pg_offset, pg_len; - struct page **pages = rdata->args.pages; - int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT; + struct page **pages = header->args.pages; + int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT; const bool is_dio = (header->dreq != NULL); + struct blk_plug plug; + int i; dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__, - 
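
do_add_page_to_bio() now translates addresses in two steps: extent (file sector to volume sector), then per-device map (volume byte address to physical disk address), submitting any pending bio whenever the mapping window changes. With illustrative numbers:

	sector_t isect = 0x1234;	/* file offset in 512-byte sectors */

	/* step 1: extent translation, be_v_offset = 0x9000, be_f_offset = 0x1000 */
	isect += 0x9000;
	isect -= 0x1000;		/* volume sector 0x9234 */

	/* step 2: byte address for the device map lookup */
	u64 disk_addr = (u64)isect << SECTOR_SHIFT;	/* 0x9234 * 512 = 0x1246800 */

If disk_addr falls outside [map->start, map->start + map->len), the device's ->map() callback is invoked to refresh the window before the page is added, and *len is clamped so the I/O never crosses the end of the mapping.
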
rdata->pages.npages, f_offset, (unsigned int)rdata->args.count); + header->page_array.npages, f_offset, + (unsigned int)header->args.count); - par = alloc_parallel(rdata); + par = alloc_parallel(header); if (!par) - goto use_mds; + return PNFS_NOT_ATTEMPTED; par->pnfs_callback = bl_end_par_io_read; - /* At this point, we can no longer jump to use_mds */ + + blk_start_plug(&plug); isect = (sector_t) (f_offset >> SECTOR_SHIFT); /* Code assumes extents are page-aligned */ - for (i = pg_index; i < rdata->pages.npages; i++) { - if (!extent_length) { + for (i = pg_index; i < header->page_array.npages; i++) { + if (extent_length <= 0) { /* We've used up the previous extent */ - bl_put_extent(be); - bl_put_extent(cow_read); bio = bl_submit_bio(READ, bio); + /* Get the next one */ - be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg), - isect, &cow_read); - if (!be) { + if (!ext_tree_lookup(bl, isect, &be, false)) { header->pnfs_error = -EIO; goto out; } - extent_length = be->be_length - - (isect - be->be_f_offset); - if (cow_read) { - sector_t cow_length = cow_read->be_length - - (isect - cow_read->be_f_offset); - extent_length = min(extent_length, cow_length); - } + extent_length = be.be_length - (isect - be.be_f_offset); } + pg_offset = f_offset & ~PAGE_CACHE_MASK; if (is_dio) { - pg_offset = f_offset & ~PAGE_CACHE_MASK; if (pg_offset + bytes_left > PAGE_CACHE_SIZE) pg_len = PAGE_CACHE_SIZE - pg_offset; else pg_len = bytes_left; - - f_offset += pg_len; - bytes_left -= pg_len; - isect += (pg_offset >> SECTOR_SHIFT); } else { - pg_offset = 0; + BUG_ON(pg_offset != 0); pg_len = PAGE_CACHE_SIZE; } - hole = is_hole(be, isect); - if (hole && !cow_read) { + isect += (pg_offset >> SECTOR_SHIFT); + extent_length -= (pg_offset >> SECTOR_SHIFT); + + if (is_hole(&be)) { bio = bl_submit_bio(READ, bio); /* Fill hole w/ zeroes w/o accessing device */ dprintk("%s Zeroing page for hole\n", __func__); zero_user_segment(pages[i], pg_offset, pg_len); - print_page(pages[i]); - SetPageUptodate(pages[i]); - } else { - struct pnfs_block_extent *be_read; - be_read = (hole && cow_read) ? 
cow_read : be; - bio = do_add_page_to_bio(bio, rdata->pages.npages - i, + /* invalidate map */ + map.start = NFS4_MAX_UINT64; + } else { + bio = do_add_page_to_bio(bio, + header->page_array.npages - i, READ, - isect, pages[i], be_read, + isect, pages[i], &map, &be, bl_end_io_read, par, - pg_offset, pg_len); + pg_offset, &pg_len); if (IS_ERR(bio)) { header->pnfs_error = PTR_ERR(bio); bio = NULL; @@ -329,84 +299,28 @@ bl_read_pagelist(struct nfs_pgio_data *rdata) } } isect += (pg_len >> SECTOR_SHIFT); - extent_length -= PAGE_CACHE_SECTORS; + extent_length -= (pg_len >> SECTOR_SHIFT); + f_offset += pg_len; + bytes_left -= pg_len; } if ((isect << SECTOR_SHIFT) >= header->inode->i_size) { - rdata->res.eof = 1; - rdata->res.count = header->inode->i_size - rdata->args.offset; + header->res.eof = 1; + header->res.count = header->inode->i_size - header->args.offset; } else { - rdata->res.count = (isect << SECTOR_SHIFT) - rdata->args.offset; + header->res.count = (isect << SECTOR_SHIFT) - header->args.offset; } out: - bl_put_extent(be); - bl_put_extent(cow_read); bl_submit_bio(READ, bio); + blk_finish_plug(&plug); put_parallel(par); return PNFS_ATTEMPTED; - - use_mds: - dprintk("Giving up and using normal NFS\n"); - return PNFS_NOT_ATTEMPTED; -} - -static void mark_extents_written(struct pnfs_block_layout *bl, - __u64 offset, __u32 count) -{ - sector_t isect, end; - struct pnfs_block_extent *be; - struct pnfs_block_short_extent *se; - - dprintk("%s(%llu, %u)\n", __func__, offset, count); - if (count == 0) - return; - isect = (offset & (long)(PAGE_CACHE_MASK)) >> SECTOR_SHIFT; - end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK); - end >>= SECTOR_SHIFT; - while (isect < end) { - sector_t len; - be = bl_find_get_extent(bl, isect, NULL); - BUG_ON(!be); /* FIXME */ - len = min(end, be->be_f_offset + be->be_length) - isect; - if (be->be_state == PNFS_BLOCK_INVALID_DATA) { - se = bl_pop_one_short_extent(be->be_inval); - BUG_ON(!se); - bl_mark_for_commit(be, isect, len, se); - } - isect += len; - bl_put_extent(be); - } -} - -static void bl_end_io_write_zero(struct bio *bio, int err) -{ - struct parallel_io *par = bio->bi_private; - struct bio_vec *bvec; - int i; - - bio_for_each_segment_all(bvec, bio, i) { - /* This is the zeroing page we added */ - end_page_writeback(bvec->bv_page); - page_cache_release(bvec->bv_page); - } - - if (unlikely(err)) { - struct nfs_pgio_data *data = par->data; - struct nfs_pgio_header *header = data->header; - - if (!header->pnfs_error) - header->pnfs_error = -EIO; - pnfs_set_lo_fail(header->lseg); - } - bio_put(bio); - put_parallel(par); } static void bl_end_io_write(struct bio *bio, int err) { struct parallel_io *par = bio->bi_private; const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct nfs_pgio_data *data = par->data; - struct nfs_pgio_header *header = data->header; + struct nfs_pgio_header *header = par->data; if (!uptodate) { if (!header->pnfs_error) @@ -422,533 +336,118 @@ static void bl_end_io_write(struct bio *bio, int err) */ static void bl_write_cleanup(struct work_struct *work) { - struct rpc_task *task; - struct nfs_pgio_data *wdata; - dprintk("%s enter\n", __func__); - task = container_of(work, struct rpc_task, u.tk_work); - wdata = container_of(task, struct nfs_pgio_data, task); - if (likely(!wdata->header->pnfs_error)) { - /* Marks for LAYOUTCOMMIT */ - mark_extents_written(BLK_LSEG2EXT(wdata->header->lseg), - wdata->args.offset, wdata->args.count); - } - pnfs_ld_write_done(wdata); -} - -/* Called when last of bios 
associated with a bl_write_pagelist call finishes */ -static void bl_end_par_io_write(void *data, int num_se) -{ - struct nfs_pgio_data *wdata = data; - - if (unlikely(wdata->header->pnfs_error)) { - bl_free_short_extents(&BLK_LSEG2EXT(wdata->header->lseg)->bl_inval, - num_se); - } - - wdata->task.tk_status = wdata->header->pnfs_error; - wdata->verf.committed = NFS_FILE_SYNC; - INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup); - schedule_work(&wdata->task.u.tk_work); -} - -/* FIXME STUB - mark intersection of layout and page as bad, so is not - * used again. - */ -static void mark_bad_read(void) -{ - return; -} - -/* - * map_block: map a requested I/0 block (isect) into an offset in the LVM - * block_device - */ -static void -map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be) -{ - dprintk("%s enter be=%p\n", __func__, be); - - set_buffer_mapped(bh); - bh->b_bdev = be->be_mdev; - bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >> - (be->be_mdev->bd_inode->i_blkbits - SECTOR_SHIFT); - - dprintk("%s isect %llu, bh->b_blocknr %ld, using bsize %Zd\n", - __func__, (unsigned long long)isect, (long)bh->b_blocknr, - bh->b_size); - return; -} - -static void -bl_read_single_end_io(struct bio *bio, int error) -{ - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - struct page *page = bvec->bv_page; - - /* Only one page in bvec */ - unlock_page(page); -} - -static int -bl_do_readpage_sync(struct page *page, struct pnfs_block_extent *be, - unsigned int offset, unsigned int len) -{ - struct bio *bio; - struct page *shadow_page; - sector_t isect; - char *kaddr, *kshadow_addr; - int ret = 0; + struct rpc_task *task = container_of(work, struct rpc_task, u.tk_work); + struct nfs_pgio_header *hdr = + container_of(task, struct nfs_pgio_header, task); - dprintk("%s: offset %u len %u\n", __func__, offset, len); - - shadow_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); - if (shadow_page == NULL) - return -ENOMEM; - - bio = bio_alloc(GFP_NOIO, 1); - if (bio == NULL) - return -ENOMEM; - - isect = (page->index << PAGE_CACHE_SECTOR_SHIFT) + - (offset / SECTOR_SIZE); - - bio->bi_iter.bi_sector = isect - be->be_f_offset + be->be_v_offset; - bio->bi_bdev = be->be_mdev; - bio->bi_end_io = bl_read_single_end_io; - - lock_page(shadow_page); - if (bio_add_page(bio, shadow_page, - SECTOR_SIZE, round_down(offset, SECTOR_SIZE)) == 0) { - unlock_page(shadow_page); - bio_put(bio); - return -EIO; - } - - submit_bio(READ, bio); - wait_on_page_locked(shadow_page); - if (unlikely(!test_bit(BIO_UPTODATE, &bio->bi_flags))) { - ret = -EIO; - } else { - kaddr = kmap_atomic(page); - kshadow_addr = kmap_atomic(shadow_page); - memcpy(kaddr + offset, kshadow_addr + offset, len); - kunmap_atomic(kshadow_addr); - kunmap_atomic(kaddr); - } - __free_page(shadow_page); - bio_put(bio); - - return ret; -} - -static int -bl_read_partial_page_sync(struct page *page, struct pnfs_block_extent *be, - unsigned int dirty_offset, unsigned int dirty_len, - bool full_page) -{ - int ret = 0; - unsigned int start, end; + dprintk("%s enter\n", __func__); - if (full_page) { - start = 0; - end = PAGE_CACHE_SIZE; - } else { - start = round_down(dirty_offset, SECTOR_SIZE); - end = round_up(dirty_offset + dirty_len, SECTOR_SIZE); - } + if (likely(!hdr->pnfs_error)) { + struct pnfs_block_layout *bl = BLK_LSEG2EXT(hdr->lseg); + u64 start = hdr->args.offset & (loff_t)PAGE_CACHE_MASK; + u64 end = (hdr->args.offset + hdr->args.count + + PAGE_CACHE_SIZE - 1) & (loff_t)PAGE_CACHE_MASK; - dprintk("%s: offset %u len %d\n", 
__func__, dirty_offset, dirty_len); - if (!be) { - zero_user_segments(page, start, dirty_offset, - dirty_offset + dirty_len, end); - if (start == 0 && end == PAGE_CACHE_SIZE && - trylock_page(page)) { - SetPageUptodate(page); - unlock_page(page); - } - return ret; + ext_tree_mark_written(bl, start >> SECTOR_SHIFT, + (end - start) >> SECTOR_SHIFT); } - if (start != dirty_offset) - ret = bl_do_readpage_sync(page, be, start, dirty_offset - start); - - if (!ret && (dirty_offset + dirty_len < end)) - ret = bl_do_readpage_sync(page, be, dirty_offset + dirty_len, - end - dirty_offset - dirty_len); - - return ret; + pnfs_ld_write_done(hdr); } -/* Given an unmapped page, zero it or read in page for COW, page is locked - * by caller. - */ -static int -init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read) +/* Called when last of bios associated with a bl_write_pagelist call finishes */ +static void bl_end_par_io_write(void *data) { - struct buffer_head *bh = NULL; - int ret = 0; - sector_t isect; - - dprintk("%s enter, %p\n", __func__, page); - BUG_ON(PageUptodate(page)); - if (!cow_read) { - zero_user_segment(page, 0, PAGE_SIZE); - SetPageUptodate(page); - goto cleanup; - } - - bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0); - if (!bh) { - ret = -ENOMEM; - goto cleanup; - } + struct nfs_pgio_header *hdr = data; - isect = (sector_t) page->index << PAGE_CACHE_SECTOR_SHIFT; - map_block(bh, isect, cow_read); - if (!bh_uptodate_or_lock(bh)) - ret = bh_submit_read(bh); - if (ret) - goto cleanup; - SetPageUptodate(page); - -cleanup: - if (bh) - free_buffer_head(bh); - if (ret) { - /* Need to mark layout with bad read...should now - * just use nfs4 for reads and writes. - */ - mark_bad_read(); - } - return ret; -} - -/* Find or create a zeroing page marked being writeback. - * Return ERR_PTR on error, NULL to indicate skip this page and page itself - * to indicate write out. 
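
The new bl_write_cleanup() rounds the written byte range out to whole pages and hands it to the extent tree in 512-byte sectors. A quick arithmetic check with illustrative values (offset = 5000, count = 3000, 4 KiB pages):

	u64 start = 5000 & ~4095;			/* 4096 */
	u64 end   = (5000 + 3000 + 4095) & ~4095;	/* 8192 */
	/* ext_tree_mark_written(bl, 4096 >> 9, (8192 - 4096) >> 9)
	 *	-> start sector 8, length 8 sectors = exactly one page */
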
- */ -static struct page * -bl_find_get_zeroing_page(struct inode *inode, pgoff_t index, - struct pnfs_block_extent *cow_read) -{ - struct page *page; - int locked = 0; - page = find_get_page(inode->i_mapping, index); - if (page) - goto check_page; - - page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); - if (unlikely(!page)) { - dprintk("%s oom\n", __func__); - return ERR_PTR(-ENOMEM); - } - locked = 1; - -check_page: - /* PageDirty: Other will write this out - * PageWriteback: Other is writing this out - * PageUptodate: It was read before - */ - if (PageDirty(page) || PageWriteback(page)) { - print_page(page); - if (locked) - unlock_page(page); - page_cache_release(page); - return NULL; - } - - if (!locked) { - lock_page(page); - locked = 1; - goto check_page; - } - if (!PageUptodate(page)) { - /* New page, readin or zero it */ - init_page_for_write(page, cow_read); - } - set_page_writeback(page); - unlock_page(page); - - return page; + hdr->task.tk_status = hdr->pnfs_error; + hdr->verf.committed = NFS_FILE_SYNC; + INIT_WORK(&hdr->task.u.tk_work, bl_write_cleanup); + schedule_work(&hdr->task.u.tk_work); } static enum pnfs_try_status -bl_write_pagelist(struct nfs_pgio_data *wdata, int sync) +bl_write_pagelist(struct nfs_pgio_header *header, int sync) { - struct nfs_pgio_header *header = wdata->header; - int i, ret, npg_zero, pg_index, last = 0; + struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg); + struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 }; struct bio *bio = NULL; - struct pnfs_block_extent *be = NULL, *cow_read = NULL; - sector_t isect, last_isect = 0, extent_length = 0; + struct pnfs_block_extent be; + sector_t isect, extent_length = 0; struct parallel_io *par = NULL; - loff_t offset = wdata->args.offset; - size_t count = wdata->args.count; - unsigned int pg_offset, pg_len, saved_len; - struct page **pages = wdata->args.pages; - struct page *page; - pgoff_t index; - u64 temp; - int npg_per_block = - NFS_SERVER(header->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT; + loff_t offset = header->args.offset; + size_t count = header->args.count; + struct page **pages = header->args.pages; + int pg_index = pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT; + unsigned int pg_len; + struct blk_plug plug; + int i; dprintk("%s enter, %Zu@%lld\n", __func__, count, offset); - if (header->dreq != NULL && - (!IS_ALIGNED(offset, NFS_SERVER(header->inode)->pnfs_blksize) || - !IS_ALIGNED(count, NFS_SERVER(header->inode)->pnfs_blksize))) { - dprintk("pnfsblock nonblock aligned DIO writes. Resend MDS\n"); - goto out_mds; - } - /* At this point, wdata->pages is a (sequential) list of nfs_pages. + /* At this point, header->page_aray is a (sequential) list of nfs_pages. * We want to write each, and if there is an error set pnfs_error * to have it redone using nfs. 
*/ - par = alloc_parallel(wdata); + par = alloc_parallel(header); if (!par) - goto out_mds; + return PNFS_NOT_ATTEMPTED; par->pnfs_callback = bl_end_par_io_write; - /* At this point, have to be more careful with error handling */ - isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT); - be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg), isect, &cow_read); - if (!be || !is_writable(be, isect)) { - dprintk("%s no matching extents!\n", __func__); - goto out_mds; - } + blk_start_plug(&plug); - /* First page inside INVALID extent */ - if (be->be_state == PNFS_BLOCK_INVALID_DATA) { - if (likely(!bl_push_one_short_extent(be->be_inval))) - par->bse_count++; - else - goto out_mds; - temp = offset >> PAGE_CACHE_SHIFT; - npg_zero = do_div(temp, npg_per_block); - isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) & - (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT); - extent_length = be->be_length - (isect - be->be_f_offset); - -fill_invalid_ext: - dprintk("%s need to zero %d pages\n", __func__, npg_zero); - for (;npg_zero > 0; npg_zero--) { - if (bl_is_sector_init(be->be_inval, isect)) { - dprintk("isect %llu already init\n", - (unsigned long long)isect); - goto next_page; - } - /* page ref released in bl_end_io_write_zero */ - index = isect >> PAGE_CACHE_SECTOR_SHIFT; - dprintk("%s zero %dth page: index %lu isect %llu\n", - __func__, npg_zero, index, - (unsigned long long)isect); - page = bl_find_get_zeroing_page(header->inode, index, - cow_read); - if (unlikely(IS_ERR(page))) { - header->pnfs_error = PTR_ERR(page); - goto out; - } else if (page == NULL) - goto next_page; - - ret = bl_mark_sectors_init(be->be_inval, isect, - PAGE_CACHE_SECTORS); - if (unlikely(ret)) { - dprintk("%s bl_mark_sectors_init fail %d\n", - __func__, ret); - end_page_writeback(page); - page_cache_release(page); - header->pnfs_error = ret; - goto out; - } - if (likely(!bl_push_one_short_extent(be->be_inval))) - par->bse_count++; - else { - end_page_writeback(page); - page_cache_release(page); - header->pnfs_error = -ENOMEM; - goto out; - } - /* FIXME: This should be done in bi_end_io */ - mark_extents_written(BLK_LSEG2EXT(header->lseg), - page->index << PAGE_CACHE_SHIFT, - PAGE_CACHE_SIZE); - - bio = bl_add_page_to_bio(bio, npg_zero, WRITE, - isect, page, be, - bl_end_io_write_zero, par); - if (IS_ERR(bio)) { - header->pnfs_error = PTR_ERR(bio); - bio = NULL; - goto out; - } -next_page: - isect += PAGE_CACHE_SECTORS; - extent_length -= PAGE_CACHE_SECTORS; - } - if (last) - goto write_done; - } - bio = bl_submit_bio(WRITE, bio); + /* we always write out the whole page */ + offset = offset & (loff_t)PAGE_CACHE_MASK; + isect = offset >> SECTOR_SHIFT; - /* Middle pages */ - pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT; - for (i = pg_index; i < wdata->pages.npages; i++) { - if (!extent_length) { + for (i = pg_index; i < header->page_array.npages; i++) { + if (extent_length <= 0) { /* We've used up the previous extent */ - bl_put_extent(be); - bl_put_extent(cow_read); bio = bl_submit_bio(WRITE, bio); /* Get the next one */ - be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg), - isect, &cow_read); - if (!be || !is_writable(be, isect)) { + if (!ext_tree_lookup(bl, isect, &be, true)) { header->pnfs_error = -EINVAL; goto out; } - if (be->be_state == PNFS_BLOCK_INVALID_DATA) { - if (likely(!bl_push_one_short_extent( - be->be_inval))) - par->bse_count++; - else { - header->pnfs_error = -ENOMEM; - goto out; - } - } - extent_length = be->be_length - - (isect - be->be_f_offset); - } - dprintk("%s offset %lld count 
%Zu\n", __func__, offset, count); - pg_offset = offset & ~PAGE_CACHE_MASK; - if (pg_offset + count > PAGE_CACHE_SIZE) - pg_len = PAGE_CACHE_SIZE - pg_offset; - else - pg_len = count; - - saved_len = pg_len; - if (be->be_state == PNFS_BLOCK_INVALID_DATA && - !bl_is_sector_init(be->be_inval, isect)) { - ret = bl_read_partial_page_sync(pages[i], cow_read, - pg_offset, pg_len, true); - if (ret) { - dprintk("%s bl_read_partial_page_sync fail %d\n", - __func__, ret); - header->pnfs_error = ret; - goto out; - } - - ret = bl_mark_sectors_init(be->be_inval, isect, - PAGE_CACHE_SECTORS); - if (unlikely(ret)) { - dprintk("%s bl_mark_sectors_init fail %d\n", - __func__, ret); - header->pnfs_error = ret; - goto out; - } - - /* Expand to full page write */ - pg_offset = 0; - pg_len = PAGE_CACHE_SIZE; - } else if ((pg_offset & (SECTOR_SIZE - 1)) || - (pg_len & (SECTOR_SIZE - 1))){ - /* ahh, nasty case. We have to do sync full sector - * read-modify-write cycles. - */ - unsigned int saved_offset = pg_offset; - ret = bl_read_partial_page_sync(pages[i], be, pg_offset, - pg_len, false); - pg_offset = round_down(pg_offset, SECTOR_SIZE); - pg_len = round_up(saved_offset + pg_len, SECTOR_SIZE) - - pg_offset; + extent_length = be.be_length - (isect - be.be_f_offset); } - - bio = do_add_page_to_bio(bio, wdata->pages.npages - i, WRITE, - isect, pages[i], be, + pg_len = PAGE_CACHE_SIZE; + bio = do_add_page_to_bio(bio, header->page_array.npages - i, + WRITE, isect, pages[i], &map, &be, bl_end_io_write, par, - pg_offset, pg_len); + 0, &pg_len); if (IS_ERR(bio)) { header->pnfs_error = PTR_ERR(bio); bio = NULL; goto out; } - offset += saved_len; - count -= saved_len; - isect += PAGE_CACHE_SECTORS; - last_isect = isect; - extent_length -= PAGE_CACHE_SECTORS; - } - /* Last page inside INVALID extent */ - if (be->be_state == PNFS_BLOCK_INVALID_DATA) { - bio = bl_submit_bio(WRITE, bio); - temp = last_isect >> PAGE_CACHE_SECTOR_SHIFT; - npg_zero = npg_per_block - do_div(temp, npg_per_block); - if (npg_zero < npg_per_block) { - last = 1; - goto fill_invalid_ext; - } + offset += pg_len; + count -= pg_len; + isect += (pg_len >> SECTOR_SHIFT); + extent_length -= (pg_len >> SECTOR_SHIFT); } -write_done: - wdata->res.count = wdata->args.count; + header->res.count = header->args.count; out: - bl_put_extent(be); - bl_put_extent(cow_read); bl_submit_bio(WRITE, bio); + blk_finish_plug(&plug); put_parallel(par); return PNFS_ATTEMPTED; -out_mds: - bl_put_extent(be); - bl_put_extent(cow_read); - kfree(par); - return PNFS_NOT_ATTEMPTED; -} - -/* FIXME - range ignored */ -static void -release_extents(struct pnfs_block_layout *bl, struct pnfs_layout_range *range) -{ - int i; - struct pnfs_block_extent *be; - - spin_lock(&bl->bl_ext_lock); - for (i = 0; i < EXTENT_LISTS; i++) { - while (!list_empty(&bl->bl_extents[i])) { - be = list_first_entry(&bl->bl_extents[i], - struct pnfs_block_extent, - be_node); - list_del(&be->be_node); - bl_put_extent(be); - } - } - spin_unlock(&bl->bl_ext_lock); -} - -static void -release_inval_marks(struct pnfs_inval_markings *marks) -{ - struct pnfs_inval_tracking *pos, *temp; - struct pnfs_block_short_extent *se, *stemp; - - list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) { - list_del(&pos->it_link); - kfree(pos); - } - - list_for_each_entry_safe(se, stemp, &marks->im_extents, bse_node) { - list_del(&se->bse_node); - kfree(se); - } - return; } static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo) { struct pnfs_block_layout *bl = BLK_LO2EXT(lo); + int err; dprintk("%s enter\n", 
__func__); - release_extents(bl, NULL); - release_inval_marks(&bl->bl_inval); + + err = ext_tree_remove(bl, true, 0, LLONG_MAX); + WARN_ON(err); + kfree(bl); } @@ -961,14 +460,11 @@ static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode, bl = kzalloc(sizeof(*bl), gfp_flags); if (!bl) return NULL; + + bl->bl_ext_rw = RB_ROOT; + bl->bl_ext_ro = RB_ROOT; spin_lock_init(&bl->bl_ext_lock); - INIT_LIST_HEAD(&bl->bl_extents[0]); - INIT_LIST_HEAD(&bl->bl_extents[1]); - INIT_LIST_HEAD(&bl->bl_commit); - INIT_LIST_HEAD(&bl->bl_committing); - bl->bl_count = 0; - bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> SECTOR_SHIFT; - BL_INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize); + return &bl->bl_layout; } @@ -978,215 +474,318 @@ static void bl_free_lseg(struct pnfs_layout_segment *lseg) kfree(lseg); } -/* We pretty much ignore lseg, and store all data layout wide, so we - * can correctly merge. - */ -static struct pnfs_layout_segment *bl_alloc_lseg(struct pnfs_layout_hdr *lo, - struct nfs4_layoutget_res *lgr, - gfp_t gfp_flags) -{ - struct pnfs_layout_segment *lseg; - int status; +/* Tracks info needed to ensure extents in layout obey constraints of spec */ +struct layout_verification { + u32 mode; /* R or RW */ + u64 start; /* Expected start of next non-COW extent */ + u64 inval; /* Start of INVAL coverage */ + u64 cowread; /* End of COW read coverage */ +}; - dprintk("%s enter\n", __func__); - lseg = kzalloc(sizeof(*lseg), gfp_flags); - if (!lseg) - return ERR_PTR(-ENOMEM); - status = nfs4_blk_process_layoutget(lo, lgr, gfp_flags); - if (status) { - /* We don't want to call the full-blown bl_free_lseg, - * since on error extents were not touched. - */ - kfree(lseg); - return ERR_PTR(status); +/* Verify the extent meets the layout requirements of the pnfs-block draft, + * section 2.3.1. + */ +static int verify_extent(struct pnfs_block_extent *be, + struct layout_verification *lv) +{ + if (lv->mode == IOMODE_READ) { + if (be->be_state == PNFS_BLOCK_READWRITE_DATA || + be->be_state == PNFS_BLOCK_INVALID_DATA) + return -EIO; + if (be->be_f_offset != lv->start) + return -EIO; + lv->start += be->be_length; + return 0; } - return lseg; + /* lv->mode == IOMODE_RW */ + if (be->be_state == PNFS_BLOCK_READWRITE_DATA) { + if (be->be_f_offset != lv->start) + return -EIO; + if (lv->cowread > lv->start) + return -EIO; + lv->start += be->be_length; + lv->inval = lv->start; + return 0; + } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) { + if (be->be_f_offset != lv->start) + return -EIO; + lv->start += be->be_length; + return 0; + } else if (be->be_state == PNFS_BLOCK_READ_DATA) { + if (be->be_f_offset > lv->start) + return -EIO; + if (be->be_f_offset < lv->inval) + return -EIO; + if (be->be_f_offset < lv->cowread) + return -EIO; + /* It looks like you might want to min this with lv->start, + * but you really don't. 
+ */ + lv->inval = lv->inval + be->be_length; + lv->cowread = be->be_f_offset + be->be_length; + return 0; + } else + return -EIO; } -static void -bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr, - const struct nfs4_layoutcommit_args *arg) +static int decode_sector_number(__be32 **rp, sector_t *sp) { - dprintk("%s enter\n", __func__); - encode_pnfs_block_layoutupdate(BLK_LO2EXT(lo), xdr, arg); + uint64_t s; + + *rp = xdr_decode_hyper(*rp, &s); + if (s & 0x1ff) { + printk(KERN_WARNING "NFS: %s: sector not aligned\n", __func__); + return -1; + } + *sp = s >> SECTOR_SHIFT; + return 0; } -static void -bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata) +static int +bl_alloc_extent(struct xdr_stream *xdr, struct pnfs_layout_hdr *lo, + struct layout_verification *lv, struct list_head *extents, + gfp_t gfp_mask) { - struct pnfs_layout_hdr *lo = NFS_I(lcdata->args.inode)->layout; + struct pnfs_block_extent *be; + struct nfs4_deviceid id; + int error; + __be32 *p; - dprintk("%s enter\n", __func__); - clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), &lcdata->args, lcdata->res.status); -} + p = xdr_inline_decode(xdr, 28 + NFS4_DEVICEID4_SIZE); + if (!p) + return -EIO; -static void free_blk_mountid(struct block_mount_id *mid) -{ - if (mid) { - struct pnfs_block_dev *dev, *tmp; + be = kzalloc(sizeof(*be), GFP_NOFS); + if (!be) + return -ENOMEM; - /* No need to take bm_lock as we are last user freeing bm_devlist */ - list_for_each_entry_safe(dev, tmp, &mid->bm_devlist, bm_node) { - list_del(&dev->bm_node); - bl_free_block_dev(dev); - } - kfree(mid); + memcpy(&id, p, NFS4_DEVICEID4_SIZE); + p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); + + error = -EIO; + be->be_device = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), &id, + lo->plh_lc_cred, gfp_mask); + if (!be->be_device) + goto out_free_be; + + /* + * The next three values are read in as bytes, but stored in the + * extent structure in 512-byte granularity. + */ + if (decode_sector_number(&p, &be->be_f_offset) < 0) + goto out_put_deviceid; + if (decode_sector_number(&p, &be->be_length) < 0) + goto out_put_deviceid; + if (decode_sector_number(&p, &be->be_v_offset) < 0) + goto out_put_deviceid; + be->be_state = be32_to_cpup(p++); + + error = verify_extent(be, lv); + if (error) { + dprintk("%s: extent verification failed\n", __func__); + goto out_put_deviceid; } + + list_add_tail(&be->be_list, extents); + return 0; + +out_put_deviceid: + nfs4_put_deviceid_node(be->be_device); +out_free_be: + kfree(be); + return error; } -/* This is mostly copied from the filelayout_get_device_info function. - * It seems much of this should be at the generic pnfs level. 
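
decode_sector_number() takes byte quantities off the XDR stream and stores them in 512-byte sectors, rejecting anything not sector-aligned. Worked through with two sample values:

	0x200400 bytes: 0x200400 & 0x1ff == 0 -> accepted, sector 0x200400 >> 9 = 0x1002
	0x200401 bytes: 0x200401 & 0x1ff == 1 -> rejected ("sector not aligned")

Since be_f_offset, be_length and be_v_offset all pass through this path, every extent field is sector-granular before verify_extent() runs.
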
- */ -static struct pnfs_block_dev * -nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh, - struct nfs4_deviceid *d_id) +static struct pnfs_layout_segment * +bl_alloc_lseg(struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr, + gfp_t gfp_mask) { - struct pnfs_device *dev; - struct pnfs_block_dev *rv; - u32 max_resp_sz; - int max_pages; - struct page **pages = NULL; - int i, rc; + struct layout_verification lv = { + .mode = lgr->range.iomode, + .start = lgr->range.offset >> SECTOR_SHIFT, + .inval = lgr->range.offset >> SECTOR_SHIFT, + .cowread = lgr->range.offset >> SECTOR_SHIFT, + }; + struct pnfs_block_layout *bl = BLK_LO2EXT(lo); + struct pnfs_layout_segment *lseg; + struct xdr_buf buf; + struct xdr_stream xdr; + struct page *scratch; + int status, i; + uint32_t count; + __be32 *p; + LIST_HEAD(extents); + + dprintk("---> %s\n", __func__); + + lseg = kzalloc(sizeof(*lseg), gfp_mask); + if (!lseg) + return ERR_PTR(-ENOMEM); + + status = -ENOMEM; + scratch = alloc_page(gfp_mask); + if (!scratch) + goto out; + + xdr_init_decode_pages(&xdr, &buf, + lgr->layoutp->pages, lgr->layoutp->len); + xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE); + + status = -EIO; + p = xdr_inline_decode(&xdr, 4); + if (unlikely(!p)) + goto out_free_scratch; + + count = be32_to_cpup(p++); + dprintk("%s: number of extents %d\n", __func__, count); /* - * Use the session max response size as the basis for setting - * GETDEVICEINFO's maxcount + * Decode individual extents, putting them in temporary staging area + * until whole layout is decoded to make error recovery easier. */ - max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; - max_pages = nfs_page_array_len(0, max_resp_sz); - dprintk("%s max_resp_sz %u max_pages %d\n", - __func__, max_resp_sz, max_pages); - - dev = kmalloc(sizeof(*dev), GFP_NOFS); - if (!dev) { - dprintk("%s kmalloc failed\n", __func__); - return ERR_PTR(-ENOMEM); + for (i = 0; i < count; i++) { + status = bl_alloc_extent(&xdr, lo, &lv, &extents, gfp_mask); + if (status) + goto process_extents; } - pages = kzalloc(max_pages * sizeof(struct page *), GFP_NOFS); - if (pages == NULL) { - kfree(dev); - return ERR_PTR(-ENOMEM); + if (lgr->range.offset + lgr->range.length != + lv.start << SECTOR_SHIFT) { + dprintk("%s Final length mismatch\n", __func__); + status = -EIO; + goto process_extents; } - for (i = 0; i < max_pages; i++) { - pages[i] = alloc_page(GFP_NOFS); - if (!pages[i]) { - rv = ERR_PTR(-ENOMEM); - goto out_free; - } + + if (lv.start < lv.cowread) { + dprintk("%s Final uncovered COW extent\n", __func__); + status = -EIO; } - memcpy(&dev->dev_id, d_id, sizeof(*d_id)); - dev->layout_type = LAYOUT_BLOCK_VOLUME; - dev->pages = pages; - dev->pgbase = 0; - dev->pglen = PAGE_SIZE * max_pages; - dev->mincount = 0; - dev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead; - - dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data); - rc = nfs4_proc_getdeviceinfo(server, dev, NULL); - dprintk("%s getdevice info returns %d\n", __func__, rc); - if (rc) { - rv = ERR_PTR(rc); - goto out_free; +process_extents: + while (!list_empty(&extents)) { + struct pnfs_block_extent *be = + list_first_entry(&extents, struct pnfs_block_extent, + be_list); + list_del(&be->be_list); + + if (!status) + status = ext_tree_insert(bl, be); + + if (status) { + nfs4_put_deviceid_node(be->be_device); + kfree(be); + } } - rv = nfs4_blk_decode_device(server, dev); - out_free: - for (i = 0; i < max_pages; i++) - __free_page(pages[i]); - kfree(pages); - kfree(dev); - 
return rv; +out_free_scratch: + __free_page(scratch); +out: + dprintk("%s returns %d\n", __func__, status); + if (status) { + kfree(lseg); + return ERR_PTR(status); + } + return lseg; } -static int -bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh) +static void +bl_return_range(struct pnfs_layout_hdr *lo, + struct pnfs_layout_range *range) { - struct block_mount_id *b_mt_id = NULL; - struct pnfs_devicelist *dlist = NULL; - struct pnfs_block_dev *bdev; - LIST_HEAD(block_disklist); - int status, i; - - dprintk("%s enter\n", __func__); + struct pnfs_block_layout *bl = BLK_LO2EXT(lo); + sector_t offset = range->offset >> SECTOR_SHIFT, end; - if (server->pnfs_blksize == 0) { - dprintk("%s Server did not return blksize\n", __func__); - return -EINVAL; - } - b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_NOFS); - if (!b_mt_id) { - status = -ENOMEM; - goto out_error; - } - /* Initialize nfs4 block layout mount id */ - spin_lock_init(&b_mt_id->bm_lock); - INIT_LIST_HEAD(&b_mt_id->bm_devlist); - - dlist = kmalloc(sizeof(struct pnfs_devicelist), GFP_NOFS); - if (!dlist) { - status = -ENOMEM; - goto out_error; + if (range->offset % 8) { + dprintk("%s: offset %lld not block size aligned\n", + __func__, range->offset); + return; } - dlist->eof = 0; - while (!dlist->eof) { - status = nfs4_proc_getdevicelist(server, fh, dlist); - if (status) - goto out_error; - dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n", - __func__, dlist->num_devs, dlist->eof); - for (i = 0; i < dlist->num_devs; i++) { - bdev = nfs4_blk_get_deviceinfo(server, fh, - &dlist->dev_id[i]); - if (IS_ERR(bdev)) { - status = PTR_ERR(bdev); - goto out_error; - } - spin_lock(&b_mt_id->bm_lock); - list_add(&bdev->bm_node, &b_mt_id->bm_devlist); - spin_unlock(&b_mt_id->bm_lock); + + if (range->length != NFS4_MAX_UINT64) { + if (range->length % 8) { + dprintk("%s: length %lld not block size aligned\n", + __func__, range->length); + return; } - } - dprintk("%s SUCCESS\n", __func__); - server->pnfs_ld_data = b_mt_id; - out_return: - kfree(dlist); - return status; + end = offset + (range->length >> SECTOR_SHIFT); + } else { + end = round_down(NFS4_MAX_UINT64, PAGE_SIZE); + } - out_error: - free_blk_mountid(b_mt_id); - goto out_return; + ext_tree_remove(bl, range->iomode & IOMODE_RW, offset, end); } static int -bl_clear_layoutdriver(struct nfs_server *server) +bl_prepare_layoutcommit(struct nfs4_layoutcommit_args *arg) { - struct block_mount_id *b_mt_id = server->pnfs_ld_data; + return ext_tree_prepare_commit(arg); +} +static void +bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata) +{ + ext_tree_mark_committed(&lcdata->args, lcdata->res.status); +} + +static int +bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh) +{ dprintk("%s enter\n", __func__); - free_blk_mountid(b_mt_id); - dprintk("%s RETURNS\n", __func__); + + if (server->pnfs_blksize == 0) { + dprintk("%s Server did not return blksize\n", __func__); + return -EINVAL; + } + if (server->pnfs_blksize > PAGE_SIZE) { + printk(KERN_ERR "%s: pNFS blksize %d not supported.\n", + __func__, server->pnfs_blksize); + return -EINVAL; + } + return 0; } static bool -is_aligned_req(struct nfs_page *req, unsigned int alignment) +is_aligned_req(struct nfs_pageio_descriptor *pgio, + struct nfs_page *req, unsigned int alignment) { - return IS_ALIGNED(req->wb_offset, alignment) && - IS_ALIGNED(req->wb_bytes, alignment); + /* + * Always accept buffered writes, higher layers take care of the + * right alignment. 
+ */ + if (pgio->pg_dreq == NULL) + return true; + + if (!IS_ALIGNED(req->wb_offset, alignment)) + return false; + + if (IS_ALIGNED(req->wb_bytes, alignment)) + return true; + + if (req_offset(req) + req->wb_bytes == i_size_read(pgio->pg_inode)) { + /* + * If the write goes up to the inode size, just write + * the full page. Data past the inode size is + * guaranteed to be zeroed by the higher level client + * code, and this behaviour is mandated by RFC 5663 + * section 2.3.2. + */ + return true; + } + + return false; } static void bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { - if (pgio->pg_dreq != NULL && - !is_aligned_req(req, SECTOR_SIZE)) + if (!is_aligned_req(pgio, req, SECTOR_SIZE)) { nfs_pageio_reset_read_mds(pgio); - else - pnfs_generic_pg_init_read(pgio, req); + return; + } + + pnfs_generic_pg_init_read(pgio, req); } /* @@ -1197,10 +796,8 @@ static size_t bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req) { - if (pgio->pg_dreq != NULL && - !is_aligned_req(req, SECTOR_SIZE)) + if (!is_aligned_req(pgio, req, SECTOR_SIZE)) return 0; - return pnfs_generic_pg_test(pgio, prev, req); } @@ -1230,19 +827,20 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx) static void bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { - if (pgio->pg_dreq != NULL && - !is_aligned_req(req, PAGE_CACHE_SIZE)) { + u64 wb_size; + + if (!is_aligned_req(pgio, req, PAGE_SIZE)) { nfs_pageio_reset_write_mds(pgio); - } else { - u64 wb_size; - if (pgio->pg_dreq == NULL) - wb_size = pnfs_num_cont_bytes(pgio->pg_inode, - req->wb_index); - else - wb_size = nfs_dreq_bytes_left(pgio->pg_dreq); - - pnfs_generic_pg_init_write(pgio, req, wb_size); + return; } + + if (pgio->pg_dreq == NULL) + wb_size = pnfs_num_cont_bytes(pgio->pg_inode, + req->wb_index); + else + wb_size = nfs_dreq_bytes_left(pgio->pg_dreq); + + pnfs_generic_pg_init_write(pgio, req, wb_size); } /* @@ -1253,10 +851,8 @@ static size_t bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req) { - if (pgio->pg_dreq != NULL && - !is_aligned_req(req, PAGE_CACHE_SIZE)) + if (!is_aligned_req(pgio, req, PAGE_SIZE)) return 0; - return pnfs_generic_pg_test(pgio, prev, req); } @@ -1276,146 +872,24 @@ static struct pnfs_layoutdriver_type blocklayout_type = { .id = LAYOUT_BLOCK_VOLUME, .name = "LAYOUT_BLOCK_VOLUME", .owner = THIS_MODULE, + .flags = PNFS_LAYOUTRET_ON_SETATTR | + PNFS_READ_WHOLE_PAGE, .read_pagelist = bl_read_pagelist, .write_pagelist = bl_write_pagelist, .alloc_layout_hdr = bl_alloc_layout_hdr, .free_layout_hdr = bl_free_layout_hdr, .alloc_lseg = bl_alloc_lseg, .free_lseg = bl_free_lseg, - .encode_layoutcommit = bl_encode_layoutcommit, + .return_range = bl_return_range, + .prepare_layoutcommit = bl_prepare_layoutcommit, .cleanup_layoutcommit = bl_cleanup_layoutcommit, .set_layoutdriver = bl_set_layoutdriver, - .clear_layoutdriver = bl_clear_layoutdriver, + .alloc_deviceid_node = bl_alloc_deviceid_node, + .free_deviceid_node = bl_free_deviceid_node, .pg_read_ops = &bl_pg_read_ops, .pg_write_ops = &bl_pg_write_ops, }; -static const struct rpc_pipe_ops bl_upcall_ops = { - .upcall = rpc_pipe_generic_upcall, - .downcall = bl_pipe_downcall, - .destroy_msg = bl_pipe_destroy_msg, -}; - -static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb, - struct rpc_pipe *pipe) -{ - struct dentry *dir, *dentry; - - dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME); - if (dir == NULL) - return 
ERR_PTR(-ENOENT); - dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe); - dput(dir); - return dentry; -} - -static void nfs4blocklayout_unregister_sb(struct super_block *sb, - struct rpc_pipe *pipe) -{ - if (pipe->dentry) - rpc_unlink(pipe->dentry); -} - -static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, - void *ptr) -{ - struct super_block *sb = ptr; - struct net *net = sb->s_fs_info; - struct nfs_net *nn = net_generic(net, nfs_net_id); - struct dentry *dentry; - int ret = 0; - - if (!try_module_get(THIS_MODULE)) - return 0; - - if (nn->bl_device_pipe == NULL) { - module_put(THIS_MODULE); - return 0; - } - - switch (event) { - case RPC_PIPEFS_MOUNT: - dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe); - if (IS_ERR(dentry)) { - ret = PTR_ERR(dentry); - break; - } - nn->bl_device_pipe->dentry = dentry; - break; - case RPC_PIPEFS_UMOUNT: - if (nn->bl_device_pipe->dentry) - nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe); - break; - default: - ret = -ENOTSUPP; - break; - } - module_put(THIS_MODULE); - return ret; -} - -static struct notifier_block nfs4blocklayout_block = { - .notifier_call = rpc_pipefs_event, -}; - -static struct dentry *nfs4blocklayout_register_net(struct net *net, - struct rpc_pipe *pipe) -{ - struct super_block *pipefs_sb; - struct dentry *dentry; - - pipefs_sb = rpc_get_sb_net(net); - if (!pipefs_sb) - return NULL; - dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe); - rpc_put_sb_net(net); - return dentry; -} - -static void nfs4blocklayout_unregister_net(struct net *net, - struct rpc_pipe *pipe) -{ - struct super_block *pipefs_sb; - - pipefs_sb = rpc_get_sb_net(net); - if (pipefs_sb) { - nfs4blocklayout_unregister_sb(pipefs_sb, pipe); - rpc_put_sb_net(net); - } -} - -static int nfs4blocklayout_net_init(struct net *net) -{ - struct nfs_net *nn = net_generic(net, nfs_net_id); - struct dentry *dentry; - - init_waitqueue_head(&nn->bl_wq); - nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0); - if (IS_ERR(nn->bl_device_pipe)) - return PTR_ERR(nn->bl_device_pipe); - dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe); - if (IS_ERR(dentry)) { - rpc_destroy_pipe_data(nn->bl_device_pipe); - return PTR_ERR(dentry); - } - nn->bl_device_pipe->dentry = dentry; - return 0; -} - -static void nfs4blocklayout_net_exit(struct net *net) -{ - struct nfs_net *nn = net_generic(net, nfs_net_id); - - nfs4blocklayout_unregister_net(net, nn->bl_device_pipe); - rpc_destroy_pipe_data(nn->bl_device_pipe); - nn->bl_device_pipe = NULL; -} - -static struct pernet_operations nfs4blocklayout_net_ops = { - .init = nfs4blocklayout_net_init, - .exit = nfs4blocklayout_net_exit, -}; - static int __init nfs4blocklayout_init(void) { int ret; @@ -1425,20 +899,14 @@ static int __init nfs4blocklayout_init(void) ret = pnfs_register_layoutdriver(&blocklayout_type); if (ret) goto out; - - ret = rpc_pipefs_notifier_register(&nfs4blocklayout_block); + ret = bl_init_pipefs(); if (ret) - goto out_remove; - ret = register_pernet_subsys(&nfs4blocklayout_net_ops); - if (ret) - goto out_notifier; -out: - return ret; + goto out_unregister; + return 0; -out_notifier: - rpc_pipefs_notifier_unregister(&nfs4blocklayout_block); -out_remove: +out_unregister: pnfs_unregister_layoutdriver(&blocklayout_type); +out: return ret; } @@ -1447,8 +915,7 @@ static void __exit nfs4blocklayout_exit(void) dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n", __func__); - rpc_pipefs_notifier_unregister(&nfs4blocklayout_block); - 
unregister_pernet_subsys(&nfs4blocklayout_net_ops); + bl_cleanup_pipefs(); pnfs_unregister_layoutdriver(&blocklayout_type); } diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index 9838fb020473..92dca9e90d8d 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -44,105 +44,112 @@ #define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT) #define SECTOR_SIZE (1 << SECTOR_SHIFT) -struct block_mount_id { - spinlock_t bm_lock; /* protects list */ - struct list_head bm_devlist; /* holds pnfs_block_dev */ -}; +struct pnfs_block_dev; -struct pnfs_block_dev { - struct list_head bm_node; - struct nfs4_deviceid bm_mdevid; /* associated devid */ - struct block_device *bm_mdev; /* meta device itself */ - struct net *net; +enum pnfs_block_volume_type { + PNFS_BLOCK_VOLUME_SIMPLE = 0, + PNFS_BLOCK_VOLUME_SLICE = 1, + PNFS_BLOCK_VOLUME_CONCAT = 2, + PNFS_BLOCK_VOLUME_STRIPE = 3, }; -enum exstate4 { - PNFS_BLOCK_READWRITE_DATA = 0, - PNFS_BLOCK_READ_DATA = 1, - PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */ - PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */ +#define PNFS_BLOCK_MAX_UUIDS 4 +#define PNFS_BLOCK_MAX_DEVICES 64 + +/* + * Random upper cap for the uuid length to avoid unbounded allocation. + * Not actually limited by the protocol. + */ +#define PNFS_BLOCK_UUID_LEN 128 + + +struct pnfs_block_volume { + enum pnfs_block_volume_type type; + union { + struct { + int len; + int nr_sigs; + struct { + u64 offset; + u32 sig_len; + u8 sig[PNFS_BLOCK_UUID_LEN]; + } sigs[PNFS_BLOCK_MAX_UUIDS]; + } simple; + struct { + u64 start; + u64 len; + u32 volume; + } slice; + struct { + u32 volumes_count; + u32 volumes[PNFS_BLOCK_MAX_DEVICES]; + } concat; + struct { + u64 chunk_size; + u32 volumes_count; + u32 volumes[PNFS_BLOCK_MAX_DEVICES]; + } stripe; + }; }; -#define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */ +struct pnfs_block_dev_map { + sector_t start; + sector_t len; -struct my_tree { - sector_t mtt_step_size; /* Internal sector alignment */ - struct list_head mtt_stub; /* Should be a radix tree */ + sector_t disk_offset; + struct block_device *bdev; }; -struct pnfs_inval_markings { - spinlock_t im_lock; - struct my_tree im_tree; /* Sectors that need LAYOUTCOMMIT */ - sector_t im_block_size; /* Server blocksize in sectors */ - struct list_head im_extents; /* Short extents for INVAL->RW conversion */ +struct pnfs_block_dev { + struct nfs4_deviceid_node node; + + u64 start; + u64 len; + + u32 nr_children; + struct pnfs_block_dev *children; + u64 chunk_size; + + struct block_device *bdev; + u64 disk_offset; + + bool (*map)(struct pnfs_block_dev *dev, u64 offset, + struct pnfs_block_dev_map *map); }; -struct pnfs_inval_tracking { - struct list_head it_link; - int it_sector; - int it_tags; +enum exstate4 { + PNFS_BLOCK_READWRITE_DATA = 0, + PNFS_BLOCK_READ_DATA = 1, + PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */ + PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */ }; /* sector_t fields are all in 512-byte sectors */ struct pnfs_block_extent { - struct kref be_refcnt; - struct list_head be_node; /* link into lseg list */ - struct nfs4_deviceid be_devid; /* FIXME: could use device cache instead */ - struct block_device *be_mdev; + union { + struct rb_node be_node; + struct list_head be_list; + }; + struct nfs4_deviceid_node *be_device; sector_t be_f_offset; /* the starting offset in the file */ sector_t be_length; /* the size of the extent */ sector_t be_v_offset; /* the starting 
offset in the volume */ enum exstate4 be_state; /* the state of this extent */ - struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */ +#define EXTENT_WRITTEN 1 +#define EXTENT_COMMITTING 2 + unsigned int be_tag; }; -/* Shortened extent used by LAYOUTCOMMIT */ -struct pnfs_block_short_extent { - struct list_head bse_node; - struct nfs4_deviceid bse_devid; - struct block_device *bse_mdev; - sector_t bse_f_offset; /* the starting offset in the file */ - sector_t bse_length; /* the size of the extent */ -}; - -static inline void -BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize) -{ - spin_lock_init(&marks->im_lock); - INIT_LIST_HEAD(&marks->im_tree.mtt_stub); - INIT_LIST_HEAD(&marks->im_extents); - marks->im_block_size = blocksize; - marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS, - blocksize); -} - -enum extentclass4 { - RW_EXTENT = 0, /* READWRTE and INVAL */ - RO_EXTENT = 1, /* READ and NONE */ - EXTENT_LISTS = 2, -}; - -static inline int bl_choose_list(enum exstate4 state) -{ - if (state == PNFS_BLOCK_READ_DATA || state == PNFS_BLOCK_NONE_DATA) - return RO_EXTENT; - else - return RW_EXTENT; -} +/* on the wire size of the extent */ +#define BL_EXTENT_SIZE (7 * sizeof(__be32) + NFS4_DEVICEID4_SIZE) struct pnfs_block_layout { - struct pnfs_layout_hdr bl_layout; - struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */ + struct pnfs_layout_hdr bl_layout; + struct rb_root bl_ext_rw; + struct rb_root bl_ext_ro; spinlock_t bl_ext_lock; /* Protects list manipulation */ - struct list_head bl_extents[EXTENT_LISTS]; /* R and RW extents */ - struct list_head bl_commit; /* Needs layout commit */ - struct list_head bl_committing; /* Layout committing */ - unsigned int bl_count; /* entries in bl_commit */ - sector_t bl_blocksize; /* Server blocksize in sectors */ }; -#define BLK_ID(lo) ((struct block_mount_id *)(NFS_SERVER(lo->plh_inode)->pnfs_ld_data)) - static inline struct pnfs_block_layout * BLK_LO2EXT(struct pnfs_layout_hdr *lo) { @@ -171,41 +178,27 @@ struct bl_msg_hdr { #define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */ #define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */ -/* blocklayoutdev.c */ -ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t); -void bl_pipe_destroy_msg(struct rpc_pipe_msg *); -void nfs4_blkdev_put(struct block_device *bdev); -struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server, - struct pnfs_device *dev); -int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, - struct nfs4_layoutget_res *lgr, gfp_t gfp_flags); - -/* blocklayoutdm.c */ -void bl_free_block_dev(struct pnfs_block_dev *bdev); - -/* extents.c */ -struct pnfs_block_extent * -bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect, - struct pnfs_block_extent **cow_read); -int bl_mark_sectors_init(struct pnfs_inval_markings *marks, - sector_t offset, sector_t length); -void bl_put_extent(struct pnfs_block_extent *be); -struct pnfs_block_extent *bl_alloc_extent(void); -int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect); -int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, - struct xdr_stream *xdr, - const struct nfs4_layoutcommit_args *arg); -void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, - const struct nfs4_layoutcommit_args *arg, - int status); -int bl_add_merge_extent(struct pnfs_block_layout *bl, - struct pnfs_block_extent *new); -int bl_mark_for_commit(struct pnfs_block_extent *be, - sector_t offset, sector_t 
length, - struct pnfs_block_short_extent *new); -int bl_push_one_short_extent(struct pnfs_inval_markings *marks); -struct pnfs_block_short_extent * -bl_pop_one_short_extent(struct pnfs_inval_markings *marks); -void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free); +/* dev.c */ +struct nfs4_deviceid_node *bl_alloc_deviceid_node(struct nfs_server *server, + struct pnfs_device *pdev, gfp_t gfp_mask); +void bl_free_deviceid_node(struct nfs4_deviceid_node *d); + +/* extent_tree.c */ +int ext_tree_insert(struct pnfs_block_layout *bl, + struct pnfs_block_extent *new); +int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, sector_t start, + sector_t end); +int ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start, + sector_t len); +bool ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect, + struct pnfs_block_extent *ret, bool rw); +int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg); +void ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status); + +/* rpc_pipefs.c */ +dev_t bl_resolve_deviceid(struct nfs_server *server, + struct pnfs_block_volume *b, gfp_t gfp_mask); +int __init bl_init_pipefs(void); +void __exit bl_cleanup_pipefs(void); #endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c deleted file mode 100644 index 04303b5c9361..000000000000 --- a/fs/nfs/blocklayout/blocklayoutdev.c +++ /dev/null @@ -1,384 +0,0 @@ -/* - * linux/fs/nfs/blocklayout/blocklayoutdev.c - * - * Device operations for the pnfs nfs4 file layout driver. - * - * Copyright (c) 2006 The Regents of the University of Michigan. - * All rights reserved. - * - * Andy Adamson <andros@citi.umich.edu> - * Fred Isaman <iisaman@umich.edu> - * - * permission is granted to use, copy, create derivative works and - * redistribute this software and such derivative works for any purpose, - * so long as the name of the university of michigan is not used in - * any advertising or publicity pertaining to the use or distribution - * of this software without specific, written prior authorization. if - * the above copyright notice or any other identification of the - * university of michigan is included in any copy of any portion of - * this software, then the disclaimer below must also be included. - * - * this software is provided as is, without representation from the - * university of michigan as to its fitness for any purpose, and without - * warranty by the university of michigan of any kind, either express - * or implied, including without limitation the implied warranties of - * merchantability and fitness for a particular purpose. the regents - * of the university of michigan shall not be liable for any damages, - * including special, indirect, incidental, or consequential damages, - * with respect to any claim arising out or in connection with the use - * of the software, even if it has been or is hereafter advised of the - * possibility of such damages. 
- */ -#include <linux/module.h> -#include <linux/buffer_head.h> /* __bread */ - -#include <linux/genhd.h> -#include <linux/blkdev.h> -#include <linux/hash.h> - -#include "blocklayout.h" - -#define NFSDBG_FACILITY NFSDBG_PNFS_LD - -static int decode_sector_number(__be32 **rp, sector_t *sp) -{ - uint64_t s; - - *rp = xdr_decode_hyper(*rp, &s); - if (s & 0x1ff) { - printk(KERN_WARNING "NFS: %s: sector not aligned\n", __func__); - return -1; - } - *sp = s >> SECTOR_SHIFT; - return 0; -} - -/* - * Release the block device - */ -void nfs4_blkdev_put(struct block_device *bdev) -{ - dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev), - MINOR(bdev->bd_dev)); - blkdev_put(bdev, FMODE_READ); -} - -ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, - size_t mlen) -{ - struct nfs_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info, - nfs_net_id); - - if (mlen != sizeof (struct bl_dev_msg)) - return -EINVAL; - - if (copy_from_user(&nn->bl_mount_reply, src, mlen) != 0) - return -EFAULT; - - wake_up(&nn->bl_wq); - - return mlen; -} - -void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg) -{ - struct bl_pipe_msg *bl_pipe_msg = container_of(msg, struct bl_pipe_msg, msg); - - if (msg->errno >= 0) - return; - wake_up(bl_pipe_msg->bl_wq); -} - -/* - * Decodes pnfs_block_deviceaddr4 which is XDR encoded in dev->dev_addr_buf. - */ -struct pnfs_block_dev * -nfs4_blk_decode_device(struct nfs_server *server, - struct pnfs_device *dev) -{ - struct pnfs_block_dev *rv; - struct block_device *bd = NULL; - struct bl_pipe_msg bl_pipe_msg; - struct rpc_pipe_msg *msg = &bl_pipe_msg.msg; - struct bl_msg_hdr bl_msg = { - .type = BL_DEVICE_MOUNT, - .totallen = dev->mincount, - }; - uint8_t *dataptr; - DECLARE_WAITQUEUE(wq, current); - int offset, len, i, rc; - struct net *net = server->nfs_client->cl_net; - struct nfs_net *nn = net_generic(net, nfs_net_id); - struct bl_dev_msg *reply = &nn->bl_mount_reply; - - dprintk("%s CREATING PIPEFS MESSAGE\n", __func__); - dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data, - dev->mincount); - - bl_pipe_msg.bl_wq = &nn->bl_wq; - memset(msg, 0, sizeof(*msg)); - msg->data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS); - if (!msg->data) { - rv = ERR_PTR(-ENOMEM); - goto out; - } - - memcpy(msg->data, &bl_msg, sizeof(bl_msg)); - dataptr = (uint8_t *) msg->data; - len = dev->mincount; - offset = sizeof(bl_msg); - for (i = 0; len > 0; i++) { - memcpy(&dataptr[offset], page_address(dev->pages[i]), - len < PAGE_CACHE_SIZE ? 
len : PAGE_CACHE_SIZE); - len -= PAGE_CACHE_SIZE; - offset += PAGE_CACHE_SIZE; - } - msg->len = sizeof(bl_msg) + dev->mincount; - - dprintk("%s CALLING USERSPACE DAEMON\n", __func__); - add_wait_queue(&nn->bl_wq, &wq); - rc = rpc_queue_upcall(nn->bl_device_pipe, msg); - if (rc < 0) { - remove_wait_queue(&nn->bl_wq, &wq); - rv = ERR_PTR(rc); - goto out; - } - - set_current_state(TASK_UNINTERRUPTIBLE); - schedule(); - __set_current_state(TASK_RUNNING); - remove_wait_queue(&nn->bl_wq, &wq); - - if (reply->status != BL_DEVICE_REQUEST_PROC) { - dprintk("%s failed to open device: %d\n", - __func__, reply->status); - rv = ERR_PTR(-EINVAL); - goto out; - } - - bd = blkdev_get_by_dev(MKDEV(reply->major, reply->minor), - FMODE_READ, NULL); - if (IS_ERR(bd)) { - dprintk("%s failed to open device : %ld\n", __func__, - PTR_ERR(bd)); - rv = ERR_CAST(bd); - goto out; - } - - rv = kzalloc(sizeof(*rv), GFP_NOFS); - if (!rv) { - rv = ERR_PTR(-ENOMEM); - goto out; - } - - rv->bm_mdev = bd; - memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid)); - rv->net = net; - dprintk("%s Created device %s with bd_block_size %u\n", - __func__, - bd->bd_disk->disk_name, - bd->bd_block_size); - -out: - kfree(msg->data); - return rv; -} - -/* Map deviceid returned by the server to constructed block_device */ -static struct block_device *translate_devid(struct pnfs_layout_hdr *lo, - struct nfs4_deviceid *id) -{ - struct block_device *rv = NULL; - struct block_mount_id *mid; - struct pnfs_block_dev *dev; - - dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id); - mid = BLK_ID(lo); - spin_lock(&mid->bm_lock); - list_for_each_entry(dev, &mid->bm_devlist, bm_node) { - if (memcmp(id->data, dev->bm_mdevid.data, - NFS4_DEVICEID4_SIZE) == 0) { - rv = dev->bm_mdev; - goto out; - } - } - out: - spin_unlock(&mid->bm_lock); - dprintk("%s returning %p\n", __func__, rv); - return rv; -} - -/* Tracks info needed to ensure extents in layout obey constraints of spec */ -struct layout_verification { - u32 mode; /* R or RW */ - u64 start; /* Expected start of next non-COW extent */ - u64 inval; /* Start of INVAL coverage */ - u64 cowread; /* End of COW read coverage */ -}; - -/* Verify the extent meets the layout requirements of the pnfs-block draft, - * section 2.3.1. - */ -static int verify_extent(struct pnfs_block_extent *be, - struct layout_verification *lv) -{ - if (lv->mode == IOMODE_READ) { - if (be->be_state == PNFS_BLOCK_READWRITE_DATA || - be->be_state == PNFS_BLOCK_INVALID_DATA) - return -EIO; - if (be->be_f_offset != lv->start) - return -EIO; - lv->start += be->be_length; - return 0; - } - /* lv->mode == IOMODE_RW */ - if (be->be_state == PNFS_BLOCK_READWRITE_DATA) { - if (be->be_f_offset != lv->start) - return -EIO; - if (lv->cowread > lv->start) - return -EIO; - lv->start += be->be_length; - lv->inval = lv->start; - return 0; - } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) { - if (be->be_f_offset != lv->start) - return -EIO; - lv->start += be->be_length; - return 0; - } else if (be->be_state == PNFS_BLOCK_READ_DATA) { - if (be->be_f_offset > lv->start) - return -EIO; - if (be->be_f_offset < lv->inval) - return -EIO; - if (be->be_f_offset < lv->cowread) - return -EIO; - /* It looks like you might want to min this with lv->start, - * but you really don't. 
- */ - lv->inval = lv->inval + be->be_length; - lv->cowread = be->be_f_offset + be->be_length; - return 0; - } else - return -EIO; -} - -/* XDR decode pnfs_block_layout4 structure */ -int -nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, - struct nfs4_layoutget_res *lgr, gfp_t gfp_flags) -{ - struct pnfs_block_layout *bl = BLK_LO2EXT(lo); - int i, status = -EIO; - uint32_t count; - struct pnfs_block_extent *be = NULL, *save; - struct xdr_stream stream; - struct xdr_buf buf; - struct page *scratch; - __be32 *p; - struct layout_verification lv = { - .mode = lgr->range.iomode, - .start = lgr->range.offset >> SECTOR_SHIFT, - .inval = lgr->range.offset >> SECTOR_SHIFT, - .cowread = lgr->range.offset >> SECTOR_SHIFT, - }; - LIST_HEAD(extents); - - dprintk("---> %s\n", __func__); - - scratch = alloc_page(gfp_flags); - if (!scratch) - return -ENOMEM; - - xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len); - xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); - - p = xdr_inline_decode(&stream, 4); - if (unlikely(!p)) - goto out_err; - - count = be32_to_cpup(p++); - - dprintk("%s enter, number of extents %i\n", __func__, count); - p = xdr_inline_decode(&stream, (28 + NFS4_DEVICEID4_SIZE) * count); - if (unlikely(!p)) - goto out_err; - - /* Decode individual extents, putting them in temporary - * staging area until whole layout is decoded to make error - * recovery easier. - */ - for (i = 0; i < count; i++) { - be = bl_alloc_extent(); - if (!be) { - status = -ENOMEM; - goto out_err; - } - memcpy(&be->be_devid, p, NFS4_DEVICEID4_SIZE); - p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); - be->be_mdev = translate_devid(lo, &be->be_devid); - if (!be->be_mdev) - goto out_err; - - /* The next three values are read in as bytes, - * but stored as 512-byte sector lengths - */ - if (decode_sector_number(&p, &be->be_f_offset) < 0) - goto out_err; - if (decode_sector_number(&p, &be->be_length) < 0) - goto out_err; - if (decode_sector_number(&p, &be->be_v_offset) < 0) - goto out_err; - be->be_state = be32_to_cpup(p++); - if (be->be_state == PNFS_BLOCK_INVALID_DATA) - be->be_inval = &bl->bl_inval; - if (verify_extent(be, &lv)) { - dprintk("%s verify failed\n", __func__); - goto out_err; - } - list_add_tail(&be->be_node, &extents); - } - if (lgr->range.offset + lgr->range.length != - lv.start << SECTOR_SHIFT) { - dprintk("%s Final length mismatch\n", __func__); - be = NULL; - goto out_err; - } - if (lv.start < lv.cowread) { - dprintk("%s Final uncovered COW extent\n", __func__); - be = NULL; - goto out_err; - } - /* Extents decoded properly, now try to merge them in to - * existing layout extents. - */ - spin_lock(&bl->bl_ext_lock); - list_for_each_entry_safe(be, save, &extents, be_node) { - list_del(&be->be_node); - status = bl_add_merge_extent(bl, be); - if (status) { - spin_unlock(&bl->bl_ext_lock); - /* This is a fairly catastrophic error, as the - * entire layout extent lists are now corrupted. - * We should have some way to distinguish this. 
- */ - be = NULL; - goto out_err; - } - } - spin_unlock(&bl->bl_ext_lock); - status = 0; - out: - __free_page(scratch); - dprintk("%s returns %i\n", __func__, status); - return status; - - out_err: - bl_put_extent(be); - while (!list_empty(&extents)) { - be = list_first_entry(&extents, struct pnfs_block_extent, - be_node); - list_del(&be->be_node); - bl_put_extent(be); - } - goto out; -} diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c deleted file mode 100644 index 8999cfddd866..000000000000 --- a/fs/nfs/blocklayout/blocklayoutdm.c +++ /dev/null @@ -1,108 +0,0 @@ -/* - * linux/fs/nfs/blocklayout/blocklayoutdm.c - * - * Module for the NFSv4.1 pNFS block layout driver. - * - * Copyright (c) 2007 The Regents of the University of Michigan. - * All rights reserved. - * - * Fred Isaman <iisaman@umich.edu> - * Andy Adamson <andros@citi.umich.edu> - * - * permission is granted to use, copy, create derivative works and - * redistribute this software and such derivative works for any purpose, - * so long as the name of the university of michigan is not used in - * any advertising or publicity pertaining to the use or distribution - * of this software without specific, written prior authorization. if - * the above copyright notice or any other identification of the - * university of michigan is included in any copy of any portion of - * this software, then the disclaimer below must also be included. - * - * this software is provided as is, without representation from the - * university of michigan as to its fitness for any purpose, and without - * warranty by the university of michigan of any kind, either express - * or implied, including without limitation the implied warranties of - * merchantability and fitness for a particular purpose. the regents - * of the university of michigan shall not be liable for any damages, - * including special, indirect, incidental, or consequential damages, - * with respect to any claim arising out or in connection with the use - * of the software, even if it has been or is hereafter advised of the - * possibility of such damages. 
- */ - -#include <linux/genhd.h> /* gendisk - used in a dprintk*/ -#include <linux/sched.h> -#include <linux/hash.h> - -#include "blocklayout.h" - -#define NFSDBG_FACILITY NFSDBG_PNFS_LD - -static void dev_remove(struct net *net, dev_t dev) -{ - struct bl_pipe_msg bl_pipe_msg; - struct rpc_pipe_msg *msg = &bl_pipe_msg.msg; - struct bl_dev_msg bl_umount_request; - struct bl_msg_hdr bl_msg = { - .type = BL_DEVICE_UMOUNT, - .totallen = sizeof(bl_umount_request), - }; - uint8_t *dataptr; - DECLARE_WAITQUEUE(wq, current); - struct nfs_net *nn = net_generic(net, nfs_net_id); - - dprintk("Entering %s\n", __func__); - - bl_pipe_msg.bl_wq = &nn->bl_wq; - memset(msg, 0, sizeof(*msg)); - msg->len = sizeof(bl_msg) + bl_msg.totallen; - msg->data = kzalloc(msg->len, GFP_NOFS); - if (!msg->data) - goto out; - - memset(&bl_umount_request, 0, sizeof(bl_umount_request)); - bl_umount_request.major = MAJOR(dev); - bl_umount_request.minor = MINOR(dev); - - memcpy(msg->data, &bl_msg, sizeof(bl_msg)); - dataptr = (uint8_t *) msg->data; - memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request)); - - add_wait_queue(&nn->bl_wq, &wq); - if (rpc_queue_upcall(nn->bl_device_pipe, msg) < 0) { - remove_wait_queue(&nn->bl_wq, &wq); - goto out; - } - - set_current_state(TASK_UNINTERRUPTIBLE); - schedule(); - __set_current_state(TASK_RUNNING); - remove_wait_queue(&nn->bl_wq, &wq); - -out: - kfree(msg->data); -} - -/* - * Release meta device - */ -static void nfs4_blk_metadev_release(struct pnfs_block_dev *bdev) -{ - dprintk("%s Releasing\n", __func__); - nfs4_blkdev_put(bdev->bm_mdev); - dev_remove(bdev->net, bdev->bm_mdev->bd_dev); -} - -void bl_free_block_dev(struct pnfs_block_dev *bdev) -{ - if (bdev) { - if (bdev->bm_mdev) { - dprintk("%s Removing DM device: %d:%d\n", - __func__, - MAJOR(bdev->bm_mdev->bd_dev), - MINOR(bdev->bm_mdev->bd_dev)); - nfs4_blk_metadev_release(bdev); - } - kfree(bdev); - } -} diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c new file mode 100644 index 000000000000..5aed4f98df41 --- /dev/null +++ b/fs/nfs/blocklayout/dev.c @@ -0,0 +1,363 @@ +/* + * Copyright (c) 2014 Christoph Hellwig. 
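+ * + * Device discovery for the pNFS block layout driver: decodes the + * server's volume topology (simple, slice, concat, stripe) from XDR + * and builds a tree of pnfs_block_dev nodes whose ->map callbacks + * translate a file offset into a block device and disk offset.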
+ */ +#include <linux/sunrpc/svc.h> +#include <linux/blkdev.h> +#include <linux/nfs4.h> +#include <linux/nfs_fs.h> +#include <linux/nfs_xdr.h> + +#include "blocklayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +static void +bl_free_device(struct pnfs_block_dev *dev) +{ + if (dev->nr_children) { + int i; + + for (i = 0; i < dev->nr_children; i++) + bl_free_device(&dev->children[i]); + kfree(dev->children); + } else { + if (dev->bdev) + blkdev_put(dev->bdev, FMODE_READ); + } +} + +void +bl_free_deviceid_node(struct nfs4_deviceid_node *d) +{ + struct pnfs_block_dev *dev = + container_of(d, struct pnfs_block_dev, node); + + bl_free_device(dev); + kfree(dev); +} + +static int +nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b) +{ + __be32 *p; + int i; + + p = xdr_inline_decode(xdr, 4); + if (!p) + return -EIO; + b->type = be32_to_cpup(p++); + + switch (b->type) { + case PNFS_BLOCK_VOLUME_SIMPLE: + p = xdr_inline_decode(xdr, 4); + if (!p) + return -EIO; + b->simple.nr_sigs = be32_to_cpup(p++); + if (!b->simple.nr_sigs || b->simple.nr_sigs > PNFS_BLOCK_MAX_UUIDS) { + dprintk("bad signature count: %d\n", b->simple.nr_sigs); + return -EIO; + } + + b->simple.len = 4 + 4; + for (i = 0; i < b->simple.nr_sigs; i++) { + p = xdr_inline_decode(xdr, 8 + 4); + if (!p) + return -EIO; + p = xdr_decode_hyper(p, &b->simple.sigs[i].offset); + b->simple.sigs[i].sig_len = be32_to_cpup(p++); + if (b->simple.sigs[i].sig_len > sizeof(b->simple.sigs[i].sig)) { + dprintk("signature too long: %d\n", + b->simple.sigs[i].sig_len); + return -EIO; + } + + p = xdr_inline_decode(xdr, b->simple.sigs[i].sig_len); + if (!p) + return -EIO; + memcpy(&b->simple.sigs[i].sig, p, + b->simple.sigs[i].sig_len); + + b->simple.len += 8 + 4 + b->simple.sigs[i].sig_len; + } + break; + case PNFS_BLOCK_VOLUME_SLICE: + p = xdr_inline_decode(xdr, 8 + 8 + 4); + if (!p) + return -EIO; + p = xdr_decode_hyper(p, &b->slice.start); + p = xdr_decode_hyper(p, &b->slice.len); + b->slice.volume = be32_to_cpup(p++); + break; + case PNFS_BLOCK_VOLUME_CONCAT: + p = xdr_inline_decode(xdr, 4); + if (!p) + return -EIO; + b->concat.volumes_count = be32_to_cpup(p++); + if (b->concat.volumes_count > PNFS_BLOCK_MAX_DEVICES) { + dprintk("too many volumes: %d\n", b->concat.volumes_count); + return -EIO; + } + + p = xdr_inline_decode(xdr, b->concat.volumes_count * 4); + if (!p) + return -EIO; + for (i = 0; i < b->concat.volumes_count; i++) + b->concat.volumes[i] = be32_to_cpup(p++); + break; + case PNFS_BLOCK_VOLUME_STRIPE: + p = xdr_inline_decode(xdr, 8 + 4); + if (!p) + return -EIO; + p = xdr_decode_hyper(p, &b->stripe.chunk_size); + b->stripe.volumes_count = be32_to_cpup(p++); + if (b->stripe.volumes_count > PNFS_BLOCK_MAX_DEVICES) { + dprintk("too many volumes: %d\n", b->stripe.volumes_count); + return -EIO; + } + + p = xdr_inline_decode(xdr, b->stripe.volumes_count * 4); + if (!p) + return -EIO; + for (i = 0; i < b->stripe.volumes_count; i++) + b->stripe.volumes[i] = be32_to_cpup(p++); + break; + default: + dprintk("unknown volume type!\n"); + return -EIO; + } + + return 0; +} + +static bool bl_map_simple(struct pnfs_block_dev *dev, u64 offset, + struct pnfs_block_dev_map *map) +{ + map->start = dev->start; + map->len = dev->len; + map->disk_offset = dev->disk_offset; + map->bdev = dev->bdev; + return true; +} + +static bool bl_map_concat(struct pnfs_block_dev *dev, u64 offset, + struct pnfs_block_dev_map *map) +{ + int i; + + for (i = 0; i < dev->nr_children; i++) { + struct pnfs_block_dev *child = &dev->children[i]; + + if (child->start > offset || + child->start + child->len <= offset) + continue; + + child->map(child, offset - child->start, map); + return true; + } + + dprintk("%s: ran off loop!\n", __func__); + return false; +} + +static bool bl_map_stripe(struct pnfs_block_dev *dev, u64 offset, + struct pnfs_block_dev_map *map) +{ + struct pnfs_block_dev *child; + u64 chunk; + u32 chunk_idx; + u64 disk_offset; + + chunk = div_u64(offset, dev->chunk_size); + div_u64_rem(chunk, dev->nr_children, &chunk_idx); + + if 
(chunk_idx >= dev->nr_children) { + dprintk("%s: invalid chunk idx %d (%lld/%lld)\n", + __func__, chunk_idx, offset, dev->chunk_size); + /* error, should not happen */ + return false; + } + + /* truncate offset to the beginning of the stripe */ + offset = chunk * dev->chunk_size; + + /* disk offset of the stripe */ + disk_offset = div_u64(offset, dev->nr_children); + + child = &dev->children[chunk_idx]; + child->map(child, disk_offset, map); + + map->start += offset; + map->disk_offset += disk_offset; + map->len = dev->chunk_size; + return true; +} + +static int +bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d, + struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask); + + +static int +bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d, + struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) +{ + struct pnfs_block_volume *v = &volumes[idx]; + dev_t dev; + + dev = bl_resolve_deviceid(server, v, gfp_mask); + if (!dev) + return -EIO; + + d->bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL); + if (IS_ERR(d->bdev)) { + printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n", + MAJOR(dev), MINOR(dev), PTR_ERR(d->bdev)); + return PTR_ERR(d->bdev); + } + + + d->len = i_size_read(d->bdev->bd_inode); + d->map = bl_map_simple; + + printk(KERN_INFO "pNFS: using block device %s\n", + d->bdev->bd_disk->disk_name); + return 0; +} + +static int +bl_parse_slice(struct nfs_server *server, struct pnfs_block_dev *d, + struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) +{ + struct pnfs_block_volume *v = &volumes[idx]; + int ret; + + ret = bl_parse_deviceid(server, d, volumes, v->slice.volume, gfp_mask); + if (ret) + return ret; + + d->disk_offset = v->slice.start; + d->len = v->slice.len; + return 0; +} + +static int +bl_parse_concat(struct nfs_server *server, struct pnfs_block_dev *d, + struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) +{ + struct pnfs_block_volume *v = &volumes[idx]; + u64 len = 0; + int ret, i; + + d->children = kcalloc(v->concat.volumes_count, + sizeof(struct pnfs_block_dev), GFP_KERNEL); + if (!d->children) + return -ENOMEM; + + for (i = 0; i < v->concat.volumes_count; i++) { + ret = bl_parse_deviceid(server, &d->children[i], + volumes, v->concat.volumes[i], gfp_mask); + if (ret) + return ret; + + d->nr_children++; + d->children[i].start += len; + len += d->children[i].len; + } + + d->len = len; + d->map = bl_map_concat; + return 0; +} + +static int +bl_parse_stripe(struct nfs_server *server, struct pnfs_block_dev *d, + struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) +{ + struct pnfs_block_volume *v = &volumes[idx]; + u64 len = 0; + int ret, i; + + d->children = kcalloc(v->stripe.volumes_count, + sizeof(struct pnfs_block_dev), GFP_KERNEL); + if (!d->children) + return -ENOMEM; + + for (i = 0; i < v->stripe.volumes_count; i++) { + ret = bl_parse_deviceid(server, &d->children[i], + volumes, v->stripe.volumes[i], gfp_mask); + if (ret) + return ret; + + d->nr_children++; + len += d->children[i].len; + } + + d->len = len; + d->chunk_size = v->stripe.chunk_size; + d->map = bl_map_stripe; + return 0; +} + +static int +bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d, + struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) +{ + switch (volumes[idx].type) { + case PNFS_BLOCK_VOLUME_SIMPLE: + return bl_parse_simple(server, d, volumes, idx, gfp_mask); + case PNFS_BLOCK_VOLUME_SLICE: + return bl_parse_slice(server, d, volumes, idx, gfp_mask); + case PNFS_BLOCK_VOLUME_CONCAT: + return 
bl_parse_concat(server, d, volumes, idx, gfp_mask); + case PNFS_BLOCK_VOLUME_STRIPE: + return bl_parse_stripe(server, d, volumes, idx, gfp_mask); + default: + dprintk("unsupported volume type: %d\n", volumes[idx].type); + return -EIO; + } +} + +struct nfs4_deviceid_node * +bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, + gfp_t gfp_mask) +{ + struct nfs4_deviceid_node *node = NULL; + struct pnfs_block_volume *volumes; + struct pnfs_block_dev *top; + struct xdr_stream xdr; + struct xdr_buf buf; + struct page *scratch; + int nr_volumes, ret, i; + __be32 *p; + + scratch = alloc_page(gfp_mask); + if (!scratch) + goto out; + + xdr_init_decode_pages(&xdr, &buf, pdev->pages, pdev->pglen); + xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE); + + p = xdr_inline_decode(&xdr, sizeof(__be32)); + if (!p) + goto out_free_scratch; + nr_volumes = be32_to_cpup(p++); + + volumes = kcalloc(nr_volumes, sizeof(struct pnfs_block_volume), + gfp_mask); + if (!volumes) + goto out_free_scratch; + + for (i = 0; i < nr_volumes; i++) { + ret = nfs4_block_decode_volume(&xdr, &volumes[i]); + if (ret < 0) + goto out_free_volumes; + } + + top = kzalloc(sizeof(*top), gfp_mask); + if (!top) + goto out_free_volumes; + + ret = bl_parse_deviceid(server, top, volumes, nr_volumes - 1, gfp_mask); + if (ret) { + bl_free_device(top); + kfree(top); + goto out_free_volumes; + } + + node = &top->node; + nfs4_init_deviceid_node(node, server, &pdev->dev_id); + +out_free_volumes: + kfree(volumes); +out_free_scratch: + __free_page(scratch); +out: + return node; +} diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c new file mode 100644 index 000000000000..31d0b5e53dfd --- /dev/null +++ b/fs/nfs/blocklayout/extent_tree.c @@ -0,0 +1,602 @@ +/* + * Copyright (c) 2014 Christoph Hellwig. + */ + +#include <linux/vmalloc.h> + +#include "blocklayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +static inline struct pnfs_block_extent * +ext_node(struct rb_node *node) +{ + return rb_entry(node, struct pnfs_block_extent, be_node); +} + +static struct pnfs_block_extent * +ext_tree_first(struct rb_root *root) +{ + struct rb_node *node = rb_first(root); + return node ? ext_node(node) : NULL; +} + +static struct pnfs_block_extent * +ext_tree_prev(struct pnfs_block_extent *be) +{ + struct rb_node *node = rb_prev(&be->be_node); + return node ? ext_node(node) : NULL; +} + +static struct pnfs_block_extent * +ext_tree_next(struct pnfs_block_extent *be) +{ + struct rb_node *node = rb_next(&be->be_node); + return node ? 
ext_node(node) : NULL; +} + +static inline sector_t +ext_f_end(struct pnfs_block_extent *be) +{ + return be->be_f_offset + be->be_length; +} + +static struct pnfs_block_extent * +__ext_tree_search(struct rb_root *root, sector_t start) +{ + struct rb_node *node = root->rb_node; + struct pnfs_block_extent *be = NULL; + + while (node) { + be = ext_node(node); + if (start < be->be_f_offset) + node = node->rb_left; + else if (start >= ext_f_end(be)) + node = node->rb_right; + else + return be; + } + + if (be) { + if (start < be->be_f_offset) + return be; + + if (start >= ext_f_end(be)) + return ext_tree_next(be); + } + + return NULL; +} + +static bool +ext_can_merge(struct pnfs_block_extent *be1, struct pnfs_block_extent *be2) +{ + if (be1->be_state != be2->be_state) + return false; + if (be1->be_device != be2->be_device) + return false; + + if (be1->be_f_offset + be1->be_length != be2->be_f_offset) + return false; + + if (be1->be_state != PNFS_BLOCK_NONE_DATA && + (be1->be_v_offset + be1->be_length != be2->be_v_offset)) + return false; + + if (be1->be_state == PNFS_BLOCK_INVALID_DATA && + be1->be_tag != be2->be_tag) + return false; + + return true; +} + +static struct pnfs_block_extent * +ext_try_to_merge_left(struct rb_root *root, struct pnfs_block_extent *be) +{ + struct pnfs_block_extent *left = ext_tree_prev(be); + + if (left && ext_can_merge(left, be)) { + left->be_length += be->be_length; + rb_erase(&be->be_node, root); + nfs4_put_deviceid_node(be->be_device); + kfree(be); + return left; + } + + return be; +} + +static struct pnfs_block_extent * +ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be) +{ + struct pnfs_block_extent *right = ext_tree_next(be); + + if (right && ext_can_merge(be, right)) { + be->be_length += right->be_length; + rb_erase(&right->be_node, root); + nfs4_put_deviceid_node(right->be_device); + kfree(right); + } + + return be; +} + +static void +__ext_tree_insert(struct rb_root *root, + struct pnfs_block_extent *new, bool merge_ok) +{ + struct rb_node **p = &root->rb_node, *parent = NULL; + struct pnfs_block_extent *be; + + while (*p) { + parent = *p; + be = ext_node(parent); + + if (new->be_f_offset < be->be_f_offset) { + if (merge_ok && ext_can_merge(new, be)) { + be->be_f_offset = new->be_f_offset; + if (be->be_state != PNFS_BLOCK_NONE_DATA) + be->be_v_offset = new->be_v_offset; + be->be_length += new->be_length; + be = ext_try_to_merge_left(root, be); + goto free_new; + } + p = &(*p)->rb_left; + } else if (new->be_f_offset >= ext_f_end(be)) { + if (merge_ok && ext_can_merge(be, new)) { + be->be_length += new->be_length; + be = ext_try_to_merge_right(root, be); + goto free_new; + } + p = &(*p)->rb_right; + } else { + BUG(); + } + } + + rb_link_node(&new->be_node, parent, p); + rb_insert_color(&new->be_node, root); + return; +free_new: + nfs4_put_deviceid_node(new->be_device); + kfree(new); +} + +static int +__ext_tree_remove(struct rb_root *root, sector_t start, sector_t end) +{ + struct pnfs_block_extent *be; + sector_t len1 = 0, len2 = 0; + sector_t orig_v_offset; + sector_t orig_len; + + be = __ext_tree_search(root, start); + if (!be) + return 0; + if (be->be_f_offset >= end) + return 0; + + orig_v_offset = be->be_v_offset; + orig_len = be->be_length; + + if (start > be->be_f_offset) + len1 = start - be->be_f_offset; + if (ext_f_end(be) > end) + len2 = ext_f_end(be) - end; + + if (len2 > 0) { + if (len1 > 0) { + struct pnfs_block_extent *new; + + new = kzalloc(sizeof(*new), GFP_ATOMIC); + if (!new) + return -ENOMEM; + + be->be_length = 
len1; + + new->be_f_offset = end; + if (be->be_state != PNFS_BLOCK_NONE_DATA) { + new->be_v_offset = + orig_v_offset + orig_len - len2; + } + new->be_length = len2; + new->be_state = be->be_state; + new->be_tag = be->be_tag; + new->be_device = nfs4_get_deviceid(be->be_device); + + __ext_tree_insert(root, new, true); + } else { + be->be_f_offset = end; + if (be->be_state != PNFS_BLOCK_NONE_DATA) { + be->be_v_offset = + orig_v_offset + orig_len - len2; + } + be->be_length = len2; + } + } else { + if (len1 > 0) { + be->be_length = len1; + be = ext_tree_next(be); + } + + while (be && ext_f_end(be) <= end) { + struct pnfs_block_extent *next = ext_tree_next(be); + + rb_erase(&be->be_node, root); + nfs4_put_deviceid_node(be->be_device); + kfree(be); + be = next; + } + + if (be && be->be_f_offset < end) { + len1 = ext_f_end(be) - end; + be->be_f_offset = end; + if (be->be_state != PNFS_BLOCK_NONE_DATA) + be->be_v_offset += be->be_length - len1; + be->be_length = len1; + } + } + + return 0; +} + +int +ext_tree_insert(struct pnfs_block_layout *bl, struct pnfs_block_extent *new) +{ + struct pnfs_block_extent *be; + struct rb_root *root; + int err = 0; + + switch (new->be_state) { + case PNFS_BLOCK_READWRITE_DATA: + case PNFS_BLOCK_INVALID_DATA: + root = &bl->bl_ext_rw; + break; + case PNFS_BLOCK_READ_DATA: + case PNFS_BLOCK_NONE_DATA: + root = &bl->bl_ext_ro; + break; + default: + dprintk("invalid extent type\n"); + return -EINVAL; + } + + spin_lock(&bl->bl_ext_lock); +retry: + be = __ext_tree_search(root, new->be_f_offset); + if (!be || be->be_f_offset >= ext_f_end(new)) { + __ext_tree_insert(root, new, true); + } else if (new->be_f_offset >= be->be_f_offset) { + if (ext_f_end(new) <= ext_f_end(be)) { + nfs4_put_deviceid_node(new->be_device); + kfree(new); + } else { + sector_t new_len = ext_f_end(new) - ext_f_end(be); + sector_t diff = new->be_length - new_len; + + new->be_f_offset += diff; + new->be_v_offset += diff; + new->be_length = new_len; + goto retry; + } + } else if (ext_f_end(new) <= ext_f_end(be)) { + new->be_length = be->be_f_offset - new->be_f_offset; + __ext_tree_insert(root, new, true); + } else { + struct pnfs_block_extent *split; + sector_t new_len = ext_f_end(new) - ext_f_end(be); + sector_t diff = new->be_length - new_len; + + split = kmemdup(new, sizeof(*new), GFP_ATOMIC); + if (!split) { + err = -EINVAL; + goto out; + } + + split->be_length = be->be_f_offset - split->be_f_offset; + split->be_device = nfs4_get_deviceid(new->be_device); + __ext_tree_insert(root, split, true); + + new->be_f_offset += diff; + new->be_v_offset += diff; + new->be_length = new_len; + goto retry; + } +out: + spin_unlock(&bl->bl_ext_lock); + return err; +} + +static bool +__ext_tree_lookup(struct rb_root *root, sector_t isect, + struct pnfs_block_extent *ret) +{ + struct rb_node *node; + struct pnfs_block_extent *be; + + node = root->rb_node; + while (node) { + be = ext_node(node); + if (isect < be->be_f_offset) + node = node->rb_left; + else if (isect >= ext_f_end(be)) + node = node->rb_right; + else { + *ret = *be; + return true; + } + } + + return false; +} + +bool +ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect, + struct pnfs_block_extent *ret, bool rw) +{ + bool found = false; + + spin_lock(&bl->bl_ext_lock); + if (!rw) + found = __ext_tree_lookup(&bl->bl_ext_ro, isect, ret); + if (!found) + found = __ext_tree_lookup(&bl->bl_ext_rw, isect, ret); + spin_unlock(&bl->bl_ext_lock); + + return found; +} + +int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, + sector_t start, 
sector_t end) +{ + int err, err2; + + spin_lock(&bl->bl_ext_lock); + err = __ext_tree_remove(&bl->bl_ext_ro, start, end); + if (rw) { + err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end); + if (!err) + err = err2; + } + spin_unlock(&bl->bl_ext_lock); + + return err; +} + +static int +ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be, + sector_t split) +{ + struct pnfs_block_extent *new; + sector_t orig_len = be->be_length; + + new = kzalloc(sizeof(*new), GFP_ATOMIC); + if (!new) + return -ENOMEM; + + be->be_length = split - be->be_f_offset; + + new->be_f_offset = split; + if (be->be_state != PNFS_BLOCK_NONE_DATA) + new->be_v_offset = be->be_v_offset + be->be_length; + new->be_length = orig_len - be->be_length; + new->be_state = be->be_state; + new->be_tag = be->be_tag; + new->be_device = nfs4_get_deviceid(be->be_device); + + __ext_tree_insert(root, new, false); + return 0; +} + +int +ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start, + sector_t len) +{ + struct rb_root *root = &bl->bl_ext_rw; + sector_t end = start + len; + struct pnfs_block_extent *be; + int err = 0; + + spin_lock(&bl->bl_ext_lock); + /* + * First remove all COW extents or holes from written to range. + */ + err = __ext_tree_remove(&bl->bl_ext_ro, start, end); + if (err) + goto out; + + /* + * Then mark all invalid extents in the range as written to. + */ + for (be = __ext_tree_search(root, start); be; be = ext_tree_next(be)) { + if (be->be_f_offset >= end) + break; + + if (be->be_state != PNFS_BLOCK_INVALID_DATA || be->be_tag) + continue; + + if (be->be_f_offset < start) { + struct pnfs_block_extent *left = ext_tree_prev(be); + + if (left && ext_can_merge(left, be)) { + sector_t diff = start - be->be_f_offset; + + left->be_length += diff; + + be->be_f_offset += diff; + be->be_v_offset += diff; + be->be_length -= diff; + } else { + err = ext_tree_split(root, be, start); + if (err) + goto out; + } + } + + if (ext_f_end(be) > end) { + struct pnfs_block_extent *right = ext_tree_next(be); + + if (right && ext_can_merge(be, right)) { + sector_t diff = end - be->be_f_offset; + + be->be_length -= diff; + + right->be_f_offset -= diff; + right->be_v_offset -= diff; + right->be_length += diff; + } else { + err = ext_tree_split(root, be, end); + if (err) + goto out; + } + } + + if (be->be_f_offset >= start && ext_f_end(be) <= end) { + be->be_tag = EXTENT_WRITTEN; + be = ext_try_to_merge_left(root, be); + be = ext_try_to_merge_right(root, be); + } + } +out: + spin_unlock(&bl->bl_ext_lock); + return err; +} + +static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg, + size_t buffer_size) +{ + if (arg->layoutupdate_pages != &arg->layoutupdate_page) { + int nr_pages = DIV_ROUND_UP(buffer_size, PAGE_SIZE), i; + + for (i = 0; i < nr_pages; i++) + put_page(arg->layoutupdate_pages[i]); + kfree(arg->layoutupdate_pages); + } else { + put_page(arg->layoutupdate_page); + } +} + +static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p, + size_t buffer_size, size_t *count) +{ + struct pnfs_block_extent *be; + int ret = 0; + + spin_lock(&bl->bl_ext_lock); + for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) { + if (be->be_state != PNFS_BLOCK_INVALID_DATA || + be->be_tag != EXTENT_WRITTEN) + continue; + + (*count)++; + if (*count * BL_EXTENT_SIZE > buffer_size) { + /* keep counting.. 
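so that ext_tree_prepare_commit() can size an exact retry buffer of count * BL_EXTENT_SIZE plus the leading count word; see the -ENOSPC handling below. 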
*/ + ret = -ENOSPC; + continue; + } + + p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data, + NFS4_DEVICEID4_SIZE); + p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT); + p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT); + p = xdr_encode_hyper(p, 0LL); + *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA); + + be->be_tag = EXTENT_COMMITTING; + } + spin_unlock(&bl->bl_ext_lock); + + return ret; +} + +int +ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg) +{ + struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout); + size_t count = 0, buffer_size = PAGE_SIZE; + __be32 *start_p; + int ret; + + dprintk("%s enter\n", __func__); + + arg->layoutupdate_page = alloc_page(GFP_NOFS); + if (!arg->layoutupdate_page) + return -ENOMEM; + start_p = page_address(arg->layoutupdate_page); + arg->layoutupdate_pages = &arg->layoutupdate_page; + +retry: + ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count); + if (unlikely(ret)) { + ext_tree_free_commitdata(arg, buffer_size); + + buffer_size = sizeof(__be32) + BL_EXTENT_SIZE * count; + count = 0; + + arg->layoutupdate_pages = + kcalloc(DIV_ROUND_UP(buffer_size, PAGE_SIZE), + sizeof(struct page *), GFP_NOFS); + if (!arg->layoutupdate_pages) + return -ENOMEM; + + start_p = __vmalloc(buffer_size, GFP_NOFS, PAGE_KERNEL); + if (!start_p) { + kfree(arg->layoutupdate_pages); + return -ENOMEM; + } + + goto retry; + } + + *start_p = cpu_to_be32(count); + arg->layoutupdate_len = sizeof(__be32) + BL_EXTENT_SIZE * count; + + if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) { + void *p = start_p, *end = p + arg->layoutupdate_len; + int i = 0; + + for ( ; p < end; p += PAGE_SIZE) + arg->layoutupdate_pages[i++] = vmalloc_to_page(p); + } + + dprintk("%s found %zu ranges\n", __func__, count); + return 0; +} + +void +ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status) +{ + struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout); + struct rb_root *root = &bl->bl_ext_rw; + struct pnfs_block_extent *be; + + dprintk("%s status %d\n", __func__, status); + + ext_tree_free_commitdata(arg, arg->layoutupdate_len); + + spin_lock(&bl->bl_ext_lock); + for (be = ext_tree_first(root); be; be = ext_tree_next(be)) { + if (be->be_state != PNFS_BLOCK_INVALID_DATA || + be->be_tag != EXTENT_COMMITTING) + continue; + + if (status) { + /* + * Mark as written and try again. + * + * XXX: some real error handling here wouldn't hurt.. + */ + be->be_tag = EXTENT_WRITTEN; + } else { + be->be_state = PNFS_BLOCK_READWRITE_DATA; + be->be_tag = 0; + } + + be = ext_try_to_merge_left(root, be); + be = ext_try_to_merge_right(root, be); + } + spin_unlock(&bl->bl_ext_lock); +} diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c deleted file mode 100644 index 4d0161442565..000000000000 --- a/fs/nfs/blocklayout/extents.c +++ /dev/null @@ -1,908 +0,0 @@ -/* - * linux/fs/nfs/blocklayout/blocklayout.h - * - * Module for the NFSv4.1 pNFS block layout driver. - * - * Copyright (c) 2006 The Regents of the University of Michigan. - * All rights reserved. - * - * Andy Adamson <andros@citi.umich.edu> - * Fred Isaman <iisaman@umich.edu> - * - * permission is granted to use, copy, create derivative works and - * redistribute this software and such derivative works for any purpose, - * so long as the name of the university of michigan is not used in - * any advertising or publicity pertaining to the use or distribution - * of this software without specific, written prior authorization. 
if - * the above copyright notice or any other identification of the - * university of michigan is included in any copy of any portion of - * this software, then the disclaimer below must also be included. - * - * this software is provided as is, without representation from the - * university of michigan as to its fitness for any purpose, and without - * warranty by the university of michigan of any kind, either express - * or implied, including without limitation the implied warranties of - * merchantability and fitness for a particular purpose. the regents - * of the university of michigan shall not be liable for any damages, - * including special, indirect, incidental, or consequential damages, - * with respect to any claim arising out or in connection with the use - * of the software, even if it has been or is hereafter advised of the - * possibility of such damages. - */ - -#include "blocklayout.h" -#define NFSDBG_FACILITY NFSDBG_PNFS_LD - -/* Bit numbers */ -#define EXTENT_INITIALIZED 0 -#define EXTENT_WRITTEN 1 -#define EXTENT_IN_COMMIT 2 -#define INTERNAL_EXISTS MY_MAX_TAGS -#define INTERNAL_MASK ((1 << INTERNAL_EXISTS) - 1) - -/* Returns largest t<=s s.t. t%base==0 */ -static inline sector_t normalize(sector_t s, int base) -{ - sector_t tmp = s; /* Since do_div modifies its argument */ - return s - sector_div(tmp, base); -} - -static inline sector_t normalize_up(sector_t s, int base) -{ - return normalize(s + base - 1, base); -} - -/* Complete stub using list while determine API wanted */ - -/* Returns tags, or negative */ -static int32_t _find_entry(struct my_tree *tree, u64 s) -{ - struct pnfs_inval_tracking *pos; - - dprintk("%s(%llu) enter\n", __func__, s); - list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { - if (pos->it_sector > s) - continue; - else if (pos->it_sector == s) - return pos->it_tags & INTERNAL_MASK; - else - break; - } - return -ENOENT; -} - -static inline -int _has_tag(struct my_tree *tree, u64 s, int32_t tag) -{ - int32_t tags; - - dprintk("%s(%llu, %i) enter\n", __func__, s, tag); - s = normalize(s, tree->mtt_step_size); - tags = _find_entry(tree, s); - if ((tags < 0) || !(tags & (1 << tag))) - return 0; - else - return 1; -} - -/* Creates entry with tag, or if entry already exists, unions tag to it. - * If storage is not NULL, newly created entry will use it. - * Returns number of entries added, or negative on error. 
- */ -static int _add_entry(struct my_tree *tree, u64 s, int32_t tag, - struct pnfs_inval_tracking *storage) -{ - int found = 0; - struct pnfs_inval_tracking *pos; - - dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage); - list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { - if (pos->it_sector > s) - continue; - else if (pos->it_sector == s) { - found = 1; - break; - } else - break; - } - if (found) { - pos->it_tags |= (1 << tag); - return 0; - } else { - struct pnfs_inval_tracking *new; - new = storage; - new->it_sector = s; - new->it_tags = (1 << tag); - list_add(&new->it_link, &pos->it_link); - return 1; - } -} - -/* XXXX Really want option to not create */ -/* Over range, unions tag with existing entries, else creates entry with tag */ -static int _set_range(struct my_tree *tree, int32_t tag, u64 s, u64 length) -{ - u64 i; - - dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length); - for (i = normalize(s, tree->mtt_step_size); i < s + length; - i += tree->mtt_step_size) - if (_add_entry(tree, i, tag, NULL)) - return -ENOMEM; - return 0; -} - -/* Ensure that future operations on given range of tree will not malloc */ -static int _preload_range(struct pnfs_inval_markings *marks, - u64 offset, u64 length) -{ - u64 start, end, s; - int count, i, used = 0, status = -ENOMEM; - struct pnfs_inval_tracking **storage; - struct my_tree *tree = &marks->im_tree; - - dprintk("%s(%llu, %llu) enter\n", __func__, offset, length); - start = normalize(offset, tree->mtt_step_size); - end = normalize_up(offset + length, tree->mtt_step_size); - count = (int)(end - start) / (int)tree->mtt_step_size; - - /* Pre-malloc what memory we might need */ - storage = kcalloc(count, sizeof(*storage), GFP_NOFS); - if (!storage) - return -ENOMEM; - for (i = 0; i < count; i++) { - storage[i] = kmalloc(sizeof(struct pnfs_inval_tracking), - GFP_NOFS); - if (!storage[i]) - goto out_cleanup; - } - - spin_lock_bh(&marks->im_lock); - for (s = start; s < end; s += tree->mtt_step_size) - used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]); - spin_unlock_bh(&marks->im_lock); - - status = 0; - - out_cleanup: - for (i = used; i < count; i++) { - if (!storage[i]) - break; - kfree(storage[i]); - } - kfree(storage); - return status; -} - -/* We are relying on page lock to serialize this */ -int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect) -{ - int rv; - - spin_lock_bh(&marks->im_lock); - rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED); - spin_unlock_bh(&marks->im_lock); - return rv; -} - -/* Assume start, end already sector aligned */ -static int -_range_has_tag(struct my_tree *tree, u64 start, u64 end, int32_t tag) -{ - struct pnfs_inval_tracking *pos; - u64 expect = 0; - - dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag); - list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { - if (pos->it_sector >= end) - continue; - if (!expect) { - if ((pos->it_sector == end - tree->mtt_step_size) && - (pos->it_tags & (1 << tag))) { - expect = pos->it_sector - tree->mtt_step_size; - if (pos->it_sector < tree->mtt_step_size || expect < start) - return 1; - continue; - } else { - return 0; - } - } - if (pos->it_sector != expect || !(pos->it_tags & (1 << tag))) - return 0; - expect -= tree->mtt_step_size; - if (expect < start) - return 1; - } - return 0; -} - -static int is_range_written(struct pnfs_inval_markings *marks, - sector_t start, sector_t end) -{ - int rv; - - spin_lock_bh(&marks->im_lock); - rv = _range_has_tag(&marks->im_tree, start, end, 
EXTENT_WRITTEN); - spin_unlock_bh(&marks->im_lock); - return rv; -} - -/* Marks sectors in [offset, offset+length) as having been initialized. - * All lengths are step-aligned, where step is min(pagesize, blocksize). - * Currently assumes offset is page-aligned - */ -int bl_mark_sectors_init(struct pnfs_inval_markings *marks, - sector_t offset, sector_t length) -{ - sector_t start, end; - - dprintk("%s(offset=%llu,len=%llu) enter\n", - __func__, (u64)offset, (u64)length); - - start = normalize(offset, marks->im_block_size); - end = normalize_up(offset + length, marks->im_block_size); - if (_preload_range(marks, start, end - start)) - goto outerr; - - spin_lock_bh(&marks->im_lock); - if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length)) - goto out_unlock; - spin_unlock_bh(&marks->im_lock); - - return 0; - -out_unlock: - spin_unlock_bh(&marks->im_lock); -outerr: - return -ENOMEM; -} - -/* Marks sectors in [offset, offset+length) as having been written to disk. - * All lengths should be block aligned. - */ -static int mark_written_sectors(struct pnfs_inval_markings *marks, - sector_t offset, sector_t length) -{ - int status; - - dprintk("%s(offset=%llu,len=%llu) enter\n", __func__, - (u64)offset, (u64)length); - spin_lock_bh(&marks->im_lock); - status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length); - spin_unlock_bh(&marks->im_lock); - return status; -} - -static void print_short_extent(struct pnfs_block_short_extent *be) -{ - dprintk("PRINT SHORT EXTENT extent %p\n", be); - if (be) { - dprintk(" be_f_offset %llu\n", (u64)be->bse_f_offset); - dprintk(" be_length %llu\n", (u64)be->bse_length); - } -} - -static void print_clist(struct list_head *list, unsigned int count) -{ - struct pnfs_block_short_extent *be; - unsigned int i = 0; - - ifdebug(FACILITY) { - printk(KERN_DEBUG "****************\n"); - printk(KERN_DEBUG "Extent list looks like:\n"); - list_for_each_entry(be, list, bse_node) { - i++; - print_short_extent(be); - } - if (i != count) - printk(KERN_DEBUG "\n\nExpected %u entries\n\n\n", count); - printk(KERN_DEBUG "****************\n"); - } -} - -/* Note: In theory, we should do more checking that devids match between - * old and new, but if they don't, the lists are too corrupt to salvage anyway. - */ -/* Note this is very similar to bl_add_merge_extent */ -static void add_to_commitlist(struct pnfs_block_layout *bl, - struct pnfs_block_short_extent *new) -{ - struct list_head *clist = &bl->bl_commit; - struct pnfs_block_short_extent *old, *save; - sector_t end = new->bse_f_offset + new->bse_length; - - dprintk("%s enter\n", __func__); - print_short_extent(new); - print_clist(clist, bl->bl_count); - bl->bl_count++; - /* Scan for proper place to insert, extending new to the left - * as much as possible. - */ - list_for_each_entry_safe(old, save, clist, bse_node) { - if (new->bse_f_offset < old->bse_f_offset) - break; - if (end <= old->bse_f_offset + old->bse_length) { - /* Range is already in list */ - bl->bl_count--; - kfree(new); - return; - } else if (new->bse_f_offset <= - old->bse_f_offset + old->bse_length) { - /* new overlaps or abuts existing be */ - if (new->bse_mdev == old->bse_mdev) { - /* extend new to fully replace old */ - new->bse_length += new->bse_f_offset - - old->bse_f_offset; - new->bse_f_offset = old->bse_f_offset; - list_del(&old->bse_node); - bl->bl_count--; - kfree(old); - } - } - } - /* Note that if we never hit the above break, old will not point to a - * valid extent. However, in that case &old->bse_node==list.
- */ - list_add_tail(&new->bse_node, &old->bse_node); - /* Scan forward for overlaps. If we find any, extend new and - * remove the overlapped extent. - */ - old = list_prepare_entry(new, clist, bse_node); - list_for_each_entry_safe_continue(old, save, clist, bse_node) { - if (end < old->bse_f_offset) - break; - /* new overlaps or abuts old */ - if (new->bse_mdev == old->bse_mdev) { - if (end < old->bse_f_offset + old->bse_length) { - /* extend new to fully cover old */ - end = old->bse_f_offset + old->bse_length; - new->bse_length = end - new->bse_f_offset; - } - list_del(&old->bse_node); - bl->bl_count--; - kfree(old); - } - } - dprintk("%s: after merging\n", __func__); - print_clist(clist, bl->bl_count); -} - -/* Note the range described by offset, length is guaranteed to be contained - * within be. - * new will be freed, either by this function or add_to_commitlist if they - * decide not to use it, or after LAYOUTCOMMIT uses it in the commitlist. - */ -int bl_mark_for_commit(struct pnfs_block_extent *be, - sector_t offset, sector_t length, - struct pnfs_block_short_extent *new) -{ - sector_t new_end, end = offset + length; - struct pnfs_block_layout *bl = container_of(be->be_inval, - struct pnfs_block_layout, - bl_inval); - - mark_written_sectors(be->be_inval, offset, length); - /* We want to add the range to commit list, but it must be - * block-normalized, and verified that the normalized range has - * been entirely written to disk. - */ - new->bse_f_offset = offset; - offset = normalize(offset, bl->bl_blocksize); - if (offset < new->bse_f_offset) { - if (is_range_written(be->be_inval, offset, new->bse_f_offset)) - new->bse_f_offset = offset; - else - new->bse_f_offset = offset + bl->bl_blocksize; - } - new_end = normalize_up(end, bl->bl_blocksize); - if (end < new_end) { - if (is_range_written(be->be_inval, end, new_end)) - end = new_end; - else - end = new_end - bl->bl_blocksize; - } - if (end <= new->bse_f_offset) { - kfree(new); - return 0; - } - new->bse_length = end - new->bse_f_offset; - new->bse_devid = be->be_devid; - new->bse_mdev = be->be_mdev; - - spin_lock(&bl->bl_ext_lock); - add_to_commitlist(bl, new); - spin_unlock(&bl->bl_ext_lock); - return 0; -} - -static void print_bl_extent(struct pnfs_block_extent *be) -{ - dprintk("PRINT EXTENT extent %p\n", be); - if (be) { - dprintk(" be_f_offset %llu\n", (u64)be->be_f_offset); - dprintk(" be_length %llu\n", (u64)be->be_length); - dprintk(" be_v_offset %llu\n", (u64)be->be_v_offset); - dprintk(" be_state %d\n", be->be_state); - } -} - -static void -destroy_extent(struct kref *kref) -{ - struct pnfs_block_extent *be; - - be = container_of(kref, struct pnfs_block_extent, be_refcnt); - dprintk("%s be=%p\n", __func__, be); - kfree(be); -} - -void -bl_put_extent(struct pnfs_block_extent *be) -{ - if (be) { - dprintk("%s enter %p (%i)\n", __func__, be, - atomic_read(&be->be_refcnt.refcount)); - kref_put(&be->be_refcnt, destroy_extent); - } -} - -struct pnfs_block_extent *bl_alloc_extent(void) -{ - struct pnfs_block_extent *be; - - be = kmalloc(sizeof(struct pnfs_block_extent), GFP_NOFS); - if (!be) - return NULL; - INIT_LIST_HEAD(&be->be_node); - kref_init(&be->be_refcnt); - be->be_inval = NULL; - return be; -} - -static void print_elist(struct list_head *list) -{ - struct pnfs_block_extent *be; - dprintk("****************\n"); - dprintk("Extent list looks like:\n"); - list_for_each_entry(be, list, be_node) { - print_bl_extent(be); - } - dprintk("****************\n"); -} - -static inline int -extents_consistent(struct 
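/*
 * add_to_commitlist() above is standard interval coalescing on a list
 * sorted by file offset: extend "new" leftwards over any overlapping or
 * abutting extent on the same device, insert it, then scan forward and
 * swallow anything it now covers. The overlap test it relies on, as a
 * self-contained model over half-open [start, end) ranges; "ival" and
 * "coalesce" are hypothetical names:
 */
struct ival { unsigned long long start, end; };

/* Merge b into a when they overlap or abut; returns 1 if merged. */
static int coalesce(struct ival *a, const struct ival *b)
{
        if (b->start > a->end || a->start > b->end)
                return 0;                       /* disjoint, nothing to do */
        if (b->start < a->start)
                a->start = b->start;            /* extend left */
        if (b->end > a->end)
                a->end = b->end;                /* extend right */
        return 1;
}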
pnfs_block_extent *old, struct pnfs_block_extent *new) -{ - /* Note this assumes new->be_f_offset >= old->be_f_offset */ - return (new->be_state == old->be_state) && - ((new->be_state == PNFS_BLOCK_NONE_DATA) || - ((new->be_v_offset - old->be_v_offset == - new->be_f_offset - old->be_f_offset) && - new->be_mdev == old->be_mdev)); -} - -/* Adds new to appropriate list in bl, modifying new and removing existing - * extents as appropriate to deal with overlaps. - * - * See bl_find_get_extent for list constraints. - * - * Refcount on new is already set. If we end up not using it, or error out, - * need to put the reference. - * - * bl->bl_ext_lock is held by caller. - */ -int -bl_add_merge_extent(struct pnfs_block_layout *bl, - struct pnfs_block_extent *new) -{ - struct pnfs_block_extent *be, *tmp; - sector_t end = new->be_f_offset + new->be_length; - struct list_head *list; - - dprintk("%s enter with be=%p\n", __func__, new); - print_bl_extent(new); - list = &bl->bl_extents[bl_choose_list(new->be_state)]; - print_elist(list); - - /* Scan for proper place to insert, extending new to the left - * as much as possible. - */ - list_for_each_entry_safe_reverse(be, tmp, list, be_node) { - if (new->be_f_offset >= be->be_f_offset + be->be_length) - break; - if (new->be_f_offset >= be->be_f_offset) { - if (end <= be->be_f_offset + be->be_length) { - /* new is a subset of existing be */ - if (extents_consistent(be, new)) { - dprintk("%s: new is subset, ignoring\n", - __func__); - bl_put_extent(new); - return 0; - } else { - goto out_err; - } - } else { - /* |<-- be -->| - * |<-- new -->| */ - if (extents_consistent(be, new)) { - /* extend new to fully replace be */ - new->be_length += new->be_f_offset - - be->be_f_offset; - new->be_f_offset = be->be_f_offset; - new->be_v_offset = be->be_v_offset; - dprintk("%s: removing %p\n", __func__, be); - list_del(&be->be_node); - bl_put_extent(be); - } else { - goto out_err; - } - } - } else if (end >= be->be_f_offset + be->be_length) { - /* new extent overlaps existing be */ - if (extents_consistent(be, new)) { - /* extend new to fully replace be */ - dprintk("%s: removing %p\n", __func__, be); - list_del(&be->be_node); - bl_put_extent(be); - } else { - goto out_err; - } - } else if (end > be->be_f_offset) { - /* |<-- be -->| - *|<-- new -->| */ - if (extents_consistent(new, be)) { - /* extend new to fully replace be */ - new->be_length += be->be_f_offset + be->be_length - - new->be_f_offset - new->be_length; - dprintk("%s: removing %p\n", __func__, be); - list_del(&be->be_node); - bl_put_extent(be); - } else { - goto out_err; - } - } - } - /* Note that if we never hit the above break, be will not point to a - * valid extent. However, in that case &be->be_node==list. - */ - list_add(&new->be_node, &be->be_node); - dprintk("%s: inserting new\n", __func__); - print_elist(list); - /* FIXME - The per-list consistency checks have all been done, - * should now check cross-list consistency. - */ - return 0; - - out_err: - bl_put_extent(new); - return -EIO; -} - -/* Returns extent, or NULL. If a second READ extent exists, it is returned - * in cow_read, if given. - * - * The extents are kept in two separate ordered lists, one for READ and NONE, - * one for READWRITE and INVALID. Within each list, we assume: - * 1. Extents are ordered by file offset. - * 2. For any given isect, there is at most one extent that matches.
- */ -struct pnfs_block_extent * -bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect, - struct pnfs_block_extent **cow_read) -{ - struct pnfs_block_extent *be, *cow, *ret; - int i; - - dprintk("%s enter with isect %llu\n", __func__, (u64)isect); - cow = ret = NULL; - spin_lock(&bl->bl_ext_lock); - for (i = 0; i < EXTENT_LISTS; i++) { - list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) { - if (isect >= be->be_f_offset + be->be_length) - break; - if (isect >= be->be_f_offset) { - /* We have found an extent */ - dprintk("%s Get %p (%i)\n", __func__, be, - atomic_read(&be->be_refcnt.refcount)); - kref_get(&be->be_refcnt); - if (!ret) - ret = be; - else if (be->be_state != PNFS_BLOCK_READ_DATA) - bl_put_extent(be); - else - cow = be; - break; - } - } - if (ret && - (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA)) - break; - } - spin_unlock(&bl->bl_ext_lock); - if (cow_read) - *cow_read = cow; - print_bl_extent(ret); - return ret; -} - -/* Similar to bl_find_get_extent, but called with lock held, and ignores cow */ -static struct pnfs_block_extent * -bl_find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect) -{ - struct pnfs_block_extent *be, *ret = NULL; - int i; - - dprintk("%s enter with isect %llu\n", __func__, (u64)isect); - for (i = 0; i < EXTENT_LISTS; i++) { - if (ret) - break; - list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) { - if (isect >= be->be_f_offset + be->be_length) - break; - if (isect >= be->be_f_offset) { - /* We have found an extent */ - dprintk("%s Get %p (%i)\n", __func__, be, - atomic_read(&be->be_refcnt.refcount)); - kref_get(&be->be_refcnt); - ret = be; - break; - } - } - } - print_bl_extent(ret); - return ret; -} - -int -encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, - struct xdr_stream *xdr, - const struct nfs4_layoutcommit_args *arg) -{ - struct pnfs_block_short_extent *lce, *save; - unsigned int count = 0; - __be32 *p, *xdr_start; - - dprintk("%s enter\n", __func__); - /* BUG - creation of bl_commit is buggy - need to wait for - * entire block to be marked WRITTEN before it can be added. 
- */ - spin_lock(&bl->bl_ext_lock); - /* Want to adjust for possible truncate */ - /* We now want to adjust argument range */ - - /* XDR encode the ranges found */ - xdr_start = xdr_reserve_space(xdr, 8); - if (!xdr_start) - goto out; - list_for_each_entry_safe(lce, save, &bl->bl_commit, bse_node) { - p = xdr_reserve_space(xdr, 7 * 4 + sizeof(lce->bse_devid.data)); - if (!p) - break; - p = xdr_encode_opaque_fixed(p, lce->bse_devid.data, NFS4_DEVICEID4_SIZE); - p = xdr_encode_hyper(p, lce->bse_f_offset << SECTOR_SHIFT); - p = xdr_encode_hyper(p, lce->bse_length << SECTOR_SHIFT); - p = xdr_encode_hyper(p, 0LL); - *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA); - list_move_tail(&lce->bse_node, &bl->bl_committing); - bl->bl_count--; - count++; - } - xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4); - xdr_start[1] = cpu_to_be32(count); -out: - spin_unlock(&bl->bl_ext_lock); - dprintk("%s found %i ranges\n", __func__, count); - return 0; -} - -/* Helper function to set_to_rw that initialize a new extent */ -static void -_prep_new_extent(struct pnfs_block_extent *new, - struct pnfs_block_extent *orig, - sector_t offset, sector_t length, int state) -{ - kref_init(&new->be_refcnt); - /* don't need to INIT_LIST_HEAD(&new->be_node) */ - memcpy(&new->be_devid, &orig->be_devid, sizeof(struct nfs4_deviceid)); - new->be_mdev = orig->be_mdev; - new->be_f_offset = offset; - new->be_length = length; - new->be_v_offset = orig->be_v_offset - orig->be_f_offset + offset; - new->be_state = state; - new->be_inval = orig->be_inval; -} - -/* Tries to merge be with extent in front of it in list. - * Frees storage if not used. - */ -static struct pnfs_block_extent * -_front_merge(struct pnfs_block_extent *be, struct list_head *head, - struct pnfs_block_extent *storage) -{ - struct pnfs_block_extent *prev; - - if (!storage) - goto no_merge; - if (&be->be_node == head || be->be_node.prev == head) - goto no_merge; - prev = list_entry(be->be_node.prev, struct pnfs_block_extent, be_node); - if ((prev->be_f_offset + prev->be_length != be->be_f_offset) || - !extents_consistent(prev, be)) - goto no_merge; - _prep_new_extent(storage, prev, prev->be_f_offset, - prev->be_length + be->be_length, prev->be_state); - list_replace(&prev->be_node, &storage->be_node); - bl_put_extent(prev); - list_del(&be->be_node); - bl_put_extent(be); - return storage; - - no_merge: - kfree(storage); - return be; -} - -static u64 -set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length) -{ - u64 rv = offset + length; - struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old; - struct pnfs_block_extent *children[3]; - struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL; - int i = 0, j; - - dprintk("%s(%llu, %llu)\n", __func__, offset, length); - /* Create storage for up to three new extents e1, e2, e3 */ - e1 = kmalloc(sizeof(*e1), GFP_ATOMIC); - e2 = kmalloc(sizeof(*e2), GFP_ATOMIC); - e3 = kmalloc(sizeof(*e3), GFP_ATOMIC); - /* BUG - we are ignoring any failure */ - if (!e1 || !e2 || !e3) - goto out_nosplit; - - spin_lock(&bl->bl_ext_lock); - be = bl_find_get_extent_locked(bl, offset); - rv = be->be_f_offset + be->be_length; - if (be->be_state != PNFS_BLOCK_INVALID_DATA) { - spin_unlock(&bl->bl_ext_lock); - goto out_nosplit; - } - /* Add e* to children, bumping e*'s krefs */ - if (be->be_f_offset != offset) { - _prep_new_extent(e1, be, be->be_f_offset, - offset - be->be_f_offset, - PNFS_BLOCK_INVALID_DATA); - children[i++] = e1; - print_bl_extent(e1); - } else - merge1 = e1; - _prep_new_extent(e2, be, offset, - min(length, 
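/*
 * set_to_rw() converts [offset, offset + length) of one INVALID extent
 * to READWRITE by splitting it into at most three children: an INVALID
 * prefix (e1), the READWRITE middle (e2) and an INVALID suffix (e3);
 * when the split is flush with an edge, the unused child's storage is
 * recycled as merge scratch (merge1/merge2). The split arithmetic in
 * isolation; "span" and "split3" are hypothetical names:
 */
struct span { unsigned long long start, len; };

/* Split parent at [off, off + len); returns the number of live pieces. */
static int split3(struct span parent, unsigned long long off,
                  unsigned long long len, struct span out[3])
{
        unsigned long long pend = parent.start + parent.len;
        unsigned long long mend = (off + len < pend) ? off + len : pend;
        int n = 0;

        if (off > parent.start)                 /* INVALID prefix */
                out[n++] = (struct span){ parent.start, off - parent.start };
        out[n++] = (struct span){ off, mend - off };    /* RW middle */
        if (mend < pend)                        /* INVALID suffix */
                out[n++] = (struct span){ mend, pend - mend };
        return n;
}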
be->be_f_offset + be->be_length - offset), - PNFS_BLOCK_READWRITE_DATA); - children[i++] = e2; - print_bl_extent(e2); - if (offset + length < be->be_f_offset + be->be_length) { - _prep_new_extent(e3, be, e2->be_f_offset + e2->be_length, - be->be_f_offset + be->be_length - - offset - length, - PNFS_BLOCK_INVALID_DATA); - children[i++] = e3; - print_bl_extent(e3); - } else - merge2 = e3; - - /* Remove be from list, and insert the e* */ - /* We don't get refs on e*, since this list is the base reference - * set when init'ed. - */ - if (i < 3) - children[i] = NULL; - new = children[0]; - list_replace(&be->be_node, &new->be_node); - bl_put_extent(be); - new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1); - for (j = 1; j < i; j++) { - old = new; - new = children[j]; - list_add(&new->be_node, &old->be_node); - } - if (merge2) { - /* This is a HACK, should just create a _back_merge function */ - new = list_entry(new->be_node.next, - struct pnfs_block_extent, be_node); - new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2); - } - spin_unlock(&bl->bl_ext_lock); - - /* Since we removed the base reference above, be is now scheduled for - * destruction. - */ - bl_put_extent(be); - dprintk("%s returns %llu after split\n", __func__, rv); - return rv; - - out_nosplit: - kfree(e1); - kfree(e2); - kfree(e3); - dprintk("%s returns %llu without splitting\n", __func__, rv); - return rv; -} - -void -clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, - const struct nfs4_layoutcommit_args *arg, - int status) -{ - struct pnfs_block_short_extent *lce, *save; - - dprintk("%s status %d\n", __func__, status); - list_for_each_entry_safe(lce, save, &bl->bl_committing, bse_node) { - if (likely(!status)) { - u64 offset = lce->bse_f_offset; - u64 end = offset + lce->bse_length; - - do { - offset = set_to_rw(bl, offset, end - offset); - } while (offset < end); - list_del(&lce->bse_node); - - kfree(lce); - } else { - list_del(&lce->bse_node); - spin_lock(&bl->bl_ext_lock); - add_to_commitlist(bl, lce); - spin_unlock(&bl->bl_ext_lock); - } - } -} - -int bl_push_one_short_extent(struct pnfs_inval_markings *marks) -{ - struct pnfs_block_short_extent *new; - - new = kmalloc(sizeof(*new), GFP_NOFS); - if (unlikely(!new)) - return -ENOMEM; - - spin_lock_bh(&marks->im_lock); - list_add(&new->bse_node, &marks->im_extents); - spin_unlock_bh(&marks->im_lock); - - return 0; -} - -struct pnfs_block_short_extent * -bl_pop_one_short_extent(struct pnfs_inval_markings *marks) -{ - struct pnfs_block_short_extent *rv = NULL; - - spin_lock_bh(&marks->im_lock); - if (!list_empty(&marks->im_extents)) { - rv = list_entry((&marks->im_extents)->next, - struct pnfs_block_short_extent, bse_node); - list_del_init(&rv->bse_node); - } - spin_unlock_bh(&marks->im_lock); - - return rv; -} - -void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free) -{ - struct pnfs_block_short_extent *se = NULL, *tmp; - - if (num_to_free <= 0) - return; - - spin_lock(&marks->im_lock); - list_for_each_entry_safe(se, tmp, &marks->im_extents, bse_node) { - list_del(&se->bse_node); - kfree(se); - if (--num_to_free == 0) - break; - } - spin_unlock(&marks->im_lock); - - BUG_ON(num_to_free > 0); -} diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c new file mode 100644 index 000000000000..8d04bda2bd2e --- /dev/null +++ b/fs/nfs/blocklayout/rpc_pipefs.c @@ -0,0 +1,285 @@ +/* + * Copyright (c) 2006,2007 The Regents of the University of Michigan. + * All rights reserved. 
+ * + * Andy Adamson <andros@citi.umich.edu> + * Fred Isaman <iisaman@umich.edu> + * + * permission is granted to use, copy, create derivative works and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the university of michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. if + * the above copyright notice or any other identification of the + * university of michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * this software is provided as is, without representation from the + * university of michigan as to its fitness for any purpose, and without + * warranty by the university of michigan of any kind, either express + * or implied, including without limitation the implied warranties of + * merchantability and fitness for a particular purpose. the regents + * of the university of michigan shall not be liable for any damages, + * including special, indirect, incidental, or consequential damages, + * with respect to any claim arising out or in connection with the use + * of the software, even if it has been or is hereafter advised of the + * possibility of such damages. + */ + +#include <linux/module.h> +#include <linux/genhd.h> +#include <linux/blkdev.h> + +#include "blocklayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +static void +nfs4_encode_simple(__be32 *p, struct pnfs_block_volume *b) +{ + int i; + + *p++ = cpu_to_be32(1); + *p++ = cpu_to_be32(b->type); + *p++ = cpu_to_be32(b->simple.nr_sigs); + for (i = 0; i < b->simple.nr_sigs; i++) { + p = xdr_encode_hyper(p, b->simple.sigs[i].offset); + p = xdr_encode_opaque(p, b->simple.sigs[i].sig, + b->simple.sigs[i].sig_len); + } +} + +dev_t +bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b, + gfp_t gfp_mask) +{ + struct net *net = server->nfs_client->cl_net; + struct nfs_net *nn = net_generic(net, nfs_net_id); + struct bl_dev_msg *reply = &nn->bl_mount_reply; + struct bl_pipe_msg bl_pipe_msg; + struct rpc_pipe_msg *msg = &bl_pipe_msg.msg; + struct bl_msg_hdr *bl_msg; + DECLARE_WAITQUEUE(wq, current); + dev_t dev = 0; + int rc; + + dprintk("%s CREATING PIPEFS MESSAGE\n", __func__); + + bl_pipe_msg.bl_wq = &nn->bl_wq; + + b->simple.len += 4; /* single volume */ + if (b->simple.len > PAGE_SIZE) + return -EIO; + + memset(msg, 0, sizeof(*msg)); + msg->len = sizeof(*bl_msg) + b->simple.len; + msg->data = kzalloc(msg->len, gfp_mask); + if (!msg->data) + goto out; + + bl_msg = msg->data; + bl_msg->type = BL_DEVICE_MOUNT, + bl_msg->totallen = b->simple.len; + nfs4_encode_simple(msg->data + sizeof(*bl_msg), b); + + dprintk("%s CALLING USERSPACE DAEMON\n", __func__); + add_wait_queue(&nn->bl_wq, &wq); + rc = rpc_queue_upcall(nn->bl_device_pipe, msg); + if (rc < 0) { + remove_wait_queue(&nn->bl_wq, &wq); + goto out; + } + + set_current_state(TASK_UNINTERRUPTIBLE); + schedule(); + __set_current_state(TASK_RUNNING); + remove_wait_queue(&nn->bl_wq, &wq); + + if (reply->status != BL_DEVICE_REQUEST_PROC) { + printk(KERN_WARNING "%s failed to decode device: %d\n", + __func__, reply->status); + goto out; + } + + dev = MKDEV(reply->major, reply->minor); +out: + kfree(msg->data); + return dev; +} + +static ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, + size_t mlen) +{ + struct nfs_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info, + nfs_net_id); + + if (mlen != sizeof (struct 
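/*
 * The handshake here: bl_resolve_deviceid() above queues a
 * BL_DEVICE_MOUNT message on the rpc_pipefs pipe and sleeps
 * uninterruptibly on nn->bl_wq; the blocklayout daemon (blkmapd) reads
 * it, resolves the volume signature to a block device, and writes a
 * struct bl_dev_msg back, which this downcall copies into
 * nn->bl_mount_reply before waking the waiter. A sketch of the daemon's
 * side of the reply, assuming the pipe is already open at "fd"; the
 * field types below are an assumption, not taken from this patch:
 */
#include <stdint.h>
#include <unistd.h>

struct bl_dev_msg { int32_t status; uint32_t major, minor; };

/* The kernel accepts MKDEV(major, minor) only when status equals
 * BL_DEVICE_REQUEST_PROC, whose value comes from the kernel header. */
static int reply_device(int fd, int32_t status,
                        uint32_t major, uint32_t minor)
{
        struct bl_dev_msg reply = {
                .status = status, .major = major, .minor = minor,
        };

        return write(fd, &reply, sizeof(reply)) == sizeof(reply) ? 0 : -1;
}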
bl_dev_msg)) + return -EINVAL; + + if (copy_from_user(&nn->bl_mount_reply, src, mlen) != 0) + return -EFAULT; + + wake_up(&nn->bl_wq); + + return mlen; +} + +static void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg) +{ + struct bl_pipe_msg *bl_pipe_msg = + container_of(msg, struct bl_pipe_msg, msg); + + if (msg->errno >= 0) + return; + wake_up(bl_pipe_msg->bl_wq); +} + +static const struct rpc_pipe_ops bl_upcall_ops = { + .upcall = rpc_pipe_generic_upcall, + .downcall = bl_pipe_downcall, + .destroy_msg = bl_pipe_destroy_msg, +}; + +static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb, + struct rpc_pipe *pipe) +{ + struct dentry *dir, *dentry; + + dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME); + if (dir == NULL) + return ERR_PTR(-ENOENT); + dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe); + dput(dir); + return dentry; +} + +static void nfs4blocklayout_unregister_sb(struct super_block *sb, + struct rpc_pipe *pipe) +{ + if (pipe->dentry) + rpc_unlink(pipe->dentry); +} + +static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, + void *ptr) +{ + struct super_block *sb = ptr; + struct net *net = sb->s_fs_info; + struct nfs_net *nn = net_generic(net, nfs_net_id); + struct dentry *dentry; + int ret = 0; + + if (!try_module_get(THIS_MODULE)) + return 0; + + if (nn->bl_device_pipe == NULL) { + module_put(THIS_MODULE); + return 0; + } + + switch (event) { + case RPC_PIPEFS_MOUNT: + dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe); + if (IS_ERR(dentry)) { + ret = PTR_ERR(dentry); + break; + } + nn->bl_device_pipe->dentry = dentry; + break; + case RPC_PIPEFS_UMOUNT: + if (nn->bl_device_pipe->dentry) + nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe); + break; + default: + ret = -ENOTSUPP; + break; + } + module_put(THIS_MODULE); + return ret; +} + +static struct notifier_block nfs4blocklayout_block = { + .notifier_call = rpc_pipefs_event, +}; + +static struct dentry *nfs4blocklayout_register_net(struct net *net, + struct rpc_pipe *pipe) +{ + struct super_block *pipefs_sb; + struct dentry *dentry; + + pipefs_sb = rpc_get_sb_net(net); + if (!pipefs_sb) + return NULL; + dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe); + rpc_put_sb_net(net); + return dentry; +} + +static void nfs4blocklayout_unregister_net(struct net *net, + struct rpc_pipe *pipe) +{ + struct super_block *pipefs_sb; + + pipefs_sb = rpc_get_sb_net(net); + if (pipefs_sb) { + nfs4blocklayout_unregister_sb(pipefs_sb, pipe); + rpc_put_sb_net(net); + } +} + +static int nfs4blocklayout_net_init(struct net *net) +{ + struct nfs_net *nn = net_generic(net, nfs_net_id); + struct dentry *dentry; + + init_waitqueue_head(&nn->bl_wq); + nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0); + if (IS_ERR(nn->bl_device_pipe)) + return PTR_ERR(nn->bl_device_pipe); + dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe); + if (IS_ERR(dentry)) { + rpc_destroy_pipe_data(nn->bl_device_pipe); + return PTR_ERR(dentry); + } + nn->bl_device_pipe->dentry = dentry; + return 0; +} + +static void nfs4blocklayout_net_exit(struct net *net) +{ + struct nfs_net *nn = net_generic(net, nfs_net_id); + + nfs4blocklayout_unregister_net(net, nn->bl_device_pipe); + rpc_destroy_pipe_data(nn->bl_device_pipe); + nn->bl_device_pipe = NULL; +} + +static struct pernet_operations nfs4blocklayout_net_ops = { + .init = nfs4blocklayout_net_init, + .exit = nfs4blocklayout_net_exit, +}; + +int __init bl_init_pipefs(void) +{ + int ret; + + ret = rpc_pipefs_notifier_register(&nfs4blocklayout_block); + if (ret) + 
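/*
 * Teardown mirrors setup in reverse: the error path below drops only
 * what was already registered, and bl_cleanup_pipefs() unregisters the
 * notifier before the pernet ops, so no pipefs mount event can arrive
 * for per-net pipes that are being torn down.
 */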
goto out; + ret = register_pernet_subsys(&nfs4blocklayout_net_ops); + if (ret) + goto out_unregister_notifier; + return 0; + +out_unregister_notifier: + rpc_pipefs_notifier_unregister(&nfs4blocklayout_block); +out: + return ret; +} + +void __exit bl_cleanup_pipefs(void) +{ + rpc_pipefs_notifier_unregister(&nfs4blocklayout_block); + unregister_pernet_subsys(&nfs4blocklayout_net_ops); +} diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 073b4cf67ed9..b8fb3a4ef649 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -235,7 +235,7 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt, cb_info->serv = serv; cb_info->rqst = rqstp; - cb_info->task = kthread_run(callback_svc, cb_info->rqst, + cb_info->task = kthread_create(callback_svc, cb_info->rqst, "nfsv4.%u-svc", minorversion); if (IS_ERR(cb_info->task)) { ret = PTR_ERR(cb_info->task); @@ -244,6 +244,8 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt, cb_info->task = NULL; return ret; } + rqstp->rq_task = cb_info->task; + wake_up_process(cb_info->task); dprintk("nfs_callback_up: service started\n"); return 0; } @@ -428,6 +430,18 @@ check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp) if (p == NULL) return 0; + /* + * Did we get the acceptor from userland during the SETCLIENTID + * negotiation? + */ + if (clp->cl_acceptor) + return !strcmp(p, clp->cl_acceptor); + + /* + * Otherwise try to verify it using the cl_hostname. Note that this + * doesn't work if a non-canonical hostname was used in the devname. + */ + /* Expect a GSS_C_NT_HOSTBASED_NAME like "nfs@serverhostname" */ if (memcmp(p, "nfs@", 4) != 0) diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 41db5258e7a7..73466b934090 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -171,14 +171,26 @@ static u32 initiate_file_draining(struct nfs_client *clp, goto out; ino = lo->plh_inode; + + spin_lock(&ino->i_lock); + pnfs_set_layout_stateid(lo, &args->cbl_stateid, true); + spin_unlock(&ino->i_lock); + + pnfs_layoutcommit_inode(ino, false); + spin_lock(&ino->i_lock); if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || pnfs_mark_matching_lsegs_invalid(lo, &free_me_list, - &args->cbl_range)) + &args->cbl_range)) { rv = NFS4ERR_DELAY; - else - rv = NFS4ERR_NOMATCHING_LAYOUT; - pnfs_set_layout_stateid(lo, &args->cbl_stateid, true); + goto unlock; + } + + if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) { + NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, + &args->cbl_range); + } +unlock: spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&free_me_list); pnfs_put_layout_hdr(lo); @@ -277,9 +289,6 @@ __be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args, } found: - if (dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) - dprintk("%s: NOTIFY_DEVICEID4_CHANGE not supported, " - "deleting instead\n", __func__); nfs4_delete_deviceid(server->pnfs_curr_ld, clp, &dev->cbd_dev_id); } diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 1d09289c8f0e..f9f4845db989 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -110,8 +110,8 @@ struct nfs_subversion *get_nfs_version(unsigned int version) mutex_unlock(&nfs_version_mutex); } - if (!IS_ERR(nfs)) - try_module_get(nfs->owner); + if (!IS_ERR(nfs) && !try_module_get(nfs->owner)) + return ERR_PTR(-EAGAIN); return nfs; } @@ -158,7 +158,8 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) goto error_0; clp->cl_nfs_mod = cl_init->nfs_mod; - try_module_get(clp->cl_nfs_mod->owner); + if
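/*
 * Both try_module_get() hunks close the same hole: the old code ignored
 * the return value, so when the NFS version module was already on its
 * way out, the new client could outlive it. The pattern now enforced,
 * shown with a generic "owner" rather than a name from this patch:
 *
 *      if (!try_module_get(owner))
 *              return ERR_PTR(-EAGAIN);
 *      ...
 *      module_put(owner);
 *
 * with module_put() paired on every exit path.
 */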
(!try_module_get(clp->cl_nfs_mod->owner)) + goto error_dealloc; clp->rpc_ops = clp->cl_nfs_mod->rpc_ops; @@ -190,6 +191,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) error_cleanup: put_nfs_version(clp->cl_nfs_mod); +error_dealloc: kfree(clp); error_0: return ERR_PTR(err); @@ -252,6 +254,7 @@ void nfs_free_client(struct nfs_client *clp) put_net(clp->cl_net); put_nfs_version(clp->cl_nfs_mod); kfree(clp->cl_hostname); + kfree(clp->cl_acceptor); kfree(clp); dprintk("<-- nfs_free_client()\n"); @@ -482,8 +485,13 @@ nfs_get_client(const struct nfs_client_initdata *cl_init, struct nfs_net *nn = net_generic(cl_init->net, nfs_net_id); const struct nfs_rpc_ops *rpc_ops = cl_init->nfs_mod->rpc_ops; + if (cl_init->hostname == NULL) { + WARN_ON(1); + return NULL; + } + dprintk("--> nfs_get_client(%s,v%u)\n", - cl_init->hostname ?: "", rpc_ops->version); + cl_init->hostname, rpc_ops->version); /* see if the client already exists */ do { @@ -510,7 +518,7 @@ nfs_get_client(const struct nfs_client_initdata *cl_init, } while (!IS_ERR(new)); dprintk("<-- nfs_get_client() Failed to find %s (%ld)\n", - cl_init->hostname ?: "", PTR_ERR(new)); + cl_init->hostname, PTR_ERR(new)); return new; } EXPORT_SYMBOL_GPL(nfs_get_client); @@ -1205,7 +1213,7 @@ static const struct file_operations nfs_server_list_fops = { .open = nfs_server_list_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = seq_release_net, .owner = THIS_MODULE, }; @@ -1226,7 +1234,7 @@ static const struct file_operations nfs_volume_list_fops = { .open = nfs_volume_list_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = seq_release_net, .owner = THIS_MODULE, }; @@ -1236,27 +1244,17 @@ static const struct file_operations nfs_volume_list_fops = { */ static int nfs_server_list_open(struct inode *inode, struct file *file) { - struct seq_file *m; - int ret; - struct pid_namespace *pid_ns = file->f_dentry->d_sb->s_fs_info; - struct net *net = pid_ns->child_reaper->nsproxy->net_ns; - - ret = seq_open(file, &nfs_server_list_ops); - if (ret < 0) - return ret; - - m = file->private_data; - m->private = net; - - return 0; + return seq_open_net(inode, file, &nfs_server_list_ops, + sizeof(struct seq_net_private)); } /* * set up the iterator to start reading from the server list and return the first item */ static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos) + __acquires(&nn->nfs_client_lock) { - struct nfs_net *nn = net_generic(m->private, nfs_net_id); + struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); /* lock the list against modification */ spin_lock(&nn->nfs_client_lock); @@ -1268,7 +1266,7 @@ static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos) */ static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos) { - struct nfs_net *nn = net_generic(p->private, nfs_net_id); + struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id); return seq_list_next(v, &nn->nfs_client_list, pos); } @@ -1277,8 +1275,9 @@ static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos) * clean up after reading from the transports list */ static void nfs_server_list_stop(struct seq_file *p, void *v) + __releases(&nn->nfs_client_lock) { - struct nfs_net *nn = net_generic(p->private, nfs_net_id); + struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id); spin_unlock(&nn->nfs_client_lock); } @@ -1289,7 +1288,7 @@ static void nfs_server_list_stop(struct seq_file *p, void *v) static int 
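/*
 * The /proc/fs/nfsfs conversion above swaps the hand-rolled namespace
 * lookup (pid_ns->child_reaper->nsproxy->net_ns) for the generic
 * seq_open_net() helpers: the seq_file now embeds a seq_net_private,
 * the iterators recover the namespace with seq_file_net(), and the
 * file must be closed with the matching seq_release_net(). Minimal
 * shape of the pattern, with foo_* as placeholder names:
 *
 *      static int foo_open(struct inode *inode, struct file *file)
 *      {
 *              return seq_open_net(inode, file, &foo_seq_ops,
 *                                  sizeof(struct seq_net_private));
 *      }
 *
 *      static void *foo_start(struct seq_file *m, loff_t *pos)
 *      {
 *              struct net *net = seq_file_net(m);
 *              ...
 *      }
 */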
nfs_server_list_show(struct seq_file *m, void *v) { struct nfs_client *clp; - struct nfs_net *nn = net_generic(m->private, nfs_net_id); + struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); /* display header on line 1 */ if (v == &nn->nfs_client_list) { @@ -1321,27 +1320,17 @@ static int nfs_server_list_show(struct seq_file *m, void *v) */ static int nfs_volume_list_open(struct inode *inode, struct file *file) { - struct seq_file *m; - int ret; - struct pid_namespace *pid_ns = file->f_dentry->d_sb->s_fs_info; - struct net *net = pid_ns->child_reaper->nsproxy->net_ns; - - ret = seq_open(file, &nfs_volume_list_ops); - if (ret < 0) - return ret; - - m = file->private_data; - m->private = net; - - return 0; + return seq_open_net(inode, file, &nfs_volume_list_ops, + sizeof(struct seq_net_private)); } /* * set up the iterator to start reading from the volume list and return the first item */ static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos) + __acquires(&nn->nfs_client_lock) { - struct nfs_net *nn = net_generic(m->private, nfs_net_id); + struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); /* lock the list against modification */ spin_lock(&nn->nfs_client_lock); @@ -1353,7 +1342,7 @@ static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos) */ static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos) { - struct nfs_net *nn = net_generic(p->private, nfs_net_id); + struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id); return seq_list_next(v, &nn->nfs_volume_list, pos); } @@ -1362,8 +1351,9 @@ static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos) * clean up after reading from the transports list */ static void nfs_volume_list_stop(struct seq_file *p, void *v) + __releases(&nn->nfs_client_lock) { - struct nfs_net *nn = net_generic(p->private, nfs_net_id); + struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id); spin_unlock(&nn->nfs_client_lock); } @@ -1376,7 +1366,7 @@ static int nfs_volume_list_show(struct seq_file *m, void *v) struct nfs_server *server; struct nfs_client *clp; char dev[8], fsid[17]; - struct nfs_net *nn = net_generic(m->private, nfs_net_id); + struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); /* display header on line 1 */ if (v == &nn->nfs_volume_list) { @@ -1407,6 +1397,39 @@ static int nfs_volume_list_show(struct seq_file *m, void *v) return 0; } +int nfs_fs_proc_net_init(struct net *net) +{ + struct nfs_net *nn = net_generic(net, nfs_net_id); + struct proc_dir_entry *p; + + nn->proc_nfsfs = proc_net_mkdir(net, "nfsfs", net->proc_net); + if (!nn->proc_nfsfs) + goto error_0; + + /* a file of servers with which we're dealing */ + p = proc_create("servers", S_IFREG|S_IRUGO, + nn->proc_nfsfs, &nfs_server_list_fops); + if (!p) + goto error_1; + + /* a file of volumes that we have mounted */ + p = proc_create("volumes", S_IFREG|S_IRUGO, + nn->proc_nfsfs, &nfs_volume_list_fops); + if (!p) + goto error_1; + return 0; + +error_1: + remove_proc_subtree("nfsfs", net->proc_net); +error_0: + return -ENOMEM; +} + +void nfs_fs_proc_net_exit(struct net *net) +{ + remove_proc_subtree("nfsfs", net->proc_net); +} + /* * initialise the /proc/fs/nfsfs/ directory */ @@ -1419,14 +1442,12 @@ int __init nfs_fs_proc_init(void) goto error_0; /* a file of servers with which we're dealing */ - p = proc_create("servers", S_IFREG|S_IRUGO, - proc_fs_nfs, &nfs_server_list_fops); + p = proc_symlink("servers", proc_fs_nfs, "../../net/nfsfs/servers"); if (!p) goto error_1; /* a file of volumes 
that we have mounted */ - p = proc_create("volumes", S_IFREG|S_IRUGO, - proc_fs_nfs, &nfs_volume_list_fops); + p = proc_symlink("volumes", proc_fs_nfs, "../../net/nfsfs/volumes"); if (!p) goto error_2; return 0; diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 5d8ccecf5f5c..5853f53db732 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -41,14 +41,8 @@ void nfs_mark_delegation_referenced(struct nfs_delegation *delegation) set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags); } -/** - * nfs_have_delegation - check if inode has a delegation - * @inode: inode to check - * @flags: delegation types to check for - * - * Returns one if inode has the indicated delegation, otherwise zero. - */ -int nfs4_have_delegation(struct inode *inode, fmode_t flags) +static int +nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark) { struct nfs_delegation *delegation; int ret = 0; @@ -58,12 +52,34 @@ int nfs4_have_delegation(struct inode *inode, fmode_t flags) delegation = rcu_dereference(NFS_I(inode)->delegation); if (delegation != NULL && (delegation->type & flags) == flags && !test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) { - nfs_mark_delegation_referenced(delegation); + if (mark) + nfs_mark_delegation_referenced(delegation); ret = 1; } rcu_read_unlock(); return ret; } +/** + * nfs_have_delegation - check if inode has a delegation, mark it + * NFS_DELEGATION_REFERENCED if there is one. + * @inode: inode to check + * @flags: delegation types to check for + * + * Returns one if inode has the indicated delegation, otherwise zero. + */ +int nfs4_have_delegation(struct inode *inode, fmode_t flags) +{ + return nfs4_do_check_delegation(inode, flags, true); +} + +/* + * nfs4_check_delegation - check if inode has a delegation, do not mark + * NFS_DELEGATION_REFERENCED if it has one. + */ +int nfs4_check_delegation(struct inode *inode, fmode_t flags) +{ + return nfs4_do_check_delegation(inode, flags, false); +} static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid) { diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index 9a79c7a99d6d..5c1cce39297f 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -59,6 +59,7 @@ bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_ void nfs_mark_delegation_referenced(struct nfs_delegation *delegation); int nfs4_have_delegation(struct inode *inode, fmode_t flags); +int nfs4_check_delegation(struct inode *inode, fmode_t flags); #endif diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 4a3d4ef76127..36d921f0c602 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -988,9 +988,13 @@ EXPORT_SYMBOL_GPL(nfs_force_lookup_revalidate); * A check for whether or not the parent directory has changed. * In the case it has, we assume that the dentries are untrustworthy * and may need to be looked up again. + * If rcu_walk prevents us from performing a full check, return 0. 
*/ -static int nfs_check_verifier(struct inode *dir, struct dentry *dentry) +static int nfs_check_verifier(struct inode *dir, struct dentry *dentry, + int rcu_walk) { + int ret; + if (IS_ROOT(dentry)) return 1; if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE) @@ -998,7 +1002,11 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry) if (!nfs_verify_change_attribute(dir, dentry->d_time)) return 0; /* Revalidate nfsi->cache_change_attribute before we declare a match */ - if (nfs_revalidate_inode(NFS_SERVER(dir), dir) < 0) + if (rcu_walk) + ret = nfs_revalidate_inode_rcu(NFS_SERVER(dir), dir); + else + ret = nfs_revalidate_inode(NFS_SERVER(dir), dir); + if (ret < 0) return 0; if (!nfs_verify_change_attribute(dir, dentry->d_time)) return 0; @@ -1042,6 +1050,8 @@ int nfs_lookup_verify_inode(struct inode *inode, unsigned int flags) out: return (inode->i_nlink == 0) ? -ENOENT : 0; out_force: + if (flags & LOOKUP_RCU) + return -ECHILD; ret = __nfs_revalidate_inode(server, inode); if (ret != 0) return ret; @@ -1054,6 +1064,9 @@ out_force: * * If parent mtime has changed, we revalidate, else we wait for a * period corresponding to the parent's attribute cache timeout value. + * + * If LOOKUP_RCU prevents us from performing a full check, return 1 + * suggesting a reval is needed. */ static inline int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry, @@ -1064,7 +1077,7 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry, return 0; if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) return 1; - return !nfs_check_verifier(dir, dentry); + return !nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU); } /* @@ -1088,21 +1101,30 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) struct nfs4_label *label = NULL; int error; - if (flags & LOOKUP_RCU) - return -ECHILD; - - parent = dget_parent(dentry); - dir = parent->d_inode; + if (flags & LOOKUP_RCU) { + parent = ACCESS_ONCE(dentry->d_parent); + dir = ACCESS_ONCE(parent->d_inode); + if (!dir) + return -ECHILD; + } else { + parent = dget_parent(dentry); + dir = parent->d_inode; + } nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE); inode = dentry->d_inode; if (!inode) { - if (nfs_neg_need_reval(dir, dentry, flags)) + if (nfs_neg_need_reval(dir, dentry, flags)) { + if (flags & LOOKUP_RCU) + return -ECHILD; goto out_bad; + } goto out_valid_noent; } if (is_bad_inode(inode)) { + if (flags & LOOKUP_RCU) + return -ECHILD; dfprintk(LOOKUPCACHE, "%s: %pd2 has dud inode\n", __func__, dentry); goto out_bad; @@ -1112,12 +1134,20 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) goto out_set_verifier; /* Force a full look up iff the parent directory has changed */ - if (!nfs_is_exclusive_create(dir, flags) && nfs_check_verifier(dir, dentry)) { - if (nfs_lookup_verify_inode(inode, flags)) + if (!nfs_is_exclusive_create(dir, flags) && + nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU)) { + + if (nfs_lookup_verify_inode(inode, flags)) { + if (flags & LOOKUP_RCU) + return -ECHILD; goto out_zap_parent; + } goto out_valid; } + if (flags & LOOKUP_RCU) + return -ECHILD; + if (NFS_STALE(inode)) goto out_bad; @@ -1153,13 +1183,18 @@ out_set_verifier: /* Success: notify readdir to use READDIRPLUS */ nfs_advise_use_readdirplus(dir); out_valid_noent: - dput(parent); + if (flags & LOOKUP_RCU) { + if (parent != ACCESS_ONCE(dentry->d_parent)) + return -ECHILD; + } else + dput(parent); dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is valid\n", __func__, dentry); return 1; 
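/*
 * The LOOKUP_RCU handling threaded through this function follows the
 * d_revalidate contract for rcu-walk: no blocking and no reference
 * counts. Hence the parent is sampled with ACCESS_ONCE() instead of
 * dget_parent(), -ECHILD is returned at every point where a blocking
 * revalidation would otherwise start, and a successful rcu-walk exit
 * re-checks d_parent so that a concurrent rename drops the lookup back
 * to ref-walk:
 *
 *      if (flags & LOOKUP_RCU) {
 *              if (parent != ACCESS_ONCE(dentry->d_parent))
 *                      return -ECHILD;
 *      } else
 *              dput(parent);
 */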
out_zap_parent: nfs_zap_caches(dir); out_bad: + WARN_ON(flags & LOOKUP_RCU); nfs_free_fattr(fattr); nfs_free_fhandle(fhandle); nfs4_label_free(label); @@ -1185,6 +1220,7 @@ out_zap_parent: __func__, dentry); return 0; out_error: + WARN_ON(flags & LOOKUP_RCU); nfs_free_fattr(fattr); nfs_free_fhandle(fhandle); nfs4_label_free(label); @@ -1529,14 +1565,9 @@ EXPORT_SYMBOL_GPL(nfs_atomic_open); static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) { - struct dentry *parent = NULL; struct inode *inode; - struct inode *dir; int ret = 0; - if (flags & LOOKUP_RCU) - return -ECHILD; - if (!(flags & LOOKUP_OPEN) || (flags & LOOKUP_DIRECTORY)) goto no_open; if (d_mountpoint(dentry)) @@ -1545,34 +1576,47 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) goto no_open; inode = dentry->d_inode; - parent = dget_parent(dentry); - dir = parent->d_inode; /* We can't create new files in nfs_open_revalidate(), so we * optimize away revalidation of negative dentries. */ if (inode == NULL) { + struct dentry *parent; + struct inode *dir; + + if (flags & LOOKUP_RCU) { + parent = ACCESS_ONCE(dentry->d_parent); + dir = ACCESS_ONCE(parent->d_inode); + if (!dir) + return -ECHILD; + } else { + parent = dget_parent(dentry); + dir = parent->d_inode; + } if (!nfs_neg_need_reval(dir, dentry, flags)) ret = 1; + else if (flags & LOOKUP_RCU) + ret = -ECHILD; + if (!(flags & LOOKUP_RCU)) + dput(parent); + else if (parent != ACCESS_ONCE(dentry->d_parent)) + return -ECHILD; goto out; } /* NFS only supports OPEN on regular files */ if (!S_ISREG(inode->i_mode)) - goto no_open_dput; + goto no_open; /* We cannot do exclusive creation on a positive dentry */ if (flags & LOOKUP_EXCL) - goto no_open_dput; + goto no_open; /* Let f_op->open() actually open (and revalidate) the file */ ret = 1; out: - dput(parent); return ret; -no_open_dput: - dput(parent); no_open: return nfs_lookup_revalidate(dentry, flags); } @@ -2028,10 +2072,14 @@ static DEFINE_SPINLOCK(nfs_access_lru_lock); static LIST_HEAD(nfs_access_lru_list); static atomic_long_t nfs_access_nr_entries; +static unsigned long nfs_access_max_cachesize = ULONG_MAX; +module_param(nfs_access_max_cachesize, ulong, 0644); +MODULE_PARM_DESC(nfs_access_max_cachesize, "NFS access maximum total cache length"); + static void nfs_access_free_entry(struct nfs_access_entry *entry) { put_rpccred(entry->cred); - kfree(entry); + kfree_rcu(entry, rcu_head); smp_mb__before_atomic(); atomic_long_dec(&nfs_access_nr_entries); smp_mb__after_atomic(); @@ -2048,19 +2096,14 @@ static void nfs_access_free_list(struct list_head *head) } } -unsigned long -nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc) +static unsigned long +nfs_do_access_cache_scan(unsigned int nr_to_scan) { LIST_HEAD(head); struct nfs_inode *nfsi, *next; struct nfs_access_entry *cache; - int nr_to_scan = sc->nr_to_scan; - gfp_t gfp_mask = sc->gfp_mask; long freed = 0; - if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL) - return SHRINK_STOP; - spin_lock(&nfs_access_lru_lock); list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) { struct inode *inode; @@ -2094,11 +2137,39 @@ remove_lru_entry: } unsigned long +nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc) +{ + int nr_to_scan = sc->nr_to_scan; + gfp_t gfp_mask = sc->gfp_mask; + + if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL) + return SHRINK_STOP; + return nfs_do_access_cache_scan(nr_to_scan); +} + + +unsigned long nfs_access_cache_count(struct shrinker *shrink, struct 
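/*
 * Factoring the scan loop into nfs_do_access_cache_scan() lets two
 * callers share it: the memory shrinker here, and the new hard cap.
 * nfs_access_max_cachesize is a writable module parameter;
 * nfs_access_cache_enforce_limit() below trims at most 100 entries per
 * call once the global entry count exceeds it, so the cache is pulled
 * back toward the limit incrementally instead of stalling one caller.
 */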
shrink_control *sc) { return vfs_pressure_ratio(atomic_long_read(&nfs_access_nr_entries)); } +static void +nfs_access_cache_enforce_limit(void) +{ + long nr_entries = atomic_long_read(&nfs_access_nr_entries); + unsigned long diff; + unsigned int nr_to_scan; + + if (nr_entries < 0 || nr_entries <= nfs_access_max_cachesize) + return; + nr_to_scan = 100; + diff = nr_entries - nfs_access_max_cachesize; + if (diff < nr_to_scan) + nr_to_scan = diff; + nfs_do_access_cache_scan(nr_to_scan); +} + static void __nfs_access_zap_cache(struct nfs_inode *nfsi, struct list_head *head) { struct rb_root *root_node = &nfsi->access_cache; @@ -2186,6 +2257,38 @@ out_zap: return -ENOENT; } +static int nfs_access_get_cached_rcu(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res) +{ + /* Only check the most recently returned cache entry, + * but do it without locking. + */ + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_access_entry *cache; + int err = -ECHILD; + struct list_head *lh; + + rcu_read_lock(); + if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS) + goto out; + lh = rcu_dereference(nfsi->access_cache_entry_lru.prev); + cache = list_entry(lh, struct nfs_access_entry, lru); + if (lh == &nfsi->access_cache_entry_lru || + cred != cache->cred) + cache = NULL; + if (cache == NULL) + goto out; + if (!nfs_have_delegated_attributes(inode) && + !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo)) + goto out; + res->jiffies = cache->jiffies; + res->cred = cache->cred; + res->mask = cache->mask; + err = 0; +out: + rcu_read_unlock(); + return err; +} + static void nfs_access_add_rbtree(struct inode *inode, struct nfs_access_entry *set) { struct nfs_inode *nfsi = NFS_I(inode); @@ -2229,6 +2332,11 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) cache->cred = get_rpccred(set->cred); cache->mask = set->mask; + /* The above field assignments must be visible + * before this item appears on the lru. We cannot easily + * use rcu_assign_pointer, so just force the memory barrier. 
+ */ + smp_wmb(); nfs_access_add_rbtree(inode, cache); /* Update accounting */ @@ -2244,6 +2352,7 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) &nfs_access_lru_list); spin_unlock(&nfs_access_lru_lock); } + nfs_access_cache_enforce_limit(); } EXPORT_SYMBOL_GPL(nfs_access_add_cache); @@ -2267,10 +2376,16 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) trace_nfs_access_enter(inode); - status = nfs_access_get_cached(inode, cred, &cache); + status = nfs_access_get_cached_rcu(inode, cred, &cache); + if (status != 0) + status = nfs_access_get_cached(inode, cred, &cache); if (status == 0) goto out_cached; + status = -ECHILD; + if (mask & MAY_NOT_BLOCK) + goto out; + /* Be clever: ask server to check for all possible rights */ cache.mask = MAY_EXEC | MAY_WRITE | MAY_READ; cache.cred = cred; @@ -2321,9 +2436,6 @@ int nfs_permission(struct inode *inode, int mask) struct rpc_cred *cred; int res = 0; - if (mask & MAY_NOT_BLOCK) - return -ECHILD; - nfs_inc_stats(inode, NFSIOS_VFSACCESS); if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) @@ -2350,12 +2462,23 @@ force_lookup: if (!NFS_PROTO(inode)->access) goto out_notsup; - cred = rpc_lookup_cred(); - if (!IS_ERR(cred)) { - res = nfs_do_access(inode, cred, mask); - put_rpccred(cred); - } else + /* Always try fast lookups first */ + rcu_read_lock(); + cred = rpc_lookup_cred_nonblock(); + if (!IS_ERR(cred)) + res = nfs_do_access(inode, cred, mask|MAY_NOT_BLOCK); + else res = PTR_ERR(cred); + rcu_read_unlock(); + if (res == -ECHILD && !(mask & MAY_NOT_BLOCK)) { + /* Fast lookup failed, try the slow way */ + cred = rpc_lookup_cred(); + if (!IS_ERR(cred)) { + res = nfs_do_access(inode, cred, mask); + put_rpccred(cred); + } else + res = PTR_ERR(cred); + } out: if (!res && (mask & MAY_EXEC) && !execute_ok(inode)) res = -EACCES; @@ -2364,6 +2487,9 @@ out: inode->i_sb->s_id, inode->i_ino, mask, res); return res; out_notsup: + if (mask & MAY_NOT_BLOCK) + return -ECHILD; + res = nfs_revalidate_inode(NFS_SERVER(inode), inode); if (res == 0) res = generic_permission(inode, mask); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index f11b9eed0de1..dda4b8667c02 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -148,8 +148,8 @@ static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq, { struct nfs_writeverf *verfp; - verfp = nfs_direct_select_verf(dreq, hdr->data->ds_clp, - hdr->data->ds_idx); + verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, + hdr->ds_idx); WARN_ON_ONCE(verfp->committed >= 0); memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf)); WARN_ON_ONCE(verfp->committed < 0); @@ -169,8 +169,8 @@ static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq, { struct nfs_writeverf *verfp; - verfp = nfs_direct_select_verf(dreq, hdr->data->ds_clp, - hdr->data->ds_idx); + verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, + hdr->ds_idx); if (verfp->committed < 0) { nfs_direct_set_hdr_verf(dreq, hdr); return 0; @@ -178,7 +178,6 @@ static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq, return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf)); } -#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) /* * nfs_direct_cmp_commit_data_verf - compare verifier for commit data * @dreq - direct request possibly spanning multiple servers @@ -197,7 +196,6 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq, WARN_ON_ONCE(verfp->committed < 0); return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf)); } -#endif /** * nfs_direct_IO - 
NFS address space operation for direct I/O @@ -576,7 +574,6 @@ out: return result; } -#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) { struct nfs_pageio_descriptor desc; @@ -700,22 +697,11 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode schedule_work(&dreq->work); /* Calls nfs_direct_write_schedule_work */ } -#else -static void nfs_direct_write_schedule_work(struct work_struct *work) -{ -} - -static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode) -{ - nfs_direct_complete(dreq, true); -} -#endif - static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) { struct nfs_direct_req *dreq = hdr->dreq; struct nfs_commit_info cinfo; - int bit = -1; + bool request_commit = false; struct nfs_page *req = nfs_list_entry(hdr->pages.next); if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) @@ -729,27 +715,20 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) dreq->flags = 0; dreq->error = hdr->error; } - if (dreq->error != 0) - bit = NFS_IOHDR_ERROR; - else { + if (dreq->error == 0) { dreq->count += hdr->good_bytes; - if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags)) { - dreq->flags = NFS_ODIRECT_RESCHED_WRITES; - bit = NFS_IOHDR_NEED_RESCHED; - } else if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) { + if (nfs_write_need_commit(hdr)) { if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) - bit = NFS_IOHDR_NEED_RESCHED; + request_commit = true; else if (dreq->flags == 0) { nfs_direct_set_hdr_verf(dreq, hdr); - bit = NFS_IOHDR_NEED_COMMIT; + request_commit = true; dreq->flags = NFS_ODIRECT_DO_COMMIT; } else if (dreq->flags == NFS_ODIRECT_DO_COMMIT) { - if (nfs_direct_set_or_cmp_hdr_verf(dreq, hdr)) { + request_commit = true; + if (nfs_direct_set_or_cmp_hdr_verf(dreq, hdr)) dreq->flags = NFS_ODIRECT_RESCHED_WRITES; - bit = NFS_IOHDR_NEED_RESCHED; - } else - bit = NFS_IOHDR_NEED_COMMIT; } } } @@ -759,9 +738,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) req = nfs_list_entry(hdr->pages.next); nfs_list_remove_request(req); - switch (bit) { - case NFS_IOHDR_NEED_RESCHED: - case NFS_IOHDR_NEED_COMMIT: + if (request_commit) { kref_get(&req->wb_kref); nfs_mark_request_commit(req, hdr->lseg, &cinfo); } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 4042ff58fe3f..4ea92ce0537f 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -36,6 +36,7 @@ #include "internal.h" #include "iostat.h" #include "fscache.h" +#include "pnfs.h" #include "nfstrace.h" @@ -327,6 +328,12 @@ static int nfs_want_read_modify_write(struct file *file, struct page *page, unsigned int offset = pos & (PAGE_CACHE_SIZE - 1); unsigned int end = offset + len; + if (pnfs_ld_read_whole_page(file->f_mapping->host)) { + if (!PageUptodate(page)) + return 1; + return 0; + } + if ((file->f_mode & FMODE_READ) && /* open for read? */ !PageUptodate(page) && /* Uptodate? */ !PagePrivate(page) && /* i/o request already? 
*/ @@ -361,8 +368,8 @@ start: * Prevent starvation issues if someone is doing a consistency * sync-to-disk */ - ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING, - nfs_wait_bit_killable, TASK_KILLABLE); + ret = wait_on_bit_action(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING, + nfs_wait_bit_killable, TASK_KILLABLE); if (ret) return ret; @@ -468,17 +475,26 @@ static int nfs_release_page(struct page *page, gfp_t gfp) dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); - /* Only do I/O if gfp is a superset of GFP_KERNEL, and we're not - * doing this memory reclaim for a fs-related allocation. + /* Always try to initiate a 'commit' if relevant, but only + * wait for it if __GFP_WAIT is set. Even then, only wait 1 + * second and only if the 'bdi' is not congested. + * Waiting indefinitely can cause deadlocks when the NFS + * server is on this machine, when a new TCP connection is + * needed and in other rare cases. There is no particular + * need to wait extensively here. A short wait has the + * benefit that someone else can worry about the freezer. */ - if (mapping && (gfp & GFP_KERNEL) == GFP_KERNEL && - !(current->flags & PF_FSTRANS)) { - int how = FLUSH_SYNC; - - /* Don't let kswapd deadlock waiting for OOM RPC calls */ - if (current_is_kswapd()) - how = 0; - nfs_commit_inode(mapping->host, how); + if (mapping) { + struct nfs_server *nfss = NFS_SERVER(mapping->host); + nfs_commit_inode(mapping->host, 0); + if ((gfp & __GFP_WAIT) && + !bdi_write_congested(&nfss->backing_dev_info)) { + wait_on_page_bit_killable_timeout(page, PG_private, + HZ); + if (PagePrivate(page)) + set_bdi_congested(&nfss->backing_dev_info, + BLK_RW_ASYNC); + } } /* If PagePrivate() is set, then the page is not freeable */ if (PagePrivate(page)) @@ -539,13 +555,25 @@ static int nfs_launder_page(struct page *page) static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, sector_t *span) { + int ret; + struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host); + *span = sis->pages; - return xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 1); + + rcu_read_lock(); + ret = xs_swapper(rcu_dereference(clnt->cl_xprt), 1); + rcu_read_unlock(); + + return ret; } static void nfs_swap_deactivate(struct file *file) { - xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 0); + struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host); + + rcu_read_lock(); + xs_swapper(rcu_dereference(clnt->cl_xprt), 0); + rcu_read_unlock(); } #endif @@ -891,17 +919,6 @@ int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) } EXPORT_SYMBOL_GPL(nfs_flock); -/* - * There is no protocol support for leases, so we have no way to implement - * them correctly in the face of opens by other clients. 
- */ -int nfs_setlease(struct file *file, long arg, struct file_lock **fl) -{ - dprintk("NFS: setlease(%pD2, arg=%ld)\n", file, arg); - return -EINVAL; -} -EXPORT_SYMBOL_GPL(nfs_setlease); - const struct file_operations nfs_file_operations = { .llseek = nfs_file_llseek, .read = new_sync_read, @@ -918,6 +935,6 @@ const struct file_operations nfs_file_operations = { .splice_read = nfs_file_splice_read, .splice_write = iter_file_splice_write, .check_flags = nfs_check_flags, - .setlease = nfs_setlease, + .setlease = simple_nosetlease, }; EXPORT_SYMBOL_GPL(nfs_file_operations); diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index d2eba1c13b7e..abc5056999d6 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -84,45 +84,37 @@ filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset) BUG(); } -static void filelayout_reset_write(struct nfs_pgio_data *data) +static void filelayout_reset_write(struct nfs_pgio_header *hdr) { - struct nfs_pgio_header *hdr = data->header; - struct rpc_task *task = &data->task; + struct rpc_task *task = &hdr->task; if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { dprintk("%s Reset task %5u for i/o through MDS " "(req %s/%llu, %u bytes @ offset %llu)\n", __func__, - data->task.tk_pid, + hdr->task.tk_pid, hdr->inode->i_sb->s_id, (unsigned long long)NFS_FILEID(hdr->inode), - data->args.count, - (unsigned long long)data->args.offset); + hdr->args.count, + (unsigned long long)hdr->args.offset); - task->tk_status = pnfs_write_done_resend_to_mds(hdr->inode, - &hdr->pages, - hdr->completion_ops, - hdr->dreq); + task->tk_status = pnfs_write_done_resend_to_mds(hdr); } } -static void filelayout_reset_read(struct nfs_pgio_data *data) +static void filelayout_reset_read(struct nfs_pgio_header *hdr) { - struct nfs_pgio_header *hdr = data->header; - struct rpc_task *task = &data->task; + struct rpc_task *task = &hdr->task; if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { dprintk("%s Reset task %5u for i/o through MDS " "(req %s/%llu, %u bytes @ offset %llu)\n", __func__, - data->task.tk_pid, + hdr->task.tk_pid, hdr->inode->i_sb->s_id, (unsigned long long)NFS_FILEID(hdr->inode), - data->args.count, - (unsigned long long)data->args.offset); + hdr->args.count, + (unsigned long long)hdr->args.offset); - task->tk_status = pnfs_read_done_resend_to_mds(hdr->inode, - &hdr->pages, - hdr->completion_ops, - hdr->dreq); + task->tk_status = pnfs_read_done_resend_to_mds(hdr); } } @@ -243,18 +235,17 @@ wait_on_recovery: /* NFS_PROTO call done callback routines */ static int filelayout_read_done_cb(struct rpc_task *task, - struct nfs_pgio_data *data) + struct nfs_pgio_header *hdr) { - struct nfs_pgio_header *hdr = data->header; int err; - trace_nfs4_pnfs_read(data, task->tk_status); - err = filelayout_async_handle_error(task, data->args.context->state, - data->ds_clp, hdr->lseg); + trace_nfs4_pnfs_read(hdr, task->tk_status); + err = filelayout_async_handle_error(task, hdr->args.context->state, + hdr->ds_clp, hdr->lseg); switch (err) { case -NFS4ERR_RESET_TO_MDS: - filelayout_reset_read(data); + filelayout_reset_read(hdr); return task->tk_status; case -EAGAIN: rpc_restart_call_prepare(task); @@ -270,15 +261,14 @@ static int filelayout_read_done_cb(struct rpc_task *task, * rfc5661 is not clear about which credential should be used. 
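(Aside, an illustrative sketch and not part of the patch: the mechanical theme of the filelayout hunks below is that struct nfs_pgio_data has been folded into struct nfs_pgio_header, so callbacks no longer hop through data->header. Before and after, with a hypothetical do_done() standing in for the real work:)

	/* Before the merge: two objects linked by a back-pointer. */
	static int done_old(struct rpc_task *task, struct nfs_pgio_data *data)
	{
		struct nfs_pgio_header *hdr = data->header;

		return do_done(task, hdr->inode, &data->args);
	}

	/* After the merge: one object carries both RPC args and header state. */
	static int done_new(struct rpc_task *task, struct nfs_pgio_header *hdr)
	{
		return do_done(task, hdr->inode, &hdr->args);
	}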
*/ static void -filelayout_set_layoutcommit(struct nfs_pgio_data *wdata) +filelayout_set_layoutcommit(struct nfs_pgio_header *hdr) { - struct nfs_pgio_header *hdr = wdata->header; if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds || - wdata->res.verf->committed == NFS_FILE_SYNC) + hdr->res.verf->committed != NFS_DATA_SYNC) return; - pnfs_set_layoutcommit(wdata); + pnfs_set_layoutcommit(hdr); dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); } @@ -305,83 +295,82 @@ filelayout_reset_to_mds(struct pnfs_layout_segment *lseg) */ static void filelayout_read_prepare(struct rpc_task *task, void *data) { - struct nfs_pgio_data *rdata = data; + struct nfs_pgio_header *hdr = data; - if (unlikely(test_bit(NFS_CONTEXT_BAD, &rdata->args.context->flags))) { + if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) { rpc_exit(task, -EIO); return; } - if (filelayout_reset_to_mds(rdata->header->lseg)) { + if (filelayout_reset_to_mds(hdr->lseg)) { dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid); - filelayout_reset_read(rdata); + filelayout_reset_read(hdr); rpc_exit(task, 0); return; } - rdata->pgio_done_cb = filelayout_read_done_cb; + hdr->pgio_done_cb = filelayout_read_done_cb; - if (nfs41_setup_sequence(rdata->ds_clp->cl_session, - &rdata->args.seq_args, - &rdata->res.seq_res, + if (nfs41_setup_sequence(hdr->ds_clp->cl_session, + &hdr->args.seq_args, + &hdr->res.seq_res, task)) return; - if (nfs4_set_rw_stateid(&rdata->args.stateid, rdata->args.context, - rdata->args.lock_context, FMODE_READ) == -EIO) + if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context, + hdr->args.lock_context, FMODE_READ) == -EIO) rpc_exit(task, -EIO); /* lost lock, terminate I/O */ } static void filelayout_read_call_done(struct rpc_task *task, void *data) { - struct nfs_pgio_data *rdata = data; + struct nfs_pgio_header *hdr = data; dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status); - if (test_bit(NFS_IOHDR_REDO, &rdata->header->flags) && + if (test_bit(NFS_IOHDR_REDO, &hdr->flags) && task->tk_status == 0) { - nfs41_sequence_done(task, &rdata->res.seq_res); + nfs41_sequence_done(task, &hdr->res.seq_res); return; } /* Note this may cause RPC to be resent */ - rdata->header->mds_ops->rpc_call_done(task, data); + hdr->mds_ops->rpc_call_done(task, data); } static void filelayout_read_count_stats(struct rpc_task *task, void *data) { - struct nfs_pgio_data *rdata = data; + struct nfs_pgio_header *hdr = data; - rpc_count_iostats(task, NFS_SERVER(rdata->header->inode)->client->cl_metrics); + rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics); } static void filelayout_read_release(void *data) { - struct nfs_pgio_data *rdata = data; - struct pnfs_layout_hdr *lo = rdata->header->lseg->pls_layout; + struct nfs_pgio_header *hdr = data; + struct pnfs_layout_hdr *lo = hdr->lseg->pls_layout; filelayout_fenceme(lo->plh_inode, lo); - nfs_put_client(rdata->ds_clp); - rdata->header->mds_ops->rpc_release(data); + nfs_put_client(hdr->ds_clp); + hdr->mds_ops->rpc_release(data); } static int filelayout_write_done_cb(struct rpc_task *task, - struct nfs_pgio_data *data) + struct nfs_pgio_header *hdr) { - struct nfs_pgio_header *hdr = data->header; int err; - trace_nfs4_pnfs_write(data, task->tk_status); - err = filelayout_async_handle_error(task, data->args.context->state, - data->ds_clp, hdr->lseg); + trace_nfs4_pnfs_write(hdr, task->tk_status); + err = filelayout_async_handle_error(task, hdr->args.context->state, + 
hdr->ds_clp, hdr->lseg); switch (err) { case -NFS4ERR_RESET_TO_MDS: - filelayout_reset_write(data); + filelayout_reset_write(hdr); return task->tk_status; case -EAGAIN: rpc_restart_call_prepare(task); return -EAGAIN; } - filelayout_set_layoutcommit(data); + filelayout_set_layoutcommit(hdr); return 0; } @@ -414,62 +403,65 @@ static int filelayout_commit_done_cb(struct rpc_task *task, return -EAGAIN; } + if (data->verf.committed == NFS_UNSTABLE) + pnfs_commit_set_layoutcommit(data); + return 0; } static void filelayout_write_prepare(struct rpc_task *task, void *data) { - struct nfs_pgio_data *wdata = data; + struct nfs_pgio_header *hdr = data; - if (unlikely(test_bit(NFS_CONTEXT_BAD, &wdata->args.context->flags))) { + if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) { rpc_exit(task, -EIO); return; } - if (filelayout_reset_to_mds(wdata->header->lseg)) { + if (filelayout_reset_to_mds(hdr->lseg)) { dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid); - filelayout_reset_write(wdata); + filelayout_reset_write(hdr); rpc_exit(task, 0); return; } - if (nfs41_setup_sequence(wdata->ds_clp->cl_session, - &wdata->args.seq_args, - &wdata->res.seq_res, + if (nfs41_setup_sequence(hdr->ds_clp->cl_session, + &hdr->args.seq_args, + &hdr->res.seq_res, task)) return; - if (nfs4_set_rw_stateid(&wdata->args.stateid, wdata->args.context, - wdata->args.lock_context, FMODE_WRITE) == -EIO) + if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context, + hdr->args.lock_context, FMODE_WRITE) == -EIO) rpc_exit(task, -EIO); /* lost lock, terminate I/O */ } static void filelayout_write_call_done(struct rpc_task *task, void *data) { - struct nfs_pgio_data *wdata = data; + struct nfs_pgio_header *hdr = data; - if (test_bit(NFS_IOHDR_REDO, &wdata->header->flags) && + if (test_bit(NFS_IOHDR_REDO, &hdr->flags) && task->tk_status == 0) { - nfs41_sequence_done(task, &wdata->res.seq_res); + nfs41_sequence_done(task, &hdr->res.seq_res); return; } /* Note this may cause RPC to be resent */ - wdata->header->mds_ops->rpc_call_done(task, data); + hdr->mds_ops->rpc_call_done(task, data); } static void filelayout_write_count_stats(struct rpc_task *task, void *data) { - struct nfs_pgio_data *wdata = data; + struct nfs_pgio_header *hdr = data; - rpc_count_iostats(task, NFS_SERVER(wdata->header->inode)->client->cl_metrics); + rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics); } static void filelayout_write_release(void *data) { - struct nfs_pgio_data *wdata = data; - struct pnfs_layout_hdr *lo = wdata->header->lseg->pls_layout; + struct nfs_pgio_header *hdr = data; + struct pnfs_layout_hdr *lo = hdr->lseg->pls_layout; filelayout_fenceme(lo->plh_inode, lo); - nfs_put_client(wdata->ds_clp); - wdata->header->mds_ops->rpc_release(data); + nfs_put_client(hdr->ds_clp); + hdr->mds_ops->rpc_release(data); } static void filelayout_commit_prepare(struct rpc_task *task, void *data) @@ -529,19 +521,18 @@ static const struct rpc_call_ops filelayout_commit_call_ops = { }; static enum pnfs_try_status -filelayout_read_pagelist(struct nfs_pgio_data *data) +filelayout_read_pagelist(struct nfs_pgio_header *hdr) { - struct nfs_pgio_header *hdr = data->header; struct pnfs_layout_segment *lseg = hdr->lseg; struct nfs4_pnfs_ds *ds; struct rpc_clnt *ds_clnt; - loff_t offset = data->args.offset; + loff_t offset = hdr->args.offset; u32 j, idx; struct nfs_fh *fh; dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n", __func__, hdr->inode->i_ino, - data->args.pgbase, (size_t)data->args.count, offset); + hdr->args.pgbase, 
(size_t)hdr->args.count, offset); /* Retrieve the correct rpc_client for the byte range */ j = nfs4_fl_calc_j_index(lseg, offset); @@ -559,30 +550,29 @@ filelayout_read_pagelist(struct nfs_pgio_data *data) /* No multipath support. Use first DS */ atomic_inc(&ds->ds_clp->cl_count); - data->ds_clp = ds->ds_clp; - data->ds_idx = idx; + hdr->ds_clp = ds->ds_clp; + hdr->ds_idx = idx; fh = nfs4_fl_select_ds_fh(lseg, j); if (fh) - data->args.fh = fh; + hdr->args.fh = fh; - data->args.offset = filelayout_get_dserver_offset(lseg, offset); - data->mds_offset = offset; + hdr->args.offset = filelayout_get_dserver_offset(lseg, offset); + hdr->mds_offset = offset; /* Perform an asynchronous read to ds */ - nfs_initiate_pgio(ds_clnt, data, + nfs_initiate_pgio(ds_clnt, hdr, &filelayout_read_call_ops, 0, RPC_TASK_SOFTCONN); return PNFS_ATTEMPTED; } /* Perform async writes. */ static enum pnfs_try_status -filelayout_write_pagelist(struct nfs_pgio_data *data, int sync) +filelayout_write_pagelist(struct nfs_pgio_header *hdr, int sync) { - struct nfs_pgio_header *hdr = data->header; struct pnfs_layout_segment *lseg = hdr->lseg; struct nfs4_pnfs_ds *ds; struct rpc_clnt *ds_clnt; - loff_t offset = data->args.offset; + loff_t offset = hdr->args.offset; u32 j, idx; struct nfs_fh *fh; @@ -598,21 +588,20 @@ filelayout_write_pagelist(struct nfs_pgio_data *data, int sync) return PNFS_NOT_ATTEMPTED; dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s cl_count %d\n", - __func__, hdr->inode->i_ino, sync, (size_t) data->args.count, + __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count, offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count)); - data->pgio_done_cb = filelayout_write_done_cb; + hdr->pgio_done_cb = filelayout_write_done_cb; atomic_inc(&ds->ds_clp->cl_count); - data->ds_clp = ds->ds_clp; - data->ds_idx = idx; + hdr->ds_clp = ds->ds_clp; + hdr->ds_idx = idx; fh = nfs4_fl_select_ds_fh(lseg, j); if (fh) - data->args.fh = fh; - - data->args.offset = filelayout_get_dserver_offset(lseg, offset); + hdr->args.fh = fh; + hdr->args.offset = filelayout_get_dserver_offset(lseg, offset); /* Perform an asynchronous write */ - nfs_initiate_pgio(ds_clnt, data, + nfs_initiate_pgio(ds_clnt, hdr, &filelayout_write_call_ops, sync, RPC_TASK_SOFTCONN); return PNFS_ATTEMPTED; @@ -660,18 +649,15 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, } /* find and reference the deviceid */ - d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld, - NFS_SERVER(lo->plh_inode)->nfs_client, id); - if (d == NULL) { - dsaddr = filelayout_get_device_info(lo->plh_inode, id, - lo->plh_lc_cred, gfp_flags); - if (dsaddr == NULL) - goto out; - } else - dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node); + d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), id, + lo->plh_lc_cred, gfp_flags); + if (d == NULL) + goto out; + + dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node); /* Found deviceid is unavailable */ if (filelayout_test_devid_unavailable(&dsaddr->id_node)) - goto out_put; + goto out_put; fl->dsaddr = dsaddr; @@ -1023,6 +1009,7 @@ static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j) /* The generic layer is about to remove the req from the commit list. * If this will make the bucket empty, it will need to put the lseg reference. 
+ * Note this must be called holding the inode (/cinfo) lock */ static void filelayout_clear_request_commit(struct nfs_page *req, @@ -1030,7 +1017,6 @@ filelayout_clear_request_commit(struct nfs_page *req, { struct pnfs_layout_segment *freeme = NULL; - spin_lock(cinfo->lock); if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags)) goto out; cinfo->ds->nwritten--; @@ -1045,22 +1031,25 @@ filelayout_clear_request_commit(struct nfs_page *req, } out: nfs_request_remove_commit_list(req, cinfo); - spin_unlock(cinfo->lock); - pnfs_put_lseg(freeme); + pnfs_put_lseg_async(freeme); } -static struct list_head * -filelayout_choose_commit_list(struct nfs_page *req, - struct pnfs_layout_segment *lseg, - struct nfs_commit_info *cinfo) +static void +filelayout_mark_request_commit(struct nfs_page *req, + struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo) + { struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); u32 i, j; struct list_head *list; struct pnfs_commit_bucket *buckets; - if (fl->commit_through_mds) - return &cinfo->mds->list; + if (fl->commit_through_mds) { + list = &cinfo->mds->list; + spin_lock(cinfo->lock); + goto mds_commit; + } /* Note that we are calling nfs4_fl_calc_j_index on each page * that ends up being committed to a data server. An attractive @@ -1084,19 +1073,22 @@ filelayout_choose_commit_list, } set_bit(PG_COMMIT_TO_DS, &req->wb_flags); cinfo->ds->nwritten++; - spin_unlock(cinfo->lock); - return list; -} - -static void -filelayout_mark_request_commit(struct nfs_page *req, - struct pnfs_layout_segment *lseg, - struct nfs_commit_info *cinfo) -{ - struct list_head *list; - list = filelayout_choose_commit_list(req, lseg, cinfo); - nfs_request_add_commit_list(req, list, cinfo); +mds_commit: + /* nfs_request_add_commit_list(). We need to add req to list without + * dropping cinfo lock. + */ + set_bit(PG_CLEAN, &(req)->wb_flags); + nfs_list_add_request(req, list); + cinfo->mds->ncommit++; + spin_unlock(cinfo->lock); + if (!cinfo->dreq) { + inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); + inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info, + BDI_RECLAIMABLE); + __mark_inode_dirty(req->wb_context->dentry->d_inode, + I_DIRTY_DATASYNC); + } } static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i) @@ -1244,15 +1236,64 @@ restart: spin_unlock(cinfo->lock); } +/* filelayout_search_commit_reqs - Search lists in @cinfo for the head request + * for @page + * @cinfo - commit info for current inode + * @page - page to search for matching head request + * + * Returns the head request if one is found, otherwise returns NULL. 
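(Aside, a sketch for context and not part of the patch: the search and retry helpers added below walk per-stripe commit buckets. A DS-bound request's bucket is derived from its file offset, roughly as follows, using the helpers named in the hunks above; req_offset() is the generic accessor for a request's file offset.)

	static struct pnfs_commit_bucket *
	bucket_for_req_sketch(struct pnfs_layout_segment *lseg,
			      struct nfs_commit_info *cinfo,
			      struct nfs_page *req)
	{
		/* Stripe index from the offset, then bucket from the stripe. */
		u32 j = nfs4_fl_calc_j_index(lseg, req_offset(req));
		u32 i = select_bucket_index(FILELAYOUT_LSEG(lseg), j);

		return &cinfo->ds->buckets[i];
	}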
+ */ +static struct nfs_page * +filelayout_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page) +{ + struct nfs_page *freq, *t; + struct pnfs_commit_bucket *b; + int i; + + /* Linearly search the commit lists for each bucket until a matching + * request is found */ + for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) { + list_for_each_entry_safe(freq, t, &b->written, wb_list) { + if (freq->wb_page == page) + return freq->wb_head; + } + list_for_each_entry_safe(freq, t, &b->committing, wb_list) { + if (freq->wb_page == page) + return freq->wb_head; + } + } + + return NULL; +} + +static void filelayout_retry_commit(struct nfs_commit_info *cinfo, int idx) +{ + struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; + struct pnfs_commit_bucket *bucket; + struct pnfs_layout_segment *freeme; + int i; + + for (i = idx; i < fl_cinfo->nbuckets; i++) { + bucket = &fl_cinfo->buckets[i]; + if (list_empty(&bucket->committing)) + continue; + nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo); + spin_lock(cinfo->lock); + freeme = bucket->clseg; + bucket->clseg = NULL; + spin_unlock(cinfo->lock); + pnfs_put_lseg(freeme); + } +} + static unsigned int alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list) { struct pnfs_ds_commit_info *fl_cinfo; struct pnfs_commit_bucket *bucket; struct nfs_commit_data *data; - int i, j; + int i; unsigned int nreq = 0; - struct pnfs_layout_segment *freeme; fl_cinfo = cinfo->ds; bucket = fl_cinfo->buckets; @@ -1272,16 +1313,7 @@ alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list) } /* Clean up on error */ - for (j = i; j < fl_cinfo->nbuckets; j++, bucket++) { - if (list_empty(&bucket->committing)) - continue; - nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo); - spin_lock(cinfo->lock); - freeme = bucket->clseg; - bucket->clseg = NULL; - spin_unlock(cinfo->lock); - pnfs_put_lseg(freeme); - } + filelayout_retry_commit(cinfo, i); /* Caller will clean up entries put on list */ return nreq; } @@ -1301,8 +1333,12 @@ filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages, data->lseg = NULL; list_add(&data->pages, &list); nreq++; - } else + } else { nfs_retry_commit(mds_pages, NULL, cinfo); + filelayout_retry_commit(cinfo, 0); + cinfo->completion_ops->error_cleanup(NFS_I(inode)); + return -ENOMEM; + } } nreq += alloc_ds_commits(cinfo, &list); @@ -1332,6 +1368,17 @@ out: cinfo->ds->ncommitting = 0; return PNFS_ATTEMPTED; } +static struct nfs4_deviceid_node * +filelayout_alloc_deviceid_node(struct nfs_server *server, + struct pnfs_device *pdev, gfp_t gfp_flags) +{ + struct nfs4_file_layout_dsaddr *dsaddr; + + dsaddr = nfs4_fl_alloc_deviceid_node(server, pdev, gfp_flags); + if (!dsaddr) + return NULL; + return &dsaddr->id_node; +} static void filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d) @@ -1380,9 +1427,11 @@ static struct pnfs_layoutdriver_type filelayout_type = { .clear_request_commit = filelayout_clear_request_commit, .scan_commit_lists = filelayout_scan_commit_lists, .recover_commit_reqs = filelayout_recover_commit_reqs, + .search_commit_reqs = filelayout_search_commit_reqs, .commit_pagelist = filelayout_commit_pagelist, .read_pagelist = filelayout_read_pagelist, .write_pagelist = filelayout_write_pagelist, + .alloc_deviceid_node = filelayout_alloc_deviceid_node, .free_deviceid_node = filelayout_free_deveiceid_node, }; diff --git a/fs/nfs/filelayout/filelayout.h b/fs/nfs/filelayout/filelayout.h index ffbddf2219ea..7c9f800c49d7 100644 --- 
a/fs/nfs/filelayout/filelayout.h +++ b/fs/nfs/filelayout/filelayout.h @@ -147,10 +147,11 @@ u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset); u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j); struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx); + +extern struct nfs4_file_layout_dsaddr * +nfs4_fl_alloc_deviceid_node(struct nfs_server *server, + struct pnfs_device *pdev, gfp_t gfp_flags); extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); -struct nfs4_file_layout_dsaddr * -filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, - struct rpc_cred *cred, gfp_t gfp_flags); #endif /* FS_NFS_NFS4FILELAYOUT_H */ diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c index 44bf0140a4c7..9bb806a76d99 100644 --- a/fs/nfs/filelayout/filelayoutdev.c +++ b/fs/nfs/filelayout/filelayoutdev.c @@ -484,8 +484,9 @@ out_err: } /* Decode opaque device data and return the result */ -static struct nfs4_file_layout_dsaddr* -decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags) +struct nfs4_file_layout_dsaddr * +nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, + gfp_t gfp_flags) { int i; u32 cnt, num; @@ -570,10 +571,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags) dsaddr->stripe_indices = stripe_indices; stripe_indices = NULL; dsaddr->ds_num = num; - nfs4_init_deviceid_node(&dsaddr->id_node, - NFS_SERVER(ino)->pnfs_curr_ld, - NFS_SERVER(ino)->nfs_client, - &pdev->dev_id); + nfs4_init_deviceid_node(&dsaddr->id_node, server, &pdev->dev_id); INIT_LIST_HEAD(&dsaddrs); @@ -587,7 +585,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags) mp_count = be32_to_cpup(p); /* multipath count */ for (j = 0; j < mp_count; j++) { - da = decode_ds_addr(NFS_SERVER(ino)->nfs_client->cl_net, + da = decode_ds_addr(server->nfs_client->cl_net, &stream, gfp_flags); if (da) list_add_tail(&da->da_node, &dsaddrs); @@ -637,102 +635,6 @@ out_err: return NULL; } -/* - * Decode the opaque device specified in 'dev' and add it to the cache of - * available devices. - */ -static struct nfs4_file_layout_dsaddr * -decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_flags) -{ - struct nfs4_deviceid_node *d; - struct nfs4_file_layout_dsaddr *n, *new; - - new = decode_device(inode, dev, gfp_flags); - if (!new) { - printk(KERN_WARNING "NFS: %s: Could not decode or add device\n", - __func__); - return NULL; - } - - d = nfs4_insert_deviceid_node(&new->id_node); - n = container_of(d, struct nfs4_file_layout_dsaddr, id_node); - if (n != new) { - nfs4_fl_free_deviceid(new); - return n; - } - - return new; -} - -/* - * Retrieve the information for dev_id, add it to the list - * of available devices, and return it. 
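(Aside, a sketch and not part of the patch: filelayout_get_device_info() below is removed because the generic deviceid cache now owns the GETDEVICEINFO round trip and calls back into the layout driver through the new ->alloc_deviceid_node() hook. The resulting control flow is roughly the following; cache_lookup(), cache_insert() and fetch_pdev() are hypothetical stand-ins for the cache internals.)

	static struct nfs4_deviceid_node *
	find_get_deviceid_sketch(struct nfs_server *server,
				 const struct nfs4_deviceid *id,
				 struct rpc_cred *cred, gfp_t gfp_flags)
	{
		struct nfs4_deviceid_node *d = cache_lookup(server, id);

		if (d)
			return d;
		/* Cache miss: fetch the opaque device data, then let the
		 * layout driver decode it via its callback. */
		d = server->pnfs_curr_ld->alloc_deviceid_node(server,
				fetch_pdev(server, id, cred, gfp_flags),
				gfp_flags);
		return d ? cache_insert(d) : NULL;
	}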
- */ -struct nfs4_file_layout_dsaddr * -filelayout_get_device_info(struct inode *inode, - struct nfs4_deviceid *dev_id, - struct rpc_cred *cred, - gfp_t gfp_flags) -{ - struct pnfs_device *pdev = NULL; - u32 max_resp_sz; - int max_pages; - struct page **pages = NULL; - struct nfs4_file_layout_dsaddr *dsaddr = NULL; - int rc, i; - struct nfs_server *server = NFS_SERVER(inode); - - /* - * Use the session max response size as the basis for setting - * GETDEVICEINFO's maxcount - */ - max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; - max_pages = nfs_page_array_len(0, max_resp_sz); - dprintk("%s inode %p max_resp_sz %u max_pages %d\n", - __func__, inode, max_resp_sz, max_pages); - - pdev = kzalloc(sizeof(struct pnfs_device), gfp_flags); - if (pdev == NULL) - return NULL; - - pages = kzalloc(max_pages * sizeof(struct page *), gfp_flags); - if (pages == NULL) { - kfree(pdev); - return NULL; - } - for (i = 0; i < max_pages; i++) { - pages[i] = alloc_page(gfp_flags); - if (!pages[i]) - goto out_free; - } - - memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id)); - pdev->layout_type = LAYOUT_NFSV4_1_FILES; - pdev->pages = pages; - pdev->pgbase = 0; - pdev->pglen = max_resp_sz; - pdev->mincount = 0; - pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead; - - rc = nfs4_proc_getdeviceinfo(server, pdev, cred); - dprintk("%s getdevice info returns %d\n", __func__, rc); - if (rc) - goto out_free; - - /* - * Found new device, need to decode it and then add it to the - * list of known devices for this mountpoint. - */ - dsaddr = decode_and_add_device(inode, pdev, gfp_flags); -out_free: - for (i = 0; i < max_pages; i++) - __free_page(pages[i]); - kfree(pages); - kfree(pdev); - dprintk("<-- %s dsaddr %p\n", __func__, dsaddr); - return dsaddr; -} - void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) { @@ -783,8 +685,8 @@ nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j) static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds) { might_sleep(); - wait_on_bit(&ds->ds_state, NFS4DS_CONNECTING, - nfs_wait_bit_killable, TASK_KILLABLE); + wait_on_bit_action(&ds->ds_state, NFS4DS_CONNECTING, + nfs_wait_bit_killable, TASK_KILLABLE); } static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds) diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c index 7cf2c4699b08..777b055063f6 100644 --- a/fs/nfs/fscache-index.c +++ b/fs/nfs/fscache-index.c @@ -74,11 +74,10 @@ static uint16_t nfs_server_get_key(const void *cookie_netfs_data, struct nfs_server_key *key = buffer; uint16_t len = sizeof(struct nfs_server_key); + memset(key, 0, len); key->nfsversion = clp->rpc_ops->version; key->family = clp->cl_addr.ss_family; - memset(key, 0, len); - switch (clp->cl_addr.ss_family) { case AF_INET: key->port = sin->sin_port; diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index b94f80420a58..880618a8b048 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -112,7 +112,7 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh, * if the dentry tree reaches them; however if the dentry already * exists, we'll pick it up at this point and use it as the root */ - ret = d_obtain_alias(inode); + ret = d_obtain_root(inode); if (IS_ERR(ret)) { dprintk("nfs_get_root: get root dentry failed\n"); goto out; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 9927913c97c2..141c9f4a40de 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -75,7 +75,7 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr) * nfs_wait_bit_killable - helper for functions that are sleeping on bit locks * 
@word: long word containing the bit lock */ -int nfs_wait_bit_killable(void *word) +int nfs_wait_bit_killable(struct wait_bit_key *key) { if (fatal_signal_pending(current)) return -ERESTARTSYS; @@ -505,7 +505,9 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) attr->ia_valid &= ~ATTR_MODE; if (attr->ia_valid & ATTR_SIZE) { - if (!S_ISREG(inode->i_mode) || attr->ia_size == i_size_read(inode)) + BUG_ON(!S_ISREG(inode->i_mode)); + + if (attr->ia_size == i_size_read(inode)) attr->ia_valid &= ~ATTR_SIZE; } @@ -1002,6 +1004,15 @@ int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) } EXPORT_SYMBOL_GPL(nfs_revalidate_inode); +int nfs_revalidate_inode_rcu(struct nfs_server *server, struct inode *inode) +{ + if (!(NFS_I(inode)->cache_validity & + (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL)) + && !nfs_attribute_cache_expired(inode)) + return NFS_STALE(inode) ? -ESTALE : 0; + return -ECHILD; +} + static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping) { struct nfs_inode *nfsi = NFS_I(inode); @@ -1074,8 +1085,8 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) * the bit lock here if it looks like we're going to be doing that. */ for (;;) { - ret = wait_on_bit(bitlock, NFS_INO_INVALIDATING, - nfs_wait_bit_killable, TASK_KILLABLE); + ret = wait_on_bit_action(bitlock, NFS_INO_INVALIDATING, + nfs_wait_bit_killable, TASK_KILLABLE); if (ret) goto out; spin_lock(&inode->i_lock); @@ -1840,11 +1851,12 @@ EXPORT_SYMBOL_GPL(nfs_net_id); static int nfs_net_init(struct net *net) { nfs_clients_init(net); - return 0; + return nfs_fs_proc_net_init(net); } static void nfs_net_exit(struct net *net) { + nfs_fs_proc_net_exit(net); nfs_cleanup_cb_ident_idr(net); } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index f415cbf9f6c3..efaa31c70fbe 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -195,7 +195,16 @@ extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *, #ifdef CONFIG_PROC_FS extern int __init nfs_fs_proc_init(void); extern void nfs_fs_proc_exit(void); +extern int nfs_fs_proc_net_init(struct net *net); +extern void nfs_fs_proc_net_exit(struct net *net); #else +static inline int nfs_fs_proc_net_init(struct net *net) +{ + return 0; +} +static inline void nfs_fs_proc_net_exit(struct net *net) +{ +} static inline int nfs_fs_proc_init(void) { return 0; @@ -209,13 +218,6 @@ static inline void nfs_fs_proc_exit(void) int nfs_sockaddr_match_ipaddr(const struct sockaddr *, const struct sockaddr *); #endif -/* nfs3client.c */ -#if IS_ENABLED(CONFIG_NFS_V3) -struct nfs_server *nfs3_create_server(struct nfs_mount_info *, struct nfs_subversion *); -struct nfs_server *nfs3_clone_server(struct nfs_server *, struct nfs_fh *, - struct nfs_fattr *, rpc_authflavor_t); -#endif - /* callback_xdr.c */ extern struct svc_version nfs4_callback_version1; extern struct svc_version nfs4_callback_version4; @@ -238,11 +240,11 @@ void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos); int nfs_iocounter_wait(struct nfs_io_counter *c); extern const struct nfs_pageio_ops nfs_pgio_rw_ops; -struct nfs_rw_header *nfs_rw_header_alloc(const struct nfs_rw_ops *); -void nfs_rw_header_free(struct nfs_pgio_header *); -void nfs_pgio_data_release(struct nfs_pgio_data *); +struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *); +void nfs_pgio_header_free(struct nfs_pgio_header *); +void nfs_pgio_data_destroy(struct nfs_pgio_header *); int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct 
nfs_pgio_header *); -int nfs_initiate_pgio(struct rpc_clnt *, struct nfs_pgio_data *, +int nfs_initiate_pgio(struct rpc_clnt *, struct nfs_pgio_header *, const struct rpc_call_ops *, int, int); void nfs_free_request(struct nfs_page *req); @@ -337,7 +339,6 @@ int nfs_file_release(struct inode *, struct file *); int nfs_lock(struct file *, int, struct file_lock *); int nfs_flock(struct file *, int, struct file_lock *); int nfs_check_flags(int); -int nfs_setlease(struct file *, long, struct file_lock **); /* inode.c */ extern struct workqueue_struct *nfsiod_workqueue; @@ -348,7 +349,7 @@ extern int nfs_drop_inode(struct inode *); extern void nfs_clear_inode(struct inode *); extern void nfs_evict_inode(struct inode *); void nfs_zap_acl_cache(struct inode *inode); -extern int nfs_wait_bit_killable(void *word); +extern int nfs_wait_bit_killable(struct wait_bit_key *key); /* super.c */ extern const struct super_operations nfs_sops; @@ -442,6 +443,7 @@ int nfs_scan_commit(struct inode *inode, struct list_head *dst, void nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, struct nfs_commit_info *cinfo); +int nfs_write_need_commit(struct nfs_pgio_header *); int nfs_generic_commit_list(struct inode *inode, struct list_head *head, int how, struct nfs_commit_info *cinfo); void nfs_retry_commit(struct list_head *page_list, @@ -482,7 +484,7 @@ static inline void nfs_inode_dio_wait(struct inode *inode) extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq); /* nfs4proc.c */ -extern void __nfs4_read_done_cb(struct nfs_pgio_data *); +extern void __nfs4_read_done_cb(struct nfs_pgio_header *); extern struct nfs_client *nfs4_init_client(struct nfs_client *clp, const struct rpc_timeout *timeparms, const char *ip_addr); diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h index 8ee1fab83268..ef221fb8a183 100644 --- a/fs/nfs/netns.h +++ b/fs/nfs/netns.h @@ -29,6 +29,9 @@ struct nfs_net { #endif spinlock_t nfs_client_lock; struct timespec boot_time; +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *proc_nfsfs; +#endif }; extern int nfs_net_id; diff --git a/fs/nfs/nfs3_fs.h b/fs/nfs/nfs3_fs.h new file mode 100644 index 000000000000..333ae4068506 --- /dev/null +++ b/fs/nfs/nfs3_fs.h @@ -0,0 +1,34 @@ +/* + * Copyright (C) 2014 Anna Schumaker. 
+ * + * NFSv3-specific filesystem definitions and declarations + */ +#ifndef __LINUX_FS_NFS_NFS3_FS_H +#define __LINUX_FS_NFS_NFS3_FS_H + +/* + * nfs3acl.c + */ +#ifdef CONFIG_NFS_V3_ACL +extern struct posix_acl *nfs3_get_acl(struct inode *inode, int type); +extern int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type); +extern int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, + struct posix_acl *dfacl); +extern ssize_t nfs3_listxattr(struct dentry *, char *, size_t); +extern const struct xattr_handler *nfs3_xattr_handlers[]; +#else +static inline int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, + struct posix_acl *dfacl) +{ + return 0; +} +#define nfs3_listxattr NULL +#endif /* CONFIG_NFS_V3_ACL */ + +/* nfs3client.c */ +struct nfs_server *nfs3_create_server(struct nfs_mount_info *, struct nfs_subversion *); +struct nfs_server *nfs3_clone_server(struct nfs_server *, struct nfs_fh *, + struct nfs_fattr *, rpc_authflavor_t); + + +#endif /* __LINUX_FS_NFS_NFS3_FS_H */ diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 8f854dde4150..658e586ca438 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -7,6 +7,7 @@ #include <linux/nfsacl.h> #include "internal.h" +#include "nfs3_fs.h" #define NFSDBG_FACILITY NFSDBG_PROC @@ -129,7 +130,10 @@ static int __nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, .rpc_argp = &args, .rpc_resp = &fattr, }; - int status; + int status = 0; + + if (acl == NULL && (!S_ISDIR(inode->i_mode) || dfacl == NULL)) + goto out; status = -EOPNOTSUPP; if (!nfs_server_capable(inode, NFS_CAP_ACLS)) @@ -256,7 +260,7 @@ nfs3_list_one_acl(struct inode *inode, int type, const char *name, void *data, char *p = data + *result; acl = get_acl(inode, type); - if (!acl) + if (IS_ERR_OR_NULL(acl)) return 0; posix_acl_release(acl); diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c index b3fc65ef39ca..8c1b437c5403 100644 --- a/fs/nfs/nfs3client.c +++ b/fs/nfs/nfs3client.c @@ -1,6 +1,7 @@ #include <linux/nfs_fs.h> #include <linux/nfs_mount.h> #include "internal.h" +#include "nfs3_fs.h" #ifdef CONFIG_NFS_V3_ACL static struct rpc_stat nfsacl_rpcstat = { &nfsacl_program }; diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index f0afa291fd58..524f9f837408 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -22,6 +22,7 @@ #include "iostat.h" #include "internal.h" +#include "nfs3_fs.h" #define NFSDBG_FACILITY NFSDBG_PROC @@ -795,41 +796,44 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, return status; } -static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_data *data) +static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr) { - struct inode *inode = data->header->inode; + struct inode *inode = hdr->inode; if (nfs3_async_handle_jukebox(task, inode)) return -EAGAIN; nfs_invalidate_atime(inode); - nfs_refresh_inode(inode, &data->fattr); + nfs_refresh_inode(inode, &hdr->fattr); return 0; } -static void nfs3_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg) +static void nfs3_proc_read_setup(struct nfs_pgio_header *hdr, + struct rpc_message *msg) { msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ]; } -static int nfs3_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data) +static int nfs3_proc_pgio_rpc_prepare(struct rpc_task *task, + struct nfs_pgio_header *hdr) { rpc_call_start(task); return 0; } -static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_data *data) +static int nfs3_write_done(struct rpc_task *task, struct 
nfs_pgio_header *hdr) { - struct inode *inode = data->header->inode; + struct inode *inode = hdr->inode; if (nfs3_async_handle_jukebox(task, inode)) return -EAGAIN; if (task->tk_status >= 0) - nfs_post_op_update_inode_force_wcc(inode, data->res.fattr); + nfs_post_op_update_inode_force_wcc(inode, hdr->res.fattr); return 0; } -static void nfs3_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg) +static void nfs3_proc_write_setup(struct nfs_pgio_header *hdr, + struct rpc_message *msg) { msg->rpc_proc = &nfs3_procedures[NFS3PROC_WRITE]; } diff --git a/fs/nfs/nfs3super.c b/fs/nfs/nfs3super.c index d6a98949af19..6af29c2da352 100644 --- a/fs/nfs/nfs3super.c +++ b/fs/nfs/nfs3super.c @@ -4,6 +4,7 @@ #include <linux/module.h> #include <linux/nfs_fs.h> #include "internal.h" +#include "nfs3_fs.h" #include "nfs.h" static struct nfs_subversion nfs_v3 = { diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index ba2affa51941..a8b855ab4e22 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -54,7 +54,7 @@ struct nfs4_minor_version_ops { const nfs4_stateid *); int (*find_root_sec)(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); - int (*free_lock_state)(struct nfs_server *, + void (*free_lock_state)(struct nfs_server *, struct nfs4_lock_state *); const struct rpc_call_ops *call_sync_ops; const struct nfs4_state_recovery_ops *reboot_recovery_ops; @@ -129,17 +129,6 @@ enum { * LOCK: one nfs4_state (LOCK) to hold the lock stateid nfs4_state(OPEN) */ -struct nfs4_lock_owner { - unsigned int lo_type; -#define NFS4_ANY_LOCK_TYPE (0U) -#define NFS4_FLOCK_LOCK_TYPE (1U << 0) -#define NFS4_POSIX_LOCK_TYPE (1U << 1) - union { - fl_owner_t posix_owner; - pid_t flock_owner; - } lo_u; -}; - struct nfs4_lock_state { struct list_head ls_locks; /* Other lock stateids */ struct nfs4_state * ls_state; /* Pointer to open state */ @@ -149,7 +138,7 @@ struct nfs4_lock_state { struct nfs_seqid_counter ls_seqid; nfs4_stateid ls_stateid; atomic_t ls_count; - struct nfs4_lock_owner ls_owner; + fl_owner_t ls_owner; }; /* bits for nfs4_state->flags */ @@ -337,11 +326,11 @@ nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_mode, */ static inline void nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp, - struct rpc_message *msg, struct nfs_pgio_data *wdata) + struct rpc_message *msg, struct nfs_pgio_header *hdr) { if (_nfs4_state_protect(clp, NFS_SP4_MACH_CRED_WRITE, clntp, msg) && !test_bit(NFS_SP4_MACH_CRED_COMMIT, &clp->cl_sp4_flags)) - wdata->args.stable = NFS_FILE_SYNC; + hdr->args.stable = NFS_FILE_SYNC; } #else /* CONFIG_NFS_v4_1 */ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) @@ -369,7 +358,7 @@ nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_flags, static inline void nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp, - struct rpc_message *msg, struct nfs_pgio_data *wdata) + struct rpc_message *msg, struct nfs_pgio_header *hdr) { } #endif /* CONFIG_NFS_V4_1 */ diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index aa9ef4876046..ffdb28d86cf8 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -482,6 +482,16 @@ int nfs40_walk_client_list(struct nfs_client *new, spin_lock(&nn->nfs_client_lock); list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) { + + if (pos->rpc_ops != new->rpc_ops) + continue; + + if (pos->cl_proto != new->cl_proto) + continue; + + if (pos->cl_minorversion != new->cl_minorversion) + continue; + /* If "pos" isn't marked ready, we can't trust the * 
remaining fields in "pos" */ if (pos->cl_cons_state > NFS_CS_READY) { @@ -501,15 +511,6 @@ int nfs40_walk_client_list(struct nfs_client *new, if (pos->cl_cons_state != NFS_CS_READY) continue; - if (pos->rpc_ops != new->rpc_ops) - continue; - - if (pos->cl_proto != new->cl_proto) - continue; - - if (pos->cl_minorversion != new->cl_minorversion) - continue; - if (pos->cl_clientid != new->cl_clientid) continue; @@ -622,6 +623,16 @@ int nfs41_walk_client_list(struct nfs_client *new, spin_lock(&nn->nfs_client_lock); list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) { + + if (pos->rpc_ops != new->rpc_ops) + continue; + + if (pos->cl_proto != new->cl_proto) + continue; + + if (pos->cl_minorversion != new->cl_minorversion) + continue; + /* If "pos" isn't marked ready, we can't trust the * remaining fields in "pos", especially the client * ID and serverowner fields. Wait for CREATE_SESSION @@ -647,15 +658,6 @@ int nfs41_walk_client_list(struct nfs_client *new, if (pos->cl_cons_state != NFS_CS_READY) continue; - if (pos->rpc_ops != new->rpc_ops) - continue; - - if (pos->cl_proto != new->cl_proto) - continue; - - if (pos->cl_minorversion != new->cl_minorversion) - continue; - if (!nfs4_match_clientids(pos, new)) continue; @@ -855,6 +857,11 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, }; struct rpc_timeout ds_timeout; struct nfs_client *clp; + char buf[INET6_ADDRSTRLEN + 1]; + + if (rpc_ntop(ds_addr, buf, sizeof(buf)) <= 0) + return ERR_PTR(-EINVAL); + cl_init.hostname = buf; /* * Set an authflavor equual to the MDS value. Use the MDS nfs_client diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index a816f0627a6c..3e987ad9ae25 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -131,5 +131,5 @@ const struct file_operations nfs4_file_operations = { .splice_read = nfs_file_splice_read, .splice_write = iter_file_splice_write, .check_flags = nfs_check_flags, - .setlease = nfs_setlease, + .setlease = simple_nosetlease, }; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 4bf3d97cc5a0..5aa55c132aa2 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -77,7 +77,7 @@ struct nfs4_opendata; static int _nfs4_proc_open(struct nfs4_opendata *data); static int _nfs4_recover_proc_open(struct nfs4_opendata *data); static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); -static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); +static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *, long *); static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr); static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *label); static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label); @@ -314,20 +314,30 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent kunmap_atomic(start); } +static long nfs4_update_delay(long *timeout) +{ + long ret; + if (!timeout) + return NFS4_POLL_RETRY_MAX; + if (*timeout <= 0) + *timeout = NFS4_POLL_RETRY_MIN; + if (*timeout > NFS4_POLL_RETRY_MAX) + *timeout = NFS4_POLL_RETRY_MAX; + ret = *timeout; + *timeout <<= 1; + return ret; +} + static int nfs4_delay(struct rpc_clnt *clnt, long *timeout) { int res = 0; might_sleep(); - if (*timeout <= 0) - *timeout = NFS4_POLL_RETRY_MIN; - if (*timeout > NFS4_POLL_RETRY_MAX) - *timeout = NFS4_POLL_RETRY_MAX; - 
freezable_schedule_timeout_killable_unsafe(*timeout); + freezable_schedule_timeout_killable_unsafe( + nfs4_update_delay(timeout)); if (fatal_signal_pending(current)) res = -ERESTARTSYS; - *timeout <<= 1; return res; } @@ -1307,15 +1317,13 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) int ret = -EAGAIN; for (;;) { + spin_lock(&state->owner->so_lock); if (can_open_cached(state, fmode, open_mode)) { - spin_lock(&state->owner->so_lock); - if (can_open_cached(state, fmode, open_mode)) { - update_open_stateflags(state, fmode); - spin_unlock(&state->owner->so_lock); - goto out_return_state; - } + update_open_stateflags(state, fmode); spin_unlock(&state->owner->so_lock); + goto out_return_state; } + spin_unlock(&state->owner->so_lock); rcu_read_lock(); delegation = rcu_dereference(nfsi->delegation); if (!can_open_delegated(delegation, fmode)) { @@ -1952,6 +1960,14 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data) return status; } +/* + * Additional permission checks in order to distinguish between an + * open for read, and an open for execute. This works around the + * fact that NFSv4 OPEN treats read and execute permissions as being + * the same. + * Note that in the non-execute case, we want to turn off permission + * checking if we just created a new file (POSIX open() semantics). + */ static int nfs4_opendata_access(struct rpc_cred *cred, struct nfs4_opendata *opendata, struct nfs4_state *state, fmode_t fmode, @@ -1966,14 +1982,14 @@ static int nfs4_opendata_access(struct rpc_cred *cred, return 0; mask = 0; - /* don't check MAY_WRITE - a newly created file may not have - * write mode bits, but POSIX allows the creating process to write. - * use openflags to check for exec, because fmode won't - * always have FMODE_EXEC set when file open for exec. */ + /* + * Use openflags to check for exec, because fmode won't + * always have FMODE_EXEC set when file open for exec. 
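(Aside, a usage sketch and not part of the patch: nfs4_update_delay(), added above, returns the current backoff value and doubles the stored timeout, clamping it between NFS4_POLL_RETRY_MIN and NFS4_POLL_RETRY_MAX; passing a NULL pointer simply yields the maximum. The NFS4ERR_DELAY handling later in this file drives it like this:)

	long timeout = 0;

	/* Each NFS4ERR_DELAY reply sleeps the task for the current
	 * backoff and doubles it for the next retry, up to the clamp. */
	rpc_delay(task, nfs4_update_delay(&timeout)); /* NFS4_POLL_RETRY_MIN */
	rpc_delay(task, nfs4_update_delay(&timeout)); /* twice that, etc. */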
+ */ if (openflags & __FMODE_EXEC) { /* ONLY check for exec rights */ mask = MAY_EXEC; - } else if (fmode & FMODE_READ) + } else if ((fmode & FMODE_READ) && !opendata->file_created) mask = MAY_READ; cache.cred = cred; @@ -2216,8 +2232,19 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); ret = _nfs4_proc_open(opendata); - if (ret != 0) + if (ret != 0) { + if (ret == -ENOENT) { + dentry = opendata->dentry; + if (dentry->d_inode) + d_delete(dentry); + else if (d_unhashed(dentry)) + d_add(dentry, NULL); + + nfs_set_verifier(dentry, + nfs_save_change_attribute(opendata->dir->d_inode)); + } goto out; + } state = nfs4_opendata_to_nfs4_state(opendata); ret = PTR_ERR(state); @@ -2545,6 +2572,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data) struct nfs4_closedata *calldata = data; struct nfs4_state *state = calldata->state; struct nfs_server *server = NFS_SERVER(calldata->inode); + nfs4_stateid *res_stateid = NULL; dprintk("%s: begin!\n", __func__); if (!nfs4_sequence_done(task, &calldata->res.seq_res)) @@ -2555,12 +2583,12 @@ static void nfs4_close_done(struct rpc_task *task, void *data) */ switch (task->tk_status) { case 0: - if (calldata->roc) + res_stateid = &calldata->res.stateid; + if (calldata->arg.fmode == 0 && calldata->roc) pnfs_roc_set_barrier(state->inode, calldata->roc_barrier); - nfs_clear_open_stateid(state, &calldata->res.stateid, 0); renew_lease(server, calldata->timestamp); - goto out_release; + break; case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_OLD_STATEID: @@ -2569,12 +2597,12 @@ static void nfs4_close_done(struct rpc_task *task, void *data) if (calldata->arg.fmode == 0) break; default: - if (nfs4_async_handle_error(task, server, state) == -EAGAIN) { + if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN) { rpc_restart_call_prepare(task); goto out_release; } } - nfs_clear_open_stateid(state, NULL, calldata->arg.fmode); + nfs_clear_open_stateid(state, res_stateid, calldata->arg.fmode); out_release: nfs_release_seqid(calldata->arg.seqid); nfs_refresh_inode(calldata->inode, calldata->res.fattr); @@ -2586,6 +2614,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) struct nfs4_closedata *calldata = data; struct nfs4_state *state = calldata->state; struct inode *inode = calldata->inode; + bool is_rdonly, is_wronly, is_rdwr; int call_close = 0; dprintk("%s: begin!\n", __func__); @@ -2593,21 +2622,27 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) goto out_wait; task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE]; - calldata->arg.fmode = FMODE_READ|FMODE_WRITE; spin_lock(&state->owner->so_lock); + is_rdwr = test_bit(NFS_O_RDWR_STATE, &state->flags); + is_rdonly = test_bit(NFS_O_RDONLY_STATE, &state->flags); + is_wronly = test_bit(NFS_O_WRONLY_STATE, &state->flags); /* Calculate the change in open mode */ + calldata->arg.fmode = 0; if (state->n_rdwr == 0) { - if (state->n_rdonly == 0) { - call_close |= test_bit(NFS_O_RDONLY_STATE, &state->flags); - call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags); - calldata->arg.fmode &= ~FMODE_READ; - } - if (state->n_wronly == 0) { - call_close |= test_bit(NFS_O_WRONLY_STATE, &state->flags); - call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags); - calldata->arg.fmode &= ~FMODE_WRITE; - } - } + if (state->n_rdonly == 0) + call_close |= is_rdonly; + else if (is_rdonly) + calldata->arg.fmode |= FMODE_READ; + if (state->n_wronly == 0) + call_close |= 
is_wronly; + else if (is_wronly) + calldata->arg.fmode |= FMODE_WRITE; + } else if (is_rdwr) + calldata->arg.fmode |= FMODE_READ|FMODE_WRITE; + + if (calldata->arg.fmode == 0) + call_close |= is_rdwr; + if (!nfs4_valid_open_stateid(state)) call_close = 0; spin_unlock(&state->owner->so_lock); @@ -2647,6 +2682,48 @@ static const struct rpc_call_ops nfs4_close_ops = { .rpc_release = nfs4_free_closedata, }; +static bool nfs4_state_has_opener(struct nfs4_state *state) +{ + /* first check existing openers */ + if (test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0 && + state->n_rdonly != 0) + return true; + + if (test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0 && + state->n_wronly != 0) + return true; + + if (test_bit(NFS_O_RDWR_STATE, &state->flags) != 0 && + state->n_rdwr != 0) + return true; + + return false; +} + +static bool nfs4_roc(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_open_context *ctx; + struct nfs4_state *state; + + spin_lock(&inode->i_lock); + list_for_each_entry(ctx, &nfsi->open_files, list) { + state = ctx->state; + if (state == NULL) + continue; + if (nfs4_state_has_opener(state)) { + spin_unlock(&inode->i_lock); + return false; + } + } + spin_unlock(&inode->i_lock); + + if (nfs4_check_delegation(inode, FMODE_READ)) + return false; + + return pnfs_roc(inode); +} + /* * It is possible for data to be read/written from a mem-mapped file * after the sys_close call (which hits the vfs layer as a flush). @@ -2697,7 +2774,7 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait) calldata->res.fattr = &calldata->fattr; calldata->res.seqid = calldata->arg.seqid; calldata->res.server = server; - calldata->roc = pnfs_roc(state->inode); + calldata->roc = nfs4_roc(state->inode); nfs_sb_active(calldata->inode->i_sb); msg.rpc_argp = &calldata->arg; @@ -3148,7 +3225,9 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, struct nfs4_label *label = NULL; int status; - if (pnfs_ld_layoutret_on_setattr(inode)) + if (pnfs_ld_layoutret_on_setattr(inode) && + sattr->ia_valid & ATTR_SIZE && + sattr->ia_size < i_size_read(inode)) pnfs_commit_and_return_layout(inode); nfs_fattr_init(fattr); @@ -3507,7 +3586,8 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir) if (!nfs4_sequence_done(task, &res->seq_res)) return 0; - if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) + if (nfs4_async_handle_error(task, res->server, NULL, + &data->timeout) == -EAGAIN) return 0; update_changeattr(dir, &res->cinfo); return 1; @@ -3540,7 +3620,7 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir, if (!nfs4_sequence_done(task, &res->seq_res)) return 0; - if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) + if (nfs4_async_handle_error(task, res->server, NULL, &data->timeout) == -EAGAIN) return 0; update_changeattr(old_dir, &res->old_cinfo); @@ -4033,24 +4113,26 @@ static bool nfs4_error_stateid_expired(int err) return false; } -void __nfs4_read_done_cb(struct nfs_pgio_data *data) +void __nfs4_read_done_cb(struct nfs_pgio_header *hdr) { - nfs_invalidate_atime(data->header->inode); + nfs_invalidate_atime(hdr->inode); } -static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_data *data) +static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) { - struct nfs_server *server = NFS_SERVER(data->header->inode); + struct nfs_server *server = NFS_SERVER(hdr->inode); - trace_nfs4_read(data, task->tk_status); - if (nfs4_async_handle_error(task, 
server, data->args.context->state) == -EAGAIN) { + trace_nfs4_read(hdr, task->tk_status); + if (nfs4_async_handle_error(task, server, + hdr->args.context->state, + NULL) == -EAGAIN) { rpc_restart_call_prepare(task); return -EAGAIN; } - __nfs4_read_done_cb(data); + __nfs4_read_done_cb(hdr); if (task->tk_status > 0) - renew_lease(server, data->timestamp); + renew_lease(server, hdr->timestamp); return 0; } @@ -4068,54 +4150,60 @@ static bool nfs4_read_stateid_changed(struct rpc_task *task, return true; } -static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_data *data) +static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr) { dprintk("--> %s\n", __func__); - if (!nfs4_sequence_done(task, &data->res.seq_res)) + if (!nfs4_sequence_done(task, &hdr->res.seq_res)) return -EAGAIN; - if (nfs4_read_stateid_changed(task, &data->args)) + if (nfs4_read_stateid_changed(task, &hdr->args)) return -EAGAIN; - return data->pgio_done_cb ? data->pgio_done_cb(task, data) : - nfs4_read_done_cb(task, data); + return hdr->pgio_done_cb ? hdr->pgio_done_cb(task, hdr) : + nfs4_read_done_cb(task, hdr); } -static void nfs4_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg) +static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr, + struct rpc_message *msg) { - data->timestamp = jiffies; - data->pgio_done_cb = nfs4_read_done_cb; + hdr->timestamp = jiffies; + hdr->pgio_done_cb = nfs4_read_done_cb; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; - nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); + nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0); } -static int nfs4_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data) +static int nfs4_proc_pgio_rpc_prepare(struct rpc_task *task, + struct nfs_pgio_header *hdr) { - if (nfs4_setup_sequence(NFS_SERVER(data->header->inode), - &data->args.seq_args, - &data->res.seq_res, + if (nfs4_setup_sequence(NFS_SERVER(hdr->inode), + &hdr->args.seq_args, + &hdr->res.seq_res, task)) return 0; - if (nfs4_set_rw_stateid(&data->args.stateid, data->args.context, - data->args.lock_context, data->header->rw_ops->rw_mode) == -EIO) + if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context, + hdr->args.lock_context, + hdr->rw_ops->rw_mode) == -EIO) return -EIO; - if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags))) + if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) return -EIO; return 0; } -static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_pgio_data *data) +static int nfs4_write_done_cb(struct rpc_task *task, + struct nfs_pgio_header *hdr) { - struct inode *inode = data->header->inode; - - trace_nfs4_write(data, task->tk_status); - if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) { + struct inode *inode = hdr->inode; + + trace_nfs4_write(hdr, task->tk_status); + if (nfs4_async_handle_error(task, NFS_SERVER(inode), + hdr->args.context->state, + NULL) == -EAGAIN) { rpc_restart_call_prepare(task); return -EAGAIN; } if (task->tk_status >= 0) { - renew_lease(NFS_SERVER(inode), data->timestamp); - nfs_post_op_update_inode_force_wcc(inode, &data->fattr); + renew_lease(NFS_SERVER(inode), hdr->timestamp); + nfs_post_op_update_inode_force_wcc(inode, &hdr->fattr); } return 0; } @@ -4134,23 +4222,21 @@ static bool nfs4_write_stateid_changed(struct rpc_task *task, return true; } -static int nfs4_write_done(struct rpc_task *task, struct nfs_pgio_data *data) +static int nfs4_write_done(struct rpc_task *task, 
struct nfs_pgio_header *hdr) { - if (!nfs4_sequence_done(task, &data->res.seq_res)) + if (!nfs4_sequence_done(task, &hdr->res.seq_res)) return -EAGAIN; - if (nfs4_write_stateid_changed(task, &data->args)) + if (nfs4_write_stateid_changed(task, &hdr->args)) return -EAGAIN; - return data->pgio_done_cb ? data->pgio_done_cb(task, data) : - nfs4_write_done_cb(task, data); + return hdr->pgio_done_cb ? hdr->pgio_done_cb(task, hdr) : + nfs4_write_done_cb(task, hdr); } static -bool nfs4_write_need_cache_consistency_data(const struct nfs_pgio_data *data) +bool nfs4_write_need_cache_consistency_data(struct nfs_pgio_header *hdr) { - const struct nfs_pgio_header *hdr = data->header; - /* Don't request attributes for pNFS or O_DIRECT writes */ - if (data->ds_clp != NULL || hdr->dreq != NULL) + if (hdr->ds_clp != NULL || hdr->dreq != NULL) return false; /* Otherwise, request attributes if and only if we don't hold * a delegation @@ -4158,23 +4244,24 @@ bool nfs4_write_need_cache_consistency_data(const struct nfs_pgio_data *data) return nfs4_have_delegation(hdr->inode, FMODE_READ) == 0; } -static void nfs4_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg) +static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr, + struct rpc_message *msg) { - struct nfs_server *server = NFS_SERVER(data->header->inode); + struct nfs_server *server = NFS_SERVER(hdr->inode); - if (!nfs4_write_need_cache_consistency_data(data)) { - data->args.bitmask = NULL; - data->res.fattr = NULL; + if (!nfs4_write_need_cache_consistency_data(hdr)) { + hdr->args.bitmask = NULL; + hdr->res.fattr = NULL; } else - data->args.bitmask = server->cache_consistency_bitmask; + hdr->args.bitmask = server->cache_consistency_bitmask; - if (!data->pgio_done_cb) - data->pgio_done_cb = nfs4_write_done_cb; - data->res.server = server; - data->timestamp = jiffies; + if (!hdr->pgio_done_cb) + hdr->pgio_done_cb = nfs4_write_done_cb; + hdr->res.server = server; + hdr->timestamp = jiffies; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE]; - nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); + nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 1); } static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) @@ -4190,7 +4277,8 @@ static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_commit_data *da struct inode *inode = data->inode; trace_nfs4_commit(data, task->tk_status); - if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) { + if (nfs4_async_handle_error(task, NFS_SERVER(inode), + NULL, NULL) == -EAGAIN) { rpc_restart_call_prepare(task); return -EAGAIN; } @@ -4743,7 +4831,8 @@ out: static int -nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state) +nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, + struct nfs4_state *state, long *timeout) { struct nfs_client *clp = server->nfs_client; @@ -4793,6 +4882,8 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, #endif /* CONFIG_NFS_V4_1 */ case -NFS4ERR_DELAY: nfs_inc_server_stats(server, NFSIOS_DELAY); + rpc_delay(task, nfs4_update_delay(timeout)); + goto restart_call; case -NFS4ERR_GRACE: rpc_delay(task, NFS4_POLL_RETRY_MAX); case -NFS4ERR_RETRY_UNCACHED_REP: @@ -4881,6 +4972,18 @@ nfs4_init_callback_netid(const struct nfs_client *clp, char *buf, size_t len) return scnprintf(buf, len, "tcp"); } +static void nfs4_setclientid_done(struct rpc_task *task, void *calldata) +{ + struct 
nfs4_setclientid *sc = calldata; + + if (task->tk_status == 0) + sc->sc_cred = get_rpccred(task->tk_rqstp->rq_cred); +} + +static const struct rpc_call_ops nfs4_setclientid_ops = { + .rpc_call_done = nfs4_setclientid_done, +}; + /** * nfs4_proc_setclientid - Negotiate client ID * @clp: state data structure @@ -4907,6 +5010,14 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, .rpc_resp = res, .rpc_cred = cred, }; + struct rpc_task *task; + struct rpc_task_setup task_setup_data = { + .rpc_client = clp->cl_rpcclient, + .rpc_message = &msg, + .callback_ops = &nfs4_setclientid_ops, + .callback_data = &setclientid, + .flags = RPC_TASK_TIMEOUT, + }; int status; /* nfs_client_id4 */ @@ -4933,7 +5044,18 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, dprintk("NFS call setclientid auth=%s, '%.*s'\n", clp->cl_rpcclient->cl_auth->au_ops->au_name, setclientid.sc_name_len, setclientid.sc_name); - status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) { + status = PTR_ERR(task); + goto out; + } + status = task->tk_status; + if (setclientid.sc_cred) { + clp->cl_acceptor = rpcauth_stringify_acceptor(setclientid.sc_cred); + put_rpccred(setclientid.sc_cred); + } + rpc_put_task(task); +out: trace_nfs4_setclientid(clp, status); dprintk("NFS reply setclientid: %d\n", status); return status; @@ -4975,6 +5097,9 @@ struct nfs4_delegreturndata { unsigned long timestamp; struct nfs_fattr fattr; int rpc_status; + struct inode *inode; + bool roc; + u32 roc_barrier; }; static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) @@ -4988,7 +5113,6 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) switch (task->tk_status) { case 0: renew_lease(data->res.server, data->timestamp); - break; case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_BAD_STATEID: @@ -4996,10 +5120,12 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: task->tk_status = 0; + if (data->roc) + pnfs_roc_set_barrier(data->inode, data->roc_barrier); break; default: - if (nfs4_async_handle_error(task, data->res.server, NULL) == - -EAGAIN) { + if (nfs4_async_handle_error(task, data->res.server, + NULL, NULL) == -EAGAIN) { rpc_restart_call_prepare(task); return; } @@ -5009,6 +5135,10 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) static void nfs4_delegreturn_release(void *calldata) { + struct nfs4_delegreturndata *data = calldata; + + if (data->roc) + pnfs_roc_release(data->inode); kfree(calldata); } @@ -5018,6 +5148,10 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data) d_data = (struct nfs4_delegreturndata *)data; + if (d_data->roc && + pnfs_roc_drain(d_data->inode, &d_data->roc_barrier, task)) + return; + nfs4_setup_sequence(d_data->res.server, &d_data->args.seq_args, &d_data->res.seq_res, @@ -5061,6 +5195,9 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co nfs_fattr_init(data->res.fattr); data->timestamp = jiffies; data->rpc_status = 0; + data->inode = inode; + data->roc = list_empty(&NFS_I(inode)->open_files) ? 
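
SETCLIENTID switches from rpc_call_sync() to an explicit rpc_run_task() so that the completion callback can pin the GSS acceptor credential while the request still exists; with rpc_call_sync() the rqst is gone before the caller could look at task->tk_rqstp->rq_cred. The two halves of the pattern, condensed from the hunks above and below:

	/* done callback: runs while tk_rqstp is still valid */
	if (task->tk_status == 0)
		sc->sc_cred = get_rpccred(task->tk_rqstp->rq_cred);

	/* caller, once rpc_run_task() has completed: */
	if (setclientid.sc_cred) {
		clp->cl_acceptor =
			rpcauth_stringify_acceptor(setclientid.sc_cred);
		put_rpccred(setclientid.sc_cred);
	}
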
+ pnfs_roc(inode) : false; task_setup_data.callback_data = data; msg.rpc_argp = &data->args; @@ -5252,7 +5389,8 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) case -NFS4ERR_EXPIRED: break; default: - if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN) + if (nfs4_async_handle_error(task, calldata->server, + NULL, NULL) == -EAGAIN) rpc_restart_call_prepare(task); } nfs_release_seqid(calldata->arg.seqid); @@ -5834,8 +5972,10 @@ struct nfs_release_lockowner_data { static void nfs4_release_lockowner_prepare(struct rpc_task *task, void *calldata) { struct nfs_release_lockowner_data *data = calldata; - nfs40_setup_sequence(data->server, - &data->args.seq_args, &data->res.seq_res, task); + struct nfs_server *server = data->server; + nfs40_setup_sequence(server, &data->args.seq_args, + &data->res.seq_res, task); + data->args.lock_owner.clientid = server->nfs_client->cl_clientid; data->timestamp = jiffies; } @@ -5852,9 +5992,12 @@ static void nfs4_release_lockowner_done(struct rpc_task *task, void *calldata) break; case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_EXPIRED: + nfs4_schedule_lease_recovery(server->nfs_client); + break; case -NFS4ERR_LEASE_MOVED: case -NFS4ERR_DELAY: - if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) + if (nfs4_async_handle_error(task, server, + NULL, NULL) == -EAGAIN) rpc_restart_call_prepare(task); } } @@ -5872,7 +6015,8 @@ static const struct rpc_call_ops nfs4_release_lockowner_ops = { .rpc_release = nfs4_release_lockowner_release, }; -static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp) +static void +nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp) { struct nfs_release_lockowner_data *data; struct rpc_message msg = { @@ -5880,11 +6024,11 @@ static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_st }; if (server->nfs_client->cl_mvops->minor_version != 0) - return -EINVAL; + return; data = kmalloc(sizeof(*data), GFP_NOFS); if (!data) - return -ENOMEM; + return; data->lsp = lsp; data->server = server; data->args.lock_owner.clientid = server->nfs_client->cl_clientid; @@ -5895,7 +6039,6 @@ static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_st msg.rpc_resp = &data->res; nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, data); - return 0; } #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" @@ -7229,7 +7372,7 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cr int ret = 0; if ((renew_flags & NFS4_RENEW_TIMEOUT) == 0) - return 0; + return -EAGAIN; task = _nfs41_proc_sequence(clp, cred, false); if (IS_ERR(task)) ret = PTR_ERR(task); @@ -7459,14 +7602,19 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) } else { LIST_HEAD(head); + /* + * Mark the bad layout state as invalid, then retry + * with the current stateid. + */ pnfs_mark_matching_lsegs_invalid(lo, &head, NULL); spin_unlock(&inode->i_lock); - /* Mark the bad layout state as invalid, then - * retry using the open stateid. 
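
Delegation return now participates in pNFS return-on-close. The pnfs_roc_* helpers themselves live in pnfs.c and are not shown here, but the call sites in these hunks imply the following ordering (a sketch, not new kernel code):

	/*
	 * Return-on-close ordering, assembled from the hunks above:
	 *
	 * 1. _nfs4_proc_delegreturn(): data->roc = pnfs_roc(inode), but only
	 *    when no file is open against the inode any more.
	 * 2. nfs4_delegreturn_prepare(): pnfs_roc_drain() parks the RPC task
	 *    until in-flight layout I/O drains, reporting the barrier seqid.
	 * 3. nfs4_delegreturn_done(): on success, or on a revoked/stale/
	 *    expired stateid, pnfs_roc_set_barrier(inode, data->roc_barrier).
	 * 4. nfs4_delegreturn_release(): pnfs_roc_release() drops the claim,
	 *    whether or not the RPC ever ran.
	 */
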
*/ pnfs_free_lseg_list(&head); + + task->tk_status = 0; + rpc_restart_call_prepare(task); } } - if (nfs4_async_handle_error(task, server, state) == -EAGAIN) + if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN) rpc_restart_call_prepare(task); out: dprintk("<-- %s\n", __func__); @@ -7626,7 +7774,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) case 0: break; case -NFS4ERR_DELAY: - if (nfs4_async_handle_error(task, server, NULL) != -EAGAIN) + if (nfs4_async_handle_error(task, server, NULL, NULL) != -EAGAIN) break; rpc_restart_call_prepare(task); return; @@ -7685,54 +7833,6 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp) return status; } -/* - * Retrieve the list of Data Server devices from the MDS. - */ -static int _nfs4_getdevicelist(struct nfs_server *server, - const struct nfs_fh *fh, - struct pnfs_devicelist *devlist) -{ - struct nfs4_getdevicelist_args args = { - .fh = fh, - .layoutclass = server->pnfs_curr_ld->id, - }; - struct nfs4_getdevicelist_res res = { - .devlist = devlist, - }; - struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICELIST], - .rpc_argp = &args, - .rpc_resp = &res, - }; - int status; - - dprintk("--> %s\n", __func__); - status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, - &res.seq_res, 0); - dprintk("<-- %s status=%d\n", __func__, status); - return status; -} - -int nfs4_proc_getdevicelist(struct nfs_server *server, - const struct nfs_fh *fh, - struct pnfs_devicelist *devlist) -{ - struct nfs4_exception exception = { }; - int err; - - do { - err = nfs4_handle_exception(server, - _nfs4_getdevicelist(server, fh, devlist), - &exception); - } while (exception.retry); - - dprintk("%s: err=%d, num_devs=%u\n", __func__, - err, devlist->num_devs); - - return err; -} -EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist); - static int _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev, @@ -7805,7 +7905,7 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) case 0: break; default: - if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { + if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) { rpc_restart_call_prepare(task); return; } @@ -8101,7 +8201,7 @@ static void nfs41_free_stateid_done(struct rpc_task *task, void *calldata) switch (task->tk_status) { case -NFS4ERR_DELAY: - if (nfs4_async_handle_error(task, data->server, NULL) == -EAGAIN) + if (nfs4_async_handle_error(task, data->server, NULL, NULL) == -EAGAIN) rpc_restart_call_prepare(task); } } @@ -8182,7 +8282,8 @@ static int nfs41_free_stateid(struct nfs_server *server, return ret; } -static int nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp) +static void +nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp) { struct rpc_task *task; struct rpc_cred *cred = lsp->ls_state->owner->so_cred; @@ -8190,9 +8291,8 @@ static int nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_sta task = _nfs41_free_stateid(server, &lsp->ls_stateid, cred, false); nfs4_free_lock_state(server, lsp); if (IS_ERR(task)) - return PTR_ERR(task); + return; rpc_put_task(task); - return 0; } static bool nfs41_match_stateid(const nfs4_stateid *s1, diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c index 1720d32ffa54..e1ba58c3d1ad 100644 --- a/fs/nfs/nfs4renewd.c +++ b/fs/nfs/nfs4renewd.c @@ -88,10 +88,18 @@ nfs4_renew_state(struct work_struct *work) } nfs_expire_all_delegations(clp); } else { + int ret; + /* Queue an asynchronous 
RENEW. */ - ops->sched_state_renewal(clp, cred, renew_flags); + ret = ops->sched_state_renewal(clp, cred, renew_flags); put_rpccred(cred); - goto out_exp; + switch (ret) { + default: + goto out_exp; + case -EAGAIN: + case -ENOMEM: + break; + } } } else { dprintk("%s: failed to call renewd. Reason: lease not expired \n", diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 848f6853c59e..5194933ed419 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -787,21 +787,12 @@ void nfs4_close_sync(struct nfs4_state *state, fmode_t fmode) * that is compatible with current->files */ static struct nfs4_lock_state * -__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type) +__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) { struct nfs4_lock_state *pos; list_for_each_entry(pos, &state->lock_states, ls_locks) { - if (type != NFS4_ANY_LOCK_TYPE && pos->ls_owner.lo_type != type) + if (pos->ls_owner != fl_owner) continue; - switch (pos->ls_owner.lo_type) { - case NFS4_POSIX_LOCK_TYPE: - if (pos->ls_owner.lo_u.posix_owner != fl_owner) - continue; - break; - case NFS4_FLOCK_LOCK_TYPE: - if (pos->ls_owner.lo_u.flock_owner != fl_pid) - continue; - } atomic_inc(&pos->ls_count); return pos; } @@ -813,7 +804,7 @@ __nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_p * exists, return an uninitialized one. * */ -static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type) +static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) { struct nfs4_lock_state *lsp; struct nfs_server *server = state->owner->so_server; @@ -824,17 +815,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f nfs4_init_seqid_counter(&lsp->ls_seqid); atomic_set(&lsp->ls_count, 1); lsp->ls_state = state; - lsp->ls_owner.lo_type = type; - switch (lsp->ls_owner.lo_type) { - case NFS4_FLOCK_LOCK_TYPE: - lsp->ls_owner.lo_u.flock_owner = fl_pid; - break; - case NFS4_POSIX_LOCK_TYPE: - lsp->ls_owner.lo_u.posix_owner = fl_owner; - break; - default: - goto out_free; - } + lsp->ls_owner = fl_owner; lsp->ls_seqid.owner_id = ida_simple_get(&server->lockowner_id, 0, 0, GFP_NOFS); if (lsp->ls_seqid.owner_id < 0) goto out_free; @@ -857,13 +838,13 @@ void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp * exists, return an uninitialized one. 
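
The lock-state hunks below key struct nfs4_lock_state on a bare fl_owner_t. This leans on the VFS now filling fl_owner for flock locks as well as POSIX locks (flock owners used to be tracked by fl_pid here); with a single key, NFS4_POSIX_LOCK_TYPE, NFS4_FLOCK_LOCK_TYPE and NFS4_ANY_LOCK_TYPE disappear from the lookup path, and finding a lock stateid reduces to:

	/* sketch: one key, no lo_type/lo_u switch; state_lock held */
	lsp = __nfs4_find_lock_state(state, fl->fl_owner);
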
* */ -static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner, pid_t pid, unsigned int type) +static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner) { struct nfs4_lock_state *lsp, *new = NULL; for(;;) { spin_lock(&state->state_lock); - lsp = __nfs4_find_lock_state(state, owner, pid, type); + lsp = __nfs4_find_lock_state(state, owner); if (lsp != NULL) break; if (new != NULL) { @@ -874,7 +855,7 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_ break; } spin_unlock(&state->state_lock); - new = nfs4_alloc_lock_state(state, owner, pid, type); + new = nfs4_alloc_lock_state(state, owner); if (new == NULL) return NULL; } @@ -935,13 +916,7 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl) if (fl->fl_ops != NULL) return 0; - if (fl->fl_flags & FL_POSIX) - lsp = nfs4_get_lock_state(state, fl->fl_owner, 0, NFS4_POSIX_LOCK_TYPE); - else if (fl->fl_flags & FL_FLOCK) - lsp = nfs4_get_lock_state(state, NULL, fl->fl_pid, - NFS4_FLOCK_LOCK_TYPE); - else - return -EINVAL; + lsp = nfs4_get_lock_state(state, fl->fl_owner); if (lsp == NULL) return -ENOMEM; fl->fl_u.nfs4_fl.owner = lsp; @@ -955,7 +930,6 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst, { struct nfs4_lock_state *lsp; fl_owner_t fl_owner; - pid_t fl_pid; int ret = -ENOENT; @@ -966,9 +940,8 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst, goto out; fl_owner = lockowner->l_owner; - fl_pid = lockowner->l_pid; spin_lock(&state->state_lock); - lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE); + lsp = __nfs4_find_lock_state(state, fl_owner); if (lsp && test_bit(NFS_LOCK_LOST, &lsp->ls_flags)) ret = -EIO; else if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) { @@ -1251,8 +1224,8 @@ int nfs4_wait_clnt_recover(struct nfs_client *clp) might_sleep(); atomic_inc(&clp->cl_count); - res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, - nfs_wait_bit_killable, TASK_KILLABLE); + res = wait_on_bit_action(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, + nfs_wait_bit_killable, TASK_KILLABLE); if (res) goto out; if (clp->cl_cons_state < 0) @@ -1732,7 +1705,8 @@ restart: if (status < 0) { set_bit(ops->owner_flag_bit, &sp->so_flags); nfs4_put_state_owner(sp); - return nfs4_recovery_handle_error(clp, status); + status = nfs4_recovery_handle_error(clp, status); + return (status != 0) ? 
status : -EAGAIN; } nfs4_put_state_owner(sp); @@ -1741,7 +1715,7 @@ restart: spin_unlock(&clp->cl_lock); } rcu_read_unlock(); - return status; + return 0; } static int nfs4_check_lease(struct nfs_client *clp) @@ -1788,7 +1762,6 @@ static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status) break; case -NFS4ERR_STALE_CLIENTID: clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); - nfs4_state_clear_reclaim_reboot(clp); nfs4_state_start_reclaim_reboot(clp); break; case -NFS4ERR_CLID_INUSE: @@ -2372,6 +2345,7 @@ static void nfs4_state_manager(struct nfs_client *clp) status = nfs4_check_lease(clp); if (status < 0) goto out_error; + continue; } if (test_and_clear_bit(NFS4CLNT_MOVED, &clp->cl_state)) { @@ -2393,14 +2367,11 @@ static void nfs4_state_manager(struct nfs_client *clp) section = "reclaim reboot"; status = nfs4_do_reclaim(clp, clp->cl_mvops->reboot_recovery_ops); - if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || - test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) - continue; - nfs4_state_end_reclaim_reboot(clp); - if (test_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) + if (status == -EAGAIN) continue; if (status < 0) goto out_error; + nfs4_state_end_reclaim_reboot(clp); } /* Now recover expired state... */ @@ -2408,9 +2379,7 @@ static void nfs4_state_manager(struct nfs_client *clp) section = "reclaim nograce"; status = nfs4_do_reclaim(clp, clp->cl_mvops->nograce_recovery_ops); - if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || - test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) || - test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) + if (status == -EAGAIN) continue; if (status < 0) goto out_error; diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 0a744f3a86f6..1c32adbe728d 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -932,11 +932,11 @@ DEFINE_NFS4_IDMAP_EVENT(nfs4_map_gid_to_group); DECLARE_EVENT_CLASS(nfs4_read_event, TP_PROTO( - const struct nfs_pgio_data *data, + const struct nfs_pgio_header *hdr, int error ), - TP_ARGS(data, error), + TP_ARGS(hdr, error), TP_STRUCT__entry( __field(dev_t, dev) @@ -948,12 +948,12 @@ DECLARE_EVENT_CLASS(nfs4_read_event, ), TP_fast_assign( - const struct inode *inode = data->header->inode; + const struct inode *inode = hdr->inode; __entry->dev = inode->i_sb->s_dev; __entry->fileid = NFS_FILEID(inode); __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); - __entry->offset = data->args.offset; - __entry->count = data->args.count; + __entry->offset = hdr->args.offset; + __entry->count = hdr->args.count; __entry->error = error; ), @@ -972,10 +972,10 @@ DECLARE_EVENT_CLASS(nfs4_read_event, #define DEFINE_NFS4_READ_EVENT(name) \ DEFINE_EVENT(nfs4_read_event, name, \ TP_PROTO( \ - const struct nfs_pgio_data *data, \ + const struct nfs_pgio_header *hdr, \ int error \ ), \ - TP_ARGS(data, error)) + TP_ARGS(hdr, error)) DEFINE_NFS4_READ_EVENT(nfs4_read); #ifdef CONFIG_NFS_V4_1 DEFINE_NFS4_READ_EVENT(nfs4_pnfs_read); @@ -983,11 +983,11 @@ DEFINE_NFS4_READ_EVENT(nfs4_pnfs_read); DECLARE_EVENT_CLASS(nfs4_write_event, TP_PROTO( - const struct nfs_pgio_data *data, + const struct nfs_pgio_header *hdr, int error ), - TP_ARGS(data, error), + TP_ARGS(hdr, error), TP_STRUCT__entry( __field(dev_t, dev) @@ -999,12 +999,12 @@ DECLARE_EVENT_CLASS(nfs4_write_event, ), TP_fast_assign( - const struct inode *inode = data->header->inode; + const struct inode *inode = hdr->inode; __entry->dev = inode->i_sb->s_dev; __entry->fileid = NFS_FILEID(inode); __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); - __entry->offset = 
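
nfs4_do_reclaim() now reports every "state moved underneath us" case as a single -EAGAIN, so nfs4_state_manager() no longer re-tests individual NFS4CLNT_* bits after each pass. The resulting loop contract, restated as a sketch:

	/* one nfs4_state_manager() recovery step after this change */
	status = nfs4_do_reclaim(clp, clp->cl_mvops->reboot_recovery_ops);
	if (status == -EAGAIN)
		continue;			/* run another full pass */
	if (status < 0)
		goto out_error;			/* hard failure */
	nfs4_state_end_reclaim_reboot(clp);	/* only after a clean pass */
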
data->args.offset; - __entry->count = data->args.count; + __entry->offset = hdr->args.offset; + __entry->count = hdr->args.count; __entry->error = error; ), @@ -1024,10 +1024,10 @@ DECLARE_EVENT_CLASS(nfs4_write_event, #define DEFINE_NFS4_WRITE_EVENT(name) \ DEFINE_EVENT(nfs4_write_event, name, \ TP_PROTO( \ - const struct nfs_pgio_data *data, \ + const struct nfs_pgio_header *hdr, \ int error \ ), \ - TP_ARGS(data, error)) + TP_ARGS(hdr, error)) DEFINE_NFS4_WRITE_EVENT(nfs4_write); #ifdef CONFIG_NFS_V4_1 DEFINE_NFS4_WRITE_EVENT(nfs4_pnfs_write); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 939ae606cfa4..005d03c5d274 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -362,25 +362,19 @@ static int nfs4_stat_to_errno(int); XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) #define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4) #define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4) -#define encode_getdevicelist_maxsz (op_encode_hdr_maxsz + 4 + \ - encode_verifier_maxsz) -#define decode_getdevicelist_maxsz (op_decode_hdr_maxsz + \ - 2 /* nfs_cookie4 gdlr_cookie */ + \ - decode_verifier_maxsz \ - /* verifier4 gdlr_verifier */ + \ - 1 /* gdlr_deviceid_list count */ + \ - XDR_QUADLEN(NFS4_PNFS_GETDEVLIST_MAXNUM * \ - NFS4_DEVICEID4_SIZE) \ - /* gdlr_deviceid_list */ + \ - 1 /* bool gdlr_eof */) -#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \ - XDR_QUADLEN(NFS4_DEVICEID4_SIZE)) +#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + \ + XDR_QUADLEN(NFS4_DEVICEID4_SIZE) + \ + 1 /* layout type */ + \ + 1 /* maxcount */ + \ + 1 /* bitmap size */ + \ + 1 /* notification bitmap length */ + \ + 1 /* notification bitmap, word 0 */) #define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \ 1 /* layout type */ + \ 1 /* opaque devaddr4 length */ + \ /* devaddr4 payload is read into page */ \ 1 /* notification bitmap length */ + \ - 1 /* notification bitmap */) + 1 /* notification bitmap, word 0 */) #define encode_layoutget_maxsz (op_encode_hdr_maxsz + 10 + \ encode_stateid_maxsz) #define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \ @@ -395,7 +389,10 @@ static int nfs4_stat_to_errno(int); 2 /* last byte written */ + \ 1 /* nt_timechanged (false) */ + \ 1 /* layoutupdate4 layout type */ + \ - 1 /* NULL filelayout layoutupdate4 payload */) + 1 /* layoutupdate4 opaqueue len */) + /* the actual content of layoutupdate4 should + be allocated by drivers and spliced in + using xdr_write_pages */ #define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3) #define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \ encode_stateid_maxsz + \ @@ -809,14 +806,6 @@ static int nfs4_stat_to_errno(int); #define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \ decode_sequence_maxsz + \ decode_reclaim_complete_maxsz) -#define NFS4_enc_getdevicelist_sz (compound_encode_hdr_maxsz + \ - encode_sequence_maxsz + \ - encode_putfh_maxsz + \ - encode_getdevicelist_maxsz) -#define NFS4_dec_getdevicelist_sz (compound_decode_hdr_maxsz + \ - decode_sequence_maxsz + \ - decode_putfh_maxsz + \ - decode_getdevicelist_maxsz) #define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz +\ encode_getdeviceinfo_maxsz) @@ -1927,24 +1916,6 @@ static void encode_sequence(struct xdr_stream *xdr, #ifdef CONFIG_NFS_V4_1 static void -encode_getdevicelist(struct xdr_stream *xdr, - const struct nfs4_getdevicelist_args *args, - struct compound_hdr *hdr) -{ - __be32 *p; - nfs4_verifier dummy = { - .data = "dummmmmy", - }; - - encode_op_hdr(xdr, OP_GETDEVICELIST, 
decode_getdevicelist_maxsz, hdr); - p = reserve_space(xdr, 16); - *p++ = cpu_to_be32(args->layoutclass); - *p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM); - xdr_encode_hyper(p, 0ULL); /* cookie */ - encode_nfs4_verifier(xdr, &dummy); -} - -static void encode_getdeviceinfo(struct xdr_stream *xdr, const struct nfs4_getdeviceinfo_args *args, struct compound_hdr *hdr) @@ -1952,12 +1923,15 @@ encode_getdeviceinfo(struct xdr_stream *xdr, __be32 *p; encode_op_hdr(xdr, OP_GETDEVICEINFO, decode_getdeviceinfo_maxsz, hdr); - p = reserve_space(xdr, 12 + NFS4_DEVICEID4_SIZE); + p = reserve_space(xdr, NFS4_DEVICEID4_SIZE + 4 + 4); p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data, NFS4_DEVICEID4_SIZE); *p++ = cpu_to_be32(args->pdev->layout_type); *p++ = cpu_to_be32(args->pdev->maxcount); /* gdia_maxcount */ - *p++ = cpu_to_be32(0); /* bitmap length 0 */ + + p = reserve_space(xdr, 4 + 4); + *p++ = cpu_to_be32(1); /* bitmap length */ + *p++ = cpu_to_be32(NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE); } static void @@ -1990,7 +1964,7 @@ encode_layoutget(struct xdr_stream *xdr, static int encode_layoutcommit(struct xdr_stream *xdr, struct inode *inode, - const struct nfs4_layoutcommit_args *args, + struct nfs4_layoutcommit_args *args, struct compound_hdr *hdr) { __be32 *p; @@ -2011,11 +1985,16 @@ encode_layoutcommit(struct xdr_stream *xdr, *p++ = cpu_to_be32(0); /* Never send time_modify_changed */ *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */ - if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit) + if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit) { NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit( NFS_I(inode)->layout, xdr, args); - else - encode_uint32(xdr, 0); /* no layout-type payload */ + } else { + encode_uint32(xdr, args->layoutupdate_len); + if (args->layoutupdate_pages) { + xdr_write_pages(xdr, args->layoutupdate_pages, 0, + args->layoutupdate_len); + } + } return 0; } @@ -2893,24 +2872,6 @@ static void nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req, } /* - * Encode GETDEVICELIST request - */ -static void nfs4_xdr_enc_getdevicelist(struct rpc_rqst *req, - struct xdr_stream *xdr, - struct nfs4_getdevicelist_args *args) -{ - struct compound_hdr hdr = { - .minorversion = nfs4_xdr_minorversion(&args->seq_args), - }; - - encode_compound_hdr(xdr, req, &hdr); - encode_sequence(xdr, &args->seq_args, &hdr); - encode_putfh(xdr, args->fh, &hdr); - encode_getdevicelist(xdr, args, &hdr); - encode_nops(&hdr); -} - -/* * Encode GETDEVICEINFO request */ static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, @@ -5765,54 +5726,6 @@ out_overflow: } #if defined(CONFIG_NFS_V4_1) -/* - * TODO: Need to handle case when EOF != true; - */ -static int decode_getdevicelist(struct xdr_stream *xdr, - struct pnfs_devicelist *res) -{ - __be32 *p; - int status, i; - nfs4_verifier verftemp; - - status = decode_op_hdr(xdr, OP_GETDEVICELIST); - if (status) - return status; - - p = xdr_inline_decode(xdr, 8 + 8 + 4); - if (unlikely(!p)) - goto out_overflow; - - /* TODO: Skip cookie for now */ - p += 2; - - /* Read verifier */ - p = xdr_decode_opaque_fixed(p, verftemp.data, NFS4_VERIFIER_SIZE); - - res->num_devs = be32_to_cpup(p); - - dprintk("%s: num_dev %d\n", __func__, res->num_devs); - - if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM) { - printk(KERN_ERR "NFS: %s too many result dev_num %u\n", - __func__, res->num_devs); - return -EIO; - } - - p = xdr_inline_decode(xdr, - res->num_devs * NFS4_DEVICEID4_SIZE + 4); - if (unlikely(!p)) - goto out_overflow; - for (i = 
0; i < res->num_devs; i++) - p = xdr_decode_opaque_fixed(p, res->dev_id[i].data, - NFS4_DEVICEID4_SIZE); - res->eof = be32_to_cpup(p); - return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; -} - static int decode_getdeviceinfo(struct xdr_stream *xdr, struct pnfs_device *pdev) { @@ -5862,9 +5775,16 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 4 * len); if (unlikely(!p)) goto out_overflow; - for (i = 0; i < len; i++, p++) { - if (be32_to_cpup(p)) { - dprintk("%s: notifications not supported\n", + + if (be32_to_cpup(p++) & + ~(NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE)) { + dprintk("%s: unsupported notification\n", + __func__); + } + + for (i = 1; i < len; i++) { + if (be32_to_cpup(p++)) { + dprintk("%s: unsupported notification\n", __func__); return -EIO; } @@ -7092,33 +7012,7 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, if (!status) status = decode_sequence(xdr, &res->seq_res, rqstp); if (!status) - status = decode_reclaim_complete(xdr, (void *)NULL); - return status; -} - -/* - * Decode GETDEVICELIST response - */ -static int nfs4_xdr_dec_getdevicelist(struct rpc_rqst *rqstp, - struct xdr_stream *xdr, - struct nfs4_getdevicelist_res *res) -{ - struct compound_hdr hdr; - int status; - - dprintk("encoding getdevicelist!\n"); - - status = decode_compound_hdr(xdr, &hdr); - if (status != 0) - goto out; - status = decode_sequence(xdr, &res->seq_res, rqstp); - if (status != 0) - goto out; - status = decode_putfh(xdr); - if (status != 0) - goto out; - status = decode_getdevicelist(xdr, res->devlist); -out: + status = decode_reclaim_complete(xdr, NULL); return status; } @@ -7490,7 +7384,6 @@ struct rpc_procinfo nfs4_procedures[] = { PROC(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name), PROC(TEST_STATEID, enc_test_stateid, dec_test_stateid), PROC(FREE_STATEID, enc_free_stateid, dec_free_stateid), - PROC(GETDEVICELIST, enc_getdevicelist, dec_getdevicelist), PROC(BIND_CONN_TO_SESSION, enc_bind_conn_to_session, dec_bind_conn_to_session), PROC(DESTROY_CLIENTID, enc_destroy_clientid, dec_destroy_clientid), diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 611320753db2..c6e4bda63000 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -60,52 +60,6 @@ objio_free_deviceid_node(struct nfs4_deviceid_node *d) kfree(de); } -static struct objio_dev_ent *_dev_list_find(const struct nfs_server *nfss, - const struct nfs4_deviceid *d_id) -{ - struct nfs4_deviceid_node *d; - struct objio_dev_ent *de; - - d = nfs4_find_get_deviceid(nfss->pnfs_curr_ld, nfss->nfs_client, d_id); - if (!d) - return NULL; - - de = container_of(d, struct objio_dev_ent, id_node); - return de; -} - -static struct objio_dev_ent * -_dev_list_add(const struct nfs_server *nfss, - const struct nfs4_deviceid *d_id, struct osd_dev *od, - gfp_t gfp_flags) -{ - struct nfs4_deviceid_node *d; - struct objio_dev_ent *de = kzalloc(sizeof(*de), gfp_flags); - struct objio_dev_ent *n; - - if (!de) { - dprintk("%s: -ENOMEM od=%p\n", __func__, od); - return NULL; - } - - dprintk("%s: Adding od=%p\n", __func__, od); - nfs4_init_deviceid_node(&de->id_node, - nfss->pnfs_curr_ld, - nfss->nfs_client, - d_id); - de->od.od = od; - - d = nfs4_insert_deviceid_node(&de->id_node); - n = container_of(d, struct objio_dev_ent, id_node); - if (n != de) { - dprintk("%s: Race with other n->od=%p\n", __func__, n->od.od); - objio_free_deviceid_node(&de->id_node); - de = n; - } - - return de; -} - struct objio_segment { 
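
encode_getdeviceinfo() now sends a one-word notification bitmap asking for device-ID change and delete callbacks, and decode_getdeviceinfo() only objects when the server sets something else. Assuming the usual bit assignments from include/linux/nfs4.h (NOTIFY_DEVICEID4_CHANGE = 1 << 1, NOTIFY_DEVICEID4_DELETE = 1 << 2), word 0 works out as:

	u32 mask = NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE; /* 0x6 */

	if (be32_to_cpup(p++) & ~mask)
		dprintk("unsupported notification\n");	/* noted, not fatal */
	/* a set bit in any later bitmap word still fails with -EIO */
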
struct pnfs_layout_segment lseg; @@ -130,29 +84,24 @@ struct objio_state { /* Send and wait for a get_device_info of devices in the layout, then look them up with the osd_initiator library */ -static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay, - struct objio_segment *objio_seg, unsigned c, struct nfs4_deviceid *d_id, - gfp_t gfp_flags) +struct nfs4_deviceid_node * +objio_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, + gfp_t gfp_flags) { struct pnfs_osd_deviceaddr *deviceaddr; - struct objio_dev_ent *ode; + struct objio_dev_ent *ode = NULL; struct osd_dev *od; struct osd_dev_info odi; bool retry_flag = true; + __be32 *p; int err; - ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id); - if (ode) { - objio_seg->oc.ods[c] = &ode->od; /* must use container_of */ - return 0; - } + deviceaddr = kzalloc(sizeof(*deviceaddr), gfp_flags); + if (!deviceaddr) + return NULL; - err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags); - if (unlikely(err)) { - dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n", - __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err); - return err; - } + p = page_address(pdev->pages[0]); + pnfs_osd_xdr_decode_deviceaddr(deviceaddr, p); odi.systemid_len = deviceaddr->oda_systemid.len; if (odi.systemid_len > sizeof(odi.systemid)) { @@ -188,14 +137,24 @@ retry_lookup: goto out; } - ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od, - gfp_flags); - objio_seg->oc.ods[c] = &ode->od; /* must use container_of */ dprintk("Adding new dev_id(%llx:%llx)\n", - _DEVID_LO(d_id), _DEVID_HI(d_id)); + _DEVID_LO(&pdev->dev_id), _DEVID_HI(&pdev->dev_id)); + + ode = kzalloc(sizeof(*ode), gfp_flags); + if (!ode) { + dprintk("%s: -ENOMEM od=%p\n", __func__, od); + goto out; + } + + nfs4_init_deviceid_node(&ode->id_node, server, &pdev->dev_id); + kfree(deviceaddr); + + ode->od.od = od; + return &ode->id_node; + out: - objlayout_put_deviceinfo(deviceaddr); - return err; + kfree(deviceaddr); + return NULL; } static void copy_single_comp(struct ore_components *oc, unsigned c, @@ -254,6 +213,7 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp, struct xdr_stream *xdr, gfp_t gfp_flags) { + struct nfs_server *server = NFS_SERVER(pnfslay->plh_inode); struct objio_segment *objio_seg; struct pnfs_osd_xdr_decode_layout_iter iter; struct pnfs_osd_layout layout; @@ -283,13 +243,21 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp, objio_seg->oc.first_dev = layout.olo_comps_index; cur_comp = 0; while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) { + struct nfs4_deviceid_node *d; + struct objio_dev_ent *ode; + copy_single_comp(&objio_seg->oc, cur_comp, &src_comp); - err = objio_devices_lookup(pnfslay, objio_seg, cur_comp, - &src_comp.oc_object_id.oid_device_id, - gfp_flags); - if (err) + + d = nfs4_find_get_deviceid(server, + &src_comp.oc_object_id.oid_device_id, + pnfslay->plh_lc_cred, gfp_flags); + if (!d) { + err = -ENXIO; goto err; - ++cur_comp; + } + + ode = container_of(d, struct objio_dev_ent, id_node); + objio_seg->oc.ods[cur_comp++] = &ode->od; } /* pnfs_osd_xdr_decode_layout_comp returns false on error */ if (unlikely(err)) @@ -439,22 +407,21 @@ static void _read_done(struct ore_io_state *ios, void *private) objlayout_read_done(&objios->oir, status, objios->sync); } -int objio_read_pagelist(struct nfs_pgio_data *rdata) +int objio_read_pagelist(struct nfs_pgio_header *hdr) { - struct nfs_pgio_header *hdr = rdata->header; struct objio_state *objios; int ret; ret = 
objio_alloc_io_state(NFS_I(hdr->inode)->layout, true, - hdr->lseg, rdata->args.pages, rdata->args.pgbase, - rdata->args.offset, rdata->args.count, rdata, + hdr->lseg, hdr->args.pages, hdr->args.pgbase, + hdr->args.offset, hdr->args.count, hdr, GFP_KERNEL, &objios); if (unlikely(ret)) return ret; objios->ios->done = _read_done; dprintk("%s: offset=0x%llx length=0x%x\n", __func__, - rdata->args.offset, rdata->args.count); + hdr->args.offset, hdr->args.count); ret = ore_read(objios->ios); if (unlikely(ret)) objio_free_result(&objios->oir); @@ -487,11 +454,11 @@ static void _write_done(struct ore_io_state *ios, void *private) static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) { struct objio_state *objios = priv; - struct nfs_pgio_data *wdata = objios->oir.rpcdata; - struct address_space *mapping = wdata->header->inode->i_mapping; + struct nfs_pgio_header *hdr = objios->oir.rpcdata; + struct address_space *mapping = hdr->inode->i_mapping; pgoff_t index = offset / PAGE_SIZE; struct page *page; - loff_t i_size = i_size_read(wdata->header->inode); + loff_t i_size = i_size_read(hdr->inode); if (offset >= i_size) { *uptodate = true; @@ -531,15 +498,14 @@ static const struct _ore_r4w_op _r4w_op = { .put_page = &__r4w_put_page, }; -int objio_write_pagelist(struct nfs_pgio_data *wdata, int how) +int objio_write_pagelist(struct nfs_pgio_header *hdr, int how) { - struct nfs_pgio_header *hdr = wdata->header; struct objio_state *objios; int ret; ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, false, - hdr->lseg, wdata->args.pages, wdata->args.pgbase, - wdata->args.offset, wdata->args.count, wdata, GFP_NOFS, + hdr->lseg, hdr->args.pages, hdr->args.pgbase, + hdr->args.offset, hdr->args.count, hdr, GFP_NOFS, &objios); if (unlikely(ret)) return ret; @@ -551,7 +517,7 @@ int objio_write_pagelist(struct nfs_pgio_data *wdata, int how) objios->ios->done = _write_done; dprintk("%s: offset=0x%llx length=0x%x\n", __func__, - wdata->args.offset, wdata->args.count); + hdr->args.offset, hdr->args.count); ret = ore_write(objios->ios); if (unlikely(ret)) { objio_free_result(&objios->oir); @@ -655,6 +621,7 @@ static struct pnfs_layoutdriver_type objlayout_type = { .flags = PNFS_LAYOUTRET_ON_SETATTR | PNFS_LAYOUTRET_ON_ERROR, + .max_deviceinfo_size = PAGE_SIZE, .owner = THIS_MODULE, .alloc_layout_hdr = objlayout_alloc_layout_hdr, .free_layout_hdr = objlayout_free_layout_hdr, diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index 765d3f54e986..c89357c7a914 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -229,36 +229,36 @@ objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index, static void _rpc_read_complete(struct work_struct *work) { struct rpc_task *task; - struct nfs_pgio_data *rdata; + struct nfs_pgio_header *hdr; dprintk("%s enter\n", __func__); task = container_of(work, struct rpc_task, u.tk_work); - rdata = container_of(task, struct nfs_pgio_data, task); + hdr = container_of(task, struct nfs_pgio_header, task); - pnfs_ld_read_done(rdata); + pnfs_ld_read_done(hdr); } void objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync) { - struct nfs_pgio_data *rdata = oir->rpcdata; + struct nfs_pgio_header *hdr = oir->rpcdata; - oir->status = rdata->task.tk_status = status; + oir->status = hdr->task.tk_status = status; if (status >= 0) - rdata->res.count = status; + hdr->res.count = status; else - rdata->header->pnfs_error = status; + hdr->pnfs_error = status; objlayout_iodone(oir); /* must not use oir after 
this point */ dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__, - status, rdata->res.eof, sync); + status, hdr->res.eof, sync); if (sync) - pnfs_ld_read_done(rdata); + pnfs_ld_read_done(hdr); else { - INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete); - schedule_work(&rdata->task.u.tk_work); + INIT_WORK(&hdr->task.u.tk_work, _rpc_read_complete); + schedule_work(&hdr->task.u.tk_work); } } @@ -266,12 +266,11 @@ objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync) * Perform sync or async reads. */ enum pnfs_try_status -objlayout_read_pagelist(struct nfs_pgio_data *rdata) +objlayout_read_pagelist(struct nfs_pgio_header *hdr) { - struct nfs_pgio_header *hdr = rdata->header; struct inode *inode = hdr->inode; - loff_t offset = rdata->args.offset; - size_t count = rdata->args.count; + loff_t offset = hdr->args.offset; + size_t count = hdr->args.count; int err; loff_t eof; @@ -279,23 +278,23 @@ objlayout_read_pagelist(struct nfs_pgio_data *rdata) if (unlikely(offset + count > eof)) { if (offset >= eof) { err = 0; - rdata->res.count = 0; - rdata->res.eof = 1; + hdr->res.count = 0; + hdr->res.eof = 1; /*FIXME: do we need to call pnfs_ld_read_done() */ goto out; } count = eof - offset; } - rdata->res.eof = (offset + count) >= eof; - _fix_verify_io_params(hdr->lseg, &rdata->args.pages, - &rdata->args.pgbase, - rdata->args.offset, rdata->args.count); + hdr->res.eof = (offset + count) >= eof; + _fix_verify_io_params(hdr->lseg, &hdr->args.pages, + &hdr->args.pgbase, + hdr->args.offset, hdr->args.count); dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n", - __func__, inode->i_ino, offset, count, rdata->res.eof); + __func__, inode->i_ino, offset, count, hdr->res.eof); - err = objio_read_pagelist(rdata); + err = objio_read_pagelist(hdr); out: if (unlikely(err)) { hdr->pnfs_error = err; @@ -312,38 +311,38 @@ objlayout_read_pagelist(struct nfs_pgio_data *rdata) static void _rpc_write_complete(struct work_struct *work) { struct rpc_task *task; - struct nfs_pgio_data *wdata; + struct nfs_pgio_header *hdr; dprintk("%s enter\n", __func__); task = container_of(work, struct rpc_task, u.tk_work); - wdata = container_of(task, struct nfs_pgio_data, task); + hdr = container_of(task, struct nfs_pgio_header, task); - pnfs_ld_write_done(wdata); + pnfs_ld_write_done(hdr); } void objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync) { - struct nfs_pgio_data *wdata = oir->rpcdata; + struct nfs_pgio_header *hdr = oir->rpcdata; - oir->status = wdata->task.tk_status = status; + oir->status = hdr->task.tk_status = status; if (status >= 0) { - wdata->res.count = status; - wdata->verf.committed = oir->committed; + hdr->res.count = status; + hdr->verf.committed = oir->committed; } else { - wdata->header->pnfs_error = status; + hdr->pnfs_error = status; } objlayout_iodone(oir); /* must not use oir after this point */ dprintk("%s: Return status %zd committed %d sync=%d\n", __func__, - status, wdata->verf.committed, sync); + status, hdr->verf.committed, sync); if (sync) - pnfs_ld_write_done(wdata); + pnfs_ld_write_done(hdr); else { - INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete); - schedule_work(&wdata->task.u.tk_work); + INIT_WORK(&hdr->task.u.tk_work, _rpc_write_complete); + schedule_work(&hdr->task.u.tk_work); } } @@ -351,17 +350,15 @@ objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync) * Perform sync or async writes. 
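
objlayout_read_pagelist() clips reads that straddle end-of-file before objio ever sees them. A worked example with assumed numbers, i_size = 10000, offset = 8192, count = 4096:

	/* offset < eof, but offset + count = 12288 > eof = 10000, so: */
	count = eof - offset;			/* 1808 */
	hdr->res.eof = (offset + count) >= eof;	/* 1 */
	/* had offset been >= eof, the function returns early with
	 * res.count = 0 and res.eof = 1, never calling objio_read_pagelist() */
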
*/ enum pnfs_try_status -objlayout_write_pagelist(struct nfs_pgio_data *wdata, - int how) +objlayout_write_pagelist(struct nfs_pgio_header *hdr, int how) { - struct nfs_pgio_header *hdr = wdata->header; int err; - _fix_verify_io_params(hdr->lseg, &wdata->args.pages, - &wdata->args.pgbase, - wdata->args.offset, wdata->args.count); + _fix_verify_io_params(hdr->lseg, &hdr->args.pages, + &hdr->args.pgbase, + hdr->args.offset, hdr->args.count); - err = objio_write_pagelist(wdata, how); + err = objio_write_pagelist(hdr, how); if (unlikely(err)) { hdr->pnfs_error = err; dprintk("%s: Returned Error %d\n", __func__, err); @@ -577,76 +574,6 @@ loop_done: dprintk("%s: Return\n", __func__); } - -/* - * Get Device Info API for io engines - */ -struct objlayout_deviceinfo { - struct page *page; - struct pnfs_osd_deviceaddr da; /* This must be last */ -}; - -/* Initialize and call nfs_getdeviceinfo, then decode and return a - * "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo() - * should be called. - */ -int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, - struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr, - gfp_t gfp_flags) -{ - struct objlayout_deviceinfo *odi; - struct pnfs_device pd; - struct page *page, **pages; - u32 *p; - int err; - - page = alloc_page(gfp_flags); - if (!page) - return -ENOMEM; - - pages = &page; - pd.pages = pages; - - memcpy(&pd.dev_id, d_id, sizeof(*d_id)); - pd.layout_type = LAYOUT_OSD2_OBJECTS; - pd.pages = &page; - pd.pgbase = 0; - pd.pglen = PAGE_SIZE; - pd.mincount = 0; - pd.maxcount = PAGE_SIZE; - - err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd, - pnfslay->plh_lc_cred); - dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err); - if (err) - goto err_out; - - p = page_address(page); - odi = kzalloc(sizeof(*odi), gfp_flags); - if (!odi) { - err = -ENOMEM; - goto err_out; - } - pnfs_osd_xdr_decode_deviceaddr(&odi->da, p); - odi->page = page; - *deviceaddr = &odi->da; - return 0; - -err_out: - __free_page(page); - return err; -} - -void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr) -{ - struct objlayout_deviceinfo *odi = container_of(deviceaddr, - struct objlayout_deviceinfo, - da); - - __free_page(odi->page); - kfree(odi); -} - enum { OBJLAYOUT_MAX_URI_LEN = 256, OBJLAYOUT_MAX_OSDNAME_LEN = 64, OBJLAYOUT_MAX_SYSID_HEX_LEN = OSD_SYSTEMID_LEN * 2 + 1, diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h index 01e041029a6c..3a0828d57339 100644 --- a/fs/nfs/objlayout/objlayout.h +++ b/fs/nfs/objlayout/objlayout.h @@ -119,8 +119,8 @@ extern void objio_free_lseg(struct pnfs_layout_segment *lseg); */ extern void objio_free_result(struct objlayout_io_res *oir); -extern int objio_read_pagelist(struct nfs_pgio_data *rdata); -extern int objio_write_pagelist(struct nfs_pgio_data *wdata, int how); +extern int objio_read_pagelist(struct nfs_pgio_header *rdata); +extern int objio_write_pagelist(struct nfs_pgio_header *wdata, int how); /* * callback API @@ -149,11 +149,6 @@ extern void objlayout_read_done(struct objlayout_io_res *oir, extern void objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync); -extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, - struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr, - gfp_t gfp_flags); -extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr); - /* * exported generic objects function vectors */ @@ -168,10 +163,10 @@ extern struct pnfs_layout_segment 
*objlayout_alloc_lseg( extern void objlayout_free_lseg(struct pnfs_layout_segment *); extern enum pnfs_try_status objlayout_read_pagelist( - struct nfs_pgio_data *); + struct nfs_pgio_header *); extern enum pnfs_try_status objlayout_write_pagelist( - struct nfs_pgio_data *, + struct nfs_pgio_header *, int how); extern void objlayout_encode_layoutcommit( diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 17fab89f6358..94e16ec88312 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -115,8 +115,8 @@ __nfs_iocounter_wait(struct nfs_io_counter *c) set_bit(NFS_IO_INPROGRESS, &c->flags); if (atomic_read(&c->io_count) == 0) break; - ret = nfs_wait_bit_killable(&c->flags); - } while (atomic_read(&c->io_count) != 0); + ret = nfs_wait_bit_killable(&q.key); + } while (atomic_read(&c->io_count) != 0 && !ret); finish_wait(wq, &q.wait); return ret; } @@ -136,28 +136,52 @@ nfs_iocounter_wait(struct nfs_io_counter *c) return __nfs_iocounter_wait(c); } -static int nfs_wait_bit_uninterruptible(void *word) -{ - io_schedule(); - return 0; -} - /* * nfs_page_group_lock - lock the head of the page group * @req - request in group that is to be locked + * @nonblock - if true don't block waiting for lock * * this lock must be held if modifying the page group list + * + * return 0 on success, < 0 on error: -EDELAY if nonblocking or the + * result from wait_on_bit_lock + * + * NOTE: calling with nonblock=false should always have set the + * lock bit (see fs/buffer.c and other uses of wait_on_bit_lock + * with TASK_UNINTERRUPTIBLE), so there is no need to check the result. + */ +int +nfs_page_group_lock(struct nfs_page *req, bool nonblock) +{ + struct nfs_page *head = req->wb_head; + + WARN_ON_ONCE(head != head->wb_head); + + if (!test_and_set_bit(PG_HEADLOCK, &head->wb_flags)) + return 0; + + if (!nonblock) + return wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK, + TASK_UNINTERRUPTIBLE); + + return -EAGAIN; +} + +/* + * nfs_page_group_lock_wait - wait for the lock to clear, but don't grab it + * @req - a request in the group + * + * This is a blocking call to wait for the group lock to be cleared. */ void -nfs_page_group_lock(struct nfs_page *req) +nfs_page_group_lock_wait(struct nfs_page *req) { struct nfs_page *head = req->wb_head; WARN_ON_ONCE(head != head->wb_head); - wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK, - nfs_wait_bit_uninterruptible, - TASK_UNINTERRUPTIBLE); + wait_on_bit(&head->wb_flags, PG_HEADLOCK, + TASK_UNINTERRUPTIBLE); } /* @@ -218,7 +242,7 @@ bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit) { bool ret; - nfs_page_group_lock(req); + nfs_page_group_lock(req, false); ret = nfs_page_group_sync_on_bit_locked(req, bit); nfs_page_group_unlock(req); @@ -435,9 +459,8 @@ void nfs_release_request(struct nfs_page *req) int nfs_wait_on_request(struct nfs_page *req) { - return wait_on_bit(&req->wb_flags, PG_BUSY, - nfs_wait_bit_uninterruptible, - TASK_UNINTERRUPTIBLE); + return wait_on_bit_io(&req->wb_flags, PG_BUSY, + TASK_UNINTERRUPTIBLE); } /* @@ -458,127 +481,84 @@ size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, return 0; } + /* + * Limit the request size so that we can still allocate a page array + * for it without upsetting the slab allocator. 
+ */ + if (((desc->pg_count + req->wb_bytes) >> PAGE_SHIFT) * + sizeof(struct page) > PAGE_SIZE) + return 0; + return min(desc->pg_bsize - desc->pg_count, (size_t)req->wb_bytes); } EXPORT_SYMBOL_GPL(nfs_generic_pg_test); -static inline struct nfs_rw_header *NFS_RW_HEADER(struct nfs_pgio_header *hdr) -{ - return container_of(hdr, struct nfs_rw_header, header); -} - -/** - * nfs_rw_header_alloc - Allocate a header for a read or write - * @ops: Read or write function vector - */ -struct nfs_rw_header *nfs_rw_header_alloc(const struct nfs_rw_ops *ops) +struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *ops) { - struct nfs_rw_header *header = ops->rw_alloc_header(); - - if (header) { - struct nfs_pgio_header *hdr = &header->header; + struct nfs_pgio_header *hdr = ops->rw_alloc_header(); + if (hdr) { INIT_LIST_HEAD(&hdr->pages); spin_lock_init(&hdr->lock); - atomic_set(&hdr->refcnt, 0); hdr->rw_ops = ops; } - return header; + return hdr; } -EXPORT_SYMBOL_GPL(nfs_rw_header_alloc); +EXPORT_SYMBOL_GPL(nfs_pgio_header_alloc); /* - * nfs_rw_header_free - Free a read or write header + * nfs_pgio_header_free - Free a read or write header * @hdr: The header to free */ -void nfs_rw_header_free(struct nfs_pgio_header *hdr) +void nfs_pgio_header_free(struct nfs_pgio_header *hdr) { - hdr->rw_ops->rw_free_header(NFS_RW_HEADER(hdr)); + hdr->rw_ops->rw_free_header(hdr); } -EXPORT_SYMBOL_GPL(nfs_rw_header_free); +EXPORT_SYMBOL_GPL(nfs_pgio_header_free); /** - * nfs_pgio_data_alloc - Allocate pageio data - * @hdr: The header making a request - * @pagecount: Number of pages to create - */ -static struct nfs_pgio_data *nfs_pgio_data_alloc(struct nfs_pgio_header *hdr, - unsigned int pagecount) -{ - struct nfs_pgio_data *data, *prealloc; - - prealloc = &NFS_RW_HEADER(hdr)->rpc_data; - if (prealloc->header == NULL) - data = prealloc; - else - data = kzalloc(sizeof(*data), GFP_KERNEL); - if (!data) - goto out; - - if (nfs_pgarray_set(&data->pages, pagecount)) { - data->header = hdr; - atomic_inc(&hdr->refcnt); - } else { - if (data != prealloc) - kfree(data); - data = NULL; - } -out: - return data; -} - -/** - * nfs_pgio_data_release - Properly free pageio data - * @data: The data to release + * nfs_pgio_data_destroy - make @hdr suitable for reuse + * + * Frees memory and releases refs from nfs_generic_pgio, so that it may + * be called again. + * + * @hdr: A header that has had nfs_generic_pgio called */ -void nfs_pgio_data_release(struct nfs_pgio_data *data) +void nfs_pgio_data_destroy(struct nfs_pgio_header *hdr) { - struct nfs_pgio_header *hdr = data->header; - struct nfs_rw_header *pageio_header = NFS_RW_HEADER(hdr); - - put_nfs_open_context(data->args.context); - if (data->pages.pagevec != data->pages.page_array) - kfree(data->pages.pagevec); - if (data == &pageio_header->rpc_data) { - data->header = NULL; - data = NULL; - } - if (atomic_dec_and_test(&hdr->refcnt)) - hdr->completion_ops->completion(hdr); - /* Note: we only free the rpc_task after callbacks are done. 
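
The new size check in nfs_generic_pg_test() at the top of this hunk keeps the page array that nfs_pgarray_set() will later kmalloc() within a single page. With assumed sizes PAGE_SIZE = 4096 and sizeof(struct page) = 64, the arithmetic is:

	npages = (desc->pg_count + req->wb_bytes) >> PAGE_SHIFT;
	if (npages * sizeof(struct page) > PAGE_SIZE)	/* more than 64 pages */
		return 0;	/* caps one pgio at 64 * 4 KiB = 256 KiB here */
	/* the array actually stores struct page pointers, so sizing the
	 * bound by sizeof(struct page) makes it conservative */
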
- * See the comment in rpc_free_task() for why - */ - kfree(data); + put_nfs_open_context(hdr->args.context); + if (hdr->page_array.pagevec != hdr->page_array.page_array) + kfree(hdr->page_array.pagevec); } -EXPORT_SYMBOL_GPL(nfs_pgio_data_release); +EXPORT_SYMBOL_GPL(nfs_pgio_data_destroy); /** * nfs_pgio_rpcsetup - Set up arguments for a pageio call - * @data: The pageio data + * @hdr: The pageio hdr * @count: Number of bytes to read * @offset: Initial offset * @how: How to commit data (writes only) * @cinfo: Commit information for the call (writes only) */ -static void nfs_pgio_rpcsetup(struct nfs_pgio_data *data, +static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr, unsigned int count, unsigned int offset, int how, struct nfs_commit_info *cinfo) { - struct nfs_page *req = data->header->req; + struct nfs_page *req = hdr->req; /* Set up the RPC argument and reply structs - * NB: take care not to mess about with data->commit et al. */ + * NB: take care not to mess about with hdr->commit et al. */ - data->args.fh = NFS_FH(data->header->inode); - data->args.offset = req_offset(req) + offset; + hdr->args.fh = NFS_FH(hdr->inode); + hdr->args.offset = req_offset(req) + offset; /* pnfs_set_layoutcommit needs this */ - data->mds_offset = data->args.offset; - data->args.pgbase = req->wb_pgbase + offset; - data->args.pages = data->pages.pagevec; - data->args.count = count; - data->args.context = get_nfs_open_context(req->wb_context); - data->args.lock_context = req->wb_lock_context; - data->args.stable = NFS_UNSTABLE; + hdr->mds_offset = hdr->args.offset; + hdr->args.pgbase = req->wb_pgbase + offset; + hdr->args.pages = hdr->page_array.pagevec; + hdr->args.count = count; + hdr->args.context = get_nfs_open_context(req->wb_context); + hdr->args.lock_context = req->wb_lock_context; + hdr->args.stable = NFS_UNSTABLE; switch (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) { case 0: break; @@ -586,59 +566,59 @@ static void nfs_pgio_rpcsetup(struct nfs_pgio_data *data, if (nfs_reqs_to_commit(cinfo)) break; default: - data->args.stable = NFS_FILE_SYNC; + hdr->args.stable = NFS_FILE_SYNC; } - data->res.fattr = &data->fattr; - data->res.count = count; - data->res.eof = 0; - data->res.verf = &data->verf; - nfs_fattr_init(&data->fattr); + hdr->res.fattr = &hdr->fattr; + hdr->res.count = count; + hdr->res.eof = 0; + hdr->res.verf = &hdr->verf; + nfs_fattr_init(&hdr->fattr); } /** - * nfs_pgio_prepare - Prepare pageio data to go over the wire + * nfs_pgio_prepare - Prepare pageio hdr to go over the wire * @task: The current task - * @calldata: pageio data to prepare + * @calldata: pageio header to prepare */ static void nfs_pgio_prepare(struct rpc_task *task, void *calldata) { - struct nfs_pgio_data *data = calldata; + struct nfs_pgio_header *hdr = calldata; int err; - err = NFS_PROTO(data->header->inode)->pgio_rpc_prepare(task, data); + err = NFS_PROTO(hdr->inode)->pgio_rpc_prepare(task, hdr); if (err) rpc_exit(task, err); } -int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_data *data, +int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr, const struct rpc_call_ops *call_ops, int how, int flags) { struct rpc_task *task; struct rpc_message msg = { - .rpc_argp = &data->args, - .rpc_resp = &data->res, - .rpc_cred = data->header->cred, + .rpc_argp = &hdr->args, + .rpc_resp = &hdr->res, + .rpc_cred = hdr->cred, }; struct rpc_task_setup task_setup_data = { .rpc_client = clnt, - .task = &data->task, + .task = &hdr->task, .rpc_message = &msg, .callback_ops = call_ops, - .callback_data 
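
With struct nfs_pgio_data folded into struct nfs_pgio_header, a single object now owns the page vector, RPC task, arguments and results, and the old hdr->refcnt handshake between the two is gone. The resulting lifecycle, condensed from the hunks around this point:

	/* condensed sketch; error paths omitted */
	hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
	nfs_pgheader_init(desc, hdr, nfs_pgio_header_free);
	if (nfs_generic_pgio(desc, hdr) == 0)	/* fills page_array and args */
		nfs_initiate_pgio(NFS_CLIENT(hdr->inode), hdr,
				  desc->pg_rpc_callops, desc->pg_ioflags, 0);
	/* nfs_pgio_release() later runs nfs_pgio_data_destroy() plus the
	 * completion op exactly once, with no reference counting */
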
= data, + .callback_data = hdr, .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC | flags, }; int ret = 0; - data->header->rw_ops->rw_initiate(data, &msg, &task_setup_data, how); + hdr->rw_ops->rw_initiate(hdr, &msg, &task_setup_data, how); dprintk("NFS: %5u initiated pgio call " "(req %s/%llu, %u bytes @ offset %llu)\n", - data->task.tk_pid, - data->header->inode->i_sb->s_id, - (unsigned long long)NFS_FILEID(data->header->inode), - data->args.count, - (unsigned long long)data->args.offset); + hdr->task.tk_pid, + hdr->inode->i_sb->s_id, + (unsigned long long)NFS_FILEID(hdr->inode), + hdr->args.count, + (unsigned long long)hdr->args.offset); task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) { @@ -665,22 +645,23 @@ static int nfs_pgio_error(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) { set_bit(NFS_IOHDR_REDO, &hdr->flags); - nfs_pgio_data_release(hdr->data); - hdr->data = NULL; + nfs_pgio_data_destroy(hdr); + hdr->completion_ops->completion(hdr); desc->pg_completion_ops->error_cleanup(&desc->pg_list); return -ENOMEM; } /** * nfs_pgio_release - Release pageio data - * @calldata: The pageio data to release + * @calldata: The pageio header to release */ static void nfs_pgio_release(void *calldata) { - struct nfs_pgio_data *data = calldata; - if (data->header->rw_ops->rw_release) - data->header->rw_ops->rw_release(data); - nfs_pgio_data_release(data); + struct nfs_pgio_header *hdr = calldata; + if (hdr->rw_ops->rw_release) + hdr->rw_ops->rw_release(hdr); + nfs_pgio_data_destroy(hdr); + hdr->completion_ops->completion(hdr); } /** @@ -721,22 +702,22 @@ EXPORT_SYMBOL_GPL(nfs_pageio_init); /** * nfs_pgio_result - Basic pageio error handling * @task: The task that ran - * @calldata: Pageio data to check + * @calldata: Pageio header to check */ static void nfs_pgio_result(struct rpc_task *task, void *calldata) { - struct nfs_pgio_data *data = calldata; - struct inode *inode = data->header->inode; + struct nfs_pgio_header *hdr = calldata; + struct inode *inode = hdr->inode; dprintk("NFS: %s: %5u, (status %d)\n", __func__, task->tk_pid, task->tk_status); - if (data->header->rw_ops->rw_done(task, data, inode) != 0) + if (hdr->rw_ops->rw_done(task, hdr, inode) != 0) return; if (task->tk_status < 0) - nfs_set_pgio_error(data->header, task->tk_status, data->args.offset); + nfs_set_pgio_error(hdr, task->tk_status, hdr->args.offset); else - data->header->rw_ops->rw_result(task, data); + hdr->rw_ops->rw_result(task, hdr); } /* @@ -751,32 +732,42 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) { struct nfs_page *req; - struct page **pages; - struct nfs_pgio_data *data; + struct page **pages, + *last_page; struct list_head *head = &desc->pg_list; struct nfs_commit_info cinfo; + unsigned int pagecount, pageused; - data = nfs_pgio_data_alloc(hdr, nfs_page_array_len(desc->pg_base, - desc->pg_count)); - if (!data) + pagecount = nfs_page_array_len(desc->pg_base, desc->pg_count); + if (!nfs_pgarray_set(&hdr->page_array, pagecount)) return nfs_pgio_error(desc, hdr); nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq); - pages = data->pages.pagevec; + pages = hdr->page_array.pagevec; + last_page = NULL; + pageused = 0; while (!list_empty(head)) { req = nfs_list_entry(head->next); nfs_list_remove_request(req); nfs_list_add_request(req, &hdr->pages); - *pages++ = req->wb_page; + + if (WARN_ON_ONCE(pageused >= pagecount)) + return nfs_pgio_error(desc, hdr); + + if (!last_page || last_page != req->wb_page) { + *pages++ = last_page = req->wb_page; + 
pageused++; + } } + if (WARN_ON_ONCE(pageused != pagecount)) + return nfs_pgio_error(desc, hdr); if ((desc->pg_ioflags & FLUSH_COND_STABLE) && (desc->pg_moreio || nfs_reqs_to_commit(&cinfo))) desc->pg_ioflags &= ~FLUSH_COND_STABLE; /* Set up the argument struct */ - nfs_pgio_rpcsetup(data, desc->pg_count, 0, desc->pg_ioflags, &cinfo); - hdr->data = data; + nfs_pgio_rpcsetup(hdr, desc->pg_count, 0, desc->pg_ioflags, &cinfo); desc->pg_rpc_callops = &nfs_pgio_common_ops; return 0; } @@ -784,25 +775,20 @@ EXPORT_SYMBOL_GPL(nfs_generic_pgio); static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc) { - struct nfs_rw_header *rw_hdr; struct nfs_pgio_header *hdr; int ret; - rw_hdr = nfs_rw_header_alloc(desc->pg_rw_ops); - if (!rw_hdr) { + hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); + if (!hdr) { desc->pg_completion_ops->error_cleanup(&desc->pg_list); return -ENOMEM; } - hdr = &rw_hdr->header; - nfs_pgheader_init(desc, hdr, nfs_rw_header_free); - atomic_inc(&hdr->refcnt); + nfs_pgheader_init(desc, hdr, nfs_pgio_header_free); ret = nfs_generic_pgio(desc, hdr); if (ret == 0) ret = nfs_initiate_pgio(NFS_CLIENT(hdr->inode), - hdr->data, desc->pg_rpc_callops, + hdr, desc->pg_rpc_callops, desc->pg_ioflags, 0); - if (atomic_dec_and_test(&hdr->refcnt)) - hdr->completion_ops->completion(hdr); return ret; } @@ -845,6 +831,14 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev, return false; if (req_offset(req) != req_offset(prev) + prev->wb_bytes) return false; + if (req->wb_page == prev->wb_page) { + if (req->wb_pgbase != prev->wb_pgbase + prev->wb_bytes) + return false; + } else { + if (req->wb_pgbase != 0 || + prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE) + return false; + } } size = pgio->pg_ops->pg_test(pgio, prev, req); WARN_ON_ONCE(size > req->wb_bytes); @@ -916,7 +910,7 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, unsigned int bytes_left = 0; unsigned int offset, pgbase; - nfs_page_group_lock(req); + nfs_page_group_lock(req, false); subreq = req; bytes_left = subreq->wb_bytes; @@ -938,7 +932,7 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, if (desc->pg_recoalesce) return 0; /* retry add_request for this subreq */ - nfs_page_group_lock(req); + nfs_page_group_lock(req, false); continue; } @@ -1013,7 +1007,38 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, } while (ret); return ret; } -EXPORT_SYMBOL_GPL(nfs_pageio_add_request); + +/* + * nfs_pageio_resend - Transfer requests to new descriptor and resend + * @hdr - the pgio header to move request from + * @desc - the pageio descriptor to add requests to + * + * Try to move each request (nfs_page) from @hdr to @desc then attempt + * to send them. + * + * Returns 0 on success and < 0 on error. 
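
nfs_generic_pgio() now stores each struct page only once even when several sub-page requests land on it, and nfs_can_coalesce_requests() makes sure such requests are genuinely contiguous. The rules, tabulated with an assumed 4 KiB PAGE_CACHE_SIZE:

	/* same page:  req must begin exactly where prev ended
	 *   prev {pgbase 0, bytes 2048} + req {pgbase 2048}    -> coalesce
	 * new page:   prev must fill its page and req start at 0
	 *   prev {pgbase 0, bytes 4096} + req {pgbase 0}       -> coalesce
	 *   prev {pgbase 0, bytes 2048} + req on the next page -> refuse
	 *   prev {pgbase 0, bytes 4096} + req {pgbase 512}     -> refuse */
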
+ */ +int nfs_pageio_resend(struct nfs_pageio_descriptor *desc, + struct nfs_pgio_header *hdr) +{ + LIST_HEAD(failed); + + desc->pg_dreq = hdr->dreq; + while (!list_empty(&hdr->pages)) { + struct nfs_page *req = nfs_list_entry(hdr->pages.next); + + nfs_list_remove_request(req); + if (!nfs_pageio_add_request(desc, req)) + nfs_list_add_request(req, &failed); + } + nfs_pageio_complete(desc); + if (!list_empty(&failed)) { + list_move(&failed, &hdr->pages); + return -EIO; + } + return 0; +} +EXPORT_SYMBOL_GPL(nfs_pageio_resend); /** * nfs_pageio_complete - Complete I/O on an nfs_pageio_descriptor @@ -1029,7 +1054,6 @@ void nfs_pageio_complete(struct nfs_pageio_descriptor *desc) break; } } -EXPORT_SYMBOL_GPL(nfs_pageio_complete); /** * nfs_pageio_cond_complete - Conditional I/O completion diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 6fdcd233d6f7..76de7f568119 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -361,6 +361,23 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg) } EXPORT_SYMBOL_GPL(pnfs_put_lseg); +static void pnfs_put_lseg_async_work(struct work_struct *work) +{ + struct pnfs_layout_segment *lseg; + + lseg = container_of(work, struct pnfs_layout_segment, pls_work); + + pnfs_put_lseg(lseg); +} + +void +pnfs_put_lseg_async(struct pnfs_layout_segment *lseg) +{ + INIT_WORK(&lseg->pls_work, pnfs_put_lseg_async_work); + schedule_work(&lseg->pls_work); +} +EXPORT_SYMBOL_GPL(pnfs_put_lseg_async); + static u64 end_offset(u64 start, u64 len) { @@ -577,6 +594,9 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list, dprintk("%s freeing layout for inode %lu\n", __func__, lo->plh_inode->i_ino); inode = lo->plh_inode; + + pnfs_layoutcommit_inode(inode, false); + spin_lock(&inode->i_lock); list_del_init(&lo->plh_bulk_destroy); lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */ @@ -665,17 +685,6 @@ static bool pnfs_seqid_is_newer(u32 s1, u32 s2) return (s32)(s1 - s2) > 0; } -static void -pnfs_verify_layout_stateid(struct pnfs_layout_hdr *lo, - const nfs4_stateid *new, - struct list_head *free_me_list) -{ - if (nfs4_stateid_match_other(&lo->plh_stateid, new)) - return; - /* Layout is new! 
Kill existing layout segments */ - pnfs_mark_matching_lsegs_invalid(lo, free_me_list, NULL); -} - /* update lo->plh_stateid with new if is more recent */ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, @@ -732,7 +741,8 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, status = -EAGAIN; } else if (!nfs4_valid_open_stateid(open_state)) { status = -EBADF; - } else if (list_empty(&lo->plh_segs)) { + } else if (list_empty(&lo->plh_segs) || + test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) { int seq; do { @@ -847,6 +857,16 @@ _pnfs_return_layout(struct inode *ino) empty = list_empty(&lo->plh_segs); pnfs_clear_layoutcommit(ino, &tmp_list); pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL); + + if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) { + struct pnfs_layout_range range = { + .iomode = IOMODE_ANY, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; + NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &range); + } + /* Don't send a LAYOUTRETURN if list was initially empty */ if (empty) { spin_unlock(&ino->i_lock); @@ -854,6 +874,8 @@ _pnfs_return_layout(struct inode *ino) dprintk("NFS: %s no layout segments to return\n", __func__); goto out; } + + set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); lo->plh_block_lgets++; spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&tmp_list); @@ -1341,25 +1363,41 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) goto out; } + init_lseg(lo, lseg); + lseg->pls_range = res->range; + spin_lock(&ino->i_lock); if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { dprintk("%s forget reply due to recall\n", __func__); goto out_forget_reply; } - if (pnfs_layoutgets_blocked(lo, 1) || - pnfs_layout_stateid_blocked(lo, &res->stateid)) { + if (pnfs_layoutgets_blocked(lo, 1)) { dprintk("%s forget reply due to state\n", __func__); goto out_forget_reply; } - /* Check that the new stateid matches the old stateid */ - pnfs_verify_layout_stateid(lo, &res->stateid, &free_me); - /* Done processing layoutget. Set the layout stateid */ - pnfs_set_layout_stateid(lo, &res->stateid, false); + if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) { + /* existing state ID, make sure the sequence number matches. */ + if (pnfs_layout_stateid_blocked(lo, &res->stateid)) { + dprintk("%s forget reply due to sequence\n", __func__); + goto out_forget_reply; + } + pnfs_set_layout_stateid(lo, &res->stateid, false); + } else { + /* + * We got an entirely new state ID. Mark all segments for the + * inode invalid, and don't bother validating the stateid + * sequence number. 
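The barrier assignment above, like pnfs_seqid_is_newer() earlier in this file, relies on serial-number arithmetic: subtracting two u32 sequence IDs and reading the difference as s32 orders them correctly even across wraparound. A self-contained check of the idiom (illustrative only):

#include <assert.h>
#include <stdint.h>

/* Same comparison as pnfs_seqid_is_newer(): true if s1 is newer. */
static int seqid_is_newer(uint32_t s1, uint32_t s2)
{
        return (int32_t)(s1 - s2) > 0;
}

int main(void)
{
        assert(seqid_is_newer(2, 1));
        assert(!seqid_is_newer(1, 2));
        assert(seqid_is_newer(0, UINT32_MAX));  /* holds across wraparound */
        return 0;
}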
+ */ + pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL); + + nfs4_stateid_copy(&lo->plh_stateid, &res->stateid); + lo->plh_barrier = be32_to_cpu(res->stateid.seqid); + } + + clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - init_lseg(lo, lseg); - lseg->pls_range = res->range; pnfs_get_lseg(lseg); pnfs_layout_insert_lseg(lo, lseg); @@ -1470,41 +1508,19 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, } EXPORT_SYMBOL_GPL(pnfs_generic_pg_test); -int pnfs_write_done_resend_to_mds(struct inode *inode, - struct list_head *head, - const struct nfs_pgio_completion_ops *compl_ops, - struct nfs_direct_req *dreq) +int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *hdr) { struct nfs_pageio_descriptor pgio; - LIST_HEAD(failed); /* Resend all requests through the MDS */ - nfs_pageio_init_write(&pgio, inode, FLUSH_STABLE, true, compl_ops); - pgio.pg_dreq = dreq; - while (!list_empty(head)) { - struct nfs_page *req = nfs_list_entry(head->next); - - nfs_list_remove_request(req); - if (!nfs_pageio_add_request(&pgio, req)) - nfs_list_add_request(req, &failed); - } - nfs_pageio_complete(&pgio); - - if (!list_empty(&failed)) { - /* For some reason our attempt to resend pages. Mark the - * overall send request as having failed, and let - * nfs_writeback_release_full deal with the error. - */ - list_move(&failed, head); - return -EIO; - } - return 0; + nfs_pageio_init_write(&pgio, hdr->inode, FLUSH_STABLE, true, + hdr->completion_ops); + return nfs_pageio_resend(&pgio, hdr); } EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds); -static void pnfs_ld_handle_write_error(struct nfs_pgio_data *data) +static void pnfs_ld_handle_write_error(struct nfs_pgio_header *hdr) { - struct nfs_pgio_header *hdr = data->header; dprintk("pnfs write error = %d\n", hdr->pnfs_error); if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags & @@ -1512,50 +1528,42 @@ static void pnfs_ld_handle_write_error(struct nfs_pgio_data *data) pnfs_return_layout(hdr->inode); } if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) - data->task.tk_status = pnfs_write_done_resend_to_mds(hdr->inode, - &hdr->pages, - hdr->completion_ops, - hdr->dreq); + hdr->task.tk_status = pnfs_write_done_resend_to_mds(hdr); } /* * Called by non rpc-based layout drivers */ -void pnfs_ld_write_done(struct nfs_pgio_data *data) +void pnfs_ld_write_done(struct nfs_pgio_header *hdr) { - struct nfs_pgio_header *hdr = data->header; - - trace_nfs4_pnfs_write(data, hdr->pnfs_error); + trace_nfs4_pnfs_write(hdr, hdr->pnfs_error); if (!hdr->pnfs_error) { - pnfs_set_layoutcommit(data); - hdr->mds_ops->rpc_call_done(&data->task, data); + pnfs_set_layoutcommit(hdr); + hdr->mds_ops->rpc_call_done(&hdr->task, hdr); } else - pnfs_ld_handle_write_error(data); - hdr->mds_ops->rpc_release(data); + pnfs_ld_handle_write_error(hdr); + hdr->mds_ops->rpc_release(hdr); } EXPORT_SYMBOL_GPL(pnfs_ld_write_done); static void pnfs_write_through_mds(struct nfs_pageio_descriptor *desc, - struct nfs_pgio_data *data) + struct nfs_pgio_header *hdr) { - struct nfs_pgio_header *hdr = data->header; - if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { list_splice_tail_init(&hdr->pages, &desc->pg_list); nfs_pageio_reset_write_mds(desc); desc->pg_recoalesce = 1; } - nfs_pgio_data_release(data); + nfs_pgio_data_destroy(hdr); } static enum pnfs_try_status -pnfs_try_to_write_data(struct nfs_pgio_data *wdata, +pnfs_try_to_write_data(struct nfs_pgio_header *hdr, const struct rpc_call_ops *call_ops, struct pnfs_layout_segment *lseg, int how) { - struct nfs_pgio_header *hdr = 
wdata->header; struct inode *inode = hdr->inode; enum pnfs_try_status trypnfs; struct nfs_server *nfss = NFS_SERVER(inode); @@ -1563,8 +1571,8 @@ pnfs_try_to_write_data(struct nfs_pgio_data *wdata, hdr->mds_ops = call_ops; dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__, - inode->i_ino, wdata->args.count, wdata->args.offset, how); - trypnfs = nfss->pnfs_curr_ld->write_pagelist(wdata, how); + inode->i_ino, hdr->args.count, hdr->args.offset, how); + trypnfs = nfss->pnfs_curr_ld->write_pagelist(hdr, how); if (trypnfs != PNFS_NOT_ATTEMPTED) nfs_inc_stats(inode, NFSIOS_PNFS_WRITE); dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); @@ -1575,139 +1583,105 @@ static void pnfs_do_write(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr, int how) { - struct nfs_pgio_data *data = hdr->data; const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; struct pnfs_layout_segment *lseg = desc->pg_lseg; enum pnfs_try_status trypnfs; desc->pg_lseg = NULL; - trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how); + trypnfs = pnfs_try_to_write_data(hdr, call_ops, lseg, how); if (trypnfs == PNFS_NOT_ATTEMPTED) - pnfs_write_through_mds(desc, data); + pnfs_write_through_mds(desc, hdr); pnfs_put_lseg(lseg); } static void pnfs_writehdr_free(struct nfs_pgio_header *hdr) { pnfs_put_lseg(hdr->lseg); - nfs_rw_header_free(hdr); + nfs_pgio_header_free(hdr); } EXPORT_SYMBOL_GPL(pnfs_writehdr_free); int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) { - struct nfs_rw_header *whdr; struct nfs_pgio_header *hdr; int ret; - whdr = nfs_rw_header_alloc(desc->pg_rw_ops); - if (!whdr) { + hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); + if (!hdr) { desc->pg_completion_ops->error_cleanup(&desc->pg_list); pnfs_put_lseg(desc->pg_lseg); desc->pg_lseg = NULL; return -ENOMEM; } - hdr = &whdr->header; nfs_pgheader_init(desc, hdr, pnfs_writehdr_free); hdr->lseg = pnfs_get_lseg(desc->pg_lseg); - atomic_inc(&hdr->refcnt); ret = nfs_generic_pgio(desc, hdr); if (ret != 0) { pnfs_put_lseg(desc->pg_lseg); desc->pg_lseg = NULL; } else pnfs_do_write(desc, hdr, desc->pg_ioflags); - if (atomic_dec_and_test(&hdr->refcnt)) - hdr->completion_ops->completion(hdr); return ret; } EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages); -int pnfs_read_done_resend_to_mds(struct inode *inode, - struct list_head *head, - const struct nfs_pgio_completion_ops *compl_ops, - struct nfs_direct_req *dreq) +int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *hdr) { struct nfs_pageio_descriptor pgio; - LIST_HEAD(failed); /* Resend all requests through the MDS */ - nfs_pageio_init_read(&pgio, inode, true, compl_ops); - pgio.pg_dreq = dreq; - while (!list_empty(head)) { - struct nfs_page *req = nfs_list_entry(head->next); - - nfs_list_remove_request(req); - if (!nfs_pageio_add_request(&pgio, req)) - nfs_list_add_request(req, &failed); - } - nfs_pageio_complete(&pgio); - - if (!list_empty(&failed)) { - list_move(&failed, head); - return -EIO; - } - return 0; + nfs_pageio_init_read(&pgio, hdr->inode, true, hdr->completion_ops); + return nfs_pageio_resend(&pgio, hdr); } EXPORT_SYMBOL_GPL(pnfs_read_done_resend_to_mds); -static void pnfs_ld_handle_read_error(struct nfs_pgio_data *data) +static void pnfs_ld_handle_read_error(struct nfs_pgio_header *hdr) { - struct nfs_pgio_header *hdr = data->header; - dprintk("pnfs read error = %d\n", hdr->pnfs_error); if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags & PNFS_LAYOUTRET_ON_ERROR) { pnfs_return_layout(hdr->inode); } if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) - 
data->task.tk_status = pnfs_read_done_resend_to_mds(hdr->inode, - &hdr->pages, - hdr->completion_ops, - hdr->dreq); + hdr->task.tk_status = pnfs_read_done_resend_to_mds(hdr); } /* * Called by non rpc-based layout drivers */ -void pnfs_ld_read_done(struct nfs_pgio_data *data) +void pnfs_ld_read_done(struct nfs_pgio_header *hdr) { - struct nfs_pgio_header *hdr = data->header; - - trace_nfs4_pnfs_read(data, hdr->pnfs_error); + trace_nfs4_pnfs_read(hdr, hdr->pnfs_error); if (likely(!hdr->pnfs_error)) { - __nfs4_read_done_cb(data); - hdr->mds_ops->rpc_call_done(&data->task, data); + __nfs4_read_done_cb(hdr); + hdr->mds_ops->rpc_call_done(&hdr->task, hdr); } else - pnfs_ld_handle_read_error(data); - hdr->mds_ops->rpc_release(data); + pnfs_ld_handle_read_error(hdr); + hdr->mds_ops->rpc_release(hdr); } EXPORT_SYMBOL_GPL(pnfs_ld_read_done); static void pnfs_read_through_mds(struct nfs_pageio_descriptor *desc, - struct nfs_pgio_data *data) + struct nfs_pgio_header *hdr) { - struct nfs_pgio_header *hdr = data->header; - if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { list_splice_tail_init(&hdr->pages, &desc->pg_list); nfs_pageio_reset_read_mds(desc); desc->pg_recoalesce = 1; } - nfs_pgio_data_release(data); + nfs_pgio_data_destroy(hdr); } /* * Call the appropriate parallel I/O subsystem read function. */ static enum pnfs_try_status -pnfs_try_to_read_data(struct nfs_pgio_data *rdata, +pnfs_try_to_read_data(struct nfs_pgio_header *hdr, const struct rpc_call_ops *call_ops, struct pnfs_layout_segment *lseg) { - struct nfs_pgio_header *hdr = rdata->header; struct inode *inode = hdr->inode; struct nfs_server *nfss = NFS_SERVER(inode); enum pnfs_try_status trypnfs; @@ -1715,9 +1689,9 @@ pnfs_try_to_read_data(struct nfs_pgio_data *rdata, hdr->mds_ops = call_ops; dprintk("%s: Reading ino:%lu %u@%llu\n", - __func__, inode->i_ino, rdata->args.count, rdata->args.offset); + __func__, inode->i_ino, hdr->args.count, hdr->args.offset); - trypnfs = nfss->pnfs_curr_ld->read_pagelist(rdata); + trypnfs = nfss->pnfs_curr_ld->read_pagelist(hdr); if (trypnfs != PNFS_NOT_ATTEMPTED) nfs_inc_stats(inode, NFSIOS_PNFS_READ); dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); @@ -1727,52 +1701,46 @@ pnfs_try_to_read_data(struct nfs_pgio_data *rdata, static void pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) { - struct nfs_pgio_data *data = hdr->data; const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; struct pnfs_layout_segment *lseg = desc->pg_lseg; enum pnfs_try_status trypnfs; desc->pg_lseg = NULL; - trypnfs = pnfs_try_to_read_data(data, call_ops, lseg); + trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg); if (trypnfs == PNFS_NOT_ATTEMPTED) - pnfs_read_through_mds(desc, data); + pnfs_read_through_mds(desc, hdr); pnfs_put_lseg(lseg); } static void pnfs_readhdr_free(struct nfs_pgio_header *hdr) { pnfs_put_lseg(hdr->lseg); - nfs_rw_header_free(hdr); + nfs_pgio_header_free(hdr); } EXPORT_SYMBOL_GPL(pnfs_readhdr_free); int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) { - struct nfs_rw_header *rhdr; struct nfs_pgio_header *hdr; int ret; - rhdr = nfs_rw_header_alloc(desc->pg_rw_ops); - if (!rhdr) { + hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); + if (!hdr) { desc->pg_completion_ops->error_cleanup(&desc->pg_list); ret = -ENOMEM; pnfs_put_lseg(desc->pg_lseg); desc->pg_lseg = NULL; return ret; } - hdr = &rhdr->header; nfs_pgheader_init(desc, hdr, pnfs_readhdr_free); hdr->lseg = pnfs_get_lseg(desc->pg_lseg); - atomic_inc(&hdr->refcnt); ret = nfs_generic_pgio(desc, hdr); 
if (ret != 0) { pnfs_put_lseg(desc->pg_lseg); desc->pg_lseg = NULL; } else pnfs_do_read(desc, hdr); - if (atomic_dec_and_test(&hdr->refcnt)) - hdr->completion_ops->completion(hdr); return ret; } EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages); @@ -1820,12 +1788,11 @@ void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg) EXPORT_SYMBOL_GPL(pnfs_set_lo_fail); void -pnfs_set_layoutcommit(struct nfs_pgio_data *wdata) +pnfs_set_layoutcommit(struct nfs_pgio_header *hdr) { - struct nfs_pgio_header *hdr = wdata->header; struct inode *inode = hdr->inode; struct nfs_inode *nfsi = NFS_I(inode); - loff_t end_pos = wdata->mds_offset + wdata->res.count; + loff_t end_pos = hdr->mds_offset + hdr->res.count; bool mark_as_dirty = false; spin_lock(&inode->i_lock); @@ -1851,6 +1818,35 @@ pnfs_set_layoutcommit(struct nfs_pgio_data *wdata) } EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit); +void pnfs_commit_set_layoutcommit(struct nfs_commit_data *data) +{ + struct inode *inode = data->inode; + struct nfs_inode *nfsi = NFS_I(inode); + bool mark_as_dirty = false; + + spin_lock(&inode->i_lock); + if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { + mark_as_dirty = true; + dprintk("%s: Set layoutcommit for inode %lu ", + __func__, inode->i_ino); + } + if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &data->lseg->pls_flags)) { + /* references matched in nfs4_layoutcommit_release */ + pnfs_get_lseg(data->lseg); + } + if (data->lwb > nfsi->layout->plh_lwb) + nfsi->layout->plh_lwb = data->lwb; + spin_unlock(&inode->i_lock); + dprintk("%s: lseg %p end_pos %llu\n", + __func__, data->lseg, nfsi->layout->plh_lwb); + + /* if pnfs_layoutcommit_inode() runs between inode locks, the next one + * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */ + if (mark_as_dirty) + mark_inode_dirty_sync(inode); +} +EXPORT_SYMBOL_GPL(pnfs_commit_set_layoutcommit); + void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data) { struct nfs_server *nfss = NFS_SERVER(data->args.inode); @@ -1871,6 +1867,7 @@ void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data) int pnfs_layoutcommit_inode(struct inode *inode, bool sync) { + struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; struct nfs4_layoutcommit_data *data; struct nfs_inode *nfsi = NFS_I(inode); loff_t end_pos; @@ -1885,7 +1882,7 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) { if (!sync) goto out; - status = wait_on_bit_lock(&nfsi->flags, + status = wait_on_bit_lock_action(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING, nfs_wait_bit_killable, TASK_KILLABLE); @@ -1921,6 +1918,20 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) data->args.lastbytewritten = end_pos - 1; data->res.server = NFS_SERVER(inode); + if (ld->prepare_layoutcommit) { + status = ld->prepare_layoutcommit(&data->args); + if (status) { + spin_lock(&inode->i_lock); + if (end_pos < nfsi->layout->plh_lwb) + nfsi->layout->plh_lwb = end_pos; + spin_unlock(&inode->i_lock); + put_rpccred(data->cred); + set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags); + goto clear_layoutcommitting; + } + } + + status = nfs4_proc_layoutcommit(data, sync); out: if (status) diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 4fb309a2b4c4..693ce42ec683 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -32,6 +32,7 @@ #include <linux/nfs_fs.h> #include <linux/nfs_page.h> +#include <linux/workqueue.h> enum { NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */ @@ -46,6 +47,7 @@ struct pnfs_layout_segment { atomic_t pls_refcount; 
unsigned long pls_flags; struct pnfs_layout_hdr *pls_layout; + struct work_struct pls_work; }; enum pnfs_try_status { @@ -63,12 +65,15 @@ enum { NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */ NFS_LAYOUT_ROC, /* some lseg had roc bit set */ NFS_LAYOUT_RETURN, /* Return this layout ASAP */ + NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */ }; enum layoutdriver_policy_flags { - /* Should the pNFS client commit and return the layout upon a setattr */ + /* Should the pNFS client commit and return the layout upon truncate to + * a smaller size */ PNFS_LAYOUTRET_ON_SETATTR = 1 << 0, PNFS_LAYOUTRET_ON_ERROR = 1 << 1, + PNFS_READ_WHOLE_PAGE = 1 << 2, }; struct nfs4_deviceid_node; @@ -80,6 +85,7 @@ struct pnfs_layoutdriver_type { const char *name; struct module *owner; unsigned flags; + unsigned max_deviceinfo_size; int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *); int (*clear_layoutdriver) (struct nfs_server *); @@ -90,6 +96,9 @@ struct pnfs_layoutdriver_type { struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags); void (*free_lseg) (struct pnfs_layout_segment *lseg); + void (*return_range) (struct pnfs_layout_hdr *lo, + struct pnfs_layout_range *range); + /* test for nfs page cache coalescing */ const struct nfs_pageio_ops *pg_read_ops; const struct nfs_pageio_ops *pg_write_ops; @@ -104,6 +113,8 @@ struct pnfs_layoutdriver_type { int max); void (*recover_commit_reqs) (struct list_head *list, struct nfs_commit_info *cinfo); + struct nfs_page * (*search_commit_reqs)(struct nfs_commit_info *cinfo, + struct page *page); int (*commit_pagelist)(struct inode *inode, struct list_head *mds_pages, int how, @@ -113,18 +124,21 @@ struct pnfs_layoutdriver_type { * Return PNFS_ATTEMPTED to indicate the layout code has attempted * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS */ - enum pnfs_try_status (*read_pagelist) (struct nfs_pgio_data *nfs_data); - enum pnfs_try_status (*write_pagelist) (struct nfs_pgio_data *nfs_data, int how); + enum pnfs_try_status (*read_pagelist)(struct nfs_pgio_header *); + enum pnfs_try_status (*write_pagelist)(struct nfs_pgio_header *, int); void (*free_deviceid_node) (struct nfs4_deviceid_node *); + struct nfs4_deviceid_node * (*alloc_deviceid_node) + (struct nfs_server *server, struct pnfs_device *pdev, + gfp_t gfp_flags); void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid, struct xdr_stream *xdr, const struct nfs4_layoutreturn_args *args); void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data); - - void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid, + int (*prepare_layoutcommit) (struct nfs4_layoutcommit_args *args); + void (*encode_layoutcommit) (struct pnfs_layout_hdr *lo, struct xdr_stream *xdr, const struct nfs4_layoutcommit_args *args); }; @@ -167,9 +181,6 @@ extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); /* nfs4proc.c */ -extern int nfs4_proc_getdevicelist(struct nfs_server *server, - const struct nfs_fh *fh, - struct pnfs_devicelist *devlist); extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *dev, struct rpc_cred *cred); @@ -179,6 +190,7 @@ extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp); /* pnfs.c */ void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo); void pnfs_put_lseg(struct pnfs_layout_segment *lseg); +void pnfs_put_lseg_async(struct pnfs_layout_segment 
*lseg); void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32); void unset_pnfs_layoutdriver(struct nfs_server *); @@ -213,13 +225,14 @@ bool pnfs_roc(struct inode *ino); void pnfs_roc_release(struct inode *ino); void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task); -void pnfs_set_layoutcommit(struct nfs_pgio_data *wdata); +void pnfs_set_layoutcommit(struct nfs_pgio_header *); +void pnfs_commit_set_layoutcommit(struct nfs_commit_data *data); void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); int pnfs_layoutcommit_inode(struct inode *inode, bool sync); int _pnfs_return_layout(struct inode *); int pnfs_commit_and_return_layout(struct inode *); -void pnfs_ld_write_done(struct nfs_pgio_data *); -void pnfs_ld_read_done(struct nfs_pgio_data *); +void pnfs_ld_write_done(struct nfs_pgio_header *); +void pnfs_ld_read_done(struct nfs_pgio_header *); struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, loff_t pos, @@ -228,12 +241,8 @@ struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, gfp_t gfp_flags); void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp); -int pnfs_read_done_resend_to_mds(struct inode *inode, struct list_head *head, - const struct nfs_pgio_completion_ops *compl_ops, - struct nfs_direct_req *dreq); -int pnfs_write_done_resend_to_mds(struct inode *inode, struct list_head *head, - const struct nfs_pgio_completion_ops *compl_ops, - struct nfs_direct_req *dreq); +int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *); +int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *); struct nfs4_threshold *pnfs_mdsthreshold_alloc(void); /* nfs4_deviceid_flags */ @@ -254,11 +263,12 @@ struct nfs4_deviceid_node { atomic_t ref; }; -struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); +struct nfs4_deviceid_node * +nfs4_find_get_deviceid(struct nfs_server *server, + const struct nfs4_deviceid *id, struct rpc_cred *cred, + gfp_t gfp_mask); void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); -void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, - const struct pnfs_layoutdriver_type *, - const struct nfs_client *, +void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, struct nfs_server *, const struct nfs4_deviceid *); struct nfs4_deviceid_node *nfs4_insert_deviceid_node(struct nfs4_deviceid_node *); bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *); @@ -266,6 +276,13 @@ void nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node); bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node); void nfs4_deviceid_purge_client(const struct nfs_client *); +static inline struct nfs4_deviceid_node * +nfs4_get_deviceid(struct nfs4_deviceid_node *d) +{ + atomic_inc(&d->ref); + return d; +} + static inline struct pnfs_layout_segment * pnfs_get_lseg(struct pnfs_layout_segment *lseg) { @@ -345,6 +362,17 @@ pnfs_recover_commit_reqs(struct inode *inode, struct list_head *list, NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo); } +static inline struct nfs_page * +pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo, + struct page *page) +{ + struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; + + if (ld == NULL || ld->search_commit_reqs == NULL) + return NULL; + return 
ld->search_commit_reqs(cinfo, page); +} + /* Should the pNFS client commit and return the layout upon a setattr */ static inline bool pnfs_ld_layoutret_on_setattr(struct inode *inode) @@ -356,6 +384,14 @@ pnfs_ld_layoutret_on_setattr(struct inode *inode) } static inline bool +pnfs_ld_read_whole_page(struct inode *inode) +{ + if (!pnfs_enabled_sb(NFS_SERVER(inode))) + return false; + return NFS_SERVER(inode)->pnfs_curr_ld->flags & PNFS_READ_WHOLE_PAGE; +} + +static inline bool pnfs_layoutcommit_outstanding(struct inode *inode) { struct nfs_inode *nfsi = NFS_I(inode); @@ -410,6 +446,10 @@ static inline void pnfs_put_lseg(struct pnfs_layout_segment *lseg) { } +static inline void pnfs_put_lseg_async(struct pnfs_layout_segment *lseg) +{ +} + static inline int pnfs_return_layout(struct inode *ino) { return 0; @@ -427,6 +467,12 @@ pnfs_ld_layoutret_on_setattr(struct inode *inode) } static inline bool +pnfs_ld_read_whole_page(struct inode *inode) +{ + return false; +} + +static inline bool pnfs_roc(struct inode *ino) { return false; @@ -496,6 +542,13 @@ pnfs_recover_commit_reqs(struct inode *inode, struct list_head *list, { } +static inline struct nfs_page * +pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo, + struct page *page) +{ + return NULL; +} + static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync) { return 0; diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c index 6da209bd9408..aa2ec0015183 100644 --- a/fs/nfs/pnfs_dev.c +++ b/fs/nfs/pnfs_dev.c @@ -29,6 +29,9 @@ */ #include <linux/export.h> +#include <linux/nfs_fs.h> +#include "nfs4session.h" +#include "internal.h" #include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_PNFS @@ -89,6 +92,74 @@ _lookup_deviceid(const struct pnfs_layoutdriver_type *ld, return NULL; } +static struct nfs4_deviceid_node * +nfs4_get_device_info(struct nfs_server *server, + const struct nfs4_deviceid *dev_id, + struct rpc_cred *cred, gfp_t gfp_flags) +{ + struct nfs4_deviceid_node *d = NULL; + struct pnfs_device *pdev = NULL; + struct page **pages = NULL; + u32 max_resp_sz; + int max_pages; + int rc, i; + + /* + * Use the session max response size as the basis for setting + * GETDEVICEINFO's maxcount + */ + max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; + if (server->pnfs_curr_ld->max_deviceinfo_size && + server->pnfs_curr_ld->max_deviceinfo_size < max_resp_sz) + max_resp_sz = server->pnfs_curr_ld->max_deviceinfo_size; + max_pages = nfs_page_array_len(0, max_resp_sz); + dprintk("%s: server %p max_resp_sz %u max_pages %d\n", + __func__, server, max_resp_sz, max_pages); + + pdev = kzalloc(sizeof(*pdev), gfp_flags); + if (!pdev) + return NULL; + + pages = kcalloc(max_pages, sizeof(struct page *), gfp_flags); + if (!pages) + goto out_free_pdev; + + for (i = 0; i < max_pages; i++) { + pages[i] = alloc_page(gfp_flags); + if (!pages[i]) + goto out_free_pages; + } + + memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id)); + pdev->layout_type = server->pnfs_curr_ld->id; + pdev->pages = pages; + pdev->pgbase = 0; + pdev->pglen = max_resp_sz; + pdev->mincount = 0; + pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead; + + rc = nfs4_proc_getdeviceinfo(server, pdev, cred); + dprintk("%s getdevice info returns %d\n", __func__, rc); + if (rc) + goto out_free_pages; + + /* + * Found new device, need to decode it and then add it to the + * list of known devices for this mountpoint. 
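nfs4_get_device_info() above allocates a full page array before issuing GETDEVICEINFO. The usual shape of such an allocate-everything-or-unwind helper, sketched in plain C (malloc stands in for alloc_page(); all names are hypothetical):

#include <stdlib.h>

static void **alloc_page_array(unsigned int n)
{
        void **pages = calloc(n, sizeof(*pages));
        unsigned int i;

        if (!pages)
                return NULL;
        for (i = 0; i < n; i++) {
                pages[i] = malloc(4096);        /* alloc_page() stand-in */
                if (!pages[i])
                        goto unwind;
        }
        return pages;
unwind:
        while (i--)                             /* free only filled slots */
                free(pages[i]);
        free(pages);
        return NULL;
}

A helper like this frees only the slots it actually filled, since the kernel's __free_page(), unlike free(3), cannot be handed a NULL page.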
+ */ + d = server->pnfs_curr_ld->alloc_deviceid_node(server, pdev, + gfp_flags); + +out_free_pages: + for (i = 0; i < max_pages; i++) + __free_page(pages[i]); + kfree(pages); +out_free_pdev: + kfree(pdev); + dprintk("<-- %s d %p\n", __func__, d); + return d; +} + /* * Lookup a deviceid in cache and get a reference count on it if found * @@ -96,14 +167,14 @@ _lookup_deviceid(const struct pnfs_layoutdriver_type *ld, * @id deviceid to look up */ static struct nfs4_deviceid_node * -_find_get_deviceid(const struct pnfs_layoutdriver_type *ld, - const struct nfs_client *clp, const struct nfs4_deviceid *id, - long hash) +__nfs4_find_get_deviceid(struct nfs_server *server, + const struct nfs4_deviceid *id, long hash) { struct nfs4_deviceid_node *d; rcu_read_lock(); - d = _lookup_deviceid(ld, clp, id, hash); + d = _lookup_deviceid(server->pnfs_curr_ld, server->nfs_client, id, + hash); if (d != NULL) atomic_inc(&d->ref); rcu_read_unlock(); @@ -111,10 +182,33 @@ _find_get_deviceid(const struct pnfs_layoutdriver_type *ld, } struct nfs4_deviceid_node * -nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *ld, - const struct nfs_client *clp, const struct nfs4_deviceid *id) +nfs4_find_get_deviceid(struct nfs_server *server, + const struct nfs4_deviceid *id, struct rpc_cred *cred, + gfp_t gfp_mask) { - return _find_get_deviceid(ld, clp, id, nfs4_deviceid_hash(id)); + long hash = nfs4_deviceid_hash(id); + struct nfs4_deviceid_node *d, *new; + + d = __nfs4_find_get_deviceid(server, id, hash); + if (d) + return d; + + new = nfs4_get_device_info(server, id, cred, gfp_mask); + if (!new) + return new; + + spin_lock(&nfs4_deviceid_lock); + d = __nfs4_find_get_deviceid(server, id, hash); + if (d) { + spin_unlock(&nfs4_deviceid_lock); + server->pnfs_curr_ld->free_deviceid_node(new); + return d; + } + hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]); + atomic_inc(&new->ref); + spin_unlock(&nfs4_deviceid_lock); + + return new; } EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid); @@ -151,15 +245,13 @@ nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld, EXPORT_SYMBOL_GPL(nfs4_delete_deviceid); void -nfs4_init_deviceid_node(struct nfs4_deviceid_node *d, - const struct pnfs_layoutdriver_type *ld, - const struct nfs_client *nfs_client, +nfs4_init_deviceid_node(struct nfs4_deviceid_node *d, struct nfs_server *server, const struct nfs4_deviceid *id) { INIT_HLIST_NODE(&d->node); INIT_HLIST_NODE(&d->tmpnode); - d->ld = ld; - d->nfs_client = nfs_client; + d->ld = server->pnfs_curr_ld; + d->nfs_client = server->nfs_client; d->flags = 0; d->deviceid = *id; atomic_set(&d->ref, 1); @@ -167,39 +259,6 @@ nfs4_init_deviceid_node(struct nfs4_deviceid_node *d, EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node); /* - * Uniquely initialize and insert a deviceid node into cache - * - * @new new deviceid node - * Note that the caller must set up the following members: - * new->ld - * new->nfs_client - * new->deviceid - * - * @ret the inserted node, if none found, otherwise, the found entry. 
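The reworked nfs4_find_get_deviceid() that follows uses the classic lookup, allocate-outside-the-lock, recheck-under-the-lock pattern: the loser of a race frees its duplicate and takes the winner's node instead. A compilable single-list model of the same flow (all names illustrative; the kernel side uses an RCU hash list under nfs4_deviceid_lock):

#include <pthread.h>
#include <stdlib.h>

struct entry { struct entry *next; int id; int ref; };

static struct entry *cache;
static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;

static struct entry *lookup_get(int id)
{
        for (struct entry *e = cache; e; e = e->next)
                if (e->id == id) { e->ref++; return e; }
        return NULL;
}

struct entry *find_or_create(int id)
{
        struct entry *e, *fresh;

        pthread_mutex_lock(&cache_lock);        /* kernel uses RCU here */
        e = lookup_get(id);
        pthread_mutex_unlock(&cache_lock);
        if (e)
                return e;

        fresh = calloc(1, sizeof(*fresh));      /* slow path: the RPC */
        if (!fresh)
                return NULL;
        fresh->id = id;
        fresh->ref = 1;                         /* caller's reference */

        pthread_mutex_lock(&cache_lock);
        e = lookup_get(id);                     /* recheck: lost the race? */
        if (e) {
                pthread_mutex_unlock(&cache_lock);
                free(fresh);                    /* drop our duplicate */
                return e;
        }
        fresh->next = cache;
        cache = fresh;
        fresh->ref++;                           /* cache's own reference */
        pthread_mutex_unlock(&cache_lock);
        return fresh;
}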
- */ -struct nfs4_deviceid_node * -nfs4_insert_deviceid_node(struct nfs4_deviceid_node *new) -{ - struct nfs4_deviceid_node *d; - long hash; - - spin_lock(&nfs4_deviceid_lock); - hash = nfs4_deviceid_hash(&new->deviceid); - d = _find_get_deviceid(new->ld, new->nfs_client, &new->deviceid, hash); - if (d) { - spin_unlock(&nfs4_deviceid_lock); - return d; - } - - hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]); - spin_unlock(&nfs4_deviceid_lock); - atomic_inc(&new->ref); - - return new; -} -EXPORT_SYMBOL_GPL(nfs4_insert_deviceid_node); - -/* * Dereference a deviceid node and delete it when its reference count drops * to zero. * @@ -299,4 +358,3 @@ nfs4_deviceid_mark_client_invalid(struct nfs_client *clp) } rcu_read_unlock(); } - diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index c171ce1a8a30..b09cc23d6f43 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -578,46 +578,49 @@ nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, return 0; } -static int nfs_read_done(struct rpc_task *task, struct nfs_pgio_data *data) +static int nfs_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr) { - struct inode *inode = data->header->inode; + struct inode *inode = hdr->inode; nfs_invalidate_atime(inode); if (task->tk_status >= 0) { - nfs_refresh_inode(inode, data->res.fattr); + nfs_refresh_inode(inode, hdr->res.fattr); /* Emulate the eof flag, which isn't normally needed in NFSv2 * as it is guaranteed to always return the file attributes */ - if (data->args.offset + data->res.count >= data->res.fattr->size) - data->res.eof = 1; + if (hdr->args.offset + hdr->res.count >= hdr->res.fattr->size) + hdr->res.eof = 1; } return 0; } -static void nfs_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg) +static void nfs_proc_read_setup(struct nfs_pgio_header *hdr, + struct rpc_message *msg) { msg->rpc_proc = &nfs_procedures[NFSPROC_READ]; } -static int nfs_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data) +static int nfs_proc_pgio_rpc_prepare(struct rpc_task *task, + struct nfs_pgio_header *hdr) { rpc_call_start(task); return 0; } -static int nfs_write_done(struct rpc_task *task, struct nfs_pgio_data *data) +static int nfs_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr) { - struct inode *inode = data->header->inode; + struct inode *inode = hdr->inode; if (task->tk_status >= 0) - nfs_post_op_update_inode_force_wcc(inode, data->res.fattr); + nfs_post_op_update_inode_force_wcc(inode, hdr->res.fattr); return 0; } -static void nfs_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg) +static void nfs_proc_write_setup(struct nfs_pgio_header *hdr, + struct rpc_message *msg) { /* Note: NFSv2 ignores @stable and always uses NFS_FILE_SYNC */ - data->args.stable = NFS_FILE_SYNC; + hdr->args.stable = NFS_FILE_SYNC; msg->rpc_proc = &nfs_procedures[NFSPROC_WRITE]; } diff --git a/fs/nfs/read.c b/fs/nfs/read.c index e818a475ca64..beff2769c5c5 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -33,12 +33,12 @@ static const struct nfs_rw_ops nfs_rw_read_ops; static struct kmem_cache *nfs_rdata_cachep; -static struct nfs_rw_header *nfs_readhdr_alloc(void) +static struct nfs_pgio_header *nfs_readhdr_alloc(void) { return kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL); } -static void nfs_readhdr_free(struct nfs_rw_header *rhdr) +static void nfs_readhdr_free(struct nfs_pgio_header *rhdr) { kmem_cache_free(nfs_rdata_cachep, rhdr); } @@ -115,12 +115,6 @@ static void nfs_readpage_release(struct nfs_page *req) unlock_page(req->wb_page); } - 
- dprintk("NFS: read done (%s/%Lu %d@%Ld)\n", - req->wb_context->dentry->d_inode->i_sb->s_id, - (unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode), - req->wb_bytes, - (long long)req_offset(req)); nfs_release_request(req); } @@ -172,14 +166,15 @@ out: hdr->release(hdr); } -static void nfs_initiate_read(struct nfs_pgio_data *data, struct rpc_message *msg, +static void nfs_initiate_read(struct nfs_pgio_header *hdr, + struct rpc_message *msg, struct rpc_task_setup *task_setup_data, int how) { - struct inode *inode = data->header->inode; + struct inode *inode = hdr->inode; int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0; task_setup_data->flags |= swap_flags; - NFS_PROTO(inode)->read_setup(data, msg); + NFS_PROTO(inode)->read_setup(hdr, msg); } static void @@ -203,14 +198,15 @@ static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops = { * This is the callback from RPC telling us whether a reply was * received or some error occurred (timeout or socket shutdown). */ -static int nfs_readpage_done(struct rpc_task *task, struct nfs_pgio_data *data, +static int nfs_readpage_done(struct rpc_task *task, + struct nfs_pgio_header *hdr, struct inode *inode) { - int status = NFS_PROTO(inode)->read_done(task, data); + int status = NFS_PROTO(inode)->read_done(task, hdr); if (status != 0) return status; - nfs_add_stats(inode, NFSIOS_SERVERREADBYTES, data->res.count); + nfs_add_stats(inode, NFSIOS_SERVERREADBYTES, hdr->res.count); if (task->tk_status == -ESTALE) { set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); @@ -219,34 +215,34 @@ static int nfs_readpage_done(struct rpc_task *task, struct nfs_pgio_data *data, return 0; } -static void nfs_readpage_retry(struct rpc_task *task, struct nfs_pgio_data *data) +static void nfs_readpage_retry(struct rpc_task *task, + struct nfs_pgio_header *hdr) { - struct nfs_pgio_args *argp = &data->args; - struct nfs_pgio_res *resp = &data->res; + struct nfs_pgio_args *argp = &hdr->args; + struct nfs_pgio_res *resp = &hdr->res; /* This is a short read! */ - nfs_inc_stats(data->header->inode, NFSIOS_SHORTREAD); + nfs_inc_stats(hdr->inode, NFSIOS_SHORTREAD); /* Has the server at least made some progress? 
*/ if (resp->count == 0) { - nfs_set_pgio_error(data->header, -EIO, argp->offset); + nfs_set_pgio_error(hdr, -EIO, argp->offset); return; } - /* Yes, so retry the read at the end of the data */ - data->mds_offset += resp->count; + /* Yes, so retry the read at the end of the hdr */ + hdr->mds_offset += resp->count; argp->offset += resp->count; argp->pgbase += resp->count; argp->count -= resp->count; rpc_restart_call_prepare(task); } -static void nfs_readpage_result(struct rpc_task *task, struct nfs_pgio_data *data) +static void nfs_readpage_result(struct rpc_task *task, + struct nfs_pgio_header *hdr) { - struct nfs_pgio_header *hdr = data->header; - - if (data->res.eof) { + if (hdr->res.eof) { loff_t bound; - bound = data->args.offset + data->res.count; + bound = hdr->args.offset + hdr->res.count; spin_lock(&hdr->lock); if (bound < hdr->io_start + hdr->good_bytes) { set_bit(NFS_IOHDR_EOF, &hdr->flags); @@ -254,8 +250,8 @@ static void nfs_readpage_result(struct rpc_task *task, struct nfs_pgio_data *dat hdr->good_bytes = bound - hdr->io_start; } spin_unlock(&hdr->lock); - } else if (data->res.count != data->args.count) - nfs_readpage_retry(task, data); + } else if (hdr->res.count != hdr->args.count) + nfs_readpage_retry(task, hdr); } /* @@ -404,7 +400,7 @@ out: int __init nfs_init_readpagecache(void) { nfs_rdata_cachep = kmem_cache_create("nfs_read_data", - sizeof(struct nfs_rw_header), + sizeof(struct nfs_pgio_header), 0, SLAB_HWCACHE_ALIGN, NULL); if (nfs_rdata_cachep == NULL) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 084af1060d79..31a11b0e885d 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1027,8 +1027,7 @@ static bool nfs_auth_info_add(struct nfs_auth_info *auth_info, rpc_authflavor_t flavor) { unsigned int i; - unsigned int max_flavor_len = (sizeof(auth_info->flavors) / - sizeof(auth_info->flavors[0])); + unsigned int max_flavor_len = ARRAY_SIZE(auth_info->flavors); /* make sure this flavor isn't already in the list */ for (i = 0; i < auth_info->flavor_len; i++) { @@ -2066,11 +2065,6 @@ static int nfs23_validate_mount_data(void *options, return NFS_TEXT_DATA; } -#if !IS_ENABLED(CONFIG_NFS_V3) - if (args->version == 3) - goto out_v3_not_compiled; -#endif /* !CONFIG_NFS_V3 */ - return 0; out_no_data: @@ -2086,12 +2080,6 @@ out_no_sec: dfprintk(MOUNT, "NFS: nfs_mount_data version supports only AUTH_SYS\n"); return -EINVAL; -#if !IS_ENABLED(CONFIG_NFS_V3) -out_v3_not_compiled: - dfprintk(MOUNT, "NFS: NFSv3 is not compiled into kernel\n"); - return -EPROTONOSUPPORT; -#endif /* !CONFIG_NFS_V3 */ - out_nomem: dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n"); return -ENOMEM; @@ -2180,7 +2168,7 @@ out_no_address: return -EINVAL; } -#define NFS_MOUNT_CMP_FLAGMASK ~(NFS_MOUNT_INTR \ +#define NFS_REMOUNT_CMP_FLAGMASK ~(NFS_MOUNT_INTR \ | NFS_MOUNT_SECURE \ | NFS_MOUNT_TCP \ | NFS_MOUNT_VER3 \ @@ -2188,15 +2176,16 @@ out_no_address: | NFS_MOUNT_NONLM \ | NFS_MOUNT_BROKEN_SUID \ | NFS_MOUNT_STRICTLOCK \ - | NFS_MOUNT_UNSHARED \ - | NFS_MOUNT_NORESVPORT \ | NFS_MOUNT_LEGACY_INTERFACE) +#define NFS_MOUNT_CMP_FLAGMASK (NFS_REMOUNT_CMP_FLAGMASK & \ + ~(NFS_MOUNT_UNSHARED | NFS_MOUNT_NORESVPORT)) + static int nfs_compare_remount_data(struct nfs_server *nfss, struct nfs_parsed_mount_data *data) { - if ((data->flags ^ nfss->flags) & NFS_MOUNT_CMP_FLAGMASK || + if ((data->flags ^ nfss->flags) & NFS_REMOUNT_CMP_FLAGMASK || data->rsize != nfss->rsize || data->wsize != nfss->wsize || data->version != nfss->nfs_client->rpc_ops->version || diff --git a/fs/nfs/write.c 
b/fs/nfs/write.c index 5e2f10304548..12493846a2d3 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -47,6 +47,11 @@ static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops; static const struct nfs_commit_completion_ops nfs_commit_completion_ops; static const struct nfs_rw_ops nfs_rw_write_ops; static void nfs_clear_request_commit(struct nfs_page *req); +static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo, + struct inode *inode); +static struct nfs_page * +nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi, + struct page *page); static struct kmem_cache *nfs_wdata_cachep; static mempool_t *nfs_wdata_mempool; @@ -71,18 +76,18 @@ void nfs_commit_free(struct nfs_commit_data *p) } EXPORT_SYMBOL_GPL(nfs_commit_free); -static struct nfs_rw_header *nfs_writehdr_alloc(void) +static struct nfs_pgio_header *nfs_writehdr_alloc(void) { - struct nfs_rw_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO); + struct nfs_pgio_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO); if (p) memset(p, 0, sizeof(*p)); return p; } -static void nfs_writehdr_free(struct nfs_rw_header *whdr) +static void nfs_writehdr_free(struct nfs_pgio_header *hdr) { - mempool_free(whdr, nfs_wdata_mempool); + mempool_free(hdr, nfs_wdata_mempool); } static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error) @@ -106,21 +111,12 @@ nfs_page_find_head_request_locked(struct nfs_inode *nfsi, struct page *page) if (PagePrivate(page)) req = (struct nfs_page *)page_private(page); - else if (unlikely(PageSwapCache(page))) { - struct nfs_page *freq, *t; - - /* Linearly search the commit list for the correct req */ - list_for_each_entry_safe(freq, t, &nfsi->commit_info.list, wb_list) { - if (freq->wb_page == page) { - req = freq->wb_head; - break; - } - } - } + else if (unlikely(PageSwapCache(page))) + req = nfs_page_search_commits_for_head_request_locked(nfsi, + page); if (req) { WARN_ON_ONCE(req->wb_head != req); - kref_get(&req->wb_kref); } @@ -216,7 +212,7 @@ static bool nfs_page_group_covers_page(struct nfs_page *req) unsigned int pos = 0; unsigned int len = nfs_page_length(req->wb_page); - nfs_page_group_lock(req); + nfs_page_group_lock(req, false); do { tmp = nfs_page_group_search_locked(req->wb_head, pos); @@ -246,11 +242,14 @@ static void nfs_mark_uptodate(struct nfs_page *req) static int wb_priority(struct writeback_control *wbc) { + int ret = 0; if (wbc->for_reclaim) return FLUSH_HIGHPRI | FLUSH_STABLE; + if (wbc->sync_mode == WB_SYNC_ALL) + ret = FLUSH_COND_STABLE; if (wbc->for_kupdate || wbc->for_background) - return FLUSH_LOWPRI | FLUSH_COND_STABLE; - return FLUSH_COND_STABLE; + ret |= FLUSH_LOWPRI; + return ret; } /* @@ -379,8 +378,6 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list, subreq->wb_head = subreq; subreq->wb_this_page = subreq; - nfs_clear_request_commit(subreq); - /* subreq is now totally disconnected from page group or any * write / commit lists. 
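The wb_priority() rework above composes flags instead of returning fixed combinations: reclaim keeps highest priority with stable writes, WB_SYNC_ALL now maps to a conditional-stable flush, and kupdate/background writeback only lowers the priority. A compilable restatement of the decision (flag values are placeholders, not the kernel's; only the composition matters):

#define FLUSH_LOWPRI       0x1
#define FLUSH_HIGHPRI      0x2
#define FLUSH_STABLE       0x4
#define FLUSH_COND_STABLE  0x8

static int wb_priority_model(int for_reclaim, int sync_all, int background)
{
        int ret = 0;

        if (for_reclaim)
                return FLUSH_HIGHPRI | FLUSH_STABLE;
        if (sync_all)
                ret = FLUSH_COND_STABLE;
        if (background)
                ret |= FLUSH_LOWPRI;
        return ret;
}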
Last chance to wake any waiters */ nfs_unlock_request(subreq); @@ -455,8 +452,23 @@ try_again: return NULL; } + /* holding inode lock, so always make a non-blocking call to try the + * page group lock */ + ret = nfs_page_group_lock(head, true); + if (ret < 0) { + spin_unlock(&inode->i_lock); + + if (!nonblock && ret == -EAGAIN) { + nfs_page_group_lock_wait(head); + nfs_release_request(head); + goto try_again; + } + + nfs_release_request(head); + return ERR_PTR(ret); + } + /* lock each request in the page group */ - nfs_page_group_lock(head); subreq = head; do { /* @@ -488,7 +500,7 @@ try_again: * Commit list removal accounting is done after locks are dropped */ subreq = head; do { - nfs_list_remove_request(subreq); + nfs_clear_request_commit(subreq); subreq = subreq->wb_this_page; } while (subreq != head); @@ -518,15 +530,11 @@ try_again: nfs_page_group_unlock(head); - /* drop lock to clear_request_commit the head req and clean up - * requests on destroy list */ + /* drop lock to clean up requests on destroy list */ spin_unlock(&inode->i_lock); nfs_destroy_unlinked_subrequests(destroy_list, head); - /* clean up commit list state */ - nfs_clear_request_commit(head); - /* still holds ref on head from nfs_page_find_head_request_locked * and still has lock on head from lock loop */ return head; @@ -623,7 +631,7 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) int err; /* Stop dirtying of new pages while we sync */ - err = wait_on_bit_lock(bitlock, NFS_INO_FLUSHING, + err = wait_on_bit_lock_action(bitlock, NFS_INO_FLUSHING, nfs_wait_bit_killable, TASK_KILLABLE); if (err) goto out_err; @@ -697,6 +705,8 @@ static void nfs_inode_remove_request(struct nfs_page *req) if (likely(!PageSwapCache(head->wb_page))) { set_page_private(head->wb_page, 0); ClearPagePrivate(head->wb_page); + smp_mb__after_atomic(); + wake_up_page(head->wb_page, PG_private); clear_bit(PG_MAPPED, &head->wb_flags); } nfsi->npages--; @@ -705,6 +715,8 @@ static void nfs_inode_remove_request(struct nfs_page *req) if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) nfs_release_request(req); + else + WARN_ON_ONCE(1); } static void @@ -713,7 +725,38 @@ nfs_mark_request_dirty(struct nfs_page *req) { __set_page_dirty_nobuffers(req->wb_page); } -#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) +/* + * nfs_page_search_commits_for_head_request_locked + * + * Search through commit lists on @inode for the head request for @page. + * Must be called while holding the inode lock (which is also the cinfo lock). + * + * Returns the head request if found, or NULL if not found. 
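The two lines added to nfs_inode_remove_request() above pair ClearPagePrivate() with a barrier and an explicit wake-up, the standard clear-then-wake ordering: the bit must be globally visible as clear before any sleeper is woken, or a waiter could recheck, still observe the old value, and sleep forever. A userspace analogue using C11 atomics (wake_waiters() is a hypothetical stand-in for wake_up_page()):

#include <stdatomic.h>

static atomic_int pg_private = 1;

static void wake_waiters(void) { /* e.g. a futex wake in real code */ }

static void remove_request(void)
{
        /* the seq_cst store supplies the smp_mb__after_atomic() ordering */
        atomic_store(&pg_private, 0);
        wake_waiters();
}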
+ */ +static struct nfs_page * +nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi, + struct page *page) +{ + struct nfs_page *freq, *t; + struct nfs_commit_info cinfo; + struct inode *inode = &nfsi->vfs_inode; + + nfs_init_cinfo_from_inode(&cinfo, inode); + + /* search through pnfs commit lists */ + freq = pnfs_search_commit_reqs(inode, &cinfo, page); + if (freq) + return freq->wb_head; + + /* Linearly search the commit list for the correct request */ + list_for_each_entry_safe(freq, t, &cinfo.mds->list, wb_list) { + if (freq->wb_page == page) + return freq->wb_head; + } + + return NULL; +} + /** * nfs_request_add_commit_list - add request to a commit list * @req: pointer to a struct nfs_page @@ -808,6 +851,7 @@ nfs_clear_page_commit(struct page *page) dec_bdi_stat(page_file_mapping(page)->backing_dev_info, BDI_RECLAIMABLE); } +/* Called holding inode (/cinfo) lock */ static void nfs_clear_request_commit(struct nfs_page *req) { @@ -817,53 +861,19 @@ nfs_clear_request_commit(struct nfs_page *req) nfs_init_cinfo_from_inode(&cinfo, inode); if (!pnfs_clear_request_commit(req, &cinfo)) { - spin_lock(cinfo.lock); nfs_request_remove_commit_list(req, &cinfo); - spin_unlock(cinfo.lock); } nfs_clear_page_commit(req->wb_page); } } -static inline -int nfs_write_need_commit(struct nfs_pgio_data *data) +int nfs_write_need_commit(struct nfs_pgio_header *hdr) { - if (data->verf.committed == NFS_DATA_SYNC) - return data->header->lseg == NULL; - return data->verf.committed != NFS_FILE_SYNC; + if (hdr->verf.committed == NFS_DATA_SYNC) + return hdr->lseg == NULL; + return hdr->verf.committed != NFS_FILE_SYNC; } -#else -static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo, - struct inode *inode) -{ -} - -void nfs_init_cinfo(struct nfs_commit_info *cinfo, - struct inode *inode, - struct nfs_direct_req *dreq) -{ -} - -void -nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, - struct nfs_commit_info *cinfo) -{ -} - -static void -nfs_clear_request_commit(struct nfs_page *req) -{ -} - -static inline -int nfs_write_need_commit(struct nfs_pgio_data *data) -{ - return 0; -} - -#endif - static void nfs_write_completion(struct nfs_pgio_header *hdr) { struct nfs_commit_info cinfo; @@ -883,11 +893,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr) nfs_context_set_write_error(req->wb_context, hdr->error); goto remove_req; } - if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags)) { - nfs_mark_request_dirty(req); - goto next; - } - if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) { + if (nfs_write_need_commit(hdr)) { memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf)); nfs_mark_request_commit(req, hdr->lseg, &cinfo); goto next; @@ -903,7 +909,6 @@ out: hdr->release(hdr); } -#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo) { @@ -960,19 +965,6 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, return ret; } -#else -unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo) -{ - return 0; -} - -int nfs_scan_commit(struct inode *inode, struct list_head *dst, - struct nfs_commit_info *cinfo) -{ - return 0; -} -#endif - /* * Search for an existing write request, and attempt to update * it to reflect a new dirty region on a given page. 
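nfs_page_search_commits_for_head_request_locked() above, together with the pnfs_search_commit_reqs() inline added in pnfs.h, follows the optional-op convention used throughout this series: a driver that does not implement the hook makes the wrapper return NULL, and the caller falls back to scanning the generic MDS commit list. A compilable miniature of that dispatch (all types and names illustrative):

#include <stddef.h>

struct page;
struct req { struct req *next, *head; struct page *page; };

struct layout_driver {
        struct req *(*search_commit_reqs)(struct req *list, struct page *p);
};

static struct req *scan_list(struct req *list, struct page *p)
{
        for (struct req *r = list; r; r = r->next)
                if (r->page == p)
                        return r->head; /* sub-requests resolve to the head */
        return NULL;
}

static struct req *search_commits(const struct layout_driver *ld,
                                  struct req *pnfs_list,
                                  struct req *mds_list, struct page *p)
{
        struct req *r = NULL;

        if (ld && ld->search_commit_reqs)       /* optional pNFS hook */
                r = ld->search_commit_reqs(pnfs_list, p);
        if (!r)
                r = scan_list(mds_list, p);     /* generic fallback */
        return r;
}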
@@ -1038,9 +1030,9 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode, else req->wb_bytes = rqend - req->wb_offset; out_unlock: - spin_unlock(&inode->i_lock); if (req) nfs_clear_request_commit(req); + spin_unlock(&inode->i_lock); return req; out_flushme: spin_unlock(&inode->i_lock); @@ -1241,17 +1233,18 @@ static int flush_task_priority(int how) return RPC_PRIORITY_NORMAL; } -static void nfs_initiate_write(struct nfs_pgio_data *data, struct rpc_message *msg, +static void nfs_initiate_write(struct nfs_pgio_header *hdr, + struct rpc_message *msg, struct rpc_task_setup *task_setup_data, int how) { - struct inode *inode = data->header->inode; + struct inode *inode = hdr->inode; int priority = flush_task_priority(how); task_setup_data->priority = priority; - NFS_PROTO(inode)->write_setup(data, msg); + NFS_PROTO(inode)->write_setup(hdr, msg); nfs4_state_protect_write(NFS_SERVER(inode)->nfs_client, - &task_setup_data->rpc_client, msg, data); + &task_setup_data->rpc_client, msg, hdr); } /* If a nfs_flush_* function fails, it should remove reqs from @head and @@ -1313,21 +1306,9 @@ void nfs_commit_prepare(struct rpc_task *task, void *calldata) NFS_PROTO(data->inode)->commit_rpc_prepare(task, data); } -static void nfs_writeback_release_common(struct nfs_pgio_data *data) +static void nfs_writeback_release_common(struct nfs_pgio_header *hdr) { - struct nfs_pgio_header *hdr = data->header; - int status = data->task.tk_status; - - if ((status >= 0) && nfs_write_need_commit(data)) { - spin_lock(&hdr->lock); - if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags)) - ; /* Do nothing */ - else if (!test_and_set_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) - memcpy(&hdr->verf, &data->verf, sizeof(hdr->verf)); - else if (memcmp(&hdr->verf, &data->verf, sizeof(hdr->verf))) - set_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags); - spin_unlock(&hdr->lock); - } + /* do nothing! */ } /* @@ -1358,7 +1339,8 @@ static int nfs_should_remove_suid(const struct inode *inode) /* * This function is called when the WRITE call is complete. */ -static int nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data, +static int nfs_writeback_done(struct rpc_task *task, + struct nfs_pgio_header *hdr, struct inode *inode) { int status; @@ -1370,13 +1352,13 @@ static int nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data, * another writer had changed the file, but some applications * depend on tighter cache coherency when writing. */ - status = NFS_PROTO(inode)->write_done(task, data); + status = NFS_PROTO(inode)->write_done(task, hdr); if (status != 0) return status; - nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, data->res.count); + nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, hdr->res.count); -#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) - if (data->res.verf->committed < data->args.stable && task->tk_status >= 0) { + if (hdr->res.verf->committed < hdr->args.stable && + task->tk_status >= 0) { /* We tried a write call, but the server did not * commit data to stable storage even though we * requested it. 
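The check retained above, hdr->res.verf->committed < hdr->args.stable, works because the protocol's stability levels are numerically ordered. A self-contained illustration (enum values follow the NFSv3/NFSv4 wire ordering):

enum stable_how { NFS_UNSTABLE = 0, NFS_DATA_SYNC = 1, NFS_FILE_SYNC = 2 };

/* True when the server answered with weaker stability than requested,
 * which forces a resend with NFS_FILE_SYNC, as the hunk above does. */
static int committed_too_weak(enum stable_how requested, enum stable_how got)
{
        return got < requested;
}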
@@ -1392,11 +1374,10 @@ static int nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data, dprintk("NFS: faulty NFS server %s:" " (committed = %d) != (stable = %d)\n", NFS_SERVER(inode)->nfs_client->cl_hostname, - data->res.verf->committed, data->args.stable); + hdr->res.verf->committed, hdr->args.stable); complain = jiffies + 300 * HZ; } } -#endif /* Deal with the suid/sgid bit corner case */ if (nfs_should_remove_suid(inode)) @@ -1407,16 +1388,17 @@ static int nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data, /* * This function is called when the WRITE call is complete. */ -static void nfs_writeback_result(struct rpc_task *task, struct nfs_pgio_data *data) +static void nfs_writeback_result(struct rpc_task *task, + struct nfs_pgio_header *hdr) { - struct nfs_pgio_args *argp = &data->args; - struct nfs_pgio_res *resp = &data->res; + struct nfs_pgio_args *argp = &hdr->args; + struct nfs_pgio_res *resp = &hdr->res; if (resp->count < argp->count) { static unsigned long complain; /* This a short write! */ - nfs_inc_stats(data->header->inode, NFSIOS_SHORTWRITE); + nfs_inc_stats(hdr->inode, NFSIOS_SHORTWRITE); /* Has the server at least made some progress? */ if (resp->count == 0) { @@ -1426,14 +1408,14 @@ static void nfs_writeback_result(struct rpc_task *task, struct nfs_pgio_data *da argp->count); complain = jiffies + 300 * HZ; } - nfs_set_pgio_error(data->header, -EIO, argp->offset); + nfs_set_pgio_error(hdr, -EIO, argp->offset); task->tk_status = -EIO; return; } /* Was this an NFSv2 write or an NFSv3 stable write? */ if (resp->verf->committed != NFS_UNSTABLE) { /* Resend from where the server left off */ - data->mds_offset += resp->count; + hdr->mds_offset += resp->count; argp->offset += resp->count; argp->pgbase += resp->count; argp->count -= resp->count; @@ -1448,7 +1430,6 @@ static void nfs_writeback_result(struct rpc_task *task, struct nfs_pgio_data *da } -#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait) { int ret; @@ -1517,6 +1498,18 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data, } EXPORT_SYMBOL_GPL(nfs_initiate_commit); +static loff_t nfs_get_lwb(struct list_head *head) +{ + loff_t lwb = 0; + struct nfs_page *req; + + list_for_each_entry(req, head, wb_list) + if (lwb < (req_offset(req) + req->wb_bytes)) + lwb = req_offset(req) + req->wb_bytes; + + return lwb; +} + /* * Set up the argument/result storage required for the RPC call. 
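nfs_get_lwb() just above derives the commit's last write byte from the queued requests: each request contributes offset plus length, and the maximum wins. Isolated in plain C (the struct is an illustrative stand-in for nfs_page):

struct wreq { unsigned long long offset; unsigned int bytes; };

static unsigned long long get_lwb(const struct wreq *reqs, unsigned int n)
{
        unsigned long long lwb = 0;

        for (unsigned int i = 0; i < n; i++)
                if (lwb < reqs[i].offset + reqs[i].bytes)
                        lwb = reqs[i].offset + reqs[i].bytes;
        return lwb;     /* feeds data->lwb for the pnfs layoutcommit */
}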
*/ @@ -1536,6 +1529,9 @@ void nfs_init_commit(struct nfs_commit_data *data, data->inode = inode; data->cred = first->wb_context->cred; data->lseg = lseg; /* reference transferred */ + /* only set lwb for pnfs commit */ + if (lseg) + data->lwb = nfs_get_lwb(&data->pages); data->mds_ops = &nfs_commit_ops; data->completion_ops = cinfo->completion_ops; data->dreq = cinfo->dreq; @@ -1615,6 +1611,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) struct nfs_page *req; int status = data->task.tk_status; struct nfs_commit_info cinfo; + struct nfs_server *nfss; while (!list_empty(&data->pages)) { req = nfs_list_entry(data->pages.next); @@ -1648,6 +1645,10 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) next: nfs_unlock_and_release_request(req); } + nfss = NFS_SERVER(data->inode); + if (atomic_long_read(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) + clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); + nfs_init_cinfo(&cinfo, data->inode, data->dreq); if (atomic_dec_and_test(&cinfo.mds->rpcs_out)) nfs_commit_clear_lock(NFS_I(data->inode)); @@ -1703,7 +1704,7 @@ int nfs_commit_inode(struct inode *inode, int how) return error; if (!may_wait) goto out_mark_dirty; - error = wait_on_bit(&NFS_I(inode)->flags, + error = wait_on_bit_action(&NFS_I(inode)->flags, NFS_INO_COMMIT, nfs_wait_bit_killable, TASK_KILLABLE); @@ -1757,12 +1758,6 @@ out_mark_dirty: __mark_inode_dirty(inode, I_DIRTY_DATASYNC); return ret; } -#else -static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc) -{ - return 0; -} -#endif int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) { @@ -1884,7 +1879,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage, int __init nfs_init_writepagecache(void) { nfs_wdata_cachep = kmem_cache_create("nfs_write_data", - sizeof(struct nfs_rw_header), + sizeof(struct nfs_pgio_header), 0, SLAB_HWCACHE_ALIGN, NULL); if (nfs_wdata_cachep == NULL) diff --git a/fs/nfs_common/Makefile b/fs/nfs_common/Makefile index f689ed82af3a..d153ca3ea577 100644 --- a/fs/nfs_common/Makefile +++ b/fs/nfs_common/Makefile @@ -3,5 +3,6 @@ # obj-$(CONFIG_NFS_ACL_SUPPORT) += nfs_acl.o - nfs_acl-objs := nfsacl.o + +obj-$(CONFIG_GRACE_PERIOD) += grace.o diff --git a/fs/lockd/grace.c b/fs/nfs_common/grace.c index 6d1ee7204c88..ae6e58ea4de5 100644 --- a/fs/lockd/grace.c +++ b/fs/nfs_common/grace.c @@ -1,17 +1,20 @@ /* * Common code for control of lockd and nfsv4 grace periods. + * + * Transplanted from lockd code */ #include <linux/module.h> -#include <linux/lockd/bind.h> #include <net/net_namespace.h> +#include <net/netns/generic.h> +#include <linux/fs.h> -#include "netns.h" - +static int grace_net_id; static DEFINE_SPINLOCK(grace_lock); /** * locks_start_grace + * @net: net namespace that this lock manager belongs to * @lm: who this grace period is for * * A grace period is a period during which locks should not be given @@ -21,18 +24,20 @@ static DEFINE_SPINLOCK(grace_lock); * * This function is called to start a grace period. 
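With grace.c moved to nfs_common, the grace list now lives in per-namespace storage keyed by grace_net_id, and "in grace" simply means the list is non-empty. A hedged consumer-side sketch of the API defined in this file, usable by either lockd or nfsd (demo_lm is hypothetical; assumes the declarations from linux/fs.h):

#include <linux/fs.h>

static struct lock_manager demo_lm;     /* hypothetical lock manager */

static void demo_grace_begin(struct net *net)
{
        locks_start_grace(net, &demo_lm);   /* only reclaims allowed now */
}

static void demo_grace_end(struct net *net)
{
        locks_end_grace(&demo_lm);          /* documented as safe to repeat */
        if (!locks_in_grace(net))           /* true once all managers done */
                pr_info("grace period over\n");
}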
*/ -void locks_start_grace(struct net *net, struct lock_manager *lm) +void +locks_start_grace(struct net *net, struct lock_manager *lm) { - struct lockd_net *ln = net_generic(net, lockd_net_id); + struct list_head *grace_list = net_generic(net, grace_net_id); spin_lock(&grace_lock); - list_add(&lm->list, &ln->grace_list); + list_add(&lm->list, grace_list); spin_unlock(&grace_lock); } EXPORT_SYMBOL_GPL(locks_start_grace); /** * locks_end_grace + * @net: net namespace that this lock manager belongs to * @lm: who this grace period is for * * Call this function to state that the given lock manager is ready to @@ -41,7 +46,8 @@ EXPORT_SYMBOL_GPL(locks_start_grace); * Note that callers count on it being safe to call this more than once, * and the second call should be a no-op. */ -void locks_end_grace(struct lock_manager *lm) +void +locks_end_grace(struct lock_manager *lm) { spin_lock(&grace_lock); list_del_init(&lm->list); @@ -56,10 +62,52 @@ EXPORT_SYMBOL_GPL(locks_end_grace); * to answer ordinary lock requests, and when they should accept only * lock reclaims. */ -int locks_in_grace(struct net *net) +int +locks_in_grace(struct net *net) { - struct lockd_net *ln = net_generic(net, lockd_net_id); + struct list_head *grace_list = net_generic(net, grace_net_id); - return !list_empty(&ln->grace_list); + return !list_empty(grace_list); } EXPORT_SYMBOL_GPL(locks_in_grace); + +static int __net_init +grace_init_net(struct net *net) +{ + struct list_head *grace_list = net_generic(net, grace_net_id); + + INIT_LIST_HEAD(grace_list); + return 0; +} + +static void __net_exit +grace_exit_net(struct net *net) +{ + struct list_head *grace_list = net_generic(net, grace_net_id); + + BUG_ON(!list_empty(grace_list)); +} + +static struct pernet_operations grace_net_ops = { + .init = grace_init_net, + .exit = grace_exit_net, + .id = &grace_net_id, + .size = sizeof(struct list_head), +}; + +static int __init +init_grace(void) +{ + return register_pernet_subsys(&grace_net_ops); +} + +static void __exit +exit_grace(void) +{ + unregister_pernet_subsys(&grace_net_ops); +} + +MODULE_AUTHOR("Jeff Layton <jlayton@primarydata.com>"); +MODULE_LICENSE("GPL"); +module_init(init_grace) +module_exit(exit_grace) diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c index ed628f71274c..538f142935ea 100644 --- a/fs/nfs_common/nfsacl.c +++ b/fs/nfs_common/nfsacl.c @@ -30,9 +30,6 @@ MODULE_LICENSE("GPL"); -EXPORT_SYMBOL_GPL(nfsacl_encode); -EXPORT_SYMBOL_GPL(nfsacl_decode); - struct nfsacl_encode_desc { struct xdr_array2_desc desc; unsigned int count; @@ -136,6 +133,7 @@ int nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode, nfsacl_desc.desc.array_len; return err; } +EXPORT_SYMBOL_GPL(nfsacl_encode); struct nfsacl_decode_desc { struct xdr_array2_desc desc; @@ -295,3 +293,4 @@ int nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt, return 8 + nfsacl_desc.desc.elem_size * nfsacl_desc.desc.array_len; } +EXPORT_SYMBOL_GPL(nfsacl_decode); diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index f994e750e0d1..73395156bdb4 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -71,6 +71,7 @@ config NFSD_V4 select FS_POSIX_ACL select SUNRPC_GSS select CRYPTO + select GRACE_PERIOD help This option enables support in your system's NFS server for version 4 of the NFS protocol (RFC 3530). @@ -94,9 +95,6 @@ config NFSD_V4_SECURITY_LABEL If you do not wish to enable fine-grained security labels SELinux or Smack policies on NFSv4 files, say N. 
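With grace.c moved to fs/nfs_common and keyed off net_generic(), the grace period reduces to a per-namespace list of lock managers: the namespace is in grace while the list is non-empty, and both lockd and nfsd4 can register on it. A userspace model of that logic, with stand-in types and no locking (the kernel serializes with grace_lock):

#include <stdbool.h>
#include <stdio.h>

struct lock_manager {
	struct lock_manager *next;
};

struct netns {
	struct lock_manager *grace_list;	/* what net_generic() returns */
};

static void start_grace(struct netns *net, struct lock_manager *lm)
{
	lm->next = net->grace_list;
	net->grace_list = lm;
}

static void end_grace(struct netns *net, struct lock_manager *lm)
{
	struct lock_manager **p;

	/* callers may end grace twice; the second call finds nothing */
	for (p = &net->grace_list; *p; p = &(*p)->next) {
		if (*p == lm) {
			*p = lm->next;
			break;
		}
	}
}

static bool in_grace(const struct netns *net)
{
	return net->grace_list != NULL;
}

int main(void)
{
	struct netns net = { NULL };
	struct lock_manager nlm, nfsd4;

	start_grace(&net, &nlm);
	start_grace(&net, &nfsd4);
	end_grace(&net, &nlm);
	printf("in grace: %d\n", in_grace(&net));	/* 1: nfsd4 still in grace */
	end_grace(&net, &nfsd4);
	printf("in grace: %d\n", in_grace(&net));	/* 0 */
	return 0;
}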
- WARNING: there is still a chance of backwards-incompatible protocol changes. - For now we recommend "Y" only for developers and testers. - config NFSD_FAULT_INJECTION bool "NFS server manual fault injection" depends on NFSD_V4 && DEBUG_KERNEL diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h index a986ceb6fd0d..4cd7c69a6cb9 100644 --- a/fs/nfsd/acl.h +++ b/fs/nfsd/acl.h @@ -47,7 +47,7 @@ struct svc_rqst; #define NFS4_ACL_MAX ((PAGE_SIZE - sizeof(struct nfs4_acl)) \ / sizeof(struct nfs4_ace)) -struct nfs4_acl *nfs4_acl_new(int); +int nfs4_acl_bytes(int entries); int nfs4_acl_get_whotype(char *, u32); __be32 nfs4_acl_write_who(struct xdr_stream *xdr, int who); diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 72f44823adbb..9d46a0bdd9f9 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -28,7 +28,7 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) validate_process_creds(); /* discard any old override before preparing the new set */ - revert_creds(get_cred(current->real_cred)); + revert_creds(get_cred(current_real_cred())); new = prepare_creds(); if (!new) return -ENOMEM; diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h index b582f9ab6b2a..dd96a3830004 100644 --- a/fs/nfsd/cache.h +++ b/fs/nfsd/cache.h @@ -18,7 +18,6 @@ * is much larger than a sockaddr_in6. */ struct svc_cacherep { - struct hlist_node c_hash; struct list_head c_lru; unsigned char c_state, /* unused, inprog, done */ diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 13b85f94d9e2..30a739d896ff 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -698,8 +698,8 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem) kref_get(&item->ex_client->ref); new->ex_client = item->ex_client; - new->ex_path.dentry = dget(item->ex_path.dentry); - new->ex_path.mnt = mntget(item->ex_path.mnt); + new->ex_path = item->ex_path; + path_get(&item->ex_path); new->ex_fslocs.locations = NULL; new->ex_fslocs.locations_count = 0; new->ex_fslocs.migrated = 0; @@ -1145,6 +1145,7 @@ static struct flags { { NFSEXP_ALLSQUASH, {"all_squash", ""}}, { NFSEXP_ASYNC, {"async", "sync"}}, { NFSEXP_GATHERED_WRITES, {"wdelay", "no_wdelay"}}, + { NFSEXP_NOREADDIRPLUS, {"nordirplus", ""}}, { NFSEXP_NOHIDE, {"nohide", ""}}, { NFSEXP_CROSSMOUNT, {"crossmnt", ""}}, { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}}, @@ -1253,7 +1254,7 @@ static int e_show(struct seq_file *m, void *p) return 0; } - cache_get(&exp->h); + exp_get(exp); if (cache_check(cd, &exp->h, NULL)) return 0; exp_put(exp); diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h index cfeea85c5bed..04dc8c167b0c 100644 --- a/fs/nfsd/export.h +++ b/fs/nfsd/export.h @@ -101,9 +101,10 @@ static inline void exp_put(struct svc_export *exp) cache_put(&exp->h, exp->cd); } -static inline void exp_get(struct svc_export *exp) +static inline struct svc_export *exp_get(struct svc_export *exp) { cache_get(&exp->h); + return exp; } struct svc_export * rqst_exp_find(struct svc_rqst *, int, u32 *); diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c index 2ed05c3cd43d..c16bf5af6831 100644 --- a/fs/nfsd/fault_inject.c +++ b/fs/nfsd/fault_inject.c @@ -17,81 +17,13 @@ struct nfsd_fault_inject_op { char *file; - u64 (*forget)(struct nfs4_client *, u64); - u64 (*print)(struct nfs4_client *, u64); + u64 (*get)(void); + u64 (*set_val)(u64); + u64 (*set_clnt)(struct sockaddr_storage *, size_t); }; -static struct nfsd_fault_inject_op inject_ops[] = { - { - .file = "forget_clients", - .forget = nfsd_forget_client, - .print = nfsd_print_client, - }, - { - .file = "forget_locks", - 
.forget = nfsd_forget_client_locks, - .print = nfsd_print_client_locks, - }, - { - .file = "forget_openowners", - .forget = nfsd_forget_client_openowners, - .print = nfsd_print_client_openowners, - }, - { - .file = "forget_delegations", - .forget = nfsd_forget_client_delegations, - .print = nfsd_print_client_delegations, - }, - { - .file = "recall_delegations", - .forget = nfsd_recall_client_delegations, - .print = nfsd_print_client_delegations, - }, -}; - -static long int NUM_INJECT_OPS = sizeof(inject_ops) / sizeof(struct nfsd_fault_inject_op); static struct dentry *debug_dir; -static void nfsd_inject_set(struct nfsd_fault_inject_op *op, u64 val) -{ - u64 count = 0; - - if (val == 0) - printk(KERN_INFO "NFSD Fault Injection: %s (all)", op->file); - else - printk(KERN_INFO "NFSD Fault Injection: %s (n = %llu)", op->file, val); - - nfs4_lock_state(); - count = nfsd_for_n_state(val, op->forget); - nfs4_unlock_state(); - printk(KERN_INFO "NFSD: %s: found %llu", op->file, count); -} - -static void nfsd_inject_set_client(struct nfsd_fault_inject_op *op, - struct sockaddr_storage *addr, - size_t addr_size) -{ - char buf[INET6_ADDRSTRLEN]; - struct nfs4_client *clp; - u64 count; - - nfs4_lock_state(); - clp = nfsd_find_client(addr, addr_size); - if (clp) { - count = op->forget(clp, 0); - rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, sizeof(buf)); - printk(KERN_INFO "NFSD [%s]: Client %s had %llu state object(s)\n", op->file, buf, count); - } - nfs4_unlock_state(); -} - -static void nfsd_inject_get(struct nfsd_fault_inject_op *op, u64 *val) -{ - nfs4_lock_state(); - *val = nfsd_for_n_state(0, op->print); - nfs4_unlock_state(); -} - static ssize_t fault_inject_read(struct file *file, char __user *buf, size_t len, loff_t *ppos) { @@ -99,9 +31,10 @@ static ssize_t fault_inject_read(struct file *file, char __user *buf, char read_buf[25]; size_t size; loff_t pos = *ppos; + struct nfsd_fault_inject_op *op = file_inode(file)->i_private; if (!pos) - nfsd_inject_get(file_inode(file)->i_private, &val); + val = op->get(); size = scnprintf(read_buf, sizeof(read_buf), "%llu\n", val); return simple_read_from_buffer(buf, len, ppos, read_buf, size); @@ -114,18 +47,36 @@ static ssize_t fault_inject_write(struct file *file, const char __user *buf, size_t size = min(sizeof(write_buf) - 1, len); struct net *net = current->nsproxy->net_ns; struct sockaddr_storage sa; + struct nfsd_fault_inject_op *op = file_inode(file)->i_private; u64 val; + char *nl; if (copy_from_user(write_buf, buf, size)) return -EFAULT; write_buf[size] = '\0'; + /* Deal with any embedded newlines in the string */ + nl = strchr(write_buf, '\n'); + if (nl) { + size = nl - write_buf; + *nl = '\0'; + } + size = rpc_pton(net, write_buf, size, (struct sockaddr *)&sa, sizeof(sa)); - if (size > 0) - nfsd_inject_set_client(file_inode(file)->i_private, &sa, size); - else { + if (size > 0) { + val = op->set_clnt(&sa, size); + if (val) + pr_info("NFSD [%s]: Client %s had %llu state object(s)\n", + op->file, write_buf, val); + } else { val = simple_strtoll(write_buf, NULL, 0); - nfsd_inject_set(file_inode(file)->i_private, val); + if (val == 0) + pr_info("NFSD Fault Injection: %s (all)", op->file); + else + pr_info("NFSD Fault Injection: %s (n = %llu)", + op->file, val); + val = op->set_val(val); + pr_info("NFSD: %s: found %llu", op->file, val); } return len; /* on success, claim we got the whole input */ } @@ -141,6 +92,41 @@ void nfsd_fault_inject_cleanup(void) debugfs_remove_recursive(debug_dir); } +static struct nfsd_fault_inject_op inject_ops[] = { + 
{ + .file = "forget_clients", + .get = nfsd_inject_print_clients, + .set_val = nfsd_inject_forget_clients, + .set_clnt = nfsd_inject_forget_client, + }, + { + .file = "forget_locks", + .get = nfsd_inject_print_locks, + .set_val = nfsd_inject_forget_locks, + .set_clnt = nfsd_inject_forget_client_locks, + }, + { + .file = "forget_openowners", + .get = nfsd_inject_print_openowners, + .set_val = nfsd_inject_forget_openowners, + .set_clnt = nfsd_inject_forget_client_openowners, + }, + { + .file = "forget_delegations", + .get = nfsd_inject_print_delegations, + .set_val = nfsd_inject_forget_delegations, + .set_clnt = nfsd_inject_forget_client_delegations, + }, + { + .file = "recall_delegations", + .get = nfsd_inject_print_delegations, + .set_val = nfsd_inject_recall_delegations, + .set_clnt = nfsd_inject_recall_client_delegations, + }, +}; + +#define NUM_INJECT_OPS (sizeof(inject_ops)/sizeof(struct nfsd_fault_inject_op)) + int nfsd_fault_inject_init(void) { unsigned int i; diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index d32b3aa6600d..ea6749a32760 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -29,14 +29,19 @@ #define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS) #define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1) -#define LOCKOWNER_INO_HASH_BITS 8 -#define LOCKOWNER_INO_HASH_SIZE (1 << LOCKOWNER_INO_HASH_BITS) - #define SESSION_HASH_SIZE 512 struct cld_net; struct nfsd4_client_tracking_ops; +/* + * Represents an nfsd "container". With respect to nfsv4 state tracking, the + * fields of interest are the *_id_hashtbls and the *_name_tree. These track + * the nfs4_client objects by either short or long form clientid. + * + * Each nfsd_net runs an nfs4_laundromat workqueue job when necessary to clean + * up expired clients and delegations within the container. + */ struct nfsd_net { struct cld_net *cld_net; @@ -66,8 +71,6 @@ struct nfsd_net { struct rb_root conf_name_tree; struct list_head *unconf_id_hashtbl; struct rb_root unconf_name_tree; - struct list_head *ownerstr_hashtbl; - struct list_head *lockowner_ino_hashtbl; struct list_head *sessionid_hashtbl; /* * client_lru holds client queue ordered by nfs4_client.cl_time @@ -97,10 +100,16 @@ struct nfsd_net { bool nfsd_net_up; bool lockd_up; + /* Time of server startup */ + struct timeval nfssvc_boot; + /* - * Time of server startup + * Max number of connections this nfsd container will allow. Defaults + * to '0' which means that it bases this on the number of threads. + */ - struct timeval nfssvc_boot; + unsigned int max_connections; + + u32 clientid_counter; struct svc_serv *nfsd_serv; }; diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 12b023a7ab7d..ac54ea60b3f6 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -54,14 +54,14 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp, if (resp->mask & (NFS_ACL|NFS_ACLCNT)) { acl = get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl)) { - nfserr = nfserrno(PTR_ERR(acl)); - goto fail; - } if (acl == NULL) { /* Solaris returns the inode's minimum ACL. 
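The rewritten fault_inject_write() above first strips a trailing newline, then treats input that parses as an address as a per-client operation and anything else as a numeric count for a bulk operation. A hedged userspace sketch of that dispatch, with inet_pton() standing in for the kernel's rpc_pton():

#include <arpa/inet.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void handle_write(char *buf)
{
	struct in6_addr a6;
	struct in_addr a4;
	char *nl = strchr(buf, '\n');

	/* deal with any embedded newline, as the patch does */
	if (nl)
		*nl = '\0';

	if (inet_pton(AF_INET, buf, &a4) == 1 ||
	    inet_pton(AF_INET6, buf, &a6) == 1)
		printf("per-client operation for %s\n", buf);
	else
		printf("bulk operation, n = %llu\n",
		       (unsigned long long)strtoull(buf, NULL, 0));
}

int main(void)
{
	char a[] = "192.168.1.5\n";
	char b[] = "10\n";

	handle_write(a);	/* per-client operation for 192.168.1.5 */
	handle_write(b);	/* bulk operation, n = 10 */
	return 0;
}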
*/ acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); } + if (IS_ERR(acl)) { + nfserr = nfserrno(PTR_ERR(acl)); + goto fail; + } resp->acl_access = acl; } if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) { diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 2a514e21dc74..34cbbab6abd7 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -47,14 +47,14 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp, if (resp->mask & (NFS_ACL|NFS_ACLCNT)) { acl = get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl)) { - nfserr = nfserrno(PTR_ERR(acl)); - goto fail; - } if (acl == NULL) { /* Solaris returns the inode's minimum ACL. */ acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); } + if (IS_ERR(acl)) { + nfserr = nfserrno(PTR_ERR(acl)); + goto fail; + } resp->acl_access = acl; } if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) { diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 401289913130..12f2aab4f614 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -157,11 +157,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp, * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof) * + 1 (xdr opaque byte count) = 26 */ - - resp->count = argp->count; - if (max_blocksize < resp->count) - resp->count = max_blocksize; - + resp->count = min(argp->count, max_blocksize); svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4); fh_copy(&resp->fh, &argp->fh); @@ -227,11 +223,6 @@ nfsd3_proc_create(struct svc_rqst *rqstp, struct nfsd3_createargs *argp, newfhp = fh_init(&resp->fh, NFS3_FHSIZE); attr = &argp->attrs; - /* Get the directory inode */ - nfserr = fh_verify(rqstp, dirfhp, S_IFDIR, NFSD_MAY_CREATE); - if (nfserr) - RETURN_STATUS(nfserr); - /* Unfudge the mode bits */ attr->ia_mode &= ~S_IFMT; if (!(attr->ia_valid & ATTR_MODE)) { @@ -286,8 +277,7 @@ nfsd3_proc_symlink(struct svc_rqst *rqstp, struct nfsd3_symlinkargs *argp, fh_copy(&resp->dirfh, &argp->ffh); fh_init(&resp->fh, NFS3_FHSIZE); nfserr = nfsd_symlink(rqstp, &resp->dirfh, argp->fname, argp->flen, - argp->tname, argp->tlen, - &resp->fh, &argp->attrs); + argp->tname, &resp->fh); RETURN_STATUS(nfserr); } @@ -476,6 +466,14 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp, resp->buflen = resp->count; resp->rqstp = rqstp; offset = argp->cookie; + + nfserr = fh_verify(rqstp, &resp->fh, S_IFDIR, NFSD_MAY_NOP); + if (nfserr) + RETURN_STATUS(nfserr); + + if (resp->fh.fh_export->ex_flags & NFSEXP_NOREADDIRPLUS) + RETURN_STATUS(nfserr_notsupp); + nfserr = nfsd_readdir(rqstp, &resp->fh, &offset, &resp->common, diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index e6c01e80325e..39c5eb3ad33a 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -120,10 +120,7 @@ decode_sattr3(__be32 *p, struct iattr *iap) iap->ia_valid |= ATTR_SIZE; p = xdr_decode_hyper(p, &newsize); - if (newsize <= NFS_OFFSET_MAX) - iap->ia_size = newsize; - else - iap->ia_size = NFS_OFFSET_MAX; + iap->ia_size = min_t(u64, newsize, NFS_OFFSET_MAX); } if ((tmp = ntohl(*p++)) == 1) { /* set to server time */ iap->ia_valid |= ATTR_ATIME; @@ -338,10 +335,8 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p, return 0; p = xdr_decode_hyper(p, &args->offset); - len = args->count = ntohl(*p++); - - if (len > max_blocksize) - len = max_blocksize; + args->count = ntohl(*p++); + len = min(args->count, max_blocksize); /* set up the kvec */ v=0; @@ -349,7 +344,7 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p, struct page *p = *(rqstp->rq_next_page++); rqstp->rq_vec[v].iov_base = 
page_address(p); - rqstp->rq_vec[v].iov_len = len < PAGE_SIZE? len : PAGE_SIZE; + rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE); len -= rqstp->rq_vec[v].iov_len; v++; } @@ -484,9 +479,7 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p, } /* now copy next page if there is one */ if (len && !avail && rqstp->rq_arg.page_len) { - avail = rqstp->rq_arg.page_len; - if (avail > PAGE_SIZE) - avail = PAGE_SIZE; + avail = min_t(unsigned int, rqstp->rq_arg.page_len, PAGE_SIZE); old = page_address(rqstp->rq_arg.pages[0]); } while (len && avail && *old) { @@ -571,10 +564,7 @@ nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p, args->verf = p; p += 2; args->dircount = ~0; args->count = ntohl(*p++); - - if (args->count > PAGE_SIZE) - args->count = PAGE_SIZE; - + args->count = min_t(u32, args->count, PAGE_SIZE); args->buffer = page_address(*(rqstp->rq_next_page++)); return xdr_argsize_check(rqstp, p); @@ -595,10 +585,7 @@ nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p, args->dircount = ntohl(*p++); args->count = ntohl(*p++); - len = (args->count > max_blocksize) ? max_blocksize : - args->count; - args->count = len; - + len = args->count = min(args->count, max_blocksize); while (len > 0) { struct page *p = *(rqstp->rq_next_page++); if (!args->buffer) @@ -913,8 +900,7 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen, */ /* truncate filename if too long */ - if (namlen > NFS3_MAXNAMLEN) - namlen = NFS3_MAXNAMLEN; + namlen = min(namlen, NFS3_MAXNAMLEN); slen = XDR_QUADLEN(namlen); elen = slen + NFS3_ENTRY_BAGGAGE diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index d714156a19fd..59fd76651781 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -146,35 +146,43 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, int size = 0; pacl = get_acl(inode, ACL_TYPE_ACCESS); - if (!pacl) { + if (!pacl) pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); - if (IS_ERR(pacl)) - return PTR_ERR(pacl); - } + + if (IS_ERR(pacl)) + return PTR_ERR(pacl); + /* allocate for worst case: one (deny, allow) pair each: */ size += 2 * pacl->a_count; if (S_ISDIR(inode->i_mode)) { flags = NFS4_ACL_DIR; dpacl = get_acl(inode, ACL_TYPE_DEFAULT); + if (IS_ERR(dpacl)) { + error = PTR_ERR(dpacl); + goto rel_pacl; + } + if (dpacl) size += 2 * dpacl->a_count; } - *acl = nfs4_acl_new(size); + *acl = kmalloc(nfs4_acl_bytes(size), GFP_KERNEL); if (*acl == NULL) { error = -ENOMEM; goto out; } + (*acl)->naces = 0; _posix_to_nfsv4_one(pacl, *acl, flags & ~NFS4_ACL_TYPE_DEFAULT); if (dpacl) _posix_to_nfsv4_one(dpacl, *acl, flags | NFS4_ACL_TYPE_DEFAULT); - out: - posix_acl_release(pacl); +out: posix_acl_release(dpacl); +rel_pacl: + posix_acl_release(pacl); return error; } @@ -872,16 +880,13 @@ ace2type(struct nfs4_ace *ace) return -1; } -struct nfs4_acl * -nfs4_acl_new(int n) +/* + * return the size of the struct nfs4_acl required to represent an acl + * with @entries entries. 
+ */ +int nfs4_acl_bytes(int entries) { - struct nfs4_acl *acl; - - acl = kmalloc(sizeof(*acl) + n*sizeof(struct nfs4_ace), GFP_KERNEL); - if (acl == NULL) - return NULL; - acl->naces = 0; - return acl; + return sizeof(struct nfs4_acl) + entries * sizeof(struct nfs4_ace); } static struct { @@ -935,5 +940,5 @@ __be32 nfs4_acl_write_who(struct xdr_stream *xdr, int who) return 0; } WARN_ON_ONCE(1); - return -1; + return nfserr_serverfault; } diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 2c73cae9899d..ed2b1151b171 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -49,12 +49,6 @@ static void nfsd4_mark_cb_fault(struct nfs4_client *, int reason); /* Index of predefined Linux callback client operations */ -enum { - NFSPROC4_CLNT_CB_NULL = 0, - NFSPROC4_CLNT_CB_RECALL, - NFSPROC4_CLNT_CB_SEQUENCE, -}; - struct nfs4_cb_compound_hdr { /* args */ u32 ident; /* minorversion 0 only */ @@ -337,7 +331,7 @@ static void encode_cb_recall4args(struct xdr_stream *xdr, p = xdr_reserve_space(xdr, 4); *p++ = xdr_zero; /* truncate */ - encode_nfs_fh4(xdr, &dp->dl_fh); + encode_nfs_fh4(xdr, &dp->dl_stid.sc_file->fi_fhandle); hdr->nops++; } @@ -494,7 +488,7 @@ static void nfs4_xdr_enc_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr, static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr, const struct nfsd4_callback *cb) { - const struct nfs4_delegation *args = cb->cb_op; + const struct nfs4_delegation *dp = cb_to_delegation(cb); struct nfs4_cb_compound_hdr hdr = { .ident = cb->cb_clp->cl_cb_ident, .minorversion = cb->cb_minorversion, @@ -502,7 +496,7 @@ static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr, encode_cb_compound4args(xdr, &hdr); encode_cb_sequence4args(xdr, cb, &hdr); - encode_cb_recall4args(xdr, args, &hdr); + encode_cb_recall4args(xdr, dp, &hdr); encode_cb_nops(&hdr); } @@ -678,7 +672,7 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c (clp->cl_cred.cr_flavor >= RPC_AUTH_GSS_KRB5)) return -EINVAL; args.client_name = clp->cl_cred.cr_principal; - args.prognumber = conn->cb_prog, + args.prognumber = conn->cb_prog; args.protocol = XPRT_TRANSPORT_TCP; args.authflavor = clp->cl_cred.cr_flavor; clp->cl_cb_ident = conn->cb_ident; @@ -689,7 +683,8 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c clp->cl_cb_session = ses; args.bc_xprt = conn->cb_xprt; args.prognumber = clp->cl_cb_session->se_cb_prog; - args.protocol = XPRT_TRANSPORT_BC_TCP; + args.protocol = conn->cb_xprt->xpt_class->xcl_ident | + XPRT_TRANSPORT_BC; args.authflavor = ses->se_cb_sec.flavor; } /* Create RPC client */ @@ -745,27 +740,6 @@ static const struct rpc_call_ops nfsd4_cb_probe_ops = { static struct workqueue_struct *callback_wq; -static void run_nfsd4_cb(struct nfsd4_callback *cb) -{ - queue_work(callback_wq, &cb->cb_work); -} - -static void do_probe_callback(struct nfs4_client *clp) -{ - struct nfsd4_callback *cb = &clp->cl_cb_null; - - cb->cb_op = NULL; - cb->cb_clp = clp; - - cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL]; - cb->cb_msg.rpc_argp = NULL; - cb->cb_msg.rpc_resp = NULL; - - cb->cb_ops = &nfsd4_cb_probe_ops; - - run_nfsd4_cb(cb); -} - /* * Poke the callback thread to process any updates to the callback * parameters, and send a null probe. 
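Replacing nfs4_acl_new() with nfs4_acl_bytes() moves the allocation to the caller, which sizes a structure with a trailing flexible array in a single allocation and then initializes naces itself. A small sketch of that pattern, assuming illustrative field names and plain malloc in place of kmalloc:

#include <stdio.h>
#include <stdlib.h>

struct ace {
	int type, flag, mask, who;
};

struct acl {
	unsigned int naces;
	struct ace aces[];	/* flexible array member */
};

/* same calculation as nfs4_acl_bytes() */
static size_t acl_bytes(unsigned int entries)
{
	return sizeof(struct acl) + entries * sizeof(struct ace);
}

int main(void)
{
	unsigned int n = 8;
	struct acl *acl = malloc(acl_bytes(n));

	if (!acl)
		return 1;
	acl->naces = 0;		/* the caller initializes this now */
	printf("allocated %zu bytes for %u aces\n", acl_bytes(n), n);
	free(acl);
	return 0;
}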
@@ -774,7 +748,7 @@ void nfsd4_probe_callback(struct nfs4_client *clp) { clp->cl_cb_state = NFSD4_CB_UNKNOWN; set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags); - do_probe_callback(clp); + nfsd4_run_cb(&clp->cl_cb_null); } void nfsd4_probe_callback_sync(struct nfs4_client *clp) @@ -846,23 +820,9 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata) rpc_wake_up_next(&clp->cl_cb_waitq); dprintk("%s: freed slot, new seqid=%d\n", __func__, clp->cl_cb_session->se_cb_seq_nr); - - /* We're done looking into the sequence information */ - task->tk_msg.rpc_resp = NULL; } -} - -static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) -{ - struct nfsd4_callback *cb = calldata; - struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); - struct nfs4_client *clp = cb->cb_clp; - struct rpc_clnt *current_rpc_client = clp->cl_cb_client; - - nfsd4_cb_done(task, calldata); - - if (current_rpc_client != task->tk_client) { + if (clp->cl_cb_client != task->tk_client) { /* We're shutting down or changing cl_cb_client; leave * it to nfsd4_process_cb_update to restart the call if * necessary. */ @@ -871,47 +831,42 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) if (cb->cb_done) return; - switch (task->tk_status) { + + switch (cb->cb_ops->done(cb, task)) { case 0: - cb->cb_done = true; + task->tk_status = 0; + rpc_restart_call_prepare(task); return; - case -EBADHANDLE: - case -NFS4ERR_BAD_STATEID: - /* Race: client probably got cb_recall - * before open reply granting delegation */ + case 1: break; - default: + case -1: /* Network partition? */ nfsd4_mark_cb_down(clp, task->tk_status); + break; + default: + BUG(); } - if (dp->dl_retries--) { - rpc_delay(task, 2*HZ); - task->tk_status = 0; - rpc_restart_call_prepare(task); - return; - } - nfsd4_mark_cb_down(clp, task->tk_status); cb->cb_done = true; } -static void nfsd4_cb_recall_release(void *calldata) +static void nfsd4_cb_release(void *calldata) { struct nfsd4_callback *cb = calldata; struct nfs4_client *clp = cb->cb_clp; - struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); if (cb->cb_done) { spin_lock(&clp->cl_lock); list_del(&cb->cb_per_client); spin_unlock(&clp->cl_lock); - nfs4_put_delegation(dp); + + cb->cb_ops->release(cb); } } -static const struct rpc_call_ops nfsd4_cb_recall_ops = { +static const struct rpc_call_ops nfsd4_cb_ops = { .rpc_call_prepare = nfsd4_cb_prepare, - .rpc_call_done = nfsd4_cb_recall_done, - .rpc_release = nfsd4_cb_recall_release, + .rpc_call_done = nfsd4_cb_done, + .rpc_release = nfsd4_cb_release, }; int nfsd4_create_callback_queue(void) @@ -933,19 +888,13 @@ void nfsd4_shutdown_callback(struct nfs4_client *clp) set_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags); /* * Note this won't actually result in a null callback; - * instead, nfsd4_do_callback_rpc() will detect the killed + * instead, nfsd4_run_cb_null() will detect the killed * client, destroy the rpc client, and stop: */ - do_probe_callback(clp); + nfsd4_run_cb(&clp->cl_cb_null); flush_workqueue(callback_wq); } -static void nfsd4_release_cb(struct nfsd4_callback *cb) -{ - if (cb->cb_ops->rpc_release) - cb->cb_ops->rpc_release(cb); -} - /* requires cl_lock: */ static struct nfsd4_conn * __nfsd4_find_backchannel(struct nfs4_client *clp) { @@ -1008,50 +957,49 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb) } /* Yay, the callback channel's back! 
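The callback rework above collapses the per-operation rpc_call_ops into one table and lets cb_ops->done() steer the state machine: 0 asks for the RPC to be restarted, 1 means finished, and -1 marks the channel down. A toy model of that return-code protocol; the types and the one-retry policy here are illustrative, not the kernel's:

#include <stdio.h>

struct cb;

struct cb_ops {
	int (*done)(struct cb *cb, int rpc_status);
};

struct cb {
	const struct cb_ops *ops;
	int retries;
	int done;
};

static int recall_done(struct cb *cb, int rpc_status)
{
	if (rpc_status == 0)
		return 1;		/* success: finished */
	if (cb->retries-- > 0)
		return 0;		/* transient error: retry */
	return -1;			/* give up: channel is down */
}

static void handle_done(struct cb *cb, int rpc_status)
{
	if (cb->done)			/* mirrors the cb_done guard */
		return;

	switch (cb->ops->done(cb, rpc_status)) {
	case 0:
		printf("restarting call\n");
		return;
	case 1:
		break;
	case -1:
		printf("marking callback channel down\n");
		break;
	}
	cb->done = 1;
}

int main(void)
{
	struct cb_ops ops = { recall_done };
	struct cb cb = { &ops, 1, 0 };

	handle_done(&cb, -107);		/* first failure: retried */
	handle_done(&cb, -107);		/* out of retries: channel down */
	handle_done(&cb, 0);		/* already done: ignored */
	return 0;
}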
Restart any callbacks: */ list_for_each_entry(cb, &clp->cl_callbacks, cb_per_client) - run_nfsd4_cb(cb); + queue_work(callback_wq, &cb->cb_work); } -static void nfsd4_do_callback_rpc(struct work_struct *w) +static void +nfsd4_run_cb_work(struct work_struct *work) { - struct nfsd4_callback *cb = container_of(w, struct nfsd4_callback, cb_work); + struct nfsd4_callback *cb = + container_of(work, struct nfsd4_callback, cb_work); struct nfs4_client *clp = cb->cb_clp; struct rpc_clnt *clnt; + if (cb->cb_ops && cb->cb_ops->prepare) + cb->cb_ops->prepare(cb); + if (clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK) nfsd4_process_cb_update(cb); clnt = clp->cl_cb_client; if (!clnt) { /* Callback channel broken, or client killed; give up: */ - nfsd4_release_cb(cb); + if (cb->cb_ops && cb->cb_ops->release) + cb->cb_ops->release(cb); return; } cb->cb_msg.rpc_cred = clp->cl_cb_cred; rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN, - cb->cb_ops, cb); -} - -void nfsd4_init_callback(struct nfsd4_callback *cb) -{ - INIT_WORK(&cb->cb_work, nfsd4_do_callback_rpc); + cb->cb_ops ? &nfsd4_cb_ops : &nfsd4_cb_probe_ops, cb); } -void nfsd4_cb_recall(struct nfs4_delegation *dp) +void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, + struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op) { - struct nfsd4_callback *cb = &dp->dl_recall; - struct nfs4_client *clp = dp->dl_stid.sc_client; - - dp->dl_retries = 1; - cb->cb_op = dp; cb->cb_clp = clp; - cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL]; + cb->cb_msg.rpc_proc = &nfs4_cb_procedures[op]; cb->cb_msg.rpc_argp = cb; cb->cb_msg.rpc_resp = cb; - - cb->cb_ops = &nfsd4_cb_recall_ops; - + cb->cb_ops = ops; + INIT_WORK(&cb->cb_work, nfsd4_run_cb_work); INIT_LIST_HEAD(&cb->cb_per_client); cb->cb_done = true; +} - run_nfsd4_cb(&dp->dl_recall); +void nfsd4_run_cb(struct nfsd4_callback *cb) +{ + queue_work(callback_wq, &cb->cb_work); } diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index a0ab0a847d69..e1b3d3d472da 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -215,7 +215,8 @@ idtoname_parse(struct cache_detail *cd, char *buf, int buflen) memset(&ent, 0, sizeof(ent)); /* Authentication name */ - if (qword_get(&buf, buf1, PAGE_SIZE) <= 0) + len = qword_get(&buf, buf1, PAGE_SIZE); + if (len <= 0 || len >= IDMAP_NAMESZ) goto out; memcpy(ent.authname, buf1, sizeof(ent.authname)); @@ -245,12 +246,10 @@ idtoname_parse(struct cache_detail *cd, char *buf, int buflen) /* Name */ error = -EINVAL; len = qword_get(&buf, buf1, PAGE_SIZE); - if (len < 0) + if (len < 0 || len >= IDMAP_NAMESZ) goto out; if (len == 0) set_bit(CACHE_NEGATIVE, &ent.h.flags); - else if (len >= IDMAP_NAMESZ) - goto out; else memcpy(ent.name, buf1, sizeof(ent.name)); error = -ENOMEM; @@ -259,15 +258,12 @@ idtoname_parse(struct cache_detail *cd, char *buf, int buflen) goto out; cache_put(&res->h, cd); - error = 0; out: kfree(buf1); - return error; } - static struct ent * idtoname_lookup(struct cache_detail *cd, struct ent *item) { @@ -368,7 +364,7 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen) { struct ent ent, *res; char *buf1; - int error = -EINVAL; + int len, error = -EINVAL; if (buf[buflen - 1] != '\n') return (-EINVAL); @@ -381,7 +377,8 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen) memset(&ent, 0, sizeof(ent)); /* Authentication name */ - if (qword_get(&buf, buf1, PAGE_SIZE) <= 0) + len = qword_get(&buf, buf1, PAGE_SIZE); + if (len <= 0 || len >= IDMAP_NAMESZ) goto out; memcpy(ent.authname, buf1, 
sizeof(ent.authname)); @@ -392,8 +389,8 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen) IDMAP_TYPE_USER : IDMAP_TYPE_GROUP; /* Name */ - error = qword_get(&buf, buf1, PAGE_SIZE); - if (error <= 0 || error >= IDMAP_NAMESZ) + len = qword_get(&buf, buf1, PAGE_SIZE); + if (len <= 0 || len >= IDMAP_NAMESZ) goto out; memcpy(ent.name, buf1, sizeof(ent.name)); @@ -421,7 +418,6 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen) error = 0; out: kfree(buf1); - return (error); } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 8f029db5d271..cdeb3cfd6f32 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -177,7 +177,7 @@ fh_dup2(struct svc_fh *dst, struct svc_fh *src) fh_put(dst); dget(src->fh_dentry); if (src->fh_export) - cache_get(&src->fh_export->h); + exp_get(src->fh_export); *dst = *src; } @@ -385,8 +385,6 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (nfsd4_has_session(cstate)) copy_clientid(&open->op_clientid, cstate->session); - nfs4_lock_state(); - /* check seqid for replay. set nfs4_owner */ resp = rqstp->rq_resp; status = nfsd4_process_open1(&resp->cstate, open, nn); @@ -431,8 +429,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, break; case NFS4_OPEN_CLAIM_PREVIOUS: status = nfs4_check_open_reclaim(&open->op_clientid, - cstate->minorversion, - nn); + cstate, nn); if (status) goto out; open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; @@ -461,19 +458,17 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * set, (2) sets open->op_stateid, (3) sets open->op_delegation. */ status = nfsd4_process_open2(rqstp, resfh, open); - WARN_ON(status && open->op_created); + WARN(status && open->op_created, + "nfsd4_process_open2 failed to open newly-created file! status=%u\n", + be32_to_cpu(status)); out: if (resfh && resfh != &cstate->current_fh) { fh_dup2(&cstate->current_fh, resfh); fh_put(resfh); kfree(resfh); } - nfsd4_cleanup_open_state(open, status); - if (open->op_openowner && !nfsd4_has_session(cstate)) - cstate->replay_owner = &open->op_openowner->oo_owner; + nfsd4_cleanup_open_state(cstate, open, status); nfsd4_bump_seqid(cstate, status); - if (!cstate->replay_owner) - nfs4_unlock_state(); return status; } @@ -581,8 +576,12 @@ static void gen_boot_verifier(nfs4_verifier *verifier, struct net *net) __be32 verf[2]; struct nfsd_net *nn = net_generic(net, nfsd_net_id); - verf[0] = (__be32)nn->nfssvc_boot.tv_sec; - verf[1] = (__be32)nn->nfssvc_boot.tv_usec; + /* + * This is opaque to client, so no need to byte-swap. 
Use + * __force to keep sparse happy + */ + verf[0] = (__force __be32)nn->nfssvc_boot.tv_sec; + verf[1] = (__force __be32)nn->nfssvc_boot.tv_usec; memcpy(verifier->data, verf, sizeof(verifier->data)); } @@ -619,8 +618,7 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, case NF4LNK: status = nfsd_symlink(rqstp, &cstate->current_fh, create->cr_name, create->cr_namelen, - create->cr_linkname, create->cr_linklen, - &resfh, &create->cr_iattr); + create->cr_data, &resfh); break; case NF4BLK: @@ -909,8 +907,8 @@ nfsd4_secinfo_no_name(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstat default: return nfserr_inval; } - exp_get(cstate->current_fh.fh_export); - sin->sin_exp = cstate->current_fh.fh_export; + + sin->sin_exp = exp_get(cstate->current_fh.fh_export); fh_put(&cstate->current_fh); return nfs_ok; } @@ -1015,6 +1013,49 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return status; } +static __be32 +nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_seek *seek) +{ + int whence; + __be32 status; + struct file *file; + + status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate, + &seek->seek_stateid, + RD_STATE, &file); + if (status) { + dprintk("NFSD: nfsd4_seek: couldn't process stateid!\n"); + return status; + } + + switch (seek->seek_whence) { + case NFS4_CONTENT_DATA: + whence = SEEK_DATA; + break; + case NFS4_CONTENT_HOLE: + whence = SEEK_HOLE; + break; + default: + status = nfserr_union_notsupp; + goto out; + } + + /* + * Note: This call does change file->f_pos, but nothing in NFSD + * should ever use file->f_pos. + */ + seek->seek_pos = vfs_llseek(file, seek->seek_offset, whence); + if (seek->seek_pos < 0) + status = nfserrno(seek->seek_pos); + else if (seek->seek_pos >= i_size_read(file_inode(file))) + seek->seek_eof = true; + +out: + fput(file); + return status; +} + /* This routine never returns NFS_OK! If there are no other errors, it * will return NFSERR_SAME or NFSERR_NOT_SAME depending on whether the * attributes matched. VERIFY is implemented by mapping NFSERR_SAME @@ -1289,7 +1330,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, * Don't use the deferral mechanism for NFSv4; compounds make it * too hard to avoid non-idempotency problems. */ - rqstp->rq_usedeferral = 0; + rqstp->rq_usedeferral = false; /* * According to RFC3010, this takes precedence over all other errors. 
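The new OP_SEEK handler reduces to mapping the protocol's DATA/HOLE query onto vfs_llseek() with SEEK_DATA or SEEK_HOLE and flagging EOF when the result lands at or past the file size. The same round trip can be tried from userspace with lseek(); this assumes a filesystem with hole reporting, and the EOF check simply mirrors the patch:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "testfile";
	struct stat st;
	off_t pos;
	int fd = open(path, O_RDONLY);

	if (fd < 0 || fstat(fd, &st) < 0) {
		perror(path);
		return 1;
	}

	pos = lseek(fd, 0, SEEK_DATA);	/* first data at or after offset 0 */
	if (pos < 0)
		perror("SEEK_DATA");
	else
		printf("data at %lld%s\n", (long long)pos,
		       pos >= st.st_size ? " (eof)" : "");

	pos = lseek(fd, 0, SEEK_HOLE);	/* first hole at or after offset 0 */
	if (pos < 0)
		perror("SEEK_HOLE");
	else
		printf("hole at %lld\n", (long long)pos);

	close(fd);
	return 0;
}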
@@ -1391,10 +1432,7 @@ encode_op: args->ops, args->opcnt, resp->opcnt, op->opnum, be32_to_cpu(status)); - if (cstate->replay_owner) { - nfs4_unlock_state(); - cstate->replay_owner = NULL; - } + nfsd4_cstate_clear_replay(cstate); /* XXX Ugh, we need to get rid of this kind of special case: */ if (op->opnum == OP_READ && op->u.read.rd_filp) fput(op->u.read.rd_filp); @@ -1408,7 +1446,7 @@ encode_op: BUG_ON(cstate->replay_owner); out: /* Reset deferral mechanism for RPC deferrals */ - rqstp->rq_usedeferral = 1; + rqstp->rq_usedeferral = true; dprintk("nfsv4 compound returned %d\n", ntohl(status)); return status; } @@ -1520,21 +1558,17 @@ static inline u32 nfsd4_read_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) u32 maxcount = 0, rlen = 0; maxcount = svc_max_payload(rqstp); - rlen = op->u.read.rd_length; - - if (rlen > maxcount) - rlen = maxcount; + rlen = min(op->u.read.rd_length, maxcount); return (op_encode_hdr_size + 2 + XDR_QUADLEN(rlen)) * sizeof(__be32); } static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) { - u32 maxcount = svc_max_payload(rqstp); - u32 rlen = op->u.readdir.rd_maxcount; + u32 maxcount = 0, rlen = 0; - if (rlen > maxcount) - rlen = maxcount; + maxcount = svc_max_payload(rqstp); + rlen = min(op->u.readdir.rd_maxcount, maxcount); return (op_encode_hdr_size + op_encode_verifier_maxsz + XDR_QUADLEN(rlen)) * sizeof(__be32); @@ -1890,6 +1924,12 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid, .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, + + /* NFSv4.2 operations */ + [OP_SEEK] = { + .op_func = (nfsd4op_func)nfsd4_seek, + .op_name = "OP_SEEK", + }, }; int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op) diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 9c271f42604a..ea95a2bc21b5 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -58,7 +58,7 @@ struct nfsd4_client_tracking_ops { void (*create)(struct nfs4_client *); void (*remove)(struct nfs4_client *); int (*check)(struct nfs4_client *); - void (*grace_done)(struct nfsd_net *, time_t); + void (*grace_done)(struct nfsd_net *); }; /* Globals */ @@ -188,7 +188,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) status = mnt_want_write_file(nn->rec_file); if (status) - return; + goto out_creds; dir = nn->rec_file->f_path.dentry; /* lock the parent */ @@ -228,6 +228,7 @@ out_unlock: user_recovery_dirname); } mnt_drop_write_file(nn->rec_file); +out_creds: nfs4_reset_creds(original_cred); } @@ -392,7 +393,7 @@ purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn) } static void -nfsd4_recdir_purge_old(struct nfsd_net *nn, time_t boot_time) +nfsd4_recdir_purge_old(struct nfsd_net *nn) { int status; @@ -479,6 +480,16 @@ nfsd4_init_recdir(struct net *net) return status; } +static void +nfsd4_shutdown_recdir(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + if (!nn->rec_file) + return; + fput(nn->rec_file); + nn->rec_file = NULL; +} static int nfs4_legacy_state_init(struct net *net) @@ -512,10 +523,13 @@ nfsd4_load_reboot_recovery_data(struct net *net) int status; status = nfsd4_init_recdir(net); - if (!status) - status = nfsd4_recdir_load(net); if (status) - printk(KERN_ERR "NFSD: Failure reading reboot recovery data\n"); + return status; + + status = nfsd4_recdir_load(net); + if (status) + nfsd4_shutdown_recdir(net); + return status; } @@ -546,21 +560,12 @@ err: } static void -nfsd4_shutdown_recdir(struct nfsd_net *nn) -{ - if 
(!nn->rec_file) - return; - fput(nn->rec_file); - nn->rec_file = NULL; -} - -static void nfsd4_legacy_tracking_exit(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); nfs4_release_reclaim(nn); - nfsd4_shutdown_recdir(nn); + nfsd4_shutdown_recdir(net); nfs4_legacy_state_shutdown(net); } @@ -1016,7 +1021,7 @@ nfsd4_cld_check(struct nfs4_client *clp) } static void -nfsd4_cld_grace_done(struct nfsd_net *nn, time_t boot_time) +nfsd4_cld_grace_done(struct nfsd_net *nn) { int ret; struct cld_upcall *cup; @@ -1029,7 +1034,7 @@ nfsd4_cld_grace_done(struct nfsd_net *nn, time_t boot_time) } cup->cu_msg.cm_cmd = Cld_GraceDone; - cup->cu_msg.cm_u.cm_gracetime = (int64_t)boot_time; + cup->cu_msg.cm_u.cm_gracetime = (int64_t)nn->boot_time; ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg); if (!ret) ret = cup->cu_msg.cm_status; @@ -1062,6 +1067,8 @@ MODULE_PARM_DESC(cltrack_legacy_disable, #define LEGACY_TOPDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_TOPDIR=" #define LEGACY_RECDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_RECDIR=" +#define HAS_SESSION_ENV_PREFIX "NFSDCLTRACK_CLIENT_HAS_SESSION=" +#define GRACE_START_ENV_PREFIX "NFSDCLTRACK_GRACE_START=" static char * nfsd4_cltrack_legacy_topdir(void) @@ -1126,10 +1133,60 @@ nfsd4_cltrack_legacy_recdir(const struct xdr_netobj *name) return result; } +static char * +nfsd4_cltrack_client_has_session(struct nfs4_client *clp) +{ + int copied; + size_t len; + char *result; + + /* prefix + Y/N character + terminating NULL */ + len = strlen(HAS_SESSION_ENV_PREFIX) + 1 + 1; + + result = kmalloc(len, GFP_KERNEL); + if (!result) + return result; + + copied = snprintf(result, len, HAS_SESSION_ENV_PREFIX "%c", + clp->cl_minorversion ? 'Y' : 'N'); + if (copied >= len) { + /* just return nothing if output was truncated */ + kfree(result); + return NULL; + } + + return result; +} + +static char * +nfsd4_cltrack_grace_start(time_t grace_start) +{ + int copied; + size_t len; + char *result; + + /* prefix + max width of int64_t string + terminating NULL */ + len = strlen(GRACE_START_ENV_PREFIX) + 22 + 1; + + result = kmalloc(len, GFP_KERNEL); + if (!result) + return result; + + copied = snprintf(result, len, GRACE_START_ENV_PREFIX "%ld", + grace_start); + if (copied >= len) { + /* just return nothing if output was truncated */ + kfree(result); + return NULL; + } + + return result; +} + static int -nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *legacy) +nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *env0, char *env1) { - char *envp[2]; + char *envp[3]; char *argv[4]; int ret; @@ -1140,10 +1197,12 @@ nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *legacy) dprintk("%s: cmd: %s\n", __func__, cmd); dprintk("%s: arg: %s\n", __func__, arg ? arg : "(null)"); - dprintk("%s: legacy: %s\n", __func__, legacy ? legacy : "(null)"); + dprintk("%s: env0: %s\n", __func__, env0 ? env0 : "(null)"); + dprintk("%s: env1: %s\n", __func__, env1 ? env1 : "(null)"); - envp[0] = legacy; - envp[1] = NULL; + envp[0] = env0; + envp[1] = env1; + envp[2] = NULL; argv[0] = (char *)cltrack_prog; argv[1] = cmd; @@ -1187,28 +1246,78 @@ bin_to_hex_dup(const unsigned char *src, int srclen) } static int -nfsd4_umh_cltrack_init(struct net __attribute__((unused)) *net) +nfsd4_umh_cltrack_init(struct net *net) { + int ret; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + char *grace_start = nfsd4_cltrack_grace_start(nn->boot_time); + /* XXX: The usermode helper is not working in container yet. 
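nfsd4_cltrack_client_has_session() and nfsd4_cltrack_grace_start() above follow one pattern: format a "KEY=value" environment string for the usermode helper and hand back NULL if the output would be truncated. A userspace sketch of that pattern; make_env() is a hypothetical stand-in for the pair of kernel helpers:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char *make_env(const char *prefix, long long val)
{
	/* prefix + max width of an int64 string + terminating NUL */
	size_t len = strlen(prefix) + 22 + 1;
	char *result = malloc(len);
	int copied;

	if (!result)
		return NULL;

	copied = snprintf(result, len, "%s%lld", prefix, val);
	if (copied < 0 || (size_t)copied >= len) {
		/* just return nothing if output was truncated */
		free(result);
		return NULL;
	}
	return result;
}

int main(void)
{
	char *env = make_env("NFSDCLTRACK_GRACE_START=", 1400000000LL);

	if (env) {
		printf("%s\n", env);	/* NFSDCLTRACK_GRACE_START=1400000000 */
		free(env);
	}
	return 0;
}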
*/ if (net != &init_net) { WARN(1, KERN_ERR "NFSD: attempt to initialize umh client " "tracking in a container!\n"); return -EINVAL; } - return nfsd4_umh_cltrack_upcall("init", NULL, NULL); + + ret = nfsd4_umh_cltrack_upcall("init", NULL, grace_start, NULL); + kfree(grace_start); + return ret; +} + +static void +nfsd4_cltrack_upcall_lock(struct nfs4_client *clp) +{ + wait_on_bit_lock(&clp->cl_flags, NFSD4_CLIENT_UPCALL_LOCK, + TASK_UNINTERRUPTIBLE); +} + +static void +nfsd4_cltrack_upcall_unlock(struct nfs4_client *clp) +{ + smp_mb__before_atomic(); + clear_bit(NFSD4_CLIENT_UPCALL_LOCK, &clp->cl_flags); + smp_mb__after_atomic(); + wake_up_bit(&clp->cl_flags, NFSD4_CLIENT_UPCALL_LOCK); } static void nfsd4_umh_cltrack_create(struct nfs4_client *clp) { - char *hexid; + char *hexid, *has_session, *grace_start; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + /* + * With v4.0 clients, there's little difference in outcome between a + * create and check operation, and we can end up calling into this + * function multiple times per client (once for each openowner). So, + * for v4.0 clients skip upcalling once the client has been recorded + * on stable storage. + * + * For v4.1+ clients, the outcome of the two operations is different, + * so we must ensure that we upcall for the create operation. v4.1+ + * clients call this on RECLAIM_COMPLETE though, so we should only end + * up doing a single create upcall per client. + */ + if (clp->cl_minorversion == 0 && + test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) + return; hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len); if (!hexid) { dprintk("%s: can't allocate memory for upcall!\n", __func__); return; } - nfsd4_umh_cltrack_upcall("create", hexid, NULL); + + has_session = nfsd4_cltrack_client_has_session(clp); + grace_start = nfsd4_cltrack_grace_start(nn->boot_time); + + nfsd4_cltrack_upcall_lock(clp); + if (!nfsd4_umh_cltrack_upcall("create", hexid, has_session, grace_start)) + set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); + nfsd4_cltrack_upcall_unlock(clp); + + kfree(has_session); + kfree(grace_start); kfree(hexid); } @@ -1217,12 +1326,21 @@ nfsd4_umh_cltrack_remove(struct nfs4_client *clp) { char *hexid; + if (!test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) + return; + hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len); if (!hexid) { dprintk("%s: can't allocate memory for upcall!\n", __func__); return; } - nfsd4_umh_cltrack_upcall("remove", hexid, NULL); + + nfsd4_cltrack_upcall_lock(clp); + if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags) && + nfsd4_umh_cltrack_upcall("remove", hexid, NULL, NULL) == 0) + clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); + nfsd4_cltrack_upcall_unlock(clp); + kfree(hexid); } @@ -1230,30 +1348,45 @@ static int nfsd4_umh_cltrack_check(struct nfs4_client *clp) { int ret; - char *hexid, *legacy; + char *hexid, *has_session, *legacy; + + if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) + return 0; hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len); if (!hexid) { dprintk("%s: can't allocate memory for upcall!\n", __func__); return -ENOMEM; } + + has_session = nfsd4_cltrack_client_has_session(clp); legacy = nfsd4_cltrack_legacy_recdir(&clp->cl_name); - ret = nfsd4_umh_cltrack_upcall("check", hexid, legacy); + + nfsd4_cltrack_upcall_lock(clp); + if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) { + ret = 0; + } else { + ret = nfsd4_umh_cltrack_upcall("check", hexid, has_session, legacy); + if (ret == 0) + set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); + } + 
nfsd4_cltrack_upcall_unlock(clp); + kfree(has_session); kfree(legacy); kfree(hexid); + return ret; } static void -nfsd4_umh_cltrack_grace_done(struct nfsd_net __attribute__((unused)) *nn, - time_t boot_time) +nfsd4_umh_cltrack_grace_done(struct nfsd_net *nn) { char *legacy; char timestr[22]; /* FIXME: better way to determine max size? */ - sprintf(timestr, "%ld", boot_time); + sprintf(timestr, "%ld", nn->boot_time); legacy = nfsd4_cltrack_legacy_topdir(); - nfsd4_umh_cltrack_upcall("gracedone", timestr, legacy); + nfsd4_umh_cltrack_upcall("gracedone", timestr, legacy, NULL); kfree(legacy); } @@ -1356,10 +1489,10 @@ nfsd4_client_record_check(struct nfs4_client *clp) } void -nfsd4_record_grace_done(struct nfsd_net *nn, time_t boot_time) +nfsd4_record_grace_done(struct nfsd_net *nn) { if (nn->client_tracking_ops) - nn->client_tracking_ops->grace_done(nn, boot_time); + nn->client_tracking_ops->grace_done(nn); } static int diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 2204e1fe5725..e9c3afe4b5d3 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -70,13 +70,11 @@ static u64 current_sessionid = 1; #define CURRENT_STATEID(stateid) (!memcmp((stateid), ¤tstateid, sizeof(stateid_t))) /* forward declarations */ -static int check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner); +static bool check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner); +static void nfs4_free_ol_stateid(struct nfs4_stid *stid); /* Locking: */ -/* Currently used for almost all code touching nfsv4 state: */ -static DEFINE_MUTEX(client_mutex); - /* * Currently used for the del_recall_lru and file hash table. In an * effort to decrease the scope of the client_mutex, this spinlock may @@ -84,31 +82,27 @@ static DEFINE_MUTEX(client_mutex); */ static DEFINE_SPINLOCK(state_lock); +/* + * A waitqueue for all in-progress 4.0 CLOSE operations that are waiting for + * the refcount on the open stateid to drop. 
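nfsd4_cltrack_upcall_lock()/unlock() above serialize the create/remove/check upcalls per client by taking a flag bit in cl_flags. A rough userspace model using C11 atomics; it spins where the kernel's wait_on_bit_lock() sleeps, and the explicit barriers and wake_up_bit() are omitted:

#include <stdatomic.h>
#include <stdio.h>

#define UPCALL_LOCK 0

static void upcall_lock(atomic_ulong *flags)
{
	/* loop until this caller is the one that set the bit */
	while (atomic_fetch_or(flags, 1UL << UPCALL_LOCK) &
	       (1UL << UPCALL_LOCK))
		;	/* the kernel sleeps here instead of spinning */
}

static void upcall_unlock(atomic_ulong *flags)
{
	/* kernel pairs this with wake_up_bit() */
	atomic_fetch_and(flags, ~(1UL << UPCALL_LOCK));
}

int main(void)
{
	atomic_ulong cl_flags = 0;

	upcall_lock(&cl_flags);
	printf("upcall in progress, flags=%lx\n", (unsigned long)cl_flags);
	upcall_unlock(&cl_flags);
	printf("done, flags=%lx\n", (unsigned long)cl_flags);
	return 0;
}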
+ */ +static DECLARE_WAIT_QUEUE_HEAD(close_wq); + static struct kmem_cache *openowner_slab; static struct kmem_cache *lockowner_slab; static struct kmem_cache *file_slab; static struct kmem_cache *stateid_slab; static struct kmem_cache *deleg_slab; -void -nfs4_lock_state(void) -{ - mutex_lock(&client_mutex); -} - static void free_session(struct nfsd4_session *); +static struct nfsd4_callback_ops nfsd4_cb_recall_ops; + static bool is_session_dead(struct nfsd4_session *ses) { return ses->se_flags & NFS4_SESSION_DEAD; } -void nfsd4_put_session(struct nfsd4_session *ses) -{ - if (atomic_dec_and_test(&ses->se_ref) && is_session_dead(ses)) - free_session(ses); -} - static __be32 mark_session_dead_locked(struct nfsd4_session *ses, int ref_held_by_me) { if (atomic_read(&ses->se_ref) > ref_held_by_me) @@ -117,46 +111,17 @@ static __be32 mark_session_dead_locked(struct nfsd4_session *ses, int ref_held_b return nfs_ok; } -static __be32 nfsd4_get_session_locked(struct nfsd4_session *ses) -{ - if (is_session_dead(ses)) - return nfserr_badsession; - atomic_inc(&ses->se_ref); - return nfs_ok; -} - -void -nfs4_unlock_state(void) -{ - mutex_unlock(&client_mutex); -} - static bool is_client_expired(struct nfs4_client *clp) { return clp->cl_time == 0; } -static __be32 mark_client_expired_locked(struct nfs4_client *clp) -{ - if (atomic_read(&clp->cl_refcount)) - return nfserr_jukebox; - clp->cl_time = 0; - return nfs_ok; -} - -static __be32 mark_client_expired(struct nfs4_client *clp) +static __be32 get_client_locked(struct nfs4_client *clp) { struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); - __be32 ret; - spin_lock(&nn->client_lock); - ret = mark_client_expired_locked(clp); - spin_unlock(&nn->client_lock); - return ret; -} + lockdep_assert_held(&nn->client_lock); -static __be32 get_client_locked(struct nfs4_client *clp) -{ if (is_client_expired(clp)) return nfserr_expired; atomic_inc(&clp->cl_refcount); @@ -197,13 +162,17 @@ renew_client(struct nfs4_client *clp) static void put_client_renew_locked(struct nfs4_client *clp) { + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + lockdep_assert_held(&nn->client_lock); + if (!atomic_dec_and_test(&clp->cl_refcount)) return; if (!is_client_expired(clp)) renew_client_locked(clp); } -void put_client_renew(struct nfs4_client *clp) +static void put_client_renew(struct nfs4_client *clp) { struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); @@ -214,6 +183,84 @@ void put_client_renew(struct nfs4_client *clp) spin_unlock(&nn->client_lock); } +static __be32 nfsd4_get_session_locked(struct nfsd4_session *ses) +{ + __be32 status; + + if (is_session_dead(ses)) + return nfserr_badsession; + status = get_client_locked(ses->se_client); + if (status) + return status; + atomic_inc(&ses->se_ref); + return nfs_ok; +} + +static void nfsd4_put_session_locked(struct nfsd4_session *ses) +{ + struct nfs4_client *clp = ses->se_client; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + lockdep_assert_held(&nn->client_lock); + + if (atomic_dec_and_test(&ses->se_ref) && is_session_dead(ses)) + free_session(ses); + put_client_renew_locked(clp); +} + +static void nfsd4_put_session(struct nfsd4_session *ses) +{ + struct nfs4_client *clp = ses->se_client; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + spin_lock(&nn->client_lock); + nfsd4_put_session_locked(ses); + spin_unlock(&nn->client_lock); +} + +static inline struct nfs4_stateowner * +nfs4_get_stateowner(struct nfs4_stateowner *sop) +{ + atomic_inc(&sop->so_count); + return sop; +} + 
+static int +same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner) +{ + return (sop->so_owner.len == owner->len) && + 0 == memcmp(sop->so_owner.data, owner->data, owner->len); +} + +static struct nfs4_openowner * +find_openstateowner_str_locked(unsigned int hashval, struct nfsd4_open *open, + struct nfs4_client *clp) +{ + struct nfs4_stateowner *so; + + lockdep_assert_held(&clp->cl_lock); + + list_for_each_entry(so, &clp->cl_ownerstr_hashtbl[hashval], + so_strhash) { + if (!so->so_is_open_owner) + continue; + if (same_owner_str(so, &open->op_owner)) + return openowner(nfs4_get_stateowner(so)); + } + return NULL; +} + +static struct nfs4_openowner * +find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open, + struct nfs4_client *clp) +{ + struct nfs4_openowner *oo; + + spin_lock(&clp->cl_lock); + oo = find_openstateowner_str_locked(hashval, open, clp); + spin_unlock(&clp->cl_lock); + return oo; +} static inline u32 opaque_hashval(const void *ptr, int nbytes) @@ -236,10 +283,11 @@ static void nfsd4_free_file(struct nfs4_file *f) static inline void put_nfs4_file(struct nfs4_file *fi) { + might_lock(&state_lock); + if (atomic_dec_and_lock(&fi->fi_ref, &state_lock)) { hlist_del(&fi->fi_hash); spin_unlock(&state_lock); - iput(fi->fi_inode); nfsd4_free_file(fi); } } @@ -250,7 +298,80 @@ get_nfs4_file(struct nfs4_file *fi) atomic_inc(&fi->fi_ref); } -static int num_delegations; +static struct file * +__nfs4_get_fd(struct nfs4_file *f, int oflag) +{ + if (f->fi_fds[oflag]) + return get_file(f->fi_fds[oflag]); + return NULL; +} + +static struct file * +find_writeable_file_locked(struct nfs4_file *f) +{ + struct file *ret; + + lockdep_assert_held(&f->fi_lock); + + ret = __nfs4_get_fd(f, O_WRONLY); + if (!ret) + ret = __nfs4_get_fd(f, O_RDWR); + return ret; +} + +static struct file * +find_writeable_file(struct nfs4_file *f) +{ + struct file *ret; + + spin_lock(&f->fi_lock); + ret = find_writeable_file_locked(f); + spin_unlock(&f->fi_lock); + + return ret; +} + +static struct file *find_readable_file_locked(struct nfs4_file *f) +{ + struct file *ret; + + lockdep_assert_held(&f->fi_lock); + + ret = __nfs4_get_fd(f, O_RDONLY); + if (!ret) + ret = __nfs4_get_fd(f, O_RDWR); + return ret; +} + +static struct file * +find_readable_file(struct nfs4_file *f) +{ + struct file *ret; + + spin_lock(&f->fi_lock); + ret = find_readable_file_locked(f); + spin_unlock(&f->fi_lock); + + return ret; +} + +static struct file * +find_any_file(struct nfs4_file *f) +{ + struct file *ret; + + spin_lock(&f->fi_lock); + ret = __nfs4_get_fd(f, O_RDWR); + if (!ret) { + ret = __nfs4_get_fd(f, O_WRONLY); + if (!ret) + ret = __nfs4_get_fd(f, O_RDONLY); + } + spin_unlock(&f->fi_lock); + return ret; +} + +static atomic_long_t num_delegations; unsigned long max_delegations; /* @@ -262,12 +383,11 @@ unsigned long max_delegations; #define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS) #define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1) -static unsigned int ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername) +static unsigned int ownerstr_hashval(struct xdr_netobj *ownername) { unsigned int ret; ret = opaque_hashval(ownername->data, ownername->len); - ret += clientid; return ret & OWNER_HASH_MASK; } @@ -275,75 +395,124 @@ static unsigned int ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername) #define FILE_HASH_BITS 8 #define FILE_HASH_SIZE (1 << FILE_HASH_BITS) -static unsigned int file_hashval(struct inode *ino) +static unsigned int nfsd_fh_hashval(struct knfsd_fh *fh) +{ + return 
jhash2(fh->fh_base.fh_pad, XDR_QUADLEN(fh->fh_size), 0); +} + +static unsigned int file_hashval(struct knfsd_fh *fh) +{ + return nfsd_fh_hashval(fh) & (FILE_HASH_SIZE - 1); +} + +static bool nfsd_fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2) { - /* XXX: why are we hashing on inode pointer, anyway? */ - return hash_ptr(ino, FILE_HASH_BITS); + return fh1->fh_size == fh2->fh_size && + !memcmp(fh1->fh_base.fh_pad, + fh2->fh_base.fh_pad, + fh1->fh_size); } static struct hlist_head file_hashtbl[FILE_HASH_SIZE]; -static void __nfs4_file_get_access(struct nfs4_file *fp, int oflag) +static void +__nfs4_file_get_access(struct nfs4_file *fp, u32 access) { - WARN_ON_ONCE(!(fp->fi_fds[oflag] || fp->fi_fds[O_RDWR])); - atomic_inc(&fp->fi_access[oflag]); + lockdep_assert_held(&fp->fi_lock); + + if (access & NFS4_SHARE_ACCESS_WRITE) + atomic_inc(&fp->fi_access[O_WRONLY]); + if (access & NFS4_SHARE_ACCESS_READ) + atomic_inc(&fp->fi_access[O_RDONLY]); } -static void nfs4_file_get_access(struct nfs4_file *fp, int oflag) +static __be32 +nfs4_file_get_access(struct nfs4_file *fp, u32 access) { - if (oflag == O_RDWR) { - __nfs4_file_get_access(fp, O_RDONLY); - __nfs4_file_get_access(fp, O_WRONLY); - } else - __nfs4_file_get_access(fp, oflag); + lockdep_assert_held(&fp->fi_lock); + + /* Does this access mode make sense? */ + if (access & ~NFS4_SHARE_ACCESS_BOTH) + return nfserr_inval; + + /* Does it conflict with a deny mode already set? */ + if ((access & fp->fi_share_deny) != 0) + return nfserr_share_denied; + + __nfs4_file_get_access(fp, access); + return nfs_ok; } -static void nfs4_file_put_fd(struct nfs4_file *fp, int oflag) +static __be32 nfs4_file_check_deny(struct nfs4_file *fp, u32 deny) { - if (fp->fi_fds[oflag]) { - fput(fp->fi_fds[oflag]); - fp->fi_fds[oflag] = NULL; + /* Common case is that there is no deny mode. */ + if (deny) { + /* Does this deny mode make sense? 
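The reworked access helpers above implement NFSv4 share reservations: a new open is refused when its access bits intersect deny modes already granted, or when its deny bits cover access modes already in use. A toy model of that bookkeeping, simplified to bare counters and no locking:

#include <stdio.h>

#define ACCESS_READ	1
#define ACCESS_WRITE	2

struct file_state {
	int readers, writers;	/* granted access counts */
	int share_deny;		/* union of granted deny modes */
};

static int try_open(struct file_state *f, int access, int deny)
{
	if (access & f->share_deny)
		return -1;			/* denied by an earlier open */
	if ((deny & ACCESS_READ) && f->readers)
		return -1;			/* cannot deny access in use */
	if ((deny & ACCESS_WRITE) && f->writers)
		return -1;

	if (access & ACCESS_READ)
		f->readers++;
	if (access & ACCESS_WRITE)
		f->writers++;
	f->share_deny |= deny;
	return 0;
}

int main(void)
{
	struct file_state f = { 0, 0, 0 };

	printf("open rw, deny none: %d\n",
	       try_open(&f, ACCESS_READ | ACCESS_WRITE, 0));	/* 0 */
	printf("open r, deny write: %d\n",
	       try_open(&f, ACCESS_READ, ACCESS_WRITE));	/* -1 */
	return 0;
}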
 
-static void nfs4_file_put_fd(struct nfs4_file *fp, int oflag)
+static __be32 nfs4_file_check_deny(struct nfs4_file *fp, u32 deny)
 {
-	if (fp->fi_fds[oflag]) {
-		fput(fp->fi_fds[oflag]);
-		fp->fi_fds[oflag] = NULL;
+	/* Common case is that there is no deny mode. */
+	if (deny) {
+		/* Does this deny mode make sense? */
+		if (deny & ~NFS4_SHARE_DENY_BOTH)
+			return nfserr_inval;
+
+		if ((deny & NFS4_SHARE_DENY_READ) &&
+		    atomic_read(&fp->fi_access[O_RDONLY]))
+			return nfserr_share_denied;
+
+		if ((deny & NFS4_SHARE_DENY_WRITE) &&
+		    atomic_read(&fp->fi_access[O_WRONLY]))
+			return nfserr_share_denied;
 	}
+	return nfs_ok;
 }
 
 static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag)
 {
-	if (atomic_dec_and_test(&fp->fi_access[oflag])) {
-		nfs4_file_put_fd(fp, oflag);
+	might_lock(&fp->fi_lock);
+
+	if (atomic_dec_and_lock(&fp->fi_access[oflag], &fp->fi_lock)) {
+		struct file *f1 = NULL;
+		struct file *f2 = NULL;
+
+		swap(f1, fp->fi_fds[oflag]);
 		if (atomic_read(&fp->fi_access[1 - oflag]) == 0)
-			nfs4_file_put_fd(fp, O_RDWR);
+			swap(f2, fp->fi_fds[O_RDWR]);
+		spin_unlock(&fp->fi_lock);
+		if (f1)
+			fput(f1);
+		if (f2)
+			fput(f2);
 	}
 }
 
-static void nfs4_file_put_access(struct nfs4_file *fp, int oflag)
+static void nfs4_file_put_access(struct nfs4_file *fp, u32 access)
 {
-	if (oflag == O_RDWR) {
-		__nfs4_file_put_access(fp, O_RDONLY);
+	WARN_ON_ONCE(access & ~NFS4_SHARE_ACCESS_BOTH);
+
+	if (access & NFS4_SHARE_ACCESS_WRITE)
 		__nfs4_file_put_access(fp, O_WRONLY);
-	} else
-		__nfs4_file_put_access(fp, oflag);
+	if (access & NFS4_SHARE_ACCESS_READ)
+		__nfs4_file_put_access(fp, O_RDONLY);
 }
 
-static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct
-kmem_cache *slab)
+static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
+					 struct kmem_cache *slab)
 {
-	struct idr *stateids = &cl->cl_stateids;
 	struct nfs4_stid *stid;
 	int new_id;
 
-	stid = kmem_cache_alloc(slab, GFP_KERNEL);
+	stid = kmem_cache_zalloc(slab, GFP_KERNEL);
 	if (!stid)
 		return NULL;
 
-	new_id = idr_alloc_cyclic(stateids, stid, 0, 0, GFP_KERNEL);
+	idr_preload(GFP_KERNEL);
+	spin_lock(&cl->cl_lock);
+	new_id = idr_alloc_cyclic(&cl->cl_stateids, stid, 0, 0, GFP_NOWAIT);
+	spin_unlock(&cl->cl_lock);
+	idr_preload_end();
 	if (new_id < 0)
 		goto out_free;
 	stid->sc_client = cl;
-	stid->sc_type = 0;
 	stid->sc_stateid.si_opaque.so_id = new_id;
 	stid->sc_stateid.si_opaque.so_clid = cl->cl_clientid;
 	/* Will be incremented before return to client: */
-	stid->sc_stateid.si_generation = 0;
+	atomic_set(&stid->sc_count, 1);
 
 	/*
 	 * It shouldn't be a problem to reuse an opaque stateid value.
@@ -360,9 +529,24 @@ out_free:
 	return NULL;
 }
 
-static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp)
+static struct nfs4_ol_stateid * nfs4_alloc_open_stateid(struct nfs4_client *clp)
 {
-	return openlockstateid(nfs4_alloc_stid(clp, stateid_slab));
+	struct nfs4_stid *stid;
+	struct nfs4_ol_stateid *stp;
+
+	stid = nfs4_alloc_stid(clp, stateid_slab);
+	if (!stid)
+		return NULL;
+
+	stp = openlockstateid(stid);
+	stp->st_stid.sc_free = nfs4_free_ol_stateid;
+	return stp;
+}
+
+static void nfs4_free_deleg(struct nfs4_stid *stid)
+{
+	kmem_cache_free(deleg_slab, stid);
+	atomic_long_dec(&num_delegations);
 }
 
 /*
@@ -379,10 +563,11 @@ static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp)
  * Each filter is 256 bits.  We hash the filehandle to 32bit and use the
  * low 3 bytes as hash-table indices.
  *
- * 'state_lock', which is always held when block_delegations() is called,
+ * 'blocked_delegations_lock', which is always taken in block_delegations(),
  * is used to manage concurrent access.  Testing does not need the lock
  * except when swapping the two filters.
  */
+static DEFINE_SPINLOCK(blocked_delegations_lock);
 static struct bloom_pair {
 	int	entries, old_entries;
 	time_t	swap_time;
@@ -398,7 +583,7 @@ static int delegation_blocked(struct knfsd_fh *fh)
 	if (bd->entries == 0)
 		return 0;
 	if (seconds_since_boot() - bd->swap_time > 30) {
-		spin_lock(&state_lock);
+		spin_lock(&blocked_delegations_lock);
 		if (seconds_since_boot() - bd->swap_time > 30) {
 			bd->entries -= bd->old_entries;
 			bd->old_entries = bd->entries;
@@ -407,7 +592,7 @@ static int delegation_blocked(struct knfsd_fh *fh)
 			bd->new = 1-bd->new;
 			bd->swap_time = seconds_since_boot();
 		}
-		spin_unlock(&state_lock);
+		spin_unlock(&blocked_delegations_lock);
 	}
 	hash = arch_fast_hash(&fh->fh_base, fh->fh_size, 0);
 	if (test_bit(hash&255, bd->set[0]) &&
@@ -430,76 +615,83 @@ static void block_delegations(struct knfsd_fh *fh)
 
 	hash = arch_fast_hash(&fh->fh_base, fh->fh_size, 0);
 
+	spin_lock(&blocked_delegations_lock);
 	__set_bit(hash&255, bd->set[bd->new]);
 	__set_bit((hash>>8)&255, bd->set[bd->new]);
 	__set_bit((hash>>16)&255, bd->set[bd->new]);
 	if (bd->entries == 0)
 		bd->swap_time = seconds_since_boot();
 	bd->entries += 1;
+	spin_unlock(&blocked_delegations_lock);
 }
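For readers following the bloom-filter change: block_delegations() sets three byte positions derived from one 32-bit filehandle hash in the newer of two 256-bit sets, and entries age out because the older generation is discarded roughly every 30 seconds by swapping. A compilable userspace approximation is below; it uses one byte per bit for clarity where the kernel packs bits, renames the "new" field (a C++ keyword) to new_idx, and elides the spinlock that the real code takes around the swap and the inserts:

#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <time.h>

static struct {
	int entries, old_entries;
	time_t swap_time;
	int new_idx;
	unsigned char set[2][256];
} bd;

/* Discard the older generation every 30 seconds by swapping filters. */
static void maybe_swap(time_t now)
{
	if (bd.entries && now - bd.swap_time > 30) {
		bd.entries -= bd.old_entries;
		bd.old_entries = bd.entries;
		memset(bd.set[bd.new_idx], 0, 256);
		bd.new_idx = 1 - bd.new_idx;
		bd.swap_time = now;
	}
}

/* Mirrors block_delegations(): three derived indices from one hash. */
static void block_hash(uint32_t h, time_t now)
{
	maybe_swap(now);
	bd.set[bd.new_idx][h & 255] = 1;
	bd.set[bd.new_idx][(h >> 8) & 255] = 1;
	bd.set[bd.new_idx][(h >> 16) & 255] = 1;
	if (bd.entries == 0)
		bd.swap_time = now;
	bd.entries++;
}

/* Mirrors delegation_blocked(): all three bits set in either generation. */
static bool hash_blocked(uint32_t h)
{
	int i;

	for (i = 0; i < 2; i++)
		if (bd.set[i][h & 255] && bd.set[i][(h >> 8) & 255] &&
		    bd.set[i][(h >> 16) & 255])
			return true;
	return false;
}

As with any bloom filter, false positives are possible (an unrelated filehandle may be refused a delegation for up to a minute), but false negatives are not.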
 
 static struct nfs4_delegation *
-alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh)
+alloc_init_deleg(struct nfs4_client *clp, struct svc_fh *current_fh)
 {
 	struct nfs4_delegation *dp;
+	long n;
 
 	dprintk("NFSD alloc_init_deleg\n");
-	if (num_delegations > max_delegations)
-		return NULL;
+	n = atomic_long_inc_return(&num_delegations);
+	if (n < 0 || n > max_delegations)
+		goto out_dec;
 	if (delegation_blocked(&current_fh->fh_handle))
-		return NULL;
+		goto out_dec;
 	dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab));
 	if (dp == NULL)
-		return dp;
+		goto out_dec;
+
+	dp->dl_stid.sc_free = nfs4_free_deleg;
 	/*
 	 * delegation seqid's are never incremented.  The 4.1 special
	 * meaning of seqid 0 isn't meaningful, really, but let's avoid
 	 * 0 anyway just for consistency and use 1:
 	 */
 	dp->dl_stid.sc_stateid.si_generation = 1;
-	num_delegations++;
 	INIT_LIST_HEAD(&dp->dl_perfile);
 	INIT_LIST_HEAD(&dp->dl_perclnt);
 	INIT_LIST_HEAD(&dp->dl_recall_lru);
-	dp->dl_file = NULL;
 	dp->dl_type = NFS4_OPEN_DELEGATE_READ;
-	fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle);
-	dp->dl_time = 0;
-	atomic_set(&dp->dl_count, 1);
-	nfsd4_init_callback(&dp->dl_recall);
+	dp->dl_retries = 1;
+	nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client,
+		      &nfsd4_cb_recall_ops, NFSPROC4_CLNT_CB_RECALL);
 	return dp;
+out_dec:
+	atomic_long_dec(&num_delegations);
+	return NULL;
 }
 
-static void remove_stid(struct nfs4_stid *s)
+void
+nfs4_put_stid(struct nfs4_stid *s)
 {
-	struct idr *stateids = &s->sc_client->cl_stateids;
+	struct nfs4_file *fp = s->sc_file;
+	struct nfs4_client *clp = s->sc_client;
 
-	idr_remove(stateids, s->sc_stateid.si_opaque.so_id);
-}
+	might_lock(&clp->cl_lock);
 
-static void nfs4_free_stid(struct kmem_cache *slab, struct nfs4_stid *s)
-{
-	kmem_cache_free(slab, s);
-}
-
-void
-nfs4_put_delegation(struct nfs4_delegation *dp)
-{
-	if (atomic_dec_and_test(&dp->dl_count)) {
-		nfs4_free_stid(deleg_slab, &dp->dl_stid);
-		num_delegations--;
+	if (!atomic_dec_and_lock(&s->sc_count, &clp->cl_lock)) {
+		wake_up_all(&close_wq);
+		return;
 	}
+	idr_remove(&clp->cl_stateids, s->sc_stateid.si_opaque.so_id);
+	spin_unlock(&clp->cl_lock);
+	s->sc_free(s);
+	if (fp)
+		put_nfs4_file(fp);
 }
 
 static void nfs4_put_deleg_lease(struct nfs4_file *fp)
 {
-	if (!fp->fi_lease)
-		return;
-	if (atomic_dec_and_test(&fp->fi_delegees)) {
-		vfs_setlease(fp->fi_deleg_file, F_UNLCK, &fp->fi_lease);
-		fp->fi_lease = NULL;
-		fput(fp->fi_deleg_file);
-		fp->fi_deleg_file = NULL;
+	struct file *filp = NULL;
+
+	spin_lock(&fp->fi_lock);
+	if (fp->fi_deleg_file && atomic_dec_and_test(&fp->fi_delegees))
+		swap(filp, fp->fi_deleg_file);
+	spin_unlock(&fp->fi_lock);
+
+	if (filp) {
+		vfs_setlease(filp, F_UNLCK, NULL, NULL);
+		fput(filp);
 	}
 }
 
@@ -512,54 +704,55 @@ static void
 hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp)
 {
 	lockdep_assert_held(&state_lock);
+	lockdep_assert_held(&fp->fi_lock);
+	atomic_inc(&dp->dl_stid.sc_count);
 	dp->dl_stid.sc_type = NFS4_DELEG_STID;
 	list_add(&dp->dl_perfile, &fp->fi_delegations);
 	list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations);
 }
 
-/* Called under the state lock. */
 static void
-unhash_delegation(struct nfs4_delegation *dp)
+unhash_delegation_locked(struct nfs4_delegation *dp)
 {
-	spin_lock(&state_lock);
-	list_del_init(&dp->dl_perclnt);
-	list_del_init(&dp->dl_perfile);
-	list_del_init(&dp->dl_recall_lru);
-	spin_unlock(&state_lock);
-	if (dp->dl_file) {
-		nfs4_put_deleg_lease(dp->dl_file);
-		put_nfs4_file(dp->dl_file);
-		dp->dl_file = NULL;
-	}
-}
-
+	struct nfs4_file *fp = dp->dl_stid.sc_file;
 
+	lockdep_assert_held(&state_lock);
 
-static void destroy_revoked_delegation(struct nfs4_delegation *dp)
-{
+	dp->dl_stid.sc_type = NFS4_CLOSED_DELEG_STID;
+	/* Ensure that deleg break won't try to requeue it */
+	++dp->dl_time;
+	spin_lock(&fp->fi_lock);
+	list_del_init(&dp->dl_perclnt);
 	list_del_init(&dp->dl_recall_lru);
-	remove_stid(&dp->dl_stid);
-	nfs4_put_delegation(dp);
+	list_del_init(&dp->dl_perfile);
+	spin_unlock(&fp->fi_lock);
 }
 
 static void destroy_delegation(struct nfs4_delegation *dp)
 {
-	unhash_delegation(dp);
-	remove_stid(&dp->dl_stid);
-	nfs4_put_delegation(dp);
+	spin_lock(&state_lock);
+	unhash_delegation_locked(dp);
+	spin_unlock(&state_lock);
+	nfs4_put_deleg_lease(dp->dl_stid.sc_file);
+	nfs4_put_stid(&dp->dl_stid);
 }
 
 static void revoke_delegation(struct nfs4_delegation *dp)
 {
 	struct nfs4_client *clp = dp->dl_stid.sc_client;
 
+	WARN_ON(!list_empty(&dp->dl_recall_lru));
+
+	nfs4_put_deleg_lease(dp->dl_stid.sc_file);
+
 	if (clp->cl_minorversion == 0)
-		destroy_delegation(dp);
+		nfs4_put_stid(&dp->dl_stid);
 	else {
-		unhash_delegation(dp);
 		dp->dl_stid.sc_type = NFS4_REVOKED_DELEG_STID;
+		spin_lock(&clp->cl_lock);
 		list_add(&dp->dl_recall_lru, &clp->cl_revoked);
+		spin_unlock(&clp->cl_lock);
 	}
 }
 
@@ -607,57 +800,62 @@ bmap_to_share_mode(unsigned long bmap) {
 	return access;
 }
 
-static bool
-test_share(struct nfs4_ol_stateid *stp, struct nfsd4_open *open) {
-	unsigned int access, deny;
-
-	access = bmap_to_share_mode(stp->st_access_bmap);
-	deny = bmap_to_share_mode(stp->st_deny_bmap);
-	if ((access & open->op_share_deny) || (deny & open->op_share_access))
-		return false;
-	return true;
-}
-
 /* set share access for a given stateid */
 static inline void
 set_access(u32 access, struct nfs4_ol_stateid *stp)
 {
-	__set_bit(access, &stp->st_access_bmap);
+	unsigned char mask = 1 << access;
+
+	WARN_ON_ONCE(access > NFS4_SHARE_ACCESS_BOTH);
+	stp->st_access_bmap |= mask;
 }
 
 /* clear share access for a given stateid */
 static inline void
 clear_access(u32 access, struct nfs4_ol_stateid *stp)
 {
-	__clear_bit(access, &stp->st_access_bmap);
+	unsigned char mask = 1 << access;
+
+	WARN_ON_ONCE(access > NFS4_SHARE_ACCESS_BOTH);
	stp->st_access_bmap &= ~mask;
 }
 
 /* test whether a given stateid has access */
 static inline bool
 test_access(u32 access, struct nfs4_ol_stateid *stp)
 {
-	return test_bit(access, &stp->st_access_bmap);
+	unsigned char mask = 1 << access;
+
+	return (bool)(stp->st_access_bmap & mask);
 }
 
 /* set share deny for a given stateid */
 static inline void
-set_deny(u32 access, struct nfs4_ol_stateid *stp)
+set_deny(u32 deny, struct nfs4_ol_stateid *stp)
 {
-	__set_bit(access, &stp->st_deny_bmap);
+	unsigned char mask = 1 << deny;
+
+	WARN_ON_ONCE(deny > NFS4_SHARE_DENY_BOTH);
+	stp->st_deny_bmap |= mask;
 }
 
 /* clear share deny for a given stateid */
 static inline void
-clear_deny(u32 access, struct nfs4_ol_stateid *stp)
+clear_deny(u32 deny, struct nfs4_ol_stateid *stp)
 {
-	__clear_bit(access, &stp->st_deny_bmap);
+	unsigned char mask = 1 << deny;
+
+	WARN_ON_ONCE(deny > NFS4_SHARE_DENY_BOTH);
+	stp->st_deny_bmap &= ~mask;
 }
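The rewritten bmap helpers store each share mode at bit position 1 << mode, so the wire values 1..3 (READ, WRITE, BOTH) occupy bits 1..3 of an unsigned char, and bmap_to_share_mode() collapses the bitmap back into an OR of wire modes. A small self-contained demonstration; the constants mirror the NFSv4 values, while the helper names are local stand-ins:

#include <assert.h>

#define SHARE_DENY_READ  1
#define SHARE_DENY_WRITE 2
#define SHARE_DENY_BOTH  3

/* Mirror set_deny()/test_deny(): the deny value indexes a bit. */
static void set_deny(unsigned char *bmap, unsigned deny)
{
	*bmap |= 1 << deny;
}

static int test_deny(unsigned char bmap, unsigned deny)
{
	return bmap & (1 << deny);
}

/* Mirror bmap_to_share_mode(): OR together every mode whose bit is set. */
static unsigned bmap_to_share_mode(unsigned char bmap)
{
	unsigned i, mode = 0;

	for (i = 1; i < 4; i++)
		if (bmap & (1 << i))
			mode |= i;
	return mode;
}

int main(void)
{
	unsigned char bmap = 0;

	set_deny(&bmap, SHARE_DENY_READ);
	set_deny(&bmap, SHARE_DENY_BOTH);
	assert(test_deny(bmap, SHARE_DENY_BOTH));
	/* DENY_READ | DENY_BOTH collapses to DENY_BOTH (0x3). */
	assert(bmap_to_share_mode(bmap) == SHARE_DENY_BOTH);
	return 0;
}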
 
 /* test whether a given stateid is denying specific access */
 static inline bool
-test_deny(u32 access, struct nfs4_ol_stateid *stp)
+test_deny(u32 deny, struct nfs4_ol_stateid *stp)
 {
-	return test_bit(access, &stp->st_deny_bmap);
+	unsigned char mask = 1 << deny;
+
+	return (bool)(stp->st_deny_bmap & mask);
 }
 
 static int nfs4_access_to_omode(u32 access)
@@ -674,138 +872,283 @@ static int nfs4_access_to_omode(u32 access)
 	return O_RDONLY;
 }
 
+/*
+ * A stateid that had a deny mode associated with it is being released
+ * or downgraded. Recalculate the deny mode on the file.
+ */
+static void
+recalculate_deny_mode(struct nfs4_file *fp)
+{
+	struct nfs4_ol_stateid *stp;
+
+	spin_lock(&fp->fi_lock);
+	fp->fi_share_deny = 0;
+	list_for_each_entry(stp, &fp->fi_stateids, st_perfile)
+		fp->fi_share_deny |= bmap_to_share_mode(stp->st_deny_bmap);
+	spin_unlock(&fp->fi_lock);
+}
+
+static void
+reset_union_bmap_deny(u32 deny, struct nfs4_ol_stateid *stp)
+{
+	int i;
+	bool change = false;
+
+	for (i = 1; i < 4; i++) {
+		if ((i & deny) != i) {
+			change = true;
+			clear_deny(i, stp);
+		}
+	}
+
+	/* Recalculate per-file deny mode if there was a change */
+	if (change)
+		recalculate_deny_mode(stp->st_stid.sc_file);
+}
+
 /* release all access and file references for a given stateid */
 static void
 release_all_access(struct nfs4_ol_stateid *stp)
 {
 	int i;
+	struct nfs4_file *fp = stp->st_stid.sc_file;
+
+	if (fp && stp->st_deny_bmap != 0)
+		recalculate_deny_mode(fp);
 
 	for (i = 1; i < 4; i++) {
 		if (test_access(i, stp))
-			nfs4_file_put_access(stp->st_file,
-					     nfs4_access_to_omode(i));
+			nfs4_file_put_access(stp->st_stid.sc_file, i);
 		clear_access(i, stp);
 	}
 }
 
-static void unhash_generic_stateid(struct nfs4_ol_stateid *stp)
+static void nfs4_put_stateowner(struct nfs4_stateowner *sop)
 {
+	struct nfs4_client *clp = sop->so_client;
+
+	might_lock(&clp->cl_lock);
+
+	if (!atomic_dec_and_lock(&sop->so_count, &clp->cl_lock))
+		return;
+	sop->so_ops->so_unhash(sop);
+	spin_unlock(&clp->cl_lock);
+	kfree(sop->so_owner.data);
+	sop->so_ops->so_free(sop);
+}
+
+static void unhash_ol_stateid(struct nfs4_ol_stateid *stp)
+{
+	struct nfs4_file *fp = stp->st_stid.sc_file;
+
+	lockdep_assert_held(&stp->st_stateowner->so_client->cl_lock);
+
+	spin_lock(&fp->fi_lock);
 	list_del(&stp->st_perfile);
+	spin_unlock(&fp->fi_lock);
 	list_del(&stp->st_perstateowner);
 }
 
-static void close_generic_stateid(struct nfs4_ol_stateid *stp)
+static void nfs4_free_ol_stateid(struct nfs4_stid *stid)
 {
+	struct nfs4_ol_stateid *stp = openlockstateid(stid);
+
 	release_all_access(stp);
-	put_nfs4_file(stp->st_file);
-	stp->st_file = NULL;
+	if (stp->st_stateowner)
+		nfs4_put_stateowner(stp->st_stateowner);
+	kmem_cache_free(stateid_slab, stid);
 }
 
-static void free_generic_stateid(struct nfs4_ol_stateid *stp)
+static void nfs4_free_lock_stateid(struct nfs4_stid *stid)
 {
-	remove_stid(&stp->st_stid);
-	nfs4_free_stid(stateid_slab, &stp->st_stid);
+	struct nfs4_ol_stateid *stp = openlockstateid(stid);
+	struct nfs4_lockowner *lo = lockowner(stp->st_stateowner);
+	struct file *file;
+
+	file = find_any_file(stp->st_stid.sc_file);
+	if (file)
+		filp_close(file, (fl_owner_t)lo);
+	nfs4_free_ol_stateid(stid);
 }
 
-static void release_lock_stateid(struct nfs4_ol_stateid *stp)
+/*
+ * Put the persistent reference to an already unhashed generic stateid, while
+ * holding the cl_lock. If it's the last reference, then put it onto the
+ * reaplist for later destruction.
+ */
+static void put_ol_stateid_locked(struct nfs4_ol_stateid *stp,
+				       struct list_head *reaplist)
 {
-	struct file *file;
+	struct nfs4_stid *s = &stp->st_stid;
+	struct nfs4_client *clp = s->sc_client;
+
+	lockdep_assert_held(&clp->cl_lock);
+
+	WARN_ON_ONCE(!list_empty(&stp->st_locks));
+
+	if (!atomic_dec_and_test(&s->sc_count)) {
+		wake_up_all(&close_wq);
+		return;
+	}
+
+	idr_remove(&clp->cl_stateids, s->sc_stateid.si_opaque.so_id);
+	list_add(&stp->st_locks, reaplist);
+}
 
-	unhash_generic_stateid(stp);
+static void unhash_lock_stateid(struct nfs4_ol_stateid *stp)
+{
+	struct nfs4_openowner *oo = openowner(stp->st_openstp->st_stateowner);
+
+	lockdep_assert_held(&oo->oo_owner.so_client->cl_lock);
+
+	list_del_init(&stp->st_locks);
+	unhash_ol_stateid(stp);
 	unhash_stid(&stp->st_stid);
-	file = find_any_file(stp->st_file);
-	if (file)
-		locks_remove_posix(file, (fl_owner_t)lockowner(stp->st_stateowner));
-	close_generic_stateid(stp);
-	free_generic_stateid(stp);
 }
 
-static void unhash_lockowner(struct nfs4_lockowner *lo)
+static void release_lock_stateid(struct nfs4_ol_stateid *stp)
 {
-	struct nfs4_ol_stateid *stp;
+	struct nfs4_openowner *oo = openowner(stp->st_openstp->st_stateowner);
 
-	list_del(&lo->lo_owner.so_strhash);
-	list_del(&lo->lo_perstateid);
-	list_del(&lo->lo_owner_ino_hash);
-	while (!list_empty(&lo->lo_owner.so_stateids)) {
-		stp = list_first_entry(&lo->lo_owner.so_stateids,
-				struct nfs4_ol_stateid, st_perstateowner);
-		release_lock_stateid(stp);
-	}
+	spin_lock(&oo->oo_owner.so_client->cl_lock);
+	unhash_lock_stateid(stp);
+	spin_unlock(&oo->oo_owner.so_client->cl_lock);
+	nfs4_put_stid(&stp->st_stid);
 }
 
-static void nfs4_free_lockowner(struct nfs4_lockowner *lo)
+static void unhash_lockowner_locked(struct nfs4_lockowner *lo)
 {
-	kfree(lo->lo_owner.so_owner.data);
-	kmem_cache_free(lockowner_slab, lo);
+	struct nfs4_client *clp = lo->lo_owner.so_client;
+
+	lockdep_assert_held(&clp->cl_lock);
+
+	list_del_init(&lo->lo_owner.so_strhash);
+}
+
+/*
+ * Free a list of generic stateids that were collected earlier after being
+ * fully unhashed.
+ */
+static void
+free_ol_stateid_reaplist(struct list_head *reaplist)
+{
+	struct nfs4_ol_stateid *stp;
+	struct nfs4_file *fp;
+
+	might_sleep();
+
+	while (!list_empty(reaplist)) {
+		stp = list_first_entry(reaplist, struct nfs4_ol_stateid,
+				       st_locks);
+		list_del(&stp->st_locks);
+		fp = stp->st_stid.sc_file;
+		stp->st_stid.sc_free(&stp->st_stid);
+		if (fp)
+			put_nfs4_file(fp);
+	}
 }
 
 static void release_lockowner(struct nfs4_lockowner *lo)
 {
-	unhash_lockowner(lo);
-	nfs4_free_lockowner(lo);
+	struct nfs4_client *clp = lo->lo_owner.so_client;
+	struct nfs4_ol_stateid *stp;
+	struct list_head reaplist;
+
+	INIT_LIST_HEAD(&reaplist);
+
+	spin_lock(&clp->cl_lock);
+	unhash_lockowner_locked(lo);
+	while (!list_empty(&lo->lo_owner.so_stateids)) {
+		stp = list_first_entry(&lo->lo_owner.so_stateids,
+				struct nfs4_ol_stateid, st_perstateowner);
+		unhash_lock_stateid(stp);
+		put_ol_stateid_locked(stp, &reaplist);
+	}
+	spin_unlock(&clp->cl_lock);
+	free_ol_stateid_reaplist(&reaplist);
+	nfs4_put_stateowner(&lo->lo_owner);
 }
 
-static void
-release_stateid_lockowners(struct nfs4_ol_stateid *open_stp)
+static void release_open_stateid_locks(struct nfs4_ol_stateid *open_stp,
+				       struct list_head *reaplist)
 {
-	struct nfs4_lockowner *lo;
+	struct nfs4_ol_stateid *stp;
 
-	while (!list_empty(&open_stp->st_lockowners)) {
-		lo = list_entry(open_stp->st_lockowners.next,
-				struct nfs4_lockowner, lo_perstateid);
-		release_lockowner(lo);
+	while (!list_empty(&open_stp->st_locks)) {
+		stp = list_entry(open_stp->st_locks.next,
+				struct nfs4_ol_stateid, st_locks);
+		unhash_lock_stateid(stp);
+		put_ol_stateid_locked(stp, reaplist);
 	}
 }
 
-static void unhash_open_stateid(struct nfs4_ol_stateid *stp)
+static void unhash_open_stateid(struct nfs4_ol_stateid *stp,
+				struct list_head *reaplist)
 {
-	unhash_generic_stateid(stp);
-	release_stateid_lockowners(stp);
-	close_generic_stateid(stp);
+	lockdep_assert_held(&stp->st_stid.sc_client->cl_lock);
+
+	unhash_ol_stateid(stp);
+	release_open_stateid_locks(stp, reaplist);
 }
 
 static void release_open_stateid(struct nfs4_ol_stateid *stp)
 {
-	unhash_open_stateid(stp);
-	free_generic_stateid(stp);
+	LIST_HEAD(reaplist);
+
+	spin_lock(&stp->st_stid.sc_client->cl_lock);
+	unhash_open_stateid(stp, &reaplist);
+	put_ol_stateid_locked(stp, &reaplist);
+	spin_unlock(&stp->st_stid.sc_client->cl_lock);
+	free_ol_stateid_reaplist(&reaplist);
 }
 
-static void unhash_openowner(struct nfs4_openowner *oo)
+static void unhash_openowner_locked(struct nfs4_openowner *oo)
 {
-	struct nfs4_ol_stateid *stp;
+	struct nfs4_client *clp = oo->oo_owner.so_client;
 
-	list_del(&oo->oo_owner.so_strhash);
-	list_del(&oo->oo_perclient);
-	while (!list_empty(&oo->oo_owner.so_stateids)) {
-		stp = list_first_entry(&oo->oo_owner.so_stateids,
-				struct nfs4_ol_stateid, st_perstateowner);
-		release_open_stateid(stp);
-	}
+	lockdep_assert_held(&clp->cl_lock);
+
+	list_del_init(&oo->oo_owner.so_strhash);
+	list_del_init(&oo->oo_perclient);
 }
 
 static void release_last_closed_stateid(struct nfs4_openowner *oo)
 {
-	struct nfs4_ol_stateid *s = oo->oo_last_closed_stid;
+	struct nfsd_net *nn = net_generic(oo->oo_owner.so_client->net,
+					  nfsd_net_id);
+	struct nfs4_ol_stateid *s;
 
+	spin_lock(&nn->client_lock);
+	s = oo->oo_last_closed_stid;
 	if (s) {
-		free_generic_stateid(s);
+		list_del_init(&oo->oo_close_lru);
 		oo->oo_last_closed_stid = NULL;
 	}
-}
-
-static void nfs4_free_openowner(struct nfs4_openowner *oo)
-{
-	kfree(oo->oo_owner.so_owner.data);
-	kmem_cache_free(openowner_slab, oo);
+	spin_unlock(&nn->client_lock);
+	if (s)
+		nfs4_put_stid(&s->st_stid);
 }
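The reaplist idiom introduced here recurs through the rest of this patch: everything is unhashed while cl_lock (a spinlock) is held, but the destruction itself, which can sleep, runs only after the unlock. A userspace sketch of the same shape, with a pthread mutex standing in for cl_lock and an ad-hoc singly linked list standing in for list_head (all names are illustrative):

#include <pthread.h>
#include <stdlib.h>

/* Toy stateid with an intrusive reap link. */
struct stateid {
	struct stateid *next;
	struct stateid *reap_next;
	int hashed;
};

static pthread_mutex_t cl_lock = PTHREAD_MUTEX_INITIALIZER;
static struct stateid *owner_list;

/*
 * Mirror release_lockowner(): unhash everything while holding the lock,
 * collect it on a private reaplist, then do the (potentially sleeping)
 * destruction only after the lock is dropped.
 */
static void release_all(void)
{
	struct stateid *reaplist = NULL, *st;

	pthread_mutex_lock(&cl_lock);
	while ((st = owner_list) != NULL) {
		owner_list = st->next;
		st->hashed = 0;			/* unhash under the lock */
		st->reap_next = reaplist;
		reaplist = st;			/* defer the free */
	}
	pthread_mutex_unlock(&cl_lock);

	while ((st = reaplist) != NULL) {	/* free_ol_stateid_reaplist() */
		reaplist = st->reap_next;
		free(st);			/* may sleep in the kernel */
	}
}

The private list needs no locking of its own because it is only ever reachable from the stack of the thread that built it.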
 
 static void release_openowner(struct nfs4_openowner *oo)
 {
-	unhash_openowner(oo);
-	list_del(&oo->oo_close_lru);
+	struct nfs4_ol_stateid *stp;
+	struct nfs4_client *clp = oo->oo_owner.so_client;
+	struct list_head reaplist;
+
+	INIT_LIST_HEAD(&reaplist);
+
+	spin_lock(&clp->cl_lock);
+	unhash_openowner_locked(oo);
+	while (!list_empty(&oo->oo_owner.so_stateids)) {
+		stp = list_first_entry(&oo->oo_owner.so_stateids,
+				struct nfs4_ol_stateid, st_perstateowner);
+		unhash_open_stateid(stp, &reaplist);
+		put_ol_stateid_locked(stp, &reaplist);
+	}
+	spin_unlock(&clp->cl_lock);
+	free_ol_stateid_reaplist(&reaplist);
 	release_last_closed_stateid(oo);
-	nfs4_free_openowner(oo);
+	nfs4_put_stateowner(&oo->oo_owner);
 }
 
 static inline int
@@ -842,7 +1185,7 @@ void nfsd4_bump_seqid(struct nfsd4_compound_state *cstate, __be32 nfserr)
 		return;
 	if (!seqid_mutating_err(ntohl(nfserr))) {
-		cstate->replay_owner = NULL;
+		nfsd4_cstate_clear_replay(cstate);
 		return;
 	}
 	if (!so)
@@ -1030,10 +1373,8 @@ static void nfsd4_init_conn(struct svc_rqst *rqstp, struct nfsd4_conn *conn, str
 	if (ret)
 		/* oops; xprt is already down: */
 		nfsd4_conn_lost(&conn->cn_xpt_user);
-	if (conn->cn_flags & NFS4_CDFC4_BACK) {
-		/* callback channel may be back up */
-		nfsd4_probe_callback(ses->se_client);
-	}
+	/* We may have gained or lost a callback channel: */
+	nfsd4_probe_callback_sync(ses->se_client);
 }
 
 static struct nfsd4_conn *alloc_conn_from_crses(struct svc_rqst *rqstp, struct nfsd4_create_session *cses)
@@ -1073,9 +1414,6 @@ static void __free_session(struct nfsd4_session *ses)
 
 static void free_session(struct nfsd4_session *ses)
 {
-	struct nfsd_net *nn = net_generic(ses->se_client->net, nfsd_net_id);
-
-	lockdep_assert_held(&nn->client_lock);
 	nfsd4_del_conns(ses);
 	nfsd4_put_drc_mem(&ses->se_fchannel);
 	__free_session(ses);
@@ -1097,12 +1435,10 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru
 	new->se_cb_sec = cses->cb_sec;
 	atomic_set(&new->se_ref, 0);
 	idx = hash_sessionid(&new->se_sessionid);
-	spin_lock(&nn->client_lock);
 	list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]);
 	spin_lock(&clp->cl_lock);
 	list_add(&new->se_perclnt, &clp->cl_sessions);
 	spin_unlock(&clp->cl_lock);
-	spin_unlock(&nn->client_lock);
 
 	if (cses->flags & SESSION4_BACK_CHAN) {
 		struct sockaddr *sa = svc_addr(rqstp);
@@ -1120,12 +1456,14 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru
 
 /* caller must hold client_lock */
 static struct nfsd4_session *
-find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid, struct net *net)
+__find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid, struct net *net)
 {
 	struct nfsd4_session *elem;
 	int idx;
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 
+	lockdep_assert_held(&nn->client_lock);
+
 	dump_sessionid(__func__, sessionid);
 	idx = hash_sessionid(sessionid);
 	/* Search in the appropriate list */
@@ -1140,10 +1478,33 @@ find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid, struct net *net)
 	return NULL;
 }
 
+static struct nfsd4_session *
+find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid, struct net *net,
+		__be32 *ret)
+{
+	struct nfsd4_session *session;
+	__be32 status = nfserr_badsession;
+
+	session = __find_in_sessionid_hashtbl(sessionid, net);
+	if (!session)
+		goto out;
+	status = nfsd4_get_session_locked(session);
+	if (status)
+		session = NULL;
+out:
+	*ret = status;
+	return session;
+}
+
 /* caller must hold client_lock */
 static void
 unhash_session(struct nfsd4_session *ses)
 {
+	struct nfs4_client *clp = ses->se_client;
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	lockdep_assert_held(&nn->client_lock);
+
 	list_del(&ses->se_hash);
 	spin_lock(&ses->se_client->cl_lock);
 	list_del(&ses->se_perclnt);
@@ -1169,15 +1530,20 @@ STALE_CLIENTID(clientid_t *clid, struct nfsd_net *nn)
 static struct nfs4_client *alloc_client(struct xdr_netobj name)
 {
 	struct nfs4_client *clp;
+	int i;
 
 	clp = kzalloc(sizeof(struct nfs4_client), GFP_KERNEL);
 	if (clp == NULL)
 		return NULL;
 	clp->cl_name.data = kmemdup(name.data, name.len, GFP_KERNEL);
-	if (clp->cl_name.data == NULL) {
-		kfree(clp);
-		return NULL;
-	}
+	if (clp->cl_name.data == NULL)
+		goto err_no_name;
+	clp->cl_ownerstr_hashtbl = kmalloc(sizeof(struct list_head) *
+			OWNER_HASH_SIZE, GFP_KERNEL);
+	if (!clp->cl_ownerstr_hashtbl)
+		goto err_no_hashtbl;
+	for (i = 0; i < OWNER_HASH_SIZE; i++)
+		INIT_LIST_HEAD(&clp->cl_ownerstr_hashtbl[i]);
 	clp->cl_name.len = name.len;
 	INIT_LIST_HEAD(&clp->cl_sessions);
 	idr_init(&clp->cl_stateids);
@@ -1192,14 +1558,16 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
 	spin_lock_init(&clp->cl_lock);
 	rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
 	return clp;
+err_no_hashtbl:
+	kfree(clp->cl_name.data);
+err_no_name:
+	kfree(clp);
+	return NULL;
 }
 
 static void
 free_client(struct nfs4_client *clp)
 {
-	struct nfsd_net __maybe_unused *nn = net_generic(clp->net, nfsd_net_id);
-
-	lockdep_assert_held(&nn->client_lock);
 	while (!list_empty(&clp->cl_sessions)) {
 		struct nfsd4_session *ses;
 		ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
@@ -1210,18 +1578,32 @@ free_client(struct nfs4_client *clp)
 	}
 	rpc_destroy_wait_queue(&clp->cl_cb_waitq);
 	free_svc_cred(&clp->cl_cred);
+	kfree(clp->cl_ownerstr_hashtbl);
 	kfree(clp->cl_name.data);
 	idr_destroy(&clp->cl_stateids);
 	kfree(clp);
 }
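The reworked alloc_client() above is a textbook instance of the kernel's goto-unwind error-handling style: each allocation that can fail jumps to a label that frees everything allocated so far, in reverse order, so there is exactly one success path and one cleanup ladder. A compilable userspace sketch of the same shape (names and types are simplified stand-ins):

#include <stdlib.h>
#include <string.h>

struct client {
	char *name;
	void **ownerstr_hashtbl;
};

/* Mirror alloc_client(): unwind labels free in reverse allocation order. */
static struct client *alloc_client(const char *name, size_t hash_size)
{
	struct client *clp = calloc(1, sizeof(*clp));

	if (clp == NULL)
		return NULL;
	clp->name = strdup(name);
	if (clp->name == NULL)
		goto err_no_name;
	clp->ownerstr_hashtbl = calloc(hash_size, sizeof(void *));
	if (clp->ownerstr_hashtbl == NULL)
		goto err_no_hashtbl;
	return clp;
err_no_hashtbl:
	free(clp->name);
err_no_name:
	free(clp);
	return NULL;
}

Note also that the per-client owner-string hash table replaces the old per-net one, which is what allows the owner lookups earlier in this patch to be protected by cl_lock instead of a global lock.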
 
 /* must be called under the client_lock */
-static inline void
+static void
 unhash_client_locked(struct nfs4_client *clp)
 {
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
 	struct nfsd4_session *ses;
 
-	list_del(&clp->cl_lru);
+	lockdep_assert_held(&nn->client_lock);
+
+	/* Mark the client as expired! */
+	clp->cl_time = 0;
+	/* Make it invisible */
+	if (!list_empty(&clp->cl_idhash)) {
+		list_del_init(&clp->cl_idhash);
+		if (test_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags))
+			rb_erase(&clp->cl_namenode, &nn->conf_name_tree);
+		else
+			rb_erase(&clp->cl_namenode, &nn->unconf_name_tree);
+	}
+	list_del_init(&clp->cl_lru);
 	spin_lock(&clp->cl_lock);
 	list_for_each_entry(ses, &clp->cl_sessions, se_perclnt)
 		list_del_init(&ses->se_hash);
@@ -1229,53 +1611,72 @@ unhash_client_locked(struct nfs4_client *clp)
 }
 
 static void
-destroy_client(struct nfs4_client *clp)
+unhash_client(struct nfs4_client *clp)
+{
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	spin_lock(&nn->client_lock);
+	unhash_client_locked(clp);
+	spin_unlock(&nn->client_lock);
+}
+
+static __be32 mark_client_expired_locked(struct nfs4_client *clp)
+{
+	if (atomic_read(&clp->cl_refcount))
+		return nfserr_jukebox;
+	unhash_client_locked(clp);
+	return nfs_ok;
+}
+
+static void
+__destroy_client(struct nfs4_client *clp)
 {
 	struct nfs4_openowner *oo;
 	struct nfs4_delegation *dp;
 	struct list_head reaplist;
-	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
 
 	INIT_LIST_HEAD(&reaplist);
 	spin_lock(&state_lock);
 	while (!list_empty(&clp->cl_delegations)) {
 		dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt);
-		list_del_init(&dp->dl_perclnt);
-		list_move(&dp->dl_recall_lru, &reaplist);
+		unhash_delegation_locked(dp);
+		list_add(&dp->dl_recall_lru, &reaplist);
 	}
 	spin_unlock(&state_lock);
 	while (!list_empty(&reaplist)) {
 		dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru);
-		destroy_delegation(dp);
+		list_del_init(&dp->dl_recall_lru);
+		nfs4_put_deleg_lease(dp->dl_stid.sc_file);
+		nfs4_put_stid(&dp->dl_stid);
 	}
-	list_splice_init(&clp->cl_revoked, &reaplist);
-	while (!list_empty(&reaplist)) {
+	while (!list_empty(&clp->cl_revoked)) {
 		dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru);
-		destroy_revoked_delegation(dp);
+		list_del_init(&dp->dl_recall_lru);
+		nfs4_put_stid(&dp->dl_stid);
 	}
 	while (!list_empty(&clp->cl_openowners)) {
 		oo = list_entry(clp->cl_openowners.next, struct nfs4_openowner, oo_perclient);
+		nfs4_get_stateowner(&oo->oo_owner);
 		release_openowner(oo);
 	}
 	nfsd4_shutdown_callback(clp);
 	if (clp->cl_cb_conn.cb_xprt)
 		svc_xprt_put(clp->cl_cb_conn.cb_xprt);
-	list_del(&clp->cl_idhash);
-	if (test_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags))
-		rb_erase(&clp->cl_namenode, &nn->conf_name_tree);
-	else
-		rb_erase(&clp->cl_namenode, &nn->unconf_name_tree);
-	spin_lock(&nn->client_lock);
-	unhash_client_locked(clp);
-	WARN_ON_ONCE(atomic_read(&clp->cl_refcount));
 	free_client(clp);
-	spin_unlock(&nn->client_lock);
+}
+
+static void
+destroy_client(struct nfs4_client *clp)
+{
+	unhash_client(clp);
+	__destroy_client(clp);
 }
 
 static void expire_client(struct nfs4_client *clp)
 {
+	unhash_client(clp);
 	nfsd4_client_record_remove(clp);
-	destroy_client(clp);
+	__destroy_client(clp);
 }
 
 static void copy_verf(struct nfs4_client *target, nfs4_verifier *source)
@@ -1408,25 +1809,28 @@ static bool mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp)
 	return 0 == strcmp(cl->cl_cred.cr_principal, cr->cr_principal);
 }
 
-static void gen_clid(struct nfs4_client *clp, struct nfsd_net *nn)
+static void gen_confirm(struct nfs4_client *clp, struct nfsd_net *nn)
 {
-	static u32 current_clientid = 1;
+	__be32 verf[2];
 
-	clp->cl_clientid.cl_boot = nn->boot_time;
-	clp->cl_clientid.cl_id = current_clientid++;
+	/*
	 * This is opaque to client, so no need to byte-swap. Use
	 * __force to keep sparse happy
+	 */
+	verf[0] = (__force __be32)get_seconds();
+	verf[1] = (__force __be32)nn->clientid_counter;
+	memcpy(clp->cl_confirm.data, verf, sizeof(clp->cl_confirm.data));
 }
 
-static void gen_confirm(struct nfs4_client *clp)
+static void gen_clid(struct nfs4_client *clp, struct nfsd_net *nn)
 {
-	__be32 verf[2];
-	static u32 i;
-
-	verf[0] = (__be32)get_seconds();
-	verf[1] = (__be32)i++;
-	memcpy(clp->cl_confirm.data, verf, sizeof(clp->cl_confirm.data));
+	clp->cl_clientid.cl_boot = nn->boot_time;
+	clp->cl_clientid.cl_id = nn->clientid_counter++;
+	gen_confirm(clp, nn);
 }
 
-static struct nfs4_stid *find_stateid(struct nfs4_client *cl, stateid_t *t)
+static struct nfs4_stid *
+find_stateid_locked(struct nfs4_client *cl, stateid_t *t)
 {
 	struct nfs4_stid *ret;
 
@@ -1436,16 +1840,21 @@ static struct nfs4_stid *find_stateid(struct nfs4_client *cl, stateid_t *t)
 	return ret;
 }
 
-static struct nfs4_stid *find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, char typemask)
+static struct nfs4_stid *
+find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, char typemask)
 {
 	struct nfs4_stid *s;
 
-	s = find_stateid(cl, t);
-	if (!s)
-		return NULL;
-	if (typemask & s->sc_type)
-		return s;
-	return NULL;
+	spin_lock(&cl->cl_lock);
+	s = find_stateid_locked(cl, t);
+	if (s != NULL) {
+		if (typemask & s->sc_type)
+			atomic_inc(&s->sc_count);
+		else
+			s = NULL;
+	}
+	spin_unlock(&cl->cl_lock);
+	return s;
 }
 
 static struct nfs4_client *create_client(struct xdr_netobj name,
@@ -1455,7 +1864,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
 	struct sockaddr *sa = svc_addr(rqstp);
 	int ret;
 	struct net *net = SVC_NET(rqstp);
-	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 
 	clp = alloc_client(name);
 	if (clp == NULL)
@@ -1463,17 +1871,14 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
 
 	ret = copy_cred(&clp->cl_cred, &rqstp->rq_cred);
 	if (ret) {
-		spin_lock(&nn->client_lock);
 		free_client(clp);
-		spin_unlock(&nn->client_lock);
 		return NULL;
 	}
-	nfsd4_init_callback(&clp->cl_cb_null);
+	nfsd4_init_cb(&clp->cl_cb_null, clp, NULL, NFSPROC4_CLNT_CB_NULL);
 	clp->cl_time = get_seconds();
 	clear_bit(0, &clp->cl_cb_slot_busy);
 	copy_verf(clp, verf);
 	rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa);
-	gen_confirm(clp);
 	clp->cl_cb_session = NULL;
 	clp->net = net;
 	return clp;
@@ -1525,11 +1930,13 @@ add_to_unconfirmed(struct nfs4_client *clp)
 	unsigned int idhashval;
 	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
 
+	lockdep_assert_held(&nn->client_lock);
+
 	clear_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags);
 	add_clp_to_name_tree(clp, &nn->unconf_name_tree);
 	idhashval = clientid_hashval(clp->cl_clientid.cl_id);
 	list_add(&clp->cl_idhash, &nn->unconf_id_hashtbl[idhashval]);
-	renew_client(clp);
+	renew_client_locked(clp);
 }
 
 static void
@@ -1538,12 +1945,14 @@ move_to_confirmed(struct nfs4_client *clp)
 	unsigned int idhashval = clientid_hashval(clp->cl_clientid.cl_id);
 	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
 
+	lockdep_assert_held(&nn->client_lock);
+
 	dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp);
 	list_move(&clp->cl_idhash, &nn->conf_id_hashtbl[idhashval]);
 	rb_erase(&clp->cl_namenode, &nn->unconf_name_tree);
 	add_clp_to_name_tree(clp, &nn->conf_name_tree);
 	set_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags);
-	renew_client(clp);
+	renew_client_locked(clp);
 }
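The gen_clid()/gen_confirm() rework above replaces file-scope static counters with per-net ones: a clientid is the pair (boot_time, counter), which is what lets STALE_CLIENTID() reject ids minted before the last server reboot, and the confirm verifier is derived from (seconds, same counter) so every new clientid also gets a fresh confirm cookie. A rough userspace rendering, with simplified stand-in types and globals in place of struct nfsd_net:

#include <stdint.h>
#include <string.h>
#include <time.h>

/* Stand-ins for the per-net state (nn->boot_time, nn->clientid_counter). */
static uint32_t boot_time;
static uint32_t clientid_counter = 1;

struct clientid {
	uint32_t cl_boot;	/* server boot time */
	uint32_t cl_id;		/* per-boot counter */
};

/* Mirror gen_confirm(): 8 opaque bytes built from (seconds, counter). */
static void gen_confirm(unsigned char confirm[8])
{
	uint32_t verf[2] = { (uint32_t)time(NULL), clientid_counter };

	memcpy(confirm, verf, sizeof(verf));
}

/* Mirror gen_clid(): a clientid is stale once cl_boot != boot_time. */
static void gen_clid(struct clientid *clid, unsigned char confirm[8])
{
	clid->cl_boot = boot_time;
	clid->cl_id = clientid_counter++;
	gen_confirm(confirm);
}

The bytes are opaque to the client, so no byte-order conversion is needed; the kernel's __force casts exist only to silence sparse.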
 
 static struct nfs4_client *
@@ -1556,7 +1965,7 @@ find_client_in_id_table(struct list_head *tbl, clientid_t *clid, bool sessions)
 		if (same_clid(&clp->cl_clientid, clid)) {
 			if ((bool)clp->cl_minorversion != sessions)
 				return NULL;
-			renew_client(clp);
+			renew_client_locked(clp);
 			return clp;
 		}
 	}
@@ -1568,6 +1977,7 @@ find_confirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn)
 {
 	struct list_head *tbl = nn->conf_id_hashtbl;
 
+	lockdep_assert_held(&nn->client_lock);
 	return find_client_in_id_table(tbl, clid, sessions);
 }
 
@@ -1576,6 +1986,7 @@ find_unconfirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn)
 {
 	struct list_head *tbl = nn->unconf_id_hashtbl;
 
+	lockdep_assert_held(&nn->client_lock);
 	return find_client_in_id_table(tbl, clid, sessions);
 }
 
@@ -1587,12 +1998,14 @@ static bool clp_used_exchangeid(struct nfs4_client *clp)
 static struct nfs4_client *
 find_confirmed_client_by_name(struct xdr_netobj *name, struct nfsd_net *nn)
 {
+	lockdep_assert_held(&nn->client_lock);
 	return find_clp_in_name_tree(name, &nn->conf_name_tree);
 }
 
 static struct nfs4_client *
 find_unconfirmed_client_by_name(struct xdr_netobj *name, struct nfsd_net *nn)
 {
+	lockdep_assert_held(&nn->client_lock);
 	return find_clp_in_name_tree(name, &nn->unconf_name_tree);
 }
 
@@ -1642,7 +2055,7 @@ out_err:
 /*
  * Cache a reply. nfsd4_check_resp_size() has bounded the cache size.
  */
-void
+static void
 nfsd4_store_cache_entry(struct nfsd4_compoundres *resp)
 {
 	struct xdr_buf *buf = resp->xdr.buf;
@@ -1758,7 +2171,8 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
 		  struct nfsd4_compound_state *cstate,
 		  struct nfsd4_exchange_id *exid)
 {
-	struct nfs4_client *unconf, *conf, *new;
+	struct nfs4_client *conf, *new;
+	struct nfs4_client *unconf = NULL;
 	__be32 status;
 	char			addr_str[INET6_ADDRSTRLEN];
 	nfs4_verifier		verf = exid->verifier;
@@ -1787,8 +2201,12 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
 		return nfserr_encr_alg_unsupp;
 	}
 
+	new = create_client(exid->clname, rqstp, &verf);
+	if (new == NULL)
+		return nfserr_jukebox;
+
 	/* Cases below refer to rfc 5661 section 18.35.4: */
-	nfs4_lock_state();
+	spin_lock(&nn->client_lock);
 	conf = find_confirmed_client_by_name(&exid->clname, nn);
 	if (conf) {
 		bool creds_match = same_creds(&conf->cl_cred, &rqstp->rq_cred);
@@ -1813,7 +2231,6 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
 			}
 			/* case 6 */
 			exid->flags |= EXCHGID4_FLAG_CONFIRMED_R;
-			new = conf;
 			goto out_copy;
 		}
 		if (!creds_match) { /* case 3 */
@@ -1821,15 +2238,14 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
 				status = nfserr_clid_inuse;
 				goto out;
 			}
-			expire_client(conf);
 			goto out_new;
 		}
 		if (verfs_match) { /* case 2 */
 			conf->cl_exchange_flags |= EXCHGID4_FLAG_CONFIRMED_R;
-			new = conf;
 			goto out_copy;
 		}
 		/* case 5, client reboot */
+		conf = NULL;
 		goto out_new;
 	}
 
@@ -1840,33 +2256,38 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
 	unconf  = find_unconfirmed_client_by_name(&exid->clname, nn);
 	if (unconf) /* case 4, possible retry or client restart */
-		expire_client(unconf);
+		unhash_client_locked(unconf);
 
 	/* case 1 (normal case) */
 out_new:
-	new = create_client(exid->clname, rqstp, &verf);
-	if (new == NULL) {
-		status = nfserr_jukebox;
-		goto out;
+	if (conf) {
+		status = mark_client_expired_locked(conf);
+		if (status)
+			goto out;
 	}
 	new->cl_minorversion = cstate->minorversion;
 	new->cl_mach_cred = (exid->spa_how == SP4_MACH_CRED);
 
 	gen_clid(new, nn);
 	add_to_unconfirmed(new);
+	swap(new, conf);
 out_copy:
-	exid->clientid.cl_boot = new->cl_clientid.cl_boot;
-	exid->clientid.cl_id = new->cl_clientid.cl_id;
+	exid->clientid.cl_boot = conf->cl_clientid.cl_boot;
+	exid->clientid.cl_id = conf->cl_clientid.cl_id;
 
-	exid->seqid = new->cl_cs_slot.sl_seqid + 1;
-	nfsd4_set_ex_flags(new, exid);
+	exid->seqid = conf->cl_cs_slot.sl_seqid + 1;
+	nfsd4_set_ex_flags(conf, exid);
 
 	dprintk("nfsd4_exchange_id seqid %d flags %x\n",
-		new->cl_cs_slot.sl_seqid, new->cl_exchange_flags);
+		conf->cl_cs_slot.sl_seqid, conf->cl_exchange_flags);
 	status = nfs_ok;
 
 out:
-	nfs4_unlock_state();
+	spin_unlock(&nn->client_lock);
+	if (new)
+		expire_client(new);
+	if (unconf)
+		expire_client(unconf);
 	return status;
 }
 
@@ -2010,6 +2431,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 {
 	struct sockaddr *sa = svc_addr(rqstp);
 	struct nfs4_client *conf, *unconf;
+	struct nfs4_client *old = NULL;
 	struct nfsd4_session *new;
 	struct nfsd4_conn *conn;
 	struct nfsd4_clid_slot *cs_slot = NULL;
@@ -2035,7 +2457,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 	if (!conn)
 		goto out_free_session;
 
-	nfs4_lock_state();
+	spin_lock(&nn->client_lock);
 	unconf = find_unconfirmed_client(&cr_ses->clientid, true, nn);
 	conf = find_confirmed_client(&cr_ses->clientid, true, nn);
 	WARN_ON_ONCE(conf && unconf);
@@ -2054,7 +2476,6 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 			goto out_free_conn;
 		}
 	} else if (unconf) {
-		struct nfs4_client *old;
 		if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
 		    !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) {
 			status = nfserr_clid_inuse;
@@ -2072,10 +2493,11 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 		}
 		old = find_confirmed_client_by_name(&unconf->cl_name, nn);
 		if (old) {
-			status = mark_client_expired(old);
-			if (status)
+			status = mark_client_expired_locked(old);
+			if (status) {
+				old = NULL;
 				goto out_free_conn;
-			expire_client(old);
+			}
 		}
 		move_to_confirmed(unconf);
 		conf = unconf;
@@ -2091,20 +2513,27 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 	cr_ses->flags &= ~SESSION4_RDMA;
 
 	init_session(rqstp, new, conf, cr_ses);
-	nfsd4_init_conn(rqstp, conn, new);
+	nfsd4_get_session_locked(new);
 
 	memcpy(cr_ses->sessionid.data, new->se_sessionid.data,
 	       NFS4_MAX_SESSIONID_LEN);
 	cs_slot->sl_seqid++;
 	cr_ses->seqid = cs_slot->sl_seqid;
 
-	/* cache solo and embedded create sessions under the state lock */
+	/* cache solo and embedded create sessions under the client_lock */
 	nfsd4_cache_create_session(cr_ses, cs_slot, status);
-	nfs4_unlock_state();
+	spin_unlock(&nn->client_lock);
+	/* init connection and backchannel */
+	nfsd4_init_conn(rqstp, conn, new);
+	nfsd4_put_session(new);
+	if (old)
+		expire_client(old);
 	return status;
 out_free_conn:
-	nfs4_unlock_state();
+	spin_unlock(&nn->client_lock);
 	free_conn(conn);
+	if (old)
+		expire_client(old);
 out_free_session:
 	__free_session(new);
 out_release_drc_mem:
@@ -2152,17 +2581,16 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
 	__be32 status;
 	struct nfsd4_conn *conn;
 	struct nfsd4_session *session;
-	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+	struct net *net = SVC_NET(rqstp);
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 
 	if (!nfsd4_last_compound_op(rqstp))
 		return nfserr_not_only_op;
-	nfs4_lock_state();
 	spin_lock(&nn->client_lock);
-	session = find_in_sessionid_hashtbl(&bcts->sessionid, SVC_NET(rqstp));
+	session = find_in_sessionid_hashtbl(&bcts->sessionid, net, &status);
 	spin_unlock(&nn->client_lock);
-	status = nfserr_badsession;
 	if (!session)
-		goto out;
+		goto out_no_session;
 	status = nfserr_wrong_cred;
 	if (!mach_creds_match(session->se_client, rqstp))
 		goto out;
@@ -2176,7 +2604,8 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
 	nfsd4_init_conn(rqstp, conn, session);
 	status = nfs_ok;
 out:
-	nfs4_unlock_state();
+	nfsd4_put_session(session);
out_no_session:
 	return status;
 }
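The reshaped EXCHANGE_ID and CREATE_SESSION paths above share one structural idea: the candidate client is now allocated before client_lock (a spinlock) is taken, because allocation can sleep; under the lock the code only links or swaps pointers; and any loser of the race (the unused "new", a superseded "unconf" or "old") is expired after the unlock. A userspace C sketch of that shape, with hypothetical names and a pthread mutex standing in for the spinlock:

#include <pthread.h>
#include <stdlib.h>
#include <string.h>

struct client { char name[64]; struct client *next; };

static pthread_mutex_t client_lock = PTHREAD_MUTEX_INITIALIZER;
static struct client *clients;

static struct client *find_locked(const char *name)
{
	struct client *c;

	for (c = clients; c; c = c->next)
		if (!strcmp(c->name, name))
			return c;
	return NULL;
}

/*
 * Allocate before taking the lock (allocation may sleep), insert under
 * the lock only if no one beat us to it, and free the loser afterwards.
 */
static struct client *find_or_create(const char *name)
{
	struct client *new = calloc(1, sizeof(*new));
	struct client *found;

	if (!new)
		return NULL;
	strncpy(new->name, name, sizeof(new->name) - 1);

	pthread_mutex_lock(&client_lock);
	found = find_locked(name);
	if (!found) {
		new->next = clients;
		clients = new;
		found = new;
		new = NULL;
	}
	pthread_mutex_unlock(&client_lock);

	free(new);	/* no-op if we won the race */
	return found;
}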
 
@@ -2195,9 +2624,9 @@ nfsd4_destroy_session(struct svc_rqst *r,
 	struct nfsd4_session *ses;
 	__be32 status;
 	int ref_held_by_me = 0;
-	struct nfsd_net *nn = net_generic(SVC_NET(r), nfsd_net_id);
+	struct net *net = SVC_NET(r);
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 
-	nfs4_lock_state();
 	status = nfserr_not_only_op;
 	if (nfsd4_compound_in_session(cstate->session, &sessionid->sessionid)) {
 		if (!nfsd4_last_compound_op(r))
@@ -2206,14 +2635,12 @@ nfsd4_destroy_session(struct svc_rqst *r,
 	}
 	dump_sessionid(__func__, &sessionid->sessionid);
 	spin_lock(&nn->client_lock);
-	ses = find_in_sessionid_hashtbl(&sessionid->sessionid, SVC_NET(r));
-	status = nfserr_badsession;
+	ses = find_in_sessionid_hashtbl(&sessionid->sessionid, net, &status);
 	if (!ses)
 		goto out_client_lock;
 	status = nfserr_wrong_cred;
 	if (!mach_creds_match(ses->se_client, r))
-		goto out_client_lock;
-	nfsd4_get_session_locked(ses);
+		goto out_put_session;
 	status = mark_session_dead_locked(ses, 1 + ref_held_by_me);
 	if (status)
 		goto out_put_session;
@@ -2225,11 +2652,10 @@ nfsd4_destroy_session(struct svc_rqst *r,
 	spin_lock(&nn->client_lock);
 	status = nfs_ok;
 out_put_session:
-	nfsd4_put_session(ses);
+	nfsd4_put_session_locked(ses);
 out_client_lock:
 	spin_unlock(&nn->client_lock);
 out:
-	nfs4_unlock_state();
 	return status;
 }
 
@@ -2300,7 +2726,8 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 	struct nfsd4_conn *conn;
 	__be32 status;
 	int buflen;
-	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+	struct net *net = SVC_NET(rqstp);
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 
 	if (resp->opcnt != 1)
 		return nfserr_sequence_pos;
@@ -2314,17 +2741,10 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 		return nfserr_jukebox;
 
 	spin_lock(&nn->client_lock);
-	status = nfserr_badsession;
-	session = find_in_sessionid_hashtbl(&seq->sessionid, SVC_NET(rqstp));
+	session = find_in_sessionid_hashtbl(&seq->sessionid, net, &status);
 	if (!session)
 		goto out_no_session;
 	clp = session->se_client;
-	status = get_client_locked(clp);
-	if (status)
-		goto out_no_session;
-	status = nfsd4_get_session_locked(session);
-	if (status)
-		goto out_put_client;
 
 	status = nfserr_too_many_ops;
 	if (nfsd4_session_too_many_ops(rqstp, session))
@@ -2354,6 +2774,7 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 		goto out_put_session;
 	cstate->slot = slot;
 	cstate->session = session;
+	cstate->clp = clp;
 	/* Return the cached reply status and set cstate->status
 	 * for nfsd4_proc_compound processing */
 	status = nfsd4_replay_cache_entry(resp, seq);
@@ -2388,6 +2809,7 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 
 	cstate->slot = slot;
 	cstate->session = session;
+	cstate->clp = clp;
 
 out:
 	switch (clp->cl_cb_state) {
@@ -2408,31 +2830,48 @@ out_no_session:
 	spin_unlock(&nn->client_lock);
 	return status;
 out_put_session:
-	nfsd4_put_session(session);
-out_put_client:
-	put_client_renew_locked(clp);
+	nfsd4_put_session_locked(session);
 	goto out_no_session;
 }
 
+void
+nfsd4_sequence_done(struct nfsd4_compoundres *resp)
+{
+	struct nfsd4_compound_state *cs = &resp->cstate;
+
+	if (nfsd4_has_session(cs)) {
+		if (cs->status != nfserr_replay_cache) {
+			nfsd4_store_cache_entry(resp);
+			cs->slot->sl_flags &= ~NFSD4_SLOT_INUSE;
+		}
+		/* Drop session reference that was taken in nfsd4_sequence() */
+		nfsd4_put_session(cs->session);
+	} else if (cs->clp)
+		put_client_renew(cs->clp);
+}
+
 __be32
 nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_destroy_clientid *dc)
 {
-	struct nfs4_client *conf, *unconf, *clp;
+	struct nfs4_client *conf, *unconf;
+	struct nfs4_client *clp = NULL;
 	__be32 status = 0;
 	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
 
-	nfs4_lock_state();
+	spin_lock(&nn->client_lock);
 	unconf = find_unconfirmed_client(&dc->clientid, true, nn);
 	conf = find_confirmed_client(&dc->clientid, true, nn);
 	WARN_ON_ONCE(conf && unconf);
 
 	if (conf) {
-		clp = conf;
-
 		if (client_has_state(conf)) {
 			status = nfserr_clientid_busy;
 			goto out;
 		}
+		status = mark_client_expired_locked(conf);
+		if (status)
+			goto out;
+		clp = conf;
 	} else if (unconf)
 		clp = unconf;
 	else {
@@ -2440,12 +2879,15 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
 		goto out;
 	}
 	if (!mach_creds_match(clp, rqstp)) {
+		clp = NULL;
 		status = nfserr_wrong_cred;
 		goto out;
 	}
-	expire_client(clp);
+	unhash_client_locked(clp);
 out:
-	nfs4_unlock_state();
+	spin_unlock(&nn->client_lock);
+	if (clp)
+		expire_client(clp);
 	return status;
 }
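nfsd4_destroy_clientid() above illustrates the "victim pointer" variant of the deferred-teardown idiom: the decision and the unhash happen under client_lock, the chosen client is remembered in a local pointer, and the sleeping expire_client() runs only after the unlock; every error path resets the pointer to NULL so nothing is destroyed. A compact userspace sketch of that control flow (names and error codes are stand-ins):

#include <pthread.h>
#include <stdlib.h>

struct client { int busy; int hashed; };

static pthread_mutex_t client_lock = PTHREAD_MUTEX_INITIALIZER;

static void expire_client(struct client *clp)	/* may sleep */
{
	free(clp);
}

/*
 * Decide and unhash under the lock; run the sleeping teardown only
 * after the unlock. Error paths leave the victim pointer NULL.
 */
static int destroy_clientid(struct client *candidate)
{
	struct client *clp = NULL;
	int err = 0;

	pthread_mutex_lock(&client_lock);
	if (candidate->busy) {
		err = -1;		/* nfserr_clientid_busy */
	} else {
		candidate->hashed = 0;	/* unhash_client_locked() */
		clp = candidate;
	}
	pthread_mutex_unlock(&client_lock);

	if (clp)
		expire_client(clp);
	return err;
}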
 
@@ -2464,7 +2906,6 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
 		return nfs_ok;
 	}
 
-	nfs4_lock_state();
 	status = nfserr_complete_already;
 	if (test_and_set_bit(NFSD4_CLIENT_RECLAIM_COMPLETE,
 			     &cstate->session->se_client->cl_flags))
@@ -2484,7 +2925,6 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
 	status = nfs_ok;
 	nfsd4_client_record_create(cstate->session->se_client);
 out:
-	nfs4_unlock_state();
 	return status;
 }
 
@@ -2494,12 +2934,16 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 {
 	struct xdr_netobj	clname = setclid->se_name;
 	nfs4_verifier		clverifier = setclid->se_verf;
-	struct nfs4_client	*conf, *unconf, *new;
+	struct nfs4_client	*conf, *new;
+	struct nfs4_client	*unconf = NULL;
 	__be32 			status;
 	struct nfsd_net		*nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
 
+	new = create_client(clname, rqstp, &clverifier);
+	if (new == NULL)
+		return nfserr_jukebox;
 	/* Cases below refer to rfc 3530 section 14.2.33: */
-	nfs4_lock_state();
+	spin_lock(&nn->client_lock);
 	conf = find_confirmed_client_by_name(&clname, nn);
 	if (conf) {
 		/* case 0: */
@@ -2517,11 +2961,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	}
 	unconf = find_unconfirmed_client_by_name(&clname, nn);
 	if (unconf)
-		expire_client(unconf);
-	status = nfserr_jukebox;
-	new = create_client(clname, rqstp, &clverifier);
-	if (new == NULL)
-		goto out;
+		unhash_client_locked(unconf);
 	if (conf && same_verf(&conf->cl_verifier, &clverifier))
 		/* case 1: probable callback update */
 		copy_clid(new, conf);
@@ -2533,9 +2973,14 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot;
 	setclid->se_clientid.cl_id = new->cl_clientid.cl_id;
 	memcpy(setclid->se_confirm.data, new->cl_confirm.data, sizeof(setclid->se_confirm.data));
+	new = NULL;
 	status = nfs_ok;
 out:
-	nfs4_unlock_state();
+	spin_unlock(&nn->client_lock);
+	if (new)
+		free_client(new);
+	if (unconf)
+		expire_client(unconf);
 	return status;
 }
 
@@ -2546,6 +2991,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 			 struct nfsd4_setclientid_confirm *setclientid_confirm)
 {
 	struct nfs4_client *conf, *unconf;
+	struct nfs4_client *old = NULL;
 	nfs4_verifier confirm = setclientid_confirm->sc_confirm;
 	clientid_t * clid = &setclientid_confirm->sc_clientid;
 	__be32 status;
@@ -2553,8 +2999,8 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 	if (STALE_CLIENTID(clid, nn))
 		return nfserr_stale_clientid;
 
-	nfs4_lock_state();
+	spin_lock(&nn->client_lock);
 	conf = find_confirmed_client(clid, false, nn);
 	unconf = find_unconfirmed_client(clid, false, nn);
 	/*
@@ -2578,22 +3024,30 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 	}
 	status = nfs_ok;
 	if (conf) {	/* case 1: callback update */
+		old = unconf;
+		unhash_client_locked(old);
 		nfsd4_change_callback(conf, &unconf->cl_cb_conn);
-		nfsd4_probe_callback(conf);
-		expire_client(unconf);
 	} else { /* case 3: normal case; new or rebooted client */
-		conf = find_confirmed_client_by_name(&unconf->cl_name, nn);
-		if (conf) {
-			status = mark_client_expired(conf);
-			if (status)
+		old = find_confirmed_client_by_name(&unconf->cl_name, nn);
+		if (old) {
+			status = mark_client_expired_locked(old);
+			if (status) {
+				old = NULL;
 				goto out;
-			expire_client(conf);
+			}
 		}
 		move_to_confirmed(unconf);
-		nfsd4_probe_callback(unconf);
+		conf = unconf;
 	}
+	get_client_locked(conf);
+	spin_unlock(&nn->client_lock);
+	nfsd4_probe_callback(conf);
+	spin_lock(&nn->client_lock);
+	put_client_renew_locked(conf);
 out:
-	nfs4_unlock_state();
+	spin_unlock(&nn->client_lock);
+	if (old)
+		expire_client(old);
 	return status;
 }
 
@@ -2603,21 +3057,23 @@ static struct nfs4_file *nfsd4_alloc_file(void)
 }
 
 /* OPEN Share state helper functions */
-static void nfsd4_init_file(struct nfs4_file *fp, struct inode *ino)
+static void nfsd4_init_file(struct nfs4_file *fp, struct knfsd_fh *fh)
 {
-	unsigned int hashval = file_hashval(ino);
+	unsigned int hashval = file_hashval(fh);
+
+	lockdep_assert_held(&state_lock);
 
 	atomic_set(&fp->fi_ref, 1);
+	spin_lock_init(&fp->fi_lock);
 	INIT_LIST_HEAD(&fp->fi_stateids);
 	INIT_LIST_HEAD(&fp->fi_delegations);
-	fp->fi_inode = igrab(ino);
+	fh_copy_shallow(&fp->fi_fhandle, fh);
+	fp->fi_deleg_file = NULL;
 	fp->fi_had_conflict = false;
-	fp->fi_lease = NULL;
+	fp->fi_share_deny = 0;
 	memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
 	memset(fp->fi_access, 0, sizeof(fp->fi_access));
-	spin_lock(&state_lock);
 	hlist_add_head(&fp->fi_hash, &file_hashtbl[hashval]);
-	spin_unlock(&state_lock);
 }
 
 void
@@ -2673,6 +3129,27 @@ static void init_nfs4_replay(struct nfs4_replay *rp)
 	rp->rp_status = nfserr_serverfault;
 	rp->rp_buflen = 0;
 	rp->rp_buf = rp->rp_ibuf;
+	mutex_init(&rp->rp_mutex);
+}
+
+static void nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate,
+		struct nfs4_stateowner *so)
+{
+	if (!nfsd4_has_session(cstate)) {
+		mutex_lock(&so->so_replay.rp_mutex);
+		cstate->replay_owner = nfs4_get_stateowner(so);
+	}
+}
+
+void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate)
+{
+	struct nfs4_stateowner *so = cstate->replay_owner;
+
+	if (so != NULL) {
+		cstate->replay_owner = NULL;
+		mutex_unlock(&so->so_replay.rp_mutex);
+		nfs4_put_stateowner(so);
+	}
 }
 
 static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj *owner, struct nfs4_client *clp)
@@ -2693,111 +3170,171 @@ static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj
 	INIT_LIST_HEAD(&sop->so_stateids);
 	sop->so_client = clp;
 	init_nfs4_replay(&sop->so_replay);
+	atomic_set(&sop->so_count, 1);
 	return sop;
 }
 
 static void hash_openowner(struct nfs4_openowner *oo, struct nfs4_client *clp, unsigned int strhashval)
 {
-	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+	lockdep_assert_held(&clp->cl_lock);
 
-	list_add(&oo->oo_owner.so_strhash, &nn->ownerstr_hashtbl[strhashval]);
+	list_add(&oo->oo_owner.so_strhash,
+		 &clp->cl_ownerstr_hashtbl[strhashval]);
 	list_add(&oo->oo_perclient, &clp->cl_openowners);
 }
 
+static void nfs4_unhash_openowner(struct nfs4_stateowner *so)
+{
+	unhash_openowner_locked(openowner(so));
+}
+
+static void nfs4_free_openowner(struct nfs4_stateowner *so)
+{
+	struct nfs4_openowner *oo = openowner(so);
+
+	kmem_cache_free(openowner_slab, oo);
+}
+
+static const struct nfs4_stateowner_operations openowner_ops = {
+	.so_unhash =	nfs4_unhash_openowner,
+	.so_free =	nfs4_free_openowner,
+};
+
 static struct nfs4_openowner *
-alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfsd4_open *open) {
-	struct nfs4_openowner *oo;
+alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open,
+			   struct nfsd4_compound_state *cstate)
+{
+	struct nfs4_client *clp = cstate->clp;
+	struct nfs4_openowner *oo, *ret;
 
 	oo = alloc_stateowner(openowner_slab, &open->op_owner, clp);
 	if (!oo)
 		return NULL;
+	oo->oo_owner.so_ops = &openowner_ops;
 	oo->oo_owner.so_is_open_owner = 1;
 	oo->oo_owner.so_seqid = open->op_seqid;
-	oo->oo_flags = NFS4_OO_NEW;
+	oo->oo_flags = 0;
+	if (nfsd4_has_session(cstate))
+		oo->oo_flags |= NFS4_OO_CONFIRMED;
 	oo->oo_time = 0;
 	oo->oo_last_closed_stid = NULL;
 	INIT_LIST_HEAD(&oo->oo_close_lru);
-	hash_openowner(oo, clp, strhashval);
+	spin_lock(&clp->cl_lock);
+	ret = find_openstateowner_str_locked(strhashval, open, clp);
+	if (ret == NULL) {
+		hash_openowner(oo, clp, strhashval);
+		ret = oo;
+	} else
+		nfs4_free_openowner(&oo->oo_owner);
+	spin_unlock(&clp->cl_lock);
 	return oo;
 }
 
 static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) {
 	struct nfs4_openowner *oo = open->op_openowner;
 
+	atomic_inc(&stp->st_stid.sc_count);
 	stp->st_stid.sc_type = NFS4_OPEN_STID;
-	INIT_LIST_HEAD(&stp->st_lockowners);
-	list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids);
-	list_add(&stp->st_perfile, &fp->fi_stateids);
-	stp->st_stateowner = &oo->oo_owner;
+	INIT_LIST_HEAD(&stp->st_locks);
+	stp->st_stateowner = nfs4_get_stateowner(&oo->oo_owner);
 	get_nfs4_file(fp);
-	stp->st_file = fp;
+	stp->st_stid.sc_file = fp;
 	stp->st_access_bmap = 0;
 	stp->st_deny_bmap = 0;
-	set_access(open->op_share_access, stp);
-	set_deny(open->op_share_deny, stp);
 	stp->st_openstp = NULL;
+	spin_lock(&oo->oo_owner.so_client->cl_lock);
+	list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids);
+	spin_lock(&fp->fi_lock);
+	list_add(&stp->st_perfile, &fp->fi_stateids);
+	spin_unlock(&fp->fi_lock);
+	spin_unlock(&oo->oo_owner.so_client->cl_lock);
 }
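The new rp_mutex added to struct nfs4_replay above is what lets the global state lock go away for v4.0 seqid replay: nfsd4_cstate_assign_replay() locks the owner's mutex for the duration of a seqid-mutating operation (v4.1+ compounds skip it because sessions provide their own replay cache), and nfsd4_cstate_clear_replay() drops the reference and the mutex together. A userspace sketch of the pairing, with toy stand-in types and a plain int where the kernel uses an atomic refcount:

#include <pthread.h>

struct stateowner {
	pthread_mutex_t rp_mutex;	/* serializes seqid replay handling */
	int refcount;			/* a real version would be atomic */
};

struct compound_state {
	struct stateowner *replay_owner;
	int has_session;		/* v4.1+ compounds don't need this */
};

/* Mirror nfsd4_cstate_assign_replay(): lock and pin the owner for v4.0. */
static void assign_replay(struct compound_state *cs, struct stateowner *so)
{
	if (!cs->has_session) {
		pthread_mutex_lock(&so->rp_mutex);
		so->refcount++;		/* nfs4_get_stateowner() */
		cs->replay_owner = so;
	}
}

/* Mirror nfsd4_cstate_clear_replay(): drop the mutex and the reference. */
static void clear_replay(struct compound_state *cs)
{
	struct stateowner *so = cs->replay_owner;

	if (so != NULL) {
		cs->replay_owner = NULL;
		pthread_mutex_unlock(&so->rp_mutex);
		so->refcount--;		/* nfs4_put_stateowner() */
	}
}

Holding the mutex across the whole operation means a retransmitted v4.0 request serializes behind the original and then sees the recorded reply instead of re-executing.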
+ */ + wait_event(close_wq, atomic_read(&s->st_stid.sc_count) == 2); + + release_all_access(s); + if (s->st_stid.sc_file) { + put_nfs4_file(s->st_stid.sc_file); + s->st_stid.sc_file = NULL; + } + + spin_lock(&nn->client_lock); + last = oo->oo_last_closed_stid; + oo->oo_last_closed_stid = s; list_move_tail(&oo->oo_close_lru, &nn->close_lru); oo->oo_time = get_seconds(); + spin_unlock(&nn->client_lock); + if (last) + nfs4_put_stid(&last->st_stid); } -static int -same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner, - clientid_t *clid) +/* search file_hashtbl[] for file */ +static struct nfs4_file * +find_file_locked(struct knfsd_fh *fh) { - return (sop->so_owner.len == owner->len) && - 0 == memcmp(sop->so_owner.data, owner->data, owner->len) && - (sop->so_client->cl_clientid.cl_id == clid->cl_id); -} + unsigned int hashval = file_hashval(fh); + struct nfs4_file *fp; -static struct nfs4_openowner * -find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open, - bool sessions, struct nfsd_net *nn) -{ - struct nfs4_stateowner *so; - struct nfs4_openowner *oo; - struct nfs4_client *clp; + lockdep_assert_held(&state_lock); - list_for_each_entry(so, &nn->ownerstr_hashtbl[hashval], so_strhash) { - if (!so->so_is_open_owner) - continue; - if (same_owner_str(so, &open->op_owner, &open->op_clientid)) { - oo = openowner(so); - clp = oo->oo_owner.so_client; - if ((bool)clp->cl_minorversion != sessions) - return NULL; - renew_client(oo->oo_owner.so_client); - return oo; + hlist_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) { + if (nfsd_fh_match(&fp->fi_fhandle, fh)) { + get_nfs4_file(fp); + return fp; } } return NULL; } -/* search file_hashtbl[] for file */ static struct nfs4_file * -find_file(struct inode *ino) +find_file(struct knfsd_fh *fh) { - unsigned int hashval = file_hashval(ino); struct nfs4_file *fp; spin_lock(&state_lock); - hlist_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) { - if (fp->fi_inode == ino) { - get_nfs4_file(fp); - spin_unlock(&state_lock); - return fp; - } + fp = find_file_locked(fh); + spin_unlock(&state_lock); + return fp; +} + +static struct nfs4_file * +find_or_add_file(struct nfs4_file *new, struct knfsd_fh *fh) +{ + struct nfs4_file *fp; + + spin_lock(&state_lock); + fp = find_file_locked(fh); + if (fp == NULL) { + nfsd4_init_file(new, fh); + fp = new; } spin_unlock(&state_lock); - return NULL; + + return fp; } /* @@ -2807,63 +3344,109 @@ find_file(struct inode *ino) static __be32 nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type) { - struct inode *ino = current_fh->fh_dentry->d_inode; struct nfs4_file *fp; - struct nfs4_ol_stateid *stp; - __be32 ret; + __be32 ret = nfs_ok; - fp = find_file(ino); + fp = find_file(¤t_fh->fh_handle); if (!fp) - return nfs_ok; - ret = nfserr_locked; - /* Search for conflicting share reservations */ - list_for_each_entry(stp, &fp->fi_stateids, st_perfile) { - if (test_deny(deny_type, stp) || - test_deny(NFS4_SHARE_DENY_BOTH, stp)) - goto out; - } - ret = nfs_ok; -out: + return ret; + /* Check for conflicting share reservations */ + spin_lock(&fp->fi_lock); + if (fp->fi_share_deny & deny_type) + ret = nfserr_locked; + spin_unlock(&fp->fi_lock); put_nfs4_file(fp); return ret; } -static void nfsd_break_one_deleg(struct nfs4_delegation *dp) +static void nfsd4_cb_recall_prepare(struct nfsd4_callback *cb) { - struct nfs4_client *clp = dp->dl_stid.sc_client; - struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + struct nfs4_delegation *dp = cb_to_delegation(cb); + struct nfsd_net *nn = 
net_generic(dp->dl_stid.sc_client->net, + nfsd_net_id); - lockdep_assert_held(&state_lock); - /* We're assuming the state code never drops its reference - * without first removing the lease. Since we're in this lease - * callback (and since the lease code is serialized by the kernel - * lock) we know the server hasn't removed the lease yet, we know - * it's safe to take a reference: */ - atomic_inc(&dp->dl_count); + block_delegations(&dp->dl_stid.sc_file->fi_fhandle); + + /* + * We can't do this in nfsd_break_deleg_cb because it is + * already holding inode->i_lock. + * + * If the dl_time != 0, then we know that it has already been + * queued for a lease break. Don't queue it again. + */ + spin_lock(&state_lock); + if (dp->dl_time == 0) { + dp->dl_time = get_seconds(); + list_add_tail(&dp->dl_recall_lru, &nn->del_recall_lru); + } + spin_unlock(&state_lock); +} + +static int nfsd4_cb_recall_done(struct nfsd4_callback *cb, + struct rpc_task *task) +{ + struct nfs4_delegation *dp = cb_to_delegation(cb); + + switch (task->tk_status) { + case 0: + return 1; + case -EBADHANDLE: + case -NFS4ERR_BAD_STATEID: + /* + * Race: client probably got cb_recall before open reply + * granting delegation. + */ + if (dp->dl_retries--) { + rpc_delay(task, 2 * HZ); + return 0; + } + /*FALLTHRU*/ + default: + return -1; + } +} - list_add_tail(&dp->dl_recall_lru, &nn->del_recall_lru); +static void nfsd4_cb_recall_release(struct nfsd4_callback *cb) +{ + struct nfs4_delegation *dp = cb_to_delegation(cb); - /* Only place dl_time is set; protected by i_lock: */ - dp->dl_time = get_seconds(); + nfs4_put_stid(&dp->dl_stid); +} - block_delegations(&dp->dl_fh); +static struct nfsd4_callback_ops nfsd4_cb_recall_ops = { + .prepare = nfsd4_cb_recall_prepare, + .done = nfsd4_cb_recall_done, + .release = nfsd4_cb_recall_release, +}; - nfsd4_cb_recall(dp); +static void nfsd_break_one_deleg(struct nfs4_delegation *dp) +{ + /* + * We're assuming the state code never drops its reference + * without first removing the lease. Since we're in this lease + * callback (and since the lease code is serialized by the kernel + * lock) we know the server hasn't removed the lease yet, we know + * it's safe to take a reference. + */ + atomic_inc(&dp->dl_stid.sc_count); + nfsd4_run_cb(&dp->dl_recall); } /* Called from break_lease() with i_lock held. */ -static void nfsd_break_deleg_cb(struct file_lock *fl) +static bool +nfsd_break_deleg_cb(struct file_lock *fl) { + bool ret = false; struct nfs4_file *fp = (struct nfs4_file *)fl->fl_owner; struct nfs4_delegation *dp; if (!fp) { WARN(1, "(%p)->fl_owner NULL\n", fl); - return; + return ret; } if (fp->fi_had_conflict) { WARN(1, "duplicate break on %p\n", fp); - return; + return ret; } /* * We don't want the locks code to timeout the lease for us; @@ -2872,18 +3455,26 @@ static void nfsd_break_deleg_cb(struct file_lock *fl) */ fl->fl_break_time = 0; - spin_lock(&state_lock); + spin_lock(&fp->fi_lock); fp->fi_had_conflict = true; - list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) - nfsd_break_one_deleg(dp); - spin_unlock(&state_lock); + /* + * If there are no delegations on the list, then return true + * so that the lease code will go ahead and delete it. 
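[Editorial aside, not part of the patch: the comment above relies on the lease-manager contract used by this series, in which ->lm_break() returns a bool and a "true" return asks the generic locks code to unlink and free the lease itself. A minimal sketch of that contract, with hypothetical helper names standing in for the fi_delegations check and nfsd_break_one_deleg() loop below:

	static bool example_lm_break(struct file_lock *fl)
	{
		if (no_delegations_left(fl->fl_owner))
			return true;	// let the lease code delete the lease
		queue_recalls(fl->fl_owner);
		return false;		// filesystem tears it down itself
	}
]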
+ */ + if (list_empty(&fp->fi_delegations)) + ret = true; + else + list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) + nfsd_break_one_deleg(dp); + spin_unlock(&fp->fi_lock); + return ret; } -static -int nfsd_change_deleg_cb(struct file_lock **onlist, int arg) +static int +nfsd_change_deleg_cb(struct file_lock **onlist, int arg, struct list_head *dispose) { if (arg & F_UNLCK) - return lease_modify(onlist, arg); + return lease_modify(onlist, arg, dispose); else return -EAGAIN; } @@ -2904,6 +3495,42 @@ static __be32 nfsd4_check_seqid(struct nfsd4_compound_state *cstate, struct nfs4 return nfserr_bad_seqid; } +static __be32 lookup_clientid(clientid_t *clid, + struct nfsd4_compound_state *cstate, + struct nfsd_net *nn) +{ + struct nfs4_client *found; + + if (cstate->clp) { + found = cstate->clp; + if (!same_clid(&found->cl_clientid, clid)) + return nfserr_stale_clientid; + return nfs_ok; + } + + if (STALE_CLIENTID(clid, nn)) + return nfserr_stale_clientid; + + /* + * For v4.1+ we get the client in the SEQUENCE op. If we don't have one + * cached already then we know this is for v4.0 and "sessions" + * will be false. + */ + WARN_ON_ONCE(cstate->session); + spin_lock(&nn->client_lock); + found = find_confirmed_client(clid, false, nn); + if (!found) { + spin_unlock(&nn->client_lock); + return nfserr_expired; + } + atomic_inc(&found->cl_refcount); + spin_unlock(&nn->client_lock); + + /* Cache the nfs4_client in cstate! */ + cstate->clp = found; + return nfs_ok; +} + __be32 nfsd4_process_open1(struct nfsd4_compound_state *cstate, struct nfsd4_open *open, struct nfsd_net *nn) @@ -2924,19 +3551,19 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, if (open->op_file == NULL) return nfserr_jukebox; - strhashval = ownerstr_hashval(clientid->cl_id, &open->op_owner); - oo = find_openstateowner_str(strhashval, open, cstate->minorversion, nn); + status = lookup_clientid(clientid, cstate, nn); + if (status) + return status; + clp = cstate->clp; + + strhashval = ownerstr_hashval(&open->op_owner); + oo = find_openstateowner_str(strhashval, open, clp); open->op_openowner = oo; if (!oo) { - clp = find_confirmed_client(clientid, cstate->minorversion, - nn); - if (clp == NULL) - return nfserr_expired; goto new_owner; } if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) { /* Replace unconfirmed owners without checking for replay. 
*/ - clp = oo->oo_owner.so_client; release_openowner(oo); open->op_openowner = NULL; goto new_owner; @@ -2944,15 +3571,14 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, status = nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid); if (status) return status; - clp = oo->oo_owner.so_client; goto alloc_stateid; new_owner: - oo = alloc_init_open_stateowner(strhashval, clp, open); + oo = alloc_init_open_stateowner(strhashval, open, cstate); if (oo == NULL) return nfserr_jukebox; open->op_openowner = oo; alloc_stateid: - open->op_stp = nfs4_alloc_stateid(clp); + open->op_stp = nfs4_alloc_open_stateid(clp); if (!open->op_stp) return nfserr_jukebox; return nfs_ok; @@ -2994,14 +3620,18 @@ nfs4_check_deleg(struct nfs4_client *cl, struct nfsd4_open *open, { int flags; __be32 status = nfserr_bad_stateid; + struct nfs4_delegation *deleg; - *dp = find_deleg_stateid(cl, &open->op_delegate_stateid); - if (*dp == NULL) + deleg = find_deleg_stateid(cl, &open->op_delegate_stateid); + if (deleg == NULL) goto out; flags = share_access_to_flags(open->op_share_access); - status = nfs4_check_delegmode(*dp, flags); - if (status) - *dp = NULL; + status = nfs4_check_delegmode(deleg, flags); + if (status) { + nfs4_put_stid(&deleg->dl_stid); + goto out; + } + *dp = deleg; out: if (!nfsd4_is_deleg_cur(open)) return nfs_ok; @@ -3011,24 +3641,25 @@ out: return nfs_ok; } -static __be32 -nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_ol_stateid **stpp) +static struct nfs4_ol_stateid * +nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open) { - struct nfs4_ol_stateid *local; + struct nfs4_ol_stateid *local, *ret = NULL; struct nfs4_openowner *oo = open->op_openowner; + spin_lock(&fp->fi_lock); list_for_each_entry(local, &fp->fi_stateids, st_perfile) { /* ignore lock owners */ if (local->st_stateowner->so_is_open_owner == 0) continue; - /* remember if we have seen this open owner */ - if (local->st_stateowner == &oo->oo_owner) - *stpp = local; - /* check for conflicting share reservations */ - if (!test_share(local, open)) - return nfserr_share_denied; + if (local->st_stateowner == &oo->oo_owner) { + ret = local; + atomic_inc(&ret->st_stid.sc_count); + break; + } } - return nfs_ok; + spin_unlock(&fp->fi_lock); + return ret; } static inline int nfs4_access_to_access(u32 nfs4_access) @@ -3042,24 +3673,6 @@ static inline int nfs4_access_to_access(u32 nfs4_access) return flags; } -static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, - struct svc_fh *cur_fh, struct nfsd4_open *open) -{ - __be32 status; - int oflag = nfs4_access_to_omode(open->op_share_access); - int access = nfs4_access_to_access(open->op_share_access); - - if (!fp->fi_fds[oflag]) { - status = nfsd_open(rqstp, cur_fh, S_IFREG, access, - &fp->fi_fds[oflag]); - if (status) - return status; - } - nfs4_file_get_access(fp, oflag); - - return nfs_ok; -} - static inline __be32 nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh, struct nfsd4_open *open) @@ -3075,34 +3688,99 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh, return nfsd_setattr(rqstp, fh, &iattr, 0, (time_t)0); } -static __be32 -nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open) +static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, + struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, + struct nfsd4_open *open) { - u32 op_share_access = open->op_share_access; - bool new_access; + struct 
file *filp = NULL; __be32 status; + int oflag = nfs4_access_to_omode(open->op_share_access); + int access = nfs4_access_to_access(open->op_share_access); + unsigned char old_access_bmap, old_deny_bmap; - new_access = !test_access(op_share_access, stp); - if (new_access) { - status = nfs4_get_vfs_file(rqstp, fp, cur_fh, open); - if (status) - return status; + spin_lock(&fp->fi_lock); + + /* + * Are we trying to set a deny mode that would conflict with + * current access? + */ + status = nfs4_file_check_deny(fp, open->op_share_deny); + if (status != nfs_ok) { + spin_unlock(&fp->fi_lock); + goto out; } - status = nfsd4_truncate(rqstp, cur_fh, open); - if (status) { - if (new_access) { - int oflag = nfs4_access_to_omode(op_share_access); - nfs4_file_put_access(fp, oflag); - } - return status; + + /* set access to the file */ + status = nfs4_file_get_access(fp, open->op_share_access); + if (status != nfs_ok) { + spin_unlock(&fp->fi_lock); + goto out; } - /* remember the open */ - set_access(op_share_access, stp); + + /* Set access bits in stateid */ + old_access_bmap = stp->st_access_bmap; + set_access(open->op_share_access, stp); + + /* Set new deny mask */ + old_deny_bmap = stp->st_deny_bmap; set_deny(open->op_share_deny, stp); + fp->fi_share_deny |= (open->op_share_deny & NFS4_SHARE_DENY_BOTH); - return nfs_ok; + if (!fp->fi_fds[oflag]) { + spin_unlock(&fp->fi_lock); + status = nfsd_open(rqstp, cur_fh, S_IFREG, access, &filp); + if (status) + goto out_put_access; + spin_lock(&fp->fi_lock); + if (!fp->fi_fds[oflag]) { + fp->fi_fds[oflag] = filp; + filp = NULL; + } + } + spin_unlock(&fp->fi_lock); + if (filp) + fput(filp); + + status = nfsd4_truncate(rqstp, cur_fh, open); + if (status) + goto out_put_access; +out: + return status; +out_put_access: + stp->st_access_bmap = old_access_bmap; + nfs4_file_put_access(fp, open->op_share_access); + reset_union_bmap_deny(bmap_to_share_mode(old_deny_bmap), stp); + goto out; } +static __be32 +nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open) +{ + __be32 status; + unsigned char old_deny_bmap; + + if (!test_access(open->op_share_access, stp)) + return nfs4_get_vfs_file(rqstp, fp, cur_fh, stp, open); + + /* test and set deny mode */ + spin_lock(&fp->fi_lock); + status = nfs4_file_check_deny(fp, open->op_share_deny); + if (status == nfs_ok) { + old_deny_bmap = stp->st_deny_bmap; + set_deny(open->op_share_deny, stp); + fp->fi_share_deny |= + (open->op_share_deny & NFS4_SHARE_DENY_BOTH); + } + spin_unlock(&fp->fi_lock); + + if (status != nfs_ok) + return status; + + status = nfsd4_truncate(rqstp, cur_fh, open); + if (status != nfs_ok) + reset_union_bmap_deny(old_deny_bmap, stp); + return status; +} static void nfs4_set_claim_prev(struct nfsd4_open *open, bool has_session) @@ -3123,65 +3801,112 @@ static bool nfsd4_cb_channel_good(struct nfs4_client *clp) return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN; } -static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp, int flag) +static struct file_lock *nfs4_alloc_init_lease(struct nfs4_file *fp, int flag) { struct file_lock *fl; fl = locks_alloc_lock(); if (!fl) return NULL; - locks_init_lock(fl); fl->fl_lmops = &nfsd_lease_mng_ops; fl->fl_flags = FL_DELEG; fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? 
F_RDLCK: F_WRLCK; fl->fl_end = OFFSET_MAX; - fl->fl_owner = (fl_owner_t)(dp->dl_file); + fl->fl_owner = (fl_owner_t)fp; fl->fl_pid = current->tgid; return fl; } static int nfs4_setlease(struct nfs4_delegation *dp) { - struct nfs4_file *fp = dp->dl_file; - struct file_lock *fl; - int status; + struct nfs4_file *fp = dp->dl_stid.sc_file; + struct file_lock *fl, *ret; + struct file *filp; + int status = 0; - fl = nfs4_alloc_init_lease(dp, NFS4_OPEN_DELEGATE_READ); + fl = nfs4_alloc_init_lease(fp, NFS4_OPEN_DELEGATE_READ); if (!fl) return -ENOMEM; - fl->fl_file = find_readable_file(fp); - status = vfs_setlease(fl->fl_file, fl->fl_type, &fl); + filp = find_readable_file(fp); + if (!filp) { + /* We should always have a readable file here */ + WARN_ON_ONCE(1); + return -EBADF; + } + fl->fl_file = filp; + ret = fl; + status = vfs_setlease(filp, fl->fl_type, &fl, NULL); + if (fl) + locks_free_lock(fl); if (status) - goto out_free; - fp->fi_lease = fl; - fp->fi_deleg_file = get_file(fl->fl_file); - atomic_set(&fp->fi_delegees, 1); + goto out_fput; spin_lock(&state_lock); + spin_lock(&fp->fi_lock); + /* Did the lease get broken before we took the lock? */ + status = -EAGAIN; + if (fp->fi_had_conflict) + goto out_unlock; + /* Race breaker */ + if (fp->fi_deleg_file) { + status = 0; + atomic_inc(&fp->fi_delegees); + hash_delegation_locked(dp, fp); + goto out_unlock; + } + fp->fi_deleg_file = filp; + atomic_set(&fp->fi_delegees, 1); hash_delegation_locked(dp, fp); + spin_unlock(&fp->fi_lock); spin_unlock(&state_lock); return 0; -out_free: - locks_free_lock(fl); +out_unlock: + spin_unlock(&fp->fi_lock); + spin_unlock(&state_lock); +out_fput: + fput(filp); return status; } -static int nfs4_set_delegation(struct nfs4_delegation *dp, struct nfs4_file *fp) +static struct nfs4_delegation * +nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, + struct nfs4_file *fp) { + int status; + struct nfs4_delegation *dp; + if (fp->fi_had_conflict) - return -EAGAIN; + return ERR_PTR(-EAGAIN); + + dp = alloc_init_deleg(clp, fh); + if (!dp) + return ERR_PTR(-ENOMEM); + get_nfs4_file(fp); - dp->dl_file = fp; - if (!fp->fi_lease) - return nfs4_setlease(dp); spin_lock(&state_lock); + spin_lock(&fp->fi_lock); + dp->dl_stid.sc_file = fp; + if (!fp->fi_deleg_file) { + spin_unlock(&fp->fi_lock); + spin_unlock(&state_lock); + status = nfs4_setlease(dp); + goto out; + } atomic_inc(&fp->fi_delegees); if (fp->fi_had_conflict) { - spin_unlock(&state_lock); - return -EAGAIN; + status = -EAGAIN; + goto out_unlock; } hash_delegation_locked(dp, fp); + status = 0; +out_unlock: + spin_unlock(&fp->fi_lock); spin_unlock(&state_lock); - return 0; +out: + if (status) { + nfs4_put_stid(&dp->dl_stid); + return ERR_PTR(status); + } + return dp; } static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status) @@ -3212,11 +3937,12 @@ static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status) * proper support for them. 
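[Editorial aside, not part of the patch: nfs4_setlease() above is a good example of the optimistic-setup pattern this series leans on: the sleepable work (vfs_setlease()) is done with no spinlocks held, and only then are state_lock and fi_lock taken to re-check whether the file picked up a conflict or a delegation in the meantime. Condensed, with error unwinding omitted:

	status = vfs_setlease(filp, fl->fl_type, &fl, NULL);	// may sleep
	spin_lock(&state_lock);
	spin_lock(&fp->fi_lock);
	if (fp->fi_had_conflict)
		status = -EAGAIN;		// lease broken while we slept
	else if (fp->fi_deleg_file)
		atomic_inc(&fp->fi_delegees);	// race breaker: share winner's lease
	else
		fp->fi_deleg_file = filp;	// we won; publish our lease
	spin_unlock(&fp->fi_lock);
	spin_unlock(&state_lock);
]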
*/ static void -nfs4_open_delegation(struct net *net, struct svc_fh *fh, - struct nfsd4_open *open, struct nfs4_ol_stateid *stp) +nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, + struct nfs4_ol_stateid *stp) { struct nfs4_delegation *dp; - struct nfs4_openowner *oo = container_of(stp->st_stateowner, struct nfs4_openowner, oo_owner); + struct nfs4_openowner *oo = openowner(stp->st_stateowner); + struct nfs4_client *clp = stp->st_stid.sc_client; int cb_up; int status = 0; @@ -3235,7 +3961,7 @@ nfs4_open_delegation(struct net *net, struct svc_fh *fh, * Let's not give out any delegations till everyone's * had the chance to reclaim theirs.... */ - if (locks_in_grace(net)) + if (locks_in_grace(clp->net)) goto out_no_deleg; if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED)) goto out_no_deleg; @@ -3254,21 +3980,17 @@ nfs4_open_delegation(struct net *net, struct svc_fh *fh, default: goto out_no_deleg; } - dp = alloc_init_deleg(oo->oo_owner.so_client, stp, fh); - if (dp == NULL) + dp = nfs4_set_delegation(clp, fh, stp->st_stid.sc_file); + if (IS_ERR(dp)) goto out_no_deleg; - status = nfs4_set_delegation(dp, stp->st_file); - if (status) - goto out_free; memcpy(&open->op_delegate_stateid, &dp->dl_stid.sc_stateid, sizeof(dp->dl_stid.sc_stateid)); dprintk("NFSD: delegation stateid=" STATEID_FMT "\n", STATEID_VAL(&dp->dl_stid.sc_stateid)); open->op_delegate_type = NFS4_OPEN_DELEGATE_READ; + nfs4_put_stid(&dp->dl_stid); return; -out_free: - destroy_delegation(dp); out_no_deleg: open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE; if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS && @@ -3301,16 +4023,12 @@ static void nfsd4_deleg_xgrade_none_ext(struct nfsd4_open *open, */ } -/* - * called with nfs4_lock_state() held. - */ __be32 nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) { struct nfsd4_compoundres *resp = rqstp->rq_resp; struct nfs4_client *cl = open->op_openowner->oo_owner.so_client; struct nfs4_file *fp = NULL; - struct inode *ino = current_fh->fh_dentry->d_inode; struct nfs4_ol_stateid *stp = NULL; struct nfs4_delegation *dp = NULL; __be32 status; @@ -3320,21 +4038,18 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf * and check for delegations in the process of being recalled. 
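[Editorial aside, not part of the patch: find_or_add_file() consumes the caller's preallocated nfs4_file only when no hashed entry already exists, so the caller can tell the two cases apart by pointer comparison, as the code below does:

	fp = find_or_add_file(open->op_file, &current_fh->fh_handle);
	if (fp == open->op_file)	// the preallocation was hashed in...
		open->op_file = NULL;	// ...so ownership moved to the hash
]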
* If not found, create the nfs4_file struct */ - fp = find_file(ino); - if (fp) { - if ((status = nfs4_check_open(fp, open, &stp))) - goto out; + fp = find_or_add_file(open->op_file, &current_fh->fh_handle); + if (fp != open->op_file) { status = nfs4_check_deleg(cl, open, &dp); if (status) goto out; + stp = nfsd4_find_existing_open(fp, open); } else { + open->op_file = NULL; status = nfserr_bad_stateid; if (nfsd4_is_deleg_cur(open)) goto out; status = nfserr_jukebox; - fp = open->op_file; - open->op_file = NULL; - nfsd4_init_file(fp, ino); } /* @@ -3347,22 +4062,19 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf if (status) goto out; } else { - status = nfs4_get_vfs_file(rqstp, fp, current_fh, open); - if (status) - goto out; - status = nfsd4_truncate(rqstp, current_fh, open); - if (status) - goto out; stp = open->op_stp; open->op_stp = NULL; init_open_stateid(stp, fp, open); + status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open); + if (status) { + release_open_stateid(stp); + goto out; + } } update_stateid(&stp->st_stid.sc_stateid); memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); if (nfsd4_has_session(&resp->cstate)) { - open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; - if (open->op_deleg_want & NFS4_SHARE_WANT_NO_DELEG) { open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE_EXT; open->op_why_no_deleg = WND4_NOT_WANTED; @@ -3374,7 +4086,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf * Attempt to hand out a delegation. No error return, because the * OPEN succeeds even if we fail. */ - nfs4_open_delegation(SVC_NET(rqstp), current_fh, open, stp); + nfs4_open_delegation(current_fh, open, stp); nodeleg: status = nfs_ok; @@ -3397,41 +4109,27 @@ out: if (!(open->op_openowner->oo_flags & NFS4_OO_CONFIRMED) && !nfsd4_has_session(&resp->cstate)) open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM; + if (dp) + nfs4_put_stid(&dp->dl_stid); + if (stp) + nfs4_put_stid(&stp->st_stid); return status; } -void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status) +void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate, + struct nfsd4_open *open, __be32 status) { if (open->op_openowner) { - struct nfs4_openowner *oo = open->op_openowner; + struct nfs4_stateowner *so = &open->op_openowner->oo_owner; - if (!list_empty(&oo->oo_owner.so_stateids)) - list_del_init(&oo->oo_close_lru); - if (oo->oo_flags & NFS4_OO_NEW) { - if (status) { - release_openowner(oo); - open->op_openowner = NULL; - } else - oo->oo_flags &= ~NFS4_OO_NEW; - } + nfsd4_cstate_assign_replay(cstate, so); + nfs4_put_stateowner(so); } if (open->op_file) nfsd4_free_file(open->op_file); if (open->op_stp) - free_generic_stateid(open->op_stp); -} - -static __be32 lookup_clientid(clientid_t *clid, bool session, struct nfsd_net *nn, struct nfs4_client **clp) -{ - struct nfs4_client *found; - - if (STALE_CLIENTID(clid, nn)) - return nfserr_stale_clientid; - found = find_confirmed_client(clid, session, nn); - if (clp) - *clp = found; - return found ? 
nfs_ok : nfserr_expired; + nfs4_put_stid(&open->op_stp->st_stid); } __be32 @@ -3442,23 +4140,22 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, __be32 status; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); - nfs4_lock_state(); dprintk("process_renew(%08x/%08x): starting\n", clid->cl_boot, clid->cl_id); - status = lookup_clientid(clid, cstate->minorversion, nn, &clp); + status = lookup_clientid(clid, cstate, nn); if (status) goto out; + clp = cstate->clp; status = nfserr_cb_path_down; if (!list_empty(&clp->cl_delegations) && clp->cl_cb_state != NFSD4_CB_UP) goto out; status = nfs_ok; out: - nfs4_unlock_state(); return status; } -static void +void nfsd4_end_grace(struct nfsd_net *nn) { /* do nothing if grace period already ended */ @@ -3467,14 +4164,28 @@ nfsd4_end_grace(struct nfsd_net *nn) dprintk("NFSD: end of grace period\n"); nn->grace_ended = true; - nfsd4_record_grace_done(nn, nn->boot_time); + /* + * If the server goes down again right now, an NFSv4 + * client will still be allowed to reclaim after it comes back up, + * even if it hasn't yet had a chance to reclaim state this time. + * + */ + nfsd4_record_grace_done(nn); + /* + * At this point, NFSv4 clients can still reclaim. But if the + * server crashes, any that have not yet reclaimed will be out + * of luck on the next boot. + * + * (NFSv4.1+ clients are considered to have reclaimed once they + * call RECLAIM_COMPLETE. NFSv4.0 clients are considered to + * have reclaimed after their first OPEN.) + */ locks_end_grace(&nn->nfsd4_manager); /* - * Now that every NFSv4 client has had the chance to recover and - * to see the (possibly new, possibly shorter) lease time, we - * can safely set the next grace time to the current lease time: + * At this point, and once lockd and/or any other containers + * exit their grace period, further reclaims will fail and + * regular locking can resume. 
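[Editorial aside, not part of the patch: "further reclaims will fail and regular locking can resume" refers to grace-period checks of the form this file already uses, which stop short-circuiting once every lock manager has called locks_end_grace(). A sketch of such a check (returning nfserr_grace is an assumption mirroring other nfsd paths, not code from this hunk):

	if (locks_in_grace(net))	// some lock manager still in grace
		return nfserr_grace;	// ordinary locking must wait
]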
*/ - nn->nfsd4_grace = nn->nfsd4_lease; } static time_t @@ -3483,12 +4194,11 @@ nfs4_laundromat(struct nfsd_net *nn) struct nfs4_client *clp; struct nfs4_openowner *oo; struct nfs4_delegation *dp; + struct nfs4_ol_stateid *stp; struct list_head *pos, *next, reaplist; time_t cutoff = get_seconds() - nn->nfsd4_lease; time_t t, new_timeo = nn->nfsd4_lease; - nfs4_lock_state(); - dprintk("NFSD: laundromat service - starting\n"); nfsd4_end_grace(nn); INIT_LIST_HEAD(&reaplist); @@ -3505,13 +4215,14 @@ nfs4_laundromat(struct nfsd_net *nn) clp->cl_clientid.cl_id); continue; } - list_move(&clp->cl_lru, &reaplist); + list_add(&clp->cl_lru, &reaplist); } spin_unlock(&nn->client_lock); list_for_each_safe(pos, next, &reaplist) { clp = list_entry(pos, struct nfs4_client, cl_lru); dprintk("NFSD: purging unused client (clientid %08x)\n", clp->cl_clientid.cl_id); + list_del_init(&clp->cl_lru); expire_client(clp); } spin_lock(&state_lock); @@ -3524,24 +4235,37 @@ nfs4_laundromat(struct nfsd_net *nn) new_timeo = min(new_timeo, t); break; } - list_move(&dp->dl_recall_lru, &reaplist); + unhash_delegation_locked(dp); + list_add(&dp->dl_recall_lru, &reaplist); } spin_unlock(&state_lock); - list_for_each_safe(pos, next, &reaplist) { - dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); + while (!list_empty(&reaplist)) { + dp = list_first_entry(&reaplist, struct nfs4_delegation, + dl_recall_lru); + list_del_init(&dp->dl_recall_lru); revoke_delegation(dp); } - list_for_each_safe(pos, next, &nn->close_lru) { - oo = container_of(pos, struct nfs4_openowner, oo_close_lru); - if (time_after((unsigned long)oo->oo_time, (unsigned long)cutoff)) { + + spin_lock(&nn->client_lock); + while (!list_empty(&nn->close_lru)) { + oo = list_first_entry(&nn->close_lru, struct nfs4_openowner, + oo_close_lru); + if (time_after((unsigned long)oo->oo_time, + (unsigned long)cutoff)) { t = oo->oo_time - cutoff; new_timeo = min(new_timeo, t); break; } - release_openowner(oo); + list_del_init(&oo->oo_close_lru); + stp = oo->oo_last_closed_stid; + oo->oo_last_closed_stid = NULL; + spin_unlock(&nn->client_lock); + nfs4_put_stid(&stp->st_stid); + spin_lock(&nn->client_lock); } + spin_unlock(&nn->client_lock); + new_timeo = max_t(time_t, new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT); - nfs4_unlock_state(); return new_timeo; } @@ -3564,7 +4288,7 @@ laundromat_main(struct work_struct *laundry) static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp) { - if (fhp->fh_dentry->d_inode != stp->st_file->fi_inode) + if (!nfsd_fh_match(&fhp->fh_handle, &stp->st_stid.sc_file->fi_fhandle)) return nfserr_bad_stateid; return nfs_ok; } @@ -3666,10 +4390,10 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) { struct nfs4_stid *s; struct nfs4_ol_stateid *ols; - __be32 status; + __be32 status = nfserr_bad_stateid; if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) - return nfserr_bad_stateid; + return status; /* Client debugging aid. 
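[Editorial aside, not part of the patch: the rework below keeps both the stateid lookup and the generation check under cl->cl_lock, so the stid cannot be unhashed between the two steps:

	spin_lock(&cl->cl_lock);
	s = find_stateid_locked(cl, stateid);
	if (s)
		status = check_stateid_generation(stateid, &s->sc_stateid, 1);
	spin_unlock(&cl->cl_lock);
]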
*/ if (!same_clid(&stateid->si_opaque.so_clid, &cl->cl_clientid)) { char addr_str[INET6_ADDRSTRLEN]; @@ -3677,53 +4401,62 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) sizeof(addr_str)); pr_warn_ratelimited("NFSD: client %s testing state ID " "with incorrect client ID\n", addr_str); - return nfserr_bad_stateid; + return status; } - s = find_stateid(cl, stateid); + spin_lock(&cl->cl_lock); + s = find_stateid_locked(cl, stateid); if (!s) - return nfserr_bad_stateid; + goto out_unlock; status = check_stateid_generation(stateid, &s->sc_stateid, 1); if (status) - return status; + goto out_unlock; switch (s->sc_type) { case NFS4_DELEG_STID: - return nfs_ok; + status = nfs_ok; + break; case NFS4_REVOKED_DELEG_STID: - return nfserr_deleg_revoked; + status = nfserr_deleg_revoked; + break; case NFS4_OPEN_STID: case NFS4_LOCK_STID: ols = openlockstateid(s); if (ols->st_stateowner->so_is_open_owner && !(openowner(ols->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED)) - return nfserr_bad_stateid; - return nfs_ok; + status = nfserr_bad_stateid; + else + status = nfs_ok; + break; default: printk("unknown stateid type %x\n", s->sc_type); + /* Fallthrough */ case NFS4_CLOSED_STID: - return nfserr_bad_stateid; + case NFS4_CLOSED_DELEG_STID: + status = nfserr_bad_stateid; } +out_unlock: + spin_unlock(&cl->cl_lock); + return status; } -static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask, - struct nfs4_stid **s, bool sessions, - struct nfsd_net *nn) +static __be32 +nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, + stateid_t *stateid, unsigned char typemask, + struct nfs4_stid **s, struct nfsd_net *nn) { - struct nfs4_client *cl; __be32 status; if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) return nfserr_bad_stateid; - status = lookup_clientid(&stateid->si_opaque.so_clid, sessions, - nn, &cl); + status = lookup_clientid(&stateid->si_opaque.so_clid, cstate, nn); if (status == nfserr_stale_clientid) { - if (sessions) + if (cstate->session) return nfserr_bad_stateid; return nfserr_stale_stateid; } if (status) return status; - *s = find_stateid_by_type(cl, stateid, typemask); + *s = find_stateid_by_type(cstate->clp, stateid, typemask); if (!*s) return nfserr_bad_stateid; return nfs_ok; @@ -3754,12 +4487,11 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate, if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) return check_special_stateids(net, current_fh, stateid, flags); - nfs4_lock_state(); - - status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, - &s, cstate->minorversion, nn); + status = nfsd4_lookup_stateid(cstate, stateid, + NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, + &s, nn); if (status) - goto out; + return status; status = check_stateid_generation(stateid, &s->sc_stateid, nfsd4_has_session(cstate)); if (status) goto out; @@ -3770,12 +4502,13 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate, if (status) goto out; if (filpp) { - file = dp->dl_file->fi_deleg_file; + file = dp->dl_stid.sc_file->fi_deleg_file; if (!file) { WARN_ON_ONCE(1); status = nfserr_serverfault; goto out; } + get_file(file); } break; case NFS4_OPEN_STID: @@ -3791,10 +4524,12 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate, if (status) goto out; if (filpp) { + struct nfs4_file *fp = stp->st_stid.sc_file; + if (flags & RD_STATE) - file = find_readable_file(stp->st_file); + file = find_readable_file(fp); else - file = 
find_writeable_file(stp->st_file); + file = find_writeable_file(fp); } break; default: @@ -3803,28 +4538,12 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate, } status = nfs_ok; if (file) - *filpp = get_file(file); + *filpp = file; out: - nfs4_unlock_state(); + nfs4_put_stid(s); return status; } -static __be32 -nfsd4_free_lock_stateid(struct nfs4_ol_stateid *stp) -{ - struct nfs4_lockowner *lo = lockowner(stp->st_stateowner); - - if (check_for_locks(stp->st_file, lo)) - return nfserr_locks_held; - /* - * Currently there's a 1-1 lock stateid<->lockowner - * correspondance, and we have to delete the lockowner when we - * delete the lock stateid: - */ - release_lockowner(lo); - return nfs_ok; -} - /* * Test if the stateid is valid */ @@ -3835,11 +4554,9 @@ nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_test_stateid_id *stateid; struct nfs4_client *cl = cstate->session->se_client; - nfs4_lock_state(); list_for_each_entry(stateid, &test_stateid->ts_stateid_list, ts_id_list) stateid->ts_id_status = nfsd4_validate_stateid(cl, &stateid->ts_id_stateid); - nfs4_unlock_state(); return nfs_ok; } @@ -3851,37 +4568,50 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stateid_t *stateid = &free_stateid->fr_stateid; struct nfs4_stid *s; struct nfs4_delegation *dp; + struct nfs4_ol_stateid *stp; struct nfs4_client *cl = cstate->session->se_client; __be32 ret = nfserr_bad_stateid; - nfs4_lock_state(); - s = find_stateid(cl, stateid); + spin_lock(&cl->cl_lock); + s = find_stateid_locked(cl, stateid); if (!s) - goto out; + goto out_unlock; switch (s->sc_type) { case NFS4_DELEG_STID: ret = nfserr_locks_held; - goto out; + break; case NFS4_OPEN_STID: - case NFS4_LOCK_STID: ret = check_stateid_generation(stateid, &s->sc_stateid, 1); if (ret) - goto out; - if (s->sc_type == NFS4_LOCK_STID) - ret = nfsd4_free_lock_stateid(openlockstateid(s)); - else - ret = nfserr_locks_held; + break; + ret = nfserr_locks_held; break; + case NFS4_LOCK_STID: + ret = check_stateid_generation(stateid, &s->sc_stateid, 1); + if (ret) + break; + stp = openlockstateid(s); + ret = nfserr_locks_held; + if (check_for_locks(stp->st_stid.sc_file, + lockowner(stp->st_stateowner))) + break; + unhash_lock_stateid(stp); + spin_unlock(&cl->cl_lock); + nfs4_put_stid(s); + ret = nfs_ok; + goto out; case NFS4_REVOKED_DELEG_STID: dp = delegstateid(s); - destroy_revoked_delegation(dp); + list_del_init(&dp->dl_recall_lru); + spin_unlock(&cl->cl_lock); + nfs4_put_stid(s); ret = nfs_ok; - break; - default: - ret = nfserr_bad_stateid; + goto out; + /* Default falls through and returns nfserr_bad_stateid */ } +out_unlock: + spin_unlock(&cl->cl_lock); out: - nfs4_unlock_state(); return ret; } @@ -3926,20 +4656,24 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, { __be32 status; struct nfs4_stid *s; + struct nfs4_ol_stateid *stp = NULL; dprintk("NFSD: %s: seqid=%d stateid = " STATEID_FMT "\n", __func__, seqid, STATEID_VAL(stateid)); *stpp = NULL; - status = nfsd4_lookup_stateid(stateid, typemask, &s, - cstate->minorversion, nn); + status = nfsd4_lookup_stateid(cstate, stateid, typemask, &s, nn); if (status) return status; - *stpp = openlockstateid(s); - if (!nfsd4_has_session(cstate)) - cstate->replay_owner = (*stpp)->st_stateowner; + stp = openlockstateid(s); + nfsd4_cstate_assign_replay(cstate, stp->st_stateowner); - return nfs4_seqid_op_checks(cstate, stateid, seqid, *stpp); + status = nfs4_seqid_op_checks(cstate, 
stateid, seqid, stp); + if (!status) + *stpp = stp; + else + nfs4_put_stid(&stp->st_stid); + return status; } static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, @@ -3947,14 +4681,18 @@ static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cs { __be32 status; struct nfs4_openowner *oo; + struct nfs4_ol_stateid *stp; status = nfs4_preprocess_seqid_op(cstate, seqid, stateid, - NFS4_OPEN_STID, stpp, nn); + NFS4_OPEN_STID, &stp, nn); if (status) return status; - oo = openowner((*stpp)->st_stateowner); - if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) + oo = openowner(stp->st_stateowner); + if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) { + nfs4_put_stid(&stp->st_stid); return nfserr_bad_stateid; + } + *stpp = stp; return nfs_ok; } @@ -3974,8 +4712,6 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) return status; - nfs4_lock_state(); - status = nfs4_preprocess_seqid_op(cstate, oc->oc_seqid, &oc->oc_req_stateid, NFS4_OPEN_STID, &stp, nn); @@ -3984,7 +4720,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, oo = openowner(stp->st_stateowner); status = nfserr_bad_stateid; if (oo->oo_flags & NFS4_OO_CONFIRMED) - goto out; + goto put_stateid; oo->oo_flags |= NFS4_OO_CONFIRMED; update_stateid(&stp->st_stid.sc_stateid); memcpy(&oc->oc_resp_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); @@ -3993,10 +4729,10 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfsd4_client_record_create(oo->oo_owner.so_client); status = nfs_ok; +put_stateid: + nfs4_put_stid(&stp->st_stid); out: nfsd4_bump_seqid(cstate, status); - if (!cstate->replay_owner) - nfs4_unlock_state(); return status; } @@ -4004,7 +4740,7 @@ static inline void nfs4_stateid_downgrade_bit(struct nfs4_ol_stateid *stp, u32 a { if (!test_access(access, stp)) return; - nfs4_file_put_access(stp->st_file, nfs4_access_to_omode(access)); + nfs4_file_put_access(stp->st_stid.sc_file, access); clear_access(access, stp); } @@ -4026,16 +4762,6 @@ static inline void nfs4_stateid_downgrade(struct nfs4_ol_stateid *stp, u32 to_ac } } -static void -reset_union_bmap_deny(unsigned long deny, struct nfs4_ol_stateid *stp) -{ - int i; - for (i = 0; i < 4; i++) { - if ((i & deny) != i) - clear_deny(i, stp); - } -} - __be32 nfsd4_open_downgrade(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, @@ -4053,21 +4779,20 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, dprintk("NFSD: %s: od_deleg_want=0x%x ignored\n", __func__, od->od_deleg_want); - nfs4_lock_state(); status = nfs4_preprocess_confirmed_seqid_op(cstate, od->od_seqid, &od->od_stateid, &stp, nn); if (status) goto out; status = nfserr_inval; if (!test_access(od->od_share_access, stp)) { - dprintk("NFSD: access not a subset current bitmap: 0x%lx, input access=%08x\n", + dprintk("NFSD: access not a subset of current bitmap: 0x%hhx, input access=%08x\n", stp->st_access_bmap, od->od_share_access); - goto out; + goto put_stateid; } if (!test_deny(od->od_share_deny, stp)) { - dprintk("NFSD:deny not a subset current bitmap: 0x%lx, input deny=%08x\n", + dprintk("NFSD: deny not a subset of current bitmap: 0x%hhx, input deny=%08x\n", stp->st_deny_bmap, od->od_share_deny); - goto out; + goto put_stateid; } nfs4_stateid_downgrade(stp, od->od_share_access); @@ -4076,17 +4801,31 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, update_stateid(&stp->st_stid.sc_stateid); memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); status = 
nfs_ok; +put_stateid: + nfs4_put_stid(&stp->st_stid); out: nfsd4_bump_seqid(cstate, status); - if (!cstate->replay_owner) - nfs4_unlock_state(); return status; } static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s) { - unhash_open_stateid(s); + struct nfs4_client *clp = s->st_stid.sc_client; + LIST_HEAD(reaplist); + s->st_stid.sc_type = NFS4_CLOSED_STID; + spin_lock(&clp->cl_lock); + unhash_open_stateid(s, &reaplist); + + if (clp->cl_minorversion) { + put_ol_stateid_locked(s, &reaplist); + spin_unlock(&clp->cl_lock); + free_ol_stateid_reaplist(&reaplist); + } else { + spin_unlock(&clp->cl_lock); + free_ol_stateid_reaplist(&reaplist); + move_to_close_lru(s, clp->net); + } } /* @@ -4097,7 +4836,6 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_close *close) { __be32 status; - struct nfs4_openowner *oo; struct nfs4_ol_stateid *stp; struct net *net = SVC_NET(rqstp); struct nfsd_net *nn = net_generic(net, nfsd_net_id); @@ -4105,7 +4843,6 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, dprintk("NFSD: nfsd4_close on file %pd\n", cstate->current_fh.fh_dentry); - nfs4_lock_state(); status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid, &close->cl_stateid, NFS4_OPEN_STID|NFS4_CLOSED_STID, @@ -4113,31 +4850,14 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfsd4_bump_seqid(cstate, status); if (status) goto out; - oo = openowner(stp->st_stateowner); update_stateid(&stp->st_stid.sc_stateid); memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); nfsd4_close_open_stateid(stp); - if (cstate->minorversion) - free_generic_stateid(stp); - else - oo->oo_last_closed_stid = stp; - - if (list_empty(&oo->oo_owner.so_stateids)) { - if (cstate->minorversion) - release_openowner(oo); - else { - /* - * In the 4.0 case we need to keep the owners around a - * little while to handle CLOSE replay. - */ - move_to_close_lru(oo, SVC_NET(rqstp)); - } - } + /* put reference from nfs4_preprocess_seqid_op */ + nfs4_put_stid(&stp->st_stid); out: - if (!cstate->replay_owner) - nfs4_unlock_state(); return status; } @@ -4154,28 +4874,24 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) return status; - nfs4_lock_state(); - status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID, &s, - cstate->minorversion, nn); + status = nfsd4_lookup_stateid(cstate, stateid, NFS4_DELEG_STID, &s, nn); if (status) goto out; dp = delegstateid(s); status = check_stateid_generation(stateid, &dp->dl_stid.sc_stateid, nfsd4_has_session(cstate)); if (status) - goto out; + goto put_stateid; destroy_delegation(dp); +put_stateid: + nfs4_put_stid(&dp->dl_stid); out: - nfs4_unlock_state(); - return status; } #define LOFF_OVERFLOW(start, len) ((u64)(len) > ~(u64)(start)) -#define LOCKOWNER_INO_HASH_MASK (LOCKOWNER_INO_HASH_SIZE - 1) - static inline u64 end_offset(u64 start, u64 len) { @@ -4196,13 +4912,6 @@ last_byte_offset(u64 start, u64 len) return end > start ? 
end - 1: NFS4_MAX_UINT64; } -static unsigned int lockowner_ino_hashval(struct inode *inode, u32 cl_id, struct xdr_netobj *ownername) -{ - return (file_hashval(inode) + cl_id - + opaque_hashval(ownername->data, ownername->len)) - & LOCKOWNER_INO_HASH_MASK; -} - /* * TODO: Linux file offsets are _signed_ 64-bit quantities, which means that * we can't properly handle lock requests that go beyond the (2^63 - 1)-th @@ -4220,9 +4929,25 @@ nfs4_transform_lock_offset(struct file_lock *lock) lock->fl_end = OFFSET_MAX; } -/* Hack!: For now, we're defining this just so we can use a pointer to it - * as a unique cookie to identify our (NFSv4's) posix locks. */ +static void nfsd4_fl_get_owner(struct file_lock *dst, struct file_lock *src) +{ + struct nfs4_lockowner *lo = (struct nfs4_lockowner *)src->fl_owner; + dst->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(&lo->lo_owner)); +} + +static void nfsd4_fl_put_owner(struct file_lock *fl) +{ + struct nfs4_lockowner *lo = (struct nfs4_lockowner *)fl->fl_owner; + + if (lo) { + nfs4_put_stateowner(&lo->lo_owner); + fl->fl_owner = NULL; + } +} + static const struct lock_manager_operations nfsd_posix_mng_ops = { + .lm_get_owner = nfsd4_fl_get_owner, + .lm_put_owner = nfsd4_fl_put_owner, }; static inline void @@ -4255,47 +4980,54 @@ nevermind: deny->ld_type = NFS4_WRITE_LT; } -static bool same_lockowner_ino(struct nfs4_lockowner *lo, struct inode *inode, clientid_t *clid, struct xdr_netobj *owner) +static struct nfs4_lockowner * +find_lockowner_str_locked(clientid_t *clid, struct xdr_netobj *owner, + struct nfs4_client *clp) { - struct nfs4_ol_stateid *lst; + unsigned int strhashval = ownerstr_hashval(owner); + struct nfs4_stateowner *so; - if (!same_owner_str(&lo->lo_owner, owner, clid)) - return false; - if (list_empty(&lo->lo_owner.so_stateids)) { - WARN_ON_ONCE(1); - return false; + lockdep_assert_held(&clp->cl_lock); + + list_for_each_entry(so, &clp->cl_ownerstr_hashtbl[strhashval], + so_strhash) { + if (so->so_is_open_owner) + continue; + if (same_owner_str(so, owner)) + return lockowner(nfs4_get_stateowner(so)); } - lst = list_first_entry(&lo->lo_owner.so_stateids, - struct nfs4_ol_stateid, st_perstateowner); - return lst->st_file->fi_inode == inode; + return NULL; } static struct nfs4_lockowner * -find_lockowner_str(struct inode *inode, clientid_t *clid, - struct xdr_netobj *owner, struct nfsd_net *nn) +find_lockowner_str(clientid_t *clid, struct xdr_netobj *owner, + struct nfs4_client *clp) { - unsigned int hashval = lockowner_ino_hashval(inode, clid->cl_id, owner); struct nfs4_lockowner *lo; - list_for_each_entry(lo, &nn->lockowner_ino_hashtbl[hashval], lo_owner_ino_hash) { - if (same_lockowner_ino(lo, inode, clid, owner)) - return lo; - } - return NULL; + spin_lock(&clp->cl_lock); + lo = find_lockowner_str_locked(clid, owner, clp); + spin_unlock(&clp->cl_lock); + return lo; } -static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp) +static void nfs4_unhash_lockowner(struct nfs4_stateowner *sop) { - struct inode *inode = open_stp->st_file->fi_inode; - unsigned int inohash = lockowner_ino_hashval(inode, - clp->cl_clientid.cl_id, &lo->lo_owner.so_owner); - struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + unhash_lockowner_locked(lockowner(sop)); +} - list_add(&lo->lo_owner.so_strhash, &nn->ownerstr_hashtbl[strhashval]); - list_add(&lo->lo_owner_ino_hash, &nn->lockowner_ino_hashtbl[inohash]); - list_add(&lo->lo_perstateid, &open_stp->st_lockowners); +static 
void nfs4_free_lockowner(struct nfs4_stateowner *sop) +{ + struct nfs4_lockowner *lo = lockowner(sop); + + kmem_cache_free(lockowner_slab, lo); } +static const struct nfs4_stateowner_operations lockowner_ops = { + .so_unhash = nfs4_unhash_lockowner, + .so_free = nfs4_free_lockowner, +}; + /* * Alloc a lock owner structure. * Called in nfsd4_lock - therefore, OPEN and OPEN_CONFIRM (if needed) has @@ -4303,42 +5035,106 @@ static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, s * * strhashval = ownerstr_hashval */ - static struct nfs4_lockowner * -alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp, struct nfsd4_lock *lock) { - struct nfs4_lockowner *lo; +alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, + struct nfs4_ol_stateid *open_stp, + struct nfsd4_lock *lock) +{ + struct nfs4_lockowner *lo, *ret; lo = alloc_stateowner(lockowner_slab, &lock->lk_new_owner, clp); if (!lo) return NULL; INIT_LIST_HEAD(&lo->lo_owner.so_stateids); lo->lo_owner.so_is_open_owner = 0; - /* It is the openowner seqid that will be incremented in encode in the - * case of new lockowners; so increment the lock seqid manually: */ - lo->lo_owner.so_seqid = lock->lk_new_lock_seqid + 1; - hash_lockowner(lo, strhashval, clp, open_stp); + lo->lo_owner.so_seqid = lock->lk_new_lock_seqid; + lo->lo_owner.so_ops = &lockowner_ops; + spin_lock(&clp->cl_lock); + ret = find_lockowner_str_locked(&clp->cl_clientid, + &lock->lk_new_owner, clp); + if (ret == NULL) { + list_add(&lo->lo_owner.so_strhash, + &clp->cl_ownerstr_hashtbl[strhashval]); + ret = lo; + } else + nfs4_free_lockowner(&lo->lo_owner); + spin_unlock(&clp->cl_lock); return lo; } -static struct nfs4_ol_stateid * -alloc_init_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp, struct nfs4_ol_stateid *open_stp) +static void +init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo, + struct nfs4_file *fp, struct inode *inode, + struct nfs4_ol_stateid *open_stp) { - struct nfs4_ol_stateid *stp; struct nfs4_client *clp = lo->lo_owner.so_client; - stp = nfs4_alloc_stateid(clp); - if (stp == NULL) - return NULL; + lockdep_assert_held(&clp->cl_lock); + + atomic_inc(&stp->st_stid.sc_count); stp->st_stid.sc_type = NFS4_LOCK_STID; - list_add(&stp->st_perfile, &fp->fi_stateids); - list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids); - stp->st_stateowner = &lo->lo_owner; + stp->st_stateowner = nfs4_get_stateowner(&lo->lo_owner); get_nfs4_file(fp); - stp->st_file = fp; + stp->st_stid.sc_file = fp; + stp->st_stid.sc_free = nfs4_free_lock_stateid; stp->st_access_bmap = 0; stp->st_deny_bmap = open_stp->st_deny_bmap; stp->st_openstp = open_stp; - return stp; + list_add(&stp->st_locks, &open_stp->st_locks); + list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids); + spin_lock(&fp->fi_lock); + list_add(&stp->st_perfile, &fp->fi_stateids); + spin_unlock(&fp->fi_lock); +} + +static struct nfs4_ol_stateid * +find_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp) +{ + struct nfs4_ol_stateid *lst; + struct nfs4_client *clp = lo->lo_owner.so_client; + + lockdep_assert_held(&clp->cl_lock); + + list_for_each_entry(lst, &lo->lo_owner.so_stateids, st_perstateowner) { + if (lst->st_stid.sc_file == fp) { + atomic_inc(&lst->st_stid.sc_count); + return lst; + } + } + return NULL; +} + +static struct nfs4_ol_stateid * +find_or_create_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fi, + struct inode *inode, struct nfs4_ol_stateid *ost, + 
bool *new) +{ + struct nfs4_stid *ns = NULL; + struct nfs4_ol_stateid *lst; + struct nfs4_openowner *oo = openowner(ost->st_stateowner); + struct nfs4_client *clp = oo->oo_owner.so_client; + + spin_lock(&clp->cl_lock); + lst = find_lock_stateid(lo, fi); + if (lst == NULL) { + spin_unlock(&clp->cl_lock); + ns = nfs4_alloc_stid(clp, stateid_slab); + if (ns == NULL) + return NULL; + + spin_lock(&clp->cl_lock); + lst = find_lock_stateid(lo, fi); + if (likely(!lst)) { + lst = openlockstateid(ns); + init_lock_stateid(lst, lo, fi, inode, ost); + ns = NULL; + *new = true; + } + } + spin_unlock(&clp->cl_lock); + if (ns) + nfs4_put_stid(ns); + return lst; } static int @@ -4350,46 +5146,53 @@ check_lock_length(u64 offset, u64 length) static void get_lock_access(struct nfs4_ol_stateid *lock_stp, u32 access) { - struct nfs4_file *fp = lock_stp->st_file; - int oflag = nfs4_access_to_omode(access); + struct nfs4_file *fp = lock_stp->st_stid.sc_file; + + lockdep_assert_held(&fp->fi_lock); if (test_access(access, lock_stp)) return; - nfs4_file_get_access(fp, oflag); + __nfs4_file_get_access(fp, access); set_access(access, lock_stp); } -static __be32 lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, struct nfs4_ol_stateid *ost, struct nfsd4_lock *lock, struct nfs4_ol_stateid **lst, bool *new) +static __be32 +lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, + struct nfs4_ol_stateid *ost, + struct nfsd4_lock *lock, + struct nfs4_ol_stateid **lst, bool *new) { - struct nfs4_file *fi = ost->st_file; + __be32 status; + struct nfs4_file *fi = ost->st_stid.sc_file; struct nfs4_openowner *oo = openowner(ost->st_stateowner); struct nfs4_client *cl = oo->oo_owner.so_client; + struct inode *inode = cstate->current_fh.fh_dentry->d_inode; struct nfs4_lockowner *lo; unsigned int strhashval; - struct nfsd_net *nn = net_generic(cl->net, nfsd_net_id); - lo = find_lockowner_str(fi->fi_inode, &cl->cl_clientid, - &lock->v.new.owner, nn); - if (lo) { - if (!cstate->minorversion) - return nfserr_bad_seqid; - /* XXX: a lockowner always has exactly one stateid: */ - *lst = list_first_entry(&lo->lo_owner.so_stateids, - struct nfs4_ol_stateid, st_perstateowner); - return nfs_ok; + lo = find_lockowner_str(&cl->cl_clientid, &lock->v.new.owner, cl); + if (!lo) { + strhashval = ownerstr_hashval(&lock->v.new.owner); + lo = alloc_init_lock_stateowner(strhashval, cl, ost, lock); + if (lo == NULL) + return nfserr_jukebox; + } else { + /* with an existing lockowner, seqids must be the same */ + status = nfserr_bad_seqid; + if (!cstate->minorversion && + lock->lk_new_lock_seqid != lo->lo_owner.so_seqid) + goto out; } - strhashval = ownerstr_hashval(cl->cl_clientid.cl_id, - &lock->v.new.owner); - lo = alloc_init_lock_stateowner(strhashval, cl, ost, lock); - if (lo == NULL) - return nfserr_jukebox; - *lst = alloc_init_lock_stateid(lo, fi, ost); + + *lst = find_or_create_lock_stateid(lo, fi, inode, ost, new); if (*lst == NULL) { - release_lockowner(lo); - return nfserr_jukebox; + status = nfserr_jukebox; + goto out; } - *new = true; - return nfs_ok; + status = nfs_ok; +out: + nfs4_put_stateowner(&lo->lo_owner); + return status; } /* @@ -4401,14 +5204,16 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { struct nfs4_openowner *open_sop = NULL; struct nfs4_lockowner *lock_sop = NULL; - struct nfs4_ol_stateid *lock_stp; + struct nfs4_ol_stateid *lock_stp = NULL; + struct nfs4_ol_stateid *open_stp = NULL; + struct nfs4_file *fp; struct file *filp = NULL; struct file_lock *file_lock = NULL; 
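[Editorial aside, not part of the patch: find_or_create_lock_stateid() just above follows the usual unlock/allocate/relock/recheck shape for populating a spinlock-protected list, since nfs4_alloc_stid() may sleep:

	spin_lock(&clp->cl_lock);
	lst = find_lock_stateid(lo, fi);
	if (!lst) {
		spin_unlock(&clp->cl_lock);
		ns = nfs4_alloc_stid(clp, stateid_slab);	// sleepable allocation
		spin_lock(&clp->cl_lock);
		lst = find_lock_stateid(lo, fi);		// did we lose a race?
		if (!lst)
			init_lock_stateid(...);			// no: use our allocation
	}
	spin_unlock(&clp->cl_lock);

The "..." elides the real argument list; a losing allocation is released with nfs4_put_stid() afterwards.]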
struct file_lock *conflock = NULL; __be32 status = 0; - bool new_state = false; int lkflg; int err; + bool new = false; struct net *net = SVC_NET(rqstp); struct nfsd_net *nn = net_generic(net, nfsd_net_id); @@ -4425,11 +5230,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return status; } - nfs4_lock_state(); - if (lock->lk_is_new) { - struct nfs4_ol_stateid *open_stp = NULL; - if (nfsd4_has_session(cstate)) /* See rfc 5661 18.10.3: given clientid is ignored: */ memcpy(&lock->v.new.clientid, @@ -4453,12 +5254,13 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, &lock->v.new.clientid)) goto out; status = lookup_or_create_lock_state(cstate, open_stp, lock, - &lock_stp, &new_state); - } else + &lock_stp, &new); + } else { status = nfs4_preprocess_seqid_op(cstate, lock->lk_old_lock_seqid, &lock->lk_old_lock_stateid, NFS4_LOCK_STID, &lock_stp, nn); + } if (status) goto out; lock_sop = lockowner(lock_stp->st_stateowner); @@ -4482,20 +5284,24 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } - locks_init_lock(file_lock); + fp = lock_stp->st_stid.sc_file; switch (lock->lk_type) { case NFS4_READ_LT: case NFS4_READW_LT: - filp = find_readable_file(lock_stp->st_file); + spin_lock(&fp->fi_lock); + filp = find_readable_file_locked(fp); if (filp) get_lock_access(lock_stp, NFS4_SHARE_ACCESS_READ); + spin_unlock(&fp->fi_lock); file_lock->fl_type = F_RDLCK; break; case NFS4_WRITE_LT: case NFS4_WRITEW_LT: - filp = find_writeable_file(lock_stp->st_file); + spin_lock(&fp->fi_lock); + filp = find_writeable_file_locked(fp); if (filp) get_lock_access(lock_stp, NFS4_SHARE_ACCESS_WRITE); + spin_unlock(&fp->fi_lock); file_lock->fl_type = F_WRLCK; break; default: @@ -4506,7 +5312,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfserr_openmode; goto out; } - file_lock->fl_owner = (fl_owner_t)lock_sop; + + file_lock->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(&lock_sop->lo_owner)); file_lock->fl_pid = current->tgid; file_lock->fl_file = filp; file_lock->fl_flags = FL_POSIX; @@ -4544,11 +5351,27 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, break; } out: - if (status && new_state) - release_lockowner(lock_sop); + if (filp) + fput(filp); + if (lock_stp) { + /* Bump seqid manually if the 4.0 replay owner is openowner */ + if (cstate->replay_owner && + cstate->replay_owner != &lock_sop->lo_owner && + seqid_mutating_err(ntohl(status))) + lock_sop->lo_owner.so_seqid++; + + /* + * If this is a new, never-before-used stateid, and we are + * returning an error, then just go ahead and release it. 
+ */ + if (status && new) + release_lock_stateid(lock_stp); + + nfs4_put_stid(&lock_stp->st_stid); + } + if (open_stp) + nfs4_put_stid(&open_stp->st_stid); nfsd4_bump_seqid(cstate, status); - if (!cstate->replay_owner) - nfs4_unlock_state(); if (file_lock) locks_free_lock(file_lock); if (conflock) @@ -4580,9 +5403,8 @@ __be32 nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_lockt *lockt) { - struct inode *inode; struct file_lock *file_lock = NULL; - struct nfs4_lockowner *lo; + struct nfs4_lockowner *lo = NULL; __be32 status; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); @@ -4592,10 +5414,8 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (check_lock_length(lockt->lt_offset, lockt->lt_length)) return nfserr_inval; - nfs4_lock_state(); - if (!nfsd4_has_session(cstate)) { - status = lookup_clientid(&lockt->lt_clientid, false, nn, NULL); + status = lookup_clientid(&lockt->lt_clientid, cstate, nn); if (status) goto out; } @@ -4603,14 +5423,13 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) goto out; - inode = cstate->current_fh.fh_dentry->d_inode; file_lock = locks_alloc_lock(); if (!file_lock) { dprintk("NFSD: %s: unable to allocate lock!\n", __func__); status = nfserr_jukebox; goto out; } - locks_init_lock(file_lock); + switch (lockt->lt_type) { case NFS4_READ_LT: case NFS4_READW_LT: @@ -4626,7 +5445,8 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } - lo = find_lockowner_str(inode, &lockt->lt_clientid, &lockt->lt_owner, nn); + lo = find_lockowner_str(&lockt->lt_clientid, &lockt->lt_owner, + cstate->clp); if (lo) file_lock->fl_owner = (fl_owner_t)lo; file_lock->fl_pid = current->tgid; @@ -4646,7 +5466,8 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_set_lock_denied(file_lock, &lockt->lt_denied); } out: - nfs4_unlock_state(); + if (lo) + nfs4_put_stateowner(&lo->lo_owner); if (file_lock) locks_free_lock(file_lock); return status; @@ -4670,27 +5491,25 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (check_lock_length(locku->lu_offset, locku->lu_length)) return nfserr_inval; - nfs4_lock_state(); - status = nfs4_preprocess_seqid_op(cstate, locku->lu_seqid, &locku->lu_stateid, NFS4_LOCK_STID, &stp, nn); if (status) goto out; - filp = find_any_file(stp->st_file); + filp = find_any_file(stp->st_stid.sc_file); if (!filp) { status = nfserr_lock_range; - goto out; + goto put_stateid; } file_lock = locks_alloc_lock(); if (!file_lock) { dprintk("NFSD: %s: unable to allocate lock!\n", __func__); status = nfserr_jukebox; - goto out; + goto fput; } - locks_init_lock(file_lock); + file_lock->fl_type = F_UNLCK; - file_lock->fl_owner = (fl_owner_t)lockowner(stp->st_stateowner); + file_lock->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(stp->st_stateowner)); file_lock->fl_pid = current->tgid; file_lock->fl_file = filp; file_lock->fl_flags = FL_POSIX; @@ -4708,41 +5527,51 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } update_stateid(&stp->st_stid.sc_stateid); memcpy(&locku->lu_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); - +fput: + fput(filp); +put_stateid: + nfs4_put_stid(&stp->st_stid); out: nfsd4_bump_seqid(cstate, status); - if (!cstate->replay_owner) - nfs4_unlock_state(); if (file_lock) locks_free_lock(file_lock); return status; out_nfserr: status = nfserrno(err); - goto out; + 
goto fput; } /* * returns - * 1: locks held by lockowner - * 0: no locks held by lockowner + * true: locks held by lockowner + * false: no locks held by lockowner */ -static int -check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner) +static bool +check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) { struct file_lock **flpp; - struct inode *inode = filp->fi_inode; - int status = 0; + int status = false; + struct file *filp = find_any_file(fp); + struct inode *inode; + + if (!filp) { + /* Any valid lock stateid should have some sort of access */ + WARN_ON_ONCE(1); + return status; + } + + inode = file_inode(filp); spin_lock(&inode->i_lock); for (flpp = &inode->i_flock; *flpp != NULL; flpp = &(*flpp)->fl_next) { if ((*flpp)->fl_owner == (fl_owner_t)lowner) { - status = 1; - goto out; + status = true; + break; } } -out: spin_unlock(&inode->i_lock); + fput(filp); return status; } @@ -4753,53 +5582,46 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, { clientid_t *clid = &rlockowner->rl_clientid; struct nfs4_stateowner *sop; - struct nfs4_lockowner *lo; + struct nfs4_lockowner *lo = NULL; struct nfs4_ol_stateid *stp; struct xdr_netobj *owner = &rlockowner->rl_owner; - struct list_head matches; - unsigned int hashval = ownerstr_hashval(clid->cl_id, owner); + unsigned int hashval = ownerstr_hashval(owner); __be32 status; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + struct nfs4_client *clp; dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n", clid->cl_boot, clid->cl_id); - nfs4_lock_state(); - - status = lookup_clientid(clid, cstate->minorversion, nn, NULL); + status = lookup_clientid(clid, cstate, nn); if (status) - goto out; + return status; - status = nfserr_locks_held; - INIT_LIST_HEAD(&matches); + clp = cstate->clp; + /* Find the matching lock stateowner */ + spin_lock(&clp->cl_lock); + list_for_each_entry(sop, &clp->cl_ownerstr_hashtbl[hashval], + so_strhash) { - list_for_each_entry(sop, &nn->ownerstr_hashtbl[hashval], so_strhash) { - if (sop->so_is_open_owner) + if (sop->so_is_open_owner || !same_owner_str(sop, owner)) continue; - if (!same_owner_str(sop, owner, clid)) - continue; - list_for_each_entry(stp, &sop->so_stateids, - st_perstateowner) { - lo = lockowner(sop); - if (check_for_locks(stp->st_file, lo)) - goto out; - list_add(&lo->lo_list, &matches); + + /* see if there are still any locks associated with it */ + lo = lockowner(sop); + list_for_each_entry(stp, &sop->so_stateids, st_perstateowner) { + if (check_for_locks(stp->st_stid.sc_file, lo)) { + status = nfserr_locks_held; + spin_unlock(&clp->cl_lock); + return status; + } } + + nfs4_get_stateowner(sop); + break; } - /* Clients probably won't expect us to return with some (but not all) - * of the lockowner state released; so don't release any until all - * have been checked. */ - status = nfs_ok; - while (!list_empty(&matches)) { - lo = list_entry(matches.next, struct nfs4_lockowner, - lo_list); - /* unhash_stateowner deletes so_perclient only - * for openowners. */ - list_del(&lo->lo_list); + spin_unlock(&clp->cl_lock); + if (lo) release_lockowner(lo); - } -out: - nfs4_unlock_state(); return status; } @@ -4887,34 +5709,126 @@ nfsd4_find_reclaim_client(const char *recdir, struct nfsd_net *nn) * Called from OPEN. Look for clientid in reclaim list. 
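[Editorial aside, not part of the patch: the expected caller shape for the updated nfs4_check_open_reclaim() (an assumption based on its new signature; the actual OPEN call site is outside this hunk) is roughly:

	status = nfs4_check_open_reclaim(&open->op_clientid, cstate, nn);
	if (status)		// nfserr_reclaim_bad or nfserr_no_grace
		return status;
]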
*/ __be32 -nfs4_check_open_reclaim(clientid_t *clid, bool sessions, struct nfsd_net *nn) +nfs4_check_open_reclaim(clientid_t *clid, + struct nfsd4_compound_state *cstate, + struct nfsd_net *nn) { - struct nfs4_client *clp; + __be32 status; /* find clientid in conf_id_hashtbl */ - clp = find_confirmed_client(clid, sessions, nn); - if (clp == NULL) + status = lookup_clientid(clid, cstate, nn); + if (status) + return nfserr_reclaim_bad; + + if (test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &cstate->clp->cl_flags)) + return nfserr_no_grace; + + if (nfsd4_client_record_check(cstate->clp)) return nfserr_reclaim_bad; - return nfsd4_client_record_check(clp) ? nfserr_reclaim_bad : nfs_ok; + return nfs_ok; } #ifdef CONFIG_NFSD_FAULT_INJECTION +static inline void +put_client(struct nfs4_client *clp) +{ + atomic_dec(&clp->cl_refcount); +} -u64 nfsd_forget_client(struct nfs4_client *clp, u64 max) +static struct nfs4_client * +nfsd_find_client(struct sockaddr_storage *addr, size_t addr_size) { - if (mark_client_expired(clp)) - return 0; - expire_client(clp); - return 1; + struct nfs4_client *clp; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + + if (!nfsd_netns_ready(nn)) + return NULL; + + list_for_each_entry(clp, &nn->client_lru, cl_lru) { + if (memcmp(&clp->cl_addr, addr, addr_size) == 0) + return clp; + } + return NULL; } -u64 nfsd_print_client(struct nfs4_client *clp, u64 num) +u64 +nfsd_inject_print_clients(void) { + struct nfs4_client *clp; + u64 count = 0; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); char buf[INET6_ADDRSTRLEN]; - rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, sizeof(buf)); - printk(KERN_INFO "NFS Client: %s\n", buf); - return 1; + + if (!nfsd_netns_ready(nn)) + return 0; + + spin_lock(&nn->client_lock); + list_for_each_entry(clp, &nn->client_lru, cl_lru) { + rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, sizeof(buf)); + pr_info("NFS Client: %s\n", buf); + ++count; + } + spin_unlock(&nn->client_lock); + + return count; +} + +u64 +nfsd_inject_forget_client(struct sockaddr_storage *addr, size_t addr_size) +{ + u64 count = 0; + struct nfs4_client *clp; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + + if (!nfsd_netns_ready(nn)) + return count; + + spin_lock(&nn->client_lock); + clp = nfsd_find_client(addr, addr_size); + if (clp) { + if (mark_client_expired_locked(clp) == nfs_ok) + ++count; + else + clp = NULL; + } + spin_unlock(&nn->client_lock); + + if (clp) + expire_client(clp); + + return count; +} + +u64 +nfsd_inject_forget_clients(u64 max) +{ + u64 count = 0; + struct nfs4_client *clp, *next; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + LIST_HEAD(reaplist); + + if (!nfsd_netns_ready(nn)) + return count; + + spin_lock(&nn->client_lock); + list_for_each_entry_safe(clp, next, &nn->client_lru, cl_lru) { + if (mark_client_expired_locked(clp) == nfs_ok) { + list_add(&clp->cl_lru, &reaplist); + if (max != 0 && ++count >= max) + break; + } + } + spin_unlock(&nn->client_lock); + + list_for_each_entry_safe(clp, next, &reaplist, cl_lru) + expire_client(clp); + + return count; } static void nfsd_print_count(struct nfs4_client *clp, unsigned int count, @@ -4925,158 +5839,484 @@ static void nfsd_print_count(struct nfs4_client *clp, unsigned int count, printk(KERN_INFO "NFS Client: %s has %u %s\n", buf, count, type); } -static u64 nfsd_foreach_client_lock(struct nfs4_client *clp, u64 max, void (*func)(struct nfs4_lockowner *)) +static void 
+nfsd_inject_add_lock_to_list(struct nfs4_ol_stateid *lst, + struct list_head *collect) +{ + struct nfs4_client *clp = lst->st_stid.sc_client; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + + if (!collect) + return; + + lockdep_assert_held(&nn->client_lock); + atomic_inc(&clp->cl_refcount); + list_add(&lst->st_locks, collect); +} + +static u64 nfsd_foreach_client_lock(struct nfs4_client *clp, u64 max, + struct list_head *collect, + void (*func)(struct nfs4_ol_stateid *)) { struct nfs4_openowner *oop; - struct nfs4_lockowner *lop, *lo_next; struct nfs4_ol_stateid *stp, *st_next; + struct nfs4_ol_stateid *lst, *lst_next; u64 count = 0; + spin_lock(&clp->cl_lock); list_for_each_entry(oop, &clp->cl_openowners, oo_perclient) { - list_for_each_entry_safe(stp, st_next, &oop->oo_owner.so_stateids, st_perstateowner) { - list_for_each_entry_safe(lop, lo_next, &stp->st_lockowners, lo_perstateid) { - if (func) - func(lop); - if (++count == max) - return count; + list_for_each_entry_safe(stp, st_next, + &oop->oo_owner.so_stateids, st_perstateowner) { + list_for_each_entry_safe(lst, lst_next, + &stp->st_locks, st_locks) { + if (func) { + func(lst); + nfsd_inject_add_lock_to_list(lst, + collect); + } + ++count; + /* + * Despite the fact that these functions deal + * with 64-bit integers for "count", we must + * ensure that it doesn't blow up the + * clp->cl_refcount. Throw a warning if we + * start to approach INT_MAX here. + */ + WARN_ON_ONCE(count == (INT_MAX / 2)); + if (count == max) + goto out; } } } +out: + spin_unlock(&clp->cl_lock); return count; } -u64 nfsd_forget_client_locks(struct nfs4_client *clp, u64 max) +static u64 +nfsd_collect_client_locks(struct nfs4_client *clp, struct list_head *collect, + u64 max) { - return nfsd_foreach_client_lock(clp, max, release_lockowner); + return nfsd_foreach_client_lock(clp, max, collect, unhash_lock_stateid); } -u64 nfsd_print_client_locks(struct nfs4_client *clp, u64 max) +static u64 +nfsd_print_client_locks(struct nfs4_client *clp) { - u64 count = nfsd_foreach_client_lock(clp, max, NULL); + u64 count = nfsd_foreach_client_lock(clp, 0, NULL, NULL); nfsd_print_count(clp, count, "locked files"); return count; } -static u64 nfsd_foreach_client_open(struct nfs4_client *clp, u64 max, void (*func)(struct nfs4_openowner *)) +u64 +nfsd_inject_print_locks(void) +{ + struct nfs4_client *clp; + u64 count = 0; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + + if (!nfsd_netns_ready(nn)) + return 0; + + spin_lock(&nn->client_lock); + list_for_each_entry(clp, &nn->client_lru, cl_lru) + count += nfsd_print_client_locks(clp); + spin_unlock(&nn->client_lock); + + return count; +} + +static void +nfsd_reap_locks(struct list_head *reaplist) +{ + struct nfs4_client *clp; + struct nfs4_ol_stateid *stp, *next; + + list_for_each_entry_safe(stp, next, reaplist, st_locks) { + list_del_init(&stp->st_locks); + clp = stp->st_stid.sc_client; + nfs4_put_stid(&stp->st_stid); + put_client(clp); + } +} + +u64 +nfsd_inject_forget_client_locks(struct sockaddr_storage *addr, size_t addr_size) +{ + unsigned int count = 0; + struct nfs4_client *clp; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + LIST_HEAD(reaplist); + + if (!nfsd_netns_ready(nn)) + return count; + + spin_lock(&nn->client_lock); + clp = nfsd_find_client(addr, addr_size); + if (clp) + count = nfsd_collect_client_locks(clp, &reaplist, 0); + spin_unlock(&nn->client_lock); + nfsd_reap_locks(&reaplist); + return count; +} + +u64 
+nfsd_inject_forget_locks(u64 max) +{ + u64 count = 0; + struct nfs4_client *clp; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + LIST_HEAD(reaplist); + + if (!nfsd_netns_ready(nn)) + return count; + + spin_lock(&nn->client_lock); + list_for_each_entry(clp, &nn->client_lru, cl_lru) { + count += nfsd_collect_client_locks(clp, &reaplist, max - count); + if (max != 0 && count >= max) + break; + } + spin_unlock(&nn->client_lock); + nfsd_reap_locks(&reaplist); + return count; +} + +static u64 +nfsd_foreach_client_openowner(struct nfs4_client *clp, u64 max, + struct list_head *collect, + void (*func)(struct nfs4_openowner *)) { struct nfs4_openowner *oop, *next; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); u64 count = 0; + lockdep_assert_held(&nn->client_lock); + + spin_lock(&clp->cl_lock); list_for_each_entry_safe(oop, next, &clp->cl_openowners, oo_perclient) { - if (func) + if (func) { func(oop); - if (++count == max) + if (collect) { + atomic_inc(&clp->cl_refcount); + list_add(&oop->oo_perclient, collect); + } + } + ++count; + /* + * Despite the fact that these functions deal with + * 64-bit integers for "count", we must ensure that + * it doesn't blow up the clp->cl_refcount. Throw a + * warning if we start to approach INT_MAX here. + */ + WARN_ON_ONCE(count == (INT_MAX / 2)); + if (count == max) break; } + spin_unlock(&clp->cl_lock); return count; } -u64 nfsd_forget_client_openowners(struct nfs4_client *clp, u64 max) +static u64 +nfsd_print_client_openowners(struct nfs4_client *clp) { - return nfsd_foreach_client_open(clp, max, release_openowner); + u64 count = nfsd_foreach_client_openowner(clp, 0, NULL, NULL); + + nfsd_print_count(clp, count, "openowners"); + return count; } -u64 nfsd_print_client_openowners(struct nfs4_client *clp, u64 max) +static u64 +nfsd_collect_client_openowners(struct nfs4_client *clp, + struct list_head *collect, u64 max) { - u64 count = nfsd_foreach_client_open(clp, max, NULL); - nfsd_print_count(clp, count, "open files"); - return count; + return nfsd_foreach_client_openowner(clp, max, collect, + unhash_openowner_locked); } -static u64 nfsd_find_all_delegations(struct nfs4_client *clp, u64 max, - struct list_head *victims) +u64 +nfsd_inject_print_openowners(void) { - struct nfs4_delegation *dp, *next; + struct nfs4_client *clp; u64 count = 0; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + + if (!nfsd_netns_ready(nn)) + return 0; + + spin_lock(&nn->client_lock); + list_for_each_entry(clp, &nn->client_lru, cl_lru) + count += nfsd_print_client_openowners(clp); + spin_unlock(&nn->client_lock); - lockdep_assert_held(&state_lock); - list_for_each_entry_safe(dp, next, &clp->cl_delegations, dl_perclnt) { - if (victims) - list_move(&dp->dl_recall_lru, victims); - if (++count == max) - break; - } return count; } -u64 nfsd_forget_client_delegations(struct nfs4_client *clp, u64 max) +static void +nfsd_reap_openowners(struct list_head *reaplist) { - struct nfs4_delegation *dp, *next; - LIST_HEAD(victims); - u64 count; + struct nfs4_client *clp; + struct nfs4_openowner *oop, *next; - spin_lock(&state_lock); - count = nfsd_find_all_delegations(clp, max, &victims); - spin_unlock(&state_lock); + list_for_each_entry_safe(oop, next, reaplist, oo_perclient) { + list_del_init(&oop->oo_perclient); + clp = oop->oo_owner.so_client; + release_openowner(oop); + put_client(clp); + } +} - list_for_each_entry_safe(dp, next, &victims, dl_recall_lru) - revoke_delegation(dp); +u64 
+nfsd_inject_forget_client_openowners(struct sockaddr_storage *addr, + size_t addr_size) +{ + unsigned int count = 0; + struct nfs4_client *clp; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + LIST_HEAD(reaplist); + if (!nfsd_netns_ready(nn)) + return count; + + spin_lock(&nn->client_lock); + clp = nfsd_find_client(addr, addr_size); + if (clp) + count = nfsd_collect_client_openowners(clp, &reaplist, 0); + spin_unlock(&nn->client_lock); + nfsd_reap_openowners(&reaplist); return count; } -u64 nfsd_recall_client_delegations(struct nfs4_client *clp, u64 max) +u64 +nfsd_inject_forget_openowners(u64 max) { - struct nfs4_delegation *dp, *next; - LIST_HEAD(victims); - u64 count; + u64 count = 0; + struct nfs4_client *clp; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + LIST_HEAD(reaplist); - spin_lock(&state_lock); - count = nfsd_find_all_delegations(clp, max, &victims); - list_for_each_entry_safe(dp, next, &victims, dl_recall_lru) - nfsd_break_one_deleg(dp); - spin_unlock(&state_lock); + if (!nfsd_netns_ready(nn)) + return count; + spin_lock(&nn->client_lock); + list_for_each_entry(clp, &nn->client_lru, cl_lru) { + count += nfsd_collect_client_openowners(clp, &reaplist, + max - count); + if (max != 0 && count >= max) + break; + } + spin_unlock(&nn->client_lock); + nfsd_reap_openowners(&reaplist); return count; } -u64 nfsd_print_client_delegations(struct nfs4_client *clp, u64 max) +static u64 nfsd_find_all_delegations(struct nfs4_client *clp, u64 max, + struct list_head *victims) { + struct nfs4_delegation *dp, *next; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); u64 count = 0; + lockdep_assert_held(&nn->client_lock); + spin_lock(&state_lock); - count = nfsd_find_all_delegations(clp, max, NULL); + list_for_each_entry_safe(dp, next, &clp->cl_delegations, dl_perclnt) { + if (victims) { + /* + * It's not safe to mess with delegations that have a + * non-zero dl_time. They might have already been broken + * and could be processed by the laundromat outside of + * the state_lock. Just leave them be. + */ + if (dp->dl_time != 0) + continue; + + atomic_inc(&clp->cl_refcount); + unhash_delegation_locked(dp); + list_add(&dp->dl_recall_lru, victims); + } + ++count; + /* + * Despite the fact that these functions deal with + * 64-bit integers for "count", we must ensure that + * it doesn't blow up the clp->cl_refcount. Throw a + * warning if we start to approach INT_MAX here. 
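+		 * (Each delegation moved onto @victims pins its client via
+		 * the atomic_inc of clp->cl_refcount above, and cl_refcount
+		 * is only an int-sized atomic_t.)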
+ */ + WARN_ON_ONCE(count == (INT_MAX / 2)); + if (count == max) + break; + } spin_unlock(&state_lock); + return count; +} + +static u64 +nfsd_print_client_delegations(struct nfs4_client *clp) +{ + u64 count = nfsd_find_all_delegations(clp, 0, NULL); nfsd_print_count(clp, count, "delegations"); return count; } -u64 nfsd_for_n_state(u64 max, u64 (*func)(struct nfs4_client *, u64)) +u64 +nfsd_inject_print_delegations(void) { - struct nfs4_client *clp, *next; + struct nfs4_client *clp; u64 count = 0; - struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, nfsd_net_id); + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); if (!nfsd_netns_ready(nn)) return 0; - list_for_each_entry_safe(clp, next, &nn->client_lru, cl_lru) { - count += func(clp, max - count); - if ((max != 0) && (count >= max)) - break; + spin_lock(&nn->client_lock); + list_for_each_entry(clp, &nn->client_lru, cl_lru) + count += nfsd_print_client_delegations(clp); + spin_unlock(&nn->client_lock); + + return count; +} + +static void +nfsd_forget_delegations(struct list_head *reaplist) +{ + struct nfs4_client *clp; + struct nfs4_delegation *dp, *next; + + list_for_each_entry_safe(dp, next, reaplist, dl_recall_lru) { + list_del_init(&dp->dl_recall_lru); + clp = dp->dl_stid.sc_client; + revoke_delegation(dp); + put_client(clp); } +} + +u64 +nfsd_inject_forget_client_delegations(struct sockaddr_storage *addr, + size_t addr_size) +{ + u64 count = 0; + struct nfs4_client *clp; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + LIST_HEAD(reaplist); + if (!nfsd_netns_ready(nn)) + return count; + + spin_lock(&nn->client_lock); + clp = nfsd_find_client(addr, addr_size); + if (clp) + count = nfsd_find_all_delegations(clp, 0, &reaplist); + spin_unlock(&nn->client_lock); + + nfsd_forget_delegations(&reaplist); return count; } -struct nfs4_client *nfsd_find_client(struct sockaddr_storage *addr, size_t addr_size) +u64 +nfsd_inject_forget_delegations(u64 max) { + u64 count = 0; struct nfs4_client *clp; - struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, nfsd_net_id); + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + LIST_HEAD(reaplist); if (!nfsd_netns_ready(nn)) - return NULL; + return count; + spin_lock(&nn->client_lock); list_for_each_entry(clp, &nn->client_lru, cl_lru) { - if (memcmp(&clp->cl_addr, addr, addr_size) == 0) - return clp; + count += nfsd_find_all_delegations(clp, max - count, &reaplist); + if (max != 0 && count >= max) + break; } - return NULL; + spin_unlock(&nn->client_lock); + nfsd_forget_delegations(&reaplist); + return count; +} + +static void +nfsd_recall_delegations(struct list_head *reaplist) +{ + struct nfs4_client *clp; + struct nfs4_delegation *dp, *next; + + list_for_each_entry_safe(dp, next, reaplist, dl_recall_lru) { + list_del_init(&dp->dl_recall_lru); + clp = dp->dl_stid.sc_client; + /* + * We skipped all entries that had a zero dl_time before, + * so we can now reset the dl_time back to 0. If a delegation + * break comes in now, then it won't make any difference since + * we're recalling it either way. 
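+		 * (The laundromat samples dl_time under the state_lock, so
+		 * the reset below is likewise done under that lock.)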
+ */ + spin_lock(&state_lock); + dp->dl_time = 0; + spin_unlock(&state_lock); + nfsd_break_one_deleg(dp); + put_client(clp); + } +} + +u64 +nfsd_inject_recall_client_delegations(struct sockaddr_storage *addr, + size_t addr_size) +{ + u64 count = 0; + struct nfs4_client *clp; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + LIST_HEAD(reaplist); + + if (!nfsd_netns_ready(nn)) + return count; + + spin_lock(&nn->client_lock); + clp = nfsd_find_client(addr, addr_size); + if (clp) + count = nfsd_find_all_delegations(clp, 0, &reaplist); + spin_unlock(&nn->client_lock); + + nfsd_recall_delegations(&reaplist); + return count; } +u64 +nfsd_inject_recall_delegations(u64 max) +{ + u64 count = 0; + struct nfs4_client *clp, *next; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + LIST_HEAD(reaplist); + + if (!nfsd_netns_ready(nn)) + return count; + + spin_lock(&nn->client_lock); + list_for_each_entry_safe(clp, next, &nn->client_lru, cl_lru) { + count += nfsd_find_all_delegations(clp, max - count, &reaplist); + if (max != 0 && ++count >= max) + break; + } + spin_unlock(&nn->client_lock); + nfsd_recall_delegations(&reaplist); + return count; +} #endif /* CONFIG_NFSD_FAULT_INJECTION */ /* @@ -5113,14 +6353,6 @@ static int nfs4_state_create_net(struct net *net) CLIENT_HASH_SIZE, GFP_KERNEL); if (!nn->unconf_id_hashtbl) goto err_unconf_id; - nn->ownerstr_hashtbl = kmalloc(sizeof(struct list_head) * - OWNER_HASH_SIZE, GFP_KERNEL); - if (!nn->ownerstr_hashtbl) - goto err_ownerstr; - nn->lockowner_ino_hashtbl = kmalloc(sizeof(struct list_head) * - LOCKOWNER_INO_HASH_SIZE, GFP_KERNEL); - if (!nn->lockowner_ino_hashtbl) - goto err_lockowner_ino; nn->sessionid_hashtbl = kmalloc(sizeof(struct list_head) * SESSION_HASH_SIZE, GFP_KERNEL); if (!nn->sessionid_hashtbl) @@ -5130,10 +6362,6 @@ static int nfs4_state_create_net(struct net *net) INIT_LIST_HEAD(&nn->conf_id_hashtbl[i]); INIT_LIST_HEAD(&nn->unconf_id_hashtbl[i]); } - for (i = 0; i < OWNER_HASH_SIZE; i++) - INIT_LIST_HEAD(&nn->ownerstr_hashtbl[i]); - for (i = 0; i < LOCKOWNER_INO_HASH_SIZE; i++) - INIT_LIST_HEAD(&nn->lockowner_ino_hashtbl[i]); for (i = 0; i < SESSION_HASH_SIZE; i++) INIT_LIST_HEAD(&nn->sessionid_hashtbl[i]); nn->conf_name_tree = RB_ROOT; @@ -5149,10 +6377,6 @@ static int nfs4_state_create_net(struct net *net) return 0; err_sessionid: - kfree(nn->lockowner_ino_hashtbl); -err_lockowner_ino: - kfree(nn->ownerstr_hashtbl); -err_ownerstr: kfree(nn->unconf_id_hashtbl); err_unconf_id: kfree(nn->conf_id_hashtbl); @@ -5182,8 +6406,6 @@ nfs4_state_destroy_net(struct net *net) } kfree(nn->sessionid_hashtbl); - kfree(nn->lockowner_ino_hashtbl); - kfree(nn->ownerstr_hashtbl); kfree(nn->unconf_id_hashtbl); kfree(nn->conf_id_hashtbl); put_net(net); @@ -5198,10 +6420,10 @@ nfs4_state_start_net(struct net *net) ret = nfs4_state_create_net(net); if (ret) return ret; - nfsd4_client_tracking_init(net); nn->boot_time = get_seconds(); - locks_start_grace(net, &nn->nfsd4_manager); nn->grace_ended = false; + locks_start_grace(net, &nn->nfsd4_manager); + nfsd4_client_tracking_init(net); printk(KERN_INFO "NFSD: starting %ld-second grace period (net %p)\n", nn->nfsd4_grace, net); queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_grace * HZ); @@ -5247,22 +6469,23 @@ nfs4_state_shutdown_net(struct net *net) cancel_delayed_work_sync(&nn->laundromat_work); locks_end_grace(&nn->nfsd4_manager); - nfs4_lock_state(); INIT_LIST_HEAD(&reaplist); spin_lock(&state_lock); list_for_each_safe(pos, next, 
&nn->del_recall_lru) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); - list_move(&dp->dl_recall_lru, &reaplist); + unhash_delegation_locked(dp); + list_add(&dp->dl_recall_lru, &reaplist); } spin_unlock(&state_lock); list_for_each_safe(pos, next, &reaplist) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); - destroy_delegation(dp); + list_del_init(&dp->dl_recall_lru); + nfs4_put_deleg_lease(dp->dl_stid.sc_file); + nfs4_put_stid(&dp->dl_stid); } nfsd4_client_tracking_exit(net); nfs4_state_destroy_net(net); - nfs4_unlock_state(); } void diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 944275c8f56d..eeea7a90eb87 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -31,13 +31,6 @@ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * TODO: Neil Brown made the following observation: We currently - * initially reserve NFSD_BUFSIZE space on the transmit queue and - * never release any of that until the request is complete. - * It would be good to calculate a new maximum response size while - * decoding the COMPOUND, and call svc_reserve with this number - * at the end of nfs4svc_decode_compoundargs. */ #include <linux/slab.h> @@ -181,28 +174,43 @@ static int zero_clientid(clientid_t *clid) } /** - * defer_free - mark an allocation as deferred freed - * @argp: NFSv4 compound argument structure to be freed with - * @release: release callback to free @p, typically kfree() - * @p: pointer to be freed + * svcxdr_tmpalloc - allocate memory to be freed after compound processing + * @argp: NFSv4 compound argument structure + * @len: size in bytes of the buffer to allocate * - * Marks @p to be freed when processing the compound operation - * described in @argp finishes. + * Returns a buffer of @len bytes, which is freed when processing of + * the compound operation described in @argp finishes. */ -static int -defer_free(struct nfsd4_compoundargs *argp, - void (*release)(const void *), void *p) +static void * +svcxdr_tmpalloc(struct nfsd4_compoundargs *argp, u32 len) { - struct tmpbuf *tb; + struct svcxdr_tmpbuf *tb; - tb = kmalloc(sizeof(*tb), GFP_KERNEL); + tb = kmalloc(sizeof(*tb) + len, GFP_KERNEL); if (!tb) - return -ENOMEM; - tb->buf = p; - tb->release = release; + return NULL; tb->next = argp->to_free; argp->to_free = tb; - return 0; + return tb->buf; +} + +/* + * For xdr strings that need to be passed to other kernel APIs + * as null-terminated strings. + * + * Note null-terminating in place usually isn't safe since the + * buffer might end on a page boundary. 
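+ * Instead, copy the string into a freshly allocated (and therefore
+ * contiguous) temporary buffer and null-terminate the copy there.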
+ */ +static char * +svcxdr_dupstr(struct nfsd4_compoundargs *argp, void *buf, u32 len) +{ + char *p = svcxdr_tmpalloc(argp, len + 1); + + if (!p) + return NULL; + memcpy(p, buf, len); + p[len] = '\0'; + return p; } /** @@ -217,19 +225,13 @@ defer_free(struct nfsd4_compoundargs *argp, */ static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes) { - if (p == argp->tmp) { - p = kmemdup(argp->tmp, nbytes, GFP_KERNEL); - if (!p) - return NULL; - } else { - BUG_ON(p != argp->tmpp); - argp->tmpp = NULL; - } - if (defer_free(argp, kfree, p)) { - kfree(p); + void *ret; + + ret = svcxdr_tmpalloc(argp, nbytes); + if (!ret) return NULL; - } else - return (char *)p; + memcpy(ret, p, nbytes); + return ret; } static __be32 @@ -292,12 +294,10 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, if (nace > NFS4_ACL_MAX) return nfserr_fbig; - *acl = nfs4_acl_new(nace); + *acl = svcxdr_tmpalloc(argp, nfs4_acl_bytes(nace)); if (*acl == NULL) return nfserr_jukebox; - defer_free(argp, kfree, *acl); - (*acl)->naces = nace; for (ace = (*acl)->aces; ace < (*acl)->aces + nace; ace++) { READ_BUF(16); len += 16; @@ -418,12 +418,10 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, return nfserr_badlabel; len += (XDR_QUADLEN(dummy32) << 2); READMEM(buf, dummy32); - label->data = kzalloc(dummy32 + 1, GFP_KERNEL); + label->len = dummy32; + label->data = svcxdr_dupstr(argp, buf, dummy32); if (!label->data) return nfserr_jukebox; - label->len = dummy32; - defer_free(argp, kfree, label->data); - memcpy(label->data, buf, dummy32); } #endif @@ -598,20 +596,11 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create switch (create->cr_type) { case NF4LNK: READ_BUF(4); - create->cr_linklen = be32_to_cpup(p++); - READ_BUF(create->cr_linklen); - /* - * The VFS will want a null-terminated string, and - * null-terminating in place isn't safe since this might - * end on a page boundary: - */ - create->cr_linkname = - kmalloc(create->cr_linklen + 1, GFP_KERNEL); - if (!create->cr_linkname) + create->cr_datalen = be32_to_cpup(p++); + READ_BUF(create->cr_datalen); + create->cr_data = svcxdr_dupstr(argp, p, create->cr_datalen); + if (!create->cr_data) return nfserr_jukebox; - memcpy(create->cr_linkname, p, create->cr_linklen); - create->cr_linkname[create->cr_linklen] = '\0'; - defer_free(argp, kfree, create->cr_linkname); break; case NF4BLK: case NF4CHR: @@ -1481,13 +1470,12 @@ nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp, struct nfsd4_test_sta INIT_LIST_HEAD(&test_stateid->ts_stateid_list); for (i = 0; i < test_stateid->ts_num_ids; i++) { - stateid = kmalloc(sizeof(struct nfsd4_test_stateid_id), GFP_KERNEL); + stateid = svcxdr_tmpalloc(argp, sizeof(*stateid)); if (!stateid) { status = nfserrno(-ENOMEM); goto out; } - defer_free(argp, kfree, stateid); INIT_LIST_HEAD(&stateid->ts_id_list); list_add_tail(&stateid->ts_id_list, &test_stateid->ts_stateid_list); @@ -1526,6 +1514,22 @@ static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, str } static __be32 +nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek) +{ + DECODE_HEAD; + + status = nfsd4_decode_stateid(argp, &seek->seek_stateid); + if (status) + return status; + + READ_BUF(8 + 4); + p = xdr_decode_hyper(p, &seek->seek_offset); + seek->seek_whence = be32_to_cpup(p); + + DECODE_TAIL; +} + +static __be32 nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p) { return nfs_ok; @@ -1598,6 +1602,20 @@ static nfsd4_dec nfsd4_dec_ops[] = { 
[OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_destroy_clientid, [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete, + + /* new operations for NFSv4.2 */ + [OP_ALLOCATE] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_COPY] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_COPY_NOTIFY] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_DEALLOCATE] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_IO_ADVISE] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTERROR] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTSTATS] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_OFFLOAD_CANCEL] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_OFFLOAD_STATUS] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_READ_PLUS] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_SEEK] = (nfsd4_dec)nfsd4_decode_seek, + [OP_WRITE_SAME] = (nfsd4_dec)nfsd4_decode_notsupp, }; static inline bool @@ -1640,7 +1658,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) goto xdr_error; if (argp->opcnt > ARRAY_SIZE(argp->iops)) { - argp->ops = kmalloc(argp->opcnt * sizeof(*argp->ops), GFP_KERNEL); + argp->ops = kzalloc(argp->opcnt * sizeof(*argp->ops), GFP_KERNEL); if (!argp->ops) { argp->ops = argp->iops; dprintk("nfsd: couldn't allocate room for COMPOUND\n"); @@ -1675,6 +1693,14 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) readbytes += nfsd4_max_reply(argp->rqstp, op); } else max_reply += nfsd4_max_reply(argp->rqstp, op); + /* + * OP_LOCK may return a conflicting lock. (Special case + * because it will just skip encoding this if it runs + * out of xdr buffer space, and it is the only operation + * that behaves this way.) + */ + if (op->opnum == OP_LOCK) + max_reply += NFS4_OPAQUE_LIMIT; if (op->status) { argp->opcnt = i+1; @@ -2662,6 +2688,7 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, struct xdr_stream *xdr = cd->xdr; int start_offset = xdr->buf->len; int cookie_offset; + u32 name_and_cookie; int entry_bytes; __be32 nfserr = nfserr_toosmall; __be64 wire_offset; @@ -2723,7 +2750,14 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, cd->rd_maxcount -= entry_bytes; if (!cd->rd_dircount) goto fail; - cd->rd_dircount--; + /* + * RFC 3530 14.2.24 describes rd_dircount as only a "hint", so + * let's always let through the first entry, at least: + */ + name_and_cookie = 4 * XDR_QUADLEN(namlen) + 8; + if (name_and_cookie > cd->rd_dircount && cd->cookie_offset) + goto fail; + cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie); cd->cookie_offset = cookie_offset; skip_entry: cd->common.err = nfs_ok; @@ -3077,11 +3111,8 @@ static __be32 nfsd4_encode_splice_read( __be32 nfserr; __be32 *p = xdr->p - 2; - /* - * Don't inline pages unless we know there's room for eof, - * count, and possible padding: - */ - if (xdr->end - xdr->p < 3) + /* Make sure there will be room for padding if needed */ + if (xdr->end - xdr->p < 1) return nfserr_resource; nfserr = nfsd_splice_read(read->rd_rqstp, file, @@ -3104,7 +3135,8 @@ static __be32 nfsd4_encode_splice_read( buf->page_len = maxcount; buf->len += maxcount; - xdr->page_ptr += (maxcount + PAGE_SIZE - 1) / PAGE_SIZE; + xdr->page_ptr += (buf->page_base + maxcount + PAGE_SIZE - 1) + / PAGE_SIZE; /* Use rest of head for padding and remaining ops: */ buf->tail[0].iov_base = xdr->p; @@ -3147,9 +3179,7 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, len = maxcount; v = 0; - thislen = (void *)xdr->end - (void *)xdr->p; - if (len < thislen) - thislen = len; + thislen = min_t(long, len, ((void *)xdr->end - (void *)xdr->p)); p = 
xdr_reserve_space(xdr, (thislen+3)&~3); WARN_ON_ONCE(!p); resp->rqstp->rq_vec[v].iov_base = p; @@ -3216,10 +3246,8 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, xdr_commit_encode(xdr); maxcount = svc_max_payload(resp->rqstp); - if (maxcount > xdr->buf->buflen - xdr->buf->len) - maxcount = xdr->buf->buflen - xdr->buf->len; - if (maxcount > read->rd_length) - maxcount = read->rd_length; + maxcount = min_t(unsigned long, maxcount, (xdr->buf->buflen - xdr->buf->len)); + maxcount = min_t(unsigned long, maxcount, read->rd_length); if (!read->rd_filp) { err = nfsd_get_tmp_read_open(resp->rqstp, read->rd_fhp, @@ -3333,6 +3361,10 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 } maxcount = min_t(int, maxcount-16, bytes_left); + /* RFC 3530 14.2.24 allows us to ignore dircount when it's 0: */ + if (!readdir->rd_dircount) + readdir->rd_dircount = INT_MAX; + readdir->xdr = xdr; readdir->rd_maxcount = maxcount; readdir->common.err = 0; @@ -3763,6 +3795,22 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr, } static __be32 +nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_seek *seek) +{ + __be32 *p; + + if (nfserr) + return nfserr; + + p = xdr_reserve_space(&resp->xdr, 4 + 8); + *p++ = cpu_to_be32(seek->seek_eof); + p = xdr_encode_hyper(p, seek->seek_pos); + + return nfserr; +} + +static __be32 nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) { return nfserr; @@ -3834,6 +3882,20 @@ static nfsd4_enc nfsd4_enc_ops[] = { [OP_WANT_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, [OP_DESTROY_CLIENTID] = (nfsd4_enc)nfsd4_encode_noop, [OP_RECLAIM_COMPLETE] = (nfsd4_enc)nfsd4_encode_noop, + + /* NFSv4.2 operations */ + [OP_ALLOCATE] = (nfsd4_enc)nfsd4_encode_noop, + [OP_COPY] = (nfsd4_enc)nfsd4_encode_noop, + [OP_COPY_NOTIFY] = (nfsd4_enc)nfsd4_encode_noop, + [OP_DEALLOCATE] = (nfsd4_enc)nfsd4_encode_noop, + [OP_IO_ADVISE] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTERROR] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTSTATS] = (nfsd4_enc)nfsd4_encode_noop, + [OP_OFFLOAD_CANCEL] = (nfsd4_enc)nfsd4_encode_noop, + [OP_OFFLOAD_STATUS] = (nfsd4_enc)nfsd4_encode_noop, + [OP_READ_PLUS] = (nfsd4_enc)nfsd4_encode_noop, + [OP_SEEK] = (nfsd4_enc)nfsd4_encode_seek, + [OP_WRITE_SAME] = (nfsd4_enc)nfsd4_encode_noop, }; /* @@ -3937,8 +3999,6 @@ status: * * XDR note: do not encode rp->rp_buflen: the buffer contains the * previously sent already encoded operation. - * - * called with nfs4_lock_state() held */ void nfsd4_encode_replay(struct xdr_stream *xdr, struct nfsd4_op *op) @@ -3977,9 +4037,8 @@ int nfsd4_release_compoundargs(void *rq, __be32 *p, void *resp) kfree(args->tmpp); args->tmpp = NULL; while (args->to_free) { - struct tmpbuf *tb = args->to_free; + struct svcxdr_tmpbuf *tb = args->to_free; args->to_free = tb->next; - tb->release(tb->buf); kfree(tb); } return 1; @@ -4012,7 +4071,6 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo /* * All that remains is to write the tag and operation count... 
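 * (For v4.1+ sessions, nfsd4_sequence_done() below also takes care of
 * caching the reply and dropping the session and client references.)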
*/ - struct nfsd4_compound_state *cs = &resp->cstate; struct xdr_buf *buf = resp->xdr.buf; WARN_ON_ONCE(buf->len != buf->head[0].iov_len + buf->page_len + @@ -4026,19 +4084,7 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo p += XDR_QUADLEN(resp->taglen); *p++ = htonl(resp->opcnt); - if (nfsd4_has_session(cs)) { - struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); - struct nfs4_client *clp = cs->session->se_client; - if (cs->status != nfserr_replay_cache) { - nfsd4_store_cache_entry(resp); - cs->slot->sl_flags &= ~NFSD4_SLOT_INUSE; - } - /* Renew the clientid on success and on replay */ - spin_lock(&nn->client_lock); - nfsd4_put_session(cs->session); - spin_unlock(&nn->client_lock); - put_client_renew(clp); - } + nfsd4_sequence_done(resp); return 1; } diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 6040da8830ff..122f69185ef5 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -27,8 +27,12 @@ */ #define TARGET_BUCKET_SIZE 64 -static struct hlist_head * cache_hash; -static struct list_head lru_head; +struct nfsd_drc_bucket { + struct list_head lru_head; + spinlock_t cache_lock; +}; + +static struct nfsd_drc_bucket *drc_hashtbl; static struct kmem_cache *drc_slab; /* max number of entries allowed in the cache */ @@ -36,6 +40,7 @@ static unsigned int max_drc_entries; /* number of significant bits in the hash value */ static unsigned int maskbits; +static unsigned int drc_hashsize; /* * Stats and other tracking of the duplicate reply cache. All of these and @@ -43,7 +48,7 @@ static unsigned int maskbits; */ /* total number of entries */ -static unsigned int num_drc_entries; +static atomic_t num_drc_entries; /* cache misses due only to checksum comparison failures */ static unsigned int payload_misses; @@ -75,7 +80,6 @@ static struct shrinker nfsd_reply_cache_shrinker = { * A cache entry is "single use" if c_state == RC_INPROG * Otherwise, when accessing _prev or _next, the lock must be held. 
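 * Here "the lock" is the per-bucket nfsd_drc_bucket->cache_lock.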
*/ -static DEFINE_SPINLOCK(cache_lock); static DECLARE_DELAYED_WORK(cache_cleaner, cache_cleaner_func); /* @@ -116,6 +120,12 @@ nfsd_hashsize(unsigned int limit) return roundup_pow_of_two(limit / TARGET_BUCKET_SIZE); } +static u32 +nfsd_cache_hash(__be32 xid) +{ + return hash_32(be32_to_cpu(xid), maskbits); +} + static struct svc_cacherep * nfsd_reply_cache_alloc(void) { @@ -126,7 +136,6 @@ nfsd_reply_cache_alloc(void) rp->c_state = RC_UNUSED; rp->c_type = RC_NOCACHE; INIT_LIST_HEAD(&rp->c_lru); - INIT_HLIST_NODE(&rp->c_hash); } return rp; } @@ -138,29 +147,27 @@ nfsd_reply_cache_free_locked(struct svc_cacherep *rp) drc_mem_usage -= rp->c_replvec.iov_len; kfree(rp->c_replvec.iov_base); } - if (!hlist_unhashed(&rp->c_hash)) - hlist_del(&rp->c_hash); list_del(&rp->c_lru); - --num_drc_entries; + atomic_dec(&num_drc_entries); drc_mem_usage -= sizeof(*rp); kmem_cache_free(drc_slab, rp); } static void -nfsd_reply_cache_free(struct svc_cacherep *rp) +nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct svc_cacherep *rp) { - spin_lock(&cache_lock); + spin_lock(&b->cache_lock); nfsd_reply_cache_free_locked(rp); - spin_unlock(&cache_lock); + spin_unlock(&b->cache_lock); } int nfsd_reply_cache_init(void) { unsigned int hashsize; + unsigned int i; - INIT_LIST_HEAD(&lru_head); max_drc_entries = nfsd_cache_size_limit(); - num_drc_entries = 0; + atomic_set(&num_drc_entries, 0); hashsize = nfsd_hashsize(max_drc_entries); maskbits = ilog2(hashsize); @@ -170,9 +177,14 @@ int nfsd_reply_cache_init(void) if (!drc_slab) goto out_nomem; - cache_hash = kcalloc(hashsize, sizeof(struct hlist_head), GFP_KERNEL); - if (!cache_hash) + drc_hashtbl = kcalloc(hashsize, sizeof(*drc_hashtbl), GFP_KERNEL); + if (!drc_hashtbl) goto out_nomem; + for (i = 0; i < hashsize; i++) { + INIT_LIST_HEAD(&drc_hashtbl[i].lru_head); + spin_lock_init(&drc_hashtbl[i].cache_lock); + } + drc_hashsize = hashsize; return 0; out_nomem: @@ -184,17 +196,22 @@ out_nomem: void nfsd_reply_cache_shutdown(void) { struct svc_cacherep *rp; + unsigned int i; unregister_shrinker(&nfsd_reply_cache_shrinker); cancel_delayed_work_sync(&cache_cleaner); - while (!list_empty(&lru_head)) { - rp = list_entry(lru_head.next, struct svc_cacherep, c_lru); - nfsd_reply_cache_free_locked(rp); + for (i = 0; i < drc_hashsize; i++) { + struct list_head *head = &drc_hashtbl[i].lru_head; + while (!list_empty(head)) { + rp = list_first_entry(head, struct svc_cacherep, c_lru); + nfsd_reply_cache_free_locked(rp); + } } - kfree (cache_hash); - cache_hash = NULL; + kfree (drc_hashtbl); + drc_hashtbl = NULL; + drc_hashsize = 0; if (drc_slab) { kmem_cache_destroy(drc_slab); @@ -207,56 +224,63 @@ void nfsd_reply_cache_shutdown(void) * not already scheduled. */ static void -lru_put_end(struct svc_cacherep *rp) +lru_put_end(struct nfsd_drc_bucket *b, struct svc_cacherep *rp) { rp->c_timestamp = jiffies; - list_move_tail(&rp->c_lru, &lru_head); + list_move_tail(&rp->c_lru, &b->lru_head); schedule_delayed_work(&cache_cleaner, RC_EXPIRE); } -/* - * Move a cache entry from one hash list to another - */ -static void -hash_refile(struct svc_cacherep *rp) -{ - hlist_del_init(&rp->c_hash); - hlist_add_head(&rp->c_hash, cache_hash + hash_32(rp->c_xid, maskbits)); -} - -/* - * Walk the LRU list and prune off entries that are older than RC_EXPIRE. - * Also prune the oldest ones when the total exceeds the max number of entries. 
- */ static long -prune_cache_entries(void) +prune_bucket(struct nfsd_drc_bucket *b) { struct svc_cacherep *rp, *tmp; long freed = 0; - list_for_each_entry_safe(rp, tmp, &lru_head, c_lru) { + list_for_each_entry_safe(rp, tmp, &b->lru_head, c_lru) { /* * Don't free entries attached to calls that are still * in-progress, but do keep scanning the list. */ if (rp->c_state == RC_INPROG) continue; - if (num_drc_entries <= max_drc_entries && + if (atomic_read(&num_drc_entries) <= max_drc_entries && time_before(jiffies, rp->c_timestamp + RC_EXPIRE)) break; nfsd_reply_cache_free_locked(rp); freed++; } + return freed; +} + +/* + * Walk the LRU list and prune off entries that are older than RC_EXPIRE. + * Also prune the oldest ones when the total exceeds the max number of entries. + */ +static long +prune_cache_entries(void) +{ + unsigned int i; + long freed = 0; + bool cancel = true; + + for (i = 0; i < drc_hashsize; i++) { + struct nfsd_drc_bucket *b = &drc_hashtbl[i]; + + if (list_empty(&b->lru_head)) + continue; + spin_lock(&b->cache_lock); + freed += prune_bucket(b); + if (!list_empty(&b->lru_head)) + cancel = false; + spin_unlock(&b->cache_lock); + } /* - * Conditionally rearm the job. If we cleaned out the list, then - * cancel any pending run (since there won't be any work to do). - * Otherwise, we rearm the job or modify the existing one to run in - * RC_EXPIRE since we just ran the pruner. + * Conditionally rearm the job to run in RC_EXPIRE since we just + * ran the pruner. */ - if (list_empty(&lru_head)) - cancel_delayed_work(&cache_cleaner); - else + if (!cancel) mod_delayed_work(system_wq, &cache_cleaner, RC_EXPIRE); return freed; } @@ -264,32 +288,19 @@ prune_cache_entries(void) static void cache_cleaner_func(struct work_struct *unused) { - spin_lock(&cache_lock); prune_cache_entries(); - spin_unlock(&cache_lock); } static unsigned long nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc) { - unsigned long num; - - spin_lock(&cache_lock); - num = num_drc_entries; - spin_unlock(&cache_lock); - - return num; + return atomic_read(&num_drc_entries); } static unsigned long nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { - unsigned long freed; - - spin_lock(&cache_lock); - freed = prune_cache_entries(); - spin_unlock(&cache_lock); - return freed; + return prune_cache_entries(); } /* * Walk an xdr_buf and get a CRC for at most the first RC_CSUMLEN bytes @@ -327,20 +338,24 @@ nfsd_cache_csum(struct svc_rqst *rqstp) static bool nfsd_cache_match(struct svc_rqst *rqstp, __wsum csum, struct svc_cacherep *rp) { - /* Check RPC header info first */ - if (rqstp->rq_xid != rp->c_xid || rqstp->rq_proc != rp->c_proc || - rqstp->rq_prot != rp->c_prot || rqstp->rq_vers != rp->c_vers || - rqstp->rq_arg.len != rp->c_len || - !rpc_cmp_addr(svc_addr(rqstp), (struct sockaddr *)&rp->c_addr) || - rpc_get_port(svc_addr(rqstp)) != rpc_get_port((struct sockaddr *)&rp->c_addr)) + /* Check RPC XID first */ + if (rqstp->rq_xid != rp->c_xid) return false; - /* compare checksum of NFS data */ if (csum != rp->c_csum) { ++payload_misses; return false; } + /* Other discriminators */ + if (rqstp->rq_proc != rp->c_proc || + rqstp->rq_prot != rp->c_prot || + rqstp->rq_vers != rp->c_vers || + rqstp->rq_arg.len != rp->c_len || + !rpc_cmp_addr(svc_addr(rqstp), (struct sockaddr *)&rp->c_addr) || + rpc_get_port(svc_addr(rqstp)) != rpc_get_port((struct sockaddr *)&rp->c_addr)) + return false; + return true; } @@ -350,14 +365,14 @@ nfsd_cache_match(struct svc_rqst *rqstp, __wsum csum, 
struct svc_cacherep *rp) * NULL on failure. */ static struct svc_cacherep * -nfsd_cache_search(struct svc_rqst *rqstp, __wsum csum) +nfsd_cache_search(struct nfsd_drc_bucket *b, struct svc_rqst *rqstp, + __wsum csum) { struct svc_cacherep *rp, *ret = NULL; - struct hlist_head *rh; + struct list_head *rh = &b->lru_head; unsigned int entries = 0; - rh = &cache_hash[hash_32(rqstp->rq_xid, maskbits)]; - hlist_for_each_entry(rp, rh, c_hash) { + list_for_each_entry(rp, rh, c_lru) { ++entries; if (nfsd_cache_match(rqstp, csum, rp)) { ret = rp; @@ -368,11 +383,12 @@ nfsd_cache_search(struct svc_rqst *rqstp, __wsum csum) /* tally hash chain length stats */ if (entries > longest_chain) { longest_chain = entries; - longest_chain_cachesize = num_drc_entries; + longest_chain_cachesize = atomic_read(&num_drc_entries); } else if (entries == longest_chain) { /* prefer to keep the smallest cachesize possible here */ - longest_chain_cachesize = min(longest_chain_cachesize, - num_drc_entries); + longest_chain_cachesize = min_t(unsigned int, + longest_chain_cachesize, + atomic_read(&num_drc_entries)); } return ret; @@ -394,6 +410,8 @@ nfsd_cache_lookup(struct svc_rqst *rqstp) vers = rqstp->rq_vers, proc = rqstp->rq_proc; __wsum csum; + u32 hash = nfsd_cache_hash(xid); + struct nfsd_drc_bucket *b = &drc_hashtbl[hash]; unsigned long age; int type = rqstp->rq_cachetype; int rtn = RC_DOIT; @@ -411,16 +429,16 @@ nfsd_cache_lookup(struct svc_rqst *rqstp) * preallocate an entry. */ rp = nfsd_reply_cache_alloc(); - spin_lock(&cache_lock); + spin_lock(&b->cache_lock); if (likely(rp)) { - ++num_drc_entries; + atomic_inc(&num_drc_entries); drc_mem_usage += sizeof(*rp); } /* go ahead and prune the cache */ - prune_cache_entries(); + prune_bucket(b); - found = nfsd_cache_search(rqstp, csum); + found = nfsd_cache_search(b, rqstp, csum); if (found) { if (likely(rp)) nfsd_reply_cache_free_locked(rp); @@ -445,8 +463,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp) rp->c_len = rqstp->rq_arg.len; rp->c_csum = csum; - hash_refile(rp); - lru_put_end(rp); + lru_put_end(b, rp); /* release any buffer */ if (rp->c_type == RC_REPLBUFF) { @@ -456,14 +473,14 @@ nfsd_cache_lookup(struct svc_rqst *rqstp) } rp->c_type = RC_NOCACHE; out: - spin_unlock(&cache_lock); + spin_unlock(&b->cache_lock); return rtn; found_entry: nfsdstats.rchits++; /* We found a matching entry which is either in progress or done. 
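 * In-progress entries cause the duplicate request to be dropped;
 * completed entries are answered from the cached reply.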
*/ age = jiffies - rp->c_timestamp; - lru_put_end(rp); + lru_put_end(b, rp); rtn = RC_DROPIT; /* Request being processed or excessive rexmits */ @@ -518,18 +535,23 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp) { struct svc_cacherep *rp = rqstp->rq_cacherep; struct kvec *resv = &rqstp->rq_res.head[0], *cachv; + u32 hash; + struct nfsd_drc_bucket *b; int len; size_t bufsize = 0; if (!rp) return; + hash = nfsd_cache_hash(rp->c_xid); + b = &drc_hashtbl[hash]; + len = resv->iov_len - ((char*)statp - (char*)resv->iov_base); len >>= 2; /* Don't cache excessive amounts of data and XDR failures */ if (!statp || len > (256 >> 2)) { - nfsd_reply_cache_free(rp); + nfsd_reply_cache_free(b, rp); return; } @@ -544,23 +566,23 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp) bufsize = len << 2; cachv->iov_base = kmalloc(bufsize, GFP_KERNEL); if (!cachv->iov_base) { - nfsd_reply_cache_free(rp); + nfsd_reply_cache_free(b, rp); return; } cachv->iov_len = bufsize; memcpy(cachv->iov_base, statp, bufsize); break; case RC_NOCACHE: - nfsd_reply_cache_free(rp); + nfsd_reply_cache_free(b, rp); return; } - spin_lock(&cache_lock); + spin_lock(&b->cache_lock); drc_mem_usage += bufsize; - lru_put_end(rp); + lru_put_end(b, rp); rp->c_secure = rqstp->rq_secure; rp->c_type = cachetype; rp->c_state = RC_DONE; - spin_unlock(&cache_lock); + spin_unlock(&b->cache_lock); return; } @@ -591,9 +613,9 @@ nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *data) */ static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v) { - spin_lock(&cache_lock); seq_printf(m, "max entries: %u\n", max_drc_entries); - seq_printf(m, "num entries: %u\n", num_drc_entries); + seq_printf(m, "num entries: %u\n", + atomic_read(&num_drc_entries)); seq_printf(m, "hash buckets: %u\n", 1 << maskbits); seq_printf(m, "mem usage: %u\n", drc_mem_usage); seq_printf(m, "cache hits: %u\n", nfsdstats.rchits); @@ -602,7 +624,6 @@ static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v) seq_printf(m, "payload misses: %u\n", payload_misses); seq_printf(m, "longest chain len: %u\n", longest_chain); seq_printf(m, "cachesize at longest: %u\n", longest_chain_cachesize); - spin_unlock(&cache_lock); return 0; } diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 51844048937f..ca73ca79a0ee 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -39,6 +39,7 @@ enum { NFSD_Versions, NFSD_Ports, NFSD_MaxBlkSize, + NFSD_MaxConnections, NFSD_SupportedEnctypes, /* * The below MUST come last. 
Otherwise we leave a hole in nfsd_files[] @@ -48,6 +49,7 @@ enum { NFSD_Leasetime, NFSD_Gracetime, NFSD_RecoveryDir, + NFSD_V4EndGrace, #endif }; @@ -62,10 +64,12 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size); static ssize_t write_versions(struct file *file, char *buf, size_t size); static ssize_t write_ports(struct file *file, char *buf, size_t size); static ssize_t write_maxblksize(struct file *file, char *buf, size_t size); +static ssize_t write_maxconn(struct file *file, char *buf, size_t size); #ifdef CONFIG_NFSD_V4 static ssize_t write_leasetime(struct file *file, char *buf, size_t size); static ssize_t write_gracetime(struct file *file, char *buf, size_t size); static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); +static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size); #endif static ssize_t (*write_op[])(struct file *, char *, size_t) = { @@ -77,10 +81,12 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = { [NFSD_Versions] = write_versions, [NFSD_Ports] = write_ports, [NFSD_MaxBlkSize] = write_maxblksize, + [NFSD_MaxConnections] = write_maxconn, #ifdef CONFIG_NFSD_V4 [NFSD_Leasetime] = write_leasetime, [NFSD_Gracetime] = write_gracetime, [NFSD_RecoveryDir] = write_recoverydir, + [NFSD_V4EndGrace] = write_v4_end_grace, #endif }; @@ -369,8 +375,7 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size) if (maxsize < NFS_FHSIZE) return -EINVAL; - if (maxsize > NFS3_FHSIZE) - maxsize = NFS3_FHSIZE; + maxsize = min(maxsize, NFS3_FHSIZE); if (qword_get(&mesg, mesg, size)>0) return -EINVAL; @@ -871,10 +876,8 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) /* force bsize into allowed range and * required alignment. 
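	 * (clamp to the range [1024, NFSSVC_MAXBLKSIZE], then round down
	 * to a multiple of 1024 by masking off the low-order bits).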
*/ - if (bsize < 1024) - bsize = 1024; - if (bsize > NFSSVC_MAXBLKSIZE) - bsize = NFSSVC_MAXBLKSIZE; + bsize = max_t(int, bsize, 1024); + bsize = min_t(int, bsize, NFSSVC_MAXBLKSIZE); bsize &= ~(1024-1); mutex_lock(&nfsd_mutex); if (nn->nfsd_serv) { @@ -889,6 +892,44 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) nfsd_max_blksize); } +/** + * write_maxconn - Set or report the current max number of connections + * + * Input: + * buf: ignored + * size: zero + * OR + * + * Input: + * buf: C string containing an unsigned + * integer value representing the new + * number of max connections + * size: non-zero length of C string in @buf + * Output: + * On success: passed-in buffer filled with '\n'-terminated C string + * containing numeric value of max_connections setting + * for this net namespace; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + */ +static ssize_t write_maxconn(struct file *file, char *buf, size_t size) +{ + char *mesg = buf; + struct net *net = file->f_dentry->d_sb->s_fs_info; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + unsigned int maxconn = nn->max_connections; + + if (size > 0) { + int rv = get_uint(&mesg, &maxconn); + + if (rv) + return rv; + nn->max_connections = maxconn; + } + + return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%u\n", maxconn); +} + #ifdef CONFIG_NFSD_V4 static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size, time_t *time, struct nfsd_net *nn) @@ -1039,6 +1080,47 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size) return rv; } +/** + * write_v4_end_grace - release grace period for nfsd's v4.x lock manager + * + * Input: + * buf: ignored + * size: zero + * OR + * + * Input: + * buf: any value + * size: non-zero length of C string in @buf + * Output: + * passed-in buffer filled with "Y" or "N" with a newline + * and NULL-terminated C string. This indicates whether + * the grace period has ended in the current net + * namespace. Return code is the size in bytes of the + * string. Writing a string that starts with 'Y', 'y', or + * '1' to the file will end the grace period for nfsd's v4 + * lock manager. + */ +static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size) +{ + struct net *net = file->f_dentry->d_sb->s_fs_info; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + if (size > 0) { + switch(buf[0]) { + case 'Y': + case 'y': + case '1': + nfsd4_end_grace(nn); + break; + default: + return -EINVAL; + } + } + + return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%c\n", + nn->grace_ended ? 
'Y' : 'N'); +} + #endif /*----------------------------------------------------------------------------*/ @@ -1064,6 +1146,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent) [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO}, [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, + [NFSD_MaxConnections] = {"max_connections", &transaction_ops, S_IWUSR|S_IRUGO}, #if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE) [NFSD_SupportedEnctypes] = {"supported_krb5_enctypes", &supported_enctypes_ops, S_IRUGO}, #endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */ @@ -1071,6 +1154,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent) [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_V4EndGrace] = {"v4_end_grace", &transaction_ops, S_IWUSR|S_IRUGO}, #endif /* last one */ {""} }; diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 847daf37e566..747f3b95bd11 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -251,7 +251,7 @@ void nfsd_lockd_shutdown(void); #define nfserr_deleg_revoked cpu_to_be32(NFS4ERR_DELEG_REVOKED) #define nfserr_partner_notsupp cpu_to_be32(NFS4ERR_PARTNER_NOTSUPP) #define nfserr_partner_no_auth cpu_to_be32(NFS4ERR_PARTNER_NO_AUTH) -#define nfserr_metadata_notsupp cpu_to_be32(NFS4ERR_METADATA_NOTSUPP) +#define nfserr_union_notsupp cpu_to_be32(NFS4ERR_UNION_NOTSUPP) #define nfserr_offload_denied cpu_to_be32(NFS4ERR_OFFLOAD_DENIED) #define nfserr_wrong_lfs cpu_to_be32(NFS4ERR_WRONG_LFS) #define nfserr_badlabel cpu_to_be32(NFS4ERR_BADLABEL) diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index ec8393418154..88026fc6a981 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -162,7 +162,14 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) /* deprecated, convert to type 3 */ len = key_len(FSID_ENCODE_DEV)/4; fh->fh_fsid_type = FSID_ENCODE_DEV; - fh->fh_fsid[0] = new_encode_dev(MKDEV(ntohl(fh->fh_fsid[0]), ntohl(fh->fh_fsid[1]))); + /* + * struct knfsd_fh uses host-endian fields, which are + * sometimes used to hold net-endian values. This + * confuses sparse, so we must use __force here to + * keep it from complaining. + */ + fh->fh_fsid[0] = new_encode_dev(MKDEV(ntohl((__force __be32)fh->fh_fsid[0]), + ntohl((__force __be32)fh->fh_fsid[1]))); fh->fh_fsid[1] = fh->fh_fsid[2]; } data_left -= len; @@ -202,8 +209,10 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) * fix that case easily. 
*/ struct cred *new = prepare_creds(); - if (!new) - return nfserrno(-ENOMEM); + if (!new) { + error = nfserrno(-ENOMEM); + goto out; + } new->cap_effective = cap_raise_nfsd_set(new->cap_effective, new->cap_permitted); @@ -539,8 +548,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, dentry); fhp->fh_dentry = dget(dentry); /* our internal copy */ - fhp->fh_export = exp; - cache_get(&exp->h); + fhp->fh_export = exp_get(exp); if (fhp->fh_handle.fh_version == 0xca) { /* old style filehandle please */ diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index 2e89e70ac15c..08236d70c667 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -73,8 +73,15 @@ enum fsid_source { extern enum fsid_source fsid_source(struct svc_fh *fhp); -/* This might look a little large to "inline" but in all calls except +/* + * This might look a little large to "inline" but in all calls except * one, 'vers' is constant so most of the function disappears. + * + * In some cases the values are considered to be host endian and in + * others, net endian. fsidv is always considered to be u32 as the + * callers don't know which it will be. So we must use __force to keep + * sparse from complaining. Since these values are opaque to the + * client, that shouldn't be a problem. */ static inline void mk_fsid(int vers, u32 *fsidv, dev_t dev, ino_t ino, u32 fsid, unsigned char *uuid) @@ -82,7 +89,7 @@ static inline void mk_fsid(int vers, u32 *fsidv, dev_t dev, ino_t ino, u32 *up; switch(vers) { case FSID_DEV: - fsidv[0] = htonl((MAJOR(dev)<<16) | + fsidv[0] = (__force __u32)htonl((MAJOR(dev)<<16) | MINOR(dev)); fsidv[1] = ino_t_to_u32(ino); break; @@ -90,8 +97,8 @@ static inline void mk_fsid(int vers, u32 *fsidv, dev_t dev, ino_t ino, fsidv[0] = fsid; break; case FSID_MAJOR_MINOR: - fsidv[0] = htonl(MAJOR(dev)); - fsidv[1] = htonl(MINOR(dev)); + fsidv[0] = (__force __u32)htonl(MAJOR(dev)); + fsidv[1] = (__force __u32)htonl(MINOR(dev)); fsidv[2] = ino_t_to_u32(ino); break; diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 54c6b3d3cc79..b8680738f588 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -403,12 +403,13 @@ nfsd_proc_symlink(struct svc_rqst *rqstp, struct nfsd_symlinkargs *argp, fh_init(&newfh, NFS_FHSIZE); /* - * Create the link, look up new file and set attrs. 
+ * Crazy hack: the request fits in a page, and already-decoded + * attributes follow argp->tname, so it's safe to just write a + * null to ensure it's null-terminated: */ + argp->tname[argp->tlen] = '\0'; nfserr = nfsd_symlink(rqstp, &argp->ffh, argp->fname, argp->flen, - argp->tname, argp->tlen, - &newfh, &argp->attrs); - + argp->tname, &newfh); fh_put(&argp->ffh); fh_put(&newfh); @@ -716,6 +717,7 @@ nfserrno (int errno) { nfserr_noent, -ENOENT }, { nfserr_io, -EIO }, { nfserr_nxio, -ENXIO }, + { nfserr_fbig, -E2BIG }, { nfserr_acces, -EACCES }, { nfserr_exist, -EEXIST }, { nfserr_xdev, -EXDEV }, @@ -743,6 +745,7 @@ nfserrno (int errno) { nfserr_notsupp, -EOPNOTSUPP }, { nfserr_toosmall, -ETOOSMALL }, { nfserr_serverfault, -ESERVERFAULT }, + { nfserr_serverfault, -ENFILE }, }; int i; @@ -750,7 +753,7 @@ nfserrno (int errno) if (nfs_errtbl[i].syserr == errno) return nfs_errtbl[i].nfserr; } - printk (KERN_INFO "nfsd: non-standard errno: %d\n", errno); + WARN(1, "nfsd: non-standard errno: %d\n", errno); return nfserr_io; } diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 1879e43f2868..752d56bbe0ba 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -221,7 +221,8 @@ static int nfsd_startup_generic(int nrservs) */ ret = nfsd_racache_init(2*nrservs); if (ret) - return ret; + goto dec_users; + ret = nfs4_state_start(); if (ret) goto out_racache; @@ -229,6 +230,8 @@ static int nfsd_startup_generic(int nrservs) out_racache: nfsd_racache_shutdown(); +dec_users: + nfsd_users--; return ret; } @@ -405,6 +408,7 @@ int nfsd_create_serv(struct net *net) if (nn->nfsd_serv == NULL) return -ENOMEM; + nn->nfsd_serv->sv_maxconn = nn->max_connections; error = svc_bind(nn->nfsd_serv, net); if (error < 0) { svc_destroy(nn->nfsd_serv); @@ -469,8 +473,7 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net) /* enforce a global maximum number of threads */ tot = 0; for (i = 0; i < n; i++) { - if (nthreads[i] > NFSD_MAXSERVS) - nthreads[i] = NFSD_MAXSERVS; + nthreads[i] = min(nthreads[i], NFSD_MAXSERVS); tot += nthreads[i]; } if (tot > NFSD_MAXSERVS) { @@ -519,11 +522,11 @@ nfsd_svc(int nrservs, struct net *net) mutex_lock(&nfsd_mutex); dprintk("nfsd: creating service\n"); - if (nrservs <= 0) - nrservs = 0; - if (nrservs > NFSD_MAXSERVS) - nrservs = NFSD_MAXSERVS; + + nrservs = max(nrservs, 0); + nrservs = min(nrservs, NFSD_MAXSERVS); error = 0; + if (nrservs == 0 && nn->nfsd_serv == NULL) goto out; @@ -564,6 +567,7 @@ nfsd(void *vrqstp) struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp; struct svc_xprt *perm_sock = list_entry(rqstp->rq_server->sv_permsocks.next, typeof(struct svc_xprt), xpt_list); struct net *net = perm_sock->xpt_net; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); int err; /* Lock module and set up kernel thread */ @@ -597,6 +601,9 @@ nfsd(void *vrqstp) * The main request loop */ for (;;) { + /* Update sv_maxconn if it has changed */ + rqstp->rq_server->sv_maxconn = nn->max_connections; + /* * Find a socket with data available and call its * recvfrom routine. diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 1ac306b769df..412d7061f9e5 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -257,8 +257,7 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p, len = args->count = ntohl(*p++); p++; /* totalcount - unused */ - if (len > NFSSVC_MAXBLKSIZE_V2) - len = NFSSVC_MAXBLKSIZE_V2; + len = min_t(unsigned int, len, NFSSVC_MAXBLKSIZE_V2); /* set up somewhere to store response. 
* We take pages, put them on reslist and include in iovec @@ -268,7 +267,7 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p, struct page *p = *(rqstp->rq_next_page++); rqstp->rq_vec[v].iov_base = page_address(p); - rqstp->rq_vec[v].iov_len = len < PAGE_SIZE?len:PAGE_SIZE; + rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE); len -= rqstp->rq_vec[v].iov_len; v++; } @@ -400,9 +399,7 @@ nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p, return 0; args->cookie = ntohl(*p++); args->count = ntohl(*p++); - if (args->count > PAGE_SIZE) - args->count = PAGE_SIZE; - + args->count = min_t(u32, args->count, PAGE_SIZE); args->buffer = page_address(*(rqstp->rq_next_page++)); return xdr_argsize_check(rqstp, p); @@ -516,10 +513,11 @@ nfssvc_encode_entry(void *ccdv, const char *name, } if (cd->offset) *cd->offset = htonl(offset); - if (namlen > NFS2_MAXNAMLEN) - namlen = NFS2_MAXNAMLEN;/* truncate filename */ + /* truncate filename */ + namlen = min(namlen, NFS2_MAXNAMLEN); slen = XDR_QUADLEN(namlen); + if ((buflen = cd->buflen - slen - 4) < 0) { cd->common.err = nfserr_toosmall; return -EINVAL; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 374c66283ac5..2712042a66b1 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -62,17 +62,28 @@ typedef struct { (s)->si_generation struct nfsd4_callback { - void *cb_op; struct nfs4_client *cb_clp; struct list_head cb_per_client; u32 cb_minorversion; struct rpc_message cb_msg; - const struct rpc_call_ops *cb_ops; + struct nfsd4_callback_ops *cb_ops; struct work_struct cb_work; bool cb_done; }; +struct nfsd4_callback_ops { + void (*prepare)(struct nfsd4_callback *); + int (*done)(struct nfsd4_callback *, struct rpc_task *); + void (*release)(struct nfsd4_callback *); +}; + +/* + * A core object that represents a "common" stateid. These are generally + * embedded within the different (more specific) stateid objects and contain + * fields that are of general use to any stateid. + */ struct nfs4_stid { + atomic_t sc_count; #define NFS4_OPEN_STID 1 #define NFS4_LOCK_STID 2 #define NFS4_DELEG_STID 4 @@ -80,26 +91,50 @@ struct nfs4_stid { #define NFS4_CLOSED_STID 8 /* For a deleg stateid kept around only to process free_stateid's: */ #define NFS4_REVOKED_DELEG_STID 16 +#define NFS4_CLOSED_DELEG_STID 32 unsigned char sc_type; stateid_t sc_stateid; struct nfs4_client *sc_client; + struct nfs4_file *sc_file; + void (*sc_free)(struct nfs4_stid *); }; +/* + * Represents a delegation stateid. The nfs4_client holds references to these + * and they are put when it is being destroyed or when the delegation is + * returned by the client: + * + * o 1 reference as long as a delegation is still in force (taken when it's + * alloc'd, put when it's returned or revoked) + * + * o 1 reference as long as a recall rpc is in progress (taken when the lease + * is broken, put when the rpc exits) + * + * o 1 more ephemeral reference for each nfsd thread currently doing something + * with that delegation without holding the cl_lock + * + * If the server attempts to recall a delegation and the client doesn't do so + * before a timeout, the server may also revoke the delegation. In that case, + * the object will either be destroyed (v4.0) or moved to a per-client list of + * revoked delegations (v4.1+). + * + * This object is a superset of the nfs4_stid. 
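A recurring change in the decode hunks above is replacing open-coded clamps with min_t(). The _t variant matters because the operands usually differ in type ('len' is an unsigned int while PAGE_SIZE is an unsigned long) and the kernel's plain min() rejects mixed-type comparisons at build time. Sketch:

#include <linux/kernel.h>	/* min_t() */
#include <linux/mm.h>		/* PAGE_SIZE */

/* Clamp a client-supplied READ length before reserving reply pages;
 * both operands are cast to unsigned int for the comparison. */
static unsigned int demo_clamp_read_len(unsigned int len)
{
	return min_t(unsigned int, len, PAGE_SIZE);
}

Clamping at decode time bounds how much reply memory a single request can pin, whatever length the client asked for.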
+ */ struct nfs4_delegation { struct nfs4_stid dl_stid; /* must be first field */ struct list_head dl_perfile; struct list_head dl_perclnt; struct list_head dl_recall_lru; /* delegation recalled */ - atomic_t dl_count; /* ref count */ - struct nfs4_file *dl_file; u32 dl_type; time_t dl_time; /* For recall: */ - struct knfsd_fh dl_fh; int dl_retries; struct nfsd4_callback dl_recall; }; +#define cb_to_delegation(cb) \ + container_of(cb, struct nfs4_delegation, dl_recall) + /* client delegation callback info */ struct nfs4_cb_conn { /* SETCLIENTID info */ @@ -194,6 +229,11 @@ struct nfsd4_conn { unsigned char cn_flags; }; +/* + * Representation of a v4.1+ session. These are refcounted in a similar fashion + * to the nfs4_client. References are only taken when the server is actively + * working on the object (primarily during the processing of compounds). + */ struct nfsd4_session { atomic_t se_ref; struct list_head se_hash; /* hash by sessionid */ @@ -212,8 +252,6 @@ struct nfsd4_session { struct nfsd4_slot *se_slots[]; /* forward channel slots */ }; -extern void nfsd4_put_session(struct nfsd4_session *ses); - /* formatted contents of nfs4_sessionid */ struct nfsd4_sessionid { clientid_t clientid; @@ -225,17 +263,35 @@ struct nfsd4_sessionid { /* * struct nfs4_client - one per client. Clientids live here. - * o Each nfs4_client is hashed by clientid. * - * o Each nfs4_clients is also hashed by name - * (the opaque quantity initially sent by the client to identify itself). + * The initial object created by an NFS client using SETCLIENTID (for NFSv4.0) + * or EXCHANGE_ID (for NFSv4.1+). These objects are refcounted and timestamped. + * Each nfsd_net_ns object contains a set of these and they are tracked via + * short and long form clientid. They are hashed and searched for under the + * per-nfsd_net client_lock spinlock. + * + * References to it are only held during the processing of compounds, and in + * certain other operations. In their "resting state" they have a refcount of + * 0. If they are not renewed within a lease period, they become eligible for + * destruction by the laundromat. + * + * These objects can also be destroyed prematurely by the fault injection code, + * or if the client sends certain forms of SETCLIENTID or EXCHANGE_ID updates. + * Care is taken *not* to do this however when the objects have an elevated + * refcount. + * + * o Each nfs4_client is hashed by clientid + * + * o Each nfs4_client is also hashed by name (the opaque quantity initially + * sent by the client to identify itself).
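cb_to_delegation() above is the standard container_of() downcast: the nfsd4_callback is embedded inside the delegation, so callback code can recover its container without a back-pointer (which is why the old void *cb_op member could go). An illustrative callback body (the demo_ name is invented; the real prepare/done/release methods live in the nfsd callback code):

static void demo_recall_prepare(struct nfsd4_callback *cb)
{
	struct nfs4_delegation *dp = cb_to_delegation(cb);

	/* cb sits at offsetof(struct nfs4_delegation, dl_recall), so
	 * container_of() simply subtracts that offset from cb. */
	dp->dl_retries = 1;
}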
* - * o cl_perclient list is used to ensure no dangling stateowner references - * when we expire the nfs4_client + * o cl_perclient list is used to ensure no dangling stateowner references + * when we expire the nfs4_client */ struct nfs4_client { struct list_head cl_idhash; /* hash by cl_clientid.id */ struct rb_node cl_namenode; /* link into by-name trees */ + struct list_head *cl_ownerstr_hashtbl; struct list_head cl_openowners; struct idr cl_stateids; /* stateid lookup */ struct list_head cl_delegations; @@ -258,6 +314,7 @@ struct nfs4_client { #define NFSD4_CLIENT_STABLE (2) /* client on stable storage */ #define NFSD4_CLIENT_RECLAIM_COMPLETE (3) /* reclaim_complete done */ #define NFSD4_CLIENT_CONFIRMED (4) /* client is confirmed */ +#define NFSD4_CLIENT_UPCALL_LOCK (5) /* upcall serialization */ #define NFSD4_CLIENT_CB_FLAG_MASK (1 << NFSD4_CLIENT_CB_UPDATE | \ 1 << NFSD4_CLIENT_CB_KILL) unsigned long cl_flags; @@ -329,21 +386,43 @@ struct nfs4_replay { unsigned int rp_buflen; char *rp_buf; struct knfsd_fh rp_openfh; + struct mutex rp_mutex; char rp_ibuf[NFSD4_REPLAY_ISIZE]; }; +struct nfs4_stateowner; + +struct nfs4_stateowner_operations { + void (*so_unhash)(struct nfs4_stateowner *); + void (*so_free)(struct nfs4_stateowner *); +}; + +/* + * A core object that represents either an open or lock owner. The object and + * lock owner objects have one of these embedded within them. Refcounts and + * other fields common to both owner types are contained within these + * structures. + */ struct nfs4_stateowner { - struct list_head so_strhash; /* hash by op_name */ - struct list_head so_stateids; - struct nfs4_client * so_client; - /* after increment in ENCODE_SEQID_OP_TAIL, represents the next + struct list_head so_strhash; + struct list_head so_stateids; + struct nfs4_client *so_client; + const struct nfs4_stateowner_operations *so_ops; + /* after increment in nfsd4_bump_seqid, represents the next * sequence id expected from the client: */ - u32 so_seqid; - struct xdr_netobj so_owner; /* open owner name */ - struct nfs4_replay so_replay; - bool so_is_open_owner; + atomic_t so_count; + u32 so_seqid; + struct xdr_netobj so_owner; /* open owner name */ + struct nfs4_replay so_replay; + bool so_is_open_owner; }; +/* + * When a file is opened, the client provides an open state owner opaque string + * that indicates the "owner" of that open. These objects are refcounted. + * References to it are held by each open state associated with it. This object + * is a superset of the nfs4_stateowner struct. + */ struct nfs4_openowner { struct nfs4_stateowner oo_owner; /* must be first field */ struct list_head oo_perclient; @@ -358,15 +437,17 @@ struct nfs4_openowner { struct nfs4_ol_stateid *oo_last_closed_stid; time_t oo_time; /* time of placement on so_close_lru */ #define NFS4_OO_CONFIRMED 1 -#define NFS4_OO_NEW 4 unsigned char oo_flags; }; +/* + * Represents a generic "lockowner". Similar to an openowner. References to it + * are held by the lock stateids that are created on its behalf. This object is + * a superset of the nfs4_stateowner struct (or would be if it needed any extra + * fields). 
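The new so_ops vector plus so_count let code that only sees a generic nfs4_stateowner tear down either flavour of owner: the final put dispatches through the ops so the correct container (openowner or lockowner) gets unhashed and freed. A sketch under assumed helper semantics (the in-tree version also worries about taking cl_lock around the unhash, omitted here):

static void demo_put_stateowner(struct nfs4_stateowner *so)
{
	if (!atomic_dec_and_test(&so->so_count))
		return;

	so->so_ops->so_unhash(so);	/* drop hash/list linkage */
	so->so_ops->so_free(so);	/* frees the enclosing struct */
}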
+ */ struct nfs4_lockowner { struct nfs4_stateowner lo_owner; /* must be first element */ - struct list_head lo_owner_ino_hash; /* hash by owner,file */ - struct list_head lo_perstateid; - struct list_head lo_list; /* for temporary uses */ }; static inline struct nfs4_openowner * openowner(struct nfs4_stateowner *so) @@ -379,9 +460,17 @@ static inline struct nfs4_lockowner * lockowner(struct nfs4_stateowner *so) return container_of(so, struct nfs4_lockowner, lo_owner); } -/* nfs4_file: a file opened by some number of (open) nfs4_stateowners. */ +/* + * nfs4_file: a file opened by some number of (open) nfs4_stateowners. + * + * These objects are global. nfsd only keeps one instance of a nfs4_file per + * inode (though it may keep multiple file descriptors open per inode). These + * are tracked in the file_hashtbl which is protected by the state_lock + * spinlock. + */ struct nfs4_file { atomic_t fi_ref; + spinlock_t fi_lock; struct hlist_node fi_hash; /* hash by "struct inode *" */ struct list_head fi_stateids; struct list_head fi_delegations; @@ -395,49 +484,35 @@ struct nfs4_file { * + 1 to both of the above if NFS4_SHARE_ACCESS_BOTH is set. */ atomic_t fi_access[2]; + u32 fi_share_deny; struct file *fi_deleg_file; - struct file_lock *fi_lease; atomic_t fi_delegees; - struct inode *fi_inode; + struct knfsd_fh fi_fhandle; bool fi_had_conflict; }; -/* XXX: for first cut may fall back on returning file that doesn't work - * at all? */ -static inline struct file *find_writeable_file(struct nfs4_file *f) -{ - if (f->fi_fds[O_WRONLY]) - return f->fi_fds[O_WRONLY]; - return f->fi_fds[O_RDWR]; -} - -static inline struct file *find_readable_file(struct nfs4_file *f) -{ - if (f->fi_fds[O_RDONLY]) - return f->fi_fds[O_RDONLY]; - return f->fi_fds[O_RDWR]; -} - -static inline struct file *find_any_file(struct nfs4_file *f) -{ - if (f->fi_fds[O_RDWR]) - return f->fi_fds[O_RDWR]; - else if (f->fi_fds[O_WRONLY]) - return f->fi_fds[O_WRONLY]; - else - return f->fi_fds[O_RDONLY]; -} - -/* "ol" stands for "Open or Lock". Better suggestions welcome. */ +/* + * A generic struct representing either a open or lock stateid. The nfs4_client + * holds a reference to each of these objects, and they in turn hold a + * reference to their respective stateowners. The client's reference is + * released in response to a close or unlock (depending on whether it's an open + * or lock stateid) or when the client is being destroyed. + * + * In the case of v4.0 open stateids, these objects are preserved for a little + * while after close in order to handle CLOSE replays. Those are eventually + * reclaimed via a LRU scheme by the laundromat. + * + * This object is a superset of the nfs4_stid. "ol" stands for "Open or Lock". + * Better suggestions welcome. 
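The find_*_file() inlines deleted above relied on the fd array never changing underneath them; with the new fi_lock, the lookups move out of the header and take a file reference while the lock is held. A sketch of the locked variant (demo_ name invented; the in-tree replacements live in nfs4state.c):

static struct file *demo_find_any_file(struct nfs4_file *f)
{
	struct file *ret = NULL;

	spin_lock(&f->fi_lock);
	if (f->fi_fds[O_RDWR])
		ret = get_file(f->fi_fds[O_RDWR]);
	else if (f->fi_fds[O_WRONLY])
		ret = get_file(f->fi_fds[O_WRONLY]);
	else if (f->fi_fds[O_RDONLY])
		ret = get_file(f->fi_fds[O_RDONLY]);
	spin_unlock(&f->fi_lock);

	return ret;	/* caller owns a reference, or NULL */
}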
+ */ struct nfs4_ol_stateid { struct nfs4_stid st_stid; /* must be first field */ struct list_head st_perfile; struct list_head st_perstateowner; - struct list_head st_lockowners; + struct list_head st_locks; struct nfs4_stateowner * st_stateowner; - struct nfs4_file * st_file; - unsigned long st_access_bmap; - unsigned long st_deny_bmap; + unsigned char st_access_bmap; + unsigned char st_deny_bmap; struct nfs4_ol_stateid * st_openstp; }; @@ -450,33 +525,43 @@ static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s) #define RD_STATE 0x00000010 #define WR_STATE 0x00000020 +enum nfsd4_cb_op { + NFSPROC4_CLNT_CB_NULL = 0, + NFSPROC4_CLNT_CB_RECALL, + NFSPROC4_CLNT_CB_SEQUENCE, +}; + + struct nfsd4_compound_state; struct nfsd_net; extern __be32 nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate, stateid_t *stateid, int flags, struct file **filp); -extern void nfs4_lock_state(void); -extern void nfs4_unlock_state(void); +void nfs4_put_stid(struct nfs4_stid *s); void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *); extern void nfs4_release_reclaim(struct nfsd_net *); extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir, struct nfsd_net *nn); -extern __be32 nfs4_check_open_reclaim(clientid_t *clid, bool sessions, struct nfsd_net *nn); +extern __be32 nfs4_check_open_reclaim(clientid_t *clid, + struct nfsd4_compound_state *cstate, struct nfsd_net *nn); extern int set_callback_cred(void); -extern void nfsd4_init_callback(struct nfsd4_callback *); extern void nfsd4_probe_callback(struct nfs4_client *clp); extern void nfsd4_probe_callback_sync(struct nfs4_client *clp); extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *); -extern void nfsd4_cb_recall(struct nfs4_delegation *dp); +extern void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, + struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op); +extern void nfsd4_run_cb(struct nfsd4_callback *cb); extern int nfsd4_create_callback_queue(void); extern void nfsd4_destroy_callback_queue(void); extern void nfsd4_shutdown_callback(struct nfs4_client *); -extern void nfs4_put_delegation(struct nfs4_delegation *dp); +extern void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp); extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name, struct nfsd_net *nn); extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn); -extern void put_client_renew(struct nfs4_client *clp); + +/* grace period management */ +void nfsd4_end_grace(struct nfsd_net *nn); /* nfs4recover operations */ extern int nfsd4_client_tracking_init(struct net *net); @@ -484,25 +569,30 @@ extern void nfsd4_client_tracking_exit(struct net *net); extern void nfsd4_client_record_create(struct nfs4_client *clp); extern void nfsd4_client_record_remove(struct nfs4_client *clp); extern int nfsd4_client_record_check(struct nfs4_client *clp); -extern void nfsd4_record_grace_done(struct nfsd_net *nn, time_t boot_time); +extern void nfsd4_record_grace_done(struct nfsd_net *nn); /* nfs fault injection functions */ #ifdef CONFIG_NFSD_FAULT_INJECTION int nfsd_fault_inject_init(void); void nfsd_fault_inject_cleanup(void); -u64 nfsd_for_n_state(u64, u64 (*)(struct nfs4_client *, u64)); -struct nfs4_client *nfsd_find_client(struct sockaddr_storage *, size_t); - -u64 nfsd_forget_client(struct nfs4_client *, u64); -u64 nfsd_forget_client_locks(struct nfs4_client*, u64); -u64 nfsd_forget_client_openowners(struct nfs4_client *, u64); 
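st_access_bmap and st_deny_bmap shrink from unsigned long to unsigned char above because the NFSv4 share values only ever set bits 1-3 (READ=1, WRITE=2, BOTH=3), so the highest bit used is 1 << 3. Illustrative helpers (names assumed):

/* access is one of NFS4_SHARE_ACCESS_READ (1), _WRITE (2) or
 * _BOTH (3), so the whole mask fits comfortably in eight bits. */
static inline void demo_set_access(unsigned char *bmap, u32 access)
{
	*bmap |= 1 << access;
}

static inline bool demo_test_access(unsigned char bmap, u32 access)
{
	return bmap & (1 << access);
}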
-u64 nfsd_forget_client_delegations(struct nfs4_client *, u64); -u64 nfsd_recall_client_delegations(struct nfs4_client *, u64); - -u64 nfsd_print_client(struct nfs4_client *, u64); -u64 nfsd_print_client_locks(struct nfs4_client *, u64); -u64 nfsd_print_client_openowners(struct nfs4_client *, u64); -u64 nfsd_print_client_delegations(struct nfs4_client *, u64); + +u64 nfsd_inject_print_clients(void); +u64 nfsd_inject_forget_client(struct sockaddr_storage *, size_t); +u64 nfsd_inject_forget_clients(u64); + +u64 nfsd_inject_print_locks(void); +u64 nfsd_inject_forget_client_locks(struct sockaddr_storage *, size_t); +u64 nfsd_inject_forget_locks(u64); + +u64 nfsd_inject_print_openowners(void); +u64 nfsd_inject_forget_client_openowners(struct sockaddr_storage *, size_t); +u64 nfsd_inject_forget_openowners(u64); + +u64 nfsd_inject_print_delegations(void); +u64 nfsd_inject_forget_client_delegations(struct sockaddr_storage *, size_t); +u64 nfsd_inject_forget_delegations(u64); +u64 nfsd_inject_recall_client_delegations(struct sockaddr_storage *, size_t); +u64 nfsd_inject_recall_delegations(u64); #else /* CONFIG_NFSD_FAULT_INJECTION */ static inline int nfsd_fault_inject_init(void) { return 0; } static inline void nfsd_fault_inject_cleanup(void) {} diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index d49c778faecb..989129e2d6ea 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -189,8 +189,7 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name); dparent = fhp->fh_dentry; - exp = fhp->fh_export; - exp_get(exp); + exp = exp_get(fhp->fh_export); /* Lookup the name, but don't follow links */ if (isdotent(name, len)) { @@ -446,6 +445,16 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, if (err) goto out; size_change = 1; + + /* + * RFC5661, Section 18.30.4: + * Changing the size of a file with SETATTR indirectly + * changes the time_modify and change attributes. 
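exp_get(), used in the fh_compose and nfsd_lookup_dentry hunks, follows the usual kernel convention that get-helpers return their argument, which is what permits the one-line exp = exp_get(fhp->fh_export) form. The helper amounts to the following (a sketch matching the calls seen above, not a quote of the header):

static inline struct svc_export *demo_exp_get(struct svc_export *exp)
{
	cache_get(&exp->h);
	return exp;	/* returning the argument enables chaining */
}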
+ * + * (and similar for the older RFCs) + */ + if (iap->ia_size != i_size_read(inode)) + iap->ia_valid |= ATTR_MTIME; } iap->ia_valid |= ATTR_CTIME; @@ -464,7 +473,7 @@ out_put_write_access: if (size_change) put_write_access(inode); if (!err) - commit_metadata(fhp); + err = nfserrno(commit_metadata(fhp)); out: return err; } @@ -650,6 +659,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, { struct path path; struct inode *inode; + struct file *file; int flags = O_RDONLY|O_LARGEFILE; __be32 err; int host_err = 0; @@ -704,19 +714,25 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, else flags = O_WRONLY|O_LARGEFILE; } - *filp = dentry_open(&path, flags, current_cred()); - if (IS_ERR(*filp)) { - host_err = PTR_ERR(*filp); - *filp = NULL; - } else { - host_err = ima_file_check(*filp, may_flags, 0); - if (may_flags & NFSD_MAY_64BIT_COOKIE) - (*filp)->f_mode |= FMODE_64BITHASH; - else - (*filp)->f_mode |= FMODE_32BITHASH; + file = dentry_open(&path, flags, current_cred()); + if (IS_ERR(file)) { + host_err = PTR_ERR(file); + goto out_nfserr; } + host_err = ima_file_check(file, may_flags, 0); + if (host_err) { + nfsd_close(file); + goto out_nfserr; + } + + if (may_flags & NFSD_MAY_64BIT_COOKIE) + file->f_mode |= FMODE_64BITHASH; + else + file->f_mode |= FMODE_32BITHASH; + + *filp = file; out_nfserr: err = nfserrno(host_err); out: @@ -820,7 +836,8 @@ static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe, return __splice_from_pipe(pipe, sd, nfsd_splice_actor); } -__be32 nfsd_finish_read(struct file *file, unsigned long *count, int host_err) +static __be32 +nfsd_finish_read(struct file *file, unsigned long *count, int host_err) { if (host_err >= 0) { nfsdstats.io_read += host_err; @@ -831,7 +848,7 @@ __be32 nfsd_finish_read(struct file *file, unsigned long *count, int host_err) return nfserrno(host_err); } -int nfsd_splice_read(struct svc_rqst *rqstp, +__be32 nfsd_splice_read(struct svc_rqst *rqstp, struct file *file, loff_t offset, unsigned long *count) { struct splice_desc sd = { @@ -847,7 +864,7 @@ int nfsd_splice_read(struct svc_rqst *rqstp, return nfsd_finish_read(file, count, host_err); } -int nfsd_readv(struct file *file, loff_t offset, struct kvec *vec, int vlen, +__be32 nfsd_readv(struct file *file, loff_t offset, struct kvec *vec, int vlen, unsigned long *count) { mm_segment_t oldfs; @@ -1121,7 +1138,8 @@ nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp, iap->ia_valid &= ~(ATTR_UID|ATTR_GID); if (iap->ia_valid) return nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0); - return 0; + /* Callers expect file metadata to be committed here */ + return nfserrno(commit_metadata(resfhp)); } /* HPUX client sometimes creates a file in mode 000, and sets size to 0. @@ -1253,9 +1271,10 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, err = nfsd_create_setattr(rqstp, resfhp, iap); /* - * nfsd_setattr already committed the child. Transactional filesystems - * had a chance to commit changes for both parent and child - * simultaneously making the following commit_metadata a noop. + * nfsd_create_setattr already committed the child. Transactional + * filesystems had a chance to commit changes for both parent and + * child simultaneously making the following commit_metadata a + * noop.
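The nfsd_open() rework above untangles the old else-chain: once dentry_open() has succeeded, any later failure (here ima_file_check()) must release the file before the errno is converted, rather than leaking it as the old code could. The shape of the new path, condensed (the demo_ name is invented):

static __be32 demo_open_tail(struct path *path, int flags, int may_flags,
			     struct file **filp)
{
	struct file *file;
	int host_err;

	file = dentry_open(path, flags, current_cred());
	if (IS_ERR(file))
		return nfserrno(PTR_ERR(file));

	host_err = ima_file_check(file, may_flags, 0);
	if (host_err) {
		nfsd_close(file);	/* don't leak the open file */
		return nfserrno(host_err);
	}

	*filp = file;
	return nfs_ok;
}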
*/ err2 = nfserrno(commit_metadata(fhp)); if (err2) @@ -1426,7 +1445,8 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, err = nfsd_create_setattr(rqstp, resfhp, iap); /* - * nfsd_setattr already committed the child (and possibly also the parent). + * nfsd_create_setattr already committed the child + * (and possibly also the parent). */ if (!err) err = nfserrno(commit_metadata(fhp)); @@ -1504,16 +1524,15 @@ out_nfserr: __be32 nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *fname, int flen, - char *path, int plen, - struct svc_fh *resfhp, - struct iattr *iap) + char *path, + struct svc_fh *resfhp) { struct dentry *dentry, *dnew; __be32 err, cerr; int host_err; err = nfserr_noent; - if (!flen || !plen) + if (!flen || path[0] == '\0') goto out; err = nfserr_exist; if (isdotent(fname, flen)) @@ -1534,18 +1553,7 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, if (IS_ERR(dnew)) goto out_nfserr; - if (unlikely(path[plen] != 0)) { - char *path_alloced = kmalloc(plen+1, GFP_KERNEL); - if (path_alloced == NULL) - host_err = -ENOMEM; - else { - strncpy(path_alloced, path, plen); - path_alloced[plen] = 0; - host_err = vfs_symlink(dentry->d_inode, dnew, path_alloced); - kfree(path_alloced); - } - } else - host_err = vfs_symlink(dentry->d_inode, dnew, path); + host_err = vfs_symlink(dentry->d_inode, dnew, path); err = nfserrno(host_err); if (!err) err = nfserrno(commit_metadata(fhp)); @@ -2093,8 +2101,7 @@ nfsd_racache_init(int cache_size) if (raparm_hash[0].pb_head) return 0; nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE); - if (nperbucket < 2) - nperbucket = 2; + nperbucket = max(2, nperbucket); cache_size = nperbucket * RAPARM_HASH_SIZE; dprintk("nfsd: allocating %d readahead buffers.\n", cache_size); diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 91b6ae3f658b..c2ff3f14e5f6 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -74,9 +74,9 @@ struct raparms; __be32 nfsd_get_tmp_read_open(struct svc_rqst *, struct svc_fh *, struct file **, struct raparms **); void nfsd_put_tmp_read_open(struct file *, struct raparms *); -int nfsd_splice_read(struct svc_rqst *, +__be32 nfsd_splice_read(struct svc_rqst *, struct file *, loff_t, unsigned long *); -int nfsd_readv(struct file *, loff_t, struct kvec *, int, +__be32 nfsd_readv(struct file *, loff_t, struct kvec *, int, unsigned long *); __be32 nfsd_read(struct svc_rqst *, struct svc_fh *, loff_t, struct kvec *, int, unsigned long *); @@ -85,8 +85,8 @@ __be32 nfsd_write(struct svc_rqst *, struct svc_fh *,struct file *, __be32 nfsd_readlink(struct svc_rqst *, struct svc_fh *, char *, int *); __be32 nfsd_symlink(struct svc_rqst *, struct svc_fh *, - char *name, int len, char *path, int plen, - struct svc_fh *res, struct iattr *); + char *name, int len, char *path, + struct svc_fh *res); __be32 nfsd_link(struct svc_rqst *, struct svc_fh *, char *, int, struct svc_fh *); __be32 nfsd_rename(struct svc_rqst *, diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 18cbb6d9c8a9..5720e9457f33 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -55,6 +55,7 @@ struct nfsd4_compound_state { struct svc_fh current_fh; struct svc_fh save_fh; struct nfs4_stateowner *replay_owner; + struct nfs4_client *clp; /* For sessions DRC */ struct nfsd4_session *session; struct nfsd4_slot *slot; @@ -107,8 +108,8 @@ struct nfsd4_create { u32 cr_type; /* request */ union { /* request */ struct { - u32 namelen; - char *name; + u32 datalen; + char *data; } link; /* NF4LNK */ struct { u32 specdata1; @@ -121,8 +122,8 @@ struct nfsd4_create { struct 
nfs4_acl *cr_acl; struct xdr_netobj cr_label; }; -#define cr_linklen u.link.namelen -#define cr_linkname u.link.name +#define cr_datalen u.link.datalen +#define cr_data u.link.data #define cr_specdata1 u.dev.specdata1 #define cr_specdata2 u.dev.specdata2 @@ -427,6 +428,17 @@ struct nfsd4_reclaim_complete { u32 rca_one_fs; }; +struct nfsd4_seek { + /* request */ + stateid_t seek_stateid; + loff_t seek_offset; + u32 seek_whence; + + /* response */ + u32 seek_eof; + loff_t seek_pos; +}; + struct nfsd4_op { int opnum; __be32 status; @@ -472,12 +484,23 @@ struct nfsd4_op { struct nfsd4_reclaim_complete reclaim_complete; struct nfsd4_test_stateid test_stateid; struct nfsd4_free_stateid free_stateid; + + /* NFSv4.2 */ + struct nfsd4_seek seek; } u; struct nfs4_replay * replay; }; bool nfsd4_cache_this_op(struct nfsd4_op *); +/* + * Memory needed just for the duration of processing one compound: + */ +struct svcxdr_tmpbuf { + struct svcxdr_tmpbuf *next; + char buf[]; +}; + struct nfsd4_compoundargs { /* scratch variables for XDR decode */ __be32 * p; @@ -486,11 +509,7 @@ struct nfsd4_compoundargs { int pagelen; __be32 tmp[8]; __be32 * tmpp; - struct tmpbuf { - struct tmpbuf *next; - void (*release)(const void *); - void *buf; - } *to_free; + struct svcxdr_tmpbuf *to_free; struct svc_rqst *rqstp; @@ -574,7 +593,6 @@ extern __be32 nfsd4_setclientid(struct svc_rqst *rqstp, extern __be32 nfsd4_setclientid_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *, struct nfsd4_setclientid_confirm *setclientid_confirm); -extern void nfsd4_store_cache_entry(struct nfsd4_compoundres *resp); extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *, struct nfsd4_exchange_id *); extern __be32 nfsd4_backchannel_ctl(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_backchannel_ctl *); @@ -585,6 +603,7 @@ extern __be32 nfsd4_create_session(struct svc_rqst *, extern __be32 nfsd4_sequence(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_sequence *); +extern void nfsd4_sequence_done(struct nfsd4_compoundres *resp); extern __be32 nfsd4_destroy_session(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_destroy_session *); @@ -594,7 +613,9 @@ extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *, struct nfsd4_open *open, struct nfsd_net *nn); extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open); -extern void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status); +extern void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate); +extern void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate, + struct nfsd4_open *open, __be32 status); extern __be32 nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *, struct nfsd4_open_confirm *oc); extern __be32 nfsd4_close(struct svc_rqst *rqstp, @@ -625,6 +646,7 @@ extern __be32 nfsd4_test_stateid(struct svc_rqst *rqstp, extern __be32 nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *, struct nfsd4_free_stateid *free_stateid); extern void nfsd4_bump_seqid(struct nfsd4_compound_state *, __be32 nfserr); + #endif /* diff --git a/fs/nilfs2/Makefile b/fs/nilfs2/Makefile index 85c98737a146..fc603e0431bb 100644 --- a/fs/nilfs2/Makefile +++ b/fs/nilfs2/Makefile @@ -2,4 +2,4 @@ obj-$(CONFIG_NILFS2_FS) += nilfs2.o nilfs2-y := inode.o file.o dir.o super.o namei.o page.o mdt.o \ btnode.o bmap.o btree.o direct.o dat.o recovery.o \ the_nilfs.o segbuf.o segment.o cpfile.o sufile.o \ - 
ifile.o alloc.o gcinode.o ioctl.o + ifile.o alloc.o gcinode.o ioctl.o sysfs.o diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 6252b173a465..d071e7f23de2 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -24,6 +24,7 @@ #include <linux/buffer_head.h> #include <linux/gfp.h> #include <linux/mpage.h> +#include <linux/pagemap.h> #include <linux/writeback.h> #include <linux/aio.h> #include "nilfs.h" @@ -219,10 +220,10 @@ static int nilfs_writepage(struct page *page, struct writeback_control *wbc) static int nilfs_set_page_dirty(struct page *page) { + struct inode *inode = page->mapping->host; int ret = __set_page_dirty_nobuffers(page); if (page_has_buffers(page)) { - struct inode *inode = page->mapping->host; unsigned nr_dirty = 0; struct buffer_head *bh, *head; @@ -245,6 +246,10 @@ static int nilfs_set_page_dirty(struct page *page) if (nr_dirty) nilfs_set_file_dirty(inode, nr_dirty); + } else if (ret) { + unsigned nr_dirty = 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits); + + nilfs_set_file_dirty(inode, nr_dirty); } return ret; } diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 9bc72dec3fa6..0696161bf59d 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -320,6 +320,14 @@ int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *); int nilfs_init_gcinode(struct inode *inode); void nilfs_remove_all_gcinodes(struct the_nilfs *nilfs); +/* sysfs.c */ +int __init nilfs_sysfs_init(void); +void nilfs_sysfs_exit(void); +int nilfs_sysfs_create_device_group(struct super_block *); +void nilfs_sysfs_delete_device_group(struct the_nilfs *); +int nilfs_sysfs_create_snapshot_group(struct nilfs_root *); +void nilfs_sysfs_delete_snapshot_group(struct nilfs_root *); + /* * Inodes and files operations */ diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 8c532b2ca3ab..228f5bdf0772 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -942,7 +942,7 @@ static int nilfs_get_root_dentry(struct super_block *sb, iput(inode); } } else { - dentry = d_obtain_alias(inode); + dentry = d_obtain_root(inode); if (IS_ERR(dentry)) { ret = PTR_ERR(dentry); goto failed_dentry; @@ -1452,13 +1452,19 @@ static int __init init_nilfs_fs(void) if (err) goto fail; - err = register_filesystem(&nilfs_fs_type); + err = nilfs_sysfs_init(); if (err) goto free_cachep; + err = register_filesystem(&nilfs_fs_type); + if (err) + goto deinit_sysfs_entry; + printk(KERN_INFO "NILFS version 2 loaded\n"); return 0; +deinit_sysfs_entry: + nilfs_sysfs_exit(); free_cachep: nilfs_destroy_cachep(); fail: @@ -1468,6 +1474,7 @@ fail: static void __exit exit_nilfs_fs(void) { nilfs_destroy_cachep(); + nilfs_sysfs_exit(); unregister_filesystem(&nilfs_fs_type); } diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c new file mode 100644 index 000000000000..bbb0dcc35905 --- /dev/null +++ b/fs/nilfs2/sysfs.c @@ -0,0 +1,1137 @@ +/* + * sysfs.c - sysfs support implementation. + * + * Copyright (C) 2005-2014 Nippon Telegraph and Telephone Corporation. + * Copyright (C) 2014 HGST, Inc., a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
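In the nilfs_set_page_dirty() hunk above, a page that becomes dirty before it has buffer heads is accounted as one dirty block per filesystem block spanned by the page; the shift computes that count. For example (sketch):

static unsigned demo_blocks_per_page(struct inode *inode)
{
	/* 4 KiB pages with 1 KiB blocks: 1 << (12 - 10) = 4 blocks */
	return 1U << (PAGE_CACHE_SHIFT - inode->i_blkbits);
}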
+ * + * Written by Vyacheslav Dubeyko <Vyacheslav.Dubeyko@hgst.com> + */ + +#include <linux/kobject.h> + +#include "nilfs.h" +#include "mdt.h" +#include "sufile.h" +#include "cpfile.h" +#include "sysfs.h" + +/* /sys/fs/<nilfs>/ */ +static struct kset *nilfs_kset; + +#define NILFS_SHOW_TIME(time_t_val, buf) ({ \ + struct tm res; \ + int count = 0; \ + time_to_tm(time_t_val, 0, &res); \ + res.tm_year += 1900; \ + res.tm_mon += 1; \ + count = scnprintf(buf, PAGE_SIZE, \ + "%ld-%.2d-%.2d %.2d:%.2d:%.2d\n", \ + res.tm_year, res.tm_mon, res.tm_mday, \ + res.tm_hour, res.tm_min, res.tm_sec);\ + count; \ +}) + +#define NILFS_DEV_INT_GROUP_OPS(name, parent_name) \ +static ssize_t nilfs_##name##_attr_show(struct kobject *kobj, \ + struct attribute *attr, char *buf) \ +{ \ + struct the_nilfs *nilfs = container_of(kobj->parent, \ + struct the_nilfs, \ + ns_##parent_name##_kobj); \ + struct nilfs_##name##_attr *a = container_of(attr, \ + struct nilfs_##name##_attr, \ + attr); \ + return a->show ? a->show(a, nilfs, buf) : 0; \ +} \ +static ssize_t nilfs_##name##_attr_store(struct kobject *kobj, \ + struct attribute *attr, \ + const char *buf, size_t len) \ +{ \ + struct the_nilfs *nilfs = container_of(kobj->parent, \ + struct the_nilfs, \ + ns_##parent_name##_kobj); \ + struct nilfs_##name##_attr *a = container_of(attr, \ + struct nilfs_##name##_attr, \ + attr); \ + return a->store ? a->store(a, nilfs, buf, len) : 0; \ +} \ +static const struct sysfs_ops nilfs_##name##_attr_ops = { \ + .show = nilfs_##name##_attr_show, \ + .store = nilfs_##name##_attr_store, \ +}; + +#define NILFS_DEV_INT_GROUP_TYPE(name, parent_name) \ +static void nilfs_##name##_attr_release(struct kobject *kobj) \ +{ \ + struct nilfs_sysfs_##parent_name##_subgroups *subgroups; \ + struct the_nilfs *nilfs = container_of(kobj->parent, \ + struct the_nilfs, \ + ns_##parent_name##_kobj); \ + subgroups = nilfs->ns_##parent_name##_subgroups; \ + complete(&subgroups->sg_##name##_kobj_unregister); \ +} \ +static struct kobj_type nilfs_##name##_ktype = { \ + .default_attrs = nilfs_##name##_attrs, \ + .sysfs_ops = &nilfs_##name##_attr_ops, \ + .release = nilfs_##name##_attr_release, \ +}; + +#define NILFS_DEV_INT_GROUP_FNS(name, parent_name) \ +static int nilfs_sysfs_create_##name##_group(struct the_nilfs *nilfs) \ +{ \ + struct kobject *parent; \ + struct kobject *kobj; \ + struct completion *kobj_unregister; \ + struct nilfs_sysfs_##parent_name##_subgroups *subgroups; \ + int err; \ + subgroups = nilfs->ns_##parent_name##_subgroups; \ + kobj = &subgroups->sg_##name##_kobj; \ + kobj_unregister = &subgroups->sg_##name##_kobj_unregister; \ + parent = &nilfs->ns_##parent_name##_kobj; \ + kobj->kset = nilfs_kset; \ + init_completion(kobj_unregister); \ + err = kobject_init_and_add(kobj, &nilfs_##name##_ktype, parent, \ + #name); \ + if (err) \ + return err; \ + return 0; \ +} \ +static void nilfs_sysfs_delete_##name##_group(struct the_nilfs *nilfs) \ +{ \ + kobject_del(&nilfs->ns_##parent_name##_subgroups->sg_##name##_kobj); \ +} + +/************************************************************************ + * NILFS snapshot attrs * + ************************************************************************/ + +static ssize_t +nilfs_snapshot_inodes_count_show(struct nilfs_snapshot_attr *attr, + struct nilfs_root *root, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)atomic64_read(&root->inodes_count)); +} + +static ssize_t +nilfs_snapshot_blocks_count_show(struct nilfs_snapshot_attr *attr, + struct nilfs_root *root, 
char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)atomic64_read(&root->blocks_count)); +} + +static const char snapshot_readme_str[] = + "The group contains details about mounted snapshot.\n\n" + "(1) inodes_count\n\tshow number of inodes for snapshot.\n\n" + "(2) blocks_count\n\tshow number of blocks for snapshot.\n\n"; + +static ssize_t +nilfs_snapshot_README_show(struct nilfs_snapshot_attr *attr, + struct nilfs_root *root, char *buf) +{ + return snprintf(buf, PAGE_SIZE, snapshot_readme_str); +} + +NILFS_SNAPSHOT_RO_ATTR(inodes_count); +NILFS_SNAPSHOT_RO_ATTR(blocks_count); +NILFS_SNAPSHOT_RO_ATTR(README); + +static struct attribute *nilfs_snapshot_attrs[] = { + NILFS_SNAPSHOT_ATTR_LIST(inodes_count), + NILFS_SNAPSHOT_ATTR_LIST(blocks_count), + NILFS_SNAPSHOT_ATTR_LIST(README), + NULL, +}; + +static ssize_t nilfs_snapshot_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct nilfs_root *root = + container_of(kobj, struct nilfs_root, snapshot_kobj); + struct nilfs_snapshot_attr *a = + container_of(attr, struct nilfs_snapshot_attr, attr); + + return a->show ? a->show(a, root, buf) : 0; +} + +static ssize_t nilfs_snapshot_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct nilfs_root *root = + container_of(kobj, struct nilfs_root, snapshot_kobj); + struct nilfs_snapshot_attr *a = + container_of(attr, struct nilfs_snapshot_attr, attr); + + return a->store ? a->store(a, root, buf, len) : 0; +} + +static void nilfs_snapshot_attr_release(struct kobject *kobj) +{ + struct nilfs_root *root = container_of(kobj, struct nilfs_root, + snapshot_kobj); + complete(&root->snapshot_kobj_unregister); +} + +static const struct sysfs_ops nilfs_snapshot_attr_ops = { + .show = nilfs_snapshot_attr_show, + .store = nilfs_snapshot_attr_store, +}; + +static struct kobj_type nilfs_snapshot_ktype = { + .default_attrs = nilfs_snapshot_attrs, + .sysfs_ops = &nilfs_snapshot_attr_ops, + .release = nilfs_snapshot_attr_release, +}; + +int nilfs_sysfs_create_snapshot_group(struct nilfs_root *root) +{ + struct the_nilfs *nilfs; + struct kobject *parent; + int err; + + nilfs = root->nilfs; + parent = &nilfs->ns_dev_subgroups->sg_mounted_snapshots_kobj; + root->snapshot_kobj.kset = nilfs_kset; + init_completion(&root->snapshot_kobj_unregister); + + if (root->cno == NILFS_CPTREE_CURRENT_CNO) { + err = kobject_init_and_add(&root->snapshot_kobj, + &nilfs_snapshot_ktype, + &nilfs->ns_dev_kobj, + "current_checkpoint"); + } else { + err = kobject_init_and_add(&root->snapshot_kobj, + &nilfs_snapshot_ktype, + parent, + "%llu", root->cno); + } + + if (err) + return err; + + return 0; +} + +void nilfs_sysfs_delete_snapshot_group(struct nilfs_root *root) +{ + kobject_del(&root->snapshot_kobj); +} + +/************************************************************************ + * NILFS mounted snapshots attrs * + ************************************************************************/ + +static const char mounted_snapshots_readme_str[] = + "The mounted_snapshots group contains group for\n" + "every mounted snapshot.\n"; + +static ssize_t +nilfs_mounted_snapshots_README_show(struct nilfs_mounted_snapshots_attr *attr, + struct the_nilfs *nilfs, char *buf) +{ + return snprintf(buf, PAGE_SIZE, mounted_snapshots_readme_str); +} + +NILFS_MOUNTED_SNAPSHOTS_RO_ATTR(README); + +static struct attribute *nilfs_mounted_snapshots_attrs[] = { + NILFS_MOUNTED_SNAPSHOTS_ATTR_LIST(README), + NULL, +}; + +NILFS_DEV_INT_GROUP_OPS(mounted_snapshots, 
dev); +NILFS_DEV_INT_GROUP_TYPE(mounted_snapshots, dev); +NILFS_DEV_INT_GROUP_FNS(mounted_snapshots, dev); + +/************************************************************************ + * NILFS checkpoints attrs * + ************************************************************************/ + +static ssize_t +nilfs_checkpoints_checkpoints_number_show(struct nilfs_checkpoints_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + __u64 ncheckpoints; + struct nilfs_cpstat cpstat; + int err; + + down_read(&nilfs->ns_segctor_sem); + err = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat); + up_read(&nilfs->ns_segctor_sem); + if (err < 0) { + printk(KERN_ERR "NILFS: unable to get checkpoint stat: err=%d\n", + err); + return err; + } + + ncheckpoints = cpstat.cs_ncps; + + return snprintf(buf, PAGE_SIZE, "%llu\n", ncheckpoints); +} + +static ssize_t +nilfs_checkpoints_snapshots_number_show(struct nilfs_checkpoints_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + __u64 nsnapshots; + struct nilfs_cpstat cpstat; + int err; + + down_read(&nilfs->ns_segctor_sem); + err = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat); + up_read(&nilfs->ns_segctor_sem); + if (err < 0) { + printk(KERN_ERR "NILFS: unable to get checkpoint stat: err=%d\n", + err); + return err; + } + + nsnapshots = cpstat.cs_nsss; + + return snprintf(buf, PAGE_SIZE, "%llu\n", nsnapshots); +} + +static ssize_t +nilfs_checkpoints_last_seg_checkpoint_show(struct nilfs_checkpoints_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + __u64 last_cno; + + spin_lock(&nilfs->ns_last_segment_lock); + last_cno = nilfs->ns_last_cno; + spin_unlock(&nilfs->ns_last_segment_lock); + + return snprintf(buf, PAGE_SIZE, "%llu\n", last_cno); +} + +static ssize_t +nilfs_checkpoints_next_checkpoint_show(struct nilfs_checkpoints_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + __u64 cno; + + down_read(&nilfs->ns_sem); + cno = nilfs->ns_cno; + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%llu\n", cno); +} + +static const char checkpoints_readme_str[] = + "The checkpoints group contains attributes that describe\n" + "details about volume's checkpoints.\n\n" + "(1) checkpoints_number\n\tshow number of checkpoints on volume.\n\n" + "(2) snapshots_number\n\tshow number of snapshots on volume.\n\n" + "(3) last_seg_checkpoint\n" + "\tshow checkpoint number of the latest segment.\n\n" + "(4) next_checkpoint\n\tshow next checkpoint number.\n\n"; + +static ssize_t +nilfs_checkpoints_README_show(struct nilfs_checkpoints_attr *attr, + struct the_nilfs *nilfs, char *buf) +{ + return snprintf(buf, PAGE_SIZE, checkpoints_readme_str); +} + +NILFS_CHECKPOINTS_RO_ATTR(checkpoints_number); +NILFS_CHECKPOINTS_RO_ATTR(snapshots_number); +NILFS_CHECKPOINTS_RO_ATTR(last_seg_checkpoint); +NILFS_CHECKPOINTS_RO_ATTR(next_checkpoint); +NILFS_CHECKPOINTS_RO_ATTR(README); + +static struct attribute *nilfs_checkpoints_attrs[] = { + NILFS_CHECKPOINTS_ATTR_LIST(checkpoints_number), + NILFS_CHECKPOINTS_ATTR_LIST(snapshots_number), + NILFS_CHECKPOINTS_ATTR_LIST(last_seg_checkpoint), + NILFS_CHECKPOINTS_ATTR_LIST(next_checkpoint), + NILFS_CHECKPOINTS_ATTR_LIST(README), + NULL, +}; + +NILFS_DEV_INT_GROUP_OPS(checkpoints, dev); +NILFS_DEV_INT_GROUP_TYPE(checkpoints, dev); +NILFS_DEV_INT_GROUP_FNS(checkpoints, dev); + +/************************************************************************ + * NILFS segments attrs * + ************************************************************************/ + +static ssize_t +nilfs_segments_segments_number_show(struct 
nilfs_segments_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%lu\n", nilfs->ns_nsegments); +} + +static ssize_t +nilfs_segments_blocks_per_segment_show(struct nilfs_segments_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%lu\n", nilfs->ns_blocks_per_segment); +} + +static ssize_t +nilfs_segments_clean_segments_show(struct nilfs_segments_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + unsigned long ncleansegs; + + down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); + ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile); + up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); + + return snprintf(buf, PAGE_SIZE, "%lu\n", ncleansegs); +} + +static ssize_t +nilfs_segments_dirty_segments_show(struct nilfs_segments_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + struct nilfs_sustat sustat; + int err; + + down_read(&nilfs->ns_segctor_sem); + err = nilfs_sufile_get_stat(nilfs->ns_sufile, &sustat); + up_read(&nilfs->ns_segctor_sem); + if (err < 0) { + printk(KERN_ERR "NILFS: unable to get segment stat: err=%d\n", + err); + return err; + } + + return snprintf(buf, PAGE_SIZE, "%llu\n", sustat.ss_ndirtysegs); +} + +static const char segments_readme_str[] = + "The segments group contains attributes that describe\n" + "details about volume's segments.\n\n" + "(1) segments_number\n\tshow number of segments on volume.\n\n" + "(2) blocks_per_segment\n\tshow number of blocks in segment.\n\n" + "(3) clean_segments\n\tshow count of clean segments.\n\n" + "(4) dirty_segments\n\tshow count of dirty segments.\n\n"; + +static ssize_t +nilfs_segments_README_show(struct nilfs_segments_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, segments_readme_str); +} + +NILFS_SEGMENTS_RO_ATTR(segments_number); +NILFS_SEGMENTS_RO_ATTR(blocks_per_segment); +NILFS_SEGMENTS_RO_ATTR(clean_segments); +NILFS_SEGMENTS_RO_ATTR(dirty_segments); +NILFS_SEGMENTS_RO_ATTR(README); + +static struct attribute *nilfs_segments_attrs[] = { + NILFS_SEGMENTS_ATTR_LIST(segments_number), + NILFS_SEGMENTS_ATTR_LIST(blocks_per_segment), + NILFS_SEGMENTS_ATTR_LIST(clean_segments), + NILFS_SEGMENTS_ATTR_LIST(dirty_segments), + NILFS_SEGMENTS_ATTR_LIST(README), + NULL, +}; + +NILFS_DEV_INT_GROUP_OPS(segments, dev); +NILFS_DEV_INT_GROUP_TYPE(segments, dev); +NILFS_DEV_INT_GROUP_FNS(segments, dev); + +/************************************************************************ + * NILFS segctor attrs * + ************************************************************************/ + +static ssize_t +nilfs_segctor_last_pseg_block_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + sector_t last_pseg; + + spin_lock(&nilfs->ns_last_segment_lock); + last_pseg = nilfs->ns_last_pseg; + spin_unlock(&nilfs->ns_last_segment_lock); + + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)last_pseg); +} + +static ssize_t +nilfs_segctor_last_seg_sequence_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + u64 last_seq; + + spin_lock(&nilfs->ns_last_segment_lock); + last_seq = nilfs->ns_last_seq; + spin_unlock(&nilfs->ns_last_segment_lock); + + return snprintf(buf, PAGE_SIZE, "%llu\n", last_seq); +} + +static ssize_t +nilfs_segctor_last_seg_checkpoint_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + __u64 last_cno; + + spin_lock(&nilfs->ns_last_segment_lock); + last_cno = nilfs->ns_last_cno; + spin_unlock(&nilfs->ns_last_segment_lock); + 
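Every segctor show() method here follows the same discipline: take whichever lock guards the field (ns_last_segment_lock or ns_sem), copy the value to a local, unlock, and only then format it, so the PAGE_SIZE-bounded snprintf() never runs under the lock. Condensed (demo_ name invented):

static ssize_t demo_show_last_seq(struct the_nilfs *nilfs, char *buf)
{
	u64 last_seq;

	spin_lock(&nilfs->ns_last_segment_lock);
	last_seq = nilfs->ns_last_seq;	/* snapshot under the lock */
	spin_unlock(&nilfs->ns_last_segment_lock);

	return snprintf(buf, PAGE_SIZE, "%llu\n",
			(unsigned long long)last_seq);
}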
+ return snprintf(buf, PAGE_SIZE, "%llu\n", last_cno); +} + +static ssize_t +nilfs_segctor_current_seg_sequence_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + u64 seg_seq; + + down_read(&nilfs->ns_sem); + seg_seq = nilfs->ns_seg_seq; + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%llu\n", seg_seq); +} + +static ssize_t +nilfs_segctor_current_last_full_seg_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + __u64 segnum; + + down_read(&nilfs->ns_sem); + segnum = nilfs->ns_segnum; + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%llu\n", segnum); +} + +static ssize_t +nilfs_segctor_next_full_seg_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + __u64 nextnum; + + down_read(&nilfs->ns_sem); + nextnum = nilfs->ns_nextnum; + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%llu\n", nextnum); +} + +static ssize_t +nilfs_segctor_next_pseg_offset_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + unsigned long pseg_offset; + + down_read(&nilfs->ns_sem); + pseg_offset = nilfs->ns_pseg_offset; + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%lu\n", pseg_offset); +} + +static ssize_t +nilfs_segctor_next_checkpoint_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + __u64 cno; + + down_read(&nilfs->ns_sem); + cno = nilfs->ns_cno; + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%llu\n", cno); +} + +static ssize_t +nilfs_segctor_last_seg_write_time_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + time_t ctime; + + down_read(&nilfs->ns_sem); + ctime = nilfs->ns_ctime; + up_read(&nilfs->ns_sem); + + return NILFS_SHOW_TIME(ctime, buf); +} + +static ssize_t +nilfs_segctor_last_seg_write_time_secs_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + time_t ctime; + + down_read(&nilfs->ns_sem); + ctime = nilfs->ns_ctime; + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)ctime); +} + +static ssize_t +nilfs_segctor_last_nongc_write_time_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + time_t nongc_ctime; + + down_read(&nilfs->ns_sem); + nongc_ctime = nilfs->ns_nongc_ctime; + up_read(&nilfs->ns_sem); + + return NILFS_SHOW_TIME(nongc_ctime, buf); +} + +static ssize_t +nilfs_segctor_last_nongc_write_time_secs_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + time_t nongc_ctime; + + down_read(&nilfs->ns_sem); + nongc_ctime = nilfs->ns_nongc_ctime; + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)nongc_ctime); +} + +static ssize_t +nilfs_segctor_dirty_data_blocks_count_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + u32 ndirtyblks; + + down_read(&nilfs->ns_sem); + ndirtyblks = atomic_read(&nilfs->ns_ndirtyblks); + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%u\n", ndirtyblks); +} + +static const char segctor_readme_str[] = + "The segctor group contains attributes that describe\n" + "segctor thread activity details.\n\n" + "(1) last_pseg_block\n" + "\tshow start block number of the latest segment.\n\n" + "(2) last_seg_sequence\n" + "\tshow sequence value of the latest segment.\n\n" + "(3) last_seg_checkpoint\n" + "\tshow checkpoint number of the latest segment.\n\n" + "(4) current_seg_sequence\n\tshow segment 
sequence counter.\n\n" + "(5) current_last_full_seg\n" + "\tshow index number of the latest full segment.\n\n" + "(6) next_full_seg\n" + "\tshow index number of the full segment index to be used next.\n\n" + "(7) next_pseg_offset\n" + "\tshow offset of next partial segment in the current full segment.\n\n" + "(8) next_checkpoint\n\tshow next checkpoint number.\n\n" + "(9) last_seg_write_time\n" + "\tshow write time of the last segment in human-readable format.\n\n" + "(10) last_seg_write_time_secs\n" + "\tshow write time of the last segment in seconds.\n\n" + "(11) last_nongc_write_time\n" + "\tshow write time of the last segment not for cleaner operation " + "in human-readable format.\n\n" + "(12) last_nongc_write_time_secs\n" + "\tshow write time of the last segment not for cleaner operation " + "in seconds.\n\n" + "(13) dirty_data_blocks_count\n" + "\tshow number of dirty data blocks.\n\n"; + +static ssize_t +nilfs_segctor_README_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, char *buf) +{ + return snprintf(buf, PAGE_SIZE, segctor_readme_str); +} + +NILFS_SEGCTOR_RO_ATTR(last_pseg_block); +NILFS_SEGCTOR_RO_ATTR(last_seg_sequence); +NILFS_SEGCTOR_RO_ATTR(last_seg_checkpoint); +NILFS_SEGCTOR_RO_ATTR(current_seg_sequence); +NILFS_SEGCTOR_RO_ATTR(current_last_full_seg); +NILFS_SEGCTOR_RO_ATTR(next_full_seg); +NILFS_SEGCTOR_RO_ATTR(next_pseg_offset); +NILFS_SEGCTOR_RO_ATTR(next_checkpoint); +NILFS_SEGCTOR_RO_ATTR(last_seg_write_time); +NILFS_SEGCTOR_RO_ATTR(last_seg_write_time_secs); +NILFS_SEGCTOR_RO_ATTR(last_nongc_write_time); +NILFS_SEGCTOR_RO_ATTR(last_nongc_write_time_secs); +NILFS_SEGCTOR_RO_ATTR(dirty_data_blocks_count); +NILFS_SEGCTOR_RO_ATTR(README); + +static struct attribute *nilfs_segctor_attrs[] = { + NILFS_SEGCTOR_ATTR_LIST(last_pseg_block), + NILFS_SEGCTOR_ATTR_LIST(last_seg_sequence), + NILFS_SEGCTOR_ATTR_LIST(last_seg_checkpoint), + NILFS_SEGCTOR_ATTR_LIST(current_seg_sequence), + NILFS_SEGCTOR_ATTR_LIST(current_last_full_seg), + NILFS_SEGCTOR_ATTR_LIST(next_full_seg), + NILFS_SEGCTOR_ATTR_LIST(next_pseg_offset), + NILFS_SEGCTOR_ATTR_LIST(next_checkpoint), + NILFS_SEGCTOR_ATTR_LIST(last_seg_write_time), + NILFS_SEGCTOR_ATTR_LIST(last_seg_write_time_secs), + NILFS_SEGCTOR_ATTR_LIST(last_nongc_write_time), + NILFS_SEGCTOR_ATTR_LIST(last_nongc_write_time_secs), + NILFS_SEGCTOR_ATTR_LIST(dirty_data_blocks_count), + NILFS_SEGCTOR_ATTR_LIST(README), + NULL, +}; + +NILFS_DEV_INT_GROUP_OPS(segctor, dev); +NILFS_DEV_INT_GROUP_TYPE(segctor, dev); +NILFS_DEV_INT_GROUP_FNS(segctor, dev); + +/************************************************************************ + * NILFS superblock attrs * + ************************************************************************/ + +static ssize_t +nilfs_superblock_sb_write_time_show(struct nilfs_superblock_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + time_t sbwtime; + + down_read(&nilfs->ns_sem); + sbwtime = nilfs->ns_sbwtime; + up_read(&nilfs->ns_sem); + + return NILFS_SHOW_TIME(sbwtime, buf); +} + +static ssize_t +nilfs_superblock_sb_write_time_secs_show(struct nilfs_superblock_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + time_t sbwtime; + + down_read(&nilfs->ns_sem); + sbwtime = nilfs->ns_sbwtime; + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)sbwtime); +} + +static ssize_t +nilfs_superblock_sb_write_count_show(struct nilfs_superblock_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + unsigned sbwcount; + + down_read(&nilfs->ns_sem); + sbwcount = 
nilfs->ns_sbwcount; + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%u\n", sbwcount); +} + +static ssize_t +nilfs_superblock_sb_update_frequency_show(struct nilfs_superblock_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + unsigned sb_update_freq; + + down_read(&nilfs->ns_sem); + sb_update_freq = nilfs->ns_sb_update_freq; + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%u\n", sb_update_freq); +} + +static ssize_t +nilfs_superblock_sb_update_frequency_store(struct nilfs_superblock_attr *attr, + struct the_nilfs *nilfs, + const char *buf, size_t count) +{ + unsigned val; + int err; + + err = kstrtouint(skip_spaces(buf), 0, &val); + if (err) { + printk(KERN_ERR "NILFS: unable to convert string: err=%d\n", + err); + return err; + } + + if (val < NILFS_SB_FREQ) { + val = NILFS_SB_FREQ; + printk(KERN_WARNING "NILFS: superblock update frequency cannot be lesser than 10 seconds\n"); + } + + down_write(&nilfs->ns_sem); + nilfs->ns_sb_update_freq = val; + up_write(&nilfs->ns_sem); + + return count; +} + +static const char sb_readme_str[] = + "The superblock group contains attributes that describe\n" + "superblock's details.\n\n" + "(1) sb_write_time\n\tshow previous write time of super block " + "in human-readable format.\n\n" + "(2) sb_write_time_secs\n\tshow previous write time of super block " + "in seconds.\n\n" + "(3) sb_write_count\n\tshow write count of super block.\n\n" + "(4) sb_update_frequency\n" + "\tshow/set interval of periodical update of superblock (in seconds).\n\n" + "\tYou can set preferable frequency of superblock update by command:\n\n" + "\t'echo <val> > /sys/fs/<nilfs>/<dev>/superblock/sb_update_frequency'\n"; + +static ssize_t +nilfs_superblock_README_show(struct nilfs_superblock_attr *attr, + struct the_nilfs *nilfs, char *buf) +{ + return snprintf(buf, PAGE_SIZE, sb_readme_str); +} + +NILFS_SUPERBLOCK_RO_ATTR(sb_write_time); +NILFS_SUPERBLOCK_RO_ATTR(sb_write_time_secs); +NILFS_SUPERBLOCK_RO_ATTR(sb_write_count); +NILFS_SUPERBLOCK_RW_ATTR(sb_update_frequency); +NILFS_SUPERBLOCK_RO_ATTR(README); + +static struct attribute *nilfs_superblock_attrs[] = { + NILFS_SUPERBLOCK_ATTR_LIST(sb_write_time), + NILFS_SUPERBLOCK_ATTR_LIST(sb_write_time_secs), + NILFS_SUPERBLOCK_ATTR_LIST(sb_write_count), + NILFS_SUPERBLOCK_ATTR_LIST(sb_update_frequency), + NILFS_SUPERBLOCK_ATTR_LIST(README), + NULL, +}; + +NILFS_DEV_INT_GROUP_OPS(superblock, dev); +NILFS_DEV_INT_GROUP_TYPE(superblock, dev); +NILFS_DEV_INT_GROUP_FNS(superblock, dev); + +/************************************************************************ + * NILFS device attrs * + ************************************************************************/ + +static +ssize_t nilfs_dev_revision_show(struct nilfs_dev_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + struct nilfs_super_block **sbp = nilfs->ns_sbp; + u32 major = le32_to_cpu(sbp[0]->s_rev_level); + u16 minor = le16_to_cpu(sbp[0]->s_minor_rev_level); + + return snprintf(buf, PAGE_SIZE, "%d.%d\n", major, minor); +} + +static +ssize_t nilfs_dev_blocksize_show(struct nilfs_dev_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", nilfs->ns_blocksize); +} + +static +ssize_t nilfs_dev_device_size_show(struct nilfs_dev_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + struct nilfs_super_block **sbp = nilfs->ns_sbp; + u64 dev_size = le64_to_cpu(sbp[0]->s_dev_size); + + return snprintf(buf, PAGE_SIZE, "%llu\n", dev_size); +} + +static +ssize_t nilfs_dev_free_blocks_show(struct 
nilfs_dev_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + sector_t free_blocks = 0; + + nilfs_count_free_blocks(nilfs, &free_blocks); + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)free_blocks); +} + +static +ssize_t nilfs_dev_uuid_show(struct nilfs_dev_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + struct nilfs_super_block **sbp = nilfs->ns_sbp; + + return snprintf(buf, PAGE_SIZE, "%pUb\n", sbp[0]->s_uuid); +} + +static +ssize_t nilfs_dev_volume_name_show(struct nilfs_dev_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + struct nilfs_super_block **sbp = nilfs->ns_sbp; + + return scnprintf(buf, sizeof(sbp[0]->s_volume_name), "%s\n", + sbp[0]->s_volume_name); +} + +static const char dev_readme_str[] = + "The <device> group contains attributes that describe file system\n" + "partition's details.\n\n" + "(1) revision\n\tshow NILFS file system revision.\n\n" + "(2) blocksize\n\tshow volume block size in bytes.\n\n" + "(3) device_size\n\tshow volume size in bytes.\n\n" + "(4) free_blocks\n\tshow count of free blocks on volume.\n\n" + "(5) uuid\n\tshow volume's UUID.\n\n" + "(6) volume_name\n\tshow volume's name.\n\n"; + +static ssize_t nilfs_dev_README_show(struct nilfs_dev_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, dev_readme_str); +} + +NILFS_DEV_RO_ATTR(revision); +NILFS_DEV_RO_ATTR(blocksize); +NILFS_DEV_RO_ATTR(device_size); +NILFS_DEV_RO_ATTR(free_blocks); +NILFS_DEV_RO_ATTR(uuid); +NILFS_DEV_RO_ATTR(volume_name); +NILFS_DEV_RO_ATTR(README); + +static struct attribute *nilfs_dev_attrs[] = { + NILFS_DEV_ATTR_LIST(revision), + NILFS_DEV_ATTR_LIST(blocksize), + NILFS_DEV_ATTR_LIST(device_size), + NILFS_DEV_ATTR_LIST(free_blocks), + NILFS_DEV_ATTR_LIST(uuid), + NILFS_DEV_ATTR_LIST(volume_name), + NILFS_DEV_ATTR_LIST(README), + NULL, +}; + +static ssize_t nilfs_dev_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct the_nilfs *nilfs = container_of(kobj, struct the_nilfs, + ns_dev_kobj); + struct nilfs_dev_attr *a = container_of(attr, struct nilfs_dev_attr, + attr); + + return a->show ? a->show(a, nilfs, buf) : 0; +} + +static ssize_t nilfs_dev_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct the_nilfs *nilfs = container_of(kobj, struct the_nilfs, + ns_dev_kobj); + struct nilfs_dev_attr *a = container_of(attr, struct nilfs_dev_attr, + attr); + + return a->store ? 
a->store(a, nilfs, buf, len) : 0; +} + +static void nilfs_dev_attr_release(struct kobject *kobj) +{ + struct the_nilfs *nilfs = container_of(kobj, struct the_nilfs, + ns_dev_kobj); + complete(&nilfs->ns_dev_kobj_unregister); +} + +static const struct sysfs_ops nilfs_dev_attr_ops = { + .show = nilfs_dev_attr_show, + .store = nilfs_dev_attr_store, +}; + +static struct kobj_type nilfs_dev_ktype = { + .default_attrs = nilfs_dev_attrs, + .sysfs_ops = &nilfs_dev_attr_ops, + .release = nilfs_dev_attr_release, +}; + +int nilfs_sysfs_create_device_group(struct super_block *sb) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + size_t devgrp_size = sizeof(struct nilfs_sysfs_dev_subgroups); + int err; + + nilfs->ns_dev_subgroups = kzalloc(devgrp_size, GFP_KERNEL); + if (unlikely(!nilfs->ns_dev_subgroups)) { + err = -ENOMEM; + printk(KERN_ERR "NILFS: unable to allocate memory for device group\n"); + goto failed_create_device_group; + } + + nilfs->ns_dev_kobj.kset = nilfs_kset; + init_completion(&nilfs->ns_dev_kobj_unregister); + err = kobject_init_and_add(&nilfs->ns_dev_kobj, &nilfs_dev_ktype, NULL, + "%s", sb->s_id); + if (err) + goto free_dev_subgroups; + + err = nilfs_sysfs_create_mounted_snapshots_group(nilfs); + if (err) + goto cleanup_dev_kobject; + + err = nilfs_sysfs_create_checkpoints_group(nilfs); + if (err) + goto delete_mounted_snapshots_group; + + err = nilfs_sysfs_create_segments_group(nilfs); + if (err) + goto delete_checkpoints_group; + + err = nilfs_sysfs_create_superblock_group(nilfs); + if (err) + goto delete_segments_group; + + err = nilfs_sysfs_create_segctor_group(nilfs); + if (err) + goto delete_superblock_group; + + return 0; + +delete_superblock_group: + nilfs_sysfs_delete_superblock_group(nilfs); + +delete_segments_group: + nilfs_sysfs_delete_segments_group(nilfs); + +delete_checkpoints_group: + nilfs_sysfs_delete_checkpoints_group(nilfs); + +delete_mounted_snapshots_group: + nilfs_sysfs_delete_mounted_snapshots_group(nilfs); + +cleanup_dev_kobject: + kobject_del(&nilfs->ns_dev_kobj); + +free_dev_subgroups: + kfree(nilfs->ns_dev_subgroups); + +failed_create_device_group: + return err; +} + +void nilfs_sysfs_delete_device_group(struct the_nilfs *nilfs) +{ + nilfs_sysfs_delete_mounted_snapshots_group(nilfs); + nilfs_sysfs_delete_checkpoints_group(nilfs); + nilfs_sysfs_delete_segments_group(nilfs); + nilfs_sysfs_delete_superblock_group(nilfs); + nilfs_sysfs_delete_segctor_group(nilfs); + kobject_del(&nilfs->ns_dev_kobj); + kfree(nilfs->ns_dev_subgroups); +} + +/************************************************************************ + * NILFS feature attrs * + ************************************************************************/ + +static ssize_t nilfs_feature_revision_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d.%d\n", + NILFS_CURRENT_REV, NILFS_MINOR_REV); +} + +static const char features_readme_str[] = + "The features group contains attributes that describe NILFS file\n" + "system driver features.\n\n" + "(1) revision\n\tshow current revision of NILFS file system driver.\n"; + +static ssize_t nilfs_feature_README_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, features_readme_str); +} + +NILFS_FEATURE_RO_ATTR(revision); +NILFS_FEATURE_RO_ATTR(README); + +static struct attribute *nilfs_feature_attrs[] = { + NILFS_FEATURE_ATTR_LIST(revision), + NILFS_FEATURE_ATTR_LIST(README), + NULL, +}; + +static const struct attribute_group nilfs_feature_attr_group = { + .name 
= "features", + .attrs = nilfs_feature_attrs, +}; + +int __init nilfs_sysfs_init(void) +{ + int err; + + nilfs_kset = kset_create_and_add(NILFS_ROOT_GROUP_NAME, NULL, fs_kobj); + if (!nilfs_kset) { + err = -ENOMEM; + printk(KERN_ERR "NILFS: unable to create sysfs entry: err %d\n", + err); + goto failed_sysfs_init; + } + + err = sysfs_create_group(&nilfs_kset->kobj, &nilfs_feature_attr_group); + if (unlikely(err)) { + printk(KERN_ERR "NILFS: unable to create feature group: err %d\n", + err); + goto cleanup_sysfs_init; + } + + return 0; + +cleanup_sysfs_init: + kset_unregister(nilfs_kset); + +failed_sysfs_init: + return err; +} + +void nilfs_sysfs_exit(void) +{ + sysfs_remove_group(&nilfs_kset->kobj, &nilfs_feature_attr_group); + kset_unregister(nilfs_kset); +} diff --git a/fs/nilfs2/sysfs.h b/fs/nilfs2/sysfs.h new file mode 100644 index 000000000000..677e3a1a8370 --- /dev/null +++ b/fs/nilfs2/sysfs.h @@ -0,0 +1,176 @@ +/* + * sysfs.h - sysfs support declarations. + * + * Copyright (C) 2005-2014 Nippon Telegraph and Telephone Corporation. + * Copyright (C) 2014 HGST, Inc., a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Written by Vyacheslav Dubeyko <Vyacheslav.Dubeyko@hgst.com> + */ + +#ifndef _NILFS_SYSFS_H +#define _NILFS_SYSFS_H + +#include <linux/sysfs.h> + +#define NILFS_ROOT_GROUP_NAME "nilfs2" + +/* + * struct nilfs_sysfs_dev_subgroups - device subgroup kernel objects + * @sg_superblock_kobj: /sys/fs/<nilfs>/<device>/superblock + * @sg_superblock_kobj_unregister: completion state + * @sg_segctor_kobj: /sys/fs/<nilfs>/<device>/segctor + * @sg_segctor_kobj_unregister: completion state + * @sg_mounted_snapshots_kobj: /sys/fs/<nilfs>/<device>/mounted_snapshots + * @sg_mounted_snapshots_kobj_unregister: completion state + * @sg_checkpoints_kobj: /sys/fs/<nilfs>/<device>/checkpoints + * @sg_checkpoints_kobj_unregister: completion state + * @sg_segments_kobj: /sys/fs/<nilfs>/<device>/segments + * @sg_segments_kobj_unregister: completion state + */ +struct nilfs_sysfs_dev_subgroups { + /* /sys/fs/<nilfs>/<device>/superblock */ + struct kobject sg_superblock_kobj; + struct completion sg_superblock_kobj_unregister; + + /* /sys/fs/<nilfs>/<device>/segctor */ + struct kobject sg_segctor_kobj; + struct completion sg_segctor_kobj_unregister; + + /* /sys/fs/<nilfs>/<device>/mounted_snapshots */ + struct kobject sg_mounted_snapshots_kobj; + struct completion sg_mounted_snapshots_kobj_unregister; + + /* /sys/fs/<nilfs>/<device>/checkpoints */ + struct kobject sg_checkpoints_kobj; + struct completion sg_checkpoints_kobj_unregister; + + /* /sys/fs/<nilfs>/<device>/segments */ + struct kobject sg_segments_kobj; + struct completion sg_segments_kobj_unregister; +}; + +#define NILFS_COMMON_ATTR_STRUCT(name) \ +struct nilfs_##name##_attr { \ + struct attribute attr; \ + ssize_t (*show)(struct kobject *, struct attribute *, \ + char *); \ + ssize_t (*store)(struct kobject *, struct attribute *, \ + const char *, size_t); \ +}; + +NILFS_COMMON_ATTR_STRUCT(feature); + +#define NILFS_DEV_ATTR_STRUCT(name) \ +struct 
nilfs_##name##_attr { \ + struct attribute attr; \ + ssize_t (*show)(struct nilfs_##name##_attr *, struct the_nilfs *, \ + char *); \ + ssize_t (*store)(struct nilfs_##name##_attr *, struct the_nilfs *, \ + const char *, size_t); \ +}; + +NILFS_DEV_ATTR_STRUCT(dev); +NILFS_DEV_ATTR_STRUCT(segments); +NILFS_DEV_ATTR_STRUCT(mounted_snapshots); +NILFS_DEV_ATTR_STRUCT(checkpoints); +NILFS_DEV_ATTR_STRUCT(superblock); +NILFS_DEV_ATTR_STRUCT(segctor); + +#define NILFS_CP_ATTR_STRUCT(name) \ +struct nilfs_##name##_attr { \ + struct attribute attr; \ + ssize_t (*show)(struct nilfs_##name##_attr *, struct nilfs_root *, \ + char *); \ + ssize_t (*store)(struct nilfs_##name##_attr *, struct nilfs_root *, \ + const char *, size_t); \ +}; + +NILFS_CP_ATTR_STRUCT(snapshot); + +#define NILFS_ATTR(type, name, mode, show, store) \ + static struct nilfs_##type##_attr nilfs_##type##_attr_##name = \ + __ATTR(name, mode, show, store) + +#define NILFS_INFO_ATTR(type, name) \ + NILFS_ATTR(type, name, 0444, NULL, NULL) +#define NILFS_RO_ATTR(type, name) \ + NILFS_ATTR(type, name, 0444, nilfs_##type##_##name##_show, NULL) +#define NILFS_RW_ATTR(type, name) \ + NILFS_ATTR(type, name, 0644, \ + nilfs_##type##_##name##_show, \ + nilfs_##type##_##name##_store) + +#define NILFS_FEATURE_INFO_ATTR(name) \ + NILFS_INFO_ATTR(feature, name) +#define NILFS_FEATURE_RO_ATTR(name) \ + NILFS_RO_ATTR(feature, name) +#define NILFS_FEATURE_RW_ATTR(name) \ + NILFS_RW_ATTR(feature, name) + +#define NILFS_DEV_INFO_ATTR(name) \ + NILFS_INFO_ATTR(dev, name) +#define NILFS_DEV_RO_ATTR(name) \ + NILFS_RO_ATTR(dev, name) +#define NILFS_DEV_RW_ATTR(name) \ + NILFS_RW_ATTR(dev, name) + +#define NILFS_SEGMENTS_RO_ATTR(name) \ + NILFS_RO_ATTR(segments, name) +#define NILFS_SEGMENTS_RW_ATTR(name) \ + NILFS_RW_ATTR(segs_info, name) + +#define NILFS_MOUNTED_SNAPSHOTS_RO_ATTR(name) \ + NILFS_RO_ATTR(mounted_snapshots, name) + +#define NILFS_CHECKPOINTS_RO_ATTR(name) \ + NILFS_RO_ATTR(checkpoints, name) +#define NILFS_CHECKPOINTS_RW_ATTR(name) \ + NILFS_RW_ATTR(checkpoints, name) + +#define NILFS_SNAPSHOT_INFO_ATTR(name) \ + NILFS_INFO_ATTR(snapshot, name) +#define NILFS_SNAPSHOT_RO_ATTR(name) \ + NILFS_RO_ATTR(snapshot, name) +#define NILFS_SNAPSHOT_RW_ATTR(name) \ + NILFS_RW_ATTR(snapshot, name) + +#define NILFS_SUPERBLOCK_RO_ATTR(name) \ + NILFS_RO_ATTR(superblock, name) +#define NILFS_SUPERBLOCK_RW_ATTR(name) \ + NILFS_RW_ATTR(superblock, name) + +#define NILFS_SEGCTOR_INFO_ATTR(name) \ + NILFS_INFO_ATTR(segctor, name) +#define NILFS_SEGCTOR_RO_ATTR(name) \ + NILFS_RO_ATTR(segctor, name) +#define NILFS_SEGCTOR_RW_ATTR(name) \ + NILFS_RW_ATTR(segctor, name) + +#define NILFS_FEATURE_ATTR_LIST(name) \ + (&nilfs_feature_attr_##name.attr) +#define NILFS_DEV_ATTR_LIST(name) \ + (&nilfs_dev_attr_##name.attr) +#define NILFS_SEGMENTS_ATTR_LIST(name) \ + (&nilfs_segments_attr_##name.attr) +#define NILFS_MOUNTED_SNAPSHOTS_ATTR_LIST(name) \ + (&nilfs_mounted_snapshots_attr_##name.attr) +#define NILFS_CHECKPOINTS_ATTR_LIST(name) \ + (&nilfs_checkpoints_attr_##name.attr) +#define NILFS_SNAPSHOT_ATTR_LIST(name) \ + (&nilfs_snapshot_attr_##name.attr) +#define NILFS_SUPERBLOCK_ATTR_LIST(name) \ + (&nilfs_superblock_attr_##name.attr) +#define NILFS_SEGCTOR_ATTR_LIST(name) \ + (&nilfs_segctor_attr_##name.attr) + +#endif /* _NILFS_SYSFS_H */ diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 8ba8229ba076..9da25fe9ea61 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -85,6 +85,7 @@ struct the_nilfs *alloc_nilfs(struct block_device 
*bdev) nilfs->ns_cptree = RB_ROOT; spin_lock_init(&nilfs->ns_cptree_lock); init_rwsem(&nilfs->ns_segctor_sem); + nilfs->ns_sb_update_freq = NILFS_SB_FREQ; return nilfs; } @@ -97,6 +98,7 @@ void destroy_nilfs(struct the_nilfs *nilfs) { might_sleep(); if (nilfs_init(nilfs)) { + nilfs_sysfs_delete_device_group(nilfs); brelse(nilfs->ns_sbh[0]); brelse(nilfs->ns_sbh[1]); } @@ -640,6 +642,10 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data) if (err) goto failed_sbh; + err = nilfs_sysfs_create_device_group(sb); + if (err) + goto failed_sbh; + set_nilfs_init(nilfs); err = 0; out: @@ -740,12 +746,13 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno) { struct rb_node **p, *parent; struct nilfs_root *root, *new; + int err; root = nilfs_lookup_root(nilfs, cno); if (root) return root; - new = kmalloc(sizeof(*root), GFP_KERNEL); + new = kzalloc(sizeof(*root), GFP_KERNEL); if (!new) return NULL; @@ -782,6 +789,12 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno) spin_unlock(&nilfs->ns_cptree_lock); + err = nilfs_sysfs_create_snapshot_group(new); + if (err) { + kfree(new); + new = NULL; + } + return new; } @@ -790,6 +803,8 @@ void nilfs_put_root(struct nilfs_root *root) if (atomic_dec_and_test(&root->count)) { struct the_nilfs *nilfs = root->nilfs; + nilfs_sysfs_delete_snapshot_group(root); + spin_lock(&nilfs->ns_cptree_lock); rb_erase(&root->rb_node, &nilfs->ns_cptree); spin_unlock(&nilfs->ns_cptree_lock); diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index de8cc53b4a5c..d01ead1bea9a 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -33,6 +33,7 @@ #include <linux/slab.h> struct nilfs_sc_info; +struct nilfs_sysfs_dev_subgroups; /* the_nilfs struct */ enum { @@ -54,6 +55,7 @@ enum { * @ns_sbwcount: write count of super block * @ns_sbsize: size of valid data in super block * @ns_mount_state: file system state + * @ns_sb_update_freq: interval of periodical update of superblocks (in seconds) * @ns_seg_seq: segment sequence counter * @ns_segnum: index number of the latest full segment. * @ns_nextnum: index number of the full segment index to be used next @@ -95,6 +97,9 @@ enum { * @ns_inode_size: size of on-disk inode * @ns_first_ino: first not-special inode number * @ns_crc_seed: seed value of CRC32 calculation + * @ns_dev_kobj: /sys/fs/<nilfs>/<device> + * @ns_dev_kobj_unregister: completion state + * @ns_dev_subgroups: <device> subgroups pointer */ struct the_nilfs { unsigned long ns_flags; @@ -114,6 +119,7 @@ struct the_nilfs { unsigned ns_sbwcount; unsigned ns_sbsize; unsigned ns_mount_state; + unsigned ns_sb_update_freq; /* * Following fields are dedicated to a writable FS-instance. 
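The sb_update_frequency store handler earlier in this patch parses the written value with kstrtouint() and silently clamps anything below NILFS_SB_FREQ instead of rejecting it, and alloc_nilfs() above seeds ns_sb_update_freq with that same default. A minimal user-space sketch of the parse-and-clamp semantics (the helper name and test value are ours; the 10-second floor matches the NILFS_SB_FREQ default):

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

#define NILFS_SB_FREQ 10 /* minimum superblock update interval, in seconds */

/* Roughly what the sysfs store handler does: parse, then clamp to the floor. */
static int parse_sb_update_freq(const char *buf, unsigned int *out)
{
	char *end;
	unsigned long val;

	errno = 0;
	val = strtoul(buf, &end, 0);
	if (errno || end == buf || val > UINT_MAX)
		return -EINVAL; /* kstrtouint() rejects malformed input similarly */

	if (val < NILFS_SB_FREQ)
		val = NILFS_SB_FREQ; /* clamp silently instead of failing */

	*out = (unsigned int)val;
	return 0;
}

int main(void)
{
	unsigned int freq;

	if (parse_sb_update_freq("3", &freq) == 0)
		printf("requested 3, effective %u\n", freq); /* prints "effective 10" */
	return 0;
}

On a mounted volume this corresponds to 'echo 30 > /sys/fs/nilfs2/<dev>/superblock/sb_update_frequency', per the README attribute text above.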
@@ -188,6 +194,11 @@ struct the_nilfs { int ns_inode_size; int ns_first_ino; u32 ns_crc_seed; + + /* /sys/fs/<nilfs>/<device> */ + struct kobject ns_dev_kobj; + struct completion ns_dev_kobj_unregister; + struct nilfs_sysfs_dev_subgroups *ns_dev_subgroups; }; #define THE_NILFS_FNS(bit, name) \ @@ -232,6 +243,8 @@ THE_NILFS_FNS(SB_DIRTY, sb_dirty) * @ifile: inode file * @inodes_count: number of inodes * @blocks_count: number of blocks + * @snapshot_kobj: /sys/fs/<nilfs>/<device>/mounted_snapshots/<snapshot> + * @snapshot_kobj_unregister: completion state for kernel object */ struct nilfs_root { __u64 cno; @@ -243,6 +256,10 @@ struct nilfs_root { atomic64_t inodes_count; atomic64_t blocks_count; + + /* /sys/fs/<nilfs>/<device>/mounted_snapshots/<snapshot> */ + struct kobject snapshot_kobj; + struct completion snapshot_kobj_unregister; }; /* Special checkpoint number */ @@ -254,7 +271,8 @@ struct nilfs_root { static inline int nilfs_sb_need_update(struct the_nilfs *nilfs) { u64 t = get_seconds(); - return t < nilfs->ns_sbwtime || t > nilfs->ns_sbwtime + NILFS_SB_FREQ; + return t < nilfs->ns_sbwtime || + t > nilfs->ns_sbwtime + nilfs->ns_sb_update_freq; } static inline int nilfs_sb_will_flip(struct the_nilfs *nilfs) diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index abc8cbcfe90e..caaaf9dfe353 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -346,13 +346,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) goto out; } - error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); - if (error) { - /* if we added, we must shoot */ - if (dn_mark == new_dn_mark) - destroy = 1; - goto out; - } + __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); error = attach_dn(dn, dn_mark, id, fd, filp, mask); /* !error means that we attached the dn to the dn_mark, so don't free it */ diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index ee9cb3795c2b..30d3addfad75 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -70,8 +70,15 @@ static int fanotify_get_response(struct fsnotify_group *group, wait_event(group->fanotify_data.access_waitq, event->response || atomic_read(&group->fanotify_data.bypass_perm)); - if (!event->response) /* bypass_perm set */ + if (!event->response) { /* bypass_perm set */ + /* + * Event was canceled because group is being destroyed. Remove + * it from group's event list because we are responsible for + * freeing the permission event. 
+ */ + fsnotify_remove_event(group, &event->fae.fse); return 0; + } /* userspace responded, convert to something usable */ switch (event->response) { @@ -210,7 +217,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, return -ENOMEM; fsn_event = &event->fse; - ret = fsnotify_add_notify_event(group, fsn_event, fanotify_merge); + ret = fsnotify_add_event(group, fsn_event, fanotify_merge); if (ret) { /* Permission events shouldn't be merged */ BUG_ON(ret == 1 && mask & FAN_ALL_PERM_EVENTS); diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 3fdc8a3e1134..c991616acca9 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -66,7 +66,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, /* held the notification_mutex the whole time, so this is the * same event we peeked above */ - return fsnotify_remove_notify_event(group); + return fsnotify_remove_first_event(group); } static int create_fd(struct fsnotify_group *group, @@ -78,7 +78,7 @@ static int create_fd(struct fsnotify_group *group, pr_debug("%s: group=%p event=%p\n", __func__, group, event); - client_fd = get_unused_fd(); + client_fd = get_unused_fd_flags(group->fanotify_data.f_flags); if (client_fd < 0) return client_fd; @@ -359,6 +359,11 @@ static int fanotify_release(struct inode *ignored, struct file *file) #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS struct fanotify_perm_event_info *event, *next; + /* + * There may be still new events arriving in the notification queue + * but since userspace cannot use fanotify fd anymore, no event can + * enter or leave access_list by now. + */ spin_lock(&group->fanotify_data.access_lock); atomic_inc(&group->fanotify_data.bypass_perm); @@ -373,6 +378,13 @@ static int fanotify_release(struct inode *ignored, struct file *file) } spin_unlock(&group->fanotify_data.access_lock); + /* + * Since bypass_perm is set, newly queued events will not wait for + * access response. Wake up the already sleeping ones now. + * synchronize_srcu() in fsnotify_destroy_group() will wait for all + * processes sleeping in fanotify_handle_event() waiting for access + * response and thus also for all permission events to be freed. 
+ */ wake_up(&group->fanotify_data.access_waitq); #endif diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index 238a5930cb3c..9d7e2b9659cb 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -42,7 +42,7 @@ static int show_mark_fhandle(struct seq_file *m, struct inode *inode) { struct { struct file_handle handle; - u8 pad[64]; + u8 pad[MAX_HANDLE_SZ]; } f; int size, ret, i; @@ -50,7 +50,7 @@ static int show_mark_fhandle(struct seq_file *m, struct inode *inode) size = f.handle.handle_bytes >> 2; ret = exportfs_encode_inode_fh(inode, (struct fid *)f.handle.f_handle, &size, 0); - if ((ret == 255) || (ret == -ENOSPC)) { + if ((ret == FILEID_INVALID) || (ret < 0)) { WARN_ONCE(1, "Can't encode file handler for inotify: %d\n", ret); return 0; } diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index 85e7d2b431d9..9c0898c4cfe1 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -23,9 +23,6 @@ extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, struct fsnotify_group *group, struct vfsmount *mnt, int allow_dups); -/* final kfree of a group */ -extern void fsnotify_final_destroy_group(struct fsnotify_group *group); - /* vfsmount specific destruction of a mark */ extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark); /* inode specific destruction of a mark */ diff --git a/fs/notify/group.c b/fs/notify/group.c index ad1995980456..d16b62cb2854 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -31,7 +31,7 @@ /* * Final freeing of a group */ -void fsnotify_final_destroy_group(struct fsnotify_group *group) +static void fsnotify_final_destroy_group(struct fsnotify_group *group) { if (group->ops->free_group_priv) group->ops->free_group_priv(group); diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index 74825be65b7b..9ce062218de9 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -232,7 +232,7 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark, BUG_ON(last == NULL); /* mark should be the last entry. last is the current last entry */ - hlist_add_after_rcu(&last->i.i_list, &mark->i.i_list); + hlist_add_behind_rcu(&mark->i.i_list, &last->i.i_list); out: fsnotify_recalc_inode_mask_locked(inode); spin_unlock(&inode->i_lock); diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 43ab1e1a07a2..7d888d77d59a 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -108,7 +108,7 @@ int inotify_handle_event(struct fsnotify_group *group, if (len) strcpy(event->name, file_name); - ret = fsnotify_add_notify_event(group, fsn_event, inotify_merge); + ret = fsnotify_add_event(group, fsn_event, inotify_merge); if (ret) { /* Our event wasn't used in the end. Free it. 
*/ fsnotify_destroy_event(group, fsn_event); @@ -165,8 +165,10 @@ static void inotify_free_group_priv(struct fsnotify_group *group) /* ideally the idr is empty and we won't hit the BUG in the callback */ idr_for_each(&group->inotify_data.idr, idr_callback, group); idr_destroy(&group->inotify_data.idr); - atomic_dec(&group->inotify_data.user->inotify_devs); - free_uid(group->inotify_data.user); + if (group->inotify_data.user) { + atomic_dec(&group->inotify_data.user->inotify_devs); + free_uid(group->inotify_data.user); + } } static void inotify_free_event(struct fsnotify_event *fsn_event) diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index cc423a30a0c8..daf76652fe58 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -149,7 +149,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, if (fsnotify_notify_queue_is_empty(group)) return NULL; - event = fsnotify_peek_notify_event(group); + event = fsnotify_peek_first_event(group); pr_debug("%s: group=%p event=%p\n", __func__, group, event); @@ -159,7 +159,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, /* held the notification_mutex the whole time, so this is the * same event we peeked above */ - fsnotify_remove_notify_event(group); + fsnotify_remove_first_event(group); return event; } diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 1e58402171a5..a95d8e037aeb 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -73,7 +73,8 @@ void fsnotify_destroy_event(struct fsnotify_group *group, /* Overflow events are per-group and we don't want to free them */ if (!event || event->mask == FS_Q_OVERFLOW) return; - + /* If the event is still queued, we have a problem... */ + WARN_ON(!list_empty(&event->list)); group->ops->free_event(event); } @@ -83,10 +84,10 @@ void fsnotify_destroy_event(struct fsnotify_group *group, * added to the queue, 1 if the event was merged with some other queued event, * 2 if the queue of events has overflown. */ -int fsnotify_add_notify_event(struct fsnotify_group *group, - struct fsnotify_event *event, - int (*merge)(struct list_head *, - struct fsnotify_event *)) +int fsnotify_add_event(struct fsnotify_group *group, + struct fsnotify_event *event, + int (*merge)(struct list_head *, + struct fsnotify_event *)) { int ret = 0; struct list_head *list = &group->notification_list; @@ -125,10 +126,25 @@ queue: } /* + * Remove @event from group's notification queue. It is the responsibility of + * the caller to destroy the event. + */ +void fsnotify_remove_event(struct fsnotify_group *group, + struct fsnotify_event *event) +{ + mutex_lock(&group->notification_mutex); + if (!list_empty(&event->list)) { + list_del_init(&event->list); + group->q_len--; + } + mutex_unlock(&group->notification_mutex); +} + +/* * Remove and return the first event from the notification list. 
It is the * responsibility of the caller to destroy the obtained event */ -struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group) +struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group) { struct fsnotify_event *event; @@ -140,7 +156,7 @@ struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group struct fsnotify_event, list); /* * We need to init list head for the case of overflow event so that - * check in fsnotify_add_notify_events() works + * check in fsnotify_add_event() works */ list_del_init(&event->list); group->q_len--; @@ -149,9 +165,10 @@ struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group } /* - * This will not remove the event, that must be done with fsnotify_remove_notify_event() + * This will not remove the event, that must be done with + * fsnotify_remove_first_event() */ -struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group) +struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group) { BUG_ON(!mutex_is_locked(&group->notification_mutex)); @@ -169,7 +186,7 @@ void fsnotify_flush_notify(struct fsnotify_group *group) mutex_lock(&group->notification_mutex); while (!fsnotify_notify_queue_is_empty(group)) { - event = fsnotify_remove_notify_event(group); + event = fsnotify_remove_first_event(group); fsnotify_destroy_event(group, event); } mutex_unlock(&group->notification_mutex); diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c index 68ca5a8704b5..ac851e8376b1 100644 --- a/fs/notify/vfsmount_mark.c +++ b/fs/notify/vfsmount_mark.c @@ -191,7 +191,7 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, BUG_ON(last == NULL); /* mark should be the last entry. last is the current last entry */ - hlist_add_after_rcu(&last->m.m_list, &mark->m.m_list); + hlist_add_behind_rcu(&mark->m.m_list, &last->m.m_list); out: fsnotify_recalc_vfsmount_mask_locked(mnt); spin_unlock(&mnt->mnt_root->d_lock); diff --git a/fs/ntfs/debug.c b/fs/ntfs/debug.c index dd6103cc93c1..825a54e8f490 100644 --- a/fs/ntfs/debug.c +++ b/fs/ntfs/debug.c @@ -112,7 +112,7 @@ void __ntfs_error(const char *function, const struct super_block *sb, /* If 1, output debug messages, and if 0, don't. */ int debug_msgs = 0; -void __ntfs_debug (const char *file, int line, const char *function, +void __ntfs_debug(const char *file, int line, const char *function, const char *fmt, ...) { struct va_format vaf; diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 5c9e2c81cb11..643faa44f22b 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -1,7 +1,7 @@ /* * file.c - NTFS kernel file operations. Part of the Linux-NTFS project. * - * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc. + * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc. * * This program/include file is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as published @@ -74,8 +74,6 @@ static int ntfs_file_open(struct inode *vi, struct file *filp) * ntfs_attr_extend_initialized - extend the initialized size of an attribute * @ni: ntfs inode of the attribute to extend * @new_init_size: requested new initialized size in bytes - * @cached_page: store any allocated but unused page here - * @lru_pvec: lru-buffering pagevec of the caller * * Extend the initialized size of an attribute described by the ntfs inode @ni * to @new_init_size bytes. 
This involves zeroing any non-sparse space between @@ -395,7 +393,6 @@ static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov, * @nr_pages: number of page cache pages to obtain * @pages: array of pages in which to return the obtained page cache pages * @cached_page: allocated but as yet unused page - * @lru_pvec: lru-buffering pagevec of caller * * Obtain @nr_pages locked page cache pages from the mapping @mapping and * starting at index @index. @@ -413,7 +410,8 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping, BUG_ON(!nr_pages); err = nr = 0; do { - pages[nr] = find_lock_page(mapping, index); + pages[nr] = find_get_page_flags(mapping, index, FGP_LOCK | + FGP_ACCESSED); if (!pages[nr]) { if (!*cached_page) { *cached_page = page_cache_alloc(mapping); diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 6c3296e546c3..9e1e112074fb 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -3208,7 +3208,7 @@ static void __exit exit_ntfs_fs(void) } MODULE_AUTHOR("Anton Altaparmakov <anton@tuxera.com>"); -MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc."); +MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc."); MODULE_VERSION(NTFS_VERSION); MODULE_LICENSE("GPL"); #ifdef DEBUG diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 9d8fcf2f3b94..a93bf9892256 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -4961,6 +4961,15 @@ leftright: el = path_leaf_el(path); split_index = ocfs2_search_extent_list(el, cpos); + if (split_index == -1) { + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has an extent at cpos %u " + "which can no longer be found.\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + cpos); + ret = -EROFS; + goto out; + } goto leftright; } out: @@ -5135,7 +5144,7 @@ int ocfs2_change_extent_flag(handle_t *handle, el = path_leaf_el(left_path); index = ocfs2_search_extent_list(el, cpos); - if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { + if (index == -1) { ocfs2_error(sb, "Owner %llu has an extent at cpos %u which can no " "longer be found.\n", @@ -5491,7 +5500,7 @@ int ocfs2_remove_extent(handle_t *handle, el = path_leaf_el(path); index = ocfs2_search_extent_list(el, cpos); - if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { + if (index == -1) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), "Owner %llu has an extent at cpos %u which can no " "longer be found.\n", @@ -5557,7 +5566,7 @@ int ocfs2_remove_extent(handle_t *handle, el = path_leaf_el(path); index = ocfs2_search_extent_list(el, cpos); - if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { + if (index == -1) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), "Owner %llu: split at cpos %u lost record.", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 4a231a166cf8..1ef547e49373 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1481,8 +1481,16 @@ static int ocfs2_write_begin_inline(struct address_space *mapping, handle_t *handle; struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + page = find_or_create_page(mapping, 0, GFP_NOFS); if (!page) { + ocfs2_commit_trans(osb, handle); ret = -ENOMEM; mlog_errno(ret); goto out; @@ -1494,13 +1502,6 @@ static int 
ocfs2_write_begin_inline(struct address_space *mapping, wc->w_pages[0] = wc->w_target_page = page; wc->w_num_pages = 1; - handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - mlog_errno(ret); - goto out; - } - ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 73039295d0d1..d13385448168 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -2572,6 +2572,25 @@ int o2hb_check_node_heartbeating(u8 node_num) } EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating); +int o2hb_check_node_heartbeating_no_sem(u8 node_num) +{ + unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long flags; + + spin_lock_irqsave(&o2hb_live_lock, flags); + o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map)); + spin_unlock_irqrestore(&o2hb_live_lock, flags); + if (!test_bit(node_num, testing_map)) { + mlog(ML_HEARTBEAT, + "node (%u) does not have heartbeating enabled.\n", + node_num); + return 0; + } + + return 1; +} +EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_no_sem); + int o2hb_check_node_heartbeating_from_callback(u8 node_num) { unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h index 00ad8e8fea51..3ef5137dc362 100644 --- a/fs/ocfs2/cluster/heartbeat.h +++ b/fs/ocfs2/cluster/heartbeat.h @@ -80,6 +80,7 @@ void o2hb_fill_node_map(unsigned long *map, void o2hb_exit(void); int o2hb_init(void); int o2hb_check_node_heartbeating(u8 node_num); +int o2hb_check_node_heartbeating_no_sem(u8 node_num); int o2hb_check_node_heartbeating_from_callback(u8 node_num); int o2hb_check_local_node_heartbeating(void); void o2hb_stop_all_regions(void); diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c index 73ba81928bce..27d1242c8383 100644 --- a/fs/ocfs2/cluster/netdebug.c +++ b/fs/ocfs2/cluster/netdebug.c @@ -185,29 +185,13 @@ static const struct seq_operations nst_seq_ops = { static int nst_fop_open(struct inode *inode, struct file *file) { struct o2net_send_tracking *dummy_nst; - struct seq_file *seq; - int ret; - dummy_nst = kmalloc(sizeof(struct o2net_send_tracking), GFP_KERNEL); - if (dummy_nst == NULL) { - ret = -ENOMEM; - goto out; - } - dummy_nst->st_task = NULL; - - ret = seq_open(file, &nst_seq_ops); - if (ret) - goto out; - - seq = file->private_data; - seq->private = dummy_nst; + dummy_nst = __seq_open_private(file, &nst_seq_ops, sizeof(*dummy_nst)); + if (!dummy_nst) + return -ENOMEM; o2net_debug_add_nst(dummy_nst); - dummy_nst = NULL; - -out: - kfree(dummy_nst); - return ret; + return 0; } static int nst_fop_release(struct inode *inode, struct file *file) @@ -412,33 +396,27 @@ static const struct seq_operations sc_seq_ops = { .show = sc_seq_show, }; -static int sc_common_open(struct file *file, struct o2net_sock_debug *sd) +static int sc_common_open(struct file *file, int ctxt) { + struct o2net_sock_debug *sd; struct o2net_sock_container *dummy_sc; - struct seq_file *seq; - int ret; - dummy_sc = kmalloc(sizeof(struct o2net_sock_container), GFP_KERNEL); - if (dummy_sc == NULL) { - ret = -ENOMEM; - goto out; - } - dummy_sc->sc_page = NULL; + dummy_sc = kzalloc(sizeof(*dummy_sc), GFP_KERNEL); + if (!dummy_sc) + return -ENOMEM; - ret = seq_open(file, &sc_seq_ops); - if (ret) - goto out; + sd = __seq_open_private(file, &sc_seq_ops, sizeof(*sd)); + if (!sd) { + kfree(dummy_sc); + 
return -ENOMEM; + } - seq = file->private_data; - seq->private = sd; + sd->dbg_ctxt = ctxt; sd->dbg_sock = dummy_sc; - o2net_debug_add_sc(dummy_sc); - dummy_sc = NULL; + o2net_debug_add_sc(dummy_sc); -out: - kfree(dummy_sc); - return ret; + return 0; } static int sc_fop_release(struct inode *inode, struct file *file) @@ -453,16 +431,7 @@ static int sc_fop_release(struct inode *inode, struct file *file) static int stats_fop_open(struct inode *inode, struct file *file) { - struct o2net_sock_debug *sd; - - sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL); - if (sd == NULL) - return -ENOMEM; - - sd->dbg_ctxt = SHOW_SOCK_STATS; - sd->dbg_sock = NULL; - - return sc_common_open(file, sd); + return sc_common_open(file, SHOW_SOCK_STATS); } static const struct file_operations stats_seq_fops = { @@ -474,16 +443,7 @@ static int sc_fop_open(struct inode *inode, struct file *file) { - struct o2net_sock_debug *sd; - - sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL); - if (sd == NULL) - return -ENOMEM; - - sd->dbg_ctxt = SHOW_SOCK_CONTAINERS; - sd->dbg_sock = NULL; - - return sc_common_open(file, sd); + return sc_common_open(file, SHOW_SOCK_CONTAINERS); } static const struct file_operations sc_seq_fops = { diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c index 1ec141e758d7..62e8ec619b4c 100644 --- a/fs/ocfs2/cluster/quorum.c +++ b/fs/ocfs2/cluster/quorum.c @@ -160,9 +160,18 @@ static void o2quo_make_decision(struct work_struct *work) } out: - spin_unlock(&qs->qs_lock); - if (fence) + if (fence) { + spin_unlock(&qs->qs_lock); o2quo_fence_self(); + } else { + mlog(ML_NOTICE, "not fencing this node, heartbeating: %d, " + "connected: %d, lowest: %d (%sreachable)\n", + qs->qs_heartbeating, qs->qs_connected, lowest_hb, + lowest_reachable ? "" : "un"); + spin_unlock(&qs->qs_lock); + + } + } static void o2quo_set_hold(struct o2quo_state *qs, u8 node) diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 681691bc233a..97de0fbd9f78 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -536,7 +536,7 @@ static void o2net_set_nn_state(struct o2net_node *nn, if (nn->nn_persistent_error || nn->nn_sc_valid) wake_up(&nn->nn_sc_wq); - if (!was_err && nn->nn_persistent_error) { + if (was_valid && !was_err && nn->nn_persistent_error) { o2quo_conn_err(o2net_num_from_nn(nn)); queue_delayed_work(o2net_wq, &nn->nn_still_up, msecs_to_jiffies(O2NET_QUORUM_DELAY_MS)); @@ -1480,6 +1480,14 @@ static int o2net_set_nodelay(struct socket *sock) return ret; } +static int o2net_set_usertimeout(struct socket *sock) +{ + int user_timeout = O2NET_TCP_USER_TIMEOUT; + + return kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT, + (char *)&user_timeout, sizeof(user_timeout)); +} + static void o2net_initialize_handshake(void) { o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32( @@ -1536,16 +1544,20 @@ static void o2net_idle_timer(unsigned long data) #endif printk(KERN_NOTICE "o2net: Connection to " SC_NODEF_FMT " has been " - "idle for %lu.%lu secs, shutting it down.\n", SC_NODEF_ARGS(sc), - msecs / 1000, msecs % 1000); + "idle for %lu.%lu secs.\n", + SC_NODEF_ARGS(sc), msecs / 1000, msecs % 1000); - /* - * Initialize the nn_timeout so that the next connection attempt - * will continue in o2net_start_connect. + /* idle timeout happened; don't shut down the connection, but + * make the fence decision. Maybe the connection can recover before + * the decision is made. */
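The o2net_set_usertimeout() helper added above is the in-kernel form of an ordinary socket option: TCP_USER_TIMEOUT caps how long (in milliseconds) transmitted data may stay unacknowledged before the kernel force-closes the connection. A self-contained user-space sketch of the same knob (the 30-second value is an arbitrary example; o2net itself passes O2NET_TCP_USER_TIMEOUT):

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	/* milliseconds unacked data may linger before the peer is declared dead */
	unsigned int timeout_ms = 30000;

	if (fd < 0)
		return 1;
	if (setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT,
		       &timeout_ms, sizeof(timeout_ms)) < 0)
		perror("setsockopt(TCP_USER_TIMEOUT)");
	close(fd);
	return 0;
}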
atomic_set(&nn->nn_timeout, 1); + o2quo_conn_err(o2net_num_from_nn(nn)); + queue_delayed_work(o2net_wq, &nn->nn_still_up, + msecs_to_jiffies(O2NET_QUORUM_DELAY_MS)); + + o2net_sc_reset_idle_timer(sc); - o2net_sc_queue_work(sc, &sc->sc_shutdown_work); } static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc) @@ -1560,6 +1572,15 @@ static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc) static void o2net_sc_postpone_idle(struct o2net_sock_container *sc) { + struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); + + /* clear the fence decision since the connection recovered from timeout */ + if (atomic_read(&nn->nn_timeout)) { + o2quo_conn_up(o2net_num_from_nn(nn)); + cancel_delayed_work(&nn->nn_still_up); + atomic_set(&nn->nn_timeout, 0); + } + /* Only push out an existing timer */ if (timer_pending(&sc->sc_idle_timeout)) o2net_sc_reset_idle_timer(sc); @@ -1580,7 +1601,15 @@ static void o2net_start_connect(struct work_struct *work) struct sockaddr_in myaddr = {0, }, remoteaddr = {0, }; int ret = 0, stop; unsigned int timeout; + unsigned int noio_flag; + /* + * sock_create allocates the sock with GFP_KERNEL. We must set + * per-process flag PF_MEMALLOC_NOIO so that all allocations done + * by this process are done as if GFP_NOIO was specified. So we + * are not reentering filesystem while doing memory reclaim. + */ + noio_flag = memalloc_noio_save(); /* if we're greater we initiate tx, otherwise we accept */ if (o2nm_this_node() <= o2net_num_from_nn(nn)) goto out; @@ -1650,6 +1679,12 @@ static void o2net_start_connect(struct work_struct *work) goto out; } + ret = o2net_set_usertimeout(sock); + if (ret) { + mlog(ML_ERROR, "set TCP_USER_TIMEOUT failed with %d\n", ret); + goto out; + } + o2net_register_callbacks(sc->sc_sock->sk, sc); spin_lock(&nn->nn_lock); @@ -1683,6 +1718,7 @@ out: if (mynode) o2nm_node_put(mynode); + memalloc_noio_restore(noio_flag); return; } @@ -1694,7 +1730,8 @@ static void o2net_connect_expired(struct work_struct *work) spin_lock(&nn->nn_lock); if (!nn->nn_sc_valid) { printk(KERN_NOTICE "o2net: No connection established with " - "node %u after %u.%u seconds, giving up.\n", + "node %u after %u.%u seconds, check network and" + " cluster configuration.\n", o2net_num_from_nn(nn), o2net_idle_timeout() / 1000, o2net_idle_timeout() % 1000); @@ -1808,6 +1845,15 @@ static int o2net_accept_one(struct socket *sock, int *more) struct o2nm_node *local_node = NULL; struct o2net_sock_container *sc = NULL; struct o2net_node *nn; + unsigned int noio_flag; + + /* + * sock_create_lite allocates the sock with GFP_KERNEL. We must set + * per-process flag PF_MEMALLOC_NOIO so that all allocations done + * by this process are done as if GFP_NOIO was specified. So we + * are not reentering filesystem while doing memory reclaim. 
+ */ + noio_flag = memalloc_noio_save(); BUG_ON(sock == NULL); *more = 0; @@ -1831,6 +1877,12 @@ static int o2net_accept_one(struct socket *sock, int *more) goto out; } + ret = o2net_set_usertimeout(new_sock); + if (ret) { + mlog(ML_ERROR, "set TCP_USER_TIMEOUT failed with %d\n", ret); + goto out; + } + slen = sizeof(sin); ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin, &slen, 1); @@ -1918,6 +1970,8 @@ out: o2nm_node_put(local_node); if (sc) sc_put(sc); + + memalloc_noio_restore(noio_flag); return ret; } @@ -2113,17 +2167,13 @@ int o2net_init(void) o2quo_init(); if (o2net_debugfs_init()) - return -ENOMEM; + goto out; o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL); o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); o2net_keep_resp = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); - if (!o2net_hand || !o2net_keep_req || !o2net_keep_resp) { - kfree(o2net_hand); - kfree(o2net_keep_req); - kfree(o2net_keep_resp); - return -ENOMEM; - } + if (!o2net_hand || !o2net_keep_req || !o2net_keep_resp) + goto out; o2net_hand->protocol_version = cpu_to_be64(O2NET_PROTOCOL_VERSION); o2net_hand->connector_id = cpu_to_be64(1); @@ -2148,6 +2198,14 @@ int o2net_init(void) } return 0; + +out: + kfree(o2net_hand); + kfree(o2net_keep_req); + kfree(o2net_keep_resp); + + o2quo_exit(); + return -ENOMEM; } void o2net_exit(void) diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h index 5bada2a69b50..c571e849fda4 100644 --- a/fs/ocfs2/cluster/tcp.h +++ b/fs/ocfs2/cluster/tcp.h @@ -63,6 +63,7 @@ typedef void (o2net_post_msg_handler_func)(int status, void *data, #define O2NET_KEEPALIVE_DELAY_MS_DEFAULT 2000 #define O2NET_IDLE_TIMEOUT_MS_DEFAULT 30000 +#define O2NET_TCP_USER_TIMEOUT 0x7fffffff /* TODO: figure this out.... */ static inline int o2net_link_down(int err, struct socket *sock) diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 18f13c2e4a10..149eb556b8c6 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -647,41 +647,30 @@ static const struct seq_operations debug_lockres_ops = { static int debug_lockres_open(struct inode *inode, struct file *file) { struct dlm_ctxt *dlm = inode->i_private; - int ret = -ENOMEM; - struct seq_file *seq; - struct debug_lockres *dl = NULL; + struct debug_lockres *dl; + void *buf; - dl = kzalloc(sizeof(struct debug_lockres), GFP_KERNEL); - if (!dl) { - mlog_errno(ret); + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!buf) goto bail; - } - dl->dl_len = PAGE_SIZE; - dl->dl_buf = kmalloc(dl->dl_len, GFP_KERNEL); - if (!dl->dl_buf) { - mlog_errno(ret); - goto bail; - } + dl = __seq_open_private(file, &debug_lockres_ops, sizeof(*dl)); + if (!dl) + goto bailfree; - ret = seq_open(file, &debug_lockres_ops); - if (ret) { - mlog_errno(ret); - goto bail; - } - - seq = file->private_data; - seq->private = dl; + dl->dl_len = PAGE_SIZE; + dl->dl_buf = buf; dlm_grab(dlm); dl->dl_ctxt = dlm; return 0; + +bailfree: + kfree(buf); bail: - if (dl) - kfree(dl->dl_buf); - kfree(dl); - return ret; + mlog_errno(-ENOMEM); + return -ENOMEM; } static int debug_lockres_release(struct inode *inode, struct file *file) diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 39efc5057a36..02d315fef432 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -839,7 +839,7 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data, * to back off and try again. This gives heartbeat a chance * to catch up. 
*/ - if (!o2hb_check_node_heartbeating(query->node_idx)) { + if (!o2hb_check_node_heartbeating_no_sem(query->node_idx)) { mlog(0, "node %u is not in our live map yet\n", query->node_idx); @@ -1923,12 +1923,11 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) goto bail; } - if (total_backoff > - msecs_to_jiffies(DLM_JOIN_TIMEOUT_MSECS)) { + if (total_backoff > DLM_JOIN_TIMEOUT_MSECS) { status = -ERESTARTSYS; mlog(ML_NOTICE, "Timed out joining dlm domain " "%s after %u msecs\n", dlm->name, - jiffies_to_msecs(total_backoff)); + total_backoff); goto bail; } @@ -1976,24 +1975,22 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, dlm = kzalloc(sizeof(*dlm), GFP_KERNEL); if (!dlm) { - mlog_errno(-ENOMEM); + ret = -ENOMEM; + mlog_errno(ret); goto leave; } dlm->name = kstrdup(domain, GFP_KERNEL); if (dlm->name == NULL) { - mlog_errno(-ENOMEM); - kfree(dlm); - dlm = NULL; + ret = -ENOMEM; + mlog_errno(ret); goto leave; } dlm->lockres_hash = (struct hlist_head **)dlm_alloc_pagevec(DLM_HASH_PAGES); if (!dlm->lockres_hash) { - mlog_errno(-ENOMEM); - kfree(dlm->name); - kfree(dlm); - dlm = NULL; + ret = -ENOMEM; + mlog_errno(ret); goto leave; } @@ -2003,11 +2000,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, dlm->master_hash = (struct hlist_head **) dlm_alloc_pagevec(DLM_HASH_PAGES); if (!dlm->master_hash) { - mlog_errno(-ENOMEM); - dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES); - kfree(dlm->name); - kfree(dlm); - dlm = NULL; + ret = -ENOMEM; + mlog_errno(ret); goto leave; } @@ -2018,14 +2012,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, dlm->node_num = o2nm_this_node(); ret = dlm_create_debugfs_subroot(dlm); - if (ret < 0) { - dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES); - dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES); - kfree(dlm->name); - kfree(dlm); - dlm = NULL; + if (ret < 0) goto leave; - } spin_lock_init(&dlm->spinlock); spin_lock_init(&dlm->master_lock); @@ -2086,6 +2074,19 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, atomic_read(&dlm->dlm_refs.refcount)); leave: + if (ret < 0 && dlm) { + if (dlm->master_hash) + dlm_free_pagevec((void **)dlm->master_hash, + DLM_HASH_PAGES); + + if (dlm->lockres_hash) + dlm_free_pagevec((void **)dlm->lockres_hash, + DLM_HASH_PAGES); + + kfree(dlm->name); + kfree(dlm); + dlm = NULL; + } return dlm; } diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 82abf0cc9a12..215e41abf101 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -625,9 +625,6 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, return res; error: - if (res && res->lockname.name) - kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name); - if (res) kmem_cache_free(dlm_lockres_cache, res); return NULL; @@ -655,12 +652,9 @@ void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm, clear_bit(bit, res->refmap); } - -void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, +static void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { - assert_spin_locked(&res->spinlock); - res->inflight_locks++; mlog(0, "%s: res %.*s, inflight++: now %u, %ps()\n", dlm->name, @@ -668,6 +662,13 @@ void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, __builtin_return_address(0)); } +void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + assert_spin_locked(&res->spinlock); + __dlm_lockres_grab_inflight_ref(dlm, res); +} + void dlm_lockres_drop_inflight_ref(struct dlm_ctxt 
*dlm, struct dlm_lock_resource *res) { @@ -894,10 +895,8 @@ lookup: /* finally add the lockres to its hash bucket */ __dlm_insert_lockres(dlm, res); - /* Grab inflight ref to pin the resource */ - spin_lock(&res->spinlock); - dlm_lockres_grab_inflight_ref(dlm, res); - spin_unlock(&res->spinlock); + /* since this lockres is new it does not require the spinlock */ + __dlm_lockres_grab_inflight_ref(dlm, res); /* get an extra ref on the mle in case this is a BLOCK * if so, the creator of the BLOCK may try to put the last @@ -2037,6 +2036,10 @@ kill: "and killing the other node now! This node is OK and can continue.\n"); __dlm_print_one_lock_resource(res); spin_unlock(&res->spinlock); + spin_lock(&dlm->master_lock); + if (mle) + __dlm_put_mle(mle); + spin_unlock(&dlm->master_lock); spin_unlock(&dlm->spinlock); *ret_data = (void *)res; dlm_put(dlm); @@ -2405,6 +2408,10 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, if (res->state & DLM_LOCK_RES_MIGRATING) return 0; + /* delay migration when the lockres is in RECOVERING state */ + if (res->state & DLM_LOCK_RES_RECOVERING) + return 0; + if (res->owner != dlm->node_num) return 0; diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 45067faf5695..3365839d2971 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1710,9 +1710,12 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data, BUG(); } else __dlm_lockres_grab_inflight_worker(dlm, res); - } else /* put.. incase we are not the master */ + spin_unlock(&res->spinlock); + } else { + /* put.. in case we are not the master */ + spin_unlock(&res->spinlock); dlm_lockres_put(res); - spin_unlock(&res->spinlock); + } } spin_unlock(&dlm->spinlock); diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 52cfe99ae056..21262f2b1654 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2892,37 +2892,24 @@ static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file) static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file) { - int ret; struct ocfs2_dlm_seq_priv *priv; - struct seq_file *seq; struct ocfs2_super *osb; - priv = kzalloc(sizeof(struct ocfs2_dlm_seq_priv), GFP_KERNEL); + priv = __seq_open_private(file, &ocfs2_dlm_seq_ops, sizeof(*priv)); if (!priv) { - ret = -ENOMEM; - mlog_errno(ret); - goto out; + mlog_errno(-ENOMEM); + return -ENOMEM; } + osb = inode->i_private; ocfs2_get_dlm_debug(osb->osb_dlm_debug); priv->p_dlm_debug = osb->osb_dlm_debug; INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list); - ret = seq_open(file, &ocfs2_dlm_seq_ops); - if (ret) { - kfree(priv); - mlog_errno(ret); - goto out; - } - - seq = file->private_data; - seq->private = priv; - ocfs2_add_lockres_tracking(&priv->p_iter_res, priv->p_dlm_debug); -out: - return ret; + return 0; } static const struct file_operations ocfs2_dlm_debug_fops = { diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 2930e231f3f9..324dc93ac896 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -760,7 +760,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, struct address_space *mapping = inode->i_mapping; struct page *page; unsigned long index = abs_from >> PAGE_CACHE_SHIFT; - handle_t *handle = NULL; + handle_t *handle; int ret = 0; unsigned zero_from, zero_to, block_start, block_end; struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; @@ -769,11 +769,17 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT)); BUG_ON(abs_from & 
(inode->i_blkbits - 1)); + handle = ocfs2_zero_start_ordered_transaction(inode, di_bh); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + page = find_or_create_page(mapping, index, GFP_NOFS); if (!page) { ret = -ENOMEM; mlog_errno(ret); - goto out; + goto out_commit_trans; } /* Get the offsets within the page that we want to zero */ @@ -805,15 +811,6 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, goto out_unlock; } - if (!handle) { - handle = ocfs2_zero_start_ordered_transaction(inode, - di_bh); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - handle = NULL; - break; - } - } /* must not update i_size! */ ret = block_commit_write(page, block_start + 1, @@ -824,27 +821,29 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, ret = 0; } + /* + * fs-writeback will release the dirty pages without page lock + * whose offset are over inode size, the release happens at + * block_write_full_page(). + */ + i_size_write(inode, abs_to); + inode->i_blocks = ocfs2_inode_sector_count(inode); + di->i_size = cpu_to_le64((u64)i_size_read(inode)); + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); + di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + di->i_mtime_nsec = di->i_ctime_nsec; if (handle) { - /* - * fs-writeback will release the dirty pages without page lock - * whose offset are over inode size, the release happens at - * block_write_full_page(). - */ - i_size_write(inode, abs_to); - inode->i_blocks = ocfs2_inode_sector_count(inode); - di->i_size = cpu_to_le64((u64)i_size_read(inode)); - inode->i_mtime = inode->i_ctime = CURRENT_TIME; - di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); - di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); - di->i_mtime_nsec = di->i_ctime_nsec; ocfs2_journal_dirty(handle, di_bh); ocfs2_update_inode_fsync_trans(handle, inode, 1); - ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); } out_unlock: unlock_page(page); page_cache_release(page); +out_commit_trans: + if (handle) + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); out: return ret; } @@ -1253,7 +1252,7 @@ bail: brelse(bh); /* Release quota pointers in case we acquired them */ - for (qtype = 0; qtype < MAXQUOTAS; qtype++) + for (qtype = 0; qtype < OCFS2_MAXQUOTAS; qtype++) dqput(transfer_to[qtype]); if (!status && attr->ia_valid & ATTR_MODE) { diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index a6c991c0fc98..a9b76de46047 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -162,7 +162,7 @@ static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode) { int c_to_s_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits - 9; - return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits); + return (blkcnt_t)OCFS2_I(inode)->ip_clusters << c_to_s_bits; } /* Validate that a bh contains a valid inode */ diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 6f66b3751ace..53e6c40ed4c6 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -35,9 +35,8 @@ copy_to_user((typeof(a) __user *)b, &(a), sizeof(a)) /* - * This call is void because we are already reporting an error that may - * be -EFAULT. The error will be returned from the ioctl(2) call. It's - * just a best-effort to tell userspace that this request caused the error. + * This is just a best-effort to tell userspace that this request + * caused the error. 
*/ static inline void o2info_set_request_error(struct ocfs2_info_request *kreq, struct ocfs2_info_request __user *req) @@ -146,136 +145,105 @@ bail: static int ocfs2_info_handle_blocksize(struct inode *inode, struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_blocksize oib; if (o2info_from_user(oib, req)) - goto bail; + return -EFAULT; oib.ib_blocksize = inode->i_sb->s_blocksize; o2info_set_request_filled(&oib.ib_req); if (o2info_to_user(oib, req)) - goto bail; - - status = 0; -bail: - if (status) - o2info_set_request_error(&oib.ib_req, req); + return -EFAULT; - return status; + return 0; } static int ocfs2_info_handle_clustersize(struct inode *inode, struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_clustersize oic; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oic, req)) - goto bail; + return -EFAULT; oic.ic_clustersize = osb->s_clustersize; o2info_set_request_filled(&oic.ic_req); if (o2info_to_user(oic, req)) - goto bail; - - status = 0; -bail: - if (status) - o2info_set_request_error(&oic.ic_req, req); + return -EFAULT; - return status; + return 0; } static int ocfs2_info_handle_maxslots(struct inode *inode, struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_maxslots oim; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oim, req)) - goto bail; + return -EFAULT; oim.im_max_slots = osb->max_slots; o2info_set_request_filled(&oim.im_req); if (o2info_to_user(oim, req)) - goto bail; + return -EFAULT; - status = 0; -bail: - if (status) - o2info_set_request_error(&oim.im_req, req); - - return status; + return 0; } static int ocfs2_info_handle_label(struct inode *inode, struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_label oil; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oil, req)) - goto bail; + return -EFAULT; memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN); o2info_set_request_filled(&oil.il_req); if (o2info_to_user(oil, req)) - goto bail; + return -EFAULT; - status = 0; -bail: - if (status) - o2info_set_request_error(&oil.il_req, req); - - return status; + return 0; } static int ocfs2_info_handle_uuid(struct inode *inode, struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_uuid oiu; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oiu, req)) - goto bail; + return -EFAULT; memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1); o2info_set_request_filled(&oiu.iu_req); if (o2info_to_user(oiu, req)) - goto bail; - - status = 0; -bail: - if (status) - o2info_set_request_error(&oiu.iu_req, req); + return -EFAULT; - return status; + return 0; } static int ocfs2_info_handle_fs_features(struct inode *inode, struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_fs_features oif; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oif, req)) - goto bail; + return -EFAULT; oif.if_compat_features = osb->s_feature_compat; oif.if_incompat_features = osb->s_feature_incompat; @@ -284,39 +252,28 @@ static int ocfs2_info_handle_fs_features(struct inode *inode, o2info_set_request_filled(&oif.if_req); if (o2info_to_user(oif, req)) - goto bail; + return -EFAULT; - status = 0; -bail: - if (status) - o2info_set_request_error(&oif.if_req, req); - - return status; + return 0; } static int ocfs2_info_handle_journal_size(struct inode *inode, struct ocfs2_info_request __user *req) { - int status = 
-EFAULT; struct ocfs2_info_journal_size oij; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oij, req)) - goto bail; + return -EFAULT; oij.ij_journal_size = i_size_read(osb->journal->j_inode); o2info_set_request_filled(&oij.ij_req); if (o2info_to_user(oij, req)) - goto bail; + return -EFAULT; - status = 0; -bail: - if (status) - o2info_set_request_error(&oij.ij_req, req); - - return status; + return 0; } static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb, @@ -373,7 +330,7 @@ static int ocfs2_info_handle_freeinode(struct inode *inode, u32 i; u64 blkno = -1; char namebuf[40]; - int status = -EFAULT, type = INODE_ALLOC_SYSTEM_INODE; + int status, type = INODE_ALLOC_SYSTEM_INODE; struct ocfs2_info_freeinode *oifi = NULL; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct inode *inode_alloc = NULL; @@ -385,8 +342,10 @@ static int ocfs2_info_handle_freeinode(struct inode *inode, goto out_err; } - if (o2info_from_user(*oifi, req)) - goto bail; + if (o2info_from_user(*oifi, req)) { + status = -EFAULT; + goto out_free; + } oifi->ifi_slotnum = osb->max_slots; @@ -424,14 +383,16 @@ static int ocfs2_info_handle_freeinode(struct inode *inode, o2info_set_request_filled(&oifi->ifi_req); - if (o2info_to_user(*oifi, req)) - goto bail; + if (o2info_to_user(*oifi, req)) { + status = -EFAULT; + goto out_free; + } status = 0; bail: if (status) o2info_set_request_error(&oifi->ifi_req, req); - +out_free: kfree(oifi); out_err: return status; @@ -658,7 +619,7 @@ static int ocfs2_info_handle_freefrag(struct inode *inode, { u64 blkno = -1; char namebuf[40]; - int status = -EFAULT, type = GLOBAL_BITMAP_SYSTEM_INODE; + int status, type = GLOBAL_BITMAP_SYSTEM_INODE; struct ocfs2_info_freefrag *oiff; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); @@ -671,8 +632,10 @@ static int ocfs2_info_handle_freefrag(struct inode *inode, goto out_err; } - if (o2info_from_user(*oiff, req)) - goto bail; + if (o2info_from_user(*oiff, req)) { + status = -EFAULT; + goto out_free; + } /* * chunksize from userspace should be power of 2. */ @@ -711,14 +674,14 @@ static int ocfs2_info_handle_freefrag(struct inode *inode, if (o2info_to_user(*oiff, req)) { status = -EFAULT; - goto bail; + goto out_free; } status = 0; bail: if (status) o2info_set_request_error(&oiff->iff_req, req); - +out_free: kfree(oiff); out_err: return status; @@ -727,23 +690,17 @@ out_err: static int ocfs2_info_handle_unknown(struct inode *inode, struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_request oir; if (o2info_from_user(oir, req)) - goto bail; + return -EFAULT; o2info_clear_request_filled(&oir); if (o2info_to_user(oir, req)) - goto bail; + return -EFAULT; - status = 0; -bail: - if (status) - o2info_set_request_error(&oir, req); - - return status; + return 0; } /* diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 599eb4c4c8be..74caffeeee1d 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -98,7 +98,7 @@ static int __ocfs2_move_extent(handle_t *handle, el = path_leaf_el(path); index = ocfs2_search_extent_list(el, cpos); - if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { + if (index == -1) { ocfs2_error(inode->i_sb, "Inode %llu has an extent at cpos %u which can no " "longer be found.\n", @@ -404,7 +404,7 @@ static int ocfs2_find_victim_alloc_group(struct inode *inode, * 'vict_blkno' was out of the valid range. 
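
[Note: the cast added in the next hunk is the whole fix: le32_to_cpu() yields a 32-bit value, and without widening one operand first the left shift by bits_per_unit happens in 32-bit arithmetic and can wrap before the comparison against the 64-bit vict_blkno. A standalone illustration with made-up numbers:]

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t i_total = 0x00400000;	/* hypothetical bitmap total */
	unsigned int bits_per_unit = 12;

	/* shift performed in 32 bits, wraps to 0 before widening */
	uint64_t wrong = i_total << bits_per_unit;
	/* widen first, then shift: 0x400000000 as intended */
	uint64_t right = (uint64_t)i_total << bits_per_unit;

	printf("wrong=%#llx right=%#llx\n",
	       (unsigned long long)wrong, (unsigned long long)right);
	return 0;
}
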
*/ if ((vict_blkno < le64_to_cpu(rec->c_blkno)) || - (vict_blkno >= (le32_to_cpu(ac_dinode->id1.bitmap1.i_total) << + (vict_blkno >= ((u64)le32_to_cpu(ac_dinode->id1.bitmap1.i_total) << bits_per_unit))) { ret = -EINVAL; goto out; diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h index f266d67df3c6..1eae330193a6 100644 --- a/fs/ocfs2/quota.h +++ b/fs/ocfs2/quota.h @@ -17,6 +17,9 @@ #include "ocfs2.h" +/* Number of quota types we support */ +#define OCFS2_MAXQUOTAS 2 + /* * In-memory structures */ @@ -39,7 +42,7 @@ struct ocfs2_recovery_chunk { }; struct ocfs2_quota_recovery { - struct list_head r_list[MAXQUOTAS]; /* List of chunks to recover */ + struct list_head r_list[OCFS2_MAXQUOTAS]; /* List of chunks to recover */ }; /* In-memory structure with quota header information */ diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index b990a62cff50..c93d67220887 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -336,8 +336,8 @@ void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex) int ocfs2_global_read_info(struct super_block *sb, int type) { struct inode *gqinode = NULL; - unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE, - GROUP_QUOTA_SYSTEM_INODE }; + unsigned int ino[OCFS2_MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE, + GROUP_QUOTA_SYSTEM_INODE }; struct ocfs2_global_disk_dqinfo dinfo; struct mem_dqinfo *info = sb_dqinfo(sb, type); struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 2001862bf2b1..10b653930ee2 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -166,12 +166,12 @@ static int ocfs2_read_quota_block(struct inode *inode, u64 v_block, /* Check whether we understand format of quota files */ static int ocfs2_local_check_quota_file(struct super_block *sb, int type) { - unsigned int lmagics[MAXQUOTAS] = OCFS2_LOCAL_QMAGICS; - unsigned int lversions[MAXQUOTAS] = OCFS2_LOCAL_QVERSIONS; - unsigned int gmagics[MAXQUOTAS] = OCFS2_GLOBAL_QMAGICS; - unsigned int gversions[MAXQUOTAS] = OCFS2_GLOBAL_QVERSIONS; - unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE, - GROUP_QUOTA_SYSTEM_INODE }; + unsigned int lmagics[OCFS2_MAXQUOTAS] = OCFS2_LOCAL_QMAGICS; + unsigned int lversions[OCFS2_MAXQUOTAS] = OCFS2_LOCAL_QVERSIONS; + unsigned int gmagics[OCFS2_MAXQUOTAS] = OCFS2_GLOBAL_QMAGICS; + unsigned int gversions[OCFS2_MAXQUOTAS] = OCFS2_GLOBAL_QVERSIONS; + unsigned int ino[OCFS2_MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE, + GROUP_QUOTA_SYSTEM_INODE }; struct buffer_head *bh = NULL; struct inode *linode = sb_dqopt(sb)->files[type]; struct inode *ginode = NULL; @@ -336,7 +336,7 @@ void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec) { int type; - for (type = 0; type < MAXQUOTAS; type++) + for (type = 0; type < OCFS2_MAXQUOTAS; type++) free_recovery_list(&(rec->r_list[type])); kfree(rec); } @@ -382,7 +382,7 @@ static struct ocfs2_quota_recovery *ocfs2_alloc_quota_recovery(void) rec = kmalloc(sizeof(struct ocfs2_quota_recovery), GFP_NOFS); if (!rec) return NULL; - for (type = 0; type < MAXQUOTAS; type++) + for (type = 0; type < OCFS2_MAXQUOTAS; type++) INIT_LIST_HEAD(&(rec->r_list[type])); return rec; } @@ -392,10 +392,11 @@ struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery( struct ocfs2_super *osb, int slot_num) { - unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, - OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; - unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, - LOCAL_GROUP_QUOTA_SYSTEM_INODE }; + unsigned int 
feature[OCFS2_MAXQUOTAS] = { + OCFS2_FEATURE_RO_COMPAT_USRQUOTA, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; + unsigned int ino[OCFS2_MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, + LOCAL_GROUP_QUOTA_SYSTEM_INODE }; struct super_block *sb = osb->sb; struct ocfs2_local_disk_dqinfo *ldinfo; struct inode *lqinode; @@ -412,7 +413,7 @@ struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery( return ERR_PTR(-ENOMEM); /* First init... */ - for (type = 0; type < MAXQUOTAS; type++) { + for (type = 0; type < OCFS2_MAXQUOTAS; type++) { if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) continue; /* At this point, journal of the slot is already replayed so @@ -589,8 +590,8 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, struct ocfs2_quota_recovery *rec, int slot_num) { - unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, - LOCAL_GROUP_QUOTA_SYSTEM_INODE }; + unsigned int ino[OCFS2_MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, + LOCAL_GROUP_QUOTA_SYSTEM_INODE }; struct super_block *sb = osb->sb; struct ocfs2_local_disk_dqinfo *ldinfo; struct buffer_head *bh; @@ -604,7 +605,7 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, "slot %u\n", osb->dev_str, slot_num); mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); - for (type = 0; type < MAXQUOTAS; type++) { + for (type = 0; type < OCFS2_MAXQUOTAS; type++) { if (list_empty(&(rec->r_list[type]))) continue; trace_ocfs2_finish_quota_recovery(slot_num); diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 636aab69ead5..d81f6e2a97f5 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -3109,7 +3109,7 @@ static int ocfs2_clear_ext_refcount(handle_t *handle, el = path_leaf_el(path); index = ocfs2_search_extent_list(el, cpos); - if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { + if (index == -1) { ocfs2_error(sb, "Inode %llu has an extent at cpos %u which can no " "longer be found.\n", diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index 1424c151cccc..a88b2a4fcc85 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -382,7 +382,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb, trace_ocfs2_map_slot_buffers(bytes, si->si_blocks); - si->si_bh = kzalloc(sizeof(struct buffer_head *) * si->si_blocks, + si->si_bh = kcalloc(si->si_blocks, sizeof(struct buffer_head *), GFP_KERNEL); if (!si->si_bh) { status = -ENOMEM; diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 13a8537d8e8b..720aa389e0ea 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -591,7 +591,7 @@ static int ocfs2_control_release(struct inode *inode, struct file *file) */ ocfs2_control_this_node = -1; running_proto.pv_major = 0; - running_proto.pv_major = 0; + running_proto.pv_minor = 0; } out: diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index ddb662b32447..93c85bc745e1 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -899,11 +899,12 @@ static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend) { int type; struct super_block *sb = osb->sb; - unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, - OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; + unsigned int feature[OCFS2_MAXQUOTAS] = { + OCFS2_FEATURE_RO_COMPAT_USRQUOTA, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; int status = 0; - for (type = 0; type < MAXQUOTAS; type++) { + for (type = 0; type < OCFS2_MAXQUOTAS; type++) { if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) continue; if (unsuspend) @@ -927,17 +928,19 @@ static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend) static int 
ocfs2_enable_quotas(struct ocfs2_super *osb) { - struct inode *inode[MAXQUOTAS] = { NULL, NULL }; + struct inode *inode[OCFS2_MAXQUOTAS] = { NULL, NULL }; struct super_block *sb = osb->sb; - unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, - OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; - unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, + unsigned int feature[OCFS2_MAXQUOTAS] = { + OCFS2_FEATURE_RO_COMPAT_USRQUOTA, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; + unsigned int ino[OCFS2_MAXQUOTAS] = { + LOCAL_USER_QUOTA_SYSTEM_INODE, LOCAL_GROUP_QUOTA_SYSTEM_INODE }; int status; int type; sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NEGATIVE_USAGE; - for (type = 0; type < MAXQUOTAS; type++) { + for (type = 0; type < OCFS2_MAXQUOTAS; type++) { if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) continue; inode[type] = ocfs2_get_system_file_inode(osb, ino[type], @@ -952,12 +955,12 @@ static int ocfs2_enable_quotas(struct ocfs2_super *osb) goto out_quota_off; } - for (type = 0; type < MAXQUOTAS; type++) + for (type = 0; type < OCFS2_MAXQUOTAS; type++) iput(inode[type]); return 0; out_quota_off: ocfs2_disable_quotas(osb); - for (type = 0; type < MAXQUOTAS; type++) + for (type = 0; type < OCFS2_MAXQUOTAS; type++) iput(inode[type]); mlog_errno(status); return status; @@ -972,7 +975,7 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb) /* We mostly ignore errors in this function because there's not much * we can do when we see them */ - for (type = 0; type < MAXQUOTAS; type++) { + for (type = 0; type < OCFS2_MAXQUOTAS; type++) { if (!sb_has_quota_loaded(sb, type)) continue; /* Cancel periodic syncing before we grab dqonoff_mutex */ @@ -993,8 +996,9 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb) /* Handle quota on quotactl */ static int ocfs2_quota_on(struct super_block *sb, int type, int format_id) { - unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, - OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; + unsigned int feature[OCFS2_MAXQUOTAS] = { + OCFS2_FEATURE_RO_COMPAT_USRQUOTA, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) return -EINVAL; @@ -2532,6 +2536,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb) kfree(osb->journal); kfree(osb->local_alloc_copy); kfree(osb->uuid_str); + kfree(osb->vol_label); ocfs2_put_dlm_debug(osb->osb_dlm_debug); memset(osb, 0, sizeof(struct ocfs2_super)); } diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index ec58c7659183..ba8819702c56 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -321,7 +321,7 @@ static int omfs_get_imap(struct super_block *sb) goto out; sbi->s_imap_size = array_size; - sbi->s_imap = kzalloc(array_size * sizeof(unsigned long *), GFP_KERNEL); + sbi->s_imap = kcalloc(array_size, sizeof(unsigned long *), GFP_KERNEL); if (!sbi->s_imap) goto nomem; diff --git a/fs/pnode.c b/fs/pnode.c index 302bf22c4a30..aae331a5d03b 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -381,6 +381,7 @@ static void __propagate_umount(struct mount *mnt) * other children */ if (child && list_empty(&child->mnt_mounts)) { + list_del_init(&child->mnt_child); hlist_del_init_rcu(&child->mnt_hash); hlist_add_before_rcu(&child->mnt_hash, &mnt->mnt_hash); } diff --git a/fs/proc/Makefile b/fs/proc/Makefile index 239493ec718e..7151ea428041 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -23,6 +23,7 @@ proc-y += version.o proc-y += softirqs.o proc-y += namespaces.o proc-y += self.o +proc-y += thread_self.o proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o proc-$(CONFIG_NET) += 
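
[Note: the kcalloc() conversions above (slot_map.c, omfs) are not just stylistic: unlike kzalloc(n * size, ...), kcalloc(n, size, ...) rejects allocations whose size computation would overflow. Roughly what that adds, as a userspace sketch rather than the kernel implementation:]

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static void *zalloc_array(size_t n, size_t size)
{
	if (size != 0 && n > SIZE_MAX / size)
		return NULL;		/* n * size would overflow */
	return calloc(n, size);	/* zeroed, like kzalloc() */
}

int main(void)
{
	void *ok = zalloc_array(16, sizeof(void *));
	void *bad = zalloc_array(SIZE_MAX / 2, 4);	/* rejected */

	printf("ok=%p bad=%p\n", ok, bad);
	free(ok);
	return 0;
}
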
proc_net.o proc-$(CONFIG_PROC_KCORE) += kcore.o diff --git a/fs/proc/array.c b/fs/proc/array.c index 3e1290b0492e..cd3653e4f35c 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -464,13 +464,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, priority = task_prio(task); nice = task_nice(task); - /* Temporary variable needed for gcc-2.96 */ - /* convert timespec -> nsec*/ - start_time = - (unsigned long long)task->real_start_time.tv_sec * NSEC_PER_SEC - + task->real_start_time.tv_nsec; /* convert nsec -> ticks */ - start_time = nsec_to_clock_t(start_time); + start_time = nsec_to_clock_t(task->real_start_time); seq_printf(m, "%d (%s) %c", pid_nr_ns(pid, ns), tcomm, state); seq_put_decimal_ll(m, ' ', ppid); diff --git a/fs/proc/base.c b/fs/proc/base.c index 2d696b0c93bf..950100e326a1 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -105,7 +105,7 @@ */ struct pid_entry { - char *name; + const char *name; int len; umode_t mode; const struct inode_operations *iop; @@ -130,10 +130,6 @@ struct pid_entry { { .proc_get_link = get_link } ) #define REG(NAME, MODE, fops) \ NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {}) -#define INF(NAME, MODE, read) \ - NOD(NAME, (S_IFREG|(MODE)), \ - NULL, &proc_info_file_operations, \ - { .proc_read = read } ) #define ONE(NAME, MODE, show) \ NOD(NAME, (S_IFREG|(MODE)), \ NULL, &proc_single_file_operations, \ @@ -200,27 +196,32 @@ static int proc_root_link(struct dentry *dentry, struct path *path) return result; } -static int proc_pid_cmdline(struct task_struct *task, char *buffer) +static int proc_pid_cmdline(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) { - return get_cmdline(task, buffer, PAGE_SIZE); + /* + * Rely on struct seq_operations::show() being called once + * per internal buffer allocation. See single_open(), traverse(). + */ + BUG_ON(m->size < PAGE_SIZE); + m->count += get_cmdline(task, m->buf, PAGE_SIZE); + return 0; } -static int proc_pid_auxv(struct task_struct *task, char *buffer) +static int proc_pid_auxv(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) { struct mm_struct *mm = mm_access(task, PTRACE_MODE_READ); - int res = PTR_ERR(mm); if (mm && !IS_ERR(mm)) { unsigned int nwords = 0; do { nwords += 2; } while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */ - res = nwords * sizeof(mm->saved_auxv[0]); - if (res > PAGE_SIZE) - res = PAGE_SIZE; - memcpy(buffer, mm->saved_auxv, res); + seq_write(m, mm->saved_auxv, nwords * sizeof(mm->saved_auxv[0])); mmput(mm); - } - return res; + return 0; + } else + return PTR_ERR(mm); } @@ -229,7 +230,8 @@ static int proc_pid_auxv(struct task_struct *task, char *buffer) * Provides a wchan file via kallsyms in a proper one-value-per-file format. * Returns the resolved symbol. If that fails, simply return the address. 
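
[Note: from here on, base.c converts the old INF()/proc_read handlers, which sprintf into a single raw page, to ONE()/seq_file show functions that write into a managed buffer and return 0. The shape of the conversion as a runnable userspace analogue, with FILE * standing in for struct seq_file and hypothetical names throughout:]

#include <stdio.h>

struct task { long wchan; };

/* old style: format into a caller-supplied page, return the length */
static int wchan_read(struct task *t, char *buffer)
{
	return sprintf(buffer, "%lu", t->wchan);
}

/* new style: write into the stream and return 0; the seq_file core
 * handles buffer sizing, seeking and partial reads */
static int wchan_show(FILE *m, struct task *t)
{
	fprintf(m, "%lu", t->wchan);
	return 0;
}

int main(void)
{
	struct task t = { .wchan = 42 };
	char page[4096];

	wchan_read(&t, page);
	puts(page);
	wchan_show(stdout, &t);
	putchar('\n');
	return 0;
}
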
*/ -static int proc_pid_wchan(struct task_struct *task, char *buffer) +static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) { unsigned long wchan; char symname[KSYM_NAME_LEN]; @@ -240,9 +242,9 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer) if (!ptrace_may_access(task, PTRACE_MODE_READ)) return 0; else - return sprintf(buffer, "%lu", wchan); + return seq_printf(m, "%lu", wchan); else - return sprintf(buffer, "%s", symname); + return seq_printf(m, "%s", symname); } #endif /* CONFIG_KALLSYMS */ @@ -304,9 +306,10 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, /* * Provides /proc/PID/schedstat */ -static int proc_pid_schedstat(struct task_struct *task, char *buffer) +static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) { - return sprintf(buffer, "%llu %llu %lu\n", + return seq_printf(m, "%llu %llu %lu\n", (unsigned long long)task->se.sum_exec_runtime, (unsigned long long)task->sched_info.run_delay, task->sched_info.pcount); @@ -373,38 +376,8 @@ static const struct file_operations proc_lstats_operations = { #endif -#ifdef CONFIG_CGROUPS -static int cgroup_open(struct inode *inode, struct file *file) -{ - struct pid *pid = PROC_I(inode)->pid; - return single_open(file, proc_cgroup_show, pid); -} - -static const struct file_operations proc_cgroup_operations = { - .open = cgroup_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; -#endif - -#ifdef CONFIG_PROC_PID_CPUSET - -static int cpuset_open(struct inode *inode, struct file *file) -{ - struct pid *pid = PROC_I(inode)->pid; - return single_open(file, proc_cpuset_show, pid); -} - -static const struct file_operations proc_cpuset_operations = { - .open = cpuset_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; -#endif - -static int proc_oom_score(struct task_struct *task, char *buffer) +static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) { unsigned long totalpages = totalram_pages + total_swap_pages; unsigned long points = 0; @@ -414,12 +387,12 @@ static int proc_oom_score(struct task_struct *task, char *buffer) points = oom_badness(task, NULL, NULL, totalpages) * 1000 / totalpages; read_unlock(&tasklist_lock); - return sprintf(buffer, "%lu\n", points); + return seq_printf(m, "%lu\n", points); } struct limit_names { - char *name; - char *unit; + const char *name; + const char *unit; }; static const struct limit_names lnames[RLIM_NLIMITS] = { @@ -442,12 +415,11 @@ static const struct limit_names lnames[RLIM_NLIMITS] = { }; /* Display limits for a process */ -static int proc_pid_limits(struct task_struct *task, char *buffer) +static int proc_pid_limits(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) { unsigned int i; - int count = 0; unsigned long flags; - char *bufptr = buffer; struct rlimit rlim[RLIM_NLIMITS]; @@ -459,35 +431,34 @@ static int proc_pid_limits(struct task_struct *task, char *buffer) /* * print the file header */ - count += sprintf(&bufptr[count], "%-25s %-20s %-20s %-10s\n", + seq_printf(m, "%-25s %-20s %-20s %-10s\n", "Limit", "Soft Limit", "Hard Limit", "Units"); for (i = 0; i < RLIM_NLIMITS; i++) { if (rlim[i].rlim_cur == RLIM_INFINITY) - count += sprintf(&bufptr[count], "%-25s %-20s ", + seq_printf(m, "%-25s %-20s ", lnames[i].name, "unlimited"); else - count += 
sprintf(&bufptr[count], "%-25s %-20lu ", + seq_printf(m, "%-25s %-20lu ", lnames[i].name, rlim[i].rlim_cur); if (rlim[i].rlim_max == RLIM_INFINITY) - count += sprintf(&bufptr[count], "%-20s ", "unlimited"); + seq_printf(m, "%-20s ", "unlimited"); else - count += sprintf(&bufptr[count], "%-20lu ", - rlim[i].rlim_max); + seq_printf(m, "%-20lu ", rlim[i].rlim_max); if (lnames[i].unit) - count += sprintf(&bufptr[count], "%-10s\n", - lnames[i].unit); + seq_printf(m, "%-10s\n", lnames[i].unit); else - count += sprintf(&bufptr[count], "\n"); + seq_putc(m, '\n'); } - return count; + return 0; } #ifdef CONFIG_HAVE_ARCH_TRACEHOOK -static int proc_pid_syscall(struct task_struct *task, char *buffer) +static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) { long nr; unsigned long args[6], sp, pc; @@ -496,11 +467,11 @@ static int proc_pid_syscall(struct task_struct *task, char *buffer) return res; if (task_current_syscall(task, &nr, args, 6, &sp, &pc)) - res = sprintf(buffer, "running\n"); + seq_puts(m, "running\n"); else if (nr < 0) - res = sprintf(buffer, "%ld 0x%lx 0x%lx\n", nr, sp, pc); + seq_printf(m, "%ld 0x%lx 0x%lx\n", nr, sp, pc); else - res = sprintf(buffer, + seq_printf(m, "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n", nr, args[0], args[1], args[2], args[3], args[4], args[5], @@ -598,43 +569,6 @@ static const struct inode_operations proc_def_inode_operations = { .setattr = proc_setattr, }; -#define PROC_BLOCK_SIZE (3*1024) /* 4K page size but our output routines use some slack for overruns */ - -static ssize_t proc_info_read(struct file * file, char __user * buf, - size_t count, loff_t *ppos) -{ - struct inode * inode = file_inode(file); - unsigned long page; - ssize_t length; - struct task_struct *task = get_proc_task(inode); - - length = -ESRCH; - if (!task) - goto out_no_task; - - if (count > PROC_BLOCK_SIZE) - count = PROC_BLOCK_SIZE; - - length = -ENOMEM; - if (!(page = __get_free_page(GFP_TEMPORARY))) - goto out; - - length = PROC_I(inode)->op.proc_read(task, (char*)page); - - if (length >= 0) - length = simple_read_from_buffer(buf, count, ppos, (char *)page, length); - free_page(page); -out: - put_task_struct(task); -out_no_task: - return length; -} - -static const struct file_operations proc_info_file_operations = { - .read = proc_info_read, - .llseek = generic_file_llseek, -}; - static int proc_single_show(struct seq_file *m, void *v) { struct inode *inode = m->private; @@ -667,29 +601,35 @@ static const struct file_operations proc_single_file_operations = { .release = single_release, }; -static int __mem_open(struct inode *inode, struct file *file, unsigned int mode) + +struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode) { - struct task_struct *task = get_proc_task(file_inode(file)); - struct mm_struct *mm; + struct task_struct *task = get_proc_task(inode); + struct mm_struct *mm = ERR_PTR(-ESRCH); - if (!task) - return -ESRCH; + if (task) { + mm = mm_access(task, mode); + put_task_struct(task); - mm = mm_access(task, mode); - put_task_struct(task); + if (!IS_ERR_OR_NULL(mm)) { + /* ensure this mm_struct can't be freed */ + atomic_inc(&mm->mm_count); + /* but do not pin its memory */ + mmput(mm); + } + } + + return mm; +} + +static int __mem_open(struct inode *inode, struct file *file, unsigned int mode) +{ + struct mm_struct *mm = proc_mem_open(inode, mode); if (IS_ERR(mm)) return PTR_ERR(mm); - if (mm) { - /* ensure this mm_struct can't be freed */ - atomic_inc(&mm->mm_count); - /* but do 
not pin its memory */ - mmput(mm); - } - file->private_data = mm; - return 0; } @@ -2056,7 +1996,7 @@ static int show_timer(struct seq_file *m, void *v) struct k_itimer *timer; struct timers_private *tp = m->private; int notify; - static char *nstr[] = { + static const char * const nstr[] = { [SIGEV_SIGNAL] = "signal", [SIGEV_NONE] = "none", [SIGEV_THREAD] = "thread", @@ -2392,7 +2332,7 @@ static const struct file_operations proc_coredump_filter_operations = { #endif #ifdef CONFIG_TASK_IO_ACCOUNTING -static int do_io_accounting(struct task_struct *task, char *buffer, int whole) +static int do_io_accounting(struct task_struct *task, struct seq_file *m, int whole) { struct task_io_accounting acct = task->ioac; unsigned long flags; @@ -2416,7 +2356,7 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole) unlock_task_sighand(task, &flags); } - result = sprintf(buffer, + result = seq_printf(m, "rchar: %llu\n" "wchar: %llu\n" "syscr: %llu\n" @@ -2436,20 +2376,22 @@ out_unlock: return result; } -static int proc_tid_io_accounting(struct task_struct *task, char *buffer) +static int proc_tid_io_accounting(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) { - return do_io_accounting(task, buffer, 0); + return do_io_accounting(task, m, 0); } -static int proc_tgid_io_accounting(struct task_struct *task, char *buffer) +static int proc_tgid_io_accounting(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) { - return do_io_accounting(task, buffer, 1); + return do_io_accounting(task, m, 1); } #endif /* CONFIG_TASK_IO_ACCOUNTING */ #ifdef CONFIG_USER_NS static int proc_id_map_open(struct inode *inode, struct file *file, - struct seq_operations *seq_ops) + const struct seq_operations *seq_ops) { struct user_namespace *ns = NULL; struct task_struct *task; @@ -2557,10 +2499,10 @@ static const struct pid_entry tgid_base_stuff[] = { DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), #endif REG("environ", S_IRUSR, proc_environ_operations), - INF("auxv", S_IRUSR, proc_pid_auxv), + ONE("auxv", S_IRUSR, proc_pid_auxv), ONE("status", S_IRUGO, proc_pid_status), ONE("personality", S_IRUSR, proc_pid_personality), - INF("limits", S_IRUGO, proc_pid_limits), + ONE("limits", S_IRUGO, proc_pid_limits), #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif @@ -2569,9 +2511,9 @@ static const struct pid_entry tgid_base_stuff[] = { #endif REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK - INF("syscall", S_IRUSR, proc_pid_syscall), + ONE("syscall", S_IRUSR, proc_pid_syscall), #endif - INF("cmdline", S_IRUGO, proc_pid_cmdline), + ONE("cmdline", S_IRUGO, proc_pid_cmdline), ONE("stat", S_IRUGO, proc_tgid_stat), ONE("statm", S_IRUGO, proc_pid_statm), REG("maps", S_IRUGO, proc_pid_maps_operations), @@ -2594,24 +2536,24 @@ static const struct pid_entry tgid_base_stuff[] = { DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), #endif #ifdef CONFIG_KALLSYMS - INF("wchan", S_IRUGO, proc_pid_wchan), + ONE("wchan", S_IRUGO, proc_pid_wchan), #endif #ifdef CONFIG_STACKTRACE ONE("stack", S_IRUSR, proc_pid_stack), #endif #ifdef CONFIG_SCHEDSTATS - INF("schedstat", S_IRUGO, proc_pid_schedstat), + ONE("schedstat", S_IRUGO, proc_pid_schedstat), #endif #ifdef CONFIG_LATENCYTOP REG("latency", S_IRUGO, proc_lstats_operations), #endif #ifdef CONFIG_PROC_PID_CPUSET - REG("cpuset", S_IRUGO, 
proc_cpuset_operations), + ONE("cpuset", S_IRUGO, proc_cpuset_show), #endif #ifdef CONFIG_CGROUPS - REG("cgroup", S_IRUGO, proc_cgroup_operations), + ONE("cgroup", S_IRUGO, proc_cgroup_show), #endif - INF("oom_score", S_IRUGO, proc_oom_score), + ONE("oom_score", S_IRUGO, proc_oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDITSYSCALL @@ -2625,10 +2567,10 @@ static const struct pid_entry tgid_base_stuff[] = { REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations), #endif #ifdef CONFIG_TASK_IO_ACCOUNTING - INF("io", S_IRUSR, proc_tgid_io_accounting), + ONE("io", S_IRUSR, proc_tgid_io_accounting), #endif #ifdef CONFIG_HARDWALL - INF("hardwall", S_IRUGO, proc_pid_hardwall), + ONE("hardwall", S_IRUGO, proc_pid_hardwall), #endif #ifdef CONFIG_USER_NS REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), @@ -2780,12 +2722,12 @@ out: struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) { - int result = 0; + int result = -ENOENT; struct task_struct *task; unsigned tgid; struct pid_namespace *ns; - tgid = name_to_int(dentry); + tgid = name_to_int(&dentry->d_name); if (tgid == ~0U) goto out; @@ -2847,7 +2789,7 @@ retry: return iter; } -#define TGID_OFFSET (FIRST_PROCESS_ENTRY + 1) +#define TGID_OFFSET (FIRST_PROCESS_ENTRY + 2) /* for the /proc/ directory itself, after non-process stuff has been done */ int proc_pid_readdir(struct file *file, struct dir_context *ctx) @@ -2859,14 +2801,19 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx) if (pos >= PID_MAX_LIMIT + TGID_OFFSET) return 0; - if (pos == TGID_OFFSET - 1) { + if (pos == TGID_OFFSET - 2) { struct inode *inode = ns->proc_self->d_inode; if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK)) return 0; - iter.tgid = 0; - } else { - iter.tgid = pos - TGID_OFFSET; + ctx->pos = pos = pos + 1; } + if (pos == TGID_OFFSET - 1) { + struct inode *inode = ns->proc_thread_self->d_inode; + if (!dir_emit(ctx, "thread-self", 11, inode->i_ino, DT_LNK)) + return 0; + ctx->pos = pos = pos + 1; + } + iter.tgid = pos - TGID_OFFSET; iter.task = NULL; for (iter = next_tgid(ns, iter); iter.task; @@ -2895,19 +2842,22 @@ static const struct pid_entry tid_base_stuff[] = { DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), +#ifdef CONFIG_NET + DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), +#endif REG("environ", S_IRUSR, proc_environ_operations), - INF("auxv", S_IRUSR, proc_pid_auxv), + ONE("auxv", S_IRUSR, proc_pid_auxv), ONE("status", S_IRUGO, proc_pid_status), ONE("personality", S_IRUSR, proc_pid_personality), - INF("limits", S_IRUGO, proc_pid_limits), + ONE("limits", S_IRUGO, proc_pid_limits), #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK - INF("syscall", S_IRUSR, proc_pid_syscall), + ONE("syscall", S_IRUSR, proc_pid_syscall), #endif - INF("cmdline", S_IRUGO, proc_pid_cmdline), + ONE("cmdline", S_IRUGO, proc_pid_cmdline), ONE("stat", S_IRUGO, proc_tid_stat), ONE("statm", S_IRUGO, proc_pid_statm), REG("maps", S_IRUGO, proc_tid_maps_operations), @@ -2932,24 +2882,24 @@ static const struct pid_entry tid_base_stuff[] = { DIR("attr", 
S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), #endif #ifdef CONFIG_KALLSYMS - INF("wchan", S_IRUGO, proc_pid_wchan), + ONE("wchan", S_IRUGO, proc_pid_wchan), #endif #ifdef CONFIG_STACKTRACE ONE("stack", S_IRUSR, proc_pid_stack), #endif #ifdef CONFIG_SCHEDSTATS - INF("schedstat", S_IRUGO, proc_pid_schedstat), + ONE("schedstat", S_IRUGO, proc_pid_schedstat), #endif #ifdef CONFIG_LATENCYTOP REG("latency", S_IRUGO, proc_lstats_operations), #endif #ifdef CONFIG_PROC_PID_CPUSET - REG("cpuset", S_IRUGO, proc_cpuset_operations), + ONE("cpuset", S_IRUGO, proc_cpuset_show), #endif #ifdef CONFIG_CGROUPS - REG("cgroup", S_IRUGO, proc_cgroup_operations), + ONE("cgroup", S_IRUGO, proc_cgroup_show), #endif - INF("oom_score", S_IRUGO, proc_oom_score), + ONE("oom_score", S_IRUGO, proc_oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDITSYSCALL @@ -2960,10 +2910,10 @@ static const struct pid_entry tid_base_stuff[] = { REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), #endif #ifdef CONFIG_TASK_IO_ACCOUNTING - INF("io", S_IRUSR, proc_tid_io_accounting), + ONE("io", S_IRUSR, proc_tid_io_accounting), #endif #ifdef CONFIG_HARDWALL - INF("hardwall", S_IRUGO, proc_pid_hardwall), + ONE("hardwall", S_IRUGO, proc_pid_hardwall), #endif #ifdef CONFIG_USER_NS REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), @@ -3033,7 +2983,7 @@ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry if (!leader) goto out_no_task; - tid = name_to_int(dentry); + tid = name_to_int(&dentry->d_name); if (tid == ~0U) goto out; diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 0788d093f5d8..955bb55fab8c 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -206,7 +206,7 @@ static struct dentry *proc_lookupfd_common(struct inode *dir, { struct task_struct *task = get_proc_task(dir); int result = -ENOENT; - unsigned fd = name_to_int(dentry); + unsigned fd = name_to_int(&dentry->d_name); if (!task) goto out_no_task; diff --git a/fs/proc/generic.c b/fs/proc/generic.c index b7f268eb5f45..317b72641ebf 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -27,7 +27,7 @@ #include "internal.h" -DEFINE_SPINLOCK(proc_subdir_lock); +static DEFINE_SPINLOCK(proc_subdir_lock); static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de) { @@ -330,28 +330,28 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, nlink_t nlink) { struct proc_dir_entry *ent = NULL; - const char *fn = name; - unsigned int len; - - /* make sure name is valid */ - if (!name || !strlen(name)) - goto out; + const char *fn; + struct qstr qstr; if (xlate_proc_name(name, parent, &fn) != 0) goto out; + qstr.name = fn; + qstr.len = strlen(fn); + if (qstr.len == 0 || qstr.len >= 256) { + WARN(1, "name len %u\n", qstr.len); + return NULL; + } + if (*parent == &proc_root && name_to_int(&qstr) != ~0U) { + WARN(1, "create '/proc/%s' by hand\n", qstr.name); + return NULL; + } - /* At this point there must not be any '/' characters beyond *fn */ - if (strchr(fn, '/')) - goto out; - - len = strlen(fn); - - ent = kzalloc(sizeof(struct proc_dir_entry) + len + 1, GFP_KERNEL); + ent = kzalloc(sizeof(struct proc_dir_entry) + qstr.len + 1, GFP_KERNEL); if (!ent) goto out; - memcpy(ent->name, fn, len + 1); - ent->namelen = len; + memcpy(ent->name, fn, qstr.len + 1); + ent->namelen = qstr.len; ent->mode = mode; ent->nlink = nlink; atomic_set(&ent->count, 1); diff --git 
a/fs/proc/inode.c b/fs/proc/inode.c index 0adbc02d60e3..333080d7a671 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -442,6 +442,7 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) int proc_fill_super(struct super_block *s) { struct inode *root_inode; + int ret; s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC; s->s_blocksize = 1024; @@ -463,5 +464,9 @@ int proc_fill_super(struct super_block *s) return -ENOMEM; } - return proc_setup_self(s); + ret = proc_setup_self(s); + if (ret) { + return ret; + } + return proc_setup_thread_self(s); } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 3ab6d14e71c5..aa7a0ee182e1 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -52,7 +52,6 @@ struct proc_dir_entry { union proc_op { int (*proc_get_link)(struct dentry *, struct path *); - int (*proc_read)(struct task_struct *task, char *page); int (*proc_show)(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task); @@ -112,10 +111,10 @@ static inline int task_dumpable(struct task_struct *task) return 0; } -static inline unsigned name_to_int(struct dentry *dentry) +static inline unsigned name_to_int(const struct qstr *qstr) { - const char *name = dentry->d_name.name; - int len = dentry->d_name.len; + const char *name = qstr->name; + int len = qstr->len; unsigned n = 0; if (len > 1 && *name == '0') @@ -178,8 +177,6 @@ extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, i /* * generic.c */ -extern spinlock_t proc_subdir_lock; - extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int); extern struct dentry *proc_lookup_de(struct proc_dir_entry *, struct inode *, struct dentry *); @@ -234,6 +231,12 @@ static inline int proc_net_init(void) { return 0; } extern int proc_setup_self(struct super_block *); /* + * proc_thread_self.c + */ +extern int proc_setup_thread_self(struct super_block *); +extern void proc_thread_self_init(void); + +/* * proc_sysctl.c */ #ifdef CONFIG_PROC_SYSCTL @@ -265,8 +268,9 @@ extern int proc_remount(struct super_block *, int *, char *); * task_[no]mmu.c */ struct proc_maps_private { - struct pid *pid; + struct inode *inode; struct task_struct *task; + struct mm_struct *mm; #ifdef CONFIG_MMU struct vm_area_struct *tail_vma; #endif @@ -275,6 +279,8 @@ struct proc_maps_private { #endif }; +struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode); + extern const struct file_operations proc_pid_maps_operations; extern const struct file_operations proc_tid_maps_operations; extern const struct file_operations proc_pid_numa_maps_operations; diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 39e6ef32f0bd..91a4e6426321 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -172,7 +172,7 @@ get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head) start = ((unsigned long)pfn_to_page(pfn)) & PAGE_MASK; end = ((unsigned long)pfn_to_page(pfn + nr_pages)) - 1; - end = ALIGN(end, PAGE_SIZE); + end = PAGE_ALIGN(end); /* overlap check (because we have to align page */ list_for_each_entry(tmp, head, list) { if (tmp->type != KCORE_VMEMMAP) @@ -610,8 +610,10 @@ static void __init proc_kcore_text_init(void) struct kcore_list kcore_modules; static void __init add_modules_range(void) { - kclist_add(&kcore_modules, (void *)MODULES_VADDR, + if (MODULES_VADDR != VMALLOC_START && MODULES_END != VMALLOC_END) { + kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_END - MODULES_VADDR, KCORE_VMALLOC); + } } #else static void __init 
add_modules_range(void) diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 7445af0b1aa3..aa1eee06420f 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -168,7 +168,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(global_page_state(NR_WRITEBACK)), K(global_page_state(NR_ANON_PAGES)), K(global_page_state(NR_FILE_MAPPED)), - K(global_page_state(NR_SHMEM)), + K(i.sharedram), K(global_page_state(NR_SLAB_RECLAIMABLE) + global_page_state(NR_SLAB_UNRECLAIMABLE)), K(global_page_state(NR_SLAB_RECLAIMABLE)), diff --git a/fs/proc/page.c b/fs/proc/page.c index e647c55275d9..1e3187da1fed 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -133,6 +133,9 @@ u64 stable_page_flags(struct page *page) if (PageBuddy(page)) u |= 1 << KPF_BUDDY; + if (PageBalloon(page)) + u |= 1 << KPF_BALLOON; + u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 4677bb7dc7c2..a63af3e0a612 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -113,9 +113,11 @@ static struct net *get_proc_task_net(struct inode *dir) rcu_read_lock(); task = pid_task(proc_pid(dir), PIDTYPE_PID); if (task != NULL) { - ns = task_nsproxy(task); + task_lock(task); + ns = task->nsproxy; if (ns != NULL) net = get_net(ns->net_ns); + task_unlock(task); } rcu_read_unlock(); diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 71290463a1d3..f92d5dd578a4 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -632,7 +632,7 @@ out: return ret; } -static int scan(struct ctl_table_header *head, ctl_table *table, +static int scan(struct ctl_table_header *head, struct ctl_table *table, unsigned long *pos, struct file *file, struct dir_context *ctx) { diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c index cb761f010300..15f327bed8c6 100644 --- a/fs/proc/proc_tty.c +++ b/fs/proc/proc_tty.c @@ -18,7 +18,7 @@ /* * The /proc/tty directory inodes... 
*/ -static struct proc_dir_entry *proc_tty_ldisc, *proc_tty_driver; +static struct proc_dir_entry *proc_tty_driver; /* * This is the handler for /proc/tty/drivers @@ -176,7 +176,7 @@ void __init proc_tty_init(void) { if (!proc_mkdir("tty", NULL)) return; - proc_tty_ldisc = proc_mkdir("tty/ldisc", NULL); + proc_mkdir("tty/ldisc", NULL); /* Preserved: it's userspace visible */ /* * /proc/tty/driver/serial reveals the exact character counts for * serial links which is just too easy to abuse for inferring diff --git a/fs/proc/root.c b/fs/proc/root.c index 5dbadecb234d..094e44d4a6be 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -149,6 +149,8 @@ static void proc_kill_sb(struct super_block *sb) ns = (struct pid_namespace *)sb->s_fs_info; if (ns->proc_self) dput(ns->proc_self); + if (ns->proc_thread_self) + dput(ns->proc_thread_self); kill_anon_super(sb); put_pid_ns(ns); } @@ -170,6 +172,7 @@ void __init proc_root_init(void) return; proc_self_init(); + proc_thread_self_init(); proc_symlink("mounts", NULL, "self/mounts"); proc_net_init(); @@ -199,10 +202,10 @@ static int proc_root_getattr(struct vfsmount *mnt, struct dentry *dentry, struct static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags) { - if (!proc_lookup(dir, dentry, flags)) + if (!proc_pid_lookup(dir, dentry, flags)) return NULL; - return proc_pid_lookup(dir, dentry, flags); + return proc_lookup(dir, dentry, flags); } static int proc_root_readdir(struct file *file, struct dir_context *ctx) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index cfa63ee92c96..b7a7dc963a35 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -87,32 +87,14 @@ unsigned long task_statm(struct mm_struct *mm, #ifdef CONFIG_NUMA /* - * These functions are for numa_maps but called in generic **maps seq_file - * ->start(), ->stop() ops. - * - * numa_maps scans all vmas under mmap_sem and checks their mempolicy. - * Each mempolicy object is controlled by reference counting. The problem here - * is how to avoid accessing dead mempolicy object. - * - * Because we're holding mmap_sem while reading seq_file, it's safe to access - * each vma's mempolicy, no vma objects will never drop refs to mempolicy. - * - * A task's mempolicy (task->mempolicy) has different behavior. task->mempolicy - * is set and replaced under mmap_sem but unrefed and cleared under task_lock(). - * So, without task_lock(), we cannot trust get_vma_policy() because we cannot - * gurantee the task never exits under us. But taking task_lock() around - * get_vma_plicy() causes lock order problem. - * - * To access task->mempolicy without lock, we hold a reference count of an - * object pointed by task->mempolicy and remember it. This will guarantee - * that task->mempolicy points to an alive object or NULL in numa_maps accesses. + * Save get_task_policy() for show_numa_map(). 
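
[Note: the task_mmu.c rewrite below drops the old per-read task/mm juggling and keeps one resume hint: after each record is copied out, m_cache_vma() stores the VMA's start address in m->version (or -1 at the end), so the next read can restart with a single find_vma() instead of replaying *pos list steps. The idea in miniature, over a sorted array instead of a VMA list (assumed simplification):]

#include <stddef.h>
#include <stdio.h>

static unsigned long starts[] = { 0x1000, 0x5000, 0x9000 };

/* find_vma() analogue: first entry at or above addr, or NULL */
static unsigned long *find_ge(unsigned long addr)
{
	for (size_t i = 0; i < sizeof(starts) / sizeof(starts[0]); i++)
		if (starts[i] >= addr)
			return &starts[i];
	return NULL;
}

int main(void)
{
	unsigned long version = 0;	/* like m->version: 0 = start over */
	unsigned long *v;

	while ((v = find_ge(version)) != NULL) {
		printf("record at %#lx\n", *v);
		version = *v + 1;	/* like m_cache_vma(): resume hint */
	}
	return 0;
}
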
*/ static void hold_task_mempolicy(struct proc_maps_private *priv) { struct task_struct *task = priv->task; task_lock(task); - priv->task_mempolicy = task->mempolicy; + priv->task_mempolicy = get_task_policy(task); mpol_get(priv->task_mempolicy); task_unlock(task); } @@ -129,124 +111,154 @@ static void release_task_mempolicy(struct proc_maps_private *priv) } #endif -static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma) +static void vma_stop(struct proc_maps_private *priv) { - if (vma && vma != priv->tail_vma) { - struct mm_struct *mm = vma->vm_mm; - release_task_mempolicy(priv); - up_read(&mm->mmap_sem); - mmput(mm); - } + struct mm_struct *mm = priv->mm; + + release_task_mempolicy(priv); + up_read(&mm->mmap_sem); + mmput(mm); +} + +static struct vm_area_struct * +m_next_vma(struct proc_maps_private *priv, struct vm_area_struct *vma) +{ + if (vma == priv->tail_vma) + return NULL; + return vma->vm_next ?: priv->tail_vma; } -static void *m_start(struct seq_file *m, loff_t *pos) +static void m_cache_vma(struct seq_file *m, struct vm_area_struct *vma) +{ + if (m->count < m->size) /* vma is copied successfully */ + m->version = m_next_vma(m->private, vma) ? vma->vm_start : -1UL; +} + +static void *m_start(struct seq_file *m, loff_t *ppos) { struct proc_maps_private *priv = m->private; unsigned long last_addr = m->version; struct mm_struct *mm; - struct vm_area_struct *vma, *tail_vma = NULL; - loff_t l = *pos; - - /* Clear the per syscall fields in priv */ - priv->task = NULL; - priv->tail_vma = NULL; - - /* - * We remember last_addr rather than next_addr to hit with - * vmacache most of the time. We have zero last_addr at - * the beginning and also after lseek. We will have -1 last_addr - * after the end of the vmas. - */ + struct vm_area_struct *vma; + unsigned int pos = *ppos; + /* See m_cache_vma(). Zero at the start or after lseek. */ if (last_addr == -1UL) return NULL; - priv->task = get_pid_task(priv->pid, PIDTYPE_PID); + priv->task = get_proc_task(priv->inode); if (!priv->task) return ERR_PTR(-ESRCH); - mm = mm_access(priv->task, PTRACE_MODE_READ); - if (!mm || IS_ERR(mm)) - return mm; - down_read(&mm->mmap_sem); + mm = priv->mm; + if (!mm || !atomic_inc_not_zero(&mm->mm_users)) + return NULL; - tail_vma = get_gate_vma(priv->task->mm); - priv->tail_vma = tail_vma; + down_read(&mm->mmap_sem); hold_task_mempolicy(priv); - /* Start with last addr hint */ - vma = find_vma(mm, last_addr); - if (last_addr && vma) { - vma = vma->vm_next; - goto out; + priv->tail_vma = get_gate_vma(mm); + + if (last_addr) { + vma = find_vma(mm, last_addr); + if (vma && (vma = m_next_vma(priv, vma))) + return vma; } - /* - * Check the vma index is within the range and do - * sequential scan until m_index. - */ - vma = NULL; - if ((unsigned long)l < mm->map_count) { - vma = mm->mmap; - while (l-- && vma) + m->version = 0; + if (pos < mm->map_count) { + for (vma = mm->mmap; pos; pos--) { + m->version = vma->vm_start; vma = vma->vm_next; - goto out; + } + return vma; } - if (l != mm->map_count) - tail_vma = NULL; /* After gate vma */ + /* we do not bother to update m->version in this case */ + if (pos == mm->map_count && priv->tail_vma) + return priv->tail_vma; -out: - if (vma) - return vma; - - release_task_mempolicy(priv); - /* End of vmas has been reached */ - m->version = (tail_vma != NULL)? 
0: -1UL; - up_read(&mm->mmap_sem); - mmput(mm); - return tail_vma; + vma_stop(priv); + return NULL; } static void *m_next(struct seq_file *m, void *v, loff_t *pos) { struct proc_maps_private *priv = m->private; - struct vm_area_struct *vma = v; - struct vm_area_struct *tail_vma = priv->tail_vma; + struct vm_area_struct *next; (*pos)++; - if (vma && (vma != tail_vma) && vma->vm_next) - return vma->vm_next; - vma_stop(priv, vma); - return (vma != tail_vma)? tail_vma: NULL; + next = m_next_vma(priv, v); + if (!next) + vma_stop(priv); + return next; } static void m_stop(struct seq_file *m, void *v) { struct proc_maps_private *priv = m->private; - struct vm_area_struct *vma = v; - if (!IS_ERR(vma)) - vma_stop(priv, vma); - if (priv->task) + if (!IS_ERR_OR_NULL(v)) + vma_stop(priv); + if (priv->task) { put_task_struct(priv->task); + priv->task = NULL; + } +} + +static int proc_maps_open(struct inode *inode, struct file *file, + const struct seq_operations *ops, int psize) +{ + struct proc_maps_private *priv = __seq_open_private(file, ops, psize); + + if (!priv) + return -ENOMEM; + + priv->inode = inode; + priv->mm = proc_mem_open(inode, PTRACE_MODE_READ); + if (IS_ERR(priv->mm)) { + int err = PTR_ERR(priv->mm); + + seq_release_private(inode, file); + return err; + } + + return 0; +} + +static int proc_map_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct proc_maps_private *priv = seq->private; + + if (priv->mm) + mmdrop(priv->mm); + + return seq_release_private(inode, file); } static int do_maps_open(struct inode *inode, struct file *file, const struct seq_operations *ops) { - struct proc_maps_private *priv; - int ret = -ENOMEM; - priv = kzalloc(sizeof(*priv), GFP_KERNEL); - if (priv) { - priv->pid = proc_pid(inode); - ret = seq_open(file, ops); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = priv; - } else { - kfree(priv); - } + return proc_maps_open(inode, file, ops, + sizeof(struct proc_maps_private)); +} + +static pid_t pid_of_stack(struct proc_maps_private *priv, + struct vm_area_struct *vma, bool is_pid) +{ + struct inode *inode = priv->inode; + struct task_struct *task; + pid_t ret = 0; + + rcu_read_lock(); + task = pid_task(proc_pid(inode), PIDTYPE_PID); + if (task) { + task = task_of_stack(task, vma, is_pid); + if (task) + ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info); } + rcu_read_unlock(); + return ret; } @@ -256,7 +268,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) struct mm_struct *mm = vma->vm_mm; struct file *file = vma->vm_file; struct proc_maps_private *priv = m->private; - struct task_struct *task = priv->task; vm_flags_t flags = vma->vm_flags; unsigned long ino = 0; unsigned long long pgoff = 0; @@ -321,8 +332,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) goto done; } - tid = vm_is_stack(task, vma, is_pid); - + tid = pid_of_stack(priv, vma, is_pid); if (tid != 0) { /* * Thread stack in /proc/PID/task/TID/maps or @@ -349,15 +359,8 @@ done: static int show_map(struct seq_file *m, void *v, int is_pid) { - struct vm_area_struct *vma = v; - struct proc_maps_private *priv = m->private; - struct task_struct *task = priv->task; - - show_map_vma(m, vma, is_pid); - - if (m->count < m->size) /* vma is copied successfully */ - m->version = (vma != get_gate_vma(task->mm)) - ? 
vma->vm_start : 0; + show_map_vma(m, v, is_pid); + m_cache_vma(m, v); return 0; } @@ -399,14 +402,14 @@ const struct file_operations proc_pid_maps_operations = { .open = pid_maps_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = proc_map_release, }; const struct file_operations proc_tid_maps_operations = { .open = tid_maps_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = proc_map_release, }; /* @@ -583,8 +586,6 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) static int show_smap(struct seq_file *m, void *v, int is_pid) { - struct proc_maps_private *priv = m->private; - struct task_struct *task = priv->task; struct vm_area_struct *vma = v; struct mem_size_stats mss; struct mm_walk smaps_walk = { @@ -637,10 +638,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) mss.nonlinear >> 10); show_smap_vma_flags(m, vma); - - if (m->count < m->size) /* vma is copied successfully */ - m->version = (vma != get_gate_vma(task->mm)) - ? vma->vm_start : 0; + m_cache_vma(m, vma); return 0; } @@ -682,14 +680,14 @@ const struct file_operations proc_pid_smaps_operations = { .open = pid_smaps_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = proc_map_release, }; const struct file_operations proc_tid_smaps_operations = { .open = tid_smaps_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = proc_map_release, }; /* @@ -925,15 +923,39 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, struct mm_walk *walk) { struct pagemapread *pm = walk->private; - unsigned long addr; + unsigned long addr = start; int err = 0; - pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); - for (addr = start; addr < end; addr += PAGE_SIZE) { - err = add_to_pagemap(addr, &pme, pm); - if (err) + while (addr < end) { + struct vm_area_struct *vma = find_vma(walk->mm, addr); + pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); + /* End of address space hole, which we mark as non-present. */ + unsigned long hole_end; + + if (vma) + hole_end = min(end, vma->vm_start); + else + hole_end = end; + + for (; addr < hole_end; addr += PAGE_SIZE) { + err = add_to_pagemap(addr, &pme, pm); + if (err) + goto out; + } + + if (!vma) break; + + /* Addresses in the VMA. 
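
[Note: the rewritten pagemap walkers below alternate two inner loops per iteration: one emits not-present entries up to the next VMA's start, the other emits entries for addresses inside the VMA, with VM_SOFTDIRTY applying to the whole VMA. A compressed userspace model of that control flow, illustrative only:]

#include <stdio.h>

struct vma { unsigned long start, end; int soft_dirty; };

static struct vma vmas[] = {
	{ 0x2000, 0x4000, 1 },
	{ 0x7000, 0x8000, 0 },
};

/* find_vma() analogue: first vma whose end is above addr, or NULL */
static struct vma *find_vma_above(unsigned long addr)
{
	for (unsigned int i = 0; i < sizeof(vmas) / sizeof(vmas[0]); i++)
		if (vmas[i].end > addr)
			return &vmas[i];
	return NULL;
}

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

static void walk(unsigned long addr, unsigned long end)
{
	while (addr < end) {
		struct vma *vma = find_vma_above(addr);
		unsigned long hole_end = vma ? min_ul(end, vma->start) : end;

		/* hole before (or instead of) the next vma: not present */
		for (; addr < hole_end; addr += 0x1000)
			printf("%#lx not-present\n", addr);
		if (!vma)
			break;
		/* addresses inside the vma; soft-dirty covers all of them */
		for (; addr < min_ul(end, vma->end); addr += 0x1000)
			printf("%#lx mapped%s\n", addr,
			       vma->soft_dirty ? " soft-dirty" : "");
	}
}

int main(void)
{
	walk(0x0, 0x9000);
	return 0;
}
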
*/ + if (vma->vm_flags & VM_SOFTDIRTY) + pme.pme |= PM_STATUS2(pm->v2, __PM_SOFT_DIRTY); + for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) { + err = add_to_pagemap(addr, &pme, pm); + if (err) + goto out; + } } +out: return err; } @@ -1005,7 +1027,6 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, spinlock_t *ptl; pte_t *pte; int err = 0; - pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); /* find the first VMA at or above 'addr' */ vma = find_vma(walk->mm, addr); @@ -1019,6 +1040,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, for (; addr != end; addr += PAGE_SIZE) { unsigned long offset; + pagemap_entry_t pme; offset = (addr & ~PAGEMAP_WALK_MASK) >> PAGE_SHIFT; @@ -1033,32 +1055,51 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (pmd_trans_unstable(pmd)) return 0; - for (; addr != end; addr += PAGE_SIZE) { - int flags2; - - /* check to see if we've left 'vma' behind - * and need a new, higher one */ - if (vma && (addr >= vma->vm_end)) { - vma = find_vma(walk->mm, addr); - if (vma && (vma->vm_flags & VM_SOFTDIRTY)) - flags2 = __PM_SOFT_DIRTY; - else - flags2 = 0; - pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2)); + + while (1) { + /* End of address space hole, which we mark as non-present. */ + unsigned long hole_end; + + if (vma) + hole_end = min(end, vma->vm_start); + else + hole_end = end; + + for (; addr < hole_end; addr += PAGE_SIZE) { + pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); + + err = add_to_pagemap(addr, &pme, pm); + if (err) + return err; } - /* check that 'vma' actually covers this address, - * and that it isn't a huge page vma */ - if (vma && (vma->vm_start <= addr) && - !is_vm_hugetlb_page(vma)) { + if (!vma || vma->vm_start >= end) + break; + /* + * We can't possibly be in a hugetlb VMA. In general, + * for a mm_walk with a pmd_entry and a hugetlb_entry, + * the pmd_entry can only be called on addresses in a + * hugetlb if the walk starts in a non-hugetlb VMA and + * spans a hugepage VMA. Since pagemap_read walks are + * PMD-sized and PMD-aligned, this will never be true. + */ + BUG_ON(is_vm_hugetlb_page(vma)); + + /* Addresses in the VMA. 
*/ + for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) { + pagemap_entry_t pme; pte = pte_offset_map(pmd, addr); pte_to_pagemap_entry(&pme, pm, vma, addr, *pte); - /* unmap before userspace copy */ pte_unmap(pte); + err = add_to_pagemap(addr, &pme, pm); + if (err) + return err; } - err = add_to_pagemap(addr, &pme, pm); - if (err) - return err; + + if (addr == end) + break; + + vma = find_vma(walk->mm, addr); } cond_resched(); @@ -1391,7 +1432,6 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) struct vm_area_struct *vma = v; struct numa_maps *md = &numa_priv->md; struct file *file = vma->vm_file; - struct task_struct *task = proc_priv->task; struct mm_struct *mm = vma->vm_mm; struct mm_walk walk = {}; struct mempolicy *pol; @@ -1411,9 +1451,13 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) walk.private = md; walk.mm = mm; - pol = get_vma_policy(task, vma, vma->vm_start); - mpol_to_str(buffer, sizeof(buffer), pol); - mpol_cond_put(pol); + pol = __get_vma_policy(vma, vma->vm_start); + if (pol) { + mpol_to_str(buffer, sizeof(buffer), pol); + mpol_cond_put(pol); + } else { + mpol_to_str(buffer, sizeof(buffer), proc_priv->task_mempolicy); + } seq_printf(m, "%08lx %s", vma->vm_start, buffer); @@ -1423,7 +1467,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { seq_puts(m, " heap"); } else { - pid_t tid = vm_is_stack(task, vma, is_pid); + pid_t tid = pid_of_stack(proc_priv, vma, is_pid); if (tid != 0) { /* * Thread stack in /proc/PID/task/TID/maps or @@ -1471,9 +1515,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) seq_printf(m, " N%d=%lu", nid, md->node[nid]); out: seq_putc(m, '\n'); - - if (m->count < m->size) - m->version = (vma != proc_priv->tail_vma) ? 
vma->vm_start : 0; + m_cache_vma(m, vma); return 0; } @@ -1504,20 +1546,8 @@ static const struct seq_operations proc_tid_numa_maps_op = { static int numa_maps_open(struct inode *inode, struct file *file, const struct seq_operations *ops) { - struct numa_maps_private *priv; - int ret = -ENOMEM; - priv = kzalloc(sizeof(*priv), GFP_KERNEL); - if (priv) { - priv->proc_maps.pid = proc_pid(inode); - ret = seq_open(file, ops); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = priv; - } else { - kfree(priv); - } - } - return ret; + return proc_maps_open(inode, file, ops, + sizeof(struct numa_maps_private)); } static int pid_numa_maps_open(struct inode *inode, struct file *file) @@ -1534,13 +1564,13 @@ const struct file_operations proc_pid_numa_maps_operations = { .open = pid_numa_maps_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = proc_map_release, }; const struct file_operations proc_tid_numa_maps_operations = { .open = tid_numa_maps_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = proc_map_release, }; #endif /* CONFIG_NUMA */ diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 678455d2d683..599ec2e20104 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -123,6 +123,25 @@ unsigned long task_statm(struct mm_struct *mm, return size; } +static pid_t pid_of_stack(struct proc_maps_private *priv, + struct vm_area_struct *vma, bool is_pid) +{ + struct inode *inode = priv->inode; + struct task_struct *task; + pid_t ret = 0; + + rcu_read_lock(); + task = pid_task(proc_pid(inode), PIDTYPE_PID); + if (task) { + task = task_of_stack(task, vma, is_pid); + if (task) + ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info); + } + rcu_read_unlock(); + + return ret; +} + /* * display a single VMA to a sequenced file */ @@ -163,7 +182,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma, seq_pad(m, ' '); seq_path(m, &file->f_path, ""); } else if (mm) { - pid_t tid = vm_is_stack(priv->task, vma, is_pid); + pid_t tid = pid_of_stack(priv, vma, is_pid); if (tid != 0) { seq_pad(m, ' '); @@ -212,22 +231,22 @@ static void *m_start(struct seq_file *m, loff_t *pos) loff_t n = *pos; /* pin the task and mm whilst we play with them */ - priv->task = get_pid_task(priv->pid, PIDTYPE_PID); + priv->task = get_proc_task(priv->inode); if (!priv->task) return ERR_PTR(-ESRCH); - mm = mm_access(priv->task, PTRACE_MODE_READ); - if (!mm || IS_ERR(mm)) { - put_task_struct(priv->task); - priv->task = NULL; - return mm; - } - down_read(&mm->mmap_sem); + mm = priv->mm; + if (!mm || !atomic_inc_not_zero(&mm->mm_users)) + return NULL; + down_read(&mm->mmap_sem); /* start from the Nth VMA */ for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) if (n-- == 0) return p; + + up_read(&mm->mmap_sem); + mmput(mm); return NULL; } @@ -235,11 +254,13 @@ static void m_stop(struct seq_file *m, void *_vml) { struct proc_maps_private *priv = m->private; + if (!IS_ERR_OR_NULL(_vml)) { + up_read(&priv->mm->mmap_sem); + mmput(priv->mm); + } if (priv->task) { - struct mm_struct *mm = priv->task->mm; - up_read(&mm->mmap_sem); - mmput(mm); put_task_struct(priv->task); + priv->task = NULL; } } @@ -269,20 +290,33 @@ static int maps_open(struct inode *inode, struct file *file, const struct seq_operations *ops) { struct proc_maps_private *priv; - int ret = -ENOMEM; - - priv = kzalloc(sizeof(*priv), GFP_KERNEL); - if (priv) { - priv->pid = proc_pid(inode); - ret = seq_open(file, ops); - if (!ret) { - struct seq_file *m 
= file->private_data; - m->private = priv; - } else { - kfree(priv); - } + + priv = __seq_open_private(file, ops, sizeof(*priv)); + if (!priv) + return -ENOMEM; + + priv->inode = inode; + priv->mm = proc_mem_open(inode, PTRACE_MODE_READ); + if (IS_ERR(priv->mm)) { + int err = PTR_ERR(priv->mm); + + seq_release_private(inode, file); + return err; } - return ret; + + return 0; +} + + +static int map_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct proc_maps_private *priv = seq->private; + + if (priv->mm) + mmdrop(priv->mm); + + return seq_release_private(inode, file); } static int pid_maps_open(struct inode *inode, struct file *file) @@ -299,13 +333,13 @@ const struct file_operations proc_pid_maps_operations = { .open = pid_maps_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = map_release, }; const struct file_operations proc_tid_maps_operations = { .open = tid_maps_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = map_release, }; diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c new file mode 100644 index 000000000000..59075b509df3 --- /dev/null +++ b/fs/proc/thread_self.c @@ -0,0 +1,85 @@ +#include <linux/sched.h> +#include <linux/namei.h> +#include <linux/slab.h> +#include <linux/pid_namespace.h> +#include "internal.h" + +/* + * /proc/thread_self: + */ +static int proc_thread_self_readlink(struct dentry *dentry, char __user *buffer, + int buflen) +{ + struct pid_namespace *ns = dentry->d_sb->s_fs_info; + pid_t tgid = task_tgid_nr_ns(current, ns); + pid_t pid = task_pid_nr_ns(current, ns); + char tmp[PROC_NUMBUF + 6 + PROC_NUMBUF]; + if (!pid) + return -ENOENT; + sprintf(tmp, "%d/task/%d", tgid, pid); + return readlink_copy(buffer, buflen, tmp); +} + +static void *proc_thread_self_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct pid_namespace *ns = dentry->d_sb->s_fs_info; + pid_t tgid = task_tgid_nr_ns(current, ns); + pid_t pid = task_pid_nr_ns(current, ns); + char *name = ERR_PTR(-ENOENT); + if (pid) { + name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF, GFP_KERNEL); + if (!name) + name = ERR_PTR(-ENOMEM); + else + sprintf(name, "%d/task/%d", tgid, pid); + } + nd_set_link(nd, name); + return NULL; +} + +static const struct inode_operations proc_thread_self_inode_operations = { + .readlink = proc_thread_self_readlink, + .follow_link = proc_thread_self_follow_link, + .put_link = kfree_put_link, +}; + +static unsigned thread_self_inum; + +int proc_setup_thread_self(struct super_block *s) +{ + struct inode *root_inode = s->s_root->d_inode; + struct pid_namespace *ns = s->s_fs_info; + struct dentry *thread_self; + + mutex_lock(&root_inode->i_mutex); + thread_self = d_alloc_name(s->s_root, "thread-self"); + if (thread_self) { + struct inode *inode = new_inode_pseudo(s); + if (inode) { + inode->i_ino = thread_self_inum; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_mode = S_IFLNK | S_IRWXUGO; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + inode->i_op = &proc_thread_self_inode_operations; + d_add(thread_self, inode); + } else { + dput(thread_self); + thread_self = ERR_PTR(-ENOMEM); + } + } else { + thread_self = ERR_PTR(-ENOMEM); + } + mutex_unlock(&root_inode->i_mutex); + if (IS_ERR(thread_self)) { + pr_err("proc_fill_super: can't allocate /proc/thread_self\n"); + return PTR_ERR(thread_self); + } + ns->proc_thread_self = thread_self; + return 0; +} + +void __init 
proc_thread_self_init(void) +{ + proc_alloc_inum(&thread_self_inum); +} diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 382aa890e228..a90d6d354199 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -328,6 +328,82 @@ static inline char *alloc_elfnotes_buf(size_t notes_sz) * virtually contiguous user-space in ELF layout. */ #ifdef CONFIG_MMU +/* + * remap_oldmem_pfn_checked - do remap_oldmem_pfn_range replacing all pages + * reported as not being ram with the zero page. + * + * @vma: vm_area_struct describing requested mapping + * @from: start remapping from + * @pfn: page frame number to start remapping to + * @size: remapping size + * @prot: protection bits + * + * Returns zero on success, -EAGAIN on failure. + */ +static int remap_oldmem_pfn_checked(struct vm_area_struct *vma, + unsigned long from, unsigned long pfn, + unsigned long size, pgprot_t prot) +{ + unsigned long map_size; + unsigned long pos_start, pos_end, pos; + unsigned long zeropage_pfn = my_zero_pfn(0); + size_t len = 0; + + pos_start = pfn; + pos_end = pfn + (size >> PAGE_SHIFT); + + for (pos = pos_start; pos < pos_end; ++pos) { + if (!pfn_is_ram(pos)) { + /* + * We hit a page which is not ram. Remap the continuous + * region between pos_start and pos-1 and replace + * the non-ram page at pos with the zero page. + */ + if (pos > pos_start) { + /* Remap continuous region */ + map_size = (pos - pos_start) << PAGE_SHIFT; + if (remap_oldmem_pfn_range(vma, from + len, + pos_start, map_size, + prot)) + goto fail; + len += map_size; + } + /* Remap the zero page */ + if (remap_oldmem_pfn_range(vma, from + len, + zeropage_pfn, + PAGE_SIZE, prot)) + goto fail; + len += PAGE_SIZE; + pos_start = pos + 1; + } + } + if (pos > pos_start) { + /* Remap the rest */ + map_size = (pos - pos_start) << PAGE_SHIFT; + if (remap_oldmem_pfn_range(vma, from + len, pos_start, + map_size, prot)) + goto fail; + } + return 0; +fail: + do_munmap(vma->vm_mm, from, len); + return -EAGAIN; +} + +static int vmcore_remap_oldmem_pfn(struct vm_area_struct *vma, + unsigned long from, unsigned long pfn, + unsigned long size, pgprot_t prot) +{ + /* + * Check if oldmem_pfn_is_ram was registered to avoid + * looping over all pages without a reason. 
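 * For example (illustrative numbers, not from the patch): a request covering pfns 100..104 where only pfn 102 is not RAM becomes three mappings in remap_oldmem_pfn_checked() above: pfns 100-101, the zero page in place of 102, then pfns 103-104; if any step fails, everything mapped so far is torn down with do_munmap() and -EAGAIN is returned.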
+ */ + if (oldmem_pfn_is_ram) + return remap_oldmem_pfn_checked(vma, from, pfn, size, prot); + else + return remap_oldmem_pfn_range(vma, from, pfn, size, prot); +} + static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) { size_t size = vma->vm_end - vma->vm_start; @@ -387,9 +463,9 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) tsz = min_t(size_t, m->offset + m->size - start, size); paddr = m->paddr + start - m->offset; - if (remap_oldmem_pfn_range(vma, vma->vm_start + len, - paddr >> PAGE_SHIFT, tsz, - vma->vm_page_prot)) + if (vmcore_remap_oldmem_pfn(vma, vma->vm_start + len, + paddr >> PAGE_SHIFT, tsz, + vma->vm_page_prot)) goto fail; size -= tsz; start += tsz; diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 1a81373947f3..73ca1740d839 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -232,17 +232,15 @@ static int mounts_open_common(struct inode *inode, struct file *file, if (!task) goto err; - rcu_read_lock(); - nsp = task_nsproxy(task); + task_lock(task); + nsp = task->nsproxy; if (!nsp || !nsp->mnt_ns) { - rcu_read_unlock(); + task_unlock(task); put_task_struct(task); goto err; } ns = nsp->mnt_ns; get_mnt_ns(ns); - rcu_read_unlock(); - task_lock(task); if (!task->fs) { task_unlock(task); put_task_struct(task); diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index 34a1e5aa848c..9d7b9a83699e 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -394,7 +394,7 @@ static void *persistent_ram_vmap(phys_addr_t start, size_t size) prot = pgprot_noncached(PAGE_KERNEL); - pages = kmalloc(sizeof(struct page *) * page_count, GFP_KERNEL); + pages = kmalloc_array(page_count, sizeof(struct page *), GFP_KERNEL); if (!pages) { pr_err("%s: Failed to allocate array for %u pages\n", __func__, page_count); diff --git a/fs/qnx6/Makefile b/fs/qnx6/Makefile index 9dd06199afc9..5e6bae6fae50 100644 --- a/fs/qnx6/Makefile +++ b/fs/qnx6/Makefile @@ -5,3 +5,4 @@ obj-$(CONFIG_QNX6FS_FS) += qnx6.o qnx6-objs := inode.o dir.o namei.o super_mmi.o +ccflags-$(CONFIG_QNX6FS_DEBUG) += -DDEBUG diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c index 15b7d92ed60d..8d64bb5366bf 100644 --- a/fs/qnx6/dir.c +++ b/fs/qnx6/dir.c @@ -77,21 +77,20 @@ static int qnx6_dir_longfilename(struct inode *inode, if (de->de_size != 0xff) { /* error - long filename entries always have size 0xff in direntry */ - printk(KERN_ERR "qnx6: invalid direntry size (%i).\n", - de->de_size); + pr_err("invalid direntry size (%i).\n", de->de_size); return 0; } lf = qnx6_longname(s, de, &page); if (IS_ERR(lf)) { - printk(KERN_ERR "qnx6:Error reading longname\n"); + pr_err("Error reading longname\n"); return 0; } lf_size = fs16_to_cpu(sbi, lf->lf_size); if (lf_size > QNX6_LONG_NAME_MAX) { - QNX6DEBUG((KERN_INFO "file %s\n", lf->lf_fname)); - printk(KERN_ERR "qnx6:Filename too long (%i)\n", lf_size); + pr_debug("file %s\n", lf->lf_fname); + pr_err("Filename too long (%i)\n", lf_size); qnx6_put_page(page); return 0; } @@ -100,10 +99,10 @@ static int qnx6_dir_longfilename(struct inode *inode, mmi 3g filesystem does not have that checksum */ if (!test_opt(s, MMI_FS) && fs32_to_cpu(sbi, de->de_checksum) != qnx6_lfile_checksum(lf->lf_fname, lf_size)) - printk(KERN_INFO "qnx6: long filename checksum error.\n"); + pr_info("long filename checksum error.\n"); - QNX6DEBUG((KERN_INFO "qnx6_readdir:%.*s inode:%u\n", - lf_size, lf->lf_fname, de_inode)); + pr_debug("qnx6_readdir:%.*s inode:%u\n", + lf_size, lf->lf_fname, de_inode); if (!dir_emit(ctx, lf->lf_fname, lf_size, de_inode, DT_UNKNOWN)) 
{ qnx6_put_page(page); return 0; @@ -136,7 +135,7 @@ static int qnx6_readdir(struct file *file, struct dir_context *ctx) int i = start; if (IS_ERR(page)) { - printk(KERN_ERR "qnx6_readdir: read failed\n"); + pr_err("%s(): read failed\n", __func__); ctx->pos = (n + 1) << PAGE_CACHE_SHIFT; return PTR_ERR(page); } @@ -159,9 +158,9 @@ static int qnx6_readdir(struct file *file, struct dir_context *ctx) break; } } else { - QNX6DEBUG((KERN_INFO "qnx6_readdir:%.*s" - " inode:%u\n", size, de->de_fname, - no_inode)); + pr_debug("%s():%.*s inode:%u\n", + __func__, size, de->de_fname, + no_inode); if (!dir_emit(ctx, de->de_fname, size, no_inode, DT_UNKNOWN)) { done = true; @@ -259,8 +258,7 @@ unsigned qnx6_find_entry(int len, struct inode *dir, const char *name, if (ino) goto found; } else - printk(KERN_ERR "qnx6: undefined " - "filename size in inode.\n"); + pr_err("undefined filename size in inode.\n"); } qnx6_put_page(page); } diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 65cdaab3ed49..44e73923670d 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -73,8 +73,8 @@ static int qnx6_get_block(struct inode *inode, sector_t iblock, { unsigned phys; - QNX6DEBUG((KERN_INFO "qnx6: qnx6_get_block inode=[%ld] iblock=[%ld]\n", - inode->i_ino, (unsigned long)iblock)); + pr_debug("qnx6_get_block inode=[%ld] iblock=[%ld]\n", + inode->i_ino, (unsigned long)iblock); phys = qnx6_block_map(inode, iblock); if (phys) { @@ -87,7 +87,7 @@ static int qnx6_get_block(struct inode *inode, sector_t iblock, static int qnx6_check_blockptr(__fs32 ptr) { if (ptr == ~(__fs32)0) { - printk(KERN_ERR "qnx6: hit unused blockpointer.\n"); + pr_err("hit unused blockpointer.\n"); return 0; } return 1; @@ -127,8 +127,7 @@ static unsigned qnx6_block_map(struct inode *inode, unsigned no) levelptr = no >> bitdelta; if (levelptr > QNX6_NO_DIRECT_POINTERS - 1) { - printk(KERN_ERR "qnx6:Requested file block number (%u) too big.", - no); + pr_err("Requested file block number (%u) too big.", no); return 0; } @@ -137,8 +136,7 @@ static unsigned qnx6_block_map(struct inode *inode, unsigned no) for (i = 0; i < depth; i++) { bh = sb_bread(s, block); if (!bh) { - printk(KERN_ERR "qnx6:Error reading block (%u)\n", - block); + pr_err("Error reading block (%u)\n", block); return 0; } bitdelta -= ptrbits; @@ -207,26 +205,16 @@ void qnx6_superblock_debug(struct qnx6_super_block *sb, struct super_block *s) { struct qnx6_sb_info *sbi = QNX6_SB(s); - QNX6DEBUG((KERN_INFO "magic: %08x\n", - fs32_to_cpu(sbi, sb->sb_magic))); - QNX6DEBUG((KERN_INFO "checksum: %08x\n", - fs32_to_cpu(sbi, sb->sb_checksum))); - QNX6DEBUG((KERN_INFO "serial: %llx\n", - fs64_to_cpu(sbi, sb->sb_serial))); - QNX6DEBUG((KERN_INFO "flags: %08x\n", - fs32_to_cpu(sbi, sb->sb_flags))); - QNX6DEBUG((KERN_INFO "blocksize: %08x\n", - fs32_to_cpu(sbi, sb->sb_blocksize))); - QNX6DEBUG((KERN_INFO "num_inodes: %08x\n", - fs32_to_cpu(sbi, sb->sb_num_inodes))); - QNX6DEBUG((KERN_INFO "free_inodes: %08x\n", - fs32_to_cpu(sbi, sb->sb_free_inodes))); - QNX6DEBUG((KERN_INFO "num_blocks: %08x\n", - fs32_to_cpu(sbi, sb->sb_num_blocks))); - QNX6DEBUG((KERN_INFO "free_blocks: %08x\n", - fs32_to_cpu(sbi, sb->sb_free_blocks))); - QNX6DEBUG((KERN_INFO "inode_levels: %02x\n", - sb->Inode.levels)); + pr_debug("magic: %08x\n", fs32_to_cpu(sbi, sb->sb_magic)); + pr_debug("checksum: %08x\n", fs32_to_cpu(sbi, sb->sb_checksum)); + pr_debug("serial: %llx\n", fs64_to_cpu(sbi, sb->sb_serial)); + pr_debug("flags: %08x\n", fs32_to_cpu(sbi, sb->sb_flags)); + pr_debug("blocksize: %08x\n", fs32_to_cpu(sbi, 
sb->sb_blocksize)); + pr_debug("num_inodes: %08x\n", fs32_to_cpu(sbi, sb->sb_num_inodes)); + pr_debug("free_inodes: %08x\n", fs32_to_cpu(sbi, sb->sb_free_inodes)); + pr_debug("num_blocks: %08x\n", fs32_to_cpu(sbi, sb->sb_num_blocks)); + pr_debug("free_blocks: %08x\n", fs32_to_cpu(sbi, sb->sb_free_blocks)); + pr_debug("inode_levels: %02x\n", sb->Inode.levels); } #endif @@ -277,7 +265,7 @@ static struct buffer_head *qnx6_check_first_superblock(struct super_block *s, start with the first superblock */ bh = sb_bread(s, offset); if (!bh) { - printk(KERN_ERR "qnx6: unable to read the first superblock\n"); + pr_err("unable to read the first superblock\n"); return NULL; } sb = (struct qnx6_super_block *)bh->b_data; @@ -285,20 +273,16 @@ static struct buffer_head *qnx6_check_first_superblock(struct super_block *s, sbi->s_bytesex = BYTESEX_BE; if (fs32_to_cpu(sbi, sb->sb_magic) == QNX6_SUPER_MAGIC) { /* we got a big endian fs */ - QNX6DEBUG((KERN_INFO "qnx6: fs got different" - " endianness.\n")); + pr_debug("fs got different endianness.\n"); return bh; } else sbi->s_bytesex = BYTESEX_LE; if (!silent) { if (offset == 0) { - printk(KERN_ERR "qnx6: wrong signature (magic)" - " in superblock #1.\n"); + pr_err("wrong signature (magic) in superblock #1.\n"); } else { - printk(KERN_INFO "qnx6: wrong signature (magic)" - " at position (0x%lx) - will try" - " alternative position (0x0000).\n", - offset * s->s_blocksize); + pr_info("wrong signature (magic) at position (0x%lx) - will try alternative position (0x0000).\n", + offset * s->s_blocksize); } } brelse(bh); @@ -329,13 +313,13 @@ static int qnx6_fill_super(struct super_block *s, void *data, int silent) /* Superblock always is 512 Byte long */ if (!sb_set_blocksize(s, QNX6_SUPERBLOCK_SIZE)) { - printk(KERN_ERR "qnx6: unable to set blocksize\n"); + pr_err("unable to set blocksize\n"); goto outnobh; } /* parse the mount-options */ if (!qnx6_parse_options((char *) data, s)) { - printk(KERN_ERR "qnx6: invalid mount options.\n"); + pr_err("invalid mount options.\n"); goto outnobh; } if (test_opt(s, MMI_FS)) { @@ -355,7 +339,7 @@ static int qnx6_fill_super(struct super_block *s, void *data, int silent) /* try again without bootblock offset */ bh1 = qnx6_check_first_superblock(s, 0, silent); if (!bh1) { - printk(KERN_ERR "qnx6: unable to read the first superblock\n"); + pr_err("unable to read the first superblock\n"); goto outnobh; } /* seems that no bootblock at partition start */ @@ -370,13 +354,13 @@ static int qnx6_fill_super(struct super_block *s, void *data, int silent) /* checksum check - start at byte 8 and end at byte 512 */ if (fs32_to_cpu(sbi, sb1->sb_checksum) != crc32_be(0, (char *)(bh1->b_data + 8), 504)) { - printk(KERN_ERR "qnx6: superblock #1 checksum error\n"); + pr_err("superblock #1 checksum error\n"); goto out; } /* set new blocksize */ if (!sb_set_blocksize(s, fs32_to_cpu(sbi, sb1->sb_blocksize))) { - printk(KERN_ERR "qnx6: unable to set blocksize\n"); + pr_err("unable to set blocksize\n"); goto out; } /* blocksize invalidates bh - pull it back in */ @@ -398,21 +382,20 @@ static int qnx6_fill_super(struct super_block *s, void *data, int silent) /* next the second superblock */ bh2 = sb_bread(s, offset); if (!bh2) { - printk(KERN_ERR "qnx6: unable to read the second superblock\n"); + pr_err("unable to read the second superblock\n"); goto out; } sb2 = (struct qnx6_super_block *)bh2->b_data; if (fs32_to_cpu(sbi, sb2->sb_magic) != QNX6_SUPER_MAGIC) { if (!silent) - printk(KERN_ERR "qnx6: wrong signature (magic)" - " in superblock #2.\n"); + 
pr_err("wrong signature (magic) in superblock #2.\n"); goto out; } /* checksum check - start at byte 8 and end at byte 512 */ if (fs32_to_cpu(sbi, sb2->sb_checksum) != crc32_be(0, (char *)(bh2->b_data + 8), 504)) { - printk(KERN_ERR "qnx6: superblock #2 checksum error\n"); + pr_err("superblock #2 checksum error\n"); goto out; } @@ -422,25 +405,24 @@ static int qnx6_fill_super(struct super_block *s, void *data, int silent) sbi->sb_buf = bh1; sbi->sb = (struct qnx6_super_block *)bh1->b_data; brelse(bh2); - printk(KERN_INFO "qnx6: superblock #1 active\n"); + pr_info("superblock #1 active\n"); } else { /* superblock #2 active */ sbi->sb_buf = bh2; sbi->sb = (struct qnx6_super_block *)bh2->b_data; brelse(bh1); - printk(KERN_INFO "qnx6: superblock #2 active\n"); + pr_info("superblock #2 active\n"); } mmi_success: /* sanity check - limit maximum indirect pointer levels */ if (sb1->Inode.levels > QNX6_PTR_MAX_LEVELS) { - printk(KERN_ERR "qnx6: too many inode levels (max %i, sb %i)\n", - QNX6_PTR_MAX_LEVELS, sb1->Inode.levels); + pr_err("too many inode levels (max %i, sb %i)\n", + QNX6_PTR_MAX_LEVELS, sb1->Inode.levels); goto out; } if (sb1->Longfile.levels > QNX6_PTR_MAX_LEVELS) { - printk(KERN_ERR "qnx6: too many longfilename levels" - " (max %i, sb %i)\n", - QNX6_PTR_MAX_LEVELS, sb1->Longfile.levels); + pr_err("too many longfilename levels (max %i, sb %i)\n", + QNX6_PTR_MAX_LEVELS, sb1->Longfile.levels); goto out; } s->s_op = &qnx6_sops; @@ -460,7 +442,7 @@ mmi_success: /* prefetch root inode */ root = qnx6_iget(s, QNX6_ROOT_INO); if (IS_ERR(root)) { - printk(KERN_ERR "qnx6: get inode failed\n"); + pr_err("get inode failed\n"); ret = PTR_ERR(root); goto out2; } @@ -474,7 +456,7 @@ mmi_success: errmsg = qnx6_checkroot(s); if (errmsg != NULL) { if (!silent) - printk(KERN_ERR "qnx6: %s\n", errmsg); + pr_err("%s\n", errmsg); goto out3; } return 0; @@ -555,8 +537,7 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino) inode->i_mode = 0; if (ino == 0) { - printk(KERN_ERR "qnx6: bad inode number on dev %s: %u is " - "out of range\n", + pr_err("bad inode number on dev %s: %u is out of range\n", sb->s_id, ino); iget_failed(inode); return ERR_PTR(-EIO); @@ -566,8 +547,8 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino) mapping = sbi->inodes->i_mapping; page = read_mapping_page(mapping, n, NULL); if (IS_ERR(page)) { - printk(KERN_ERR "qnx6: major problem: unable to read inode from " - "dev %s\n", sb->s_id); + pr_err("major problem: unable to read inode from dev %s\n", + sb->s_id); iget_failed(inode); return ERR_CAST(page); } @@ -689,7 +670,7 @@ static int __init init_qnx6_fs(void) return err; } - printk(KERN_INFO "QNX6 filesystem 1.0.0 registered.\n"); + pr_info("QNX6 filesystem 1.0.0 registered.\n"); return 0; } diff --git a/fs/qnx6/namei.c b/fs/qnx6/namei.c index 0561326a94f5..6c1a323137dd 100644 --- a/fs/qnx6/namei.c +++ b/fs/qnx6/namei.c @@ -29,12 +29,12 @@ struct dentry *qnx6_lookup(struct inode *dir, struct dentry *dentry, foundinode = qnx6_iget(dir->i_sb, ino); qnx6_put_page(page); if (IS_ERR(foundinode)) { - QNX6DEBUG((KERN_ERR "qnx6: lookup->iget -> " - " error %ld\n", PTR_ERR(foundinode))); + pr_debug("lookup->iget -> error %ld\n", + PTR_ERR(foundinode)); return ERR_CAST(foundinode); } } else { - QNX6DEBUG((KERN_INFO "qnx6_lookup: not found %s\n", name)); + pr_debug("%s(): not found %s\n", __func__, name); return NULL; } d_add(dentry, foundinode); diff --git a/fs/qnx6/qnx6.h b/fs/qnx6/qnx6.h index b00fcc960d37..d3fb2b698800 100644 --- a/fs/qnx6/qnx6.h +++ 
b/fs/qnx6/qnx6.h @@ -10,6 +10,12 @@ * */ +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/fs.h> #include <linux/pagemap.h> @@ -19,12 +25,6 @@ typedef __u64 __bitwise __fs64; #include <linux/qnx6_fs.h> -#ifdef CONFIG_QNX6FS_DEBUG -#define QNX6DEBUG(X) printk X -#else -#define QNX6DEBUG(X) (void) 0 -#endif - struct qnx6_sb_info { struct buffer_head *sb_buf; /* superblock buffer */ struct qnx6_super_block *sb; /* our superblock */ diff --git a/fs/qnx6/super_mmi.c b/fs/qnx6/super_mmi.c index 29c32cba62d6..62aaf3e3126a 100644 --- a/fs/qnx6/super_mmi.c +++ b/fs/qnx6/super_mmi.c @@ -44,15 +44,14 @@ struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, int silent) start with the first superblock */ bh1 = sb_bread(s, 0); if (!bh1) { - printk(KERN_ERR "qnx6: Unable to read first mmi superblock\n"); + pr_err("Unable to read first mmi superblock\n"); return NULL; } sb1 = (struct qnx6_mmi_super_block *)bh1->b_data; sbi = QNX6_SB(s); if (fs32_to_cpu(sbi, sb1->sb_magic) != QNX6_SUPER_MAGIC) { if (!silent) { - printk(KERN_ERR "qnx6: wrong signature (magic) in" - " superblock #1.\n"); + pr_err("wrong signature (magic) in superblock #1.\n"); goto out; } } @@ -60,7 +59,7 @@ struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, int silent) /* checksum check - start at byte 8 and end at byte 512 */ if (fs32_to_cpu(sbi, sb1->sb_checksum) != crc32_be(0, (char *)(bh1->b_data + 8), 504)) { - printk(KERN_ERR "qnx6: superblock #1 checksum error\n"); + pr_err("superblock #1 checksum error\n"); goto out; } @@ -70,7 +69,7 @@ struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, int silent) /* set new blocksize */ if (!sb_set_blocksize(s, fs32_to_cpu(sbi, sb1->sb_blocksize))) { - printk(KERN_ERR "qnx6: unable to set blocksize\n"); + pr_err("unable to set blocksize\n"); goto out; } /* blocksize invalidates bh - pull it back in */ @@ -83,27 +82,26 @@ struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, int silent) /* read second superblock */ bh2 = sb_bread(s, offset); if (!bh2) { - printk(KERN_ERR "qnx6: unable to read the second superblock\n"); + pr_err("unable to read the second superblock\n"); goto out; } sb2 = (struct qnx6_mmi_super_block *)bh2->b_data; if (fs32_to_cpu(sbi, sb2->sb_magic) != QNX6_SUPER_MAGIC) { if (!silent) - printk(KERN_ERR "qnx6: wrong signature (magic) in" - " superblock #2.\n"); + pr_err("wrong signature (magic) in superblock #2.\n"); goto out; } /* checksum check - start at byte 8 and end at byte 512 */ if (fs32_to_cpu(sbi, sb2->sb_checksum) != crc32_be(0, (char *)(bh2->b_data + 8), 504)) { - printk(KERN_ERR "qnx6: superblock #1 checksum error\n"); + pr_err("superblock #1 checksum error\n"); goto out; } qsb = kmalloc(sizeof(*qsb), GFP_KERNEL); if (!qsb) { - printk(KERN_ERR "qnx6: unable to allocate memory.\n"); + pr_err("unable to allocate memory.\n"); goto out; } @@ -119,7 +117,7 @@ struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, int silent) sbi->sb_buf = bh1; sbi->sb = (struct qnx6_super_block *)bh1->b_data; brelse(bh2); - printk(KERN_INFO "qnx6: superblock #1 active\n"); + pr_info("superblock #1 active\n"); } else { /* superblock #2 active */ qnx6_mmi_copy_sb(qsb, sb2); @@ -131,7 +129,7 @@ struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, int silent) sbi->sb_buf = bh2; sbi->sb = (struct qnx6_super_block *)bh2->b_data; brelse(bh1); - printk(KERN_INFO "qnx6: superblock #2 active\n"); + pr_info("superblock #2 active\n"); } kfree(qsb); diff --git 
a/fs/quota/dquot.c b/fs/quota/dquot.c index 7f30bdc57d13..8b663b2d9562 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -96,13 +96,16 @@ * Note that some things (eg. sb pointer, type, id) don't change during * the life of the dquot structure and so needn't be protected by a lock * - * Any operation working on dquots via inode pointers must hold dqptr_sem. If - * operation is just reading pointers from inode (or not using them at all) the - * read lock is enough. If pointers are altered function must hold write lock. + * Operations accessing dquots via inode pointers are protected by dquot_srcu. + * Reading a pointer needs srcu_read_lock(&dquot_srcu), and + * synchronize_srcu(&dquot_srcu) is called after clearing pointers from + * inode and before dropping dquot references to avoid use of dquots after + * they are freed. dq_data_lock is used to serialize the pointer setting and + * clearing operations. * Special care needs to be taken about S_NOQUOTA inode flag (marking that * inode is a quota file). Functions adding pointers from inode to dquots have - * to check this flag under dqptr_sem and then (if S_NOQUOTA is not set) they - * have to do all pointer modifications before dropping dqptr_sem. This makes + * to check this flag under dq_data_lock and then (if S_NOQUOTA is not set) they + * have to do all pointer modifications before dropping dq_data_lock. This makes * sure they cannot race with quotaon which first sets S_NOQUOTA flag and * then drops all pointers to dquots from an inode. * @@ -116,21 +119,15 @@ * spinlock to internal buffers before writing. * * Lock ordering (including related VFS locks) is the following: - * dqonoff_mutex > i_mutex > journal_lock > dqptr_sem > dquot->dq_lock > - * dqio_mutex + * dqonoff_mutex > i_mutex > journal_lock > dquot->dq_lock > dqio_mutex * dqonoff_mutex > i_mutex comes from dquot_quota_sync, dquot_enable, etc. - * The lock ordering of dqptr_sem imposed by quota code is only dqonoff_sem > - * dqptr_sem. But filesystem has to count with the fact that functions such as - * dquot_alloc_space() acquire dqptr_sem and they usually have to be called - * from inside a transaction to keep filesystem consistency after a crash. Also - * filesystems usually want to do some IO on dquot from ->mark_dirty which is - * called with dqptr_sem held. */ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_list_lock); static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_state_lock); __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock); EXPORT_SYMBOL(dq_data_lock); +DEFINE_STATIC_SRCU(dquot_srcu); void __quota_error(struct super_block *sb, const char *func, const char *fmt, ...) @@ -733,7 +730,6 @@ static struct shrinker dqcache_shrinker = { /* * Put reference to dquot - * NOTE: If you change this function please check whether dqput_blocks() works right... */ void dqput(struct dquot *dquot) { @@ -963,46 +959,33 @@ static void add_dquot_ref(struct super_block *sb, int type) } /* - * Return 0 if dqput() won't block. - * (note that 1 doesn't necessarily mean blocking) - */ -static inline int dqput_blocks(struct dquot *dquot) -{ - if (atomic_read(&dquot->dq_count) <= 1) - return 1; - return 0; -} - -/* * Remove references to dquots from inode and add dquot to list for freeing * if we have the last reference to dquot - * We can't race with anybody because we hold dqptr_sem for writing... 
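 * (With dqptr_sem gone, the exclusion here comes from dq_data_lock, which remove_dquot_ref() below takes around each call, plus the synchronize_srcu() in drop_dquot_ref() before the final puts.)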
*/ -static int remove_inode_dquot_ref(struct inode *inode, int type, - struct list_head *tofree_head) +static void remove_inode_dquot_ref(struct inode *inode, int type, + struct list_head *tofree_head) { struct dquot *dquot = inode->i_dquot[type]; inode->i_dquot[type] = NULL; - if (dquot) { - if (dqput_blocks(dquot)) { -#ifdef CONFIG_QUOTA_DEBUG - if (atomic_read(&dquot->dq_count) != 1) - quota_error(inode->i_sb, "Adding dquot with " - "dq_count %d to dispose list", - atomic_read(&dquot->dq_count)); -#endif - spin_lock(&dq_list_lock); - /* As dquot must have currently users it can't be on - * the free list... */ - list_add(&dquot->dq_free, tofree_head); - spin_unlock(&dq_list_lock); - return 1; - } - else - dqput(dquot); /* We have guaranteed we won't block */ + if (!dquot) + return; + + if (list_empty(&dquot->dq_free)) { + /* + * The inode still holds a reference to the dquot, so it can't + * be on the free list + */ + spin_lock(&dq_list_lock); + list_add(&dquot->dq_free, tofree_head); + spin_unlock(&dq_list_lock); + } else { + /* + * The dquot is already on a list to be put, so we won't drop + * the last reference here. + */ + dqput(dquot); } - return 0; } /* @@ -1037,13 +1020,15 @@ static void remove_dquot_ref(struct super_block *sb, int type, * We also have to scan I_NEW inodes because they can already * have the quota pointer initialized. Luckily, we need to touch * only quota pointers and these have separate locking - * (dqptr_sem). + * (dq_data_lock). */ + spin_lock(&dq_data_lock); if (!IS_NOQUOTA(inode)) { if (unlikely(inode_get_rsv_space(inode) > 0)) reserved = 1; remove_inode_dquot_ref(inode, type, tofree_head); } + spin_unlock(&dq_data_lock); } spin_unlock(&inode_sb_list_lock); #ifdef CONFIG_QUOTA_DEBUG @@ -1061,9 +1046,8 @@ static void drop_dquot_ref(struct super_block *sb, int type) LIST_HEAD(tofree_head); if (sb->dq_op) { - down_write(&sb_dqopt(sb)->dqptr_sem); remove_dquot_ref(sb, type, &tofree_head); - up_write(&sb_dqopt(sb)->dqptr_sem); + synchronize_srcu(&dquot_srcu); put_dquot_list(&tofree_head); } } @@ -1394,21 +1378,16 @@ static int dquot_active(const struct inode *inode) /* * Initialize quota pointers in inode * - * We do things in a bit complicated way but by that we avoid calling - * dqget() and thus filesystem callbacks under dqptr_sem. - * * It is better to call this function outside of any transaction as it * might need a lot of space in journal for dquot structure allocation. */ static void __dquot_initialize(struct inode *inode, int type) { - int cnt; + int cnt, init_needed = 0; struct dquot *got[MAXQUOTAS]; struct super_block *sb = inode->i_sb; qsize_t rsv; - /* First test before acquiring mutex - solves deadlocks when we - * re-enter the quota code and are already holding the mutex */ if (!dquot_active(inode)) return; @@ -1418,6 +1397,15 @@ static void __dquot_initialize(struct inode *inode, int type) got[cnt] = NULL; if (type != -1 && cnt != type) continue; + /* + * The i_dquot should have been initialized in most cases; + * we check it without locking here to avoid unnecessary + * dqget()/dqput() calls. 
+ */ + if (inode->i_dquot[cnt]) + continue; + init_needed = 1; + switch (cnt) { case USRQUOTA: qid = make_kqid_uid(inode->i_uid); @@ -1429,7 +1417,11 @@ static void __dquot_initialize(struct inode *inode, int type) got[cnt] = dqget(sb, qid); } - down_write(&sb_dqopt(sb)->dqptr_sem); + /* All required i_dquot have been initialized */ + if (!init_needed) + return; + + spin_lock(&dq_data_lock); if (IS_NOQUOTA(inode)) goto out_err; for (cnt = 0; cnt < MAXQUOTAS; cnt++) { @@ -1449,15 +1441,12 @@ static void __dquot_initialize(struct inode *inode, int type) * did a write before quota was turned on */ rsv = inode_get_rsv_space(inode); - if (unlikely(rsv)) { - spin_lock(&dq_data_lock); + if (unlikely(rsv)) dquot_resv_space(inode->i_dquot[cnt], rsv); - spin_unlock(&dq_data_lock); - } } } out_err: - up_write(&sb_dqopt(sb)->dqptr_sem); + spin_unlock(&dq_data_lock); /* Drop unused references */ dqput_all(got); } @@ -1469,19 +1458,24 @@ void dquot_initialize(struct inode *inode) EXPORT_SYMBOL(dquot_initialize); /* - * Release all quotas referenced by inode + * Release all quotas referenced by inode. + * + * This function is only called on inode free or when converting + * a file to a quota file; in both cases there are no other users + * of the i_dquot, so we needn't call synchronize_srcu() after + * clearing i_dquot. */ static void __dquot_drop(struct inode *inode) { int cnt; struct dquot *put[MAXQUOTAS]; - down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { put[cnt] = inode->i_dquot[cnt]; inode->i_dquot[cnt] = NULL; } - up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + spin_unlock(&dq_data_lock); dqput_all(put); } @@ -1599,15 +1593,11 @@ static void inode_decr_space(struct inode *inode, qsize_t number, int reserve) */ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) { - int cnt, ret = 0; + int cnt, ret = 0, index; struct dquot_warn warn[MAXQUOTAS]; struct dquot **dquots = inode->i_dquot; int reserve = flags & DQUOT_SPACE_RESERVE; - /* - * First test before acquiring mutex - solves deadlocks when we - * re-enter the quota code and are already holding the mutex - */ if (!dquot_active(inode)) { inode_incr_space(inode, number, reserve); goto out; @@ -1616,7 +1606,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) for (cnt = 0; cnt < MAXQUOTAS; cnt++) warn[cnt].w_type = QUOTA_NL_NOWARN; - down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + index = srcu_read_lock(&dquot_srcu); spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!dquots[cnt]) @@ -1643,7 +1633,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) goto out_flush_warn; mark_all_dquot_dirty(dquots); out_flush_warn: - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + srcu_read_unlock(&dquot_srcu, index); flush_warnings(warn); out: return ret; @@ -1655,17 +1645,16 @@ EXPORT_SYMBOL(__dquot_alloc_space); */ int dquot_alloc_inode(const struct inode *inode) { - int cnt, ret = 0; + int cnt, ret = 0, index; struct dquot_warn warn[MAXQUOTAS]; struct dquot * const *dquots = inode->i_dquot; - /* First test before acquiring mutex - solves deadlocks when we - * re-enter the quota code and are already holding the mutex */ if (!dquot_active(inode)) return 0; for (cnt = 0; cnt < MAXQUOTAS; cnt++) warn[cnt].w_type = QUOTA_NL_NOWARN; + + index = srcu_read_lock(&dquot_srcu); spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!dquots[cnt]) @@ -1685,7 +1674,7 @@ warn_put_all: 
spin_unlock(&dq_data_lock); if (ret == 0) mark_all_dquot_dirty(dquots); - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + srcu_read_unlock(&dquot_srcu, index); flush_warnings(warn); return ret; } @@ -1696,14 +1685,14 @@ EXPORT_SYMBOL(dquot_alloc_inode); */ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) { - int cnt; + int cnt, index; if (!dquot_active(inode)) { inode_claim_rsv_space(inode, number); return 0; } - down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + index = srcu_read_lock(&dquot_srcu); spin_lock(&dq_data_lock); /* Claim reserved quotas to allocated quotas */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { @@ -1715,7 +1704,7 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) inode_claim_rsv_space(inode, number); spin_unlock(&dq_data_lock); mark_all_dquot_dirty(inode->i_dquot); - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + srcu_read_unlock(&dquot_srcu, index); return 0; } EXPORT_SYMBOL(dquot_claim_space_nodirty); @@ -1725,14 +1714,14 @@ EXPORT_SYMBOL(dquot_claim_space_nodirty); */ void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number) { - int cnt; + int cnt, index; if (!dquot_active(inode)) { inode_reclaim_rsv_space(inode, number); return; } - down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + index = srcu_read_lock(&dquot_srcu); spin_lock(&dq_data_lock); /* Claim reserved quotas to allocated quotas */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { @@ -1744,7 +1733,7 @@ void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number) inode_reclaim_rsv_space(inode, number); spin_unlock(&dq_data_lock); mark_all_dquot_dirty(inode->i_dquot); - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + srcu_read_unlock(&dquot_srcu, index); return; } EXPORT_SYMBOL(dquot_reclaim_space_nodirty); @@ -1757,16 +1746,14 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags) unsigned int cnt; struct dquot_warn warn[MAXQUOTAS]; struct dquot **dquots = inode->i_dquot; - int reserve = flags & DQUOT_SPACE_RESERVE; + int reserve = flags & DQUOT_SPACE_RESERVE, index; - /* First test before acquiring mutex - solves deadlocks when we - * re-enter the quota code and are already holding the mutex */ if (!dquot_active(inode)) { inode_decr_space(inode, number, reserve); return; } - down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + index = srcu_read_lock(&dquot_srcu); spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { int wtype; @@ -1789,7 +1776,7 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags) goto out_unlock; mark_all_dquot_dirty(dquots); out_unlock: - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + srcu_read_unlock(&dquot_srcu, index); flush_warnings(warn); } EXPORT_SYMBOL(__dquot_free_space); @@ -1802,13 +1789,12 @@ void dquot_free_inode(const struct inode *inode) unsigned int cnt; struct dquot_warn warn[MAXQUOTAS]; struct dquot * const *dquots = inode->i_dquot; + int index; - /* First test before acquiring mutex - solves deadlocks when we - * re-enter the quota code and are already holding the mutex */ if (!dquot_active(inode)) return; - down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + index = srcu_read_lock(&dquot_srcu); spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { int wtype; @@ -1823,7 +1809,7 @@ void dquot_free_inode(const struct inode *inode) } spin_unlock(&dq_data_lock); mark_all_dquot_dirty(dquots); - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + srcu_read_unlock(&dquot_srcu, index); flush_warnings(warn); } EXPORT_SYMBOL(dquot_free_inode); @@ -1837,6 +1823,8 @@ EXPORT_SYMBOL(dquot_free_inode); 
* This operation can block, but only after everything is updated. * A transaction must be started when entering this function. * + * We are holding references on transfer_from & transfer_to, so there is + * no need to protect them with srcu_read_lock(). */ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) { @@ -1849,8 +1837,6 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) struct dquot_warn warn_from_inodes[MAXQUOTAS]; struct dquot_warn warn_from_space[MAXQUOTAS]; - /* First test before acquiring mutex - solves deadlocks when we - * re-enter the quota code and are already holding the mutex */ if (IS_NOQUOTA(inode)) return 0; /* Initialize the arrays */ @@ -1859,12 +1845,12 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) warn_from_inodes[cnt].w_type = QUOTA_NL_NOWARN; warn_from_space[cnt].w_type = QUOTA_NL_NOWARN; } - down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + + spin_lock(&dq_data_lock); if (IS_NOQUOTA(inode)) { /* File without quota accounting? */ - up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + spin_unlock(&dq_data_lock); return 0; } - spin_lock(&dq_data_lock); cur_space = inode_get_bytes(inode); rsv_space = inode_get_rsv_space(inode); space = cur_space + rsv_space; @@ -1918,7 +1904,6 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) inode->i_dquot[cnt] = transfer_to[cnt]; } spin_unlock(&dq_data_lock); - up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); mark_all_dquot_dirty(transfer_from); mark_all_dquot_dirty(transfer_to); @@ -1932,7 +1917,6 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) return 0; over_quota: spin_unlock(&dq_data_lock); - up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); flush_warnings(warn_to); return ret; } @@ -2741,7 +2725,7 @@ static int __init dquot_init(void) panic("Cannot create dquot hash table"); for (i = 0; i < _DQST_DQSTAT_LAST; i++) { - ret = percpu_counter_init(&dqstats.counter[i], 0); + ret = percpu_counter_init(&dqstats.counter[i], 0, GFP_KERNEL); if (ret) panic("Cannot create dquot stat counters"); } diff --git a/fs/quota/kqid.c b/fs/quota/kqid.c index 2f97b0e2c501..ebc5e6285800 100644 --- a/fs/quota/kqid.c +++ b/fs/quota/kqid.c @@ -55,7 +55,7 @@ EXPORT_SYMBOL(qid_lt); /** * from_kqid - Create a qid from a kqid user-namespace pair. * @targ: The user namespace we want a qid in. - * @kuid: The kernel internal quota identifier to start with. + * @kqid: The kernel internal quota identifier to start with. * * Map @kqid into the user-namespace specified by @targ and * return the resulting qid. diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c index 72d29177998e..bb2869f5dfd8 100644 --- a/fs/quota/netlink.c +++ b/fs/quota/netlink.c @@ -32,8 +32,7 @@ static struct genl_family quota_genl_family = { /** * quota_send_warning - Send warning to userspace about exceeded quota - * @type: The quota type: USRQQUOTA, GRPQUOTA,... - * @id: The user or group id of the quota that was exceeded + * @qid: The kernel internal quota identifier. * @dev: The device on which the fs is mounted (sb->s_dev) * @warntype: The type of the warning: QUOTA_NL_... 
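Taken together, the dquot.c hunks above follow the classic SRCU read/update split: charge and uncharge paths bracket their use of inode->i_dquot[] with srcu_read_lock()/srcu_read_unlock(), while teardown clears the pointers under dq_data_lock and waits out the readers before dropping references. A condensed sketch of the pattern, simplified from the functions in this diff (not a complete kernel excerpt):

    DEFINE_STATIC_SRCU(dquot_srcu);

    /* reader side, as in __dquot_alloc_space() */
    int idx = srcu_read_lock(&dquot_srcu);
    spin_lock(&dq_data_lock);
    /* ... charge against inode->i_dquot[cnt] ... */
    spin_unlock(&dq_data_lock);
    srcu_read_unlock(&dquot_srcu, idx);

    /* update side, as in drop_dquot_ref() */
    spin_lock(&dq_data_lock);
    /* ... clear inode->i_dquot[] pointers, collect dquots ... */
    spin_unlock(&dq_data_lock);
    synchronize_srcu(&dquot_srcu);  /* all readers have left */
    /* ... now safe to dqput() the collected dquots ... */

__dquot_drop() and __dquot_transfer() skip the synchronize_srcu() step because, as their comments note, no other user can reach the pointers in those cases.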
* diff --git a/fs/quota/quota.c b/fs/quota/quota.c index ff3f0b3cfdb3..75621649dbd7 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -79,13 +79,13 @@ static int quota_getfmt(struct super_block *sb, int type, void __user *addr) { __u32 fmt; - down_read(&sb_dqopt(sb)->dqptr_sem); + mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); if (!sb_has_quota_active(sb, type)) { - up_read(&sb_dqopt(sb)->dqptr_sem); + mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); return -ESRCH; } fmt = sb_dqopt(sb)->info[type].dqi_format->qf_fmt_id; - up_read(&sb_dqopt(sb)->dqptr_sem); + mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); if (copy_to_user(addr, &fmt, sizeof(fmt))) return -EFAULT; return 0; diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index dda012ad4208..bbafbde3471a 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -222,7 +222,7 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, /* gang-find the pages */ ret = -ENOMEM; - pages = kzalloc(lpages * sizeof(struct page *), GFP_KERNEL); + pages = kcalloc(lpages, sizeof(struct page *), GFP_KERNEL); if (!pages) goto out_free; diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index d9f5a60dd59b..0a7dc941aaf4 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c @@ -9,7 +9,7 @@ #include <linux/stat.h> #include <linux/buffer_head.h> #include <linux/slab.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> extern const struct reiserfs_key MIN_KEY; diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c index 54fdf196bfb2..9c02d96d3a42 100644 --- a/fs/reiserfs/do_balan.c +++ b/fs/reiserfs/do_balan.c @@ -10,7 +10,7 @@ * and using buffers obtained after all above. */ -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/time.h> #include "reiserfs.h" #include <linux/buffer_head.h> @@ -286,12 +286,14 @@ static int balance_leaf_when_delete(struct tree_balance *tb, int flag) return 0; } -static void balance_leaf_insert_left(struct tree_balance *tb, - struct item_head *ih, const char *body) +static unsigned int balance_leaf_insert_left(struct tree_balance *tb, + struct item_head *const ih, + const char * const body) { int ret; struct buffer_info bi; int n = B_NR_ITEMS(tb->L[0]); + unsigned body_shift_bytes = 0; if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) { /* part of new item falls into L[0] */ @@ -329,7 +331,7 @@ static void balance_leaf_insert_left(struct tree_balance *tb, put_ih_item_len(ih, new_item_len); if (tb->lbytes > tb->zeroes_num) { - body += (tb->lbytes - tb->zeroes_num); + body_shift_bytes = tb->lbytes - tb->zeroes_num; tb->zeroes_num = 0; } else tb->zeroes_num -= tb->lbytes; @@ -349,11 +351,12 @@ static void balance_leaf_insert_left(struct tree_balance *tb, tb->insert_size[0] = 0; tb->zeroes_num = 0; } + return body_shift_bytes; } static void balance_leaf_paste_left_shift_dirent(struct tree_balance *tb, - struct item_head *ih, - const char *body) + struct item_head * const ih, + const char * const body) { int n = B_NR_ITEMS(tb->L[0]); struct buffer_info bi; @@ -413,17 +416,18 @@ static void balance_leaf_paste_left_shift_dirent(struct tree_balance *tb, tb->pos_in_item -= tb->lbytes; } -static void balance_leaf_paste_left_shift(struct tree_balance *tb, - struct item_head *ih, - const char *body) +static unsigned int balance_leaf_paste_left_shift(struct tree_balance *tb, + struct item_head * const ih, + const char * const body) { struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); int n = B_NR_ITEMS(tb->L[0]); struct buffer_info bi; + int body_shift_bytes = 0; if 
(is_direntry_le_ih(item_head(tbS0, tb->item_pos))) { balance_leaf_paste_left_shift_dirent(tb, ih, body); - return; + return 0; } RFALSE(tb->lbytes <= 0, @@ -497,7 +501,7 @@ static void balance_leaf_paste_left_shift(struct tree_balance *tb, * insert_size[0] */ if (l_n > tb->zeroes_num) { - body += (l_n - tb->zeroes_num); + body_shift_bytes = l_n - tb->zeroes_num; tb->zeroes_num = 0; } else tb->zeroes_num -= l_n; @@ -526,13 +530,14 @@ static void balance_leaf_paste_left_shift(struct tree_balance *tb, */ leaf_shift_left(tb, tb->lnum[0], tb->lbytes); } + return body_shift_bytes; } /* appended item will be in L[0] in whole */ static void balance_leaf_paste_left_whole(struct tree_balance *tb, - struct item_head *ih, - const char *body) + struct item_head * const ih, + const char * const body) { struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); int n = B_NR_ITEMS(tb->L[0]); @@ -584,39 +589,44 @@ static void balance_leaf_paste_left_whole(struct tree_balance *tb, tb->zeroes_num = 0; } -static void balance_leaf_paste_left(struct tree_balance *tb, - struct item_head *ih, const char *body) +static unsigned int balance_leaf_paste_left(struct tree_balance *tb, + struct item_head * const ih, + const char * const body) { /* we must shift the part of the appended item */ if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) - balance_leaf_paste_left_shift(tb, ih, body); + return balance_leaf_paste_left_shift(tb, ih, body); else balance_leaf_paste_left_whole(tb, ih, body); + return 0; } /* Shift lnum[0] items from S[0] to the left neighbor L[0] */ -static void balance_leaf_left(struct tree_balance *tb, struct item_head *ih, - const char *body, int flag) +static unsigned int balance_leaf_left(struct tree_balance *tb, + struct item_head * const ih, + const char * const body, int flag) { if (tb->lnum[0] <= 0) - return; + return 0; /* new item or it part falls to L[0], shift it too */ if (tb->item_pos < tb->lnum[0]) { BUG_ON(flag != M_INSERT && flag != M_PASTE); if (flag == M_INSERT) - balance_leaf_insert_left(tb, ih, body); + return balance_leaf_insert_left(tb, ih, body); else /* M_PASTE */ - balance_leaf_paste_left(tb, ih, body); + return balance_leaf_paste_left(tb, ih, body); } else /* new item doesn't fall into L[0] */ leaf_shift_left(tb, tb->lnum[0], tb->lbytes); + return 0; } static void balance_leaf_insert_right(struct tree_balance *tb, - struct item_head *ih, const char *body) + struct item_head * const ih, + const char * const body) { struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); @@ -704,7 +714,8 @@ static void balance_leaf_insert_right(struct tree_balance *tb, static void balance_leaf_paste_right_shift_dirent(struct tree_balance *tb, - struct item_head *ih, const char *body) + struct item_head * const ih, + const char * const body) { struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); struct buffer_info bi; @@ -754,7 +765,8 @@ static void balance_leaf_paste_right_shift_dirent(struct tree_balance *tb, } static void balance_leaf_paste_right_shift(struct tree_balance *tb, - struct item_head *ih, const char *body) + struct item_head * const ih, + const char * const body) { struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); int n_shift, n_rem, r_zeroes_number, version; @@ -831,7 +843,8 @@ static void balance_leaf_paste_right_shift(struct tree_balance *tb, } static void balance_leaf_paste_right_whole(struct tree_balance *tb, - struct item_head *ih, const char *body) + struct item_head * const ih, + const char * const body) { struct buffer_head *tbS0 = 
PATH_PLAST_BUFFER(tb->tb_path); int n = B_NR_ITEMS(tbS0); @@ -874,7 +887,8 @@ static void balance_leaf_paste_right_whole(struct tree_balance *tb, } static void balance_leaf_paste_right(struct tree_balance *tb, - struct item_head *ih, const char *body) + struct item_head * const ih, + const char * const body) { struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); int n = B_NR_ITEMS(tbS0); @@ -896,8 +910,9 @@ static void balance_leaf_paste_right(struct tree_balance *tb, } /* shift rnum[0] items from S[0] to the right neighbor R[0] */ -static void balance_leaf_right(struct tree_balance *tb, struct item_head *ih, - const char *body, int flag) +static void balance_leaf_right(struct tree_balance *tb, + struct item_head * const ih, + const char * const body, int flag) { if (tb->rnum[0] <= 0) return; @@ -911,8 +926,8 @@ static void balance_leaf_right(struct tree_balance *tb, struct item_head *ih, } static void balance_leaf_new_nodes_insert(struct tree_balance *tb, - struct item_head *ih, - const char *body, + struct item_head * const ih, + const char * const body, struct item_head *insert_key, struct buffer_head **insert_ptr, int i) @@ -1003,8 +1018,8 @@ static void balance_leaf_new_nodes_insert(struct tree_balance *tb, /* we append to directory item */ static void balance_leaf_new_nodes_paste_dirent(struct tree_balance *tb, - struct item_head *ih, - const char *body, + struct item_head * const ih, + const char * const body, struct item_head *insert_key, struct buffer_head **insert_ptr, int i) @@ -1058,8 +1073,8 @@ static void balance_leaf_new_nodes_paste_dirent(struct tree_balance *tb, } static void balance_leaf_new_nodes_paste_shift(struct tree_balance *tb, - struct item_head *ih, - const char *body, + struct item_head * const ih, + const char * const body, struct item_head *insert_key, struct buffer_head **insert_ptr, int i) @@ -1131,8 +1146,8 @@ static void balance_leaf_new_nodes_paste_shift(struct tree_balance *tb, } static void balance_leaf_new_nodes_paste_whole(struct tree_balance *tb, - struct item_head *ih, - const char *body, + struct item_head * const ih, + const char * const body, struct item_head *insert_key, struct buffer_head **insert_ptr, int i) @@ -1184,8 +1199,8 @@ static void balance_leaf_new_nodes_paste_whole(struct tree_balance *tb, } static void balance_leaf_new_nodes_paste(struct tree_balance *tb, - struct item_head *ih, - const char *body, + struct item_head * const ih, + const char * const body, struct item_head *insert_key, struct buffer_head **insert_ptr, int i) @@ -1214,8 +1229,8 @@ static void balance_leaf_new_nodes_paste(struct tree_balance *tb, /* Fill new nodes that appear in place of S[0] */ static void balance_leaf_new_nodes(struct tree_balance *tb, - struct item_head *ih, - const char *body, + struct item_head * const ih, + const char * const body, struct item_head *insert_key, struct buffer_head **insert_ptr, int flag) @@ -1254,8 +1269,8 @@ static void balance_leaf_new_nodes(struct tree_balance *tb, } static void balance_leaf_finish_node_insert(struct tree_balance *tb, - struct item_head *ih, - const char *body) + struct item_head * const ih, + const char * const body) { struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); struct buffer_info bi; @@ -1271,8 +1286,8 @@ static void balance_leaf_finish_node_insert(struct tree_balance *tb, } static void balance_leaf_finish_node_paste_dirent(struct tree_balance *tb, - struct item_head *ih, - const char *body) + struct item_head * const ih, + const char * const body) { struct buffer_head *tbS0 = 
PATH_PLAST_BUFFER(tb->tb_path); struct item_head *pasted = item_head(tbS0, tb->item_pos); @@ -1305,8 +1320,8 @@ static void balance_leaf_finish_node_paste_dirent(struct tree_balance *tb, } static void balance_leaf_finish_node_paste(struct tree_balance *tb, - struct item_head *ih, - const char *body) + struct item_head * const ih, + const char * const body) { struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); struct buffer_info bi; @@ -1349,8 +1364,8 @@ static void balance_leaf_finish_node_paste(struct tree_balance *tb, * of the affected item which remains in S */ static void balance_leaf_finish_node(struct tree_balance *tb, - struct item_head *ih, - const char *body, int flag) + struct item_head * const ih, + const char * const body, int flag) { /* if we must insert or append into buffer S[0] */ if (0 <= tb->item_pos && tb->item_pos < tb->s0num) { @@ -1402,7 +1417,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, && is_indirect_le_ih(item_head(tbS0, tb->item_pos))) tb->pos_in_item *= UNFM_P_SIZE; - balance_leaf_left(tb, ih, body, flag); + body += balance_leaf_left(tb, ih, body, flag); /* tb->lnum[0] > 0 */ /* Calculate new item position */ diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index db9e80ba53a0..751dd3f4346b 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -6,7 +6,7 @@ #include "reiserfs.h" #include "acl.h" #include "xattr.h" -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/pagemap.h> #include <linux/swap.h> #include <linux/writeback.h> diff --git a/fs/reiserfs/ibalance.c b/fs/reiserfs/ibalance.c index 73231b1ebdbe..b751eea32e20 100644 --- a/fs/reiserfs/ibalance.c +++ b/fs/reiserfs/ibalance.c @@ -2,7 +2,7 @@ * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README */ -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/string.h> #include <linux/time.h> #include "reiserfs.h" diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 63b2b0ec49e6..a7eec9888f10 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -11,7 +11,7 @@ #include <linux/pagemap.h> #include <linux/highmem.h> #include <linux/slab.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/unaligned.h> #include <linux/buffer_head.h> #include <linux/mpage.h> diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index 501ed6811a2b..6ec8a30a0911 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -7,7 +7,7 @@ #include <linux/mount.h> #include "reiserfs.h" #include <linux/time.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/pagemap.h> #include <linux/compat.h> diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c index cfaee912ee09..aca73dd73906 100644 --- a/fs/reiserfs/item_ops.c +++ b/fs/reiserfs/item_ops.c @@ -54,7 +54,7 @@ static void sd_print_item(struct item_head *ih, char *item) } else { struct stat_data *sd = (struct stat_data *)item; - printk("\t0%-6o | %6Lu | %2u | %d | %s\n", sd_v2_mode(sd), + printk("\t0%-6o | %6llu | %2u | %d | %s\n", sd_v2_mode(sd), (unsigned long long)sd_v2_size(sd), sd_v2_nlink(sd), sd_v2_rdev(sd), print_time(sd_v2_mtime(sd))); } @@ -408,7 +408,7 @@ static void direntry_print_item(struct item_head *ih, char *item) namebuf[namelen + 2] = 0; } - printk("%d: %-15s%-15d%-15d%-15Ld%-15Ld(%s)\n", + printk("%d: %-15s%-15d%-15d%-15lld%-15lld(%s)\n", i, namebuf, deh_dir_id(deh), deh_objectid(deh), GET_HASH_VALUE(deh_offset(deh)), diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index e8870de4627e..a88b1b3e7db3 100644 
--- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -1947,8 +1947,6 @@ static int do_journal_release(struct reiserfs_transaction_handle *th, } } - /* wait for all commits to finish */ - cancel_delayed_work(&SB_JOURNAL(sb)->j_work); /* * We must release the write lock here because @@ -1956,8 +1954,14 @@ static int do_journal_release(struct reiserfs_transaction_handle *th, */ reiserfs_write_unlock(sb); + /* + * Cancel flushing of old commits. Note that neither of these works + * will be requeued because the superblock is being shut down and + * doesn't have MS_ACTIVE set. + */ cancel_delayed_work_sync(&REISERFS_SB(sb)->old_work); - flush_workqueue(REISERFS_SB(sb)->commit_wq); + /* wait for all commits to finish */ + cancel_delayed_work_sync(&SB_JOURNAL(sb)->j_work); free_journal_ram(sb); @@ -4292,9 +4296,15 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, int flags) if (flush) { flush_commit_list(sb, jl, 1); flush_journal_list(sb, jl, 1); - } else if (!(jl->j_state & LIST_COMMIT_PENDING)) - queue_delayed_work(REISERFS_SB(sb)->commit_wq, - &journal->j_work, HZ / 10); + } else if (!(jl->j_state & LIST_COMMIT_PENDING)) { + /* + * Avoid queueing work when sb is being shut down. Transaction + * will be flushed on journal shutdown. + */ + if (sb->s_flags & MS_ACTIVE) + queue_delayed_work(REISERFS_SB(sb)->commit_wq, + &journal->j_work, HZ / 10); + } /* * if the next transaction has any chance of wrapping, flush diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c index d6744c8b24e1..249594a821e0 100644 --- a/fs/reiserfs/lbalance.c +++ b/fs/reiserfs/lbalance.c @@ -2,7 +2,7 @@ * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README */ -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/string.h> #include <linux/time.h> #include "reiserfs.h" @@ -899,8 +899,9 @@ void leaf_delete_items(struct buffer_info *cur_bi, int last_first, /* insert item into the leaf node in position before */ void leaf_insert_into_buf(struct buffer_info *bi, int before, - struct item_head *inserted_item_ih, - const char *inserted_item_body, int zeros_number) + struct item_head * const inserted_item_ih, + const char * const inserted_item_body, + int zeros_number) { struct buffer_head *bh = bi->bi_bh; int nr, free_space; diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c index c9b47e91baf8..ae1dc841db3a 100644 --- a/fs/reiserfs/prints.c +++ b/fs/reiserfs/prints.c @@ -17,7 +17,7 @@ static char off_buf[80]; static char *reiserfs_cpu_offset(struct cpu_key *key) { if (cpu_key_k_type(key) == TYPE_DIRENTRY) - sprintf(off_buf, "%Lu(%Lu)", + sprintf(off_buf, "%llu(%llu)", (unsigned long long) GET_HASH_VALUE(cpu_key_k_offset(key)), (unsigned long long) @@ -34,7 +34,7 @@ static char *le_offset(struct reiserfs_key *key) version = le_key_version(key); if (le_key_k_type(version, key) == TYPE_DIRENTRY) - sprintf(off_buf, "%Lu(%Lu)", + sprintf(off_buf, "%llu(%llu)", (unsigned long long) GET_HASH_VALUE(le_key_k_offset(version, key)), (unsigned long long) diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index 02b0b7d0f7d5..621b9f381fe1 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -11,7 +11,7 @@ #include <linux/module.h> #include <linux/time.h> #include <linux/seq_file.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "reiserfs.h" #include <linux/init.h> #include <linux/proc_fs.h> diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index bf53888c7f59..1894d96ccb7c 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h 
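The journal.c hunks above close a shutdown race: commit work is only queued while the superblock still has MS_ACTIVE set, and do_journal_release() cancels both delayed works synchronously, so nothing can requeue behind the cancel. Reduced to its essentials (a sketch of the ordering, not a complete excerpt):

    /* queueing side: skip once the superblock is shutting down */
    if (sb->s_flags & MS_ACTIVE)
            queue_delayed_work(REISERFS_SB(sb)->commit_wq,
                               &journal->j_work, HZ / 10);

    /* release side: the VFS has already cleared MS_ACTIVE, so these
     * synchronous cancels cannot race with a requeue */
    cancel_delayed_work_sync(&REISERFS_SB(sb)->old_work);
    cancel_delayed_work_sync(&SB_JOURNAL(sb)->j_work);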
@@ -506,6 +506,9 @@ typedef struct reiserfs_proc_info_data { } reiserfs_proc_info_data_t; #endif +/* Number of quota types we support */ +#define REISERFS_MAXQUOTAS 2 + /* reiserfs union of in-core super block data */ struct reiserfs_sb_info { /* Buffer containing the super block */ @@ -615,7 +618,7 @@ struct reiserfs_sb_info { spinlock_t old_work_lock; /* protects old_work and work_queued */ #ifdef CONFIG_QUOTA - char *s_qf_names[MAXQUOTAS]; + char *s_qf_names[REISERFS_MAXQUOTAS]; int s_jquota_fmt; #endif char *s_jdev; /* Stored jdev for mount option showing */ @@ -3216,11 +3219,12 @@ int leaf_shift_right(struct tree_balance *tb, int shift_num, int shift_bytes); void leaf_delete_items(struct buffer_info *cur_bi, int last_first, int first, int del_num, int del_bytes); void leaf_insert_into_buf(struct buffer_info *bi, int before, - struct item_head *inserted_item_ih, - const char *inserted_item_body, int zeros_number); -void leaf_paste_in_buffer(struct buffer_info *bi, int pasted_item_num, - int pos_in_item, int paste_size, const char *body, + struct item_head * const inserted_item_ih, + const char * const inserted_item_body, int zeros_number); +void leaf_paste_in_buffer(struct buffer_info *bi, int pasted_item_num, + int pos_in_item, int paste_size, + const char * const body, int zeros_number); void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num, int pos_in_item, int cut_size); void leaf_paste_entries(struct buffer_info *bi, int item_num, int before, diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index dd44468edc2b..24cbe013240f 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -2006,7 +2006,7 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, &s_search_path) == POSITION_FOUND); RFALSE(file_size > ROUND_UP(new_file_size), - "PAP-5680: truncate did not finish: new_file_size %Ld, current %Ld, oid %d", + "PAP-5680: truncate did not finish: new_file_size %lld, current %lld, oid %d", new_file_size, file_size, s_item_key.on_disk_key.k_objectid); update_and_out: diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index a392cef6acc6..f1376c92cf74 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -15,7 +15,7 @@ #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/time.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "reiserfs.h" #include "acl.h" #include "xattr.h" @@ -100,7 +100,11 @@ void reiserfs_schedule_old_flush(struct super_block *s) struct reiserfs_sb_info *sbi = REISERFS_SB(s); unsigned long delay; - if (s->s_flags & MS_RDONLY) + /* + * Avoid scheduling flush when sb is being shut down. It can race + * with journal shutdown and free still queued delayed work. + */ + if (s->s_flags & MS_RDONLY || !(s->s_flags & MS_ACTIVE)) return; spin_lock(&sbi->old_work_lock); @@ -202,7 +206,7 @@ static int finish_unfinished(struct super_block *s) #ifdef CONFIG_QUOTA int i; int ms_active_set; - int quota_enabled[MAXQUOTAS]; + int quota_enabled[REISERFS_MAXQUOTAS]; #endif /* compose key to look for "save" links */ @@ -223,7 +227,7 @@ static int finish_unfinished(struct super_block *s) s->s_flags |= MS_ACTIVE; } /* Turn on quotas so that they are updated correctly */ - for (i = 0; i < MAXQUOTAS; i++) { + for (i = 0; i < REISERFS_MAXQUOTAS; i++) { quota_enabled[i] = 1; if (REISERFS_SB(s)->s_qf_names[i]) { int ret; @@ -331,7 +335,7 @@ static int finish_unfinished(struct super_block *s) * not completed truncate found. 
New size was * committed together with "save" link */ - reiserfs_info(s, "Truncating %k to %Ld ..", + reiserfs_info(s, "Truncating %k to %lld ..", INODE_PKEY(inode), inode->i_size); /* don't update modification time */ @@ -366,7 +370,7 @@ static int finish_unfinished(struct super_block *s) #ifdef CONFIG_QUOTA /* Turn quotas off */ reiserfs_write_unlock(s); - for (i = 0; i < MAXQUOTAS; i++) { + for (i = 0; i < REISERFS_MAXQUOTAS; i++) { if (sb_dqopt(s)->files[i] && quota_enabled[i]) dquot_quota_off(s, i); } @@ -1356,7 +1360,7 @@ static void handle_quota_files(struct super_block *s, char **qf_names, { int i; - for (i = 0; i < MAXQUOTAS; i++) { + for (i = 0; i < REISERFS_MAXQUOTAS; i++) { if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i]) kfree(REISERFS_SB(s)->s_qf_names[i]); REISERFS_SB(s)->s_qf_names[i] = qf_names[i]; @@ -1377,7 +1381,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) struct reiserfs_journal *journal = SB_JOURNAL(s); char *new_opts = kstrdup(arg, GFP_KERNEL); int err; - char *qf_names[MAXQUOTAS]; + char *qf_names[REISERFS_MAXQUOTAS]; unsigned int qfmt = 0; #ifdef CONFIG_QUOTA int i; @@ -1396,7 +1400,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) (s, arg, &mount_options, &blocks, NULL, &commit_max_age, qf_names, &qfmt)) { #ifdef CONFIG_QUOTA - for (i = 0; i < MAXQUOTAS; i++) + for (i = 0; i < REISERFS_MAXQUOTAS; i++) if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i]) kfree(qf_names[i]); #endif @@ -1577,7 +1581,7 @@ static int read_super_block(struct super_block *s, int offset) rs = (struct reiserfs_super_block *)bh->b_data; if (sb_blocksize(rs) != s->s_blocksize) { reiserfs_warning(s, "sh-2011", "can't find a reiserfs " - "filesystem on (dev %s, block %Lu, size %lu)", + "filesystem on (dev %s, block %llu, size %lu)", s->s_id, (unsigned long long)bh->b_blocknr, s->s_blocksize); @@ -1840,7 +1844,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) char *jdev_name; struct reiserfs_sb_info *sbi; int errval = -EINVAL; - char *qf_names[MAXQUOTAS] = {}; + char *qf_names[REISERFS_MAXQUOTAS] = {}; unsigned int qfmt = 0; save_mount_options(s, data); @@ -2165,7 +2169,7 @@ error_unlocked: #ifdef CONFIG_QUOTA { int j; - for (j = 0; j < MAXQUOTAS; j++) + for (j = 0; j < REISERFS_MAXQUOTAS; j++) kfree(qf_names[j]); } #endif @@ -2441,8 +2445,7 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type, struct buffer_head tmp_bh, *bh; if (!current->journal_info) { - printk(KERN_WARNING "reiserfs: Quota write (off=%Lu, len=%Lu)" - " cancelled because transaction is not started.\n", + printk(KERN_WARNING "reiserfs: Quota write (off=%llu, len=%llu) cancelled because transaction is not started.\n", (unsigned long long)off, (unsigned long long)len); return -EIO; } diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index ca416d099e7d..7c36898af402 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -45,7 +45,7 @@ #include <linux/xattr.h> #include "xattr.h" #include "acl.h" -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <net/checksum.h> #include <linux/stat.h> #include <linux/quotaops.h> @@ -84,6 +84,7 @@ static int xattr_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) static int xattr_unlink(struct inode *dir, struct dentry *dentry) { int error; + BUG_ON(!mutex_is_locked(&dir->i_mutex)); mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); @@ -98,6 +99,7 @@ static int xattr_unlink(struct inode *dir, struct dentry *dentry) static int 
xattr_rmdir(struct inode *dir, struct dentry *dentry) { int error; + BUG_ON(!mutex_is_locked(&dir->i_mutex)); mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); @@ -117,6 +119,7 @@ static struct dentry *open_xa_root(struct super_block *sb, int flags) { struct dentry *privroot = REISERFS_SB(sb)->priv_root; struct dentry *xaroot; + if (!privroot->d_inode) return ERR_PTR(-ENODATA); @@ -127,6 +130,7 @@ static struct dentry *open_xa_root(struct super_block *sb, int flags) xaroot = ERR_PTR(-ENODATA); else if (!xaroot->d_inode) { int err = -ENODATA; + if (xattr_may_create(flags)) err = xattr_mkdir(privroot->d_inode, xaroot, 0700); if (err) { @@ -157,6 +161,7 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags) xadir = lookup_one_len(namebuf, xaroot, strlen(namebuf)); if (!IS_ERR(xadir) && !xadir->d_inode) { int err = -ENODATA; + if (xattr_may_create(flags)) err = xattr_mkdir(xaroot->d_inode, xadir, 0700); if (err) { @@ -188,6 +193,7 @@ fill_with_dentries(void *buf, const char *name, int namelen, loff_t offset, { struct reiserfs_dentry_buf *dbuf = buf; struct dentry *dentry; + WARN_ON_ONCE(!mutex_is_locked(&dbuf->xadir->d_inode->i_mutex)); if (dbuf->count == ARRAY_SIZE(dbuf->dentries)) @@ -218,6 +224,7 @@ static void cleanup_dentry_buf(struct reiserfs_dentry_buf *buf) { int i; + for (i = 0; i < buf->count; i++) if (buf->dentries[i]) dput(buf->dentries[i]); @@ -283,11 +290,13 @@ static int reiserfs_for_each_xattr(struct inode *inode, int blocks = JOURNAL_PER_BALANCE_CNT * 2 + 2 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb); struct reiserfs_transaction_handle th; + reiserfs_write_lock(inode->i_sb); err = journal_begin(&th, inode->i_sb, blocks); reiserfs_write_unlock(inode->i_sb); if (!err) { int jerror; + mutex_lock_nested(&dir->d_parent->d_inode->i_mutex, I_MUTEX_XATTR); err = action(dir, data); @@ -340,6 +349,7 @@ static int chown_one_xattr(struct dentry *dentry, void *data) int reiserfs_delete_xattrs(struct inode *inode) { int err = reiserfs_for_each_xattr(inode, delete_one_xattr, NULL); + if (err) reiserfs_warning(inode->i_sb, "jdm-20004", "Couldn't delete all xattrs (%d)\n", err); @@ -350,6 +360,7 @@ int reiserfs_delete_xattrs(struct inode *inode) int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs) { int err = reiserfs_for_each_xattr(inode, chown_one_xattr, attrs); + if (err) reiserfs_warning(inode->i_sb, "jdm-20007", "Couldn't chown all xattrs (%d)\n", err); @@ -439,6 +450,7 @@ int reiserfs_commit_write(struct file *f, struct page *page, static void update_ctime(struct inode *inode) { struct timespec now = current_fs_time(inode->i_sb); + if (inode_unhashed(inode) || !inode->i_nlink || timespec_equal(&inode->i_ctime, &now)) return; @@ -514,6 +526,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, size_t chunk; size_t skip = 0; size_t page_offset = (file_pos & (PAGE_CACHE_SIZE - 1)); + if (buffer_size - buffer_pos > PAGE_CACHE_SIZE) chunk = PAGE_CACHE_SIZE; else @@ -530,6 +543,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, if (file_pos == 0) { struct reiserfs_xattr_header *rxh; + skip = file_pos = sizeof(struct reiserfs_xattr_header); if (chunk + skip > PAGE_CACHE_SIZE) chunk = PAGE_CACHE_SIZE - skip; @@ -659,6 +673,7 @@ reiserfs_xattr_get(struct inode *inode, const char *name, void *buffer, size_t chunk; char *data; size_t skip = 0; + if (isize - file_pos > PAGE_CACHE_SIZE) chunk = PAGE_CACHE_SIZE; else @@ -792,6 +807,7 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, int 
reiserfs_removexattr(struct dentry *dentry, const char *name) { const struct xattr_handler *handler; + handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) @@ -813,9 +829,11 @@ static int listxattr_filler(void *buf, const char *name, int namelen, { struct listxattr_buf *b = (struct listxattr_buf *)buf; size_t size; + if (name[0] != '.' || (namelen != 1 && (name[1] != '.' || namelen != 2))) { const struct xattr_handler *handler; + handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr, name); if (!handler) /* Unsupported xattr name */ @@ -885,6 +903,7 @@ static int create_privroot(struct dentry *dentry) { int err; struct inode *inode = dentry->d_parent->d_inode; + WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex)); err = xattr_mkdir(inode, dentry, 0700); @@ -1015,6 +1034,7 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) mutex_lock(&privroot->d_inode->i_mutex); if (!REISERFS_SB(s)->xattr_root) { struct dentry *dentry; + dentry = lookup_one_len(XAROOT_NAME, privroot, strlen(XAROOT_NAME)); if (!IS_ERR(dentry)) diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 44503e293790..4b34b9dc03dd 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -9,7 +9,7 @@ #include <linux/posix_acl_xattr.h> #include "xattr.h" #include "acl.h" -#include <asm/uaccess.h> +#include <linux/uaccess.h> static int __reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, int type, diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index 800a3cef6f62..e7f8939a4cb5 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -6,7 +6,7 @@ #include <linux/slab.h> #include "xattr.h" #include <linux/security.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> static int security_get(struct dentry *dentry, const char *name, void *buffer, size_t size, diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c index a0035719f66b..5eeb0c48ba46 100644 --- a/fs/reiserfs/xattr_trusted.c +++ b/fs/reiserfs/xattr_trusted.c @@ -5,7 +5,7 @@ #include <linux/pagemap.h> #include <linux/xattr.h> #include "xattr.h" -#include <asm/uaccess.h> +#include <linux/uaccess.h> static int trusted_get(struct dentry *dentry, const char *name, void *buffer, size_t size, diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c index 8667491ae7c3..e50eab046471 100644 --- a/fs/reiserfs/xattr_user.c +++ b/fs/reiserfs/xattr_user.c @@ -4,7 +4,7 @@ #include <linux/pagemap.h> #include <linux/xattr.h> #include "xattr.h" -#include <asm/uaccess.h> +#include <linux/uaccess.h> static int user_get(struct dentry *dentry, const char *name, void *buffer, size_t size, diff --git a/fs/romfs/super.c b/fs/romfs/super.c index ef90e8bca95a..e98dd88197d5 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -56,6 +56,8 @@ * 2 of the Licence, or (at your option) any later version. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/string.h> #include <linux/fs.h> @@ -380,7 +382,7 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos) eio: ret = -EIO; error: - printk(KERN_ERR "ROMFS: read error for inode 0x%lx\n", pos); + pr_err("read error for inode 0x%lx\n", pos); return ERR_PTR(ret); } @@ -390,6 +392,7 @@ error: static struct inode *romfs_alloc_inode(struct super_block *sb) { struct romfs_inode_info *inode; + inode = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL); return inode ? 
&inode->vfs_inode : NULL; } @@ -400,6 +403,7 @@ static struct inode *romfs_alloc_inode(struct super_block *sb) static void romfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode)); } @@ -507,15 +511,13 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent) if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1 || img_size < ROMFH_SIZE) { if (!silent) - printk(KERN_WARNING "VFS:" - " Can't find a romfs filesystem on dev %s.\n", + pr_warn("VFS: Can't find a romfs filesystem on dev %s.\n", sb->s_id); goto error_rsb_inval; } if (romfs_checksum(rsb, min_t(size_t, img_size, 512))) { - printk(KERN_ERR "ROMFS: bad initial checksum on dev %s.\n", - sb->s_id); + pr_err("bad initial checksum on dev %s.\n", sb->s_id); goto error_rsb_inval; } @@ -523,8 +525,8 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent) len = strnlen(rsb->name, ROMFS_MAXFN); if (!silent) - printk(KERN_NOTICE "ROMFS: Mounting image '%*.*s' through %s\n", - (unsigned) len, (unsigned) len, rsb->name, storage); + pr_notice("Mounting image '%*.*s' through %s\n", + (unsigned) len, (unsigned) len, rsb->name, storage); kfree(rsb); rsb = NULL; @@ -614,7 +616,7 @@ static int __init init_romfs_fs(void) { int ret; - printk(KERN_INFO "ROMFS MTD (C) 2007 Red Hat, Inc.\n"); + pr_info("ROMFS MTD (C) 2007 Red Hat, Inc.\n"); romfs_inode_cachep = kmem_cache_create("romfs_i", @@ -623,13 +625,12 @@ static int __init init_romfs_fs(void) romfs_i_init_once); if (!romfs_inode_cachep) { - printk(KERN_ERR - "ROMFS error: Failed to initialise inode cache\n"); + pr_err("Failed to initialise inode cache\n"); return -ENOMEM; } ret = register_filesystem(&romfs_fs_type); if (ret) { - printk(KERN_ERR "ROMFS error: Failed to register filesystem\n"); + pr_err("Failed to register filesystem\n"); goto error_register; } return 0; diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c index 62a0de6632e1..43e7a7eddac0 100644 --- a/fs/squashfs/file_direct.c +++ b/fs/squashfs/file_direct.c @@ -44,7 +44,7 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize) pages = end_index - start_index + 1; - page = kmalloc(sizeof(void *) * pages, GFP_KERNEL); + page = kmalloc_array(pages, sizeof(void *), GFP_KERNEL); if (page == NULL) return res; diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 031c8d67fd51..5056babe00df 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -27,6 +27,8 @@ * the filesystem. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/fs.h> #include <linux/vfs.h> #include <linux/slab.h> @@ -448,8 +450,7 @@ static int __init init_squashfs_fs(void) return err; } - printk(KERN_INFO "squashfs: version 4.0 (2009/01/31) " - "Phillip Lougher\n"); + pr_info("version 4.0 (2009/01/31) Phillip Lougher\n"); return 0; } diff --git a/fs/stack.c b/fs/stack.c index 5b5388250e29..a54e33ed10f1 100644 --- a/fs/stack.c +++ b/fs/stack.c @@ -44,7 +44,7 @@ void fsstack_copy_inode_size(struct inode *dst, struct inode *src) * include/linux/fs.h). We don't necessarily hold i_mutex when this * is called, so take i_lock for that case. * - * And if CONFIG_LBADF (on 32-bit), continue our effort to keep the + * And if CONFIG_LBDAF (on 32-bit), continue our effort to keep the * two halves of i_blocks in sync despite SMP or PREEMPT: use i_lock * for that case too, and do both at once by combining the tests. 
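The romfs and squashfs hunks above replace open-coded "ROMFS:"/"squashfs:" message prefixes with the kernel's pr_fmt mechanism. A minimal sketch of that pattern, assuming a hypothetical module (example_probe() is a placeholder, not kernel code):

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt   /* must come before the includes */

#include <linux/printk.h>

static int example_probe(int err)
{
        /*
         * pr_err() expands to printk(KERN_ERR pr_fmt(fmt), ...), so the
         * module name is prepended automatically, e.g. "romfs: read error (-5)".
         */
        if (err)
                pr_err("read error (%d)\n", err);
        return err;
}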
* diff --git a/fs/super.c b/fs/super.c index d20d5b11dedf..1b836107acee 100644 --- a/fs/super.c +++ b/fs/super.c @@ -22,7 +22,6 @@ #include <linux/export.h> #include <linux/slab.h> -#include <linux/acct.h> #include <linux/blkdev.h> #include <linux/mount.h> #include <linux/security.h> @@ -176,7 +175,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) goto fail; for (i = 0; i < SB_FREEZE_LEVELS; i++) { - if (percpu_counter_init(&s->s_writers.counter[i], 0) < 0) + if (percpu_counter_init(&s->s_writers.counter[i], 0, + GFP_KERNEL) < 0) goto fail; lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i], &type->s_writers_key[i], 0); @@ -218,7 +218,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key); mutex_init(&s->s_dquot.dqio_mutex); mutex_init(&s->s_dquot.dqonoff_mutex); - init_rwsem(&s->s_dquot.dqptr_sem); s->s_maxbytes = MAX_NON_LFS; s->s_op = &default_op; s->s_time_gran = 1000000000; @@ -702,12 +701,22 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) return -EACCES; #endif - if (flags & MS_RDONLY) - acct_auto_close(sb); - shrink_dcache_sb(sb); - remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY); + if (remount_ro) { + if (sb->s_pins.first) { + up_write(&sb->s_umount); + sb_pin_kill(sb); + down_write(&sb->s_umount); + if (!sb->s_root) + return 0; + if (sb->s_writers.frozen != SB_UNFROZEN) + return -EBUSY; + remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY); + } + } + shrink_dcache_sb(sb); + /* If we are remounting RDONLY and current sb is read/write, make sure there are no rw files opened */ if (remount_ro) { diff --git a/fs/sync.c b/fs/sync.c index b28d1dd10e8b..bdc729d80e5e 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -65,7 +65,7 @@ int sync_filesystem(struct super_block *sb) return ret; return __sync_filesystem(sb, 1); } -EXPORT_SYMBOL_GPL(sync_filesystem); +EXPORT_SYMBOL(sync_filesystem); static void sync_inodes_one_sb(struct super_block *sb, void *arg) { diff --git a/fs/timerfd.c b/fs/timerfd.c index 0013142c0475..b46ffa94372a 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -35,8 +35,9 @@ struct timerfd_ctx { ktime_t moffs; wait_queue_head_t wqh; u64 ticks; - int expired; int clockid; + short unsigned expired; + short unsigned settime_flags; /* to show in fdinfo */ struct rcu_head rcu; struct list_head clist; bool might_cancel; @@ -92,7 +93,7 @@ static enum alarmtimer_restart timerfd_alarmproc(struct alarm *alarm, */ void timerfd_clock_was_set(void) { - ktime_t moffs = ktime_get_monotonic_offset(); + ktime_t moffs = ktime_mono_to_real((ktime_t){ .tv64 = 0 }); struct timerfd_ctx *ctx; unsigned long flags; @@ -125,7 +126,7 @@ static bool timerfd_canceled(struct timerfd_ctx *ctx) { if (!ctx->might_cancel || ctx->moffs.tv64 != KTIME_MAX) return false; - ctx->moffs = ktime_get_monotonic_offset(); + ctx->moffs = ktime_mono_to_real((ktime_t){ .tv64 = 0 }); return true; } @@ -196,6 +197,8 @@ static int timerfd_setup(struct timerfd_ctx *ctx, int flags, if (timerfd_canceled(ctx)) return -ECANCELED; } + + ctx->settime_flags = flags & TFD_SETTIME_FLAGS; return 0; } @@ -284,11 +287,76 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count, return res; } +#ifdef CONFIG_PROC_FS +static int timerfd_show(struct seq_file *m, struct file *file) +{ + struct timerfd_ctx *ctx = file->private_data; + struct itimerspec t; + + spin_lock_irq(&ctx->wqh.lock); + t.it_value = 
ktime_to_timespec(timerfd_get_remaining(ctx)); + t.it_interval = ktime_to_timespec(ctx->tintv); + spin_unlock_irq(&ctx->wqh.lock); + + return seq_printf(m, + "clockid: %d\n" + "ticks: %llu\n" + "settime flags: 0%o\n" + "it_value: (%llu, %llu)\n" + "it_interval: (%llu, %llu)\n", + ctx->clockid, (unsigned long long)ctx->ticks, + ctx->settime_flags, + (unsigned long long)t.it_value.tv_sec, + (unsigned long long)t.it_value.tv_nsec, + (unsigned long long)t.it_interval.tv_sec, + (unsigned long long)t.it_interval.tv_nsec); +} +#else +#define timerfd_show NULL +#endif + +#ifdef CONFIG_CHECKPOINT_RESTORE +static long timerfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct timerfd_ctx *ctx = file->private_data; + int ret = 0; + + switch (cmd) { + case TFD_IOC_SET_TICKS: { + u64 ticks; + + if (copy_from_user(&ticks, (u64 __user *)arg, sizeof(ticks))) + return -EFAULT; + if (!ticks) + return -EINVAL; + + spin_lock_irq(&ctx->wqh.lock); + if (!timerfd_canceled(ctx)) { + ctx->ticks = ticks; + wake_up_locked(&ctx->wqh); + } else + ret = -ECANCELED; + spin_unlock_irq(&ctx->wqh.lock); + break; + } + default: + ret = -ENOTTY; + break; + } + + return ret; +} +#else +#define timerfd_ioctl NULL +#endif + static const struct file_operations timerfd_fops = { .release = timerfd_release, .poll = timerfd_poll, .read = timerfd_read, .llseek = noop_llseek, + .show_fdinfo = timerfd_show, + .unlocked_ioctl = timerfd_ioctl, }; static int timerfd_fget(int fd, struct fd *p) @@ -336,7 +404,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) else hrtimer_init(&ctx->t.tmr, clockid, HRTIMER_MODE_ABS); - ctx->moffs = ktime_get_monotonic_offset(); + ctx->moffs = ktime_mono_to_real((ktime_t){ .tv64 = 0 }); ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx, O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS)); diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c index ff8229340cd5..aa13ad053b14 100644 --- a/fs/ubifs/commit.c +++ b/fs/ubifs/commit.c @@ -174,7 +174,6 @@ static int do_commit(struct ubifs_info *c) if (err) goto out; - mutex_lock(&c->mst_mutex); c->mst_node->cmt_no = cpu_to_le64(c->cmt_no); c->mst_node->log_lnum = cpu_to_le32(new_ltail_lnum); c->mst_node->root_lnum = cpu_to_le32(zroot.lnum); @@ -204,7 +203,6 @@ static int do_commit(struct ubifs_info *c) else c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_NO_ORPHS); err = ubifs_write_master(c); - mutex_unlock(&c->mst_mutex); if (err) goto out; diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index 2290d5866725..fb08b0c514b6 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -431,7 +431,7 @@ void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last) /** * wbuf_timer_callback - write-buffer timer callback function. - * @data: timer data (write-buffer descriptor) + * @timer: timer data (write-buffer descriptor) * * This function is called when the write-buffer timer expires. 
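The kernel-doc fix just above renames the parameter to @timer because an hrtimer callback receives the embedded timer itself and recovers the enclosing object via container_of(). A hedged sketch of that shape (struct example_wbuf stands in for the real struct ubifs_wbuf):

#include <linux/hrtimer.h>
#include <linux/kernel.h>

struct example_wbuf {
        struct hrtimer timer;           /* embedded, as in struct ubifs_wbuf */
        int need_sync;
};

static enum hrtimer_restart example_wbuf_timer_cb(struct hrtimer *timer)
{
        /* Recover the write-buffer that contains this timer. */
        struct example_wbuf *wbuf = container_of(timer, struct example_wbuf, timer);

        wbuf->need_sync = 1;            /* the real callback wakes a sync thread */
        return HRTIMER_NORESTART;
}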
*/ diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c index a902c5919e42..a47ddfc9be6b 100644 --- a/fs/ubifs/log.c +++ b/fs/ubifs/log.c @@ -240,6 +240,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs) if (c->lhead_offs > c->leb_size - c->ref_node_alsz) { c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum); + ubifs_assert(c->lhead_lnum != c->ltail_lnum); c->lhead_offs = 0; } @@ -404,15 +405,14 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum) /* Switch to the next log LEB */ if (c->lhead_offs) { c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum); + ubifs_assert(c->lhead_lnum != c->ltail_lnum); c->lhead_offs = 0; } - if (c->lhead_offs == 0) { - /* Must ensure next LEB has been unmapped */ - err = ubifs_leb_unmap(c, c->lhead_lnum); - if (err) - goto out; - } + /* Must ensure next LEB has been unmapped */ + err = ubifs_leb_unmap(c, c->lhead_lnum); + if (err) + goto out; len = ALIGN(len, c->min_io_size); dbg_log("writing commit start at LEB %d:0, len %d", c->lhead_lnum, len); diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c index d46b19ec1815..421bd0a80424 100644 --- a/fs/ubifs/lpt.c +++ b/fs/ubifs/lpt.c @@ -1464,7 +1464,6 @@ struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum) return ERR_CAST(nnode); } iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); - shft -= UBIFS_LPT_FANOUT_SHIFT; pnode = ubifs_get_pnode(c, nnode, iip); if (IS_ERR(pnode)) return ERR_CAST(pnode); @@ -1604,7 +1603,6 @@ struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum) return ERR_CAST(nnode); } iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); - shft -= UBIFS_LPT_FANOUT_SHIFT; pnode = ubifs_get_pnode(c, nnode, iip); if (IS_ERR(pnode)) return ERR_CAST(pnode); @@ -1964,7 +1962,6 @@ again: } } iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); - shft -= UBIFS_LPT_FANOUT_SHIFT; pnode = scan_get_pnode(c, path + h, nnode, iip); if (IS_ERR(pnode)) { err = PTR_ERR(pnode); @@ -2198,6 +2195,7 @@ static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, lprops->dirty); return -EINVAL; } + break; case LPROPS_FREEABLE: case LPROPS_FRDI_IDX: if (lprops->free + lprops->dirty != c->leb_size) { @@ -2206,6 +2204,7 @@ static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, lprops->dirty); return -EINVAL; } + break; } } return 0; diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index 45d4e96a6bac..d9c02928e992 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -304,7 +304,6 @@ static int layout_cnodes(struct ubifs_info *c) ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); } - done_ltab = 1; c->ltab_lnum = lnum; c->ltab_offs = offs; offs += c->ltab_sz; @@ -514,7 +513,6 @@ static int write_cnodes(struct ubifs_info *c) if (err) return err; } - done_ltab = 1; ubifs_pack_ltab(c, buf + offs, c->ltab_cmt); offs += c->ltab_sz; dbg_chk_lpt_sz(c, 1, c->ltab_sz); @@ -1941,6 +1939,11 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum) pr_err("LEB %d:%d, nnode, ", lnum, offs); err = ubifs_unpack_nnode(c, p, &nnode); + if (err) { + pr_err("failed to unpack_node, error %d\n", + err); + break; + } for (i = 0; i < UBIFS_LPT_FANOUT; i++) { pr_cont("%d:%d", nnode.nbranch[i].lnum, nnode.nbranch[i].offs); diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c index ab83ace9910a..1a4bb9e8b3b8 100644 --- a/fs/ubifs/master.c +++ b/fs/ubifs/master.c @@ -352,10 +352,9 @@ int ubifs_read_master(struct ubifs_info *c) * ubifs_write_master - write master node. 
* @c: UBIFS file-system description object * - * This function writes the master node. The caller has to take the - * @c->mst_mutex lock before calling this function. Returns zero in case of - * success and a negative error code in case of failure. The master node is - * written twice to enable recovery. + * This function writes the master node. Returns zero in case of success and a + * negative error code in case of failure. The master node is written twice to + * enable recovery. */ int ubifs_write_master(struct ubifs_info *c) { diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index f1c3e5a1b315..4409f486ecef 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -346,7 +346,6 @@ static int write_orph_nodes(struct ubifs_info *c, int atomic) int lnum; /* Unmap any unused LEBs after consolidation */ - lnum = c->ohead_lnum + 1; for (lnum = c->ohead_lnum + 1; lnum <= c->orph_last; lnum++) { err = ubifs_leb_unmap(c, lnum); if (err) diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index c14adb2f420c..c640938f62f0 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -596,7 +596,6 @@ static void drop_last_group(struct ubifs_scan_leb *sleb, int *offs) * drop_last_node - drop the last node. * @sleb: scanned LEB information * @offs: offset of dropped nodes is returned here - * @grouped: non-zero if whole group of nodes have to be dropped * * This is a helper function for 'ubifs_recover_leb()' which drops the last * node of the scanned LEB. @@ -629,8 +628,8 @@ static void drop_last_node(struct ubifs_scan_leb *sleb, int *offs) * * This function does a scan of a LEB, but caters for errors that might have * been caused by the unclean unmount from which we are attempting to recover. - * Returns %0 in case of success, %-EUCLEAN if an unrecoverable corruption is - * found, and a negative error code in case of failure. + * Returns the scanned information on success and a negative error code on + * failure. */ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf, int jhead) diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index 4c37607a958e..79c6dbbc0e04 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -332,6 +332,8 @@ static int create_default_filesystem(struct ubifs_info *c) cs->ch.node_type = UBIFS_CS_NODE; err = ubifs_write_node(c, cs, UBIFS_CS_NODE_SZ, UBIFS_LOG_LNUM, 0); kfree(cs); + if (err) + return err; ubifs_msg("default file-system created"); return 0; @@ -447,7 +449,7 @@ static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup) goto failed; } - if (c->default_compr < 0 || c->default_compr >= UBIFS_COMPR_TYPES_CNT) { + if (c->default_compr >= UBIFS_COMPR_TYPES_CNT) { err = 13; goto failed; } diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c index 58aa05df2bb6..89adbc4d08ac 100644 --- a/fs/ubifs/scan.c +++ b/fs/ubifs/scan.c @@ -131,7 +131,8 @@ int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum, * @offs: offset to start at (usually zero) * @sbuf: scan buffer (must be c->leb_size) * - * This function returns %0 on success and a negative error code on failure. + * This function returns the scanned information on success and a negative error + * code on failure. 
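The reworded return-value comments above (and the UDF conversion later in this diff) follow the ERR_PTR convention: return either a valid pointer or an errno encoded in the pointer, rather than %0 plus an out-parameter. A self-contained sketch with hypothetical example_* names:

#include <linux/err.h>
#include <linux/slab.h>

struct example_scan_result {
        int nodes_cnt;
};

static struct example_scan_result *example_scan(int lnum)
{
        struct example_scan_result *res = kzalloc(sizeof(*res), GFP_KERNEL);

        if (!res)
                return ERR_PTR(-ENOMEM);        /* encode the errno in the pointer */
        /* ... scan LEB @lnum and fill in @res ... */
        return res;
}

static int example_caller(int lnum)
{
        struct example_scan_result *res = example_scan(lnum);

        if (IS_ERR(res))
                return PTR_ERR(res);            /* decode it on the caller side */
        kfree(res);
        return 0;
}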
*/ struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum, int offs, void *sbuf) @@ -157,9 +158,10 @@ struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum, return ERR_PTR(err); } - if (err == -EBADMSG) - sleb->ecc = 1; - + /* + * Note, we ignore integrity errors (EBADMSG) because all the nodes are + * protected by CRC checksums. + */ return sleb; } @@ -169,8 +171,6 @@ struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum, * @sleb: scanning information * @lnum: logical eraseblock number * @offs: offset to start at (usually zero) * - * This function returns %0 on success and a negative error code on failure. */ void ubifs_end_scan(const struct ubifs_info *c, struct ubifs_scan_leb *sleb, int lnum, int offs) @@ -257,7 +257,7 @@ void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs, * @quiet: print no messages * * This function scans LEB number @lnum and returns complete information about - * its contents. Returns the scaned information in case of success and, + * its contents. Returns the scanned information in case of success and, * %-EUCLEAN if the LEB needs recovery, and other negative error codes in case * of failure. * diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 3904c8574ef9..106bf20629ce 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -75,7 +75,7 @@ static int validate_inode(struct ubifs_info *c, const struct inode *inode) return 1; } - if (ui->compr_type < 0 || ui->compr_type >= UBIFS_COMPR_TYPES_CNT) { + if (ui->compr_type >= UBIFS_COMPR_TYPES_CNT) { ubifs_err("unknown compression type %d", ui->compr_type); return 2; } @@ -424,19 +424,19 @@ static int ubifs_show_options(struct seq_file *s, struct dentry *root) struct ubifs_info *c = root->d_sb->s_fs_info; if (c->mount_opts.unmount_mode == 2) - seq_printf(s, ",fast_unmount"); + seq_puts(s, ",fast_unmount"); else if (c->mount_opts.unmount_mode == 1) - seq_printf(s, ",norm_unmount"); + seq_puts(s, ",norm_unmount"); if (c->mount_opts.bulk_read == 2) - seq_printf(s, ",bulk_read"); + seq_puts(s, ",bulk_read"); else if (c->mount_opts.bulk_read == 1) - seq_printf(s, ",no_bulk_read"); + seq_puts(s, ",no_bulk_read"); if (c->mount_opts.chk_data_crc == 2) - seq_printf(s, ",chk_data_crc"); + seq_puts(s, ",chk_data_crc"); else if (c->mount_opts.chk_data_crc == 1) - seq_printf(s, ",no_chk_data_crc"); + seq_puts(s, ",no_chk_data_crc"); if (c->mount_opts.override_compr) { seq_printf(s, ",compr=%s", @@ -796,8 +796,8 @@ static int alloc_wbufs(struct ubifs_info *c) { int i, err; - c->jheads = kzalloc(c->jhead_cnt * sizeof(struct ubifs_jhead), - GFP_KERNEL); + c->jheads = kcalloc(c->jhead_cnt, sizeof(struct ubifs_jhead), + GFP_KERNEL); if (!c->jheads) return -ENOMEM; @@ -1963,7 +1963,6 @@ static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi) mutex_init(&c->lp_mutex); mutex_init(&c->tnc_mutex); mutex_init(&c->log_mutex); - mutex_init(&c->mst_mutex); mutex_init(&c->umount_mutex); mutex_init(&c->bu_mutex); mutex_init(&c->write_reserve_mutex); diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index 8a40cf9c02d7..6793db0754f6 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -3294,7 +3294,6 @@ int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode, goto out_unlock; if (err) { - err = -EINVAL; key = &from_key; goto out_dump; } diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c index 3600994f8411..7a205e046776 100644 --- a/fs/ubifs/tnc_commit.c +++ b/fs/ubifs/tnc_commit.c @@ -389,7 +389,6 @@ static int 
layout_in_gaps(struct ubifs_info *c, int cnt) ubifs_dump_lprops(c); } /* Try to commit anyway */ - err = 0; break; } p++; diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index c1f71fe17cc0..c4fe900c67ab 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -314,7 +314,6 @@ struct ubifs_scan_node { * @nodes_cnt: number of nodes scanned * @nodes: list of struct ubifs_scan_node * @endpt: end point (and therefore the start of empty space) - * @ecc: read returned -EBADMSG * @buf: buffer containing entire LEB scanned */ struct ubifs_scan_leb { @@ -322,7 +321,6 @@ struct ubifs_scan_leb { int nodes_cnt; struct list_head nodes; int endpt; - int ecc; void *buf; }; @@ -1051,7 +1049,6 @@ struct ubifs_debug_info; * * @mst_node: master node * @mst_offs: offset of valid master node - * @mst_mutex: protects the master node area, @mst_node, and @mst_offs * * @max_bu_buf_len: maximum bulk-read buffer length * @bu_mutex: protects the pre-allocated bulk-read buffer and @c->bu @@ -1292,7 +1289,6 @@ struct ubifs_info { struct ubifs_mst_node *mst_node; int mst_offs; - struct mutex mst_mutex; int max_bu_buf_len; struct mutex bu_mutex; diff --git a/fs/udf/file.c b/fs/udf/file.c index d80738fdf424..bb15771b92ae 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -27,7 +27,7 @@ #include "udfdecl.h" #include <linux/fs.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/kernel.h> #include <linux/string.h> /* memset */ #include <linux/capability.h> @@ -100,24 +100,6 @@ static int udf_adinicb_write_begin(struct file *file, return 0; } -static int udf_adinicb_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - struct inode *inode = mapping->host; - unsigned offset = pos & (PAGE_CACHE_SIZE - 1); - char *kaddr; - struct udf_inode_info *iinfo = UDF_I(inode); - - kaddr = kmap_atomic(page); - memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr + offset, - kaddr + offset, copied); - kunmap_atomic(kaddr); - - return simple_write_end(file, mapping, pos, len, copied, page, fsdata); -} - static ssize_t udf_adinicb_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t offset) @@ -130,7 +112,7 @@ const struct address_space_operations udf_adinicb_aops = { .readpage = udf_adinicb_readpage, .writepage = udf_adinicb_writepage, .write_begin = udf_adinicb_write_begin, - .write_end = udf_adinicb_write_end, + .write_end = simple_write_end, .direct_IO = udf_adinicb_direct_IO, }; @@ -241,11 +223,18 @@ out: static int udf_release_file(struct inode *inode, struct file *filp) { - if (filp->f_mode & FMODE_WRITE) { + if (filp->f_mode & FMODE_WRITE && + atomic_read(&inode->i_writecount) > 1) { + /* + * Grab i_mutex to avoid races with writes changing i_size + * while we are running. 
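The comment added above relies on a standard VFS rule: i_size changes are serialized by i_mutex (i_rwsem in later kernels), so work that trims allocation based on i_size must take it too. A minimal sketch, with example_trim() as a hypothetical stand-in for the discard/truncate helpers:

#include <linux/fs.h>
#include <linux/mutex.h>

static void example_trim(struct inode *inode)
{
        mutex_lock(&inode->i_mutex);    /* i_size cannot change under us now */
        /* ... drop preallocated blocks beyond i_size ... */
        mutex_unlock(&inode->i_mutex);
}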
+ */ + mutex_lock(&inode->i_mutex); down_write(&UDF_I(inode)->i_data_sem); udf_discard_prealloc(inode); udf_truncate_tail_extent(inode); up_write(&UDF_I(inode)->i_data_sem); + mutex_unlock(&inode->i_mutex); } return 0; } diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index 6eaf5edf1ea1..e77db621ec89 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -45,7 +45,7 @@ void udf_free_inode(struct inode *inode) udf_free_blocks(sb, NULL, &UDF_I(inode)->i_location, 0, 1); } -struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err) +struct inode *udf_new_inode(struct inode *dir, umode_t mode) { struct super_block *sb = dir->i_sb; struct udf_sb_info *sbi = UDF_SB(sb); @@ -55,14 +55,12 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err) struct udf_inode_info *iinfo; struct udf_inode_info *dinfo = UDF_I(dir); struct logicalVolIntegrityDescImpUse *lvidiu; + int err; inode = new_inode(sb); - if (!inode) { - *err = -ENOMEM; - return NULL; - } - *err = -ENOSPC; + if (!inode) + return ERR_PTR(-ENOMEM); iinfo = UDF_I(inode); if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_EXTENDED_FE)) { @@ -80,21 +78,22 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err) } if (!iinfo->i_ext.i_data) { iput(inode); - *err = -ENOMEM; - return NULL; + return ERR_PTR(-ENOMEM); } + err = -ENOSPC; block = udf_new_block(dir->i_sb, NULL, dinfo->i_location.partitionReferenceNum, - start, err); - if (*err) { + start, &err); + if (err) { iput(inode); - return NULL; + return ERR_PTR(err); } lvidiu = udf_sb_lvidiu(sb); if (lvidiu) { iinfo->i_unique = lvid_get_unique_id(sb); + inode->i_generation = iinfo->i_unique; mutex_lock(&sbi->s_alloc_mutex); if (S_ISDIR(mode)) le32_add_cpu(&lvidiu->numDirs, 1); @@ -123,9 +122,12 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err) iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; inode->i_mtime = inode->i_atime = inode->i_ctime = iinfo->i_crtime = current_fs_time(inode->i_sb); - insert_inode_hash(inode); + if (unlikely(insert_inode_locked(inode) < 0)) { + make_bad_inode(inode); + iput(inode); + return ERR_PTR(-EIO); + } mark_inode_dirty(inode); - *err = 0; return inode; } diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 236cd48184c2..c9b4df5810d5 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -51,7 +51,6 @@ MODULE_LICENSE("GPL"); static umode_t udf_convert_permissions(struct fileEntry *); static int udf_update_inode(struct inode *, int); -static void udf_fill_inode(struct inode *, struct buffer_head *); static int udf_sync_inode(struct inode *inode); static int udf_alloc_i_data(struct inode *inode, size_t size); static sector_t inode_getblk(struct inode *, sector_t, int *, int *); @@ -1271,12 +1270,33 @@ update_time: return 0; } -static void __udf_read_inode(struct inode *inode) +/* + * Maximum length of linked list formed by ICB hierarchy. The chosen number is + * arbitrary - just that we hopefully don't limit any real use of rewritten + * inode on write-once media but avoid looping for too long on corrupted media. 
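The rationale stated above (bound the ICB chain so corrupted, looping media cannot stall the mount) comes down to counting hops while re-reading. A sketch of that loop shape, with hypothetical example_* names (example_next_indirect() stands in for reading one indirectEntry):

#include <linux/errno.h>

#define EXAMPLE_MAX_NESTING 1024

/* Hypothetical helper: returns the next block in the chain, or -1 at the end. */
long example_next_indirect(long block);

static int example_follow_icb_chain(long block)
{
        unsigned int indirections = 0;
        long next;

        while ((next = example_next_indirect(block)) >= 0) {
                if (++indirections > EXAMPLE_MAX_NESTING)
                        return -EIO;    /* over-long chain: treat as corruption */
                block = next;
        }
        return 0;
}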
+ */ +#define UDF_MAX_ICB_NESTING 1024 + +static int udf_read_inode(struct inode *inode, bool hidden_inode) { struct buffer_head *bh = NULL; struct fileEntry *fe; + struct extendedFileEntry *efe; uint16_t ident; struct udf_inode_info *iinfo = UDF_I(inode); + struct udf_sb_info *sbi = UDF_SB(inode->i_sb); + struct kernel_lb_addr *iloc = &iinfo->i_location; + unsigned int link_count; + unsigned int indirections = 0; + int ret = -EIO; + +reread: + if (iloc->logicalBlockNum >= + sbi->s_partmaps[iloc->partitionReferenceNum].s_partition_len) { + udf_debug("block=%d, partition=%d out of range\n", + iloc->logicalBlockNum, iloc->partitionReferenceNum); + return -EIO; + } /* * Set defaults, but the inode is still incomplete! @@ -1290,78 +1310,54 @@ static void __udf_read_inode(struct inode *inode) * i_nlink = 1 * i_op = NULL; */ - bh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 0, &ident); + bh = udf_read_ptagged(inode->i_sb, iloc, 0, &ident); if (!bh) { udf_err(inode->i_sb, "(ino %ld) failed !bh\n", inode->i_ino); - make_bad_inode(inode); - return; + return -EIO; } if (ident != TAG_IDENT_FE && ident != TAG_IDENT_EFE && ident != TAG_IDENT_USE) { udf_err(inode->i_sb, "(ino %ld) failed ident=%d\n", inode->i_ino, ident); - brelse(bh); - make_bad_inode(inode); - return; + goto out; } fe = (struct fileEntry *)bh->b_data; + efe = (struct extendedFileEntry *)bh->b_data; if (fe->icbTag.strategyType == cpu_to_le16(4096)) { struct buffer_head *ibh; - ibh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 1, - &ident); + ibh = udf_read_ptagged(inode->i_sb, iloc, 1, &ident); if (ident == TAG_IDENT_IE && ibh) { - struct buffer_head *nbh = NULL; struct kernel_lb_addr loc; struct indirectEntry *ie; ie = (struct indirectEntry *)ibh->b_data; loc = lelb_to_cpu(ie->indirectICB.extLocation); - if (ie->indirectICB.extLength && - (nbh = udf_read_ptagged(inode->i_sb, &loc, 0, - &ident))) { - if (ident == TAG_IDENT_FE || - ident == TAG_IDENT_EFE) { - memcpy(&iinfo->i_location, - &loc, - sizeof(struct kernel_lb_addr)); - brelse(bh); - brelse(ibh); - brelse(nbh); - __udf_read_inode(inode); - return; + if (ie->indirectICB.extLength) { + brelse(ibh); + memcpy(&iinfo->i_location, &loc, + sizeof(struct kernel_lb_addr)); + if (++indirections > UDF_MAX_ICB_NESTING) { + udf_err(inode->i_sb, + "too many ICBs in ICB hierarchy" + " (max %d supported)\n", + UDF_MAX_ICB_NESTING); + goto out; } - brelse(nbh); + brelse(bh); + goto reread; } } brelse(ibh); } else if (fe->icbTag.strategyType != cpu_to_le16(4)) { udf_err(inode->i_sb, "unsupported strategy type: %d\n", le16_to_cpu(fe->icbTag.strategyType)); - brelse(bh); - make_bad_inode(inode); - return; + goto out; } - udf_fill_inode(inode, bh); - - brelse(bh); -} - -static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) -{ - struct fileEntry *fe; - struct extendedFileEntry *efe; - struct udf_sb_info *sbi = UDF_SB(inode->i_sb); - struct udf_inode_info *iinfo = UDF_I(inode); - unsigned int link_count; - - fe = (struct fileEntry *)bh->b_data; - efe = (struct extendedFileEntry *)bh->b_data; - if (fe->icbTag.strategyType == cpu_to_le16(4)) iinfo->i_strat4096 = 0; else /* if (fe->icbTag.strategyType == cpu_to_le16(4096)) */ @@ -1378,11 +1374,10 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_EFE)) { iinfo->i_efe = 1; iinfo->i_use = 0; - if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize - - sizeof(struct extendedFileEntry))) { - make_bad_inode(inode); - return; - } + ret = 
udf_alloc_i_data(inode, inode->i_sb->s_blocksize - + sizeof(struct extendedFileEntry)); + if (ret) + goto out; memcpy(iinfo->i_ext.i_data, bh->b_data + sizeof(struct extendedFileEntry), inode->i_sb->s_blocksize - @@ -1390,11 +1385,10 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) } else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_FE)) { iinfo->i_efe = 0; iinfo->i_use = 0; - if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize - - sizeof(struct fileEntry))) { - make_bad_inode(inode); - return; - } + ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize - + sizeof(struct fileEntry)); + if (ret) + goto out; memcpy(iinfo->i_ext.i_data, bh->b_data + sizeof(struct fileEntry), inode->i_sb->s_blocksize - sizeof(struct fileEntry)); @@ -1404,18 +1398,18 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) iinfo->i_lenAlloc = le32_to_cpu( ((struct unallocSpaceEntry *)bh->b_data)-> lengthAllocDescs); - if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize - - sizeof(struct unallocSpaceEntry))) { - make_bad_inode(inode); - return; - } + ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize - + sizeof(struct unallocSpaceEntry)); + if (ret) + goto out; memcpy(iinfo->i_ext.i_data, bh->b_data + sizeof(struct unallocSpaceEntry), inode->i_sb->s_blocksize - sizeof(struct unallocSpaceEntry)); - return; + return 0; } + ret = -EIO; read_lock(&sbi->s_cred_lock); i_uid_write(inode, le32_to_cpu(fe->uid)); if (!uid_valid(inode->i_uid) || @@ -1441,8 +1435,13 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) read_unlock(&sbi->s_cred_lock); link_count = le16_to_cpu(fe->fileLinkCount); - if (!link_count) + if (!link_count) { + if (!hidden_inode) { + ret = -ESTALE; + goto out; + } link_count = 1; + } set_nlink(inode, link_count); inode->i_size = le64_to_cpu(fe->informationLength); @@ -1488,6 +1487,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) iinfo->i_lenAlloc = le32_to_cpu(efe->lengthAllocDescs); iinfo->i_checkpoint = le32_to_cpu(efe->checkpoint); } + inode->i_generation = iinfo->i_unique; switch (fe->icbTag.fileType) { case ICBTAG_FILE_TYPE_DIRECTORY: @@ -1537,8 +1537,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) default: udf_err(inode->i_sb, "(ino %ld) failed unknown file type=%d\n", inode->i_ino, fe->icbTag.fileType); - make_bad_inode(inode); - return; + goto out; } if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { struct deviceSpec *dsea = @@ -1549,8 +1548,12 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) le32_to_cpu(dsea->minorDeviceIdent))); /* Developer ID ??? 
*/ } else - make_bad_inode(inode); + goto out; } + ret = 0; +out: + brelse(bh); + return ret; } static int udf_alloc_i_data(struct inode *inode, size_t size) @@ -1664,7 +1667,7 @@ static int udf_update_inode(struct inode *inode, int do_sync) FE_PERM_U_DELETE | FE_PERM_U_CHATTR)); fe->permissions = cpu_to_le32(udfperms); - if (S_ISDIR(inode->i_mode)) + if (S_ISDIR(inode->i_mode) && inode->i_nlink > 0) fe->fileLinkCount = cpu_to_le16(inode->i_nlink - 1); else fe->fileLinkCount = cpu_to_le16(inode->i_nlink); @@ -1826,36 +1829,28 @@ out: return err; } -struct inode *udf_iget(struct super_block *sb, struct kernel_lb_addr *ino) +struct inode *__udf_iget(struct super_block *sb, struct kernel_lb_addr *ino, + bool hidden_inode) { unsigned long block = udf_get_lb_pblock(sb, ino, 0); struct inode *inode = iget_locked(sb, block); + int err; if (!inode) - return NULL; - - if (inode->i_state & I_NEW) { - memcpy(&UDF_I(inode)->i_location, ino, sizeof(struct kernel_lb_addr)); - __udf_read_inode(inode); - unlock_new_inode(inode); - } + return ERR_PTR(-ENOMEM); - if (is_bad_inode(inode)) - goto out_iput; + if (!(inode->i_state & I_NEW)) + return inode; - if (ino->logicalBlockNum >= UDF_SB(sb)-> - s_partmaps[ino->partitionReferenceNum].s_partition_len) { - udf_debug("block=%d, partition=%d out of range\n", - ino->logicalBlockNum, ino->partitionReferenceNum); - make_bad_inode(inode); - goto out_iput; + memcpy(&UDF_I(inode)->i_location, ino, sizeof(struct kernel_lb_addr)); + err = udf_read_inode(inode, hidden_inode); + if (err < 0) { + iget_failed(inode); + return ERR_PTR(err); } + unlock_new_inode(inode); return inode; - - out_iput: - iput(inode); - return NULL; } int udf_add_aext(struct inode *inode, struct extent_position *epos, diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c index 6583fe9b0645..6ad5a453af97 100644 --- a/fs/udf/lowlevel.c +++ b/fs/udf/lowlevel.c @@ -21,7 +21,7 @@ #include <linux/blkdev.h> #include <linux/cdrom.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "udf_sb.h" diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 9737cba1357d..c12e260fd6c4 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -270,9 +270,8 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry, NULL, 0), }; inode = udf_iget(dir->i_sb, lb); - if (!inode) { - return ERR_PTR(-EACCES); - } + if (IS_ERR(inode)) + return inode; } else #endif /* UDF_RECOVERY */ @@ -285,9 +284,8 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry, loc = lelb_to_cpu(cfi.icb.extLocation); inode = udf_iget(dir->i_sb, &loc); - if (!inode) { - return ERR_PTR(-EACCES); - } + if (IS_ERR(inode)) + return ERR_CAST(inode); } return d_splice_alias(inode, dentry); @@ -550,32 +548,18 @@ static int udf_delete_entry(struct inode *inode, struct fileIdentDesc *fi, return udf_write_fi(inode, cfi, fi, fibh, NULL, NULL); } -static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int udf_add_nondir(struct dentry *dentry, struct inode *inode) { + struct udf_inode_info *iinfo = UDF_I(inode); + struct inode *dir = dentry->d_parent->d_inode; struct udf_fileident_bh fibh; - struct inode *inode; struct fileIdentDesc cfi, *fi; int err; - struct udf_inode_info *iinfo; - - inode = udf_new_inode(dir, mode, &err); - if (!inode) { - return err; - } - - iinfo = UDF_I(inode); - if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) - inode->i_data.a_ops = &udf_adinicb_aops; - else - inode->i_data.a_ops = &udf_aops; - inode->i_op = &udf_file_inode_operations; - inode->i_fop = 
&udf_file_operations; - mark_inode_dirty(inode); fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); - if (!fi) { + if (unlikely(!fi)) { inode_dec_link_count(inode); + unlock_new_inode(inode); iput(inode); return err; } @@ -589,23 +573,21 @@ static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode, if (fibh.sbh != fibh.ebh) brelse(fibh.ebh); brelse(fibh.sbh); + unlock_new_inode(inode); d_instantiate(dentry, inode); return 0; } -static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool excl) { - struct inode *inode; - struct udf_inode_info *iinfo; - int err; + struct inode *inode = udf_new_inode(dir, mode); - inode = udf_new_inode(dir, mode, &err); - if (!inode) - return err; + if (IS_ERR(inode)) + return PTR_ERR(inode); - iinfo = UDF_I(inode); - if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) inode->i_data.a_ops = &udf_adinicb_aops; else inode->i_data.a_ops = &udf_aops; @@ -613,7 +595,25 @@ static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_fop = &udf_file_operations; mark_inode_dirty(inode); + return udf_add_nondir(dentry, inode); +} + +static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct inode *inode = udf_new_inode(dir, mode); + + if (IS_ERR(inode)) + return PTR_ERR(inode); + + if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + inode->i_data.a_ops = &udf_adinicb_aops; + else + inode->i_data.a_ops = &udf_aops; + inode->i_op = &udf_file_inode_operations; + inode->i_fop = &udf_file_operations; + mark_inode_dirty(inode); d_tmpfile(dentry, inode); + unlock_new_inode(inode); return 0; } @@ -621,44 +621,16 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode *inode; - struct udf_fileident_bh fibh; - struct fileIdentDesc cfi, *fi; - int err; - struct udf_inode_info *iinfo; if (!old_valid_dev(rdev)) return -EINVAL; - err = -EIO; - inode = udf_new_inode(dir, mode, &err); - if (!inode) - goto out; + inode = udf_new_inode(dir, mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); - iinfo = UDF_I(inode); init_special_inode(inode, mode, rdev); - fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); - if (!fi) { - inode_dec_link_count(inode); - iput(inode); - return err; - } - cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); - cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location); - *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = - cpu_to_le32(iinfo->i_unique & 0x00000000FFFFFFFFUL); - udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); - if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) - mark_inode_dirty(dir); - mark_inode_dirty(inode); - - if (fibh.sbh != fibh.ebh) - brelse(fibh.ebh); - brelse(fibh.sbh); - d_instantiate(dentry, inode); - err = 0; - -out: - return err; + return udf_add_nondir(dentry, inode); } static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) @@ -670,10 +642,9 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct udf_inode_info *dinfo = UDF_I(dir); struct udf_inode_info *iinfo; - err = -EIO; - inode = udf_new_inode(dir, S_IFDIR | mode, &err); - if (!inode) - goto out; + inode = udf_new_inode(dir, S_IFDIR | mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); iinfo = UDF_I(inode); inode->i_op = &udf_dir_inode_operations; @@ -681,6 +652,7 @@ static int udf_mkdir(struct inode *dir, 
struct dentry *dentry, umode_t mode) fi = udf_add_entry(inode, NULL, &fibh, &cfi, &err); if (!fi) { inode_dec_link_count(inode); + unlock_new_inode(inode); iput(inode); goto out; } @@ -699,6 +671,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) if (!fi) { clear_nlink(inode); mark_inode_dirty(inode); + unlock_new_inode(inode); iput(inode); goto out; } @@ -710,6 +683,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); inc_nlink(dir); mark_inode_dirty(dir); + unlock_new_inode(inode); d_instantiate(dentry, inode); if (fibh.sbh != fibh.ebh) brelse(fibh.ebh); @@ -876,14 +850,11 @@ out: static int udf_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { - struct inode *inode; + struct inode *inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO); struct pathComponent *pc; const char *compstart; - struct udf_fileident_bh fibh; struct extent_position epos = {}; int eoffset, elen = 0; - struct fileIdentDesc *fi; - struct fileIdentDesc cfi; uint8_t *ea; int err; int block; @@ -892,9 +863,8 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, struct udf_inode_info *iinfo; struct super_block *sb = dir->i_sb; - inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err); - if (!inode) - goto out; + if (IS_ERR(inode)) + return PTR_ERR(inode); iinfo = UDF_I(inode); down_write(&iinfo->i_data_sem); @@ -1012,24 +982,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, mark_inode_dirty(inode); up_write(&iinfo->i_data_sem); - fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); - if (!fi) - goto out_no_entry; - cfi.icb.extLength = cpu_to_le32(sb->s_blocksize); - cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location); - if (UDF_SB(inode->i_sb)->s_lvid_bh) { - *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = - cpu_to_le32(lvid_get_unique_id(sb)); - } - udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); - if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) - mark_inode_dirty(dir); - if (fibh.sbh != fibh.ebh) - brelse(fibh.ebh); - brelse(fibh.sbh); - d_instantiate(dentry, inode); - err = 0; - + err = udf_add_nondir(dentry, inode); out: kfree(name); return err; @@ -1037,6 +990,7 @@ out: out_no_entry: up_write(&iinfo->i_data_sem); inode_dec_link_count(inode); + unlock_new_inode(inode); iput(inode); goto out; } @@ -1221,7 +1175,7 @@ static struct dentry *udf_get_parent(struct dentry *child) struct udf_fileident_bh fibh; if (!udf_find_entry(child->d_inode, &dotdot, &fibh, &cfi)) - goto out_unlock; + return ERR_PTR(-EACCES); if (fibh.sbh != fibh.ebh) brelse(fibh.ebh); @@ -1229,12 +1183,10 @@ static struct dentry *udf_get_parent(struct dentry *child) tloc = lelb_to_cpu(cfi.icb.extLocation); inode = udf_iget(child->d_inode->i_sb, &tloc); - if (!inode) - goto out_unlock; + if (IS_ERR(inode)) + return ERR_CAST(inode); return d_obtain_alias(inode); -out_unlock: - return ERR_PTR(-EACCES); } @@ -1251,8 +1203,8 @@ static struct dentry *udf_nfs_get_inode(struct super_block *sb, u32 block, loc.partitionReferenceNum = partref; inode = udf_iget(sb, &loc); - if (inode == NULL) - return ERR_PTR(-ENOMEM); + if (IS_ERR(inode)) + return ERR_CAST(inode); if (generation && inode->i_generation != generation) { iput(inode); diff --git a/fs/udf/super.c b/fs/udf/super.c index 3286db047a40..e229315bbf7a 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -63,7 +63,7 @@ #include "udf_i.h" #include <linux/init.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define 
VDS_POS_PRIMARY_VOL_DESC 0 #define VDS_POS_UNALLOC_SPACE_DESC 1 @@ -959,14 +959,16 @@ struct inode *udf_find_metadata_inode_efe(struct super_block *sb, addr.logicalBlockNum = meta_file_loc; addr.partitionReferenceNum = partition_num; - metadata_fe = udf_iget(sb, &addr); + metadata_fe = udf_iget_special(sb, &addr); - if (metadata_fe == NULL) + if (IS_ERR(metadata_fe)) { udf_warn(sb, "metadata inode efe not found\n"); - else if (UDF_I(metadata_fe)->i_alloc_type != ICBTAG_FLAG_AD_SHORT) { + return metadata_fe; + } + if (UDF_I(metadata_fe)->i_alloc_type != ICBTAG_FLAG_AD_SHORT) { udf_warn(sb, "metadata inode efe does not have short allocation descriptors!\n"); iput(metadata_fe); - metadata_fe = NULL; + return ERR_PTR(-EIO); } return metadata_fe; @@ -978,6 +980,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition) struct udf_part_map *map; struct udf_meta_data *mdata; struct kernel_lb_addr addr; + struct inode *fe; map = &sbi->s_partmaps[partition]; mdata = &map->s_type_specific.s_metadata; @@ -986,22 +989,24 @@ static int udf_load_metadata_files(struct super_block *sb, int partition) udf_debug("Metadata file location: block = %d part = %d\n", mdata->s_meta_file_loc, map->s_partition_num); - mdata->s_metadata_fe = udf_find_metadata_inode_efe(sb, - mdata->s_meta_file_loc, map->s_partition_num); - - if (mdata->s_metadata_fe == NULL) { + fe = udf_find_metadata_inode_efe(sb, mdata->s_meta_file_loc, + map->s_partition_num); + if (IS_ERR(fe)) { /* mirror file entry */ udf_debug("Mirror metadata file location: block = %d part = %d\n", mdata->s_mirror_file_loc, map->s_partition_num); - mdata->s_mirror_fe = udf_find_metadata_inode_efe(sb, - mdata->s_mirror_file_loc, map->s_partition_num); + fe = udf_find_metadata_inode_efe(sb, mdata->s_mirror_file_loc, + map->s_partition_num); - if (mdata->s_mirror_fe == NULL) { + if (IS_ERR(fe)) { udf_err(sb, "Both metadata and mirror metadata inode efe can not found\n"); - return -EIO; + return PTR_ERR(fe); } - } + mdata->s_mirror_fe = fe; + } else + mdata->s_metadata_fe = fe; + /* * bitmap file entry @@ -1015,15 +1020,16 @@ static int udf_load_metadata_files(struct super_block *sb, int partition) udf_debug("Bitmap file location: block = %d part = %d\n", addr.logicalBlockNum, addr.partitionReferenceNum); - mdata->s_bitmap_fe = udf_iget(sb, &addr); - if (mdata->s_bitmap_fe == NULL) { + fe = udf_iget_special(sb, &addr); + if (IS_ERR(fe)) { if (sb->s_flags & MS_RDONLY) udf_warn(sb, "bitmap inode efe not found but it's ok since the disc is mounted read-only\n"); else { udf_err(sb, "bitmap inode efe not found and attempted read-write mount\n"); - return -EIO; + return PTR_ERR(fe); } - } + } else + mdata->s_bitmap_fe = fe; } udf_debug("udf_load_metadata_files Ok\n"); @@ -1111,13 +1117,15 @@ static int udf_fill_partdesc_info(struct super_block *sb, phd->unallocSpaceTable.extPosition), .partitionReferenceNum = p_index, }; + struct inode *inode; - map->s_uspace.s_table = udf_iget(sb, &loc); - if (!map->s_uspace.s_table) { + inode = udf_iget_special(sb, &loc); + if (IS_ERR(inode)) { udf_debug("cannot load unallocSpaceTable (part %d)\n", p_index); - return -EIO; + return PTR_ERR(inode); } + map->s_uspace.s_table = inode; map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_TABLE; udf_debug("unallocSpaceTable (part %d) @ %ld\n", p_index, map->s_uspace.s_table->i_ino); @@ -1144,14 +1152,15 @@ static int udf_fill_partdesc_info(struct super_block *sb, phd->freedSpaceTable.extPosition), .partitionReferenceNum = p_index, }; + struct inode *inode; - 
map->s_fspace.s_table = udf_iget(sb, &loc); - if (!map->s_fspace.s_table) { + inode = udf_iget_special(sb, &loc); + if (IS_ERR(inode)) { udf_debug("cannot load freedSpaceTable (part %d)\n", p_index); - return -EIO; + return PTR_ERR(inode); } - + map->s_fspace.s_table = inode; map->s_partition_flags |= UDF_PART_FLAG_FREED_TABLE; udf_debug("freedSpaceTable (part %d) @ %ld\n", p_index, map->s_fspace.s_table->i_ino); @@ -1178,6 +1187,7 @@ static void udf_find_vat_block(struct super_block *sb, int p_index, struct udf_part_map *map = &sbi->s_partmaps[p_index]; sector_t vat_block; struct kernel_lb_addr ino; + struct inode *inode; /* * VAT file entry is in the last recorded block. Some broken disks have @@ -1186,10 +1196,13 @@ static void udf_find_vat_block(struct super_block *sb, int p_index, ino.partitionReferenceNum = type1_index; for (vat_block = start_block; vat_block >= map->s_partition_root && - vat_block >= start_block - 3 && - !sbi->s_vat_inode; vat_block--) { + vat_block >= start_block - 3; vat_block--) { ino.logicalBlockNum = vat_block - map->s_partition_root; - sbi->s_vat_inode = udf_iget(sb, &ino); + inode = udf_iget_special(sb, &ino); + if (!IS_ERR(inode)) { + sbi->s_vat_inode = inode; + break; + } } } @@ -2205,10 +2218,10 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) /* assign inodes by physical block number */ /* perhaps it's not extensible enough, but for now ... */ inode = udf_iget(sb, &rootdir); - if (!inode) { + if (IS_ERR(inode)) { udf_err(sb, "Error in udf_iget, block=%d, partition=%d\n", rootdir.logicalBlockNum, rootdir.partitionReferenceNum); - ret = -EIO; + ret = PTR_ERR(inode); goto error_out; } diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c index d7c6dbe4194b..6fb7945c1e6e 100644 --- a/fs/udf/symlink.c +++ b/fs/udf/symlink.c @@ -20,7 +20,7 @@ */ #include "udfdecl.h" -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/time.h> diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index be7dabbbcb49..1cc3c993ebd0 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -138,12 +138,22 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *, /* file.c */ extern long udf_ioctl(struct file *, unsigned int, unsigned long); /* inode.c */ -extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *); +extern struct inode *__udf_iget(struct super_block *, struct kernel_lb_addr *, + bool hidden_inode); +static inline struct inode *udf_iget_special(struct super_block *sb, + struct kernel_lb_addr *ino) +{ + return __udf_iget(sb, ino, true); +} +static inline struct inode *udf_iget(struct super_block *sb, + struct kernel_lb_addr *ino) +{ + return __udf_iget(sb, ino, false); +} extern int udf_expand_file_adinicb(struct inode *); extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *); extern struct buffer_head *udf_bread(struct inode *, int, int, int *); extern int udf_setsize(struct inode *, loff_t); -extern void udf_read_inode(struct inode *); extern void udf_evict_inode(struct inode *); extern int udf_write_inode(struct inode *, struct writeback_control *wbc); extern long udf_block_map(struct inode *, sector_t); @@ -209,7 +219,7 @@ extern int udf_CS0toUTF8(struct ustr *, const struct ustr *); /* ialloc.c */ extern void udf_free_inode(struct inode *); -extern struct inode *udf_new_inode(struct inode *, umode_t, int *); +extern struct inode *udf_new_inode(struct inode *, umode_t); /* truncate.c */ extern void 
udf_truncate_tail_extent(struct inode *); diff --git a/fs/udf/udftime.c b/fs/udf/udftime.c index 1f11483eba6a..77c331f1a770 100644 --- a/fs/udf/udftime.c +++ b/fs/udf/udftime.c @@ -81,8 +81,6 @@ static time_t year_seconds[MAX_YEAR_SECONDS] = { /*2038*/ SPY(68, 17, 0) }; -extern struct timezone sys_tz; - #define SECS_PER_HOUR (60 * 60) #define SECS_PER_DAY (SECS_PER_HOUR * 24) diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c index 44b815e57f94..afd470e588ff 100644 --- a/fs/udf/unicode.c +++ b/fs/udf/unicode.c @@ -412,7 +412,6 @@ static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName, int extIndex = 0, newExtIndex = 0, hasExt = 0; unsigned short valueCRC; uint8_t curr; - const uint8_t hexChar[] = "0123456789ABCDEF"; if (udfName[0] == '.' && (udfLen == 1 || (udfLen == 2 && udfName[1] == '.'))) { @@ -477,10 +476,10 @@ static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName, newIndex = 250; newName[newIndex++] = CRC_MARK; valueCRC = crc_itu_t(0, fidName, fidNameLen); - newName[newIndex++] = hexChar[(valueCRC & 0xf000) >> 12]; - newName[newIndex++] = hexChar[(valueCRC & 0x0f00) >> 8]; - newName[newIndex++] = hexChar[(valueCRC & 0x00f0) >> 4]; - newName[newIndex++] = hexChar[(valueCRC & 0x000f)]; + newName[newIndex++] = hex_asc_upper_hi(valueCRC >> 8); + newName[newIndex++] = hex_asc_upper_lo(valueCRC >> 8); + newName[newIndex++] = hex_asc_upper_hi(valueCRC); + newName[newIndex++] = hex_asc_upper_lo(valueCRC); if (hasExt) { newName[newIndex++] = EXT_MARK; diff --git a/fs/ufs/Makefile b/fs/ufs/Makefile index dd39980437fc..4d0e02b022b3 100644 --- a/fs/ufs/Makefile +++ b/fs/ufs/Makefile @@ -6,3 +6,4 @@ obj-$(CONFIG_UFS_FS) += ufs.o ufs-objs := balloc.o cylinder.o dir.o file.o ialloc.o inode.o \ namei.o super.o symlink.o truncate.o util.o +ccflags-$(CONFIG_UFS_DEBUG) += -DDEBUG diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index a9cc75ffa925..7caa01652888 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -298,7 +298,10 @@ cg_found: ufsi->i_oeftflag = 0; ufsi->i_dir_start_lookup = 0; memset(&ufsi->i_u1, 0, sizeof(ufsi->i_u1)); - insert_inode_hash(inode); + if (insert_inode_locked(inode) < 0) { + err = -EIO; + goto failed; + } mark_inode_dirty(inode); if (uspi->fs_magic == UFS2_MAGIC) { @@ -337,6 +340,7 @@ cg_found: fail_remove_inode: unlock_ufs(sb); clear_nlink(inode); + unlock_new_inode(inode); iput(inode); UFSD("EXIT (FAILED): err %d\n", err); return ERR_PTR(err); diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 61e8a9b021dd..be7d42c7d938 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -158,16 +158,16 @@ out: /** * ufs_inode_getfrag() - allocate new fragment(s) - * @inode - pointer to inode - * @fragment - number of `fragment' which hold pointer + * @inode: pointer to inode + * @fragment: number of `fragment' which hold pointer * to new allocated fragment(s) - * @new_fragment - number of new allocated fragment(s) - * @required - how many fragment(s) we require - * @err - we set it if something wrong - * @phys - pointer to where we save physical number of new allocated fragments, + * @new_fragment: number of new allocated fragment(s) + * @required: how many fragment(s) we require + * @err: we set it if something wrong + * @phys: pointer to where we save physical number of new allocated fragments, * NULL if we allocate not data(indirect blocks for example). 
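
The UDF and UFS hunks above converge on one creation-path discipline: the inode allocator reports failure as an ERR_PTR() value instead of through an int out-parameter, the new inode is hashed with insert_inode_locked() so it stays I_NEW until fully initialized, and every exit path then pairs that with exactly one unlock_new_inode(), before d_instantiate() on success and before the final iput() on failure. A minimal sketch of that shape, not code from this patch; example_new_inode() and example_add_link() are hypothetical stand-ins:

#include <linux/fs.h>
#include <linux/err.h>

struct inode *example_new_inode(struct inode *dir, umode_t mode);  /* hypothetical */
int example_add_link(struct dentry *dentry, struct inode *inode);  /* hypothetical */

static int example_create(struct inode *dir, struct dentry *dentry,
			  umode_t mode)
{
	/* allocator returns a locked, hashed, I_NEW inode or an ERR_PTR() */
	struct inode *inode = example_new_inode(dir, mode);
	int err;

	if (IS_ERR(inode))
		return PTR_ERR(inode);

	err = example_add_link(dentry, inode);
	if (err) {
		inode_dec_link_count(inode);
		unlock_new_inode(inode);	/* clear I_NEW before the final iput() */
		iput(inode);
		return err;
	}

	unlock_new_inode(inode);		/* inode becomes visible to lookups */
	d_instantiate(dentry, inode);
	return 0;
}
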
- * @new - we set it if we allocate new block - * @locked_page - for ufs_new_fragments() + * @new: we set it if we allocate new block + * @locked_page: for ufs_new_fragments() */ static struct buffer_head * ufs_inode_getfrag(struct inode *inode, u64 fragment, @@ -315,16 +315,16 @@ repeat2: /** * ufs_inode_getblock() - allocate new block - * @inode - pointer to inode - * @bh - pointer to block which hold "pointer" to new allocated block - * @fragment - number of `fragment' which hold pointer + * @inode: pointer to inode + * @bh: pointer to block which hold "pointer" to new allocated block + * @fragment: number of `fragment' which hold pointer * to new allocated block - * @new_fragment - number of new allocated fragment + * @new_fragment: number of new allocated fragment * (block will hold this fragment and also uspi->s_fpb-1) - * @err - see ufs_inode_getfrag() - * @phys - see ufs_inode_getfrag() - * @new - see ufs_inode_getfrag() - * @locked_page - see ufs_inode_getfrag() + * @err: see ufs_inode_getfrag() + * @phys: see ufs_inode_getfrag() + * @new: see ufs_inode_getfrag() + * @locked_page: see ufs_inode_getfrag() */ static struct buffer_head * ufs_inode_getblock(struct inode *inode, struct buffer_head *bh, @@ -902,9 +902,6 @@ void ufs_evict_inode(struct inode * inode) invalidate_inode_buffers(inode); clear_inode(inode); - if (want_delete) { - lock_ufs(inode->i_sb); - ufs_free_inode (inode); - unlock_ufs(inode->i_sb); - } + if (want_delete) + ufs_free_inode(inode); } diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 90d74b8f8eba..fd65deb4b5f0 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -38,10 +38,12 @@ static inline int ufs_add_nondir(struct dentry *dentry, struct inode *inode) { int err = ufs_add_link(dentry, inode); if (!err) { + unlock_new_inode(inode); d_instantiate(dentry, inode); return 0; } inode_dec_link_count(inode); + unlock_new_inode(inode); iput(inode); return err; } @@ -126,12 +128,12 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry, if (l > sb->s_blocksize) goto out_notlocked; - lock_ufs(dir->i_sb); inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO); err = PTR_ERR(inode); if (IS_ERR(inode)) - goto out; + goto out_notlocked; + lock_ufs(dir->i_sb); if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) { /* slow symlink */ inode->i_op = &ufs_symlink_inode_operations; @@ -155,6 +157,7 @@ out_notlocked: out_fail: inode_dec_link_count(inode); + unlock_new_inode(inode); iput(inode); goto out; } @@ -181,13 +184,9 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) struct inode * inode; int err; - lock_ufs(dir->i_sb); - inode_inc_link_count(dir); - inode = ufs_new_inode(dir, S_IFDIR|mode); - err = PTR_ERR(inode); if (IS_ERR(inode)) - goto out_dir; + return PTR_ERR(inode); inode->i_op = &ufs_dir_inode_operations; inode->i_fop = &ufs_dir_operations; @@ -195,6 +194,9 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) inode_inc_link_count(inode); + lock_ufs(dir->i_sb); + inode_inc_link_count(dir); + err = ufs_make_empty(inode, dir); if (err) goto out_fail; @@ -211,8 +213,8 @@ out: out_fail: inode_dec_link_count(inode); inode_dec_link_count(inode); + unlock_new_inode(inode); iput (inode); -out_dir: inode_dec_link_count(dir); unlock_ufs(dir->i_sb); goto out; diff --git a/fs/ufs/super.c b/fs/ufs/super.c index b879f1ba3439..da73801301d5 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -65,7 +65,6 @@ * Evgeniy Dushistov <dushistov@mail.ru>, 2007 */ - #include <linux/exportfs.h> #include <linux/module.h> #include 
<linux/bitops.h> @@ -172,73 +171,73 @@ static void ufs_print_super_stuff(struct super_block *sb, { u32 magic = fs32_to_cpu(sb, usb3->fs_magic); - printk("ufs_print_super_stuff\n"); - printk(" magic: 0x%x\n", magic); + pr_debug("ufs_print_super_stuff\n"); + pr_debug(" magic: 0x%x\n", magic); if (fs32_to_cpu(sb, usb3->fs_magic) == UFS2_MAGIC) { - printk(" fs_size: %llu\n", (unsigned long long) - fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size)); - printk(" fs_dsize: %llu\n", (unsigned long long) - fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize)); - printk(" bsize: %u\n", - fs32_to_cpu(sb, usb1->fs_bsize)); - printk(" fsize: %u\n", - fs32_to_cpu(sb, usb1->fs_fsize)); - printk(" fs_volname: %s\n", usb2->fs_un.fs_u2.fs_volname); - printk(" fs_sblockloc: %llu\n", (unsigned long long) - fs64_to_cpu(sb, usb2->fs_un.fs_u2.fs_sblockloc)); - printk(" cs_ndir(No of dirs): %llu\n", (unsigned long long) - fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_ndir)); - printk(" cs_nbfree(No of free blocks): %llu\n", - (unsigned long long) - fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_nbfree)); - printk(KERN_INFO" cs_nifree(Num of free inodes): %llu\n", - (unsigned long long) - fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nifree)); - printk(KERN_INFO" cs_nffree(Num of free frags): %llu\n", - (unsigned long long) - fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nffree)); - printk(KERN_INFO" fs_maxsymlinklen: %u\n", - fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_maxsymlinklen)); + pr_debug(" fs_size: %llu\n", (unsigned long long) + fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size)); + pr_debug(" fs_dsize: %llu\n", (unsigned long long) + fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize)); + pr_debug(" bsize: %u\n", + fs32_to_cpu(sb, usb1->fs_bsize)); + pr_debug(" fsize: %u\n", + fs32_to_cpu(sb, usb1->fs_fsize)); + pr_debug(" fs_volname: %s\n", usb2->fs_un.fs_u2.fs_volname); + pr_debug(" fs_sblockloc: %llu\n", (unsigned long long) + fs64_to_cpu(sb, usb2->fs_un.fs_u2.fs_sblockloc)); + pr_debug(" cs_ndir(No of dirs): %llu\n", (unsigned long long) + fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_ndir)); + pr_debug(" cs_nbfree(No of free blocks): %llu\n", + (unsigned long long) + fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_nbfree)); + pr_info(" cs_nifree(Num of free inodes): %llu\n", + (unsigned long long) + fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nifree)); + pr_info(" cs_nffree(Num of free frags): %llu\n", + (unsigned long long) + fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nffree)); + pr_info(" fs_maxsymlinklen: %u\n", + fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_maxsymlinklen)); } else { - printk(" sblkno: %u\n", fs32_to_cpu(sb, usb1->fs_sblkno)); - printk(" cblkno: %u\n", fs32_to_cpu(sb, usb1->fs_cblkno)); - printk(" iblkno: %u\n", fs32_to_cpu(sb, usb1->fs_iblkno)); - printk(" dblkno: %u\n", fs32_to_cpu(sb, usb1->fs_dblkno)); - printk(" cgoffset: %u\n", - fs32_to_cpu(sb, usb1->fs_cgoffset)); - printk(" ~cgmask: 0x%x\n", - ~fs32_to_cpu(sb, usb1->fs_cgmask)); - printk(" size: %u\n", fs32_to_cpu(sb, usb1->fs_size)); - printk(" dsize: %u\n", fs32_to_cpu(sb, usb1->fs_dsize)); - printk(" ncg: %u\n", fs32_to_cpu(sb, usb1->fs_ncg)); - printk(" bsize: %u\n", fs32_to_cpu(sb, usb1->fs_bsize)); - printk(" fsize: %u\n", fs32_to_cpu(sb, usb1->fs_fsize)); - printk(" frag: %u\n", fs32_to_cpu(sb, usb1->fs_frag)); - printk(" fragshift: %u\n", - fs32_to_cpu(sb, usb1->fs_fragshift)); - printk(" ~fmask: %u\n", ~fs32_to_cpu(sb, usb1->fs_fmask)); - printk(" fshift: %u\n", fs32_to_cpu(sb, usb1->fs_fshift)); - printk(" sbsize: %u\n", fs32_to_cpu(sb, usb1->fs_sbsize)); - printk(" spc: %u\n", fs32_to_cpu(sb, usb1->fs_spc)); - 
printk(" cpg: %u\n", fs32_to_cpu(sb, usb1->fs_cpg)); - printk(" ipg: %u\n", fs32_to_cpu(sb, usb1->fs_ipg)); - printk(" fpg: %u\n", fs32_to_cpu(sb, usb1->fs_fpg)); - printk(" csaddr: %u\n", fs32_to_cpu(sb, usb1->fs_csaddr)); - printk(" cssize: %u\n", fs32_to_cpu(sb, usb1->fs_cssize)); - printk(" cgsize: %u\n", fs32_to_cpu(sb, usb1->fs_cgsize)); - printk(" fstodb: %u\n", - fs32_to_cpu(sb, usb1->fs_fsbtodb)); - printk(" nrpos: %u\n", fs32_to_cpu(sb, usb3->fs_nrpos)); - printk(" ndir %u\n", - fs32_to_cpu(sb, usb1->fs_cstotal.cs_ndir)); - printk(" nifree %u\n", - fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree)); - printk(" nbfree %u\n", - fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree)); - printk(" nffree %u\n", - fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree)); + pr_debug(" sblkno: %u\n", fs32_to_cpu(sb, usb1->fs_sblkno)); + pr_debug(" cblkno: %u\n", fs32_to_cpu(sb, usb1->fs_cblkno)); + pr_debug(" iblkno: %u\n", fs32_to_cpu(sb, usb1->fs_iblkno)); + pr_debug(" dblkno: %u\n", fs32_to_cpu(sb, usb1->fs_dblkno)); + pr_debug(" cgoffset: %u\n", + fs32_to_cpu(sb, usb1->fs_cgoffset)); + pr_debug(" ~cgmask: 0x%x\n", + ~fs32_to_cpu(sb, usb1->fs_cgmask)); + pr_debug(" size: %u\n", fs32_to_cpu(sb, usb1->fs_size)); + pr_debug(" dsize: %u\n", fs32_to_cpu(sb, usb1->fs_dsize)); + pr_debug(" ncg: %u\n", fs32_to_cpu(sb, usb1->fs_ncg)); + pr_debug(" bsize: %u\n", fs32_to_cpu(sb, usb1->fs_bsize)); + pr_debug(" fsize: %u\n", fs32_to_cpu(sb, usb1->fs_fsize)); + pr_debug(" frag: %u\n", fs32_to_cpu(sb, usb1->fs_frag)); + pr_debug(" fragshift: %u\n", + fs32_to_cpu(sb, usb1->fs_fragshift)); + pr_debug(" ~fmask: %u\n", ~fs32_to_cpu(sb, usb1->fs_fmask)); + pr_debug(" fshift: %u\n", fs32_to_cpu(sb, usb1->fs_fshift)); + pr_debug(" sbsize: %u\n", fs32_to_cpu(sb, usb1->fs_sbsize)); + pr_debug(" spc: %u\n", fs32_to_cpu(sb, usb1->fs_spc)); + pr_debug(" cpg: %u\n", fs32_to_cpu(sb, usb1->fs_cpg)); + pr_debug(" ipg: %u\n", fs32_to_cpu(sb, usb1->fs_ipg)); + pr_debug(" fpg: %u\n", fs32_to_cpu(sb, usb1->fs_fpg)); + pr_debug(" csaddr: %u\n", fs32_to_cpu(sb, usb1->fs_csaddr)); + pr_debug(" cssize: %u\n", fs32_to_cpu(sb, usb1->fs_cssize)); + pr_debug(" cgsize: %u\n", fs32_to_cpu(sb, usb1->fs_cgsize)); + pr_debug(" fstodb: %u\n", + fs32_to_cpu(sb, usb1->fs_fsbtodb)); + pr_debug(" nrpos: %u\n", fs32_to_cpu(sb, usb3->fs_nrpos)); + pr_debug(" ndir %u\n", + fs32_to_cpu(sb, usb1->fs_cstotal.cs_ndir)); + pr_debug(" nifree %u\n", + fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree)); + pr_debug(" nbfree %u\n", + fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree)); + pr_debug(" nffree %u\n", + fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree)); } - printk("\n"); + pr_debug("\n"); } /* @@ -247,38 +246,38 @@ static void ufs_print_super_stuff(struct super_block *sb, static void ufs_print_cylinder_stuff(struct super_block *sb, struct ufs_cylinder_group *cg) { - printk("\nufs_print_cylinder_stuff\n"); - printk("size of ucg: %zu\n", sizeof(struct ufs_cylinder_group)); - printk(" magic: %x\n", fs32_to_cpu(sb, cg->cg_magic)); - printk(" time: %u\n", fs32_to_cpu(sb, cg->cg_time)); - printk(" cgx: %u\n", fs32_to_cpu(sb, cg->cg_cgx)); - printk(" ncyl: %u\n", fs16_to_cpu(sb, cg->cg_ncyl)); - printk(" niblk: %u\n", fs16_to_cpu(sb, cg->cg_niblk)); - printk(" ndblk: %u\n", fs32_to_cpu(sb, cg->cg_ndblk)); - printk(" cs_ndir: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_ndir)); - printk(" cs_nbfree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nbfree)); - printk(" cs_nifree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nifree)); - printk(" cs_nffree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nffree)); - printk(" rotor: 
%u\n", fs32_to_cpu(sb, cg->cg_rotor)); - printk(" frotor: %u\n", fs32_to_cpu(sb, cg->cg_frotor)); - printk(" irotor: %u\n", fs32_to_cpu(sb, cg->cg_irotor)); - printk(" frsum: %u, %u, %u, %u, %u, %u, %u, %u\n", + pr_debug("\nufs_print_cylinder_stuff\n"); + pr_debug("size of ucg: %zu\n", sizeof(struct ufs_cylinder_group)); + pr_debug(" magic: %x\n", fs32_to_cpu(sb, cg->cg_magic)); + pr_debug(" time: %u\n", fs32_to_cpu(sb, cg->cg_time)); + pr_debug(" cgx: %u\n", fs32_to_cpu(sb, cg->cg_cgx)); + pr_debug(" ncyl: %u\n", fs16_to_cpu(sb, cg->cg_ncyl)); + pr_debug(" niblk: %u\n", fs16_to_cpu(sb, cg->cg_niblk)); + pr_debug(" ndblk: %u\n", fs32_to_cpu(sb, cg->cg_ndblk)); + pr_debug(" cs_ndir: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_ndir)); + pr_debug(" cs_nbfree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nbfree)); + pr_debug(" cs_nifree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nifree)); + pr_debug(" cs_nffree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nffree)); + pr_debug(" rotor: %u\n", fs32_to_cpu(sb, cg->cg_rotor)); + pr_debug(" frotor: %u\n", fs32_to_cpu(sb, cg->cg_frotor)); + pr_debug(" irotor: %u\n", fs32_to_cpu(sb, cg->cg_irotor)); + pr_debug(" frsum: %u, %u, %u, %u, %u, %u, %u, %u\n", fs32_to_cpu(sb, cg->cg_frsum[0]), fs32_to_cpu(sb, cg->cg_frsum[1]), fs32_to_cpu(sb, cg->cg_frsum[2]), fs32_to_cpu(sb, cg->cg_frsum[3]), fs32_to_cpu(sb, cg->cg_frsum[4]), fs32_to_cpu(sb, cg->cg_frsum[5]), fs32_to_cpu(sb, cg->cg_frsum[6]), fs32_to_cpu(sb, cg->cg_frsum[7])); - printk(" btotoff: %u\n", fs32_to_cpu(sb, cg->cg_btotoff)); - printk(" boff: %u\n", fs32_to_cpu(sb, cg->cg_boff)); - printk(" iuseoff: %u\n", fs32_to_cpu(sb, cg->cg_iusedoff)); - printk(" freeoff: %u\n", fs32_to_cpu(sb, cg->cg_freeoff)); - printk(" nextfreeoff: %u\n", fs32_to_cpu(sb, cg->cg_nextfreeoff)); - printk(" clustersumoff %u\n", - fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clustersumoff)); - printk(" clusteroff %u\n", - fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clusteroff)); - printk(" nclusterblks %u\n", - fs32_to_cpu(sb, cg->cg_u.cg_44.cg_nclusterblks)); - printk("\n"); + pr_debug(" btotoff: %u\n", fs32_to_cpu(sb, cg->cg_btotoff)); + pr_debug(" boff: %u\n", fs32_to_cpu(sb, cg->cg_boff)); + pr_debug(" iuseoff: %u\n", fs32_to_cpu(sb, cg->cg_iusedoff)); + pr_debug(" freeoff: %u\n", fs32_to_cpu(sb, cg->cg_freeoff)); + pr_debug(" nextfreeoff: %u\n", fs32_to_cpu(sb, cg->cg_nextfreeoff)); + pr_debug(" clustersumoff %u\n", + fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clustersumoff)); + pr_debug(" clusteroff %u\n", + fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clusteroff)); + pr_debug(" nclusterblks %u\n", + fs32_to_cpu(sb, cg->cg_u.cg_44.cg_nclusterblks)); + pr_debug("\n"); } #else # define ufs_print_super_stuff(sb, usb1, usb2, usb3) /**/ @@ -287,13 +286,12 @@ static void ufs_print_cylinder_stuff(struct super_block *sb, static const struct super_operations ufs_super_ops; -static char error_buf[1024]; - void ufs_error (struct super_block * sb, const char * function, const char * fmt, ...) 
{ struct ufs_sb_private_info * uspi; struct ufs_super_block_first * usb1; + struct va_format vaf; va_list args; uspi = UFS_SB(sb)->s_uspi; @@ -305,20 +303,21 @@ void ufs_error (struct super_block * sb, const char * function, ufs_mark_sb_dirty(sb); sb->s_flags |= MS_RDONLY; } - va_start (args, fmt); - vsnprintf (error_buf, sizeof(error_buf), fmt, args); - va_end (args); + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; switch (UFS_SB(sb)->s_mount_opt & UFS_MOUNT_ONERROR) { case UFS_MOUNT_ONERROR_PANIC: - panic ("UFS-fs panic (device %s): %s: %s\n", - sb->s_id, function, error_buf); + panic("panic (device %s): %s: %pV\n", + sb->s_id, function, &vaf); case UFS_MOUNT_ONERROR_LOCK: case UFS_MOUNT_ONERROR_UMOUNT: case UFS_MOUNT_ONERROR_REPAIR: - printk (KERN_CRIT "UFS-fs error (device %s): %s: %s\n", - sb->s_id, function, error_buf); - } + pr_crit("error (device %s): %s: %pV\n", + sb->s_id, function, &vaf); + } + va_end(args); } void ufs_panic (struct super_block * sb, const char * function, @@ -326,6 +325,7 @@ void ufs_panic (struct super_block * sb, const char * function, { struct ufs_sb_private_info * uspi; struct ufs_super_block_first * usb1; + struct va_format vaf; va_list args; uspi = UFS_SB(sb)->s_uspi; @@ -336,24 +336,27 @@ void ufs_panic (struct super_block * sb, const char * function, ubh_mark_buffer_dirty(USPI_UBH(uspi)); ufs_mark_sb_dirty(sb); } - va_start (args, fmt); - vsnprintf (error_buf, sizeof(error_buf), fmt, args); - va_end (args); + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; sb->s_flags |= MS_RDONLY; - printk (KERN_CRIT "UFS-fs panic (device %s): %s: %s\n", - sb->s_id, function, error_buf); + pr_crit("panic (device %s): %s: %pV\n", + sb->s_id, function, &vaf); + va_end(args); } void ufs_warning (struct super_block * sb, const char * function, const char * fmt, ...) 
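
ufs_error() and ufs_panic() above stop formatting into the shared static error_buf, which was unsafe against concurrent callers, and instead wrap the caller's format and va_list in a struct va_format printed through the %pV printk extension, so the whole message is rendered inside a single printk call with no intermediate buffer. A sketch of the same pattern; example_fs_warn() is a hypothetical wrapper:

#include <linux/printk.h>
#include <stdarg.h>

static void example_fs_warn(const char *devname, const char *function,
			    const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	/* %pV expands vaf.fmt against vaf.va inside this one printk */
	pr_warn("(device %s): %s: %pV\n", devname, function, &vaf);
	va_end(args);	/* args must stay live until after the pr_warn() */
}
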
{ + struct va_format vaf; va_list args; - va_start (args, fmt); - vsnprintf (error_buf, sizeof(error_buf), fmt, args); - va_end (args); - printk (KERN_WARNING "UFS-fs warning (device %s): %s: %s\n", - sb->s_id, function, error_buf); + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + pr_warn("(device %s): %s: %pV\n", + sb->s_id, function, &vaf); + va_end(args); } enum { @@ -464,14 +467,12 @@ static int ufs_parse_options (char * options, unsigned * mount_options) ufs_set_opt (*mount_options, ONERROR_UMOUNT); break; case Opt_onerror_repair: - printk("UFS-fs: Unable to do repair on error, " - "will lock lock instead\n"); + pr_err("Unable to do repair on error, will lock lock instead\n"); ufs_clear_opt (*mount_options, ONERROR); ufs_set_opt (*mount_options, ONERROR_REPAIR); break; default: - printk("UFS-fs: Invalid option: \"%s\" " - "or missing value\n", p); + pr_err("Invalid option: \"%s\" or missing value\n", p); return 0; } } @@ -788,8 +789,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) #ifndef CONFIG_UFS_FS_WRITE if (!(sb->s_flags & MS_RDONLY)) { - printk("ufs was compiled with read-only support, " - "can't be mounted as read-write\n"); + pr_err("ufs was compiled with read-only support, can't be mounted as read-write\n"); return -EROFS; } #endif @@ -812,12 +812,12 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) sbi->s_mount_opt = 0; ufs_set_opt (sbi->s_mount_opt, ONERROR_LOCK); if (!ufs_parse_options ((char *) data, &sbi->s_mount_opt)) { - printk("wrong mount options\n"); + pr_err("wrong mount options\n"); goto failed; } if (!(sbi->s_mount_opt & UFS_MOUNT_UFSTYPE)) { if (!silent) - printk("You didn't specify the type of your ufs filesystem\n\n" + pr_err("You didn't specify the type of your ufs filesystem\n\n" "mount -t ufs -o ufstype=" "sun|sunx86|44bsd|ufs2|5xbsd|old|hp|nextstep|nextstep-cd|openstep ...\n\n" ">>>WARNING<<< Wrong ufstype may corrupt your filesystem, " @@ -868,7 +868,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) break; case UFS_MOUNT_UFSTYPE_SUNOS: - UFSD(("ufstype=sunos\n")) + UFSD("ufstype=sunos\n"); uspi->s_fsize = block_size = 1024; uspi->s_fmask = ~(1024 - 1); uspi->s_fshift = 10; @@ -900,7 +900,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD; if (!(sb->s_flags & MS_RDONLY)) { if (!silent) - printk(KERN_INFO "ufstype=old is supported read-only\n"); + pr_info("ufstype=old is supported read-only\n"); sb->s_flags |= MS_RDONLY; } break; @@ -916,7 +916,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD; if (!(sb->s_flags & MS_RDONLY)) { if (!silent) - printk(KERN_INFO "ufstype=nextstep is supported read-only\n"); + pr_info("ufstype=nextstep is supported read-only\n"); sb->s_flags |= MS_RDONLY; } break; @@ -932,7 +932,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD; if (!(sb->s_flags & MS_RDONLY)) { if (!silent) - printk(KERN_INFO "ufstype=nextstep-cd is supported read-only\n"); + pr_info("ufstype=nextstep-cd is supported read-only\n"); sb->s_flags |= MS_RDONLY; } break; @@ -948,7 +948,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) flags |= UFS_DE_44BSD | UFS_UID_44BSD | UFS_ST_44BSD | UFS_CG_44BSD; if (!(sb->s_flags & MS_RDONLY)) { if (!silent) - printk(KERN_INFO 
"ufstype=openstep is supported read-only\n"); + pr_info("ufstype=openstep is supported read-only\n"); sb->s_flags |= MS_RDONLY; } break; @@ -963,19 +963,19 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD; if (!(sb->s_flags & MS_RDONLY)) { if (!silent) - printk(KERN_INFO "ufstype=hp is supported read-only\n"); + pr_info("ufstype=hp is supported read-only\n"); sb->s_flags |= MS_RDONLY; } break; default: if (!silent) - printk("unknown ufstype\n"); + pr_err("unknown ufstype\n"); goto failed; } again: if (!sb_set_blocksize(sb, block_size)) { - printk(KERN_ERR "UFS: failed to set blocksize\n"); + pr_err("failed to set blocksize\n"); goto failed; } @@ -1034,7 +1034,7 @@ again: goto again; } if (!silent) - printk("ufs_read_super: bad magic number\n"); + pr_err("%s(): bad magic number\n", __func__); goto failed; magic_found: @@ -1048,33 +1048,33 @@ magic_found: uspi->s_fshift = fs32_to_cpu(sb, usb1->fs_fshift); if (!is_power_of_2(uspi->s_fsize)) { - printk(KERN_ERR "ufs_read_super: fragment size %u is not a power of 2\n", - uspi->s_fsize); - goto failed; + pr_err("%s(): fragment size %u is not a power of 2\n", + __func__, uspi->s_fsize); + goto failed; } if (uspi->s_fsize < 512) { - printk(KERN_ERR "ufs_read_super: fragment size %u is too small\n", - uspi->s_fsize); + pr_err("%s(): fragment size %u is too small\n", + __func__, uspi->s_fsize); goto failed; } if (uspi->s_fsize > 4096) { - printk(KERN_ERR "ufs_read_super: fragment size %u is too large\n", - uspi->s_fsize); + pr_err("%s(): fragment size %u is too large\n", + __func__, uspi->s_fsize); goto failed; } if (!is_power_of_2(uspi->s_bsize)) { - printk(KERN_ERR "ufs_read_super: block size %u is not a power of 2\n", - uspi->s_bsize); + pr_err("%s(): block size %u is not a power of 2\n", + __func__, uspi->s_bsize); goto failed; } if (uspi->s_bsize < 4096) { - printk(KERN_ERR "ufs_read_super: block size %u is too small\n", - uspi->s_bsize); + pr_err("%s(): block size %u is too small\n", + __func__, uspi->s_bsize); goto failed; } if (uspi->s_bsize / uspi->s_fsize > 8) { - printk(KERN_ERR "ufs_read_super: too many fragments per block (%u)\n", - uspi->s_bsize / uspi->s_fsize); + pr_err("%s(): too many fragments per block (%u)\n", + __func__, uspi->s_bsize / uspi->s_fsize); goto failed; } if (uspi->s_fsize != block_size || uspi->s_sbsize != super_block_size) { @@ -1113,20 +1113,21 @@ magic_found: UFSD("fs is DEC OSF/1\n"); break; case UFS_FSACTIVE: - printk("ufs_read_super: fs is active\n"); + pr_err("%s(): fs is active\n", __func__); sb->s_flags |= MS_RDONLY; break; case UFS_FSBAD: - printk("ufs_read_super: fs is bad\n"); + pr_err("%s(): fs is bad\n", __func__); sb->s_flags |= MS_RDONLY; break; default: - printk("ufs_read_super: can't grok fs_clean 0x%x\n", usb1->fs_clean); + pr_err("%s(): can't grok fs_clean 0x%x\n", + __func__, usb1->fs_clean); sb->s_flags |= MS_RDONLY; break; } } else { - printk("ufs_read_super: fs needs fsck\n"); + pr_err("%s(): fs needs fsck\n", __func__); sb->s_flags |= MS_RDONLY; } @@ -1299,7 +1300,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) { new_mount_opt |= ufstype; } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) { - printk("ufstype can't be changed during remount\n"); + pr_err("ufstype can't be changed during remount\n"); unlock_ufs(sb); return -EINVAL; } @@ -1328,8 +1329,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, 
char *data) * fs was mounted as ro, remounting rw */ #ifndef CONFIG_UFS_FS_WRITE - printk("ufs was compiled with read-only support, " - "can't be mounted as read-write\n"); + pr_err("ufs was compiled with read-only support, can't be mounted as read-write\n"); unlock_ufs(sb); return -EINVAL; #else @@ -1338,12 +1338,12 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) ufstype != UFS_MOUNT_UFSTYPE_44BSD && ufstype != UFS_MOUNT_UFSTYPE_SUNx86 && ufstype != UFS_MOUNT_UFSTYPE_UFS2) { - printk("this ufstype is read-only supported\n"); + pr_err("this ufstype is read-only supported\n"); unlock_ufs(sb); return -EINVAL; } if (!ufs_read_cylinder_structures(sb)) { - printk("failed during remounting\n"); + pr_err("failed during remounting\n"); unlock_ufs(sb); return -EPERM; } diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index 343e6fc571e5..2a07396d5f9e 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -1,6 +1,12 @@ #ifndef _UFS_UFS_H #define _UFS_UFS_H 1 +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #define UFS_MAX_GROUP_LOADED 8 #define UFS_CGNO_EMPTY ((unsigned)-1) @@ -71,9 +77,9 @@ struct ufs_inode_info { */ #ifdef CONFIG_UFS_DEBUG # define UFSD(f, a...) { \ - printk ("UFSD (%s, %d): %s:", \ + pr_debug("UFSD (%s, %d): %s:", \ __FILE__, __LINE__, __func__); \ - printk (f, ## a); \ + pr_debug(f, ## a); \ } #else # define UFSD(f, a...) /**/ diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 399e8cec6e60..5d47b4df61ea 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -1,6 +1,7 @@ config XFS_FS tristate "XFS filesystem support" depends on BLOCK + depends on (64BIT || LBDAF) select EXPORTFS select LIBCRC32C help diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index c21f43506661..d61799949580 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -17,6 +17,7 @@ # ccflags-y += -I$(src) # needed for trace events +ccflags-y += -I$(src)/libxfs ccflags-$(CONFIG_XFS_DEBUG) += -g @@ -25,6 +26,39 @@ obj-$(CONFIG_XFS_FS) += xfs.o # this one should be compiled first, as the tracing macros can easily blow up xfs-y += xfs_trace.o +# build the libxfs code first +xfs-y += $(addprefix libxfs/, \ + xfs_alloc.o \ + xfs_alloc_btree.o \ + xfs_attr.o \ + xfs_attr_leaf.o \ + xfs_attr_remote.o \ + xfs_bmap.o \ + xfs_bmap_btree.o \ + xfs_btree.o \ + xfs_da_btree.o \ + xfs_da_format.o \ + xfs_dir2.o \ + xfs_dir2_block.o \ + xfs_dir2_data.o \ + xfs_dir2_leaf.o \ + xfs_dir2_node.o \ + xfs_dir2_sf.o \ + xfs_dquot_buf.o \ + xfs_ialloc.o \ + xfs_ialloc_btree.o \ + xfs_inode_fork.o \ + xfs_inode_buf.o \ + xfs_log_rlimit.o \ + xfs_sb.o \ + xfs_symlink_remote.o \ + xfs_trans_resv.o \ + ) +# xfs_rtbitmap is shared with libxfs +xfs-$(CONFIG_XFS_RT) += $(addprefix libxfs/, \ + xfs_rtbitmap.o \ + ) + # highlevel code xfs-y += xfs_aops.o \ xfs_attr_inactive.o \ @@ -45,53 +79,27 @@ xfs-y += xfs_aops.o \ xfs_ioctl.o \ xfs_iomap.o \ xfs_iops.o \ + xfs_inode.o \ xfs_itable.o \ xfs_message.o \ xfs_mount.o \ xfs_mru_cache.o \ xfs_super.o \ xfs_symlink.o \ + xfs_sysfs.o \ xfs_trans.o \ xfs_xattr.o \ kmem.o \ uuid.o -# code shared with libxfs -xfs-y += xfs_alloc.o \ - xfs_alloc_btree.o \ - xfs_attr.o \ - xfs_attr_leaf.o \ - xfs_attr_remote.o \ - xfs_bmap.o \ - xfs_bmap_btree.o \ - xfs_btree.o \ - xfs_da_btree.o \ - xfs_da_format.o \ - xfs_dir2.o \ - xfs_dir2_block.o \ - xfs_dir2_data.o \ - xfs_dir2_leaf.o \ - xfs_dir2_node.o \ - xfs_dir2_sf.o \ - xfs_dquot_buf.o \ - xfs_ialloc.o \ - xfs_ialloc_btree.o \ - xfs_icreate_item.o \ - xfs_inode.o \ - xfs_inode_fork.o \ - xfs_inode_buf.o \ - 
xfs_log_recover.o \ - xfs_log_rlimit.o \ - xfs_sb.o \ - xfs_symlink_remote.o \ - xfs_trans_resv.o - # low-level transaction/log code xfs-y += xfs_log.o \ xfs_log_cil.o \ xfs_buf_item.o \ xfs_extfree_item.o \ + xfs_icreate_item.o \ xfs_inode_item.o \ + xfs_log_recover.o \ xfs_trans_ail.o \ xfs_trans_buf.o \ xfs_trans_extfree.o \ @@ -107,8 +115,7 @@ xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \ xfs_quotaops.o # xfs_rtbitmap is shared with libxfs -xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o \ - xfs_rtbitmap.o +xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o xfs-$(CONFIG_PROC_FS) += xfs_stats.o diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index 6e247a99f5db..6e247a99f5db 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index d43813267a80..4bffffe038a1 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -483,9 +483,9 @@ xfs_agfl_read_verify( return; if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF)) - xfs_buf_ioerror(bp, EFSBADCRC); + xfs_buf_ioerror(bp, -EFSBADCRC); else if (!xfs_agfl_verify(bp)) - xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_buf_ioerror(bp, -EFSCORRUPTED); if (bp->b_error) xfs_verifier_error(bp); @@ -503,7 +503,7 @@ xfs_agfl_write_verify( return; if (!xfs_agfl_verify(bp)) { - xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_buf_ioerror(bp, -EFSCORRUPTED); xfs_verifier_error(bp); return; } @@ -559,7 +559,7 @@ xfs_alloc_update_counters( xfs_trans_agblocks_delta(tp, len); if (unlikely(be32_to_cpu(agf->agf_freeblks) > be32_to_cpu(agf->agf_length))) - return EFSCORRUPTED; + return -EFSCORRUPTED; xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS); return 0; @@ -2234,11 +2234,11 @@ xfs_agf_read_verify( if (xfs_sb_version_hascrc(&mp->m_sb) && !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF)) - xfs_buf_ioerror(bp, EFSBADCRC); + xfs_buf_ioerror(bp, -EFSBADCRC); else if (XFS_TEST_ERROR(!xfs_agf_verify(mp, bp), mp, XFS_ERRTAG_ALLOC_READ_AGF, XFS_RANDOM_ALLOC_READ_AGF)) - xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_buf_ioerror(bp, -EFSCORRUPTED); if (bp->b_error) xfs_verifier_error(bp); @@ -2252,7 +2252,7 @@ xfs_agf_write_verify( struct xfs_buf_log_item *bip = bp->b_fspriv; if (!xfs_agf_verify(mp, bp)) { - xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_buf_ioerror(bp, -EFSCORRUPTED); xfs_verifier_error(bp); return; } @@ -2601,11 +2601,11 @@ xfs_free_extent( */ args.agno = XFS_FSB_TO_AGNO(args.mp, bno); if (args.agno >= args.mp->m_sb.sb_agcount) - return EFSCORRUPTED; + return -EFSCORRUPTED; args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno); if (args.agbno >= args.mp->m_sb.sb_agblocks) - return EFSCORRUPTED; + return -EFSCORRUPTED; args.pag = xfs_perag_get(args.mp, args.agno); ASSERT(args.pag); @@ -2617,7 +2617,7 @@ xfs_free_extent( /* validate the extent size is legal now we have the agf locked */ if (args.agbno + len > be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)) { - error = EFSCORRUPTED; + error = -EFSCORRUPTED; goto error0; } diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index feacb061bab7..feacb061bab7 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 8358f1ded94d..e0e83e24d3ef 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -355,9 +355,9 @@ xfs_allocbt_read_verify( struct xfs_buf *bp) { if (!xfs_btree_sblock_verify_crc(bp)) - xfs_buf_ioerror(bp, EFSBADCRC); + xfs_buf_ioerror(bp, -EFSBADCRC); else if (!xfs_allocbt_verify(bp)) - xfs_buf_ioerror(bp, EFSCORRUPTED); + 
xfs_buf_ioerror(bp, -EFSCORRUPTED); if (bp->b_error) { trace_xfs_btree_corrupt(bp, _RET_IP_); @@ -371,7 +371,7 @@ xfs_allocbt_write_verify( { if (!xfs_allocbt_verify(bp)) { trace_xfs_btree_corrupt(bp, _RET_IP_); - xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_buf_ioerror(bp, -EFSCORRUPTED); xfs_verifier_error(bp); return; } diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h index 45e189e7e81c..45e189e7e81c 100644 --- a/fs/xfs/xfs_alloc_btree.h +++ b/fs/xfs/libxfs/xfs_alloc_btree.h diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index bfe36fc2cdc2..353fb425faef 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -85,7 +85,7 @@ xfs_attr_args_init( { if (!name) - return EINVAL; + return -EINVAL; memset(args, 0, sizeof(*args)); args->geo = dp->i_mount->m_attr_geo; @@ -95,7 +95,7 @@ xfs_attr_args_init( args->name = name; args->namelen = strlen((const char *)name); if (args->namelen >= MAXNAMELEN) - return EFAULT; /* match IRIX behaviour */ + return -EFAULT; /* match IRIX behaviour */ args->hashval = xfs_da_hashname(args->name, args->namelen); return 0; @@ -131,10 +131,10 @@ xfs_attr_get( XFS_STATS_INC(xs_attr_get); if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return EIO; + return -EIO; if (!xfs_inode_hasattr(ip)) - return ENOATTR; + return -ENOATTR; error = xfs_attr_args_init(&args, ip, name, flags); if (error) @@ -145,7 +145,7 @@ xfs_attr_get( lock_mode = xfs_ilock_attr_map_shared(ip); if (!xfs_inode_hasattr(ip)) - error = ENOATTR; + error = -ENOATTR; else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) error = xfs_attr_shortform_getvalue(&args); else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK)) @@ -155,7 +155,7 @@ xfs_attr_get( xfs_iunlock(ip, lock_mode); *valuelenp = args.valuelen; - return error == EEXIST ? 0 : error; + return error == -EEXIST ? 0 : error; } /* @@ -213,7 +213,7 @@ xfs_attr_set( XFS_STATS_INC(xs_attr_set); if (XFS_FORCED_SHUTDOWN(dp->i_mount)) - return EIO; + return -EIO; error = xfs_attr_args_init(&args, dp, name, flags); if (error) @@ -304,7 +304,7 @@ xfs_attr_set( * the inode. */ error = xfs_attr_shortform_addname(&args); - if (error != ENOSPC) { + if (error != -ENOSPC) { /* * Commit the shortform mods, and we're done. * NOTE: this is also the error path (EEXIST, etc). 
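
From here on the XFS hunks are dominated by one mechanical conversion: internal error codes switch from positive values, historically wrapped in the since-removed XFS_ERROR() macro, to the kernel-standard negative errno convention, so results can be handed straight to the VFS without sign fix-ups, and every comparison flips sign with them. Schematically (EFSCORRUPTED is XFS's private errno from the fs/xfs headers):

#include <linux/errno.h>

/* after the conversion, helpers return 0 or a negative errno ... */
static int example_check(unsigned int freeblks, unsigned int length)
{
	if (freeblks > length)
		return -EFSCORRUPTED;	/* was: return XFS_ERROR(EFSCORRUPTED); */
	return 0;
}

/* ... and callers compare against the negative value directly */
static int example_caller(unsigned int freeblks, unsigned int length)
{
	int error = example_check(freeblks, length);

	if (error == -EFSCORRUPTED)	/* was: error == EFSCORRUPTED */
		return error;
	return 0;
}
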
@@ -419,10 +419,10 @@ xfs_attr_remove( XFS_STATS_INC(xs_attr_remove); if (XFS_FORCED_SHUTDOWN(dp->i_mount)) - return EIO; + return -EIO; if (!xfs_inode_hasattr(dp)) - return ENOATTR; + return -ENOATTR; error = xfs_attr_args_init(&args, dp, name, flags); if (error) @@ -477,7 +477,7 @@ xfs_attr_remove( xfs_trans_ijoin(args.trans, dp, 0); if (!xfs_inode_hasattr(dp)) { - error = XFS_ERROR(ENOATTR); + error = -ENOATTR; } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { ASSERT(dp->i_afp->if_flags & XFS_IFINLINE); error = xfs_attr_shortform_remove(&args); @@ -534,28 +534,28 @@ xfs_attr_shortform_addname(xfs_da_args_t *args) trace_xfs_attr_sf_addname(args); retval = xfs_attr_shortform_lookup(args); - if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) { - return(retval); - } else if (retval == EEXIST) { + if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) { + return retval; + } else if (retval == -EEXIST) { if (args->flags & ATTR_CREATE) - return(retval); + return retval; retval = xfs_attr_shortform_remove(args); ASSERT(retval == 0); } if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX || args->valuelen >= XFS_ATTR_SF_ENTSIZE_MAX) - return(XFS_ERROR(ENOSPC)); + return -ENOSPC; newsize = XFS_ATTR_SF_TOTSIZE(args->dp); newsize += XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen); forkoff = xfs_attr_shortform_bytesfit(args->dp, newsize); if (!forkoff) - return(XFS_ERROR(ENOSPC)); + return -ENOSPC; xfs_attr_shortform_add(args, forkoff); - return(0); + return 0; } @@ -592,10 +592,10 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * the given flags produce an error or call for an atomic rename. */ retval = xfs_attr3_leaf_lookup_int(bp, args); - if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) { + if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) { xfs_trans_brelse(args->trans, bp); return retval; - } else if (retval == EEXIST) { + } else if (retval == -EEXIST) { if (args->flags & ATTR_CREATE) { /* pure create op */ xfs_trans_brelse(args->trans, bp); return retval; @@ -626,7 +626,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * if required. */ retval = xfs_attr3_leaf_add(bp, args); - if (retval == ENOSPC) { + if (retval == -ENOSPC) { /* * Promote the attribute list to the Btree format, then * Commit that transaction so that the node_addname() call @@ -642,7 +642,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); - return(error); + return error; } /* @@ -658,13 +658,13 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) */ error = xfs_trans_roll(&args->trans, dp); if (error) - return (error); + return error; /* * Fob the whole rest of the problem off on the Btree code. 
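
The attr add paths above read naturally once the lookup convention is spelled out: xfs_attr_shortform_lookup() and xfs_attr3_leaf_lookup_int() report "name exists" as -EEXIST and "name absent" as -ENOATTR, and the ATTR_REPLACE/ATTR_CREATE flags decide which of the two is fatal. A condensed sketch of that decision; the example_* helpers, struct, and EX_* flags are hypothetical, and ENOATTR is XFS's private alias for ENODATA:

#include <linux/errno.h>

struct example_args { int flags; };

#define EX_ATTR_CREATE	0x1	/* fail if the attr already exists */
#define EX_ATTR_REPLACE	0x2	/* fail if the attr does not exist */

int example_lookup(struct example_args *args);	/* -EEXIST or -ENOATTR */
int example_remove(struct example_args *args);	/* hypothetical */
int example_add(struct example_args *args);	/* hypothetical */

static int example_addname(struct example_args *args)
{
	int retval = example_lookup(args);

	if ((args->flags & EX_ATTR_REPLACE) && retval == -ENOATTR)
		return retval;			/* nothing to replace */
	if (retval == -EEXIST) {
		if (args->flags & EX_ATTR_CREATE)
			return retval;		/* pure create must not clobber */
		retval = example_remove(args);	/* replace = remove, then add */
		if (retval)
			return retval;
	}
	return example_add(args);
}
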
*/ error = xfs_attr_node_addname(args); - return(error); + return error; } /* @@ -673,7 +673,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) */ error = xfs_trans_roll(&args->trans, dp); if (error) - return (error); + return error; /* * If there was an out-of-line value, allocate the blocks we @@ -684,7 +684,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) if (args->rmtblkno > 0) { error = xfs_attr_rmtval_set(args); if (error) - return(error); + return error; } /* @@ -700,7 +700,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) */ error = xfs_attr3_leaf_flipflags(args); if (error) - return(error); + return error; /* * Dismantle the "old" attribute/value pair by removing @@ -714,7 +714,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) if (args->rmtblkno) { error = xfs_attr_rmtval_remove(args); if (error) - return(error); + return error; } /* @@ -744,7 +744,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); - return(error); + return error; } /* @@ -795,7 +795,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) return error; error = xfs_attr3_leaf_lookup_int(bp, args); - if (error == ENOATTR) { + if (error == -ENOATTR) { xfs_trans_brelse(args->trans, bp); return error; } @@ -850,7 +850,7 @@ xfs_attr_leaf_get(xfs_da_args_t *args) return error; error = xfs_attr3_leaf_lookup_int(bp, args); - if (error != EEXIST) { + if (error != -EEXIST) { xfs_trans_brelse(args->trans, bp); return error; } @@ -906,9 +906,9 @@ restart: goto out; blk = &state->path.blk[ state->path.active-1 ]; ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); - if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) { + if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) { goto out; - } else if (retval == EEXIST) { + } else if (retval == -EEXIST) { if (args->flags & ATTR_CREATE) goto out; @@ -933,7 +933,7 @@ restart: } retval = xfs_attr3_leaf_add(blk->bp, state->args); - if (retval == ENOSPC) { + if (retval == -ENOSPC) { if (state->path.active == 1) { /* * Its really a single leaf node, but it had @@ -1031,7 +1031,7 @@ restart: if (args->rmtblkno > 0) { error = xfs_attr_rmtval_set(args); if (error) - return(error); + return error; } /* @@ -1061,7 +1061,7 @@ restart: if (args->rmtblkno) { error = xfs_attr_rmtval_remove(args); if (error) - return(error); + return error; } /* @@ -1134,8 +1134,8 @@ out: if (state) xfs_da_state_free(state); if (error) - return(error); - return(retval); + return error; + return retval; } /* @@ -1168,7 +1168,7 @@ xfs_attr_node_removename(xfs_da_args_t *args) * Search to see if name exists, and get back a pointer to it. 
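
Several of the paths above interleave "error = xfs_trans_roll(&args->trans, dp); if (error) return error;" between steps. At this point in the series xfs_trans_roll() commits the current transaction and hands back a fresh one with the inode still locked and joined, so a long logical operation (convert the fork, add the name, set the remote value) becomes a chain of individually atomic log transactions. The shape of such a sequence, with hypothetical example_step_*() helpers:

struct xfs_trans;
struct xfs_inode;
int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);

int example_step_convert(struct xfs_trans **tpp, struct xfs_inode *dp);	/* hypothetical */
int example_step_add(struct xfs_trans **tpp, struct xfs_inode *dp);	/* hypothetical */

static int example_multi_step(struct xfs_trans **tpp, struct xfs_inode *dp)
{
	int error;

	error = example_step_convert(tpp, dp);
	if (error)
		return error;

	/* commit step one; *tpp now points at a new transaction with dp
	 * still joined, so step two starts from a clean reservation */
	error = xfs_trans_roll(tpp, dp);
	if (error)
		return error;

	return example_step_add(tpp, dp);
}
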
*/ error = xfs_da3_node_lookup_int(state, &retval); - if (error || (retval != EEXIST)) { + if (error || (retval != -EEXIST)) { if (error == 0) error = retval; goto out; @@ -1297,7 +1297,7 @@ xfs_attr_node_removename(xfs_da_args_t *args) out: xfs_da_state_free(state); - return(error); + return error; } /* @@ -1345,7 +1345,7 @@ xfs_attr_fillstate(xfs_da_state_t *state) } } - return(0); + return 0; } /* @@ -1376,7 +1376,7 @@ xfs_attr_refillstate(xfs_da_state_t *state) blk->blkno, blk->disk_blkno, &blk->bp, XFS_ATTR_FORK); if (error) - return(error); + return error; } else { blk->bp = NULL; } @@ -1395,13 +1395,13 @@ xfs_attr_refillstate(xfs_da_state_t *state) blk->blkno, blk->disk_blkno, &blk->bp, XFS_ATTR_FORK); if (error) - return(error); + return error; } else { blk->bp = NULL; } } - return(0); + return 0; } /* @@ -1431,7 +1431,7 @@ xfs_attr_node_get(xfs_da_args_t *args) error = xfs_da3_node_lookup_int(state, &retval); if (error) { retval = error; - } else if (retval == EEXIST) { + } else if (retval == -EEXIST) { blk = &state->path.blk[ state->path.active-1 ]; ASSERT(blk->bp != NULL); ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); @@ -1455,5 +1455,5 @@ xfs_attr_node_get(xfs_da_args_t *args) } xfs_da_state_free(state); - return(retval); + return retval; } diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 28712d29e43c..b1f73dbbf3d8 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -214,7 +214,7 @@ xfs_attr3_leaf_write_verify( struct xfs_attr3_leaf_hdr *hdr3 = bp->b_addr; if (!xfs_attr3_leaf_verify(bp)) { - xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_buf_ioerror(bp, -EFSCORRUPTED); xfs_verifier_error(bp); return; } @@ -242,9 +242,9 @@ xfs_attr3_leaf_read_verify( if (xfs_sb_version_hascrc(&mp->m_sb) && !xfs_buf_verify_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF)) - xfs_buf_ioerror(bp, EFSBADCRC); + xfs_buf_ioerror(bp, -EFSBADCRC); else if (!xfs_attr3_leaf_verify(bp)) - xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_buf_ioerror(bp, -EFSCORRUPTED); if (bp->b_error) xfs_verifier_error(bp); @@ -547,7 +547,7 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) break; } if (i == end) - return(XFS_ERROR(ENOATTR)); + return -ENOATTR; /* * Fix up the attribute fork data, covering the hole @@ -582,7 +582,7 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) xfs_sbversion_add_attr2(mp, args->trans); - return(0); + return 0; } /* @@ -611,9 +611,9 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args) continue; if (!xfs_attr_namesp_match(args->flags, sfe->flags)) continue; - return(XFS_ERROR(EEXIST)); + return -EEXIST; } - return(XFS_ERROR(ENOATTR)); + return -ENOATTR; } /* @@ -640,18 +640,18 @@ xfs_attr_shortform_getvalue(xfs_da_args_t *args) continue; if (args->flags & ATTR_KERNOVAL) { args->valuelen = sfe->valuelen; - return(XFS_ERROR(EEXIST)); + return -EEXIST; } if (args->valuelen < sfe->valuelen) { args->valuelen = sfe->valuelen; - return(XFS_ERROR(ERANGE)); + return -ERANGE; } args->valuelen = sfe->valuelen; memcpy(args->value, &sfe->nameval[args->namelen], args->valuelen); - return(XFS_ERROR(EEXIST)); + return -EEXIST; } - return(XFS_ERROR(ENOATTR)); + return -ENOATTR; } /* @@ -691,7 +691,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args) * If we hit an IO error middle of the transaction inside * grow_inode(), we may have inconsistent data. Bail out. 
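
xfs_attr_shortform_getvalue() above (and xfs_attr3_leaf_getvalue() further down) follow a classic sizing protocol: the caller passes its buffer size in args->valuelen; if the stored value is larger, the routine writes back the required size and returns -ERANGE so the caller can reallocate and retry, while a successful hit is reported as -EEXIST per the lookup convention. Reduced to its essentials as a hedged sketch:

#include <linux/errno.h>
#include <linux/string.h>

static int example_getvalue(void *dst, int *valuelen,
			    const void *src, int srclen)
{
	if (*valuelen < srclen) {
		*valuelen = srclen;	/* report the size actually needed */
		return -ERANGE;		/* caller enlarges the buffer, retries */
	}
	*valuelen = srclen;
	memcpy(dst, src, srclen);
	return -EEXIST;			/* "found", in this API's convention */
}
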
*/ - if (error == EIO) + if (error == -EIO) goto out; xfs_idata_realloc(dp, size, XFS_ATTR_FORK); /* try to put */ memcpy(ifp->if_u1.if_data, tmpbuffer, size); /* it back */ @@ -730,9 +730,9 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args) sfe->namelen); nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags); error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */ - ASSERT(error == ENOATTR); + ASSERT(error == -ENOATTR); error = xfs_attr3_leaf_add(bp, &nargs); - ASSERT(error != ENOSPC); + ASSERT(error != -ENOSPC); if (error) goto out; sfe = XFS_ATTR_SF_NEXTENTRY(sfe); @@ -741,7 +741,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args) out: kmem_free(tmpbuffer); - return(error); + return error; } /* @@ -769,12 +769,12 @@ xfs_attr_shortform_allfit( if (entry->flags & XFS_ATTR_INCOMPLETE) continue; /* don't copy partial entries */ if (!(entry->flags & XFS_ATTR_LOCAL)) - return(0); + return 0; name_loc = xfs_attr3_leaf_name_local(leaf, i); if (name_loc->namelen >= XFS_ATTR_SF_ENTSIZE_MAX) - return(0); + return 0; if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX) - return(0); + return 0; bytes += sizeof(struct xfs_attr_sf_entry) - 1 + name_loc->namelen + be16_to_cpu(name_loc->valuelen); @@ -809,7 +809,7 @@ xfs_attr3_leaf_to_shortform( tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP); if (!tmpbuffer) - return ENOMEM; + return -ENOMEM; memcpy(tmpbuffer, bp->b_addr, args->geo->blksize); @@ -1017,10 +1017,10 @@ xfs_attr3_leaf_split( ASSERT(oldblk->magic == XFS_ATTR_LEAF_MAGIC); error = xfs_da_grow_inode(state->args, &blkno); if (error) - return(error); + return error; error = xfs_attr3_leaf_create(state->args, blkno, &newblk->bp); if (error) - return(error); + return error; newblk->blkno = blkno; newblk->magic = XFS_ATTR_LEAF_MAGIC; @@ -1031,7 +1031,7 @@ xfs_attr3_leaf_split( xfs_attr3_leaf_rebalance(state, oldblk, newblk); error = xfs_da3_blk_link(state, oldblk, newblk); if (error) - return(error); + return error; /* * Save info on "old" attribute for "atomic rename" ops, leaf_add() @@ -1053,7 +1053,7 @@ xfs_attr3_leaf_split( */ oldblk->hashval = xfs_attr_leaf_lasthash(oldblk->bp, NULL); newblk->hashval = xfs_attr_leaf_lasthash(newblk->bp, NULL); - return(error); + return error; } /* @@ -1108,7 +1108,7 @@ xfs_attr3_leaf_add( * no good and we should just give up. */ if (!ichdr.holes && sum < entsize) - return XFS_ERROR(ENOSPC); + return -ENOSPC; /* * Compact the entries to coalesce free space. @@ -1121,7 +1121,7 @@ xfs_attr3_leaf_add( * free region, in freemap[0]. If it is not big enough, give up. 
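
xfs_attr3_leaf_add() here reports "entry does not fit" as -ENOSPC, and that is exactly what drives the attribute fork through its on-disk formats: shortform until it overflows, then a single leaf block, then a full B-tree, with the caller catching -ENOSPC at each level and promoting before retrying. Schematically, with all example_* names hypothetical:

#include <linux/errno.h>

int example_shortform_add(void);	/* -ENOSPC if the entry won't fit inline */
int example_shortform_to_leaf(void);	/* hypothetical fork promotion */
int example_leaf_add(void);		/* may in turn report -ENOSPC */

static int example_attr_set(void)
{
	int error = example_shortform_add();

	if (error != -ENOSPC)
		return error;		/* done, or a genuine failure */

	/* too big for the inline format: promote, then retry the add */
	error = example_shortform_to_leaf();
	if (error)
		return error;
	return example_leaf_add();
}
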
*/ if (ichdr.freemap[0].size < (entsize + sizeof(xfs_attr_leaf_entry_t))) { - tmp = ENOSPC; + tmp = -ENOSPC; goto out_log_hdr; } @@ -1692,7 +1692,7 @@ xfs_attr3_leaf_toosmall( ichdr.usedbytes; if (bytes > (state->args->geo->blksize >> 1)) { *action = 0; /* blk over 50%, don't try to join */ - return(0); + return 0; } /* @@ -1711,7 +1711,7 @@ xfs_attr3_leaf_toosmall( error = xfs_da3_path_shift(state, &state->altpath, forward, 0, &retval); if (error) - return(error); + return error; if (retval) { *action = 0; } else { @@ -1740,7 +1740,7 @@ xfs_attr3_leaf_toosmall( error = xfs_attr3_leaf_read(state->args->trans, state->args->dp, blkno, -1, &bp); if (error) - return(error); + return error; xfs_attr3_leaf_hdr_from_disk(&ichdr2, bp->b_addr); @@ -1757,7 +1757,7 @@ xfs_attr3_leaf_toosmall( } if (i >= 2) { *action = 0; - return(0); + return 0; } /* @@ -1773,13 +1773,13 @@ xfs_attr3_leaf_toosmall( 0, &retval); } if (error) - return(error); + return error; if (retval) { *action = 0; } else { *action = 1; } - return(0); + return 0; } /* @@ -2123,7 +2123,7 @@ xfs_attr3_leaf_lookup_int( } if (probe == ichdr.count || be32_to_cpu(entry->hashval) != hashval) { args->index = probe; - return XFS_ERROR(ENOATTR); + return -ENOATTR; } /* @@ -2152,7 +2152,7 @@ xfs_attr3_leaf_lookup_int( if (!xfs_attr_namesp_match(args->flags, entry->flags)) continue; args->index = probe; - return XFS_ERROR(EEXIST); + return -EEXIST; } else { name_rmt = xfs_attr3_leaf_name_remote(leaf, probe); if (name_rmt->namelen != args->namelen) @@ -2168,11 +2168,11 @@ xfs_attr3_leaf_lookup_int( args->rmtblkcnt = xfs_attr3_rmt_blocks( args->dp->i_mount, args->rmtvaluelen); - return XFS_ERROR(EEXIST); + return -EEXIST; } } args->index = probe; - return XFS_ERROR(ENOATTR); + return -ENOATTR; } /* @@ -2208,7 +2208,7 @@ xfs_attr3_leaf_getvalue( } if (args->valuelen < valuelen) { args->valuelen = valuelen; - return XFS_ERROR(ERANGE); + return -ERANGE; } args->valuelen = valuelen; memcpy(args->value, &name_loc->nameval[args->namelen], valuelen); @@ -2226,7 +2226,7 @@ xfs_attr3_leaf_getvalue( } if (args->valuelen < args->rmtvaluelen) { args->valuelen = args->rmtvaluelen; - return XFS_ERROR(ERANGE); + return -ERANGE; } args->valuelen = args->rmtvaluelen; } @@ -2481,7 +2481,7 @@ xfs_attr3_leaf_clearflag( */ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); if (error) - return(error); + return error; leaf = bp->b_addr; entry = &xfs_attr3_leaf_entryp(leaf)[args->index]; @@ -2548,7 +2548,7 @@ xfs_attr3_leaf_setflag( */ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); if (error) - return(error); + return error; leaf = bp->b_addr; #ifdef DEBUG diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index e2929da7c3ba..e2929da7c3ba 100644 --- a/fs/xfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index b5adfecbb8ee..7510ab8058a4 100644 --- a/fs/xfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -138,11 +138,11 @@ xfs_attr3_rmt_read_verify( while (len > 0) { if (!xfs_verify_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF)) { - xfs_buf_ioerror(bp, EFSBADCRC); + xfs_buf_ioerror(bp, -EFSBADCRC); break; } if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) { - xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_buf_ioerror(bp, -EFSCORRUPTED); break; } len -= blksize; @@ -178,7 +178,7 @@ xfs_attr3_rmt_write_verify( while (len > 0) { if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) { - xfs_buf_ioerror(bp, EFSCORRUPTED); + 
xfs_buf_ioerror(bp, -EFSCORRUPTED); xfs_verifier_error(bp); return; } @@ -257,7 +257,7 @@ xfs_attr_rmtval_copyout( xfs_alert(mp, "remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)", bno, *offset, byte_cnt, ino); - return EFSCORRUPTED; + return -EFSCORRUPTED; } hdr_size = sizeof(struct xfs_attr3_rmt_hdr); } @@ -452,7 +452,7 @@ xfs_attr_rmtval_set( ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); - return(error); + return error; } /* @@ -473,7 +473,7 @@ xfs_attr_rmtval_set( */ error = xfs_trans_roll(&args->trans, dp); if (error) - return (error); + return error; } /* @@ -498,7 +498,7 @@ xfs_attr_rmtval_set( blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK); if (error) - return(error); + return error; ASSERT(nmap == 1); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && (map.br_startblock != HOLESTARTBLOCK)); @@ -508,7 +508,7 @@ xfs_attr_rmtval_set( bp = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, 0); if (!bp) - return ENOMEM; + return -ENOMEM; bp->b_ops = &xfs_attr3_rmt_buf_ops; xfs_attr_rmtval_copyin(mp, bp, args->dp->i_ino, &offset, @@ -563,7 +563,7 @@ xfs_attr_rmtval_remove( error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK); if (error) - return(error); + return error; ASSERT(nmap == 1); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && (map.br_startblock != HOLESTARTBLOCK)); @@ -622,7 +622,7 @@ xfs_attr_rmtval_remove( */ error = xfs_trans_roll(&args->trans, args->dp); if (error) - return (error); + return error; } - return(0); + return 0; } diff --git a/fs/xfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h index 5a9acfa156d7..5a9acfa156d7 100644 --- a/fs/xfs/xfs_attr_remote.h +++ b/fs/xfs/libxfs/xfs_attr_remote.h diff --git a/fs/xfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h index 919756e3ba53..919756e3ba53 100644 --- a/fs/xfs/xfs_attr_sf.h +++ b/fs/xfs/libxfs/xfs_attr_sf.h diff --git a/fs/xfs/xfs_bit.h b/fs/xfs/libxfs/xfs_bit.h index e1649c0d3e02..e1649c0d3e02 100644 --- a/fs/xfs/xfs_bit.h +++ b/fs/xfs/libxfs/xfs_bit.h diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 75c3fe5f3d9d..86df952d3e24 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -392,7 +392,7 @@ xfs_bmap_check_leaf_extents( pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); bno = be64_to_cpu(*pp); - ASSERT(bno != NULLDFSBNO); + ASSERT(bno != NULLFSBLOCK); ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); @@ -1033,7 +1033,7 @@ xfs_bmap_add_attrfork_btree( goto error0; if (stat == 0) { xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); - return XFS_ERROR(ENOSPC); + return -ENOSPC; } *firstblock = cur->bc_private.b.firstblock; cur->bc_private.b.allocated = 0; @@ -1115,7 +1115,7 @@ xfs_bmap_add_attrfork_local( /* should only be called for types that support local format data */ ASSERT(0); - return EFSCORRUPTED; + return -EFSCORRUPTED; } /* @@ -1192,7 +1192,7 @@ xfs_bmap_add_attrfork( break; default: ASSERT(0); - error = XFS_ERROR(EINVAL); + error = -EINVAL; goto trans_cancel; } @@ -1299,7 +1299,7 @@ xfs_bmap_read_extents( ASSERT(level > 0); pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); bno = be64_to_cpu(*pp); - ASSERT(bno != NULLDFSBNO); + ASSERT(bno != NULLFSBLOCK); ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); /* @@ -1399,7 +1399,7 @@ xfs_bmap_read_extents( return 0; error0: xfs_trans_brelse(tp, bp); - return XFS_ERROR(EFSCORRUPTED); + return 
-EFSCORRUPTED; } @@ -1429,11 +1429,7 @@ xfs_bmap_search_multi_extents( gotp->br_startoff = 0xffa5a5a5a5a5a5a5LL; gotp->br_blockcount = 0xa55a5a5a5a5a5a5aLL; gotp->br_state = XFS_EXT_INVALID; -#if XFS_BIG_BLKNOS gotp->br_startblock = 0xffffa5a5a5a5a5a5LL; -#else - gotp->br_startblock = 0xffffa5a5; -#endif prevp->br_startoff = NULLFILEOFF; ep = xfs_iext_bno_to_ext(ifp, bno, &lastx); @@ -1576,7 +1572,7 @@ xfs_bmap_last_before( if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) - return XFS_ERROR(EIO); + return -EIO; if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { *last_block = 0; return 0; @@ -1690,7 +1686,7 @@ xfs_bmap_last_offset( if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) - return XFS_ERROR(EIO); + return -EIO; error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty); if (error || is_empty) @@ -3323,7 +3319,7 @@ xfs_bmap_extsize_align( if (orig_off < align_off || orig_end > align_off + align_alen || align_alen - temp < orig_alen) - return XFS_ERROR(EINVAL); + return -EINVAL; /* * Try to fix it by moving the start up. */ @@ -3348,7 +3344,7 @@ xfs_bmap_extsize_align( * Result doesn't cover the request, fail it. */ if (orig_off < align_off || orig_end > align_off + align_alen) - return XFS_ERROR(EINVAL); + return -EINVAL; } else { ASSERT(orig_off >= align_off); ASSERT(orig_end <= align_off + align_alen); @@ -4051,11 +4047,11 @@ xfs_bmapi_read( XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { XFS_ERROR_REPORT("xfs_bmapi_read", XFS_ERRLEVEL_LOW, mp); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); + return -EIO; XFS_STATS_INC(xs_blk_mapr); @@ -4246,11 +4242,11 @@ xfs_bmapi_delay( XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE), mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { XFS_ERROR_REPORT("xfs_bmapi_delay", XFS_ERRLEVEL_LOW, mp); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); + return -EIO; XFS_STATS_INC(xs_blk_mapw); @@ -4469,7 +4465,7 @@ xfs_bmapi_convert_unwritten( * so generate another request. */ if (mval->br_blockcount < len) - return EAGAIN; + return -EAGAIN; return 0; } @@ -4540,11 +4536,11 @@ xfs_bmapi_write( XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { XFS_ERROR_REPORT("xfs_bmapi_write", XFS_ERRLEVEL_LOW, mp); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); + return -EIO; ifp = XFS_IFORK_PTR(ip, whichfork); @@ -4620,7 +4616,7 @@ xfs_bmapi_write( /* Execute unwritten extent conversion if necessary */ error = xfs_bmapi_convert_unwritten(&bma, mval, len, flags); - if (error == EAGAIN) + if (error == -EAGAIN) continue; if (error) goto error0; @@ -4922,7 +4918,7 @@ xfs_bmap_del_extent( goto done; cur->bc_rec.b = new; error = xfs_btree_insert(cur, &i); - if (error && error != ENOSPC) + if (error && error != -ENOSPC) goto done; /* * If get no-space back from btree insert, @@ -4930,7 +4926,7 @@ xfs_bmap_del_extent( * block reservation. * Fix up our state and return the error. */ - if (error == ENOSPC) { + if (error == -ENOSPC) { /* * Reset the cursor, don't trust * it after any insert operation. 
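The hunks above all make the same mechanical change: XFS's old IRIX-style positive error codes, usually wrapped in XFS_ERROR(), become standard negative errnos, so every return site and every comparison flips sign. Below is a minimal userspace sketch of the convention being adopted; all demo_* names are invented for illustration and none of this is real XFS code.

/*
 * Helpers return 0 on success and a negative errno on failure, so
 * callers compare against -ENOSPC, -EIO, etc. rather than the old
 * positive, XFS_ERROR()-wrapped values.
 */
#include <errno.h>
#include <stdio.h>

static int demo_insert_entry(int blocks_free)
{
	if (blocks_free == 0)
		return -ENOSPC;	/* was: return XFS_ERROR(ENOSPC); */
	return 0;
}

int main(void)
{
	int error = demo_insert_entry(0);

	if (error == -ENOSPC)	/* was: if (error == ENOSPC) */
		printf("out of space: %d\n", error);
	return 0;
}

The sign flip is also why the assertions change in lockstep: an ASSERT(error != ENOSPC) left unconverted would silently pass even in the case it was meant to catch, since the error variable now holds -ENOSPC.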
@@ -4958,7 +4954,7 @@ xfs_bmap_del_extent( xfs_bmbt_set_blockcount(ep, got.br_blockcount); flags = 0; - error = XFS_ERROR(ENOSPC); + error = -ENOSPC; goto done; } XFS_WANT_CORRUPTED_GOTO(i == 1, done); @@ -5076,11 +5072,11 @@ xfs_bunmapi( XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) { XFS_ERROR_REPORT("xfs_bunmapi", XFS_ERRLEVEL_LOW, ip->i_mount); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } mp = ip->i_mount; if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); + return -EIO; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(len > 0); @@ -5325,7 +5321,7 @@ xfs_bunmapi( del.br_startoff > got.br_startoff && del.br_startoff + del.br_blockcount < got.br_startoff + got.br_blockcount) { - error = XFS_ERROR(ENOSPC); + error = -ENOSPC; goto error0; } error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del, @@ -5428,7 +5424,7 @@ xfs_bmap_shift_extents( struct xfs_bmap_free *flist, int num_exts) { - struct xfs_btree_cur *cur; + struct xfs_btree_cur *cur = NULL; struct xfs_bmbt_rec_host *gotp; struct xfs_bmbt_irec got; struct xfs_bmbt_irec left; @@ -5439,7 +5435,7 @@ xfs_bmap_shift_extents( int error = 0; int i; int whichfork = XFS_DATA_FORK; - int logflags; + int logflags = 0; xfs_filblks_t blockcount = 0; int total_extents; @@ -5449,11 +5445,11 @@ xfs_bmap_shift_extents( mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { XFS_ERROR_REPORT("xfs_bmap_shift_extents", XFS_ERRLEVEL_LOW, mp); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); + return -EIO; ASSERT(current_ext != NULL); @@ -5482,16 +5478,11 @@ xfs_bmap_shift_extents( } } - /* We are going to change core inode */ - logflags = XFS_ILOG_CORE; if (ifp->if_flags & XFS_IFBROOT) { cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); cur->bc_private.b.firstblock = *firstblock; cur->bc_private.b.flist = flist; cur->bc_private.b.flags = 0; - } else { - cur = NULL; - logflags |= XFS_ILOG_DEXT; } /* @@ -5516,14 +5507,14 @@ xfs_bmap_shift_extents( *current_ext - 1), &left); if (startoff < left.br_startoff + left.br_blockcount) - error = XFS_ERROR(EINVAL); + error = -EINVAL; } else if (offset_shift_fsb > got.br_startoff) { /* * When first extent is shifted, offset_shift_fsb * should be less than the stating offset of * the first extent. */ - error = XFS_ERROR(EINVAL); + error = -EINVAL; } if (error) @@ -5549,11 +5540,14 @@ xfs_bmap_shift_extents( blockcount = left.br_blockcount + got.br_blockcount; xfs_iext_remove(ip, *current_ext, 1, 0); + logflags |= XFS_ILOG_CORE; if (cur) { error = xfs_btree_delete(cur, &i); if (error) goto del_cursor; XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor); + } else { + logflags |= XFS_ILOG_DEXT; } XFS_IFORK_NEXT_SET(ip, whichfork, XFS_IFORK_NEXTENTS(ip, whichfork) - 1); @@ -5579,6 +5573,7 @@ xfs_bmap_shift_extents( got.br_startoff = startoff; } + logflags |= XFS_ILOG_CORE; if (cur) { error = xfs_bmbt_update(cur, got.br_startoff, got.br_startblock, @@ -5586,6 +5581,8 @@ xfs_bmap_shift_extents( got.br_state); if (error) goto del_cursor; + } else { + logflags |= XFS_ILOG_DEXT; } (*current_ext)++; @@ -5601,6 +5598,7 @@ del_cursor: xfs_btree_del_cursor(cur, error ? 
XFS_BTREE_ERROR : XFS_BTREE_NOERROR); - xfs_trans_log_inode(tp, ip, logflags); + if (logflags) + xfs_trans_log_inode(tp, ip, logflags); return error; } diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index b879ca56a64c..b879ca56a64c 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 948836c4fd90..fba753308f31 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -111,23 +111,8 @@ __xfs_bmbt_get_all( ext_flag = (int)(l0 >> (64 - BMBT_EXNTFLAG_BITLEN)); s->br_startoff = ((xfs_fileoff_t)l0 & xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9; -#if XFS_BIG_BLKNOS s->br_startblock = (((xfs_fsblock_t)l0 & xfs_mask64lo(9)) << 43) | (((xfs_fsblock_t)l1) >> 21); -#else -#ifdef DEBUG - { - xfs_dfsbno_t b; - - b = (((xfs_dfsbno_t)l0 & xfs_mask64lo(9)) << 43) | - (((xfs_dfsbno_t)l1) >> 21); - ASSERT((b >> 32) == 0 || isnulldstartblock(b)); - s->br_startblock = (xfs_fsblock_t)b; - } -#else /* !DEBUG */ - s->br_startblock = (xfs_fsblock_t)(((xfs_dfsbno_t)l1) >> 21); -#endif /* DEBUG */ -#endif /* XFS_BIG_BLKNOS */ s->br_blockcount = (xfs_filblks_t)(l1 & xfs_mask64lo(21)); /* This is xfs_extent_state() in-line */ if (ext_flag) { @@ -163,21 +148,8 @@ xfs_fsblock_t xfs_bmbt_get_startblock( xfs_bmbt_rec_host_t *r) { -#if XFS_BIG_BLKNOS return (((xfs_fsblock_t)r->l0 & xfs_mask64lo(9)) << 43) | (((xfs_fsblock_t)r->l1) >> 21); -#else -#ifdef DEBUG - xfs_dfsbno_t b; - - b = (((xfs_dfsbno_t)r->l0 & xfs_mask64lo(9)) << 43) | - (((xfs_dfsbno_t)r->l1) >> 21); - ASSERT((b >> 32) == 0 || isnulldstartblock(b)); - return (xfs_fsblock_t)b; -#else /* !DEBUG */ - return (xfs_fsblock_t)(((xfs_dfsbno_t)r->l1) >> 21); -#endif /* DEBUG */ -#endif /* XFS_BIG_BLKNOS */ } /* @@ -241,7 +213,6 @@ xfs_bmbt_set_allf( ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0); ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0); -#if XFS_BIG_BLKNOS ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0); r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) | @@ -250,23 +221,6 @@ xfs_bmbt_set_allf( r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) | ((xfs_bmbt_rec_base_t)blockcount & (xfs_bmbt_rec_base_t)xfs_mask64lo(21)); -#else /* !XFS_BIG_BLKNOS */ - if (isnullstartblock(startblock)) { - r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) | - ((xfs_bmbt_rec_base_t)startoff << 9) | - (xfs_bmbt_rec_base_t)xfs_mask64lo(9); - r->l1 = xfs_mask64hi(11) | - ((xfs_bmbt_rec_base_t)startblock << 21) | - ((xfs_bmbt_rec_base_t)blockcount & - (xfs_bmbt_rec_base_t)xfs_mask64lo(21)); - } else { - r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) | - ((xfs_bmbt_rec_base_t)startoff << 9); - r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) | - ((xfs_bmbt_rec_base_t)blockcount & - (xfs_bmbt_rec_base_t)xfs_mask64lo(21)); - } -#endif /* XFS_BIG_BLKNOS */ } /* @@ -298,8 +252,6 @@ xfs_bmbt_disk_set_allf( ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN); ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0); ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0); - -#if XFS_BIG_BLKNOS ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0); r->l0 = cpu_to_be64( @@ -310,26 +262,6 @@ xfs_bmbt_disk_set_allf( ((xfs_bmbt_rec_base_t)startblock << 21) | ((xfs_bmbt_rec_base_t)blockcount & (xfs_bmbt_rec_base_t)xfs_mask64lo(21))); -#else /* !XFS_BIG_BLKNOS */ - if (isnullstartblock(startblock)) { - r->l0 = cpu_to_be64( - ((xfs_bmbt_rec_base_t)extent_flag << 63) | - 
((xfs_bmbt_rec_base_t)startoff << 9) | - (xfs_bmbt_rec_base_t)xfs_mask64lo(9)); - r->l1 = cpu_to_be64(xfs_mask64hi(11) | - ((xfs_bmbt_rec_base_t)startblock << 21) | - ((xfs_bmbt_rec_base_t)blockcount & - (xfs_bmbt_rec_base_t)xfs_mask64lo(21))); - } else { - r->l0 = cpu_to_be64( - ((xfs_bmbt_rec_base_t)extent_flag << 63) | - ((xfs_bmbt_rec_base_t)startoff << 9)); - r->l1 = cpu_to_be64( - ((xfs_bmbt_rec_base_t)startblock << 21) | - ((xfs_bmbt_rec_base_t)blockcount & - (xfs_bmbt_rec_base_t)xfs_mask64lo(21))); - } -#endif /* XFS_BIG_BLKNOS */ } /* @@ -365,24 +297,11 @@ xfs_bmbt_set_startblock( xfs_bmbt_rec_host_t *r, xfs_fsblock_t v) { -#if XFS_BIG_BLKNOS ASSERT((v & xfs_mask64hi(12)) == 0); r->l0 = (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64hi(55)) | (xfs_bmbt_rec_base_t)(v >> 43); r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21)) | (xfs_bmbt_rec_base_t)(v << 21); -#else /* !XFS_BIG_BLKNOS */ - if (isnullstartblock(v)) { - r->l0 |= (xfs_bmbt_rec_base_t)xfs_mask64lo(9); - r->l1 = (xfs_bmbt_rec_base_t)xfs_mask64hi(11) | - ((xfs_bmbt_rec_base_t)v << 21) | - (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21)); - } else { - r->l0 &= ~(xfs_bmbt_rec_base_t)xfs_mask64lo(9); - r->l1 = ((xfs_bmbt_rec_base_t)v << 21) | - (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21)); - } -#endif /* XFS_BIG_BLKNOS */ } /* @@ -438,8 +357,8 @@ xfs_bmbt_to_bmdr( cpu_to_be64(XFS_BUF_DADDR_NULL)); } else ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC)); - ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO)); - ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO)); + ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK)); + ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK)); ASSERT(rblock->bb_level != 0); dblock->bb_level = rblock->bb_level; dblock->bb_numrecs = rblock->bb_numrecs; @@ -554,7 +473,7 @@ xfs_bmbt_alloc_block( args.minlen = args.maxlen = args.prod = 1; args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL; if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) { - error = XFS_ERROR(ENOSPC); + error = -ENOSPC; goto error0; } error = xfs_alloc_vextent(&args); @@ -763,11 +682,11 @@ xfs_bmbt_verify( /* sibling pointer verification */ if (!block->bb_u.l.bb_leftsib || - (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLDFSBNO) && + (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) && !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_leftsib)))) return false; if (!block->bb_u.l.bb_rightsib || - (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLDFSBNO) && + (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) && !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_rightsib)))) return false; @@ -779,9 +698,9 @@ xfs_bmbt_read_verify( struct xfs_buf *bp) { if (!xfs_btree_lblock_verify_crc(bp)) - xfs_buf_ioerror(bp, EFSBADCRC); + xfs_buf_ioerror(bp, -EFSBADCRC); else if (!xfs_bmbt_verify(bp)) - xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_buf_ioerror(bp, -EFSCORRUPTED); if (bp->b_error) { trace_xfs_btree_corrupt(bp, _RET_IP_); @@ -795,7 +714,7 @@ xfs_bmbt_write_verify( { if (!xfs_bmbt_verify(bp)) { trace_xfs_btree_corrupt(bp, _RET_IP_); - xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_buf_ioerror(bp, -EFSCORRUPTED); xfs_verifier_error(bp); return; } @@ -959,7 +878,7 @@ xfs_bmbt_change_owner( cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork); if (!cur) - return ENOMEM; + return -ENOMEM; error = xfs_btree_change_owner(cur, new_owner, buffer_list); xfs_btree_del_cursor(cur, error ? 
XFS_BTREE_ERROR : XFS_BTREE_NOERROR); diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h index 819a8a4dee95..819a8a4dee95 100644 --- a/fs/xfs/xfs_bmap_btree.h +++ b/fs/xfs/libxfs/xfs_bmap_btree.h diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index cf893bc1e373..8fe6a93ff473 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -78,11 +78,11 @@ xfs_btree_check_lblock( be16_to_cpu(block->bb_numrecs) <= cur->bc_ops->get_maxrecs(cur, level) && block->bb_u.l.bb_leftsib && - (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) || + (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK) || XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_leftsib))) && block->bb_u.l.bb_rightsib && - (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) || + (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK) || XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_rightsib))); @@ -92,7 +92,7 @@ xfs_btree_check_lblock( if (bp) trace_xfs_btree_corrupt(bp, _RET_IP_); XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } return 0; } @@ -140,7 +140,7 @@ xfs_btree_check_sblock( if (bp) trace_xfs_btree_corrupt(bp, _RET_IP_); XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } return 0; } @@ -167,12 +167,12 @@ xfs_btree_check_block( int /* error (0 or EFSCORRUPTED) */ xfs_btree_check_lptr( struct xfs_btree_cur *cur, /* btree cursor */ - xfs_dfsbno_t bno, /* btree block disk address */ + xfs_fsblock_t bno, /* btree block disk address */ int level) /* btree block level */ { XFS_WANT_CORRUPTED_RETURN( level > 0 && - bno != NULLDFSBNO && + bno != NULLFSBLOCK && XFS_FSB_SANITY_CHECK(cur->bc_mp, bno)); return 0; } @@ -595,7 +595,7 @@ xfs_btree_islastblock( block = xfs_btree_get_block(cur, level, &bp); xfs_btree_check_block(cur, block, level, bp); if (cur->bc_flags & XFS_BTREE_LONG_PTRS) - return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO); + return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK); else return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK); } @@ -771,16 +771,16 @@ xfs_btree_readahead_lblock( struct xfs_btree_block *block) { int rval = 0; - xfs_dfsbno_t left = be64_to_cpu(block->bb_u.l.bb_leftsib); - xfs_dfsbno_t right = be64_to_cpu(block->bb_u.l.bb_rightsib); + xfs_fsblock_t left = be64_to_cpu(block->bb_u.l.bb_leftsib); + xfs_fsblock_t right = be64_to_cpu(block->bb_u.l.bb_rightsib); - if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) { + if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) { xfs_btree_reada_bufl(cur->bc_mp, left, 1, cur->bc_ops->buf_ops); rval++; } - if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) { + if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) { xfs_btree_reada_bufl(cur->bc_mp, right, 1, cur->bc_ops->buf_ops); rval++; @@ -852,7 +852,7 @@ xfs_btree_ptr_to_daddr( union xfs_btree_ptr *ptr) { if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { - ASSERT(ptr->l != cpu_to_be64(NULLDFSBNO)); + ASSERT(ptr->l != cpu_to_be64(NULLFSBLOCK)); return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l)); } else { @@ -900,9 +900,9 @@ xfs_btree_setbuf( b = XFS_BUF_TO_BLOCK(bp); if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { - if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO)) + if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK)) cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA; - if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO)) + if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK)) cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA; } else { if 
(b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK)) @@ -918,7 +918,7 @@ xfs_btree_ptr_is_null( union xfs_btree_ptr *ptr) { if (cur->bc_flags & XFS_BTREE_LONG_PTRS) - return ptr->l == cpu_to_be64(NULLDFSBNO); + return ptr->l == cpu_to_be64(NULLFSBLOCK); else return ptr->s == cpu_to_be32(NULLAGBLOCK); } @@ -929,7 +929,7 @@ xfs_btree_set_ptr_null( union xfs_btree_ptr *ptr) { if (cur->bc_flags & XFS_BTREE_LONG_PTRS) - ptr->l = cpu_to_be64(NULLDFSBNO); + ptr->l = cpu_to_be64(NULLFSBLOCK); else ptr->s = cpu_to_be32(NULLAGBLOCK); } @@ -997,8 +997,8 @@ xfs_btree_init_block_int( buf->bb_numrecs = cpu_to_be16(numrecs); if (flags & XFS_BTREE_LONG_PTRS) { - buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); - buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); + buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK); + buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK); if (flags & XFS_BTREE_CRC_BLOCKS) { buf->bb_u.l.bb_blkno = cpu_to_be64(blkno); buf->bb_u.l.bb_owner = cpu_to_be64(owner); @@ -1140,7 +1140,7 @@ xfs_btree_get_buf_block( mp->m_bsize, flags); if (!*bpp) - return ENOMEM; + return -ENOMEM; (*bpp)->b_ops = cur->bc_ops->buf_ops; *block = XFS_BUF_TO_BLOCK(*bpp); @@ -1498,7 +1498,7 @@ xfs_btree_increment( if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) goto out0; ASSERT(0); - error = EFSCORRUPTED; + error = -EFSCORRUPTED; goto error0; } ASSERT(lev < cur->bc_nlevels); @@ -1597,7 +1597,7 @@ xfs_btree_decrement( if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) goto out0; ASSERT(0); - error = EFSCORRUPTED; + error = -EFSCORRUPTED; goto error0; } ASSERT(lev < cur->bc_nlevels); @@ -4018,7 +4018,7 @@ xfs_btree_block_change_owner( /* now read rh sibling block for next iteration */ xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); if (xfs_btree_ptr_is_null(cur, &rptr)) - return ENOENT; + return -ENOENT; return xfs_btree_lookup_get_block(cur, level, &rptr, &block); } @@ -4061,7 +4061,7 @@ xfs_btree_change_owner( buffer_list); } while (!error); - if (error != ENOENT) + if (error != -ENOENT) return error; } diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index a04b69422f67..8f18bab73ea5 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -258,7 +258,7 @@ xfs_btree_check_block( int /* error (0 or EFSCORRUPTED) */ xfs_btree_check_lptr( struct xfs_btree_cur *cur, /* btree cursor */ - xfs_dfsbno_t ptr, /* btree block disk address */ + xfs_fsblock_t ptr, /* btree block disk address */ int level); /* btree block level */ /* diff --git a/fs/xfs/xfs_cksum.h b/fs/xfs/libxfs/xfs_cksum.h index fad1676ad8cd..fad1676ad8cd 100644 --- a/fs/xfs/xfs_cksum.h +++ b/fs/xfs/libxfs/xfs_cksum.h diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index a514ab616650..2c42ae28d027 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -185,7 +185,7 @@ xfs_da3_node_write_verify( struct xfs_da3_node_hdr *hdr3 = bp->b_addr; if (!xfs_da3_node_verify(bp)) { - xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_buf_ioerror(bp, -EFSCORRUPTED); xfs_verifier_error(bp); return; } @@ -214,13 +214,13 @@ xfs_da3_node_read_verify( switch (be16_to_cpu(info->magic)) { case XFS_DA3_NODE_MAGIC: if (!xfs_buf_verify_cksum(bp, XFS_DA3_NODE_CRC_OFF)) { - xfs_buf_ioerror(bp, EFSBADCRC); + xfs_buf_ioerror(bp, -EFSBADCRC); break; } /* fall through */ case XFS_DA_NODE_MAGIC: if (!xfs_da3_node_verify(bp)) { - xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_buf_ioerror(bp, -EFSCORRUPTED); break; } return; @@ -315,7 +315,7 @@ xfs_da3_node_create( error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, whichfork); if (error) - 
return(error); + return error; bp->b_ops = &xfs_da3_node_buf_ops; xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF); node = bp->b_addr; @@ -337,7 +337,7 @@ xfs_da3_node_create( XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size)); *bpp = bp; - return(0); + return 0; } /* @@ -385,8 +385,8 @@ xfs_da3_split( switch (oldblk->magic) { case XFS_ATTR_LEAF_MAGIC: error = xfs_attr3_leaf_split(state, oldblk, newblk); - if ((error != 0) && (error != ENOSPC)) { - return(error); /* GROT: attr is inconsistent */ + if ((error != 0) && (error != -ENOSPC)) { + return error; /* GROT: attr is inconsistent */ } if (!error) { addblk = newblk; @@ -408,7 +408,7 @@ xfs_da3_split( &state->extrablk); } if (error) - return(error); /* GROT: attr inconsistent */ + return error; /* GROT: attr inconsistent */ addblk = newblk; break; case XFS_DIR2_LEAFN_MAGIC: @@ -422,7 +422,7 @@ xfs_da3_split( max - i, &action); addblk->bp = NULL; if (error) - return(error); /* GROT: dir is inconsistent */ + return error; /* GROT: dir is inconsistent */ /* * Record the newly split block for the next time thru? */ @@ -439,7 +439,7 @@ xfs_da3_split( xfs_da3_fixhashpath(state, &state->path); } if (!addblk) - return(0); + return 0; /* * Split the root node. @@ -449,7 +449,7 @@ xfs_da3_split( error = xfs_da3_root_split(state, oldblk, addblk); if (error) { addblk->bp = NULL; - return(error); /* GROT: dir is inconsistent */ + return error; /* GROT: dir is inconsistent */ } /* @@ -492,7 +492,7 @@ xfs_da3_split( sizeof(node->hdr.info))); } addblk->bp = NULL; - return(0); + return 0; } /* @@ -670,18 +670,18 @@ xfs_da3_node_split( */ error = xfs_da_grow_inode(state->args, &blkno); if (error) - return(error); /* GROT: dir is inconsistent */ + return error; /* GROT: dir is inconsistent */ error = xfs_da3_node_create(state->args, blkno, treelevel, &newblk->bp, state->args->whichfork); if (error) - return(error); /* GROT: dir is inconsistent */ + return error; /* GROT: dir is inconsistent */ newblk->blkno = blkno; newblk->magic = XFS_DA_NODE_MAGIC; xfs_da3_node_rebalance(state, oldblk, newblk); error = xfs_da3_blk_link(state, oldblk, newblk); if (error) - return(error); + return error; *result = 1; } else { *result = 0; @@ -721,7 +721,7 @@ xfs_da3_node_split( } } - return(0); + return 0; } /* @@ -963,9 +963,9 @@ xfs_da3_join( case XFS_ATTR_LEAF_MAGIC: error = xfs_attr3_leaf_toosmall(state, &action); if (error) - return(error); + return error; if (action == 0) - return(0); + return 0; xfs_attr3_leaf_unbalance(state, drop_blk, save_blk); break; case XFS_DIR2_LEAFN_MAGIC: @@ -985,7 +985,7 @@ xfs_da3_join( xfs_da3_fixhashpath(state, &state->path); error = xfs_da3_node_toosmall(state, &action); if (error) - return(error); + return error; if (action == 0) return 0; xfs_da3_node_unbalance(state, drop_blk, save_blk); @@ -995,12 +995,12 @@ xfs_da3_join( error = xfs_da3_blk_unlink(state, drop_blk, save_blk); xfs_da_state_kill_altpath(state); if (error) - return(error); + return error; error = xfs_da_shrink_inode(state->args, drop_blk->blkno, drop_blk->bp); drop_blk->bp = NULL; if (error) - return(error); + return error; } /* * We joined all the way to the top. 
If it turns out that @@ -1010,7 +1010,7 @@ xfs_da3_join( xfs_da3_node_remove(state, drop_blk); xfs_da3_fixhashpath(state, &state->path); error = xfs_da3_root_join(state, &state->path.blk[0]); - return(error); + return error; } #ifdef DEBUG @@ -1099,7 +1099,7 @@ xfs_da3_root_join( xfs_trans_log_buf(args->trans, root_blk->bp, 0, args->geo->blksize - 1); error = xfs_da_shrink_inode(args, child, bp); - return(error); + return error; } /* @@ -1142,7 +1142,7 @@ xfs_da3_node_toosmall( dp->d_ops->node_hdr_from_disk(&nodehdr, node); if (nodehdr.count > (state->args->geo->node_ents >> 1)) { *action = 0; /* blk over 50%, don't try to join */ - return(0); /* blk over 50%, don't try to join */ + return 0; /* blk over 50%, don't try to join */ } /* @@ -1161,13 +1161,13 @@ xfs_da3_node_toosmall( error = xfs_da3_path_shift(state, &state->altpath, forward, 0, &retval); if (error) - return(error); + return error; if (retval) { *action = 0; } else { *action = 2; } - return(0); + return 0; } /* @@ -1194,7 +1194,7 @@ xfs_da3_node_toosmall( error = xfs_da3_node_read(state->args->trans, dp, blkno, -1, &bp, state->args->whichfork); if (error) - return(error); + return error; node = bp->b_addr; dp->d_ops->node_hdr_from_disk(&thdr, node); @@ -1486,7 +1486,7 @@ xfs_da3_node_lookup_int( if (error) { blk->blkno = 0; state->path.active--; - return(error); + return error; } curr = blk->bp->b_addr; blk->magic = be16_to_cpu(curr->magic); @@ -1579,25 +1579,25 @@ xfs_da3_node_lookup_int( args->blkno = blk->blkno; } else { ASSERT(0); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } - if (((retval == ENOENT) || (retval == ENOATTR)) && + if (((retval == -ENOENT) || (retval == -ENOATTR)) && (blk->hashval == args->hashval)) { error = xfs_da3_path_shift(state, &state->path, 1, 1, &retval); if (error) - return(error); + return error; if (retval == 0) { continue; } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) { /* path_shift() gives ENOENT */ - retval = XFS_ERROR(ENOATTR); + retval = -ENOATTR; } } break; } *result = retval; - return(0); + return 0; } /*======================================================================== @@ -1692,7 +1692,7 @@ xfs_da3_blk_link( be32_to_cpu(old_info->back), -1, &bp, args->whichfork); if (error) - return(error); + return error; ASSERT(bp != NULL); tmp_info = bp->b_addr; ASSERT(tmp_info->magic == old_info->magic); @@ -1713,7 +1713,7 @@ xfs_da3_blk_link( be32_to_cpu(old_info->forw), -1, &bp, args->whichfork); if (error) - return(error); + return error; ASSERT(bp != NULL); tmp_info = bp->b_addr; ASSERT(tmp_info->magic == old_info->magic); @@ -1726,7 +1726,7 @@ xfs_da3_blk_link( xfs_trans_log_buf(args->trans, old_blk->bp, 0, sizeof(*tmp_info) - 1); xfs_trans_log_buf(args->trans, new_blk->bp, 0, sizeof(*tmp_info) - 1); - return(0); + return 0; } /* @@ -1772,7 +1772,7 @@ xfs_da3_blk_unlink( be32_to_cpu(drop_info->back), -1, &bp, args->whichfork); if (error) - return(error); + return error; ASSERT(bp != NULL); tmp_info = bp->b_addr; ASSERT(tmp_info->magic == save_info->magic); @@ -1789,7 +1789,7 @@ xfs_da3_blk_unlink( be32_to_cpu(drop_info->forw), -1, &bp, args->whichfork); if (error) - return(error); + return error; ASSERT(bp != NULL); tmp_info = bp->b_addr; ASSERT(tmp_info->magic == save_info->magic); @@ -1801,7 +1801,7 @@ xfs_da3_blk_unlink( } xfs_trans_log_buf(args->trans, save_blk->bp, 0, sizeof(*save_info) - 1); - return(0); + return 0; } /* @@ -1859,9 +1859,9 @@ xfs_da3_path_shift( } } if (level < 0) { - *result = XFS_ERROR(ENOENT); /* we're out of our tree */ + *result = -ENOENT; /* 
we're out of our tree */ ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); - return(0); + return 0; } /* @@ -1883,7 +1883,7 @@ xfs_da3_path_shift( error = xfs_da3_node_read(args->trans, dp, blkno, -1, &blk->bp, args->whichfork); if (error) - return(error); + return error; info = blk->bp->b_addr; ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC) || info->magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) || @@ -2004,7 +2004,7 @@ xfs_da_grow_inode_int( struct xfs_trans *tp = args->trans; struct xfs_inode *dp = args->dp; int w = args->whichfork; - xfs_drfsbno_t nblks = dp->i_d.di_nblocks; + xfs_rfsblock_t nblks = dp->i_d.di_nblocks; struct xfs_bmbt_irec map, *mapp; int nmap, error, got, i, mapi; @@ -2068,7 +2068,7 @@ xfs_da_grow_inode_int( if (got != count || mapp[0].br_startoff != *bno || mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount != *bno + count) { - error = XFS_ERROR(ENOSPC); + error = -ENOSPC; goto out_free_map; } @@ -2158,7 +2158,7 @@ xfs_da3_swap_lastblock( if (unlikely(lastoff == 0)) { XFS_ERROR_REPORT("xfs_da_swap_lastblock(1)", XFS_ERRLEVEL_LOW, mp); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } /* * Read the last block in the btree space. @@ -2209,7 +2209,7 @@ xfs_da3_swap_lastblock( sib_info->magic != dead_info->magic)) { XFS_ERROR_REPORT("xfs_da_swap_lastblock(2)", XFS_ERRLEVEL_LOW, mp); - error = XFS_ERROR(EFSCORRUPTED); + error = -EFSCORRUPTED; goto done; } sib_info->forw = cpu_to_be32(dead_blkno); @@ -2231,7 +2231,7 @@ xfs_da3_swap_lastblock( sib_info->magic != dead_info->magic)) { XFS_ERROR_REPORT("xfs_da_swap_lastblock(3)", XFS_ERRLEVEL_LOW, mp); - error = XFS_ERROR(EFSCORRUPTED); + error = -EFSCORRUPTED; goto done; } sib_info->back = cpu_to_be32(dead_blkno); @@ -2254,7 +2254,7 @@ xfs_da3_swap_lastblock( if (level >= 0 && level != par_hdr.level + 1) { XFS_ERROR_REPORT("xfs_da_swap_lastblock(4)", XFS_ERRLEVEL_LOW, mp); - error = XFS_ERROR(EFSCORRUPTED); + error = -EFSCORRUPTED; goto done; } level = par_hdr.level; @@ -2267,7 +2267,7 @@ xfs_da3_swap_lastblock( if (entno == par_hdr.count) { XFS_ERROR_REPORT("xfs_da_swap_lastblock(5)", XFS_ERRLEVEL_LOW, mp); - error = XFS_ERROR(EFSCORRUPTED); + error = -EFSCORRUPTED; goto done; } par_blkno = be32_to_cpu(btree[entno].before); @@ -2294,7 +2294,7 @@ xfs_da3_swap_lastblock( if (unlikely(par_blkno == 0)) { XFS_ERROR_REPORT("xfs_da_swap_lastblock(6)", XFS_ERRLEVEL_LOW, mp); - error = XFS_ERROR(EFSCORRUPTED); + error = -EFSCORRUPTED; goto done; } error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w); @@ -2305,7 +2305,7 @@ xfs_da3_swap_lastblock( if (par_hdr.level != level) { XFS_ERROR_REPORT("xfs_da_swap_lastblock(7)", XFS_ERRLEVEL_LOW, mp); - error = XFS_ERROR(EFSCORRUPTED); + error = -EFSCORRUPTED; goto done; } btree = dp->d_ops->node_tree_p(par_node); @@ -2359,7 +2359,7 @@ xfs_da_shrink_inode( error = xfs_bunmapi(tp, dp, dead_blkno, count, xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA, 0, args->firstblock, args->flist, &done); - if (error == ENOSPC) { + if (error == -ENOSPC) { if (w != XFS_DATA_FORK) break; error = xfs_da3_swap_lastblock(args, &dead_blkno, @@ -2427,7 +2427,7 @@ xfs_buf_map_from_irec( map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), KM_SLEEP | KM_NOFS); if (!map) - return ENOMEM; + return -ENOMEM; *mapp = map; } @@ -2500,8 +2500,8 @@ xfs_dabuf_map( } if (!xfs_da_map_covers_blocks(nirecs, irecs, bno, nfsb)) { - error = mappedbno == -2 ? -1 : XFS_ERROR(EFSCORRUPTED); - if (unlikely(error == EFSCORRUPTED)) { + error = mappedbno == -2 ? 
-1 : -EFSCORRUPTED; + if (unlikely(error == -EFSCORRUPTED)) { if (xfs_error_level >= XFS_ERRLEVEL_LOW) { int i; xfs_alert(mp, "%s: bno %lld dir: inode %lld", @@ -2561,7 +2561,7 @@ xfs_da_get_buf( bp = xfs_trans_get_buf_map(trans, dp->i_mount->m_ddev_targp, mapp, nmap, 0); - error = bp ? bp->b_error : XFS_ERROR(EIO); + error = bp ? bp->b_error : -EIO; if (error) { xfs_trans_brelse(trans, bp); goto out_free; diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index 6e153e399a77..6e153e399a77 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h diff --git a/fs/xfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c index c9aee52a37e2..c9aee52a37e2 100644 --- a/fs/xfs/xfs_da_format.c +++ b/fs/xfs/libxfs/xfs_da_format.c diff --git a/fs/xfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 0a49b0286372..0a49b0286372 100644 --- a/fs/xfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/libxfs/xfs_dinode.h index 623bbe8fd921..623bbe8fd921 100644 --- a/fs/xfs/xfs_dinode.h +++ b/fs/xfs/libxfs/xfs_dinode.h diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 79670cda48ae..6cef22152fd6 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -108,7 +108,7 @@ xfs_da_mount( if (!mp->m_dir_geo || !mp->m_attr_geo) { kmem_free(mp->m_dir_geo); kmem_free(mp->m_attr_geo); - return ENOMEM; + return -ENOMEM; } /* set up directory geometry */ @@ -202,7 +202,7 @@ xfs_dir_ino_validate( xfs_warn(mp, "Invalid inode number 0x%Lx", (unsigned long long) ino); XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } return 0; } @@ -226,7 +226,7 @@ xfs_dir_init( args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); if (!args) - return ENOMEM; + return -ENOMEM; args->geo = dp->i_mount->m_dir_geo; args->dp = dp; @@ -261,7 +261,7 @@ xfs_dir_createname( args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); if (!args) - return ENOMEM; + return -ENOMEM; args->geo = dp->i_mount->m_dir_geo; args->name = name->name; @@ -314,18 +314,18 @@ xfs_dir_cilookup_result( int len) { if (args->cmpresult == XFS_CMP_DIFFERENT) - return ENOENT; + return -ENOENT; if (args->cmpresult != XFS_CMP_CASE || !(args->op_flags & XFS_DA_OP_CILOOKUP)) - return EEXIST; + return -EEXIST; args->value = kmem_alloc(len, KM_NOFS | KM_MAYFAIL); if (!args->value) - return ENOMEM; + return -ENOMEM; memcpy(args->value, name, len); args->valuelen = len; - return EEXIST; + return -EEXIST; } /* @@ -392,7 +392,7 @@ xfs_dir_lookup( rval = xfs_dir2_node_lookup(args); out_check_rval: - if (rval == EEXIST) + if (rval == -EEXIST) rval = 0; if (!rval) { *inum = args->inumber; @@ -428,7 +428,7 @@ xfs_dir_removename( args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); if (!args) - return ENOMEM; + return -ENOMEM; args->geo = dp->i_mount->m_dir_geo; args->name = name->name; @@ -493,7 +493,7 @@ xfs_dir_replace( args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); if (!args) - return ENOMEM; + return -ENOMEM; args->geo = dp->i_mount->m_dir_geo; args->name = name->name; @@ -555,7 +555,7 @@ xfs_dir_canenter( args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); if (!args) - return ENOMEM; + return -ENOMEM; args->geo = dp->i_mount->m_dir_geo; args->name = name->name; diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index c8e86b0b5e99..c8e86b0b5e99 100644 --- a/fs/xfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index 
c7cd3154026a..9628ceccfa02 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -91,9 +91,9 @@ xfs_dir3_block_read_verify( if (xfs_sb_version_hascrc(&mp->m_sb) && !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF)) - xfs_buf_ioerror(bp, EFSBADCRC); + xfs_buf_ioerror(bp, -EFSBADCRC); else if (!xfs_dir3_block_verify(bp)) - xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_buf_ioerror(bp, -EFSCORRUPTED); if (bp->b_error) xfs_verifier_error(bp); @@ -108,7 +108,7 @@ xfs_dir3_block_write_verify( struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; if (!xfs_dir3_block_verify(bp)) { - xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_buf_ioerror(bp, -EFSCORRUPTED); xfs_verifier_error(bp); return; } @@ -392,7 +392,7 @@ xfs_dir2_block_addname( if (args->op_flags & XFS_DA_OP_JUSTCHECK) { xfs_trans_brelse(tp, bp); if (!dup) - return XFS_ERROR(ENOSPC); + return -ENOSPC; return 0; } @@ -402,7 +402,7 @@ xfs_dir2_block_addname( if (!dup) { /* Don't have a space reservation: return no-space. */ if (args->total == 0) - return XFS_ERROR(ENOSPC); + return -ENOSPC; /* * Convert to the next larger format. * Then add the new entry in that format. @@ -647,7 +647,7 @@ xfs_dir2_block_lookup( args->filetype = dp->d_ops->data_get_ftype(dep); error = xfs_dir_cilookup_result(args, dep->name, dep->namelen); xfs_trans_brelse(args->trans, bp); - return XFS_ERROR(error); + return error; } /* @@ -703,7 +703,7 @@ xfs_dir2_block_lookup_int( if (low > high) { ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); xfs_trans_brelse(tp, bp); - return XFS_ERROR(ENOENT); + return -ENOENT; } } /* @@ -751,7 +751,7 @@ xfs_dir2_block_lookup_int( * No match, release the buffer and return ENOENT. */ xfs_trans_brelse(tp, bp); - return XFS_ERROR(ENOENT); + return -ENOENT; } /* @@ -1091,7 +1091,7 @@ xfs_dir2_sf_to_block( */ if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { ASSERT(XFS_FORCED_SHUTDOWN(mp)); - return XFS_ERROR(EIO); + return -EIO; } oldsfp = (xfs_dir2_sf_hdr_t *)ifp->if_u1.if_data; diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index 8c2f6422648e..fdd803fecb8e 100644 --- a/fs/xfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -100,7 +100,7 @@ __xfs_dir3_data_check( break; default: XFS_ERROR_REPORT("Bad Magic", XFS_ERRLEVEL_LOW, mp); - return EFSCORRUPTED; + return -EFSCORRUPTED; } /* @@ -256,7 +256,7 @@ xfs_dir3_data_reada_verify( xfs_dir3_data_verify(bp); return; default: - xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_buf_ioerror(bp, -EFSCORRUPTED); xfs_verifier_error(bp); break; } @@ -270,9 +270,9 @@ xfs_dir3_data_read_verify( if (xfs_sb_version_hascrc(&mp->m_sb) && !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF)) - xfs_buf_ioerror(bp, EFSBADCRC); + xfs_buf_ioerror(bp, -EFSBADCRC); else if (!xfs_dir3_data_verify(bp)) - xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_buf_ioerror(bp, -EFSCORRUPTED); if (bp->b_error) xfs_verifier_error(bp); @@ -287,7 +287,7 @@ xfs_dir3_data_write_verify( struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; if (!xfs_dir3_data_verify(bp)) { - xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_buf_ioerror(bp, -EFSCORRUPTED); xfs_verifier_error(bp); return; } diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index fb0aad4440c1..a19174eb3cb2 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -183,9 +183,9 @@ __read_verify( if (xfs_sb_version_hascrc(&mp->m_sb) && !xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF)) - xfs_buf_ioerror(bp, EFSBADCRC); + xfs_buf_ioerror(bp, -EFSBADCRC); else if (!xfs_dir3_leaf_verify(bp, magic)) - xfs_buf_ioerror(bp, EFSCORRUPTED); + 
xfs_buf_ioerror(bp, -EFSCORRUPTED); if (bp->b_error) xfs_verifier_error(bp); @@ -201,7 +201,7 @@ __write_verify( struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr; if (!xfs_dir3_leaf_verify(bp, magic)) { - xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_buf_ioerror(bp, -EFSCORRUPTED); xfs_verifier_error(bp); return; } @@ -731,7 +731,7 @@ xfs_dir2_leaf_addname( if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0) { xfs_trans_brelse(tp, lbp); - return XFS_ERROR(ENOSPC); + return -ENOSPC; } /* * Convert to node form. @@ -755,7 +755,7 @@ xfs_dir2_leaf_addname( */ if (args->op_flags & XFS_DA_OP_JUSTCHECK) { xfs_trans_brelse(tp, lbp); - return use_block == -1 ? XFS_ERROR(ENOSPC) : 0; + return use_block == -1 ? -ENOSPC : 0; } /* * If no allocations are allowed, return now before we've @@ -763,7 +763,7 @@ xfs_dir2_leaf_addname( */ if (args->total == 0 && use_block == -1) { xfs_trans_brelse(tp, lbp); - return XFS_ERROR(ENOSPC); + return -ENOSPC; } /* * Need to compact the leaf entries, removing stale ones. @@ -1198,7 +1198,7 @@ xfs_dir2_leaf_lookup( error = xfs_dir_cilookup_result(args, dep->name, dep->namelen); xfs_trans_brelse(tp, dbp); xfs_trans_brelse(tp, lbp); - return XFS_ERROR(error); + return error; } /* @@ -1327,13 +1327,13 @@ xfs_dir2_leaf_lookup_int( return 0; } /* - * No match found, return ENOENT. + * No match found, return -ENOENT. */ ASSERT(cidb == -1); if (dbp) xfs_trans_brelse(tp, dbp); xfs_trans_brelse(tp, lbp); - return XFS_ERROR(ENOENT); + return -ENOENT; } /* @@ -1440,7 +1440,7 @@ xfs_dir2_leaf_removename( * Just go on, returning success, leaving the * empty block in place. */ - if (error == ENOSPC && args->total == 0) + if (error == -ENOSPC && args->total == 0) error = 0; xfs_dir3_leaf_check(dp, lbp); return error; @@ -1641,7 +1641,7 @@ xfs_dir2_leaf_trim_data( * Get rid of the data block. */ if ((error = xfs_dir2_shrink_inode(args, db, dbp))) { - ASSERT(error != ENOSPC); + ASSERT(error != -ENOSPC); xfs_trans_brelse(tp, dbp); return error; } @@ -1815,7 +1815,7 @@ xfs_dir2_node_to_leaf( * punching out the middle of an extent, and this is an * isolated block. 
*/ - ASSERT(error != ENOSPC); + ASSERT(error != -ENOSPC); return error; } fbp = NULL; diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index da43d304fca2..2ae6ac2c11ae 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -117,9 +117,9 @@ xfs_dir3_free_read_verify( if (xfs_sb_version_hascrc(&mp->m_sb) && !xfs_buf_verify_cksum(bp, XFS_DIR3_FREE_CRC_OFF)) - xfs_buf_ioerror(bp, EFSBADCRC); + xfs_buf_ioerror(bp, -EFSBADCRC); else if (!xfs_dir3_free_verify(bp)) - xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_buf_ioerror(bp, -EFSCORRUPTED); if (bp->b_error) xfs_verifier_error(bp); @@ -134,7 +134,7 @@ xfs_dir3_free_write_verify( struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; if (!xfs_dir3_free_verify(bp)) { - xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_buf_ioerror(bp, -EFSCORRUPTED); xfs_verifier_error(bp); return; } @@ -406,7 +406,7 @@ xfs_dir2_leafn_add( * into other peoples memory */ if (index < 0) - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; /* * If there are already the maximum number of leaf entries in @@ -417,7 +417,7 @@ xfs_dir2_leafn_add( if (leafhdr.count == dp->d_ops->leaf_max_ents(args->geo)) { if (!leafhdr.stale) - return XFS_ERROR(ENOSPC); + return -ENOSPC; compact = leafhdr.stale > 1; } else compact = 0; @@ -629,7 +629,7 @@ xfs_dir2_leafn_lookup_for_addname( XFS_ERRLEVEL_LOW, mp); if (curfdb != newfdb) xfs_trans_brelse(tp, curbp); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } curfdb = newfdb; if (be16_to_cpu(bests[fi]) >= length) @@ -660,7 +660,7 @@ out: * Return the index, that will be the insertion point. */ *indexp = index; - return XFS_ERROR(ENOENT); + return -ENOENT; } /* @@ -789,7 +789,7 @@ xfs_dir2_leafn_lookup_for_entry( curbp->b_ops = &xfs_dir3_data_buf_ops; xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF); if (cmp == XFS_CMP_EXACT) - return XFS_ERROR(EEXIST); + return -EEXIST; } } ASSERT(index == leafhdr.count || (args->op_flags & XFS_DA_OP_OKNOENT)); @@ -812,7 +812,7 @@ xfs_dir2_leafn_lookup_for_entry( state->extravalid = 0; } *indexp = index; - return XFS_ERROR(ENOENT); + return -ENOENT; } /* @@ -1133,7 +1133,7 @@ xfs_dir3_data_block_free( if (error == 0) { fbp = NULL; logfree = 0; - } else if (error != ENOSPC || args->total != 0) + } else if (error != -ENOSPC || args->total != 0) return error; /* * It's possible to get ENOSPC if there is no @@ -1287,7 +1287,7 @@ xfs_dir2_leafn_remove( * In this case just drop the buffer and some one else * will eventually get rid of the empty block. */ - else if (!(error == ENOSPC && args->total == 0)) + else if (!(error == -ENOSPC && args->total == 0)) return error; } /* @@ -1599,7 +1599,7 @@ xfs_dir2_node_addname( error = xfs_da3_node_lookup_int(state, &rval); if (error) rval = error; - if (rval != ENOENT) { + if (rval != -ENOENT) { goto done; } /* @@ -1628,7 +1628,7 @@ xfs_dir2_node_addname( * It didn't work, we need to split the leaf block. */ if (args->total == 0) { - ASSERT(rval == ENOSPC); + ASSERT(rval == -ENOSPC); goto done; } /* @@ -1815,7 +1815,7 @@ xfs_dir2_node_addname_int( * Not allowed to allocate, return failure. */ if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0) - return XFS_ERROR(ENOSPC); + return -ENOSPC; /* * Allocate and initialize the new data block. 
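Many hunks in this range touch buffer verifiers, which all share one shape: check the checksum first and flag -EFSBADCRC, then check structure and flag -EFSCORRUPTED, leaving the (now negative) code in b_error for the I/O path to act on. Here is a compilable sketch of that shape, using stand-in names and stand-in error values; the kernel defines its own EFSBADCRC and EFSCORRUPTED, and nothing below is the real XFS API.

#include <stdbool.h>
#include <stdio.h>

#define DEMO_EFSCORRUPTED	990	/* stand-ins: the kernel has its own values */
#define DEMO_EFSBADCRC		991

struct demo_buf {
	int	b_error;	/* negative errno after this series */
	bool	crc_ok;
	bool	contents_ok;
};

static void demo_ioerror(struct demo_buf *bp, int error)
{
	bp->b_error = error;	/* mirrors xfs_buf_ioerror() storing the code */
}

static void demo_read_verify(struct demo_buf *bp)
{
	if (!bp->crc_ok)
		demo_ioerror(bp, -DEMO_EFSBADCRC);	/* media/CRC damage */
	else if (!bp->contents_ok)
		demo_ioerror(bp, -DEMO_EFSCORRUPTED);	/* structural damage */
}

int main(void)
{
	struct demo_buf bp = { .b_error = 0, .crc_ok = false, .contents_ok = true };

	demo_read_verify(&bp);
	printf("b_error = %d\n", bp.b_error);	/* -991: the CRC check wins */
	return 0;
}

Ordering matters in this pattern: a block that fails its CRC is reported as a checksum failure rather than corruption, since its contents cannot be trusted enough to diagnose further.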
@@ -1876,7 +1876,7 @@ xfs_dir2_node_addname_int( } XFS_ERROR_REPORT("xfs_dir2_node_addname_int", XFS_ERRLEVEL_LOW, mp); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } /* @@ -2042,8 +2042,8 @@ xfs_dir2_node_lookup( error = xfs_da3_node_lookup_int(state, &rval); if (error) rval = error; - else if (rval == ENOENT && args->cmpresult == XFS_CMP_CASE) { - /* If a CI match, dup the actual name and return EEXIST */ + else if (rval == -ENOENT && args->cmpresult == XFS_CMP_CASE) { + /* If a CI match, dup the actual name and return -EEXIST */ xfs_dir2_data_entry_t *dep; dep = (xfs_dir2_data_entry_t *) @@ -2096,7 +2096,7 @@ xfs_dir2_node_removename( goto out_free; /* Didn't find it, upper layer screwed up. */ - if (rval != EEXIST) { + if (rval != -EEXIST) { error = rval; goto out_free; } @@ -2169,7 +2169,7 @@ xfs_dir2_node_replace( * It should be found, since the vnodeops layer has looked it up * and locked it. But paranoia is good. */ - if (rval == EEXIST) { + if (rval == -EEXIST) { struct xfs_dir2_leaf_entry *ents; /* * Find the leaf entry. @@ -2272,7 +2272,7 @@ xfs_dir2_node_trim_free( * space reservation, when breaking up an extent into two * pieces. This is the last block of an extent. */ - ASSERT(error != ENOSPC); + ASSERT(error != -ENOSPC); xfs_trans_brelse(tp, bp); return error; } diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index 27ce0794d196..27ce0794d196 100644 --- a/fs/xfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index 53c3be619db5..5079e051ef08 100644 --- a/fs/xfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -51,10 +51,9 @@ static void xfs_dir2_sf_check(xfs_da_args_t *args); #else #define xfs_dir2_sf_check(args) #endif /* DEBUG */ -#if XFS_BIG_INUMS + static void xfs_dir2_sf_toino4(xfs_da_args_t *args); static void xfs_dir2_sf_toino8(xfs_da_args_t *args); -#endif /* XFS_BIG_INUMS */ /* * Given a block directory (dp/block), calculate its size as a shortform (sf) @@ -117,10 +116,10 @@ xfs_dir2_block_sfsize( isdotdot = dep->namelen == 2 && dep->name[0] == '.' && dep->name[1] == '.'; -#if XFS_BIG_INUMS + if (!isdot) i8count += be64_to_cpu(dep->inumber) > XFS_DIR2_MAX_SHORT_INUM; -#endif + /* take into account the file type field */ if (!isdot && !isdotdot) { count++; @@ -251,7 +250,7 @@ xfs_dir2_block_to_sf( logflags = XFS_ILOG_CORE; error = xfs_dir2_shrink_inode(args, args->geo->datablk, bp); if (error) { - ASSERT(error != ENOSPC); + ASSERT(error != -ENOSPC); goto out; } @@ -299,7 +298,7 @@ xfs_dir2_sf_addname( trace_xfs_dir2_sf_addname(args); - ASSERT(xfs_dir2_sf_lookup(args) == ENOENT); + ASSERT(xfs_dir2_sf_lookup(args) == -ENOENT); dp = args->dp; ASSERT(dp->i_df.if_flags & XFS_IFINLINE); /* @@ -307,7 +306,7 @@ xfs_dir2_sf_addname( */ if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); - return XFS_ERROR(EIO); + return -EIO; } ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); ASSERT(dp->i_df.if_u1.if_data != NULL); @@ -318,7 +317,7 @@ xfs_dir2_sf_addname( */ incr_isize = dp->d_ops->sf_entsize(sfp, args->namelen); objchange = 0; -#if XFS_BIG_INUMS + /* * Do we have to change to 8 byte inodes? */ @@ -332,7 +331,7 @@ xfs_dir2_sf_addname( (uint)sizeof(xfs_dir2_ino4_t)); objchange = 1; } -#endif + new_isize = (int)dp->i_d.di_size + incr_isize; /* * Won't fit as shortform any more (due to size), @@ -345,7 +344,7 @@ xfs_dir2_sf_addname( * Just checking or no space reservation, it doesn't fit. 
*/ if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0) - return XFS_ERROR(ENOSPC); + return -ENOSPC; /* * Convert to block form then add the name. */ @@ -370,10 +369,8 @@ xfs_dir2_sf_addname( */ else { ASSERT(pick == 2); -#if XFS_BIG_INUMS if (objchange) xfs_dir2_sf_toino8(args); -#endif xfs_dir2_sf_addname_hard(args, objchange, new_isize); } xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); @@ -425,10 +422,8 @@ xfs_dir2_sf_addname_easy( * Update the header and inode. */ sfp->count++; -#if XFS_BIG_INUMS if (args->inumber > XFS_DIR2_MAX_SHORT_INUM) sfp->i8count++; -#endif dp->i_d.di_size = new_isize; xfs_dir2_sf_check(args); } @@ -516,10 +511,8 @@ xfs_dir2_sf_addname_hard( dp->d_ops->sf_put_ino(sfp, sfep, args->inumber); dp->d_ops->sf_put_ftype(sfep, args->filetype); sfp->count++; -#if XFS_BIG_INUMS if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange) sfp->i8count++; -#endif /* * If there's more left to copy, do that. */ @@ -593,13 +586,8 @@ xfs_dir2_sf_addname_pick( /* * If changing the inode number size, do it the hard way. */ -#if XFS_BIG_INUMS - if (objchange) { + if (objchange) return 2; - } -#else - ASSERT(objchange == 0); -#endif /* * If it won't fit at the end then do it the hard way (use the hole). */ @@ -650,7 +638,6 @@ xfs_dir2_sf_check( ASSERT(dp->d_ops->sf_get_ftype(sfep) < XFS_DIR3_FT_MAX); } ASSERT(i8count == sfp->i8count); - ASSERT(XFS_BIG_INUMS || i8count == 0); ASSERT((char *)sfep - (char *)sfp == dp->i_d.di_size); ASSERT(offset + (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) + @@ -738,7 +725,7 @@ xfs_dir2_sf_lookup( */ if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); - return XFS_ERROR(EIO); + return -EIO; } ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); ASSERT(dp->i_df.if_u1.if_data != NULL); @@ -751,7 +738,7 @@ xfs_dir2_sf_lookup( args->inumber = dp->i_ino; args->cmpresult = XFS_CMP_EXACT; args->filetype = XFS_DIR3_FT_DIR; - return XFS_ERROR(EEXIST); + return -EEXIST; } /* * Special case for .. @@ -761,7 +748,7 @@ xfs_dir2_sf_lookup( args->inumber = dp->d_ops->sf_get_parent_ino(sfp); args->cmpresult = XFS_CMP_EXACT; args->filetype = XFS_DIR3_FT_DIR; - return XFS_ERROR(EEXIST); + return -EEXIST; } /* * Loop over all the entries trying to match ours. @@ -781,20 +768,20 @@ xfs_dir2_sf_lookup( args->inumber = dp->d_ops->sf_get_ino(sfp, sfep); args->filetype = dp->d_ops->sf_get_ftype(sfep); if (cmp == XFS_CMP_EXACT) - return XFS_ERROR(EEXIST); + return -EEXIST; ci_sfep = sfep; } } ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); /* * Here, we can only be doing a lookup (not a rename or replace). - * If a case-insensitive match was not found, return ENOENT. + * If a case-insensitive match was not found, return -ENOENT. */ if (!ci_sfep) - return XFS_ERROR(ENOENT); + return -ENOENT; /* otherwise process the CI match as required by the caller */ error = xfs_dir_cilookup_result(args, ci_sfep->name, ci_sfep->namelen); - return XFS_ERROR(error); + return error; } /* @@ -824,7 +811,7 @@ xfs_dir2_sf_removename( */ if (oldsize < offsetof(xfs_dir2_sf_hdr_t, parent)) { ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); - return XFS_ERROR(EIO); + return -EIO; } ASSERT(dp->i_df.if_bytes == oldsize); ASSERT(dp->i_df.if_u1.if_data != NULL); @@ -847,7 +834,7 @@ xfs_dir2_sf_removename( * Didn't find it. */ if (i == sfp->count) - return XFS_ERROR(ENOENT); + return -ENOENT; /* * Calculate sizes. 
*/ @@ -870,7 +857,6 @@ xfs_dir2_sf_removename( */ xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK); sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; -#if XFS_BIG_INUMS /* * Are we changing inode number size? */ @@ -880,7 +866,6 @@ xfs_dir2_sf_removename( else sfp->i8count--; } -#endif xfs_dir2_sf_check(args); xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); return 0; @@ -895,12 +880,8 @@ xfs_dir2_sf_replace( { xfs_inode_t *dp; /* incore directory inode */ int i; /* entry index */ -#if XFS_BIG_INUMS || defined(DEBUG) xfs_ino_t ino=0; /* entry old inode number */ -#endif -#if XFS_BIG_INUMS int i8elevated; /* sf_toino8 set i8count=1 */ -#endif xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ @@ -914,13 +895,13 @@ xfs_dir2_sf_replace( */ if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); - return XFS_ERROR(EIO); + return -EIO; } ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); ASSERT(dp->i_df.if_u1.if_data != NULL); sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); -#if XFS_BIG_INUMS + /* * New inode number is large, and need to convert to 8-byte inodes. */ @@ -951,17 +932,15 @@ xfs_dir2_sf_replace( sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; } else i8elevated = 0; -#endif + ASSERT(args->namelen != 1 || args->name[0] != '.'); /* * Replace ..'s entry. */ if (args->namelen == 2 && args->name[0] == '.' && args->name[1] == '.') { -#if XFS_BIG_INUMS || defined(DEBUG) ino = dp->d_ops->sf_get_parent_ino(sfp); ASSERT(args->inumber != ino); -#endif dp->d_ops->sf_put_parent_ino(sfp, args->inumber); } /* @@ -972,10 +951,8 @@ xfs_dir2_sf_replace( i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) { if (xfs_da_compname(args, sfep->name, sfep->namelen) == XFS_CMP_EXACT) { -#if XFS_BIG_INUMS || defined(DEBUG) ino = dp->d_ops->sf_get_ino(sfp, sfep); ASSERT(args->inumber != ino); -#endif dp->d_ops->sf_put_ino(sfp, sfep, args->inumber); dp->d_ops->sf_put_ftype(sfep, args->filetype); break; @@ -986,14 +963,11 @@ xfs_dir2_sf_replace( */ if (i == sfp->count) { ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); -#if XFS_BIG_INUMS if (i8elevated) xfs_dir2_sf_toino4(args); -#endif - return XFS_ERROR(ENOENT); + return -ENOENT; } } -#if XFS_BIG_INUMS /* * See if the old number was large, the new number is small. */ @@ -1020,13 +994,11 @@ xfs_dir2_sf_replace( if (!i8elevated) sfp->i8count++; } -#endif xfs_dir2_sf_check(args); xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA); return 0; } -#if XFS_BIG_INUMS /* * Convert from 8-byte inode numbers to 4-byte inode numbers. * The last 8-byte inode number is gone, but the count is still 1. 
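The directory-lookup hunks follow a convention worth calling out: the internal lookup paths signal a hit with -EEXIST and a miss with -ENOENT, and the top-level wrapper folds -EEXIST back into 0 for its callers, as seen in the "if (rval == -EEXIST) rval = 0" hunk in xfs_dir_lookup() earlier. A small sketch of that protocol, with invented demo_* names standing in for the real functions:

#include <errno.h>
#include <stdio.h>

static int demo_lookup_int(int present, unsigned long *inum)
{
	if (!present)
		return -ENOENT;
	*inum = 42;		/* placeholder inode number */
	return -EEXIST;		/* "found" in the internal protocol */
}

static int demo_lookup(int present, unsigned long *inum)
{
	int rval = demo_lookup_int(present, inum);

	if (rval == -EEXIST)	/* was: rval == EEXIST */
		rval = 0;
	return rval;		/* 0 on success, -ENOENT if missing */
}

int main(void)
{
	unsigned long inum = 0;

	printf("%d %lu\n", demo_lookup(1, &inum), inum);	/* prints: 0 42 */
	printf("%d\n", demo_lookup(0, &inum));			/* prints: -2 */
	return 0;
}

Reusing -EEXIST this way lets one return value carry both "the name is present" for lookups and "the name collides" for create paths, which is why the removename and replace hunks above treat rval != -EEXIST as the error case.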
@@ -1181,4 +1153,3 @@ xfs_dir2_sf_toino8( dp->i_d.di_size = newsize; xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); } -#endif /* XFS_BIG_INUMS */ diff --git a/fs/xfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index c2ac0c611ad8..bb969337efc8 100644 --- a/fs/xfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -257,9 +257,9 @@ xfs_dquot_buf_read_verify( struct xfs_mount *mp = bp->b_target->bt_mount; if (!xfs_dquot_buf_verify_crc(mp, bp)) - xfs_buf_ioerror(bp, EFSBADCRC); + xfs_buf_ioerror(bp, -EFSBADCRC); else if (!xfs_dquot_buf_verify(mp, bp)) - xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_buf_ioerror(bp, -EFSCORRUPTED); if (bp->b_error) xfs_verifier_error(bp); @@ -277,7 +277,7 @@ xfs_dquot_buf_write_verify( struct xfs_mount *mp = bp->b_target->bt_mount; if (!xfs_dquot_buf_verify(mp, bp)) { - xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_buf_ioerror(bp, -EFSCORRUPTED); xfs_verifier_error(bp); return; } diff --git a/fs/xfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 34d85aca3058..7e42bba9a420 100644 --- a/fs/xfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -68,11 +68,7 @@ struct xfs_ifork; #define XFS_RTLOBIT(w) xfs_lowbit32(w) #define XFS_RTHIBIT(w) xfs_highbit32(w) -#if XFS_BIG_BLKNOS #define XFS_RTBLOCKLOG(b) xfs_highbit64(b) -#else -#define XFS_RTBLOCKLOG(b) xfs_highbit32(b) -#endif /* * Dquot and dquot block format definitions @@ -304,23 +300,15 @@ typedef struct xfs_bmbt_rec_host { * Values and macros for delayed-allocation startblock fields. */ #define STARTBLOCKVALBITS 17 -#define STARTBLOCKMASKBITS (15 + XFS_BIG_BLKNOS * 20) -#define DSTARTBLOCKMASKBITS (15 + 20) +#define STARTBLOCKMASKBITS (15 + 20) #define STARTBLOCKMASK \ (((((xfs_fsblock_t)1) << STARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS) -#define DSTARTBLOCKMASK \ - (((((xfs_dfsbno_t)1) << DSTARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS) static inline int isnullstartblock(xfs_fsblock_t x) { return ((x) & STARTBLOCKMASK) == STARTBLOCKMASK; } -static inline int isnulldstartblock(xfs_dfsbno_t x) -{ - return ((x) & DSTARTBLOCKMASK) == DSTARTBLOCKMASK; -} - static inline xfs_fsblock_t nullstartblock(int k) { ASSERT(k < (1 << STARTBLOCKVALBITS)); diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 5960e5593fe0..b62771f1f4b5 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -292,7 +292,7 @@ xfs_ialloc_inode_init( mp->m_bsize * blks_per_cluster, XBF_UNMAPPED); if (!fbuf) - return ENOMEM; + return -ENOMEM; /* Initialize the inode buffers and log them appropriately. */ fbuf->b_ops = &xfs_inode_buf_ops; @@ -380,7 +380,7 @@ xfs_ialloc_ag_alloc( newlen = args.mp->m_ialloc_inos; if (args.mp->m_maxicount && args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount) - return XFS_ERROR(ENOSPC); + return -ENOSPC; args.minlen = args.maxlen = args.mp->m_ialloc_blks; /* * First try to allocate inodes contiguous with the last-allocated @@ -1385,7 +1385,7 @@ xfs_dialloc( if (error) { xfs_trans_brelse(tp, agbp); - if (error != ENOSPC) + if (error != -ENOSPC) goto out_error; xfs_perag_put(pag); @@ -1416,7 +1416,7 @@ nextag: agno = 0; if (agno == start_agno) { *inop = NULLFSINO; - return noroom ? ENOSPC : 0; + return noroom ? 
-ENOSPC : 0; } } @@ -1425,7 +1425,7 @@ out_alloc: return xfs_dialloc_ag(tp, agbp, parent, inop); out_error: xfs_perag_put(pag); - return XFS_ERROR(error); + return error; } STATIC int @@ -1682,7 +1682,7 @@ xfs_difree( xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).", __func__, agno, mp->m_sb.sb_agcount); ASSERT(0); - return XFS_ERROR(EINVAL); + return -EINVAL; } agino = XFS_INO_TO_AGINO(mp, inode); if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) { @@ -1690,14 +1690,14 @@ xfs_difree( __func__, (unsigned long long)inode, (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino)); ASSERT(0); - return XFS_ERROR(EINVAL); + return -EINVAL; } agbno = XFS_AGINO_TO_AGBNO(mp, agino); if (agbno >= mp->m_sb.sb_agblocks) { xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).", __func__, agbno, mp->m_sb.sb_agblocks); ASSERT(0); - return XFS_ERROR(EINVAL); + return -EINVAL; } /* * Get the allocation group header. @@ -1769,7 +1769,7 @@ xfs_imap_lookup( if (i) error = xfs_inobt_get_rec(cur, &rec, &i); if (!error && i == 0) - error = EINVAL; + error = -EINVAL; } xfs_trans_brelse(tp, agbp); @@ -1780,12 +1780,12 @@ xfs_imap_lookup( /* check that the returned record contains the required inode */ if (rec.ir_startino > agino || rec.ir_startino + mp->m_ialloc_inos <= agino) - return EINVAL; + return -EINVAL; /* for untrusted inodes check it is allocated first */ if ((flags & XFS_IGET_UNTRUSTED) && (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino))) - return EINVAL; + return -EINVAL; *chunk_agbno = XFS_AGINO_TO_AGBNO(mp, rec.ir_startino); *offset_agbno = agbno - *chunk_agbno; @@ -1829,7 +1829,7 @@ xfs_imap( * as they can be invalid without implying corruption. */ if (flags & XFS_IGET_UNTRUSTED) - return XFS_ERROR(EINVAL); + return -EINVAL; if (agno >= mp->m_sb.sb_agcount) { xfs_alert(mp, "%s: agno (%d) >= mp->m_sb.sb_agcount (%d)", @@ -1849,7 +1849,7 @@ xfs_imap( } xfs_stack_trace(); #endif /* DEBUG */ - return XFS_ERROR(EINVAL); + return -EINVAL; } blks_per_cluster = xfs_icluster_size_fsb(mp); @@ -1922,7 +1922,7 @@ out_map: __func__, (unsigned long long) imap->im_blkno, (unsigned long long) imap->im_len, XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)); - return XFS_ERROR(EINVAL); + return -EINVAL; } return 0; } @@ -2072,11 +2072,11 @@ xfs_agi_read_verify( if (xfs_sb_version_hascrc(&mp->m_sb) && !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF)) - xfs_buf_ioerror(bp, EFSBADCRC); + xfs_buf_ioerror(bp, -EFSBADCRC); else if (XFS_TEST_ERROR(!xfs_agi_verify(bp), mp, XFS_ERRTAG_IALLOC_READ_AGI, XFS_RANDOM_IALLOC_READ_AGI)) - xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_buf_ioerror(bp, -EFSCORRUPTED); if (bp->b_error) xfs_verifier_error(bp); @@ -2090,7 +2090,7 @@ xfs_agi_write_verify( struct xfs_buf_log_item *bip = bp->b_fspriv; if (!xfs_agi_verify(bp)) { - xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_buf_ioerror(bp, -EFSCORRUPTED); xfs_verifier_error(bp); return; } diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index 95ad1c002d60..95ad1c002d60 100644 --- a/fs/xfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 726f83a681a5..c9b06f30fe86 100644 --- a/fs/xfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -272,9 +272,9 @@ xfs_inobt_read_verify( struct xfs_buf *bp) { if (!xfs_btree_sblock_verify_crc(bp)) - xfs_buf_ioerror(bp, EFSBADCRC); + xfs_buf_ioerror(bp, -EFSBADCRC); else if (!xfs_inobt_verify(bp)) - xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_buf_ioerror(bp, -EFSCORRUPTED); if (bp->b_error) { trace_xfs_btree_corrupt(bp, 
_RET_IP_); @@ -288,7 +288,7 @@ xfs_inobt_write_verify( { if (!xfs_inobt_verify(bp)) { trace_xfs_btree_corrupt(bp, _RET_IP_); - xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_buf_ioerror(bp, -EFSCORRUPTED); xfs_verifier_error(bp); return; } diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h index d7ebea72c2d0..d7ebea72c2d0 100644 --- a/fs/xfs/xfs_ialloc_btree.h +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h diff --git a/fs/xfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index cb35ae41d4a1..f18fd2da49f7 100644 --- a/fs/xfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -101,7 +101,7 @@ xfs_inode_buf_verify( return; } - xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_buf_ioerror(bp, -EFSCORRUPTED); xfs_verifier_error(bp); #ifdef DEBUG xfs_alert(mp, @@ -174,14 +174,14 @@ xfs_imap_to_bp( (int)imap->im_len, buf_flags, &bp, &xfs_inode_buf_ops); if (error) { - if (error == EAGAIN) { + if (error == -EAGAIN) { ASSERT(buf_flags & XBF_TRYLOCK); return error; } - if (error == EFSCORRUPTED && + if (error == -EFSCORRUPTED && (iget_flags & XFS_IGET_UNTRUSTED)) - return XFS_ERROR(EINVAL); + return -EINVAL; xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.", __func__, error); @@ -390,7 +390,7 @@ xfs_iread( __func__, ip->i_ino); XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip); - error = XFS_ERROR(EFSCORRUPTED); + error = -EFSCORRUPTED; goto out_brelse; } diff --git a/fs/xfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index 9308c47f2a52..9308c47f2a52 100644 --- a/fs/xfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h diff --git a/fs/xfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index b031e8d0d928..6a00f7fed69d 100644 --- a/fs/xfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -102,7 +102,7 @@ xfs_iformat_fork( be64_to_cpu(dip->di_nblocks)); XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW, ip->i_mount, dip); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) { @@ -111,7 +111,7 @@ xfs_iformat_fork( dip->di_forkoff); XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW, ip->i_mount, dip); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) && @@ -121,7 +121,7 @@ xfs_iformat_fork( ip->i_ino); XFS_CORRUPTION_ERROR("xfs_iformat(realtime)", XFS_ERRLEVEL_LOW, ip->i_mount, dip); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } switch (ip->i_d.di_mode & S_IFMT) { @@ -132,7 +132,7 @@ xfs_iformat_fork( if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) { XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW, ip->i_mount, dip); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } ip->i_d.di_size = 0; ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip); @@ -153,7 +153,7 @@ xfs_iformat_fork( XFS_CORRUPTION_ERROR("xfs_iformat(4)", XFS_ERRLEVEL_LOW, ip->i_mount, dip); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } di_size = be64_to_cpu(dip->di_size); @@ -166,7 +166,7 @@ xfs_iformat_fork( XFS_CORRUPTION_ERROR("xfs_iformat(5)", XFS_ERRLEVEL_LOW, ip->i_mount, dip); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } size = (int)di_size; @@ -181,13 +181,13 @@ xfs_iformat_fork( default: XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW, ip->i_mount); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } break; default: XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } if (error) { return error; @@ -211,7 
+211,7 @@ xfs_iformat_fork( XFS_CORRUPTION_ERROR("xfs_iformat(8)", XFS_ERRLEVEL_LOW, ip->i_mount, dip); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size); @@ -223,7 +223,7 @@ xfs_iformat_fork( error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK); break; default: - error = XFS_ERROR(EFSCORRUPTED); + error = -EFSCORRUPTED; break; } if (error) { @@ -266,7 +266,7 @@ xfs_iformat_local( XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)); XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW, ip->i_mount, dip); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } ifp = XFS_IFORK_PTR(ip, whichfork); real_size = 0; @@ -322,7 +322,7 @@ xfs_iformat_extents( (unsigned long long) ip->i_ino, nex); XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW, ip->i_mount, dip); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } ifp->if_real_bytes = 0; @@ -350,7 +350,7 @@ xfs_iformat_extents( XFS_ERROR_REPORT("xfs_iformat_extents(2)", XFS_ERRLEVEL_LOW, ip->i_mount); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } } ifp->if_flags |= XFS_IFEXTENTS; @@ -399,7 +399,7 @@ xfs_iformat_btree( (unsigned long long) ip->i_ino); XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW, mp, dip); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } ifp->if_broot_bytes = size; @@ -436,7 +436,7 @@ xfs_iread_extents( if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) { XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW, ip->i_mount); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } nextents = XFS_IFORK_NEXTENTS(ip, whichfork); ifp = XFS_IFORK_PTR(ip, whichfork); @@ -528,7 +528,7 @@ xfs_iroot_realloc( ifp->if_broot_bytes = (int)new_size; ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <= XFS_IFORK_SIZE(ip, whichfork)); - memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t)); + memmove(np, op, cur_max * (uint)sizeof(xfs_fsblock_t)); return; } @@ -575,7 +575,7 @@ xfs_iroot_realloc( ifp->if_broot_bytes); np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1, (int)new_size); - memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t)); + memcpy(np, op, new_max * (uint)sizeof(xfs_fsblock_t)); } kmem_free(ifp->if_broot); ifp->if_broot = new_broot; @@ -1692,7 +1692,7 @@ xfs_iext_idx_to_irec( } *idxp = page_idx; *erp_idxp = erp_idx; - return(erp); + return erp; } /* diff --git a/fs/xfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 7d3b1ed6dcbe..7d3b1ed6dcbe 100644 --- a/fs/xfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h diff --git a/fs/xfs/xfs_inum.h b/fs/xfs/libxfs/xfs_inum.h index 90efdaf1706f..4ff2278e147a 100644 --- a/fs/xfs/xfs_inum.h +++ b/fs/xfs/libxfs/xfs_inum.h @@ -54,11 +54,7 @@ struct xfs_mount; #define XFS_OFFBNO_TO_AGINO(mp,b,o) \ ((xfs_agino_t)(((b) << XFS_INO_OFFSET_BITS(mp)) | (o))) -#if XFS_BIG_INUMS #define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 56) - 1ULL)) -#else -#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 32) - 1ULL)) -#endif #define XFS_MAXINUMBER_32 ((xfs_ino_t)((1ULL << 32) - 1ULL)) #endif /* __XFS_INUM_H__ */ diff --git a/fs/xfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index f0969c77bdbe..aff12f2d4428 100644 --- a/fs/xfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -380,7 +380,7 @@ typedef struct xfs_icdinode { xfs_ictimestamp_t di_mtime; /* time last modified */ xfs_ictimestamp_t di_ctime; /* time created/inode modified */ xfs_fsize_t di_size; /* number of bytes in file */ - xfs_drfsbno_t di_nblocks; /* # of direct & btree blocks used */ + 
xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */ xfs_extlen_t di_extsize; /* basic/minimum extent size for file */ xfs_extnum_t di_nextents; /* number of extents in data fork */ xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/ @@ -516,7 +516,7 @@ xfs_blft_from_flags(struct xfs_buf_log_format *blf) * EFI/EFD log format definitions */ typedef struct xfs_extent { - xfs_dfsbno_t ext_start; + xfs_fsblock_t ext_start; xfs_extlen_t ext_len; } xfs_extent_t; diff --git a/fs/xfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 1c55ccbb379d..1c55ccbb379d 100644 --- a/fs/xfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h diff --git a/fs/xfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c index ee7e0e80246b..ee7e0e80246b 100644 --- a/fs/xfs/xfs_log_rlimit.c +++ b/fs/xfs/libxfs/xfs_log_rlimit.c diff --git a/fs/xfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index 137e20937077..1b0a08379759 100644 --- a/fs/xfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -98,8 +98,6 @@ typedef __uint16_t xfs_qwarncnt_t; #define XFS_IS_QUOTA_ON(mp) ((mp)->m_qflags & (XFS_UQUOTA_ACTIVE | \ XFS_GQUOTA_ACTIVE | \ XFS_PQUOTA_ACTIVE)) -#define XFS_IS_OQUOTA_ON(mp) ((mp)->m_qflags & (XFS_GQUOTA_ACTIVE | \ - XFS_PQUOTA_ACTIVE)) #define XFS_IS_UQUOTA_ON(mp) ((mp)->m_qflags & XFS_UQUOTA_ACTIVE) #define XFS_IS_GQUOTA_ON(mp) ((mp)->m_qflags & XFS_GQUOTA_ACTIVE) #define XFS_IS_PQUOTA_ON(mp) ((mp)->m_qflags & XFS_PQUOTA_ACTIVE) diff --git a/fs/xfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index f4dd697cac08..f4dd697cac08 100644 --- a/fs/xfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c diff --git a/fs/xfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 7703fa6770ff..ad525a5623a4 100644 --- a/fs/xfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -186,13 +186,13 @@ xfs_mount_validate_sb( */ if (sbp->sb_magicnum != XFS_SB_MAGIC) { xfs_warn(mp, "bad magic number"); - return XFS_ERROR(EWRONGFS); + return -EWRONGFS; } if (!xfs_sb_good_version(sbp)) { xfs_warn(mp, "bad version"); - return XFS_ERROR(EWRONGFS); + return -EWRONGFS; } /* @@ -220,7 +220,7 @@ xfs_mount_validate_sb( xfs_warn(mp, "Attempted to mount read-only compatible filesystem read-write.\n" "Filesystem can only be safely mounted read only."); - return XFS_ERROR(EINVAL); + return -EINVAL; } } if (xfs_sb_has_incompat_feature(sbp, @@ -230,7 +230,7 @@ xfs_mount_validate_sb( "Filesystem can not be safely mounted by this kernel.", (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_UNKNOWN)); - return XFS_ERROR(EINVAL); + return -EINVAL; } } @@ -238,13 +238,13 @@ xfs_mount_validate_sb( if (sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) { xfs_notice(mp, "Version 5 of Super block has XFS_OQUOTA bits."); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } } else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) { xfs_notice(mp, "Superblock earlier than Version 5 has XFS_[PQ]UOTA_{ENFD|CHKD} bits."); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } if (unlikely( @@ -252,7 +252,7 @@ xfs_mount_validate_sb( xfs_warn(mp, "filesystem is marked as having an external log; " "specify logdev on the mount command line."); - return XFS_ERROR(EINVAL); + return -EINVAL; } if (unlikely( @@ -260,7 +260,7 @@ xfs_mount_validate_sb( xfs_warn(mp, "filesystem is marked as having an internal log; " "do not specify logdev on the mount command line."); - return XFS_ERROR(EINVAL); + return -EINVAL; } /* @@ -294,7 +294,7 @@ xfs_mount_validate_sb( sbp->sb_dblocks < 
XFS_MIN_DBLOCKS(sbp) || sbp->sb_shared_vn != 0)) { xfs_notice(mp, "SB sanity check failed"); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } /* @@ -305,7 +305,7 @@ xfs_mount_validate_sb( "File system with blocksize %d bytes. " "Only pagesize (%ld) or less will currently work.", sbp->sb_blocksize, PAGE_SIZE); - return XFS_ERROR(ENOSYS); + return -ENOSYS; } /* @@ -320,19 +320,19 @@ xfs_mount_validate_sb( default: xfs_warn(mp, "inode size of %d bytes not supported", sbp->sb_inodesize); - return XFS_ERROR(ENOSYS); + return -ENOSYS; } if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) || xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { xfs_warn(mp, "file system too large to be mounted on this system."); - return XFS_ERROR(EFBIG); + return -EFBIG; } if (check_inprogress && sbp->sb_inprogress) { xfs_warn(mp, "Offline file system operation in progress!"); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } return 0; } @@ -386,10 +386,11 @@ xfs_sb_quota_from_disk(struct xfs_sb *sbp) } } -void -xfs_sb_from_disk( +static void +__xfs_sb_from_disk( struct xfs_sb *to, - xfs_dsb_t *from) + xfs_dsb_t *from, + bool convert_xquota) { to->sb_magicnum = be32_to_cpu(from->sb_magicnum); to->sb_blocksize = be32_to_cpu(from->sb_blocksize); @@ -445,6 +446,17 @@ xfs_sb_from_disk( to->sb_pad = 0; to->sb_pquotino = be64_to_cpu(from->sb_pquotino); to->sb_lsn = be64_to_cpu(from->sb_lsn); + /* Convert on-disk flags to in-memory flags? */ + if (convert_xquota) + xfs_sb_quota_from_disk(to); +} + +void +xfs_sb_from_disk( + struct xfs_sb *to, + xfs_dsb_t *from) +{ + __xfs_sb_from_disk(to, from, true); } static inline void @@ -577,7 +589,11 @@ xfs_sb_verify( struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_sb sb; - xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp)); + /* + * Use call variant which doesn't convert quota flags from disk + * format, because xfs_mount_validate_sb checks the on-disk flags. 
+ */ + __xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp), false); /* * Only check the in progress field for the primary superblock as @@ -620,7 +636,7 @@ xfs_sb_read_verify( /* Only fail bad secondaries on a known V5 filesystem */ if (bp->b_bn == XFS_SB_DADDR || xfs_sb_version_hascrc(&mp->m_sb)) { - error = EFSBADCRC; + error = -EFSBADCRC; goto out_error; } } @@ -630,7 +646,7 @@ xfs_sb_read_verify( out_error: if (error) { xfs_buf_ioerror(bp, error); - if (error == EFSCORRUPTED || error == EFSBADCRC) + if (error == -EFSCORRUPTED || error == -EFSBADCRC) xfs_verifier_error(bp); } } @@ -653,7 +669,7 @@ xfs_sb_quiet_read_verify( return; } /* quietly fail */ - xfs_buf_ioerror(bp, EWRONGFS); + xfs_buf_ioerror(bp, -EWRONGFS); } static void diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h index c43c2d609a24..2e739708afd3 100644 --- a/fs/xfs/xfs_sb.h +++ b/fs/xfs/libxfs/xfs_sb.h @@ -87,11 +87,11 @@ struct xfs_trans; typedef struct xfs_sb { __uint32_t sb_magicnum; /* magic number == XFS_SB_MAGIC */ __uint32_t sb_blocksize; /* logical block size, bytes */ - xfs_drfsbno_t sb_dblocks; /* number of data blocks */ - xfs_drfsbno_t sb_rblocks; /* number of realtime blocks */ - xfs_drtbno_t sb_rextents; /* number of realtime extents */ + xfs_rfsblock_t sb_dblocks; /* number of data blocks */ + xfs_rfsblock_t sb_rblocks; /* number of realtime blocks */ + xfs_rtblock_t sb_rextents; /* number of realtime extents */ uuid_t sb_uuid; /* file system unique id */ - xfs_dfsbno_t sb_logstart; /* starting block of log if internal */ + xfs_fsblock_t sb_logstart; /* starting block of log if internal */ xfs_ino_t sb_rootino; /* root inode number */ xfs_ino_t sb_rbmino; /* bitmap inode for realtime extents */ xfs_ino_t sb_rsumino; /* summary inode for rt bitmap */ diff --git a/fs/xfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 82404da2ca67..82404da2ca67 100644 --- a/fs/xfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h diff --git a/fs/xfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index 23c2f2577c8d..5782f037eab4 100644 --- a/fs/xfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -133,9 +133,9 @@ xfs_symlink_read_verify( return; if (!xfs_buf_verify_cksum(bp, XFS_SYMLINK_CRC_OFF)) - xfs_buf_ioerror(bp, EFSBADCRC); + xfs_buf_ioerror(bp, -EFSBADCRC); else if (!xfs_symlink_verify(bp)) - xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_buf_ioerror(bp, -EFSCORRUPTED); if (bp->b_error) xfs_verifier_error(bp); @@ -153,7 +153,7 @@ xfs_symlink_write_verify( return; if (!xfs_symlink_verify(bp)) { - xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_buf_ioerror(bp, -EFSCORRUPTED); xfs_verifier_error(bp); return; } diff --git a/fs/xfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index f2bda7c76b8a..f2bda7c76b8a 100644 --- a/fs/xfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c diff --git a/fs/xfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h index 1097d14cd583..1097d14cd583 100644 --- a/fs/xfs/xfs_trans_resv.h +++ b/fs/xfs/libxfs/xfs_trans_resv.h diff --git a/fs/xfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h index bf9c4579334d..bf9c4579334d 100644 --- a/fs/xfs/xfs_trans_space.h +++ b/fs/xfs/libxfs/xfs_trans_space.h diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 6888ad886ff6..a65fa5dde6e9 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -152,7 +152,7 @@ xfs_get_acl(struct inode *inode, int type) if (!xfs_acl) return ERR_PTR(-ENOMEM); - error = -xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl, + error = xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl, &len, ATTR_ROOT); if 
(error) { /* @@ -210,7 +210,7 @@ __xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) len -= sizeof(struct xfs_acl_entry) * (XFS_ACL_MAX_ENTRIES(ip->i_mount) - acl->a_count); - error = -xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl, + error = xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl, len, ATTR_ROOT); kmem_free(xfs_acl); @@ -218,7 +218,7 @@ __xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) /* * A NULL ACL argument means we want to remove the ACL. */ - error = -xfs_attr_remove(ip, ea_name, ATTR_ROOT); + error = xfs_attr_remove(ip, ea_name, ATTR_ROOT); /* * If the attribute didn't exist to start with that's fine. @@ -244,7 +244,7 @@ xfs_set_mode(struct inode *inode, umode_t mode) iattr.ia_mode = mode; iattr.ia_ctime = current_fs_time(inode->i_sb); - error = -xfs_setattr_nonsize(XFS_I(inode), &iattr, XFS_ATTR_NOACL); + error = xfs_setattr_nonsize(XFS_I(inode), &iattr, XFS_ATTR_NOACL); } return error; diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index faaf716e2080..b984647c24db 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -240,7 +240,7 @@ xfs_end_io( done: if (error) - ioend->io_error = -error; + ioend->io_error = error; xfs_destroy_ioend(ioend); } @@ -308,14 +308,14 @@ xfs_map_blocks( int nimaps = 1; if (XFS_FORCED_SHUTDOWN(mp)) - return -XFS_ERROR(EIO); + return -EIO; if (type == XFS_IO_UNWRITTEN) bmapi_flags |= XFS_BMAPI_IGSTATE; if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { if (nonblocking) - return -XFS_ERROR(EAGAIN); + return -EAGAIN; xfs_ilock(ip, XFS_ILOCK_SHARED); } @@ -332,14 +332,14 @@ xfs_map_blocks( xfs_iunlock(ip, XFS_ILOCK_SHARED); if (error) - return -XFS_ERROR(error); + return error; if (type == XFS_IO_DELALLOC && (!nimaps || isnullstartblock(imap->br_startblock))) { error = xfs_iomap_write_allocate(ip, offset, imap); if (!error) trace_xfs_map_blocks_alloc(ip, offset, count, type, imap); - return -XFS_ERROR(error); + return error; } #ifdef DEBUG @@ -502,7 +502,7 @@ xfs_submit_ioend( * time. */ if (fail) { - ioend->io_error = -fail; + ioend->io_error = fail; xfs_finish_ioend(ioend); continue; } @@ -1253,7 +1253,7 @@ __xfs_get_blocks( int new = 0; if (XFS_FORCED_SHUTDOWN(mp)) - return -XFS_ERROR(EIO); + return -EIO; offset = (xfs_off_t)iblock << inode->i_blkbits; ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); @@ -1302,7 +1302,7 @@ __xfs_get_blocks( error = xfs_iomap_write_direct(ip, offset, size, &imap, nimaps); if (error) - return -error; + return error; new = 1; } else { /* @@ -1415,7 +1415,7 @@ __xfs_get_blocks( out_unlock: xfs_iunlock(ip, lockmode); - return -error; + return error; } int @@ -1753,11 +1753,72 @@ xfs_vm_readpages( return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks); } +/* + * This is basically a copy of __set_page_dirty_buffers() with one + * small tweak: buffers beyond EOF do not get marked dirty. If we mark them + * dirty, we'll never be able to clean them because we don't write buffers + * beyond EOF, and that means we can't invalidate pages that span EOF + * that have been marked dirty. Further, the dirty state can leak into + * the file interior if the file is extended, resulting in all sorts of + * bad things happening as the state does not match the underlying data. + * + * XXX: this really indicates that bufferheads in XFS need to die. Warts like + * this only exist because of bufferheads and how the generic code manages them. 
+ */ +STATIC int +xfs_vm_set_page_dirty( + struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + loff_t end_offset; + loff_t offset; + int newly_dirty; + + if (unlikely(!mapping)) + return !TestSetPageDirty(page); + + end_offset = i_size_read(inode); + offset = page_offset(page); + + spin_lock(&mapping->private_lock); + if (page_has_buffers(page)) { + struct buffer_head *head = page_buffers(page); + struct buffer_head *bh = head; + + do { + if (offset < end_offset) + set_buffer_dirty(bh); + bh = bh->b_this_page; + offset += 1 << inode->i_blkbits; + } while (bh != head); + } + newly_dirty = !TestSetPageDirty(page); + spin_unlock(&mapping->private_lock); + + if (newly_dirty) { + /* sigh - __set_page_dirty() is static, so copy it here, too */ + unsigned long flags; + + spin_lock_irqsave(&mapping->tree_lock, flags); + if (page->mapping) { /* Race with truncate? */ + WARN_ON_ONCE(!PageUptodate(page)); + account_page_dirtied(page, mapping); + radix_tree_tag_set(&mapping->page_tree, + page_index(page), PAGECACHE_TAG_DIRTY); + } + spin_unlock_irqrestore(&mapping->tree_lock, flags); + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + } + return newly_dirty; +} + const struct address_space_operations xfs_address_space_operations = { .readpage = xfs_vm_readpage, .readpages = xfs_vm_readpages, .writepage = xfs_vm_writepage, .writepages = xfs_vm_writepages, + .set_page_dirty = xfs_vm_set_page_dirty, .releasepage = xfs_vm_releasepage, .invalidatepage = xfs_vm_invalidatepage, .write_begin = xfs_vm_write_begin, diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index 09480c57f069..aa2a8b1838a2 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -76,7 +76,7 @@ xfs_attr3_leaf_freextent( error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK); if (error) { - return(error); + return error; } ASSERT(nmap == 1); ASSERT(map.br_startblock != DELAYSTARTBLOCK); @@ -95,21 +95,21 @@ xfs_attr3_leaf_freextent( dp->i_mount->m_ddev_targp, dblkno, dblkcnt, 0); if (!bp) - return ENOMEM; + return -ENOMEM; xfs_trans_binval(*trans, bp); /* * Roll to next transaction. 
*/ error = xfs_trans_roll(trans, dp); if (error) - return (error); + return error; } tblkno += map.br_blockcount; tblkcnt -= map.br_blockcount; } - return(0); + return 0; } /* @@ -227,7 +227,7 @@ xfs_attr3_node_inactive( */ if (level > XFS_DA_NODE_MAXDEPTH) { xfs_trans_brelse(*trans, bp); /* no locks for later trans */ - return XFS_ERROR(EIO); + return -EIO; } node = bp->b_addr; @@ -256,7 +256,7 @@ xfs_attr3_node_inactive( error = xfs_da3_node_read(*trans, dp, child_fsb, -2, &child_bp, XFS_ATTR_FORK); if (error) - return(error); + return error; if (child_bp) { /* save for re-read later */ child_blkno = XFS_BUF_ADDR(child_bp); @@ -277,7 +277,7 @@ xfs_attr3_node_inactive( child_bp); break; default: - error = XFS_ERROR(EIO); + error = -EIO; xfs_trans_brelse(*trans, child_bp); break; } @@ -360,7 +360,7 @@ xfs_attr3_root_inactive( error = xfs_attr3_leaf_inactive(trans, dp, bp); break; default: - error = XFS_ERROR(EIO); + error = -EIO; xfs_trans_brelse(*trans, bp); break; } @@ -414,7 +414,7 @@ xfs_attr_inactive(xfs_inode_t *dp) error = xfs_trans_reserve(trans, &M_RES(mp)->tr_attrinval, 0, 0); if (error) { xfs_trans_cancel(trans, 0); - return(error); + return error; } xfs_ilock(dp, XFS_ILOCK_EXCL); @@ -443,10 +443,10 @@ xfs_attr_inactive(xfs_inode_t *dp) error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES); xfs_iunlock(dp, XFS_ILOCK_EXCL); - return(error); + return error; out: xfs_trans_cancel(trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); xfs_iunlock(dp, XFS_ILOCK_EXCL); - return(error); + return error; } diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 90e2eeb21207..62db83ab6cbc 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -50,11 +50,11 @@ xfs_attr_shortform_compare(const void *a, const void *b) sa = (xfs_attr_sf_sort_t *)a; sb = (xfs_attr_sf_sort_t *)b; if (sa->hash < sb->hash) { - return(-1); + return -1; } else if (sa->hash > sb->hash) { - return(1); + return 1; } else { - return(sa->entno - sb->entno); + return sa->entno - sb->entno; } } @@ -86,7 +86,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data; ASSERT(sf != NULL); if (!sf->hdr.count) - return(0); + return 0; cursor = context->cursor; ASSERT(cursor != NULL); @@ -124,7 +124,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) sfe = XFS_ATTR_SF_NEXTENTRY(sfe); } trace_xfs_attr_list_sf_all(context); - return(0); + return 0; } /* do no more for a search callback */ @@ -150,7 +150,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) XFS_ERRLEVEL_LOW, context->dp->i_mount, sfe); kmem_free(sbuf); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } sbp->entno = i; @@ -188,7 +188,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) } if (i == nsbuf) { kmem_free(sbuf); - return(0); + return 0; } /* @@ -213,7 +213,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) } kmem_free(sbuf); - return(0); + return 0; } STATIC int @@ -243,8 +243,8 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) if (cursor->blkno > 0) { error = xfs_da3_node_read(NULL, dp, cursor->blkno, -1, &bp, XFS_ATTR_FORK); - if ((error != 0) && (error != EFSCORRUPTED)) - return(error); + if ((error != 0) && (error != -EFSCORRUPTED)) + return error; if (bp) { struct xfs_attr_leaf_entry *entries; @@ -295,7 +295,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) cursor->blkno, -1, &bp, XFS_ATTR_FORK); if (error) - return(error); + return error; node = bp->b_addr; magic = be16_to_cpu(node->hdr.info.magic); if 
(magic == XFS_ATTR_LEAF_MAGIC || @@ -308,7 +308,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) context->dp->i_mount, node); xfs_trans_brelse(NULL, bp); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } dp->d_ops->node_hdr_from_disk(&nodehdr, node); @@ -496,11 +496,11 @@ xfs_attr_leaf_list(xfs_attr_list_context_t *context) context->cursor->blkno = 0; error = xfs_attr3_leaf_read(NULL, context->dp, 0, -1, &bp); if (error) - return XFS_ERROR(error); + return error; error = xfs_attr3_leaf_list_int(bp, context); xfs_trans_brelse(NULL, bp); - return XFS_ERROR(error); + return error; } int @@ -514,7 +514,7 @@ xfs_attr_list_int( XFS_STATS_INC(xs_attr_list); if (XFS_FORCED_SHUTDOWN(dp->i_mount)) - return EIO; + return -EIO; /* * Decide on what work routines to call based on the inode size. @@ -616,16 +616,16 @@ xfs_attr_list( * Validate the cursor. */ if (cursor->pad1 || cursor->pad2) - return(XFS_ERROR(EINVAL)); + return -EINVAL; if ((cursor->initted == 0) && (cursor->hashval || cursor->blkno || cursor->offset)) - return XFS_ERROR(EINVAL); + return -EINVAL; /* * Check for a properly aligned buffer. */ if (((long)buffer) & (sizeof(int)-1)) - return XFS_ERROR(EFAULT); + return -EFAULT; if (flags & ATTR_KERNOVAL) bufsize = 0; @@ -648,6 +648,6 @@ xfs_attr_list( alist->al_offset[0] = context.bufsize; error = xfs_attr_list_int(&context); - ASSERT(error >= 0); + ASSERT(error <= 0); return error; } diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 64731ef3324d..1707980f9a4b 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -133,7 +133,7 @@ xfs_bmap_finish( mp = ntp->t_mountp; if (!XFS_FORCED_SHUTDOWN(mp)) xfs_force_shutdown(mp, - (error == EFSCORRUPTED) ? + (error == -EFSCORRUPTED) ? SHUTDOWN_CORRUPT_INCORE : SHUTDOWN_META_IO_ERROR); return error; @@ -365,7 +365,7 @@ xfs_bmap_count_tree( xfs_trans_brelse(tp, bp); XFS_ERROR_REPORT("xfs_bmap_count_tree(1)", XFS_ERRLEVEL_LOW, mp); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } xfs_trans_brelse(tp, bp); } else { @@ -425,14 +425,14 @@ xfs_bmap_count_blocks( ASSERT(level > 0); pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); bno = be64_to_cpu(*pp); - ASSERT(bno != NULLDFSBNO); + ASSERT(bno != NULLFSBLOCK); ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) { XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW, mp); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } return 0; @@ -524,13 +524,13 @@ xfs_getbmap( if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS && ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE && ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) - return XFS_ERROR(EINVAL); + return -EINVAL; } else if (unlikely( ip->i_d.di_aformat != 0 && ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) { XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW, ip->i_mount); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } prealloced = 0; @@ -539,7 +539,7 @@ xfs_getbmap( if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS && ip->i_d.di_format != XFS_DINODE_FMT_BTREE && ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) - return XFS_ERROR(EINVAL); + return -EINVAL; if (xfs_get_extsz_hint(ip) || ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){ @@ -559,26 +559,26 @@ xfs_getbmap( bmv->bmv_entries = 0; return 0; } else if (bmv->bmv_length < 0) { - return XFS_ERROR(EINVAL); + return -EINVAL; } nex = bmv->bmv_count - 1; if (nex <= 0) - return 
XFS_ERROR(EINVAL); + return -EINVAL; bmvend = bmv->bmv_offset + bmv->bmv_length; if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx)) - return XFS_ERROR(ENOMEM); + return -ENOMEM; out = kmem_zalloc_large(bmv->bmv_count * sizeof(struct getbmapx), 0); if (!out) - return XFS_ERROR(ENOMEM); + return -ENOMEM; xfs_ilock(ip, XFS_IOLOCK_SHARED); if (whichfork == XFS_DATA_FORK) { if (!(iflags & BMV_IF_DELALLOC) && (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) { - error = -filemap_write_and_wait(VFS_I(ip)->i_mapping); + error = filemap_write_and_wait(VFS_I(ip)->i_mapping); if (error) goto out_unlock_iolock; @@ -611,7 +611,7 @@ xfs_getbmap( /* * Allocate enough space to handle "subnex" maps at a time. */ - error = ENOMEM; + error = -ENOMEM; subnex = 16; map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS); if (!map) @@ -809,7 +809,7 @@ xfs_can_free_eofblocks(struct xfs_inode *ip, bool force) * have speculative prealloc/delalloc blocks to remove. */ if (VFS_I(ip)->i_size == 0 && - VN_CACHED(VFS_I(ip)) == 0 && + VFS_I(ip)->i_mapping->nrpages == 0 && ip->i_delayed_blks == 0) return false; @@ -882,7 +882,7 @@ xfs_free_eofblocks( if (need_iolock) { if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { xfs_trans_cancel(tp, 0); - return EAGAIN; + return -EAGAIN; } } @@ -955,14 +955,14 @@ xfs_alloc_file_space( trace_xfs_alloc_file_space(ip); if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); + return -EIO; error = xfs_qm_dqattach(ip, 0); if (error) return error; if (len <= 0) - return XFS_ERROR(EINVAL); + return -EINVAL; rt = XFS_IS_REALTIME_INODE(ip); extsz = xfs_get_extsz_hint(ip); @@ -1028,7 +1028,7 @@ xfs_alloc_file_space( /* * Free the transaction structure. */ - ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp)); + ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp)); xfs_trans_cancel(tp, 0); break; } @@ -1065,7 +1065,7 @@ xfs_alloc_file_space( allocated_fsb = imapp->br_blockcount; if (nimaps == 0) { - error = XFS_ERROR(ENOSPC); + error = -ENOSPC; break; } @@ -1126,7 +1126,7 @@ xfs_zero_remaining_bytes( mp->m_rtdev_targp : mp->m_ddev_targp, BTOBB(mp->m_sb.sb_blocksize), 0); if (!bp) - return XFS_ERROR(ENOMEM); + return -ENOMEM; xfs_buf_unlock(bp); @@ -1158,7 +1158,7 @@ xfs_zero_remaining_bytes( XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock)); if (XFS_FORCED_SHUTDOWN(mp)) { - error = XFS_ERROR(EIO); + error = -EIO; break; } xfs_buf_iorequest(bp); @@ -1176,7 +1176,7 @@ xfs_zero_remaining_bytes( XFS_BUF_WRITE(bp); if (XFS_FORCED_SHUTDOWN(mp)) { - error = XFS_ERROR(EIO); + error = -EIO; break; } xfs_buf_iorequest(bp); @@ -1234,7 +1234,7 @@ xfs_free_file_space( rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); ioffset = offset & ~(rounding - 1); - error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ioffset, -1); if (error) goto out; @@ -1315,7 +1315,7 @@ xfs_free_file_space( /* * Free the transaction structure. */ - ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp)); + ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp)); xfs_trans_cancel(tp, 0); break; } @@ -1470,6 +1470,26 @@ xfs_collapse_file_space( start_fsb = XFS_B_TO_FSB(mp, offset + len); shift_fsb = XFS_B_TO_FSB(mp, len); + /* + * Writeback the entire file and force remove any post-eof blocks. The + * writeback prevents changes to the extent list via concurrent + * writeback and the eofblocks trim prevents the extent shift algorithm + * from running into a post-eof delalloc extent. 
+ * + * XXX: This is a temporary fix until the extent shift loop below is + * converted to use offsets and lookups within the ILOCK rather than + * carrying around the index into the extent list for the next + * iteration. + */ + error = filemap_write_and_wait(VFS_I(ip)->i_mapping); + if (error) + return error; + if (xfs_can_free_eofblocks(ip, true)) { + error = xfs_free_eofblocks(mp, ip, false); + if (error) + return error; + } + error = xfs_free_file_space(ip, offset, len); if (error) return error; @@ -1557,14 +1577,14 @@ xfs_swap_extents_check_format( /* Should never get a local format */ if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL || tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) - return EINVAL; + return -EINVAL; /* * if the target inode has less extents that then temporary inode then * why did userspace call us? */ if (ip->i_d.di_nextents < tip->i_d.di_nextents) - return EINVAL; + return -EINVAL; /* * if the target inode is in extent form and the temp inode is in btree @@ -1573,19 +1593,19 @@ xfs_swap_extents_check_format( */ if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && tip->i_d.di_format == XFS_DINODE_FMT_BTREE) - return EINVAL; + return -EINVAL; /* Check temp in extent form to max in target */ if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) - return EINVAL; + return -EINVAL; /* Check target in extent form to max in temp */ if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) - return EINVAL; + return -EINVAL; /* * If we are in a btree format, check that the temp root block will fit @@ -1599,26 +1619,50 @@ xfs_swap_extents_check_format( if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { if (XFS_IFORK_BOFF(ip) && XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip)) - return EINVAL; + return -EINVAL; if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) - return EINVAL; + return -EINVAL; } /* Reciprocal target->temp btree format checks */ if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { if (XFS_IFORK_BOFF(tip) && XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip)) - return EINVAL; + return -EINVAL; if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) - return EINVAL; + return -EINVAL; } return 0; } int +xfs_swap_extent_flush( + struct xfs_inode *ip) +{ + int error; + + error = filemap_write_and_wait(VFS_I(ip)->i_mapping); + if (error) + return error; + truncate_pagecache_range(VFS_I(ip), 0, -1); + + /* Verify O_DIRECT for ftmp */ + if (VFS_I(ip)->i_mapping->nrpages) + return -EINVAL; + + /* + * Don't try to swap extents on mmap()d files because we can't lock + * out races against page faults safely. + */ + if (mapping_mapped(VFS_I(ip)->i_mapping)) + return -EBUSY; + return 0; +} + +int xfs_swap_extents( xfs_inode_t *ip, /* target inode */ xfs_inode_t *tip, /* tmp inode */ @@ -1633,51 +1677,57 @@ xfs_swap_extents( int aforkblks = 0; int taforkblks = 0; __uint64_t tmp; + int lock_flags; tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL); if (!tempifp) { - error = XFS_ERROR(ENOMEM); + error = -ENOMEM; goto out; } /* - * we have to do two separate lock calls here to keep lockdep - * happy. If we try to get all the locks in one call, lock will - * report false positives when we drop the ILOCK and regain them - * below. + * Lock up the inodes against other IO and truncate to begin with. 
+ * Then we can ensure the inodes are flushed and have no page cache + * safely. Once we have done this we can take the ilocks and do the rest + * of the checks. */ + lock_flags = XFS_IOLOCK_EXCL; xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); - xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); /* Verify that both files have the same format */ if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) { - error = XFS_ERROR(EINVAL); + error = -EINVAL; goto out_unlock; } /* Verify both files are either real-time or non-realtime */ if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) { - error = XFS_ERROR(EINVAL); + error = -EINVAL; goto out_unlock; } - error = -filemap_write_and_wait(VFS_I(tip)->i_mapping); + error = xfs_swap_extent_flush(ip); + if (error) + goto out_unlock; + error = xfs_swap_extent_flush(tip); if (error) goto out_unlock; - truncate_pagecache_range(VFS_I(tip), 0, -1); - /* Verify O_DIRECT for ftmp */ - if (VN_CACHED(VFS_I(tip)) != 0) { - error = XFS_ERROR(EINVAL); + tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); goto out_unlock; } + xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); + lock_flags |= XFS_ILOCK_EXCL; /* Verify all data are being swapped */ if (sxp->sx_offset != 0 || sxp->sx_length != ip->i_d.di_size || sxp->sx_length != tip->i_d.di_size) { - error = XFS_ERROR(EFAULT); - goto out_unlock; + error = -EFAULT; + goto out_trans_cancel; } trace_xfs_swap_extent_before(ip, 0); @@ -1689,7 +1739,7 @@ xfs_swap_extents( xfs_notice(mp, "%s: inode 0x%llx format is incompatible for exchanging.", __func__, ip->i_ino); - goto out_unlock; + goto out_trans_cancel; } /* @@ -1703,43 +1753,9 @@ xfs_swap_extents( (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) || (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) || (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) { - error = XFS_ERROR(EBUSY); - goto out_unlock; + error = -EBUSY; + goto out_trans_cancel; } - - /* We need to fail if the file is memory mapped. Once we have tossed - * all existing pages, the page fault will have no option - * but to go to the filesystem for pages. By making the page fault call - * vop_read (or write in the case of autogrow) they block on the iolock - * until we have switched the extents. - */ - if (VN_MAPPED(VFS_I(ip))) { - error = XFS_ERROR(EBUSY); - goto out_unlock; - } - - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_iunlock(tip, XFS_ILOCK_EXCL); - - /* - * There is a race condition here since we gave up the - * ilock. However, the data fork will not change since - * we have the iolock (locked for truncation too) so we - * are safe. We don't really care if non-io related - * fields change. 
- */ - truncate_pagecache_range(VFS_I(ip), 0, -1); - - tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); - if (error) { - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - xfs_iunlock(tip, XFS_IOLOCK_EXCL); - xfs_trans_cancel(tp, 0); - goto out; - } - xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); - /* * Count the number of extended attribute blocks */ @@ -1757,8 +1773,8 @@ xfs_swap_extents( goto out_trans_cancel; } - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_trans_ijoin(tp, ip, lock_flags); + xfs_trans_ijoin(tp, tip, lock_flags); /* * Before we've swapped the forks, lets set the owners of the forks @@ -1887,8 +1903,8 @@ out: return error; out_unlock: - xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_iunlock(ip, lock_flags); + xfs_iunlock(tip, lock_flags); goto out; out_trans_cancel: diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 7a34a1ae6552..cd7b8ca9b064 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -130,7 +130,7 @@ xfs_buf_get_maps( bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map), KM_NOFS); if (!bp->b_maps) - return ENOMEM; + return -ENOMEM; return 0; } @@ -344,7 +344,7 @@ retry: if (unlikely(page == NULL)) { if (flags & XBF_READ_AHEAD) { bp->b_page_count = i; - error = ENOMEM; + error = -ENOMEM; goto out_free_pages; } @@ -465,7 +465,7 @@ _xfs_buf_find( eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); if (blkno >= eofs) { /* - * XXX (dgc): we should really be returning EFSCORRUPTED here, + * XXX (dgc): we should really be returning -EFSCORRUPTED here, * but none of the higher level infrastructure supports * returning a specific error on buffer lookup failures. */ @@ -1052,8 +1052,8 @@ xfs_buf_ioerror( xfs_buf_t *bp, int error) { - ASSERT(error >= 0 && error <= 0xffff); - bp->b_error = (unsigned short)error; + ASSERT(error <= 0 && error >= -1000); + bp->b_error = error; trace_xfs_buf_ioerror(bp, error, _RET_IP_); } @@ -1064,7 +1064,7 @@ xfs_buf_ioerror_alert( { xfs_alert(bp->b_target->bt_mount, "metadata I/O error: block 0x%llx (\"%s\") error %d numblks %d", - (__uint64_t)XFS_BUF_ADDR(bp), func, bp->b_error, bp->b_length); + (__uint64_t)XFS_BUF_ADDR(bp), func, -bp->b_error, bp->b_length); } /* @@ -1083,7 +1083,7 @@ xfs_bioerror( /* * No need to wait until the buffer is unpinned, we aren't flushing it. */ - xfs_buf_ioerror(bp, EIO); + xfs_buf_ioerror(bp, -EIO); /* * We're calling xfs_buf_ioend, so delete XBF_DONE flag. @@ -1094,7 +1094,7 @@ xfs_bioerror( xfs_buf_ioend(bp, 0); - return EIO; + return -EIO; } /* @@ -1127,13 +1127,13 @@ xfs_bioerror_relse( * There's no reason to mark error for * ASYNC buffers. */ - xfs_buf_ioerror(bp, EIO); + xfs_buf_ioerror(bp, -EIO); complete(&bp->b_iowait); } else { xfs_buf_relse(bp); } - return EIO; + return -EIO; } STATIC int @@ -1199,7 +1199,7 @@ xfs_buf_bio_end_io( * buffers that require multiple bios to complete. */ if (!bp->b_error) - xfs_buf_ioerror(bp, -error); + xfs_buf_ioerror(bp, error); if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); @@ -1286,7 +1286,7 @@ next_chunk: * because the caller (xfs_buf_iorequest) holds a count itself. 
*/ atomic_dec(&bp->b_io_remaining); - xfs_buf_ioerror(bp, EIO); + xfs_buf_ioerror(bp, -EIO); bio_put(bio); } @@ -1330,6 +1330,20 @@ _xfs_buf_ioapply( SHUTDOWN_CORRUPT_INCORE); return; } + } else if (bp->b_bn != XFS_BUF_DADDR_NULL) { + struct xfs_mount *mp = bp->b_target->bt_mount; + + /* + * non-crc filesystems don't attach verifiers during + * log recovery, so don't warn for such filesystems. + */ + if (xfs_sb_version_hascrc(&mp->m_sb)) { + xfs_warn(mp, + "%s: no ops on block 0x%llx/0x%x", + __func__, bp->b_bn, bp->b_length); + xfs_hex_dump(bp->b_addr, 64); + dump_stack(); + } } } else if (bp->b_flags & XBF_READ_AHEAD) { rw = READA; @@ -1628,7 +1642,7 @@ xfs_setsize_buftarg( xfs_warn(btp->bt_mount, "Cannot set_blocksize to %u on device %s", sectorsize, name); - return EINVAL; + return -EINVAL; } /* Set up device logical sector size mask */ diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 3a7a5523d3dc..c753183900b3 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -178,7 +178,7 @@ typedef struct xfs_buf { atomic_t b_io_remaining; /* #outstanding I/O requests */ unsigned int b_page_count; /* size of page array */ unsigned int b_offset; /* page offset in first page */ - unsigned short b_error; /* error code on I/O */ + int b_error; /* error code on I/O */ const struct xfs_buf_ops *b_ops; #ifdef XFS_BUF_LOCK_TRACKING diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 4654338b03fc..76007deed31f 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -488,7 +488,7 @@ xfs_buf_item_unpin( xfs_buf_lock(bp); xfs_buf_hold(bp); bp->b_flags |= XBF_ASYNC; - xfs_buf_ioerror(bp, EIO); + xfs_buf_ioerror(bp, -EIO); XFS_BUF_UNDONE(bp); xfs_buf_stale(bp); xfs_buf_ioend(bp, 0); @@ -725,7 +725,7 @@ xfs_buf_item_get_format( bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format), KM_SLEEP); if (!bip->bli_formats) - return ENOMEM; + return -ENOMEM; return 0; } diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 48e99afb9cb0..f1b69edcdf31 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -95,7 +95,7 @@ xfs_dir2_sf_getdents( */ if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); - return XFS_ERROR(EIO); + return -EIO; } ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); @@ -677,7 +677,7 @@ xfs_readdir( trace_xfs_readdir(dp); if (XFS_FORCED_SHUTDOWN(dp->i_mount)) - return XFS_ERROR(EIO); + return -EIO; ASSERT(S_ISDIR(dp->i_d.di_mode)); XFS_STATS_INC(xs_dir_getdents); diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 4f11ef011139..13d08a1b390e 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -124,7 +124,7 @@ xfs_trim_extents( } trace_xfs_discard_extent(mp, agno, fbno, flen); - error = -blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS, 0); + error = blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS, 0); if (error) goto out_del_cursor; *blocks_trimmed += flen; @@ -166,11 +166,11 @@ xfs_ioc_trim( int error, last_error = 0; if (!capable(CAP_SYS_ADMIN)) - return -XFS_ERROR(EPERM); + return -EPERM; if (!blk_queue_discard(q)) - return -XFS_ERROR(EOPNOTSUPP); + return -EOPNOTSUPP; if (copy_from_user(&range, urange, sizeof(range))) - return -XFS_ERROR(EFAULT); + return -EFAULT; /* * Truncating down the len isn't actually quite correct, but using @@ -182,7 +182,7 @@ xfs_ioc_trim( if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) || range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp)) || range.len < mp->m_sb.sb_blocksize) - return 
-XFS_ERROR(EINVAL); + return -EINVAL; start = BTOBB(range.start); end = start + BTOBBT(range.len) - 1; @@ -195,7 +195,7 @@ xfs_ioc_trim( end_agno = xfs_daddr_to_agno(mp, end); for (agno = start_agno; agno <= end_agno; agno++) { - error = -xfs_trim_extents(mp, agno, start, end, minlen, + error = xfs_trim_extents(mp, agno, start, end, minlen, &blocks_trimmed); if (error) last_error = error; @@ -206,7 +206,7 @@ xfs_ioc_trim( range.len = XFS_FSB_TO_B(mp, blocks_trimmed); if (copy_to_user(urange, &range, sizeof(range))) - return -XFS_ERROR(EFAULT); + return -EFAULT; return 0; } @@ -222,11 +222,11 @@ xfs_discard_extents( trace_xfs_discard_extent(mp, busyp->agno, busyp->bno, busyp->length); - error = -blkdev_issue_discard(mp->m_ddev_targp->bt_bdev, + error = blkdev_issue_discard(mp->m_ddev_targp->bt_bdev, XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno), XFS_FSB_TO_BB(mp, busyp->length), GFP_NOFS, 0); - if (error && error != EOPNOTSUPP) { + if (error && error != -EOPNOTSUPP) { xfs_info(mp, "discard failed for extent [0x%llu,%u], error %d", (unsigned long long)busyp->bno, diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 3ee0cd43edc0..63c2de49f61d 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -327,7 +327,7 @@ xfs_qm_dqalloc( */ if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) { xfs_iunlock(quotip, XFS_ILOCK_EXCL); - return (ESRCH); + return -ESRCH; } xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL); @@ -354,7 +354,7 @@ xfs_qm_dqalloc( mp->m_quotainfo->qi_dqchunklen, 0); if (!bp) { - error = ENOMEM; + error = -ENOMEM; goto error1; } bp->b_ops = &xfs_dquot_buf_ops; @@ -400,7 +400,7 @@ xfs_qm_dqalloc( error0: xfs_iunlock(quotip, XFS_ILOCK_EXCL); - return (error); + return error; } STATIC int @@ -426,7 +426,7 @@ xfs_qm_dqrepair( if (error) { ASSERT(*bpp == NULL); - return XFS_ERROR(error); + return error; } (*bpp)->b_ops = &xfs_dquot_buf_ops; @@ -442,7 +442,7 @@ xfs_qm_dqrepair( if (error) { /* repair failed, we're screwed */ xfs_trans_brelse(tp, *bpp); - return XFS_ERROR(EIO); + return -EIO; } } @@ -480,7 +480,7 @@ xfs_qm_dqtobp( * didn't have the quota inode lock. */ xfs_iunlock(quotip, lock_mode); - return ESRCH; + return -ESRCH; } /* @@ -508,7 +508,7 @@ xfs_qm_dqtobp( * We don't allocate unless we're asked to */ if (!(flags & XFS_QMOPT_DQALLOC)) - return ENOENT; + return -ENOENT; ASSERT(tp); error = xfs_qm_dqalloc(tpp, mp, dqp, quotip, @@ -530,7 +530,7 @@ xfs_qm_dqtobp( mp->m_quotainfo->qi_dqchunklen, 0, &bp, &xfs_dquot_buf_ops); - if (error == EFSCORRUPTED && (flags & XFS_QMOPT_DQREPAIR)) { + if (error == -EFSCORRUPTED && (flags & XFS_QMOPT_DQREPAIR)) { xfs_dqid_t firstid = (xfs_dqid_t)map.br_startoff * mp->m_quotainfo->qi_dqperchunk; ASSERT(bp == NULL); @@ -539,7 +539,7 @@ xfs_qm_dqtobp( if (error) { ASSERT(bp == NULL); - return XFS_ERROR(error); + return error; } } @@ -547,7 +547,7 @@ xfs_qm_dqtobp( *O_bpp = bp; *O_ddpp = bp->b_addr + dqp->q_bufoffset; - return (0); + return 0; } @@ -715,7 +715,7 @@ xfs_qm_dqget( if ((! XFS_IS_UQUOTA_ON(mp) && type == XFS_DQ_USER) || (! XFS_IS_PQUOTA_ON(mp) && type == XFS_DQ_PROJ) || (! 
XFS_IS_GQUOTA_ON(mp) && type == XFS_DQ_GROUP)) { - return (ESRCH); + return -ESRCH; } #ifdef DEBUG @@ -723,7 +723,7 @@ xfs_qm_dqget( if ((xfs_dqerror_target == mp->m_ddev_targp) && (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) { xfs_debug(mp, "Returning error in dqget"); - return (EIO); + return -EIO; } } @@ -796,14 +796,14 @@ restart: } else { /* inode stays locked on return */ xfs_qm_dqdestroy(dqp); - return XFS_ERROR(ESRCH); + return -ESRCH; } } mutex_lock(&qi->qi_tree_lock); - error = -radix_tree_insert(tree, id, dqp); + error = radix_tree_insert(tree, id, dqp); if (unlikely(error)) { - WARN_ON(error != EEXIST); + WARN_ON(error != -EEXIST); /* * Duplicate found. Just throw away the new dquot and start @@ -829,7 +829,7 @@ restart: ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL)); trace_xfs_dqget_miss(dqp); *O_dqpp = dqp; - return (0); + return 0; } /* @@ -966,7 +966,7 @@ xfs_qm_dqflush( SHUTDOWN_CORRUPT_INCORE); else spin_unlock(&mp->m_ail->xa_lock); - error = XFS_ERROR(EIO); + error = -EIO; goto out_unlock; } @@ -974,7 +974,8 @@ xfs_qm_dqflush( * Get the buffer containing the on-disk dquot */ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen, 0, &bp, NULL); + mp->m_quotainfo->qi_dqchunklen, 0, &bp, + &xfs_dquot_buf_ops); if (error) goto out_unlock; @@ -992,7 +993,7 @@ xfs_qm_dqflush( xfs_buf_relse(bp); xfs_dqfunlock(dqp); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - return XFS_ERROR(EIO); + return -EIO; } /* This is the only portion of data that needs to persist */ @@ -1045,7 +1046,7 @@ xfs_qm_dqflush( out_unlock: xfs_dqfunlock(dqp); - return XFS_ERROR(EIO); + return -EIO; } /* diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 68a68f704837..c24c67e22a2a 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -139,6 +139,21 @@ static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type) } } +/* + * Check whether a dquot is under low free space conditions. We assume the quota + * is enabled and enforced. + */ +static inline bool xfs_dquot_lowsp(struct xfs_dquot *dqp) +{ + int64_t freesp; + + freesp = be64_to_cpu(dqp->q_core.d_blk_hardlimit) - dqp->q_res_bcount; + if (freesp < dqp->q_low_space[XFS_QLOWSP_1_PCNT]) + return true; + + return false; +} + #define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock))) #define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY) #define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index edac5b057d28..b92fd7bc49e3 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -27,29 +27,6 @@ #ifdef DEBUG -int xfs_etrap[XFS_ERROR_NTRAP] = { - 0, -}; - -int -xfs_error_trap(int e) -{ - int i; - - if (!e) - return 0; - for (i = 0; i < XFS_ERROR_NTRAP; i++) { - if (xfs_etrap[i] == 0) - break; - if (e != xfs_etrap[i]) - continue; - xfs_notice(NULL, "%s: error %d", __func__, e); - BUG(); - break; - } - return e; -} - int xfs_etest[XFS_NUM_INJECT_ERROR]; int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR]; char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR]; @@ -190,7 +167,7 @@ xfs_verifier_error( struct xfs_mount *mp = bp->b_target->bt_mount; xfs_alert(mp, "Metadata %s detected at %pF, block 0x%llx", - bp->b_error == EFSBADCRC ? "CRC error" : "corruption", + bp->b_error == -EFSBADCRC ? 
"CRC error" : "corruption", __return_address, bp->b_bn); xfs_alert(mp, "Unmount and run xfs_repair"); diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index c1c57d4a4b5d..279a76e52791 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -18,15 +18,6 @@ #ifndef __XFS_ERROR_H__ #define __XFS_ERROR_H__ -#ifdef DEBUG -#define XFS_ERROR_NTRAP 10 -extern int xfs_etrap[XFS_ERROR_NTRAP]; -extern int xfs_error_trap(int); -#define XFS_ERROR(e) xfs_error_trap(e) -#else -#define XFS_ERROR(e) (e) -#endif - struct xfs_mount; extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp, @@ -56,7 +47,7 @@ extern void xfs_verifier_error(struct xfs_buf *bp); if (unlikely(!fs_is_ok)) { \ XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_GOTO", \ XFS_ERRLEVEL_LOW, NULL); \ - error = XFS_ERROR(EFSCORRUPTED); \ + error = -EFSCORRUPTED; \ goto l; \ } \ } @@ -68,7 +59,7 @@ extern void xfs_verifier_error(struct xfs_buf *bp); if (unlikely(!fs_is_ok)) { \ XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_RETURN", \ XFS_ERRLEVEL_LOW, NULL); \ - return XFS_ERROR(EFSCORRUPTED); \ + return -EFSCORRUPTED; \ } \ } diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index 753e467aa1a5..5a6bd5d8779a 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -147,9 +147,9 @@ xfs_nfs_get_inode( * We don't use ESTALE directly down the chain to not * confuse applications using bulkstat that expect EINVAL. */ - if (error == EINVAL || error == ENOENT) - error = ESTALE; - return ERR_PTR(-error); + if (error == -EINVAL || error == -ENOENT) + error = -ESTALE; + return ERR_PTR(error); } if (ip->i_d.di_gen != generation) { @@ -217,7 +217,7 @@ xfs_fs_get_parent( error = xfs_lookup(XFS_I(child->d_inode), &xfs_name_dotdot, &cip, NULL); if (unlikely(error)) - return ERR_PTR(-error); + return ERR_PTR(error); return d_obtain_alias(VFS_I(cip)); } @@ -237,7 +237,7 @@ xfs_fs_nfs_commit_metadata( if (!lsn) return 0; - return -_xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); + return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); } const struct export_operations xfs_export_operations = { diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index fb7a4c1ce1c5..c4327419dc5c 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -298,7 +298,7 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) } return 0; } - return EFSCORRUPTED; + return -EFSCORRUPTED; } /* diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 1f66779d7a46..de5368c803f9 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -38,6 +38,7 @@ #include "xfs_trace.h" #include "xfs_log.h" #include "xfs_dinode.h" +#include "xfs_icache.h" #include <linux/aio.h> #include <linux/dcache.h> @@ -155,7 +156,7 @@ xfs_dir_fsync( if (!lsn) return 0; - return -_xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); + return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); } STATIC int @@ -179,7 +180,7 @@ xfs_file_fsync( return error; if (XFS_FORCED_SHUTDOWN(mp)) - return -XFS_ERROR(EIO); + return -EIO; xfs_iflags_clear(ip, XFS_ITRUNCATED); @@ -225,7 +226,7 @@ xfs_file_fsync( !log_flushed) xfs_blkdev_issue_flush(mp->m_ddev_targp); - return -error; + return error; } STATIC ssize_t @@ -246,11 +247,11 @@ xfs_file_read_iter( XFS_STATS_INC(xs_read_calls); if (unlikely(file->f_flags & O_DIRECT)) - ioflags |= IO_ISDIRECT; + ioflags |= XFS_IO_ISDIRECT; if (file->f_mode & FMODE_NOCMTIME) - ioflags |= IO_INVIS; + ioflags |= XFS_IO_INVIS; - if (unlikely(ioflags & IO_ISDIRECT)) { + if (unlikely(ioflags & XFS_IO_ISDIRECT)) { xfs_buftarg_t 
*target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; @@ -258,7 +259,7 @@ xfs_file_read_iter( if ((pos | size) & target->bt_logical_sectormask) { if (pos == i_size_read(inode)) return 0; - return -XFS_ERROR(EINVAL); + return -EINVAL; } } @@ -283,19 +284,29 @@ xfs_file_read_iter( * proceeed concurrently without serialisation. */ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); - if ((ioflags & IO_ISDIRECT) && inode->i_mapping->nrpages) { + if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) { xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); if (inode->i_mapping->nrpages) { ret = filemap_write_and_wait_range( VFS_I(ip)->i_mapping, - pos, -1); + pos, pos + size - 1); if (ret) { xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } - truncate_pagecache_range(VFS_I(ip), pos, -1); + + /* + * Invalidate whole pages. This can return an error if + * we fail to invalidate a page, but this should never + * happen on XFS. Warn if it does fail. + */ + ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, + pos >> PAGE_CACHE_SHIFT, + (pos + size - 1) >> PAGE_CACHE_SHIFT); + WARN_ON_ONCE(ret); + ret = 0; } xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); } @@ -325,7 +336,7 @@ xfs_file_splice_read( XFS_STATS_INC(xs_read_calls); if (infilp->f_mode & FMODE_NOCMTIME) - ioflags |= IO_INVIS; + ioflags |= XFS_IO_INVIS; if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; @@ -524,7 +535,7 @@ restart: xfs_rw_ilock(ip, *iolock); goto restart; } - error = -xfs_zero_eof(ip, *pos, i_size_read(inode)); + error = xfs_zero_eof(ip, *pos, i_size_read(inode)); if (error) return error; } @@ -594,7 +605,7 @@ xfs_file_dio_aio_write( /* DIO must be aligned to device logical sector size */ if ((pos | count) & target->bt_logical_sectormask) - return -XFS_ERROR(EINVAL); + return -EINVAL; /* "unaligned" here means not aligned to a filesystem block */ if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask)) @@ -631,10 +642,19 @@ xfs_file_dio_aio_write( if (mapping->nrpages) { ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, - pos, -1); + pos, pos + count - 1); if (ret) goto out; - truncate_pagecache_range(VFS_I(ip), pos, -1); + /* + * Invalidate whole pages. This can return an error if + * we fail to invalidate a page, but this should never + * happen on XFS. Warn if it does fail. + */ + ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, + pos >> PAGE_CACHE_SHIFT, + (pos + count - 1) >> PAGE_CACHE_SHIFT); + WARN_ON_ONCE(ret); + ret = 0; } /* @@ -689,14 +709,28 @@ write_retry: ret = generic_perform_write(file, from, pos); if (likely(ret >= 0)) iocb->ki_pos = pos + ret; + /* - * If we just got an ENOSPC, try to write back all dirty inodes to - * convert delalloc space to free up some of the excess reserved - * metadata space. + * If we hit a space limit, try to free up some lingering preallocated + * space before returning an error. In the case of ENOSPC, first try to + * write back all dirty inodes to free up some of the excess reserved + * metadata space. This reduces the chances that the eofblocks scan + * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this + * also behaves as a filter to prevent too many eofblocks scans from + * running at the same time. 
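
The write_retry logic described in the comment above follows a retry-once pattern. A minimal compilable sketch, with do_buffered_write(), reclaim_quota_blocks() and reclaim_global_blocks() as hypothetical stand-ins for the write attempt and the two eofblocks scans:

    #include <errno.h>

    /* Hypothetical stand-ins for the write attempt and the scans. */
    extern long do_buffered_write(void *ctx);
    extern int  reclaim_quota_blocks(void *ctx);  /* nonzero if space freed */
    extern void reclaim_global_blocks(void *ctx);

    long write_with_reclaim(void *ctx)
    {
            int tried = 0;
            long ret;
    retry:
            ret = do_buffered_write(ctx);
            if (ret == -EDQUOT && !tried) {
                    /* retry only if the quota-scoped scan freed anything */
                    tried = reclaim_quota_blocks(ctx);
                    if (tried)
                            goto retry;
            } else if (ret == -ENOSPC && !tried) {
                    tried = 1;              /* at most one global retry */
                    reclaim_global_blocks(ctx);
                    goto retry;
            }
            return ret;
    }

A single flag guards both branches, so a failed retry cannot loop: whichever scan runs first consumes the one retry budget.
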
*/ - if (ret == -ENOSPC && !enospc) { + if (ret == -EDQUOT && !enospc) { + enospc = xfs_inode_free_quota_eofblocks(ip); + if (enospc) + goto write_retry; + } else if (ret == -ENOSPC && !enospc) { + struct xfs_eofblocks eofb = {0}; + enospc = 1; xfs_flush_inodes(ip->i_mount); + eofb.eof_scan_owner = ip->i_ino; /* for locking */ + eofb.eof_flags = XFS_EOF_FLAGS_SYNC; + xfs_icache_free_eofblocks(ip->i_mount, &eofb); goto write_retry; } @@ -772,7 +806,7 @@ xfs_file_fallocate( unsigned blksize_mask = (1 << inode->i_blkbits) - 1; if (offset & blksize_mask || len & blksize_mask) { - error = EINVAL; + error = -EINVAL; goto out_unlock; } @@ -781,7 +815,7 @@ xfs_file_fallocate( * in which case it is effectively a truncate operation */ if (offset + len >= i_size_read(inode)) { - error = EINVAL; + error = -EINVAL; goto out_unlock; } @@ -794,7 +828,7 @@ xfs_file_fallocate( if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > i_size_read(inode)) { new_size = offset + len; - error = -inode_newsize_ok(inode, new_size); + error = inode_newsize_ok(inode, new_size); if (error) goto out_unlock; } @@ -844,7 +878,7 @@ xfs_file_fallocate( out_unlock: xfs_iunlock(ip, XFS_IOLOCK_EXCL); - return -error; + return error; } @@ -889,7 +923,7 @@ xfs_file_release( struct inode *inode, struct file *filp) { - return -xfs_release(XFS_I(inode)); + return xfs_release(XFS_I(inode)); } STATIC int @@ -918,7 +952,7 @@ xfs_file_readdir( error = xfs_readdir(ip, ctx, bufsize); if (error) - return -error; + return error; return 0; } @@ -1184,7 +1218,7 @@ xfs_seek_data( isize = i_size_read(inode); if (start >= isize) { - error = ENXIO; + error = -ENXIO; goto out_unlock; } @@ -1206,7 +1240,7 @@ xfs_seek_data( /* No extents at given offset, must be beyond EOF */ if (nmap == 0) { - error = ENXIO; + error = -ENXIO; goto out_unlock; } @@ -1237,7 +1271,7 @@ xfs_seek_data( * we are reading after EOF if nothing in map[1]. 
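
The fallocate checks above reduce to two range predicates. A small sketch of the collapse-range validation, using plain integers in place of the inode fields (names are illustrative):

    #include <errno.h>
    #include <stdint.h>

    int check_collapse_range(uint64_t offset, uint64_t len,
                             uint64_t isize, unsigned int blkbits)
    {
            uint64_t blksize_mask = (1ULL << blkbits) - 1;

            /* both ends of the collapsed range must be block aligned */
            if ((offset & blksize_mask) || (len & blksize_mask))
                    return -EINVAL;
            /* collapsing up to or past EOF would really be a truncate */
            if (offset + len >= isize)
                    return -EINVAL;
            return 0;
    }
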
*/ if (nmap == 1) { - error = ENXIO; + error = -ENXIO; goto out_unlock; } @@ -1250,7 +1284,7 @@ xfs_seek_data( fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount; start = XFS_FSB_TO_B(mp, fsbno); if (start >= isize) { - error = ENXIO; + error = -ENXIO; goto out_unlock; } } @@ -1262,7 +1296,7 @@ out_unlock: xfs_iunlock(ip, lock); if (error) - return -error; + return error; return offset; } @@ -1282,13 +1316,13 @@ xfs_seek_hole( int error; if (XFS_FORCED_SHUTDOWN(mp)) - return -XFS_ERROR(EIO); + return -EIO; lock = xfs_ilock_data_map_shared(ip); isize = i_size_read(inode); if (start >= isize) { - error = ENXIO; + error = -ENXIO; goto out_unlock; } @@ -1307,7 +1341,7 @@ xfs_seek_hole( /* No extents at given offset, must be beyond EOF */ if (nmap == 0) { - error = ENXIO; + error = -ENXIO; goto out_unlock; } @@ -1370,7 +1404,7 @@ out_unlock: xfs_iunlock(ip, lock); if (error) - return -error; + return error; return offset; } diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 8ec81bed7992..e92730c1d3ca 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -258,7 +258,7 @@ next_ag: if (*agp == NULLAGNUMBER) return 0; - err = ENOMEM; + err = -ENOMEM; item = kmem_alloc(sizeof(*item), KM_MAYFAIL); if (!item) goto out_put_ag; @@ -268,7 +268,7 @@ next_ag: err = xfs_mru_cache_insert(mp->m_filestream, ip->i_ino, &item->mru); if (err) { - if (err == EEXIST) + if (err == -EEXIST) err = 0; goto out_free_item; } diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index d34703dbcb42..18dc721ca19f 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -255,8 +255,8 @@ typedef struct xfs_fsop_resblks { ((2 * 1024 * 1024 * 1024ULL) - XFS_MIN_LOG_BYTES) /* Used for sanity checks on superblock */ -#define XFS_MAX_DBLOCKS(s) ((xfs_drfsbno_t)(s)->sb_agcount * (s)->sb_agblocks) -#define XFS_MIN_DBLOCKS(s) ((xfs_drfsbno_t)((s)->sb_agcount - 1) * \ +#define XFS_MAX_DBLOCKS(s) ((xfs_rfsblock_t)(s)->sb_agcount * (s)->sb_agblocks) +#define XFS_MIN_DBLOCKS(s) ((xfs_rfsblock_t)((s)->sb_agcount - 1) * \ (s)->sb_agblocks + XFS_MIN_AG_BLOCKS) /* @@ -375,6 +375,9 @@ struct xfs_fs_eofblocks { #define XFS_EOF_FLAGS_GID (1 << 2) /* filter by gid */ #define XFS_EOF_FLAGS_PRID (1 << 3) /* filter by project id */ #define XFS_EOF_FLAGS_MINFILESIZE (1 << 4) /* filter by min file size */ +#define XFS_EOF_FLAGS_UNION (1 << 5) /* union filter algorithm; + * kernel only, not included in + * valid mask */ #define XFS_EOF_FLAGS_VALID \ (XFS_EOF_FLAGS_SYNC | \ XFS_EOF_FLAGS_UID | \ diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index d2295561570a..f91de1ef05e1 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -168,7 +168,7 @@ xfs_growfs_data_private( nb = in->newblocks; pct = in->imaxpct; if (nb < mp->m_sb.sb_dblocks || pct < 0 || pct > 100) - return XFS_ERROR(EINVAL); + return -EINVAL; if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb))) return error; dpct = pct - mp->m_sb.sb_imax_pct; @@ -176,7 +176,7 @@ xfs_growfs_data_private( XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), XFS_FSS_TO_BB(mp, 1), 0, NULL); if (!bp) - return EIO; + return -EIO; if (bp->b_error) { error = bp->b_error; xfs_buf_relse(bp); @@ -191,7 +191,7 @@ xfs_growfs_data_private( nagcount--; nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks; if (nb < mp->m_sb.sb_dblocks) - return XFS_ERROR(EINVAL); + return -EINVAL; } new = nb - mp->m_sb.sb_dblocks; oagcount = mp->m_sb.sb_agcount; @@ -229,7 +229,7 @@ xfs_growfs_data_private( XFS_FSS_TO_BB(mp, 1), 0, &xfs_agf_buf_ops); if (!bp) { - error = ENOMEM; + error = -ENOMEM; goto 
error0; } @@ -270,7 +270,7 @@ xfs_growfs_data_private( XFS_FSS_TO_BB(mp, 1), 0, &xfs_agfl_buf_ops); if (!bp) { - error = ENOMEM; + error = -ENOMEM; goto error0; } @@ -298,7 +298,7 @@ xfs_growfs_data_private( XFS_FSS_TO_BB(mp, 1), 0, &xfs_agi_buf_ops); if (!bp) { - error = ENOMEM; + error = -ENOMEM; goto error0; } @@ -336,7 +336,7 @@ xfs_growfs_data_private( &xfs_allocbt_buf_ops); if (!bp) { - error = ENOMEM; + error = -ENOMEM; goto error0; } @@ -365,7 +365,7 @@ xfs_growfs_data_private( BTOBB(mp->m_sb.sb_blocksize), 0, &xfs_allocbt_buf_ops); if (!bp) { - error = ENOMEM; + error = -ENOMEM; goto error0; } @@ -395,7 +395,7 @@ xfs_growfs_data_private( BTOBB(mp->m_sb.sb_blocksize), 0, &xfs_inobt_buf_ops); if (!bp) { - error = ENOMEM; + error = -ENOMEM; goto error0; } @@ -420,7 +420,7 @@ xfs_growfs_data_private( BTOBB(mp->m_sb.sb_blocksize), 0, &xfs_inobt_buf_ops); if (!bp) { - error = ENOMEM; + error = -ENOMEM; goto error0; } @@ -531,7 +531,7 @@ xfs_growfs_data_private( bp->b_ops = &xfs_sb_buf_ops; xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); } else - error = ENOMEM; + error = -ENOMEM; } /* @@ -576,17 +576,17 @@ xfs_growfs_log_private( nb = in->newblocks; if (nb < XFS_MIN_LOG_BLOCKS || nb < XFS_B_TO_FSB(mp, XFS_MIN_LOG_BYTES)) - return XFS_ERROR(EINVAL); + return -EINVAL; if (nb == mp->m_sb.sb_logblocks && in->isint == (mp->m_sb.sb_logstart != 0)) - return XFS_ERROR(EINVAL); + return -EINVAL; /* * Moving the log is hard, need new interfaces to sync * the log first, hold off all activity while moving it. * Can have shorter or longer log in the same space, * or transform internal to external log or vice versa. */ - return XFS_ERROR(ENOSYS); + return -ENOSYS; } /* @@ -604,9 +604,9 @@ xfs_growfs_data( int error; if (!capable(CAP_SYS_ADMIN)) - return XFS_ERROR(EPERM); + return -EPERM; if (!mutex_trylock(&mp->m_growlock)) - return XFS_ERROR(EWOULDBLOCK); + return -EWOULDBLOCK; error = xfs_growfs_data_private(mp, in); mutex_unlock(&mp->m_growlock); return error; @@ -620,9 +620,9 @@ xfs_growfs_log( int error; if (!capable(CAP_SYS_ADMIN)) - return XFS_ERROR(EPERM); + return -EPERM; if (!mutex_trylock(&mp->m_growlock)) - return XFS_ERROR(EWOULDBLOCK); + return -EWOULDBLOCK; error = xfs_growfs_log_private(mp, in); mutex_unlock(&mp->m_growlock); return error; @@ -674,7 +674,7 @@ xfs_reserve_blocks( /* If inval is null, report current values and return */ if (inval == (__uint64_t *)NULL) { if (!outval) - return EINVAL; + return -EINVAL; outval->resblks = mp->m_resblks; outval->resblks_avail = mp->m_resblks_avail; return 0; @@ -757,7 +757,7 @@ out: int error; error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, fdblks_delta, 0); - if (error == ENOSPC) + if (error == -ENOSPC) goto retry; } return 0; @@ -818,7 +818,7 @@ xfs_fs_goingdown( SHUTDOWN_FORCE_UMOUNT | SHUTDOWN_LOG_IO_ERROR); break; default: - return XFS_ERROR(EINVAL); + return -EINVAL; } return 0; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index c48df5f25b9f..981b2cf51985 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -33,6 +33,9 @@ #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_bmap_util.h" +#include "xfs_quota.h" +#include "xfs_dquot_item.h" +#include "xfs_dquot.h" #include <linux/kthread.h> #include <linux/freezer.h> @@ -158,7 +161,7 @@ xfs_iget_cache_hit( if (ip->i_ino != ino) { trace_xfs_iget_skip(ip); XFS_STATS_INC(xs_ig_frecycle); - error = EAGAIN; + error = -EAGAIN; goto out_error; } @@ -176,7 +179,7 @@ xfs_iget_cache_hit( if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) { trace_xfs_iget_skip(ip); 
XFS_STATS_INC(xs_ig_frecycle); - error = EAGAIN; + error = -EAGAIN; goto out_error; } @@ -184,7 +187,7 @@ xfs_iget_cache_hit( * If lookup is racing with unlink return an error immediately. */ if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) { - error = ENOENT; + error = -ENOENT; goto out_error; } @@ -206,7 +209,7 @@ xfs_iget_cache_hit( spin_unlock(&ip->i_flags_lock); rcu_read_unlock(); - error = -inode_init_always(mp->m_super, inode); + error = inode_init_always(mp->m_super, inode); if (error) { /* * Re-initializing the inode failed, and we are in deep @@ -243,7 +246,7 @@ xfs_iget_cache_hit( /* If the VFS inode is being torn down, pause and try again. */ if (!igrab(inode)) { trace_xfs_iget_skip(ip); - error = EAGAIN; + error = -EAGAIN; goto out_error; } @@ -285,7 +288,7 @@ xfs_iget_cache_miss( ip = xfs_inode_alloc(mp, ino); if (!ip) - return ENOMEM; + return -ENOMEM; error = xfs_iread(mp, tp, ip, flags); if (error) @@ -294,7 +297,7 @@ xfs_iget_cache_miss( trace_xfs_iget_miss(ip); if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { - error = ENOENT; + error = -ENOENT; goto out_destroy; } @@ -305,7 +308,7 @@ xfs_iget_cache_miss( * recurse into the file system. */ if (radix_tree_preload(GFP_NOFS)) { - error = EAGAIN; + error = -EAGAIN; goto out_destroy; } @@ -341,7 +344,7 @@ xfs_iget_cache_miss( if (unlikely(error)) { WARN_ON(error != -EEXIST); XFS_STATS_INC(xs_ig_dup); - error = EAGAIN; + error = -EAGAIN; goto out_preload_end; } spin_unlock(&pag->pag_ici_lock); @@ -408,7 +411,7 @@ xfs_iget( /* reject inode numbers outside existing AGs */ if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) - return EINVAL; + return -EINVAL; /* get the perag structure and ensure that it's inode capable */ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); @@ -445,7 +448,7 @@ again: return 0; out_error_or_again: - if (error == EAGAIN) { + if (error == -EAGAIN) { delay(1); goto again; } @@ -489,18 +492,18 @@ xfs_inode_ag_walk_grab( /* nothing to sync during shutdown */ if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return EFSCORRUPTED; + return -EFSCORRUPTED; /* If we can't grab the inode, it must on it's way to reclaim. */ if (!igrab(inode)) - return ENOENT; + return -ENOENT; /* inode is valid */ return 0; out_unlock_noent: spin_unlock(&ip->i_flags_lock); - return ENOENT; + return -ENOENT; } STATIC int @@ -583,16 +586,16 @@ restart: continue; error = execute(batch[i], flags, args); IRELE(batch[i]); - if (error == EAGAIN) { + if (error == -EAGAIN) { skipped++; continue; } - if (error && last_error != EFSCORRUPTED) + if (error && last_error != -EFSCORRUPTED) last_error = error; } /* bail out if the filesystem is corrupted. */ - if (error == EFSCORRUPTED) + if (error == -EFSCORRUPTED) break; cond_resched(); @@ -652,11 +655,11 @@ xfs_inode_ag_iterator( xfs_perag_put(pag); if (error) { last_error = error; - if (error == EFSCORRUPTED) + if (error == -EFSCORRUPTED) break; } } - return XFS_ERROR(last_error); + return last_error; } int @@ -680,11 +683,11 @@ xfs_inode_ag_iterator_tag( xfs_perag_put(pag); if (error) { last_error = error; - if (error == EFSCORRUPTED) + if (error == -EFSCORRUPTED) break; } } - return XFS_ERROR(last_error); + return last_error; } /* @@ -944,7 +947,7 @@ restart: * see the stale flag set on the inode. 
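
The AG walk above aggregates per-inode errors with two rules: -EAGAIN means "skip and recount", while -EFSCORRUPTED both sticks as the remembered error and aborts the walk. A standalone sketch of that policy, with process_one() as a hypothetical per-inode callback:

    #include <errno.h>

    #ifndef EFSCORRUPTED
    #define EFSCORRUPTED EUCLEAN   /* XFS maps EFSCORRUPTED onto EUCLEAN */
    #endif

    extern int process_one(int idx);  /* hypothetical, returns 0 or -errno */

    int walk_batch(int nr, int *skipped)
    {
            int i, error, last_error = 0;

            for (i = 0; i < nr; i++) {
                    error = process_one(i);
                    if (error == -EAGAIN) {
                            (*skipped)++;      /* revisit on a later pass */
                            continue;
                    }
                    if (error && last_error != -EFSCORRUPTED)
                            last_error = error;
                    /* bail out immediately on corruption */
                    if (error == -EFSCORRUPTED)
                            break;
            }
            return last_error;
    }
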
*/ error = xfs_iflush(ip, &bp); - if (error == EAGAIN) { + if (error == -EAGAIN) { xfs_iunlock(ip, XFS_ILOCK_EXCL); /* backoff longer than in xfs_ifree_cluster */ delay(2); @@ -997,7 +1000,7 @@ out: xfs_iflags_clear(ip, XFS_IRECLAIM); xfs_iunlock(ip, XFS_ILOCK_EXCL); /* - * We could return EAGAIN here to make reclaim rescan the inode tree in + * We could return -EAGAIN here to make reclaim rescan the inode tree in * a short while. However, this just burns CPU time scanning the tree * waiting for IO to complete and the reclaim work never goes back to * the idle state. Instead, return 0 to let the next scheduled @@ -1100,7 +1103,7 @@ restart: if (!batch[i]) continue; error = xfs_reclaim_inode(batch[i], pag, flags); - if (error && last_error != EFSCORRUPTED) + if (error && last_error != -EFSCORRUPTED) last_error = error; } @@ -1129,7 +1132,7 @@ restart: trylock = 0; goto restart; } - return XFS_ERROR(last_error); + return last_error; } int @@ -1203,6 +1206,30 @@ xfs_inode_match_id( return 1; } +/* + * A union-based inode filtering algorithm. Process the inode if any of the + * criteria match. This is for global/internal scans only. + */ +STATIC int +xfs_inode_match_id_union( + struct xfs_inode *ip, + struct xfs_eofblocks *eofb) +{ + if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) && + uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid)) + return 1; + + if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) && + gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid)) + return 1; + + if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) && + xfs_get_projid(ip) == eofb->eof_prid) + return 1; + + return 0; +} + STATIC int xfs_inode_free_eofblocks( struct xfs_inode *ip, @@ -1211,6 +1238,10 @@ xfs_inode_free_eofblocks( { int ret; struct xfs_eofblocks *eofb = args; + bool need_iolock = true; + int match; + + ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0)); if (!xfs_can_free_eofblocks(ip, false)) { /* inode could be preallocated or append-only */ @@ -1228,19 +1259,31 @@ xfs_inode_free_eofblocks( return 0; if (eofb) { - if (!xfs_inode_match_id(ip, eofb)) + if (eofb->eof_flags & XFS_EOF_FLAGS_UNION) + match = xfs_inode_match_id_union(ip, eofb); + else + match = xfs_inode_match_id(ip, eofb); + if (!match) return 0; /* skip the inode if the file size is too small */ if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && XFS_ISIZE(ip) < eofb->eof_min_file_size) return 0; + + /* + * A scan owner implies we already hold the iolock. Skip it in + * xfs_free_eofblocks() to avoid deadlock. This also eliminates + * the possibility of EAGAIN being returned. + */ + if (eofb->eof_scan_owner == ip->i_ino) + need_iolock = false; } - ret = xfs_free_eofblocks(ip->i_mount, ip, true); + ret = xfs_free_eofblocks(ip->i_mount, ip, need_iolock); /* don't revisit the inode if we're not waiting */ - if (ret == EAGAIN && !(flags & SYNC_WAIT)) + if (ret == -EAGAIN && !(flags & SYNC_WAIT)) ret = 0; return ret; @@ -1260,6 +1303,55 @@ xfs_icache_free_eofblocks( eofb, XFS_ICI_EOFBLOCKS_TAG); } +/* + * Run eofblocks scans on the quotas applicable to the inode. For inodes with + * multiple quotas, we don't know exactly which quota caused an allocation + * failure. We make a best effort by including each quota under low free space + * conditions (less than 1% free space) in the scan. + */ +int +xfs_inode_free_quota_eofblocks( + struct xfs_inode *ip) +{ + int scan = 0; + struct xfs_eofblocks eofb = {0}; + struct xfs_dquot *dq; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + + /* + * Set the scan owner to avoid a potential livelock. 
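
xfs_inode_match_id_union() above is the OR-counterpart of the existing AND-style xfs_inode_match_id(). A simplified model of the two semantics, using plain unsigned IDs instead of kuid_t/kgid_t and project IDs:

    #include <stdbool.h>

    #define MATCH_UID (1u << 1)
    #define MATCH_GID (1u << 2)

    struct filter { unsigned int flags, uid, gid; };

    /* intersection: every requested criterion must match */
    static bool match_all(const struct filter *f, unsigned int uid,
                          unsigned int gid)
    {
            if ((f->flags & MATCH_UID) && uid != f->uid)
                    return false;
            if ((f->flags & MATCH_GID) && gid != f->gid)
                    return false;
            return true;
    }

    /* union: any one matching criterion is enough, so a single scan
     * can cover the user and group quotas of one inode at once */
    static bool match_any(const struct filter *f, unsigned int uid,
                          unsigned int gid)
    {
            if ((f->flags & MATCH_UID) && uid == f->uid)
                    return true;
            if ((f->flags & MATCH_GID) && gid == f->gid)
                    return true;
            return false;
    }
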
Otherwise, the scan + * can repeatedly trylock on the inode we're currently processing. We + * run a sync scan to increase effectiveness and use the union filter to + * cover all applicable quotas in a single scan. + */ + eofb.eof_scan_owner = ip->i_ino; + eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC; + + if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) { + dq = xfs_inode_dquot(ip, XFS_DQ_USER); + if (dq && xfs_dquot_lowsp(dq)) { + eofb.eof_uid = VFS_I(ip)->i_uid; + eofb.eof_flags |= XFS_EOF_FLAGS_UID; + scan = 1; + } + } + + if (XFS_IS_GQUOTA_ENFORCED(ip->i_mount)) { + dq = xfs_inode_dquot(ip, XFS_DQ_GROUP); + if (dq && xfs_dquot_lowsp(dq)) { + eofb.eof_gid = VFS_I(ip)->i_gid; + eofb.eof_flags |= XFS_EOF_FLAGS_GID; + scan = 1; + } + } + + if (scan) + xfs_icache_free_eofblocks(ip->i_mount, &eofb); + + return scan; +} + void xfs_inode_set_eofblocks_tag( xfs_inode_t *ip) diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 9cf017b899be..46748b86b12f 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -27,6 +27,7 @@ struct xfs_eofblocks { kgid_t eof_gid; prid_t eof_prid; __u64 eof_min_file_size; + xfs_ino_t eof_scan_owner; }; #define SYNC_WAIT 0x0001 /* wait for i/o to complete */ @@ -57,6 +58,7 @@ void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *); +int xfs_inode_free_quota_eofblocks(struct xfs_inode *ip); void xfs_eofblocks_worker(struct work_struct *); int xfs_inode_ag_iterator(struct xfs_mount *mp, @@ -72,31 +74,32 @@ xfs_fs_eofblocks_from_user( struct xfs_eofblocks *dst) { if (src->eof_version != XFS_EOFBLOCKS_VERSION) - return EINVAL; + return -EINVAL; if (src->eof_flags & ~XFS_EOF_FLAGS_VALID) - return EINVAL; + return -EINVAL; if (memchr_inv(&src->pad32, 0, sizeof(src->pad32)) || memchr_inv(src->pad64, 0, sizeof(src->pad64))) - return EINVAL; + return -EINVAL; dst->eof_flags = src->eof_flags; dst->eof_prid = src->eof_prid; dst->eof_min_file_size = src->eof_min_file_size; + dst->eof_scan_owner = NULLFSINO; dst->eof_uid = INVALID_UID; if (src->eof_flags & XFS_EOF_FLAGS_UID) { dst->eof_uid = make_kuid(current_user_ns(), src->eof_uid); if (!uid_valid(dst->eof_uid)) - return EINVAL; + return -EINVAL; } dst->eof_gid = INVALID_GID; if (src->eof_flags & XFS_EOF_FLAGS_GID) { dst->eof_gid = make_kgid(current_user_ns(), src->eof_gid); if (!gid_valid(dst->eof_gid)) - return EINVAL; + return -EINVAL; } return 0; } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index a6115fe1ac94..fea3c92fb3f0 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -583,7 +583,7 @@ xfs_lookup( trace_xfs_lookup(dp, name); if (XFS_FORCED_SHUTDOWN(dp->i_mount)) - return XFS_ERROR(EIO); + return -EIO; lock_mode = xfs_ilock_data_map_shared(dp); error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name); @@ -893,7 +893,7 @@ xfs_dir_ialloc( } if (!ialloc_context && !ip) { *ipp = NULL; - return XFS_ERROR(ENOSPC); + return -ENOSPC; } /* @@ -1088,7 +1088,7 @@ xfs_create( trace_xfs_create(dp, name); if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); + return -EIO; prid = xfs_get_initial_prid(dp); @@ -1125,12 +1125,12 @@ xfs_create( */ tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; error = xfs_trans_reserve(tp, &tres, resblks, 0); - if (error == ENOSPC) { + if (error == -ENOSPC) { /* flush outstanding delalloc blocks and retry */ xfs_flush_inodes(mp); error = xfs_trans_reserve(tp, &tres, resblks, 0); } - if (error 
== ENOSPC) { + if (error == -ENOSPC) { /* No space at all so try a "no-allocation" reservation */ resblks = 0; error = xfs_trans_reserve(tp, &tres, 0, 0); @@ -1165,7 +1165,7 @@ xfs_create( error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, prid, resblks > 0, &ip, &committed); if (error) { - if (error == ENOSPC) + if (error == -ENOSPC) goto out_trans_cancel; goto out_trans_abort; } @@ -1184,7 +1184,7 @@ xfs_create( &first_block, &free_list, resblks ? resblks - XFS_IALLOC_SPACE_RES(mp) : 0); if (error) { - ASSERT(error != ENOSPC); + ASSERT(error != -ENOSPC); goto out_trans_abort; } xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); @@ -1274,7 +1274,7 @@ xfs_create_tmpfile( uint resblks; if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); + return -EIO; prid = xfs_get_initial_prid(dp); @@ -1293,7 +1293,7 @@ xfs_create_tmpfile( tres = &M_RES(mp)->tr_create_tmpfile; error = xfs_trans_reserve(tp, tres, resblks, 0); - if (error == ENOSPC) { + if (error == -ENOSPC) { /* No space at all so try a "no-allocation" reservation */ resblks = 0; error = xfs_trans_reserve(tp, tres, 0, 0); @@ -1311,7 +1311,7 @@ xfs_create_tmpfile( error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, prid, resblks > 0, &ip, NULL); if (error) { - if (error == ENOSPC) + if (error == -ENOSPC) goto out_trans_cancel; goto out_trans_abort; } @@ -1382,7 +1382,7 @@ xfs_link( ASSERT(!S_ISDIR(sip->i_d.di_mode)); if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); + return -EIO; error = xfs_qm_dqattach(sip, 0); if (error) @@ -1396,7 +1396,7 @@ xfs_link( cancel_flags = XFS_TRANS_RELEASE_LOG_RES; resblks = XFS_LINK_SPACE_RES(mp, target_name->len); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0); - if (error == ENOSPC) { + if (error == -ENOSPC) { resblks = 0; error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0); } @@ -1417,7 +1417,7 @@ xfs_link( */ if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && (xfs_get_projid(tdp) != xfs_get_projid(sip)))) { - error = XFS_ERROR(EXDEV); + error = -EXDEV; goto error_return; } @@ -1635,8 +1635,8 @@ xfs_release( truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED); if (truncated) { xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE); - if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) { - error = -filemap_flush(VFS_I(ip)->i_mapping); + if (ip->i_delayed_blks > 0) { + error = filemap_flush(VFS_I(ip)->i_mapping); if (error) return error; } @@ -1673,7 +1673,7 @@ xfs_release( return 0; error = xfs_free_eofblocks(mp, ip, true); - if (error && error != EAGAIN) + if (error && error != -EAGAIN) return error; /* delalloc blocks after truncation means it really is dirty */ @@ -1772,7 +1772,7 @@ xfs_inactive_ifree( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree, XFS_IFREE_SPACE_RES(mp), 0); if (error) { - if (error == ENOSPC) { + if (error == -ENOSPC) { xfs_warn_ratelimited(mp, "Failed to remove inode(s) from unlinked list. 
" "Please free space, unmount and run xfs_repair."); @@ -2219,7 +2219,7 @@ xfs_ifree_cluster( XBF_UNMAPPED); if (!bp) - return ENOMEM; + return -ENOMEM; /* * This buffer may not have been correctly initialised as we @@ -2491,7 +2491,7 @@ xfs_remove( trace_xfs_remove(dp, name); if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); + return -EIO; error = xfs_qm_dqattach(dp, 0); if (error) @@ -2521,12 +2521,12 @@ xfs_remove( */ resblks = XFS_REMOVE_SPACE_RES(mp); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, resblks, 0); - if (error == ENOSPC) { + if (error == -ENOSPC) { resblks = 0; error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, 0, 0); } if (error) { - ASSERT(error != ENOSPC); + ASSERT(error != -ENOSPC); cancel_flags = 0; goto out_trans_cancel; } @@ -2543,11 +2543,11 @@ xfs_remove( if (is_dir) { ASSERT(ip->i_d.di_nlink >= 2); if (ip->i_d.di_nlink != 2) { - error = XFS_ERROR(ENOTEMPTY); + error = -ENOTEMPTY; goto out_trans_cancel; } if (!xfs_dir_isempty(ip)) { - error = XFS_ERROR(ENOTEMPTY); + error = -ENOTEMPTY; goto out_trans_cancel; } @@ -2582,7 +2582,7 @@ xfs_remove( error = xfs_dir_removename(tp, dp, name, ip->i_ino, &first_block, &free_list, resblks); if (error) { - ASSERT(error != ENOENT); + ASSERT(error != -ENOENT); goto out_bmap_cancel; } @@ -2702,7 +2702,7 @@ xfs_rename( cancel_flags = XFS_TRANS_RELEASE_LOG_RES; spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0); - if (error == ENOSPC) { + if (error == -ENOSPC) { spaceres = 0; error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0); } @@ -2747,7 +2747,7 @@ xfs_rename( */ if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) { - error = XFS_ERROR(EXDEV); + error = -EXDEV; goto error_return; } @@ -2770,7 +2770,7 @@ xfs_rename( error = xfs_dir_createname(tp, target_dp, target_name, src_ip->i_ino, &first_block, &free_list, spaceres); - if (error == ENOSPC) + if (error == -ENOSPC) goto error_return; if (error) goto abort_return; @@ -2795,7 +2795,7 @@ xfs_rename( */ if (!(xfs_dir_isempty(target_ip)) || (target_ip->i_d.di_nlink > 2)) { - error = XFS_ERROR(EEXIST); + error = -EEXIST; goto error_return; } } @@ -2847,7 +2847,7 @@ xfs_rename( error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot, target_dp->i_ino, &first_block, &free_list, spaceres); - ASSERT(error != EEXIST); + ASSERT(error != -EEXIST); if (error) goto abort_return; } @@ -3055,7 +3055,7 @@ cluster_corrupt_out: if (bp->b_iodone) { XFS_BUF_UNDONE(bp); xfs_buf_stale(bp); - xfs_buf_ioerror(bp, EIO); + xfs_buf_ioerror(bp, -EIO); xfs_buf_ioend(bp, 0); } else { xfs_buf_stale(bp); @@ -3069,7 +3069,7 @@ cluster_corrupt_out: xfs_iflush_abort(iq, false); kmem_free(ilist); xfs_perag_put(pag); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } /* @@ -3124,7 +3124,7 @@ xfs_iflush( * as we wait for an empty AIL as part of the unmount process. 
*/ if (XFS_FORCED_SHUTDOWN(mp)) { - error = XFS_ERROR(EIO); + error = -EIO; goto abort_out; } @@ -3167,7 +3167,7 @@ corrupt_out: xfs_buf_relse(bp); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); cluster_corrupt_out: - error = XFS_ERROR(EFSCORRUPTED); + error = -EFSCORRUPTED; abort_out: /* * Unlocks the flush lock @@ -3331,5 +3331,5 @@ xfs_iflush_int( return 0; corrupt_out: - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index f72bffa67266..c10e3fadd9af 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -398,4 +398,14 @@ do { \ extern struct kmem_zone *xfs_inode_zone; +/* + * Flags for read/write calls + */ +#define XFS_IO_ISDIRECT 0x00001 /* bypass page cache */ +#define XFS_IO_INVIS 0x00002 /* don't update inode timestamps */ + +#define XFS_IO_FLAGS \ + { XFS_IO_ISDIRECT, "DIRECT" }, \ + { XFS_IO_INVIS, "INVIS"} + #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index a640137b3573..de5a7be36e60 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -788,5 +788,5 @@ xfs_inode_item_format_convert( in_f->ilf_boffset = in_f64->ilf_boffset; return 0; } - return EFSCORRUPTED; + return -EFSCORRUPTED; } diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 8bc1bbce7451..3799695b9249 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -207,7 +207,7 @@ xfs_open_by_handle( struct path path; if (!capable(CAP_SYS_ADMIN)) - return -XFS_ERROR(EPERM); + return -EPERM; dentry = xfs_handlereq_to_dentry(parfilp, hreq); if (IS_ERR(dentry)) @@ -216,7 +216,7 @@ xfs_open_by_handle( /* Restrict xfs_open_by_handle to directories & regular files. */ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) { - error = -XFS_ERROR(EPERM); + error = -EPERM; goto out_dput; } @@ -228,18 +228,18 @@ xfs_open_by_handle( fmode = OPEN_FMODE(permflag); if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) && (fmode & FMODE_WRITE) && IS_APPEND(inode)) { - error = -XFS_ERROR(EPERM); + error = -EPERM; goto out_dput; } if ((fmode & FMODE_WRITE) && IS_IMMUTABLE(inode)) { - error = -XFS_ERROR(EACCES); + error = -EACCES; goto out_dput; } /* Can't write directories. */ if (S_ISDIR(inode->i_mode) && (fmode & FMODE_WRITE)) { - error = -XFS_ERROR(EISDIR); + error = -EISDIR; goto out_dput; } @@ -282,7 +282,7 @@ xfs_readlink_by_handle( int error; if (!capable(CAP_SYS_ADMIN)) - return -XFS_ERROR(EPERM); + return -EPERM; dentry = xfs_handlereq_to_dentry(parfilp, hreq); if (IS_ERR(dentry)) @@ -290,22 +290,22 @@ xfs_readlink_by_handle( /* Restrict this handle operation to symlinks only. 
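
The bulk of the hunks in this patch, like those above, implement one mechanical rule: internal functions now return negative errnos directly, XFS_ERROR() is gone, and the extra negation at the VFS boundary is dropped. A toy before/after illustration (not actual XFS code):

    #include <errno.h>

    /* old convention: positive errnos inside XFS, negated at the edge */
    int old_internal(void)  { return EINVAL; }
    int old_vfs_entry(void) { return -old_internal(); }

    /* new convention: negative errnos everywhere; values pass through
     * the boundary unchanged and a missed negation cannot flip a sign */
    int new_internal(void)  { return -EINVAL; }
    int new_vfs_entry(void) { return new_internal(); }
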
*/ if (!S_ISLNK(dentry->d_inode->i_mode)) { - error = -XFS_ERROR(EINVAL); + error = -EINVAL; goto out_dput; } if (copy_from_user(&olen, hreq->ohandlen, sizeof(__u32))) { - error = -XFS_ERROR(EFAULT); + error = -EFAULT; goto out_dput; } link = kmalloc(MAXPATHLEN+1, GFP_KERNEL); if (!link) { - error = -XFS_ERROR(ENOMEM); + error = -ENOMEM; goto out_dput; } - error = -xfs_readlink(XFS_I(dentry->d_inode), link); + error = xfs_readlink(XFS_I(dentry->d_inode), link); if (error) goto out_kfree; error = readlink_copy(hreq->ohandle, olen, link); @@ -330,10 +330,10 @@ xfs_set_dmattrs( int error; if (!capable(CAP_SYS_ADMIN)) - return XFS_ERROR(EPERM); + return -EPERM; if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); + return -EIO; tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); @@ -364,9 +364,9 @@ xfs_fssetdm_by_handle( struct dentry *dentry; if (!capable(CAP_MKNOD)) - return -XFS_ERROR(EPERM); + return -EPERM; if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t))) - return -XFS_ERROR(EFAULT); + return -EFAULT; error = mnt_want_write_file(parfilp); if (error) @@ -379,16 +379,16 @@ xfs_fssetdm_by_handle( } if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) { - error = -XFS_ERROR(EPERM); + error = -EPERM; goto out; } if (copy_from_user(&fsd, dmhreq.data, sizeof(fsd))) { - error = -XFS_ERROR(EFAULT); + error = -EFAULT; goto out; } - error = -xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask, + error = xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask, fsd.fsd_dmstate); out: @@ -409,18 +409,18 @@ xfs_attrlist_by_handle( char *kbuf; if (!capable(CAP_SYS_ADMIN)) - return -XFS_ERROR(EPERM); + return -EPERM; if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t))) - return -XFS_ERROR(EFAULT); + return -EFAULT; if (al_hreq.buflen < sizeof(struct attrlist) || al_hreq.buflen > XATTR_LIST_MAX) - return -XFS_ERROR(EINVAL); + return -EINVAL; /* * Reject flags, only allow namespaces. 
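
The handle ioctls above validate user-supplied arguments before allocating anything: bound the buffer length, then whitelist the flag bits. A compilable sketch with hypothetical limits standing in for sizeof(struct attrlist), XATTR_LIST_MAX and the ATTR_ROOT|ATTR_SECURE mask:

    #include <errno.h>

    #define NS_FLAGS_MASK 0x3u          /* stand-in for the namespace bits */

    int check_attrlist_args(unsigned int buflen, unsigned int flags,
                            unsigned int min_len, unsigned int max_len)
    {
            if (buflen < min_len || buflen > max_len)
                    return -EINVAL;
            if (flags & ~NS_FLAGS_MASK) /* reject flags, allow namespaces */
                    return -EINVAL;
            return 0;
    }
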
*/ if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE)) - return -XFS_ERROR(EINVAL); + return -EINVAL; dentry = xfs_handlereq_to_dentry(parfilp, &al_hreq.hreq); if (IS_ERR(dentry)) @@ -431,7 +431,7 @@ xfs_attrlist_by_handle( goto out_dput; cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; - error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen, + error = xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen, al_hreq.flags, cursor); if (error) goto out_kfree; @@ -455,20 +455,20 @@ xfs_attrmulti_attr_get( __uint32_t flags) { unsigned char *kbuf; - int error = EFAULT; + int error = -EFAULT; if (*len > XATTR_SIZE_MAX) - return EINVAL; + return -EINVAL; kbuf = kmem_zalloc_large(*len, KM_SLEEP); if (!kbuf) - return ENOMEM; + return -ENOMEM; error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags); if (error) goto out_kfree; if (copy_to_user(ubuf, kbuf, *len)) - error = EFAULT; + error = -EFAULT; out_kfree: kmem_free(kbuf); @@ -484,20 +484,17 @@ xfs_attrmulti_attr_set( __uint32_t flags) { unsigned char *kbuf; - int error = EFAULT; if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - return EPERM; + return -EPERM; if (len > XATTR_SIZE_MAX) - return EINVAL; + return -EINVAL; kbuf = memdup_user(ubuf, len); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); - error = xfs_attr_set(XFS_I(inode), name, kbuf, len, flags); - - return error; + return xfs_attr_set(XFS_I(inode), name, kbuf, len, flags); } int @@ -507,7 +504,7 @@ xfs_attrmulti_attr_remove( __uint32_t flags) { if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - return EPERM; + return -EPERM; return xfs_attr_remove(XFS_I(inode), name, flags); } @@ -524,9 +521,9 @@ xfs_attrmulti_by_handle( unsigned char *attr_name; if (!capable(CAP_SYS_ADMIN)) - return -XFS_ERROR(EPERM); + return -EPERM; if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t))) - return -XFS_ERROR(EFAULT); + return -EFAULT; /* overflow check */ if (am_hreq.opcount >= INT_MAX / sizeof(xfs_attr_multiop_t)) @@ -536,18 +533,18 @@ xfs_attrmulti_by_handle( if (IS_ERR(dentry)) return PTR_ERR(dentry); - error = E2BIG; + error = -E2BIG; size = am_hreq.opcount * sizeof(xfs_attr_multiop_t); if (!size || size > 16 * PAGE_SIZE) goto out_dput; ops = memdup_user(am_hreq.ops, size); if (IS_ERR(ops)) { - error = -PTR_ERR(ops); + error = PTR_ERR(ops); goto out_dput; } - error = ENOMEM; + error = -ENOMEM; attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL); if (!attr_name) goto out_kfree_ops; @@ -557,7 +554,7 @@ xfs_attrmulti_by_handle( ops[i].am_error = strncpy_from_user((char *)attr_name, ops[i].am_attrname, MAXNAMELEN); if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) - error = ERANGE; + error = -ERANGE; if (ops[i].am_error < 0) break; @@ -588,19 +585,19 @@ xfs_attrmulti_by_handle( mnt_drop_write_file(parfilp); break; default: - ops[i].am_error = EINVAL; + ops[i].am_error = -EINVAL; } } if (copy_to_user(am_hreq.ops, ops, size)) - error = XFS_ERROR(EFAULT); + error = -EFAULT; kfree(attr_name); out_kfree_ops: kfree(ops); out_dput: dput(dentry); - return -error; + return error; } int @@ -625,16 +622,16 @@ xfs_ioc_space( */ if (!xfs_sb_version_hasextflgbit(&ip->i_mount->m_sb) && !capable(CAP_SYS_ADMIN)) - return -XFS_ERROR(EPERM); + return -EPERM; if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) - return -XFS_ERROR(EPERM); + return -EPERM; if (!(filp->f_mode & FMODE_WRITE)) - return -XFS_ERROR(EBADF); + return -EBADF; if (!S_ISREG(inode->i_mode)) - return -XFS_ERROR(EINVAL); + return -EINVAL; error = mnt_want_write_file(filp); if (error) @@ -652,7 +649,7 @@ xfs_ioc_space( bf->l_start 
+= XFS_ISIZE(ip); break; default: - error = XFS_ERROR(EINVAL); + error = -EINVAL; goto out_unlock; } @@ -669,7 +666,7 @@ xfs_ioc_space( case XFS_IOC_UNRESVSP: case XFS_IOC_UNRESVSP64: if (bf->l_len <= 0) { - error = XFS_ERROR(EINVAL); + error = -EINVAL; goto out_unlock; } break; @@ -682,7 +679,7 @@ xfs_ioc_space( bf->l_start > mp->m_super->s_maxbytes || bf->l_start + bf->l_len < 0 || bf->l_start + bf->l_len >= mp->m_super->s_maxbytes) { - error = XFS_ERROR(EINVAL); + error = -EINVAL; goto out_unlock; } @@ -723,7 +720,7 @@ xfs_ioc_space( break; default: ASSERT(0); - error = XFS_ERROR(EINVAL); + error = -EINVAL; } if (error) @@ -739,7 +736,7 @@ xfs_ioc_space( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - if (!(ioflags & IO_INVIS)) { + if (!(ioflags & XFS_IO_INVIS)) { ip->i_d.di_mode &= ~S_ISUID; if (ip->i_d.di_mode & S_IXGRP) ip->i_d.di_mode &= ~S_ISGID; @@ -759,7 +756,7 @@ xfs_ioc_space( out_unlock: xfs_iunlock(ip, XFS_IOLOCK_EXCL); mnt_drop_write_file(filp); - return -error; + return error; } STATIC int @@ -781,41 +778,41 @@ xfs_ioc_bulkstat( return -EPERM; if (XFS_FORCED_SHUTDOWN(mp)) - return -XFS_ERROR(EIO); + return -EIO; if (copy_from_user(&bulkreq, arg, sizeof(xfs_fsop_bulkreq_t))) - return -XFS_ERROR(EFAULT); + return -EFAULT; if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64))) - return -XFS_ERROR(EFAULT); + return -EFAULT; if ((count = bulkreq.icount) <= 0) - return -XFS_ERROR(EINVAL); + return -EINVAL; if (bulkreq.ubuffer == NULL) - return -XFS_ERROR(EINVAL); + return -EINVAL; if (cmd == XFS_IOC_FSINUMBERS) error = xfs_inumbers(mp, &inlast, &count, bulkreq.ubuffer, xfs_inumbers_fmt); else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE) - error = xfs_bulkstat_single(mp, &inlast, - bulkreq.ubuffer, &done); + error = xfs_bulkstat_one(mp, inlast, bulkreq.ubuffer, + sizeof(xfs_bstat_t), NULL, &done); else /* XFS_IOC_FSBULKSTAT */ error = xfs_bulkstat(mp, &inlast, &count, xfs_bulkstat_one, sizeof(xfs_bstat_t), bulkreq.ubuffer, &done); if (error) - return -error; + return error; if (bulkreq.ocount != NULL) { if (copy_to_user(bulkreq.lastip, &inlast, sizeof(xfs_ino_t))) - return -XFS_ERROR(EFAULT); + return -EFAULT; if (copy_to_user(bulkreq.ocount, &count, sizeof(count))) - return -XFS_ERROR(EFAULT); + return -EFAULT; } return 0; @@ -831,7 +828,7 @@ xfs_ioc_fsgeometry_v1( error = xfs_fs_geometry(mp, &fsgeo, 3); if (error) - return -error; + return error; /* * Caller should have passed an argument of type @@ -839,7 +836,7 @@ xfs_ioc_fsgeometry_v1( * xfs_fsop_geom_t that xfs_fs_geometry() fills in. */ if (copy_to_user(arg, &fsgeo, sizeof(xfs_fsop_geom_v1_t))) - return -XFS_ERROR(EFAULT); + return -EFAULT; return 0; } @@ -853,10 +850,10 @@ xfs_ioc_fsgeometry( error = xfs_fs_geometry(mp, &fsgeo, 4); if (error) - return -error; + return error; if (copy_to_user(arg, &fsgeo, sizeof(fsgeo))) - return -XFS_ERROR(EFAULT); + return -EFAULT; return 0; } @@ -1041,16 +1038,16 @@ xfs_ioctl_setattr( trace_xfs_ioctl_setattr(ip); if (mp->m_flags & XFS_MOUNT_RDONLY) - return XFS_ERROR(EROFS); + return -EROFS; if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); + return -EIO; /* * Disallow 32bit project ids when projid32bit feature is not enabled. */ if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1) && !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb)) - return XFS_ERROR(EINVAL); + return -EINVAL; /* * If disk quotas is on, we make sure that the dquots do exist on disk, @@ -1088,7 +1085,7 @@ xfs_ioctl_setattr( * CAP_FSETID capability is applicable. 
*/ if (!inode_owner_or_capable(VFS_I(ip))) { - code = XFS_ERROR(EPERM); + code = -EPERM; goto error_return; } @@ -1099,7 +1096,7 @@ xfs_ioctl_setattr( */ if (mask & FSX_PROJID) { if (current_user_ns() != &init_user_ns) { - code = XFS_ERROR(EINVAL); + code = -EINVAL; goto error_return; } @@ -1122,7 +1119,7 @@ xfs_ioctl_setattr( if (ip->i_d.di_nextents && ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != fa->fsx_extsize)) { - code = XFS_ERROR(EINVAL); /* EFBIG? */ + code = -EINVAL; /* EFBIG? */ goto error_return; } @@ -1141,7 +1138,7 @@ xfs_ioctl_setattr( extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize); if (extsize_fsb > MAXEXTLEN) { - code = XFS_ERROR(EINVAL); + code = -EINVAL; goto error_return; } @@ -1153,13 +1150,13 @@ xfs_ioctl_setattr( } else { size = mp->m_sb.sb_blocksize; if (extsize_fsb > mp->m_sb.sb_agblocks / 2) { - code = XFS_ERROR(EINVAL); + code = -EINVAL; goto error_return; } } if (fa->fsx_extsize % size) { - code = XFS_ERROR(EINVAL); + code = -EINVAL; goto error_return; } } @@ -1173,7 +1170,7 @@ xfs_ioctl_setattr( if ((ip->i_d.di_nextents || ip->i_delayed_blks) && (XFS_IS_REALTIME_INODE(ip)) != (fa->fsx_xflags & XFS_XFLAG_REALTIME)) { - code = XFS_ERROR(EINVAL); /* EFBIG? */ + code = -EINVAL; /* EFBIG? */ goto error_return; } @@ -1184,7 +1181,7 @@ xfs_ioctl_setattr( if ((mp->m_sb.sb_rblocks == 0) || (mp->m_sb.sb_rextsize == 0) || (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) { - code = XFS_ERROR(EINVAL); + code = -EINVAL; goto error_return; } } @@ -1198,7 +1195,7 @@ xfs_ioctl_setattr( (fa->fsx_xflags & (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) && !capable(CAP_LINUX_IMMUTABLE)) { - code = XFS_ERROR(EPERM); + code = -EPERM; goto error_return; } } @@ -1301,7 +1298,7 @@ xfs_ioc_fssetxattr( return error; error = xfs_ioctl_setattr(ip, &fa, mask); mnt_drop_write_file(filp); - return -error; + return error; } STATIC int @@ -1346,7 +1343,7 @@ xfs_ioc_setxflags( return error; error = xfs_ioctl_setattr(ip, &fa, mask); mnt_drop_write_file(filp); - return -error; + return error; } STATIC int @@ -1356,7 +1353,7 @@ xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full) /* copy only getbmap portion (not getbmapx) */ if (copy_to_user(base, bmv, sizeof(struct getbmap))) - return XFS_ERROR(EFAULT); + return -EFAULT; *ap += sizeof(struct getbmap); return 0; @@ -1373,23 +1370,23 @@ xfs_ioc_getbmap( int error; if (copy_from_user(&bmx, arg, sizeof(struct getbmapx))) - return -XFS_ERROR(EFAULT); + return -EFAULT; if (bmx.bmv_count < 2) - return -XFS_ERROR(EINVAL); + return -EINVAL; bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? 
BMV_IF_ATTRFORK : 0); - if (ioflags & IO_INVIS) + if (ioflags & XFS_IO_INVIS) bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ; error = xfs_getbmap(ip, &bmx, xfs_getbmap_format, (struct getbmap *)arg+1); if (error) - return -error; + return error; /* copy back header - only size of getbmap */ if (copy_to_user(arg, &bmx, sizeof(struct getbmap))) - return -XFS_ERROR(EFAULT); + return -EFAULT; return 0; } @@ -1399,7 +1396,7 @@ xfs_getbmapx_format(void **ap, struct getbmapx *bmv, int *full) struct getbmapx __user *base = *ap; if (copy_to_user(base, bmv, sizeof(struct getbmapx))) - return XFS_ERROR(EFAULT); + return -EFAULT; *ap += sizeof(struct getbmapx); return 0; @@ -1414,22 +1411,22 @@ xfs_ioc_getbmapx( int error; if (copy_from_user(&bmx, arg, sizeof(bmx))) - return -XFS_ERROR(EFAULT); + return -EFAULT; if (bmx.bmv_count < 2) - return -XFS_ERROR(EINVAL); + return -EINVAL; if (bmx.bmv_iflags & (~BMV_IF_VALID)) - return -XFS_ERROR(EINVAL); + return -EINVAL; error = xfs_getbmap(ip, &bmx, xfs_getbmapx_format, (struct getbmapx *)arg+1); if (error) - return -error; + return error; /* copy back header */ if (copy_to_user(arg, &bmx, sizeof(struct getbmapx))) - return -XFS_ERROR(EFAULT); + return -EFAULT; return 0; } @@ -1445,33 +1442,33 @@ xfs_ioc_swapext( /* Pull information for the target fd */ f = fdget((int)sxp->sx_fdtarget); if (!f.file) { - error = XFS_ERROR(EINVAL); + error = -EINVAL; goto out; } if (!(f.file->f_mode & FMODE_WRITE) || !(f.file->f_mode & FMODE_READ) || (f.file->f_flags & O_APPEND)) { - error = XFS_ERROR(EBADF); + error = -EBADF; goto out_put_file; } tmp = fdget((int)sxp->sx_fdtmp); if (!tmp.file) { - error = XFS_ERROR(EINVAL); + error = -EINVAL; goto out_put_file; } if (!(tmp.file->f_mode & FMODE_WRITE) || !(tmp.file->f_mode & FMODE_READ) || (tmp.file->f_flags & O_APPEND)) { - error = XFS_ERROR(EBADF); + error = -EBADF; goto out_put_tmp_file; } if (IS_SWAPFILE(file_inode(f.file)) || IS_SWAPFILE(file_inode(tmp.file))) { - error = XFS_ERROR(EINVAL); + error = -EINVAL; goto out_put_tmp_file; } @@ -1479,17 +1476,17 @@ xfs_ioc_swapext( tip = XFS_I(file_inode(tmp.file)); if (ip->i_mount != tip->i_mount) { - error = XFS_ERROR(EINVAL); + error = -EINVAL; goto out_put_tmp_file; } if (ip->i_ino == tip->i_ino) { - error = XFS_ERROR(EINVAL); + error = -EINVAL; goto out_put_tmp_file; } if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { - error = XFS_ERROR(EIO); + error = -EIO; goto out_put_tmp_file; } @@ -1523,7 +1520,7 @@ xfs_file_ioctl( int error; if (filp->f_mode & FMODE_NOCMTIME) - ioflags |= IO_INVIS; + ioflags |= XFS_IO_INVIS; trace_xfs_file_ioctl(ip); @@ -1542,7 +1539,7 @@ xfs_file_ioctl( xfs_flock64_t bf; if (copy_from_user(&bf, arg, sizeof(bf))) - return -XFS_ERROR(EFAULT); + return -EFAULT; return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf); } case XFS_IOC_DIOINFO: { @@ -1555,7 +1552,7 @@ xfs_file_ioctl( da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1); if (copy_to_user(arg, &da, sizeof(da))) - return -XFS_ERROR(EFAULT); + return -EFAULT; return 0; } @@ -1588,7 +1585,7 @@ xfs_file_ioctl( struct fsdmidata dmi; if (copy_from_user(&dmi, arg, sizeof(dmi))) - return -XFS_ERROR(EFAULT); + return -EFAULT; error = mnt_want_write_file(filp); if (error) @@ -1597,7 +1594,7 @@ xfs_file_ioctl( error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask, dmi.fsd_dmstate); mnt_drop_write_file(filp); - return -error; + return error; } case XFS_IOC_GETBMAP: @@ -1613,14 +1610,14 @@ xfs_file_ioctl( xfs_fsop_handlereq_t hreq; if (copy_from_user(&hreq, arg, sizeof(hreq))) - return -XFS_ERROR(EFAULT); + return -EFAULT; return 
xfs_find_handle(cmd, &hreq); } case XFS_IOC_OPEN_BY_HANDLE: { xfs_fsop_handlereq_t hreq; if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t))) - return -XFS_ERROR(EFAULT); + return -EFAULT; return xfs_open_by_handle(filp, &hreq); } case XFS_IOC_FSSETDM_BY_HANDLE: @@ -1630,7 +1627,7 @@ xfs_file_ioctl( xfs_fsop_handlereq_t hreq; if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t))) - return -XFS_ERROR(EFAULT); + return -EFAULT; return xfs_readlink_by_handle(filp, &hreq); } case XFS_IOC_ATTRLIST_BY_HANDLE: @@ -1643,13 +1640,13 @@ xfs_file_ioctl( struct xfs_swapext sxp; if (copy_from_user(&sxp, arg, sizeof(xfs_swapext_t))) - return -XFS_ERROR(EFAULT); + return -EFAULT; error = mnt_want_write_file(filp); if (error) return error; error = xfs_ioc_swapext(&sxp); mnt_drop_write_file(filp); - return -error; + return error; } case XFS_IOC_FSCOUNTS: { @@ -1657,10 +1654,10 @@ xfs_file_ioctl( error = xfs_fs_counts(mp, &out); if (error) - return -error; + return error; if (copy_to_user(arg, &out, sizeof(out))) - return -XFS_ERROR(EFAULT); + return -EFAULT; return 0; } @@ -1672,10 +1669,10 @@ xfs_file_ioctl( return -EPERM; if (mp->m_flags & XFS_MOUNT_RDONLY) - return -XFS_ERROR(EROFS); + return -EROFS; if (copy_from_user(&inout, arg, sizeof(inout))) - return -XFS_ERROR(EFAULT); + return -EFAULT; error = mnt_want_write_file(filp); if (error) @@ -1686,10 +1683,10 @@ xfs_file_ioctl( error = xfs_reserve_blocks(mp, &in, &inout); mnt_drop_write_file(filp); if (error) - return -error; + return error; if (copy_to_user(arg, &inout, sizeof(inout))) - return -XFS_ERROR(EFAULT); + return -EFAULT; return 0; } @@ -1701,10 +1698,10 @@ xfs_file_ioctl( error = xfs_reserve_blocks(mp, NULL, &out); if (error) - return -error; + return error; if (copy_to_user(arg, &out, sizeof(out))) - return -XFS_ERROR(EFAULT); + return -EFAULT; return 0; } @@ -1713,42 +1710,42 @@ xfs_file_ioctl( xfs_growfs_data_t in; if (copy_from_user(&in, arg, sizeof(in))) - return -XFS_ERROR(EFAULT); + return -EFAULT; error = mnt_want_write_file(filp); if (error) return error; error = xfs_growfs_data(mp, &in); mnt_drop_write_file(filp); - return -error; + return error; } case XFS_IOC_FSGROWFSLOG: { xfs_growfs_log_t in; if (copy_from_user(&in, arg, sizeof(in))) - return -XFS_ERROR(EFAULT); + return -EFAULT; error = mnt_want_write_file(filp); if (error) return error; error = xfs_growfs_log(mp, &in); mnt_drop_write_file(filp); - return -error; + return error; } case XFS_IOC_FSGROWFSRT: { xfs_growfs_rt_t in; if (copy_from_user(&in, arg, sizeof(in))) - return -XFS_ERROR(EFAULT); + return -EFAULT; error = mnt_want_write_file(filp); if (error) return error; error = xfs_growfs_rt(mp, &in); mnt_drop_write_file(filp); - return -error; + return error; } case XFS_IOC_GOINGDOWN: { @@ -1758,10 +1755,9 @@ xfs_file_ioctl( return -EPERM; if (get_user(in, (__uint32_t __user *)arg)) - return -XFS_ERROR(EFAULT); + return -EFAULT; - error = xfs_fs_goingdown(mp, in); - return -error; + return xfs_fs_goingdown(mp, in); } case XFS_IOC_ERROR_INJECTION: { @@ -1771,18 +1767,16 @@ xfs_file_ioctl( return -EPERM; if (copy_from_user(&in, arg, sizeof(in))) - return -XFS_ERROR(EFAULT); + return -EFAULT; - error = xfs_errortag_add(in.errtag, mp); - return -error; + return xfs_errortag_add(in.errtag, mp); } case XFS_IOC_ERROR_CLEARALL: if (!capable(CAP_SYS_ADMIN)) return -EPERM; - error = xfs_errortag_clearall(mp, 1); - return -error; + return xfs_errortag_clearall(mp, 1); case XFS_IOC_FREE_EOFBLOCKS: { struct xfs_fs_eofblocks eofb; @@ -1792,16 +1786,16 @@ xfs_file_ioctl( 
return -EPERM; if (mp->m_flags & XFS_MOUNT_RDONLY) - return -XFS_ERROR(EROFS); + return -EROFS; if (copy_from_user(&eofb, arg, sizeof(eofb))) - return -XFS_ERROR(EFAULT); + return -EFAULT; error = xfs_fs_eofblocks_from_user(&eofb, &keofb); if (error) - return -error; + return error; - return -xfs_icache_free_eofblocks(mp, &keofb); + return xfs_icache_free_eofblocks(mp, &keofb); } default: diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 944d5baa710a..a554646ff141 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -28,7 +28,6 @@ #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_vnode.h" #include "xfs_inode.h" #include "xfs_itable.h" #include "xfs_error.h" @@ -56,7 +55,7 @@ xfs_compat_flock64_copyin( get_user(bf->l_sysid, &arg32->l_sysid) || get_user(bf->l_pid, &arg32->l_pid) || copy_from_user(bf->l_pad, &arg32->l_pad, 4*sizeof(u32))) - return -XFS_ERROR(EFAULT); + return -EFAULT; return 0; } @@ -70,10 +69,10 @@ xfs_compat_ioc_fsgeometry_v1( error = xfs_fs_geometry(mp, &fsgeo, 3); if (error) - return -error; + return error; /* The 32-bit variant simply has some padding at the end */ if (copy_to_user(arg32, &fsgeo, sizeof(struct compat_xfs_fsop_geom_v1))) - return -XFS_ERROR(EFAULT); + return -EFAULT; return 0; } @@ -84,7 +83,7 @@ xfs_compat_growfs_data_copyin( { if (get_user(in->newblocks, &arg32->newblocks) || get_user(in->imaxpct, &arg32->imaxpct)) - return -XFS_ERROR(EFAULT); + return -EFAULT; return 0; } @@ -95,14 +94,14 @@ xfs_compat_growfs_rt_copyin( { if (get_user(in->newblocks, &arg32->newblocks) || get_user(in->extsize, &arg32->extsize)) - return -XFS_ERROR(EFAULT); + return -EFAULT; return 0; } STATIC int xfs_inumbers_fmt_compat( void __user *ubuffer, - const xfs_inogrp_t *buffer, + const struct xfs_inogrp *buffer, long count, long *written) { @@ -113,7 +112,7 @@ xfs_inumbers_fmt_compat( if (put_user(buffer[i].xi_startino, &p32[i].xi_startino) || put_user(buffer[i].xi_alloccount, &p32[i].xi_alloccount) || put_user(buffer[i].xi_allocmask, &p32[i].xi_allocmask)) - return -XFS_ERROR(EFAULT); + return -EFAULT; } *written = count * sizeof(*p32); return 0; @@ -132,7 +131,7 @@ xfs_ioctl32_bstime_copyin( if (get_user(sec32, &bstime32->tv_sec) || get_user(bstime->tv_nsec, &bstime32->tv_nsec)) - return -XFS_ERROR(EFAULT); + return -EFAULT; bstime->tv_sec = sec32; return 0; } @@ -164,7 +163,7 @@ xfs_ioctl32_bstat_copyin( get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask) || get_user(bstat->bs_dmstate, &bstat32->bs_dmstate) || get_user(bstat->bs_aextents, &bstat32->bs_aextents)) - return -XFS_ERROR(EFAULT); + return -EFAULT; return 0; } @@ -180,7 +179,7 @@ xfs_bstime_store_compat( sec32 = p->tv_sec; if (put_user(sec32, &p32->tv_sec) || put_user(p->tv_nsec, &p32->tv_nsec)) - return -XFS_ERROR(EFAULT); + return -EFAULT; return 0; } @@ -195,7 +194,7 @@ xfs_bulkstat_one_fmt_compat( compat_xfs_bstat_t __user *p32 = ubuffer; if (ubsize < sizeof(*p32)) - return XFS_ERROR(ENOMEM); + return -ENOMEM; if (put_user(buffer->bs_ino, &p32->bs_ino) || put_user(buffer->bs_mode, &p32->bs_mode) || @@ -218,7 +217,7 @@ xfs_bulkstat_one_fmt_compat( put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) || put_user(buffer->bs_dmstate, &p32->bs_dmstate) || put_user(buffer->bs_aextents, &p32->bs_aextents)) - return XFS_ERROR(EFAULT); + return -EFAULT; if (ubused) *ubused = sizeof(*p32); return 0; @@ -256,30 +255,30 @@ xfs_compat_ioc_bulkstat( /* should be called again (unused here, but used in dmapi) */ if (!capable(CAP_SYS_ADMIN)) - return -XFS_ERROR(EPERM); + return 
-EPERM; if (XFS_FORCED_SHUTDOWN(mp)) - return -XFS_ERROR(EIO); + return -EIO; if (get_user(addr, &p32->lastip)) - return -XFS_ERROR(EFAULT); + return -EFAULT; bulkreq.lastip = compat_ptr(addr); if (get_user(bulkreq.icount, &p32->icount) || get_user(addr, &p32->ubuffer)) - return -XFS_ERROR(EFAULT); + return -EFAULT; bulkreq.ubuffer = compat_ptr(addr); if (get_user(addr, &p32->ocount)) - return -XFS_ERROR(EFAULT); + return -EFAULT; bulkreq.ocount = compat_ptr(addr); if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64))) - return -XFS_ERROR(EFAULT); + return -EFAULT; if ((count = bulkreq.icount) <= 0) - return -XFS_ERROR(EINVAL); + return -EINVAL; if (bulkreq.ubuffer == NULL) - return -XFS_ERROR(EINVAL); + return -EINVAL; if (cmd == XFS_IOC_FSINUMBERS_32) { error = xfs_inumbers(mp, &inlast, &count, @@ -294,17 +293,17 @@ xfs_compat_ioc_bulkstat( xfs_bulkstat_one_compat, sizeof(compat_xfs_bstat_t), bulkreq.ubuffer, &done); } else - error = XFS_ERROR(EINVAL); + error = -EINVAL; if (error) - return -error; + return error; if (bulkreq.ocount != NULL) { if (copy_to_user(bulkreq.lastip, &inlast, sizeof(xfs_ino_t))) - return -XFS_ERROR(EFAULT); + return -EFAULT; if (copy_to_user(bulkreq.ocount, &count, sizeof(count))) - return -XFS_ERROR(EFAULT); + return -EFAULT; } return 0; @@ -318,7 +317,7 @@ xfs_compat_handlereq_copyin( compat_xfs_fsop_handlereq_t hreq32; if (copy_from_user(&hreq32, arg32, sizeof(compat_xfs_fsop_handlereq_t))) - return -XFS_ERROR(EFAULT); + return -EFAULT; hreq->fd = hreq32.fd; hreq->path = compat_ptr(hreq32.path); @@ -352,19 +351,19 @@ xfs_compat_attrlist_by_handle( char *kbuf; if (!capable(CAP_SYS_ADMIN)) - return -XFS_ERROR(EPERM); + return -EPERM; if (copy_from_user(&al_hreq, arg, sizeof(compat_xfs_fsop_attrlist_handlereq_t))) - return -XFS_ERROR(EFAULT); + return -EFAULT; if (al_hreq.buflen < sizeof(struct attrlist) || al_hreq.buflen > XATTR_LIST_MAX) - return -XFS_ERROR(EINVAL); + return -EINVAL; /* * Reject flags, only allow namespaces. 
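
The compat bulkstat path above widens 32-bit user pointers field by field, failing the whole copy-in with -EFAULT on the first faulting access. A sketch of the shape of that code, where fetch_u32() and widen() are hypothetical stand-ins for get_user() and compat_ptr():

    #include <errno.h>
    #include <stdint.h>

    struct req   { void *lastip; void *ubuffer; int icount; };
    struct req32 { uint32_t lastip, ubuffer, icount; };

    extern int  fetch_u32(uint32_t *dst, const uint32_t *usrc); /* 0 or -EFAULT */
    extern void *widen(uint32_t uptr);  /* 32-bit user address to pointer */

    int copyin_req(struct req *r, const struct req32 *u)
    {
            uint32_t v;

            if (fetch_u32(&v, &u->lastip))
                    return -EFAULT;
            r->lastip = widen(v);
            if (fetch_u32(&v, &u->ubuffer))
                    return -EFAULT;
            r->ubuffer = widen(v);
            if (fetch_u32(&v, &u->icount))
                    return -EFAULT;
            r->icount = (int)v;
            return 0;
    }
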
*/ if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE)) - return -XFS_ERROR(EINVAL); + return -EINVAL; dentry = xfs_compat_handlereq_to_dentry(parfilp, &al_hreq.hreq); if (IS_ERR(dentry)) @@ -376,7 +375,7 @@ xfs_compat_attrlist_by_handle( goto out_dput; cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; - error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen, + error = xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen, al_hreq.flags, cursor); if (error) goto out_kfree; @@ -404,10 +403,10 @@ xfs_compat_attrmulti_by_handle( unsigned char *attr_name; if (!capable(CAP_SYS_ADMIN)) - return -XFS_ERROR(EPERM); + return -EPERM; if (copy_from_user(&am_hreq, arg, sizeof(compat_xfs_fsop_attrmulti_handlereq_t))) - return -XFS_ERROR(EFAULT); + return -EFAULT; /* overflow check */ if (am_hreq.opcount >= INT_MAX / sizeof(compat_xfs_attr_multiop_t)) @@ -417,7 +416,7 @@ xfs_compat_attrmulti_by_handle( if (IS_ERR(dentry)) return PTR_ERR(dentry); - error = E2BIG; + error = -E2BIG; size = am_hreq.opcount * sizeof(compat_xfs_attr_multiop_t); if (!size || size > 16 * PAGE_SIZE) goto out_dput; @@ -428,7 +427,7 @@ xfs_compat_attrmulti_by_handle( goto out_dput; } - error = ENOMEM; + error = -ENOMEM; attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL); if (!attr_name) goto out_kfree_ops; @@ -439,7 +438,7 @@ xfs_compat_attrmulti_by_handle( compat_ptr(ops[i].am_attrname), MAXNAMELEN); if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) - error = ERANGE; + error = -ERANGE; if (ops[i].am_error < 0) break; @@ -470,19 +469,19 @@ xfs_compat_attrmulti_by_handle( mnt_drop_write_file(parfilp); break; default: - ops[i].am_error = EINVAL; + ops[i].am_error = -EINVAL; } } if (copy_to_user(compat_ptr(am_hreq.ops), ops, size)) - error = XFS_ERROR(EFAULT); + error = -EFAULT; kfree(attr_name); out_kfree_ops: kfree(ops); out_dput: dput(dentry); - return -error; + return error; } STATIC int @@ -496,26 +495,26 @@ xfs_compat_fssetdm_by_handle( struct dentry *dentry; if (!capable(CAP_MKNOD)) - return -XFS_ERROR(EPERM); + return -EPERM; if (copy_from_user(&dmhreq, arg, sizeof(compat_xfs_fsop_setdm_handlereq_t))) - return -XFS_ERROR(EFAULT); + return -EFAULT; dentry = xfs_compat_handlereq_to_dentry(parfilp, &dmhreq.hreq); if (IS_ERR(dentry)) return PTR_ERR(dentry); if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) { - error = -XFS_ERROR(EPERM); + error = -EPERM; goto out; } if (copy_from_user(&fsd, compat_ptr(dmhreq.data), sizeof(fsd))) { - error = -XFS_ERROR(EFAULT); + error = -EFAULT; goto out; } - error = -xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask, + error = xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask, fsd.fsd_dmstate); out: @@ -537,7 +536,7 @@ xfs_file_compat_ioctl( int error; if (filp->f_mode & FMODE_NOCMTIME) - ioflags |= IO_INVIS; + ioflags |= XFS_IO_INVIS; trace_xfs_file_compat_ioctl(ip); @@ -588,7 +587,7 @@ xfs_file_compat_ioctl( struct xfs_flock64 bf; if (xfs_compat_flock64_copyin(&bf, arg)) - return -XFS_ERROR(EFAULT); + return -EFAULT; cmd = _NATIVE_IOC(cmd, struct xfs_flock64); return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf); } @@ -598,25 +597,25 @@ xfs_file_compat_ioctl( struct xfs_growfs_data in; if (xfs_compat_growfs_data_copyin(&in, arg)) - return -XFS_ERROR(EFAULT); + return -EFAULT; error = mnt_want_write_file(filp); if (error) return error; error = xfs_growfs_data(mp, &in); mnt_drop_write_file(filp); - return -error; + return error; } case XFS_IOC_FSGROWFSRT_32: { struct xfs_growfs_rt in; if (xfs_compat_growfs_rt_copyin(&in, arg)) - return 
-XFS_ERROR(EFAULT); + return -EFAULT; error = mnt_want_write_file(filp); if (error) return error; error = xfs_growfs_rt(mp, &in); mnt_drop_write_file(filp); - return -error; + return error; } #endif /* long changes size, but xfs only copies out 32 bits */ @@ -633,13 +632,13 @@ xfs_file_compat_ioctl( if (copy_from_user(&sxp, sxu, offsetof(struct xfs_swapext, sx_stat)) || xfs_ioctl32_bstat_copyin(&sxp.sx_stat, &sxu->sx_stat)) - return -XFS_ERROR(EFAULT); + return -EFAULT; error = mnt_want_write_file(filp); if (error) return error; error = xfs_ioc_swapext(&sxp); mnt_drop_write_file(filp); - return -error; + return error; } case XFS_IOC_FSBULKSTAT_32: case XFS_IOC_FSBULKSTAT_SINGLE_32: @@ -651,7 +650,7 @@ xfs_file_compat_ioctl( struct xfs_fsop_handlereq hreq; if (xfs_compat_handlereq_copyin(&hreq, arg)) - return -XFS_ERROR(EFAULT); + return -EFAULT; cmd = _NATIVE_IOC(cmd, struct xfs_fsop_handlereq); return xfs_find_handle(cmd, &hreq); } @@ -659,14 +658,14 @@ xfs_file_compat_ioctl( struct xfs_fsop_handlereq hreq; if (xfs_compat_handlereq_copyin(&hreq, arg)) - return -XFS_ERROR(EFAULT); + return -EFAULT; return xfs_open_by_handle(filp, &hreq); } case XFS_IOC_READLINK_BY_HANDLE_32: { struct xfs_fsop_handlereq hreq; if (xfs_compat_handlereq_copyin(&hreq, arg)) - return -XFS_ERROR(EFAULT); + return -EFAULT; return xfs_readlink_by_handle(filp, &hreq); } case XFS_IOC_ATTRLIST_BY_HANDLE_32: @@ -676,6 +675,6 @@ xfs_file_compat_ioctl( case XFS_IOC_FSSETDM_BY_HANDLE_32: return xfs_compat_fssetdm_by_handle(filp, arg); default: - return -XFS_ERROR(ENOIOCTLCMD); + return -ENOIOCTLCMD; } } diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 6d3ec2b6ee29..e9c47b6f5e5a 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -110,7 +110,7 @@ xfs_alert_fsblock_zero( (unsigned long long)imap->br_startoff, (unsigned long long)imap->br_blockcount, imap->br_state); - return EFSCORRUPTED; + return -EFSCORRUPTED; } int @@ -138,7 +138,7 @@ xfs_iomap_write_direct( error = xfs_qm_dqattach(ip, 0); if (error) - return XFS_ERROR(error); + return error; rt = XFS_IS_REALTIME_INODE(ip); extsz = xfs_get_extsz_hint(ip); @@ -148,7 +148,7 @@ xfs_iomap_write_direct( if ((offset + count) > XFS_ISIZE(ip)) { error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb); if (error) - return XFS_ERROR(error); + return error; } else { if (nmaps && (imap->br_startblock == HOLESTARTBLOCK)) last_fsb = MIN(last_fsb, (xfs_fileoff_t) @@ -188,7 +188,7 @@ xfs_iomap_write_direct( */ if (error) { xfs_trans_cancel(tp, 0); - return XFS_ERROR(error); + return error; } xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -225,7 +225,7 @@ xfs_iomap_write_direct( * Copy any maps to caller's array and return any error. */ if (nimaps == 0) { - error = XFS_ERROR(ENOSPC); + error = -ENOSPC; goto out_unlock; } @@ -397,7 +397,8 @@ xfs_quota_calc_throttle( struct xfs_inode *ip, int type, xfs_fsblock_t *qblocks, - int *qshift) + int *qshift, + int64_t *qfreesp) { int64_t freesp; int shift = 0; @@ -406,6 +407,7 @@ xfs_quota_calc_throttle( /* over hi wmark, squash the prealloc completely */ if (dq->q_res_bcount >= dq->q_prealloc_hi_wmark) { *qblocks = 0; + *qfreesp = 0; return; } @@ -418,6 +420,9 @@ xfs_quota_calc_throttle( shift += 2; } + if (freesp < *qfreesp) + *qfreesp = freesp; + /* only overwrite the throttle values if we are more aggressive */ if ((freesp >> shift) < (*qblocks >> *qshift)) { *qblocks = freesp; @@ -476,15 +481,18 @@ xfs_iomap_prealloc_size( } /* - * Check each quota to cap the prealloc size and provide a shift - * value to throttle with.
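On the xfs_quota_calc_throttle() rework in this hunk: the function throttles speculative preallocation by right-shifting the prealloc size harder as a quota approaches its high watermark, and the new qfreesp out-parameter lets each quota clamp the caller's view of available space so the final prealloc is sized against the tightest limit. A standalone model of the idea; the names and the percentage thresholds are illustrative (the real code precomputes its low-space levels):

    #include <stdint.h>

    /* Bigger shift = smaller preallocation. */
    static int throttle_shift(int64_t freesp, int64_t limit)
    {
        int shift = 0;

        if (freesp < limit / 20)          /* roughly 5% remaining */
            shift = 2;
        if (freesp < (limit / 100) * 3)   /* roughly 3% */
            shift += 2;
        if (freesp < limit / 100)         /* roughly 1% */
            shift += 2;
        return shift;
    }

    /* The new out-parameter: every quota gets a chance to lower the
     * shared free-space estimate, mirroring "if (freesp < *qfreesp)
     * *qfreesp = freesp;" in the hunk. */
    static void clamp_freesp(int64_t freesp, int64_t *qfreesp)
    {
        if (freesp < *qfreesp)
            *qfreesp = freesp;
    }

Note that the over-the-watermark case zeroes both the block count and the free-space estimate, squashing preallocation completely rather than merely shrinking it.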
+ * Check each quota to cap the prealloc size, provide a shift value to + * throttle with and adjust amount of available space. */ if (xfs_quota_need_throttle(ip, XFS_DQ_USER, alloc_blocks)) - xfs_quota_calc_throttle(ip, XFS_DQ_USER, &qblocks, &qshift); + xfs_quota_calc_throttle(ip, XFS_DQ_USER, &qblocks, &qshift, + &freesp); if (xfs_quota_need_throttle(ip, XFS_DQ_GROUP, alloc_blocks)) - xfs_quota_calc_throttle(ip, XFS_DQ_GROUP, &qblocks, &qshift); + xfs_quota_calc_throttle(ip, XFS_DQ_GROUP, &qblocks, &qshift, + &freesp); if (xfs_quota_need_throttle(ip, XFS_DQ_PROJ, alloc_blocks)) - xfs_quota_calc_throttle(ip, XFS_DQ_PROJ, &qblocks, &qshift); + xfs_quota_calc_throttle(ip, XFS_DQ_PROJ, &qblocks, &qshift, + &freesp); /* * The final prealloc size is set to the minimum of free space available @@ -552,7 +560,7 @@ xfs_iomap_write_delay( */ error = xfs_qm_dqattach_locked(ip, 0); if (error) - return XFS_ERROR(error); + return error; extsz = xfs_get_extsz_hint(ip); offset_fsb = XFS_B_TO_FSBT(mp, offset); @@ -596,11 +604,11 @@ retry: imap, &nimaps, XFS_BMAPI_ENTIRE); switch (error) { case 0: - case ENOSPC: - case EDQUOT: + case -ENOSPC: + case -EDQUOT: break; default: - return XFS_ERROR(error); + return error; } /* @@ -614,7 +622,7 @@ retry: error = 0; goto retry; } - return XFS_ERROR(error ? error : ENOSPC); + return error ? error : -ENOSPC; } if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip))) @@ -663,7 +671,7 @@ xfs_iomap_write_allocate( */ error = xfs_qm_dqattach(ip, 0); if (error) - return XFS_ERROR(error); + return error; offset_fsb = XFS_B_TO_FSBT(mp, offset); count_fsb = imap->br_blockcount; @@ -690,7 +698,7 @@ xfs_iomap_write_allocate( nres, 0); if (error) { xfs_trans_cancel(tp, 0); - return XFS_ERROR(error); + return error; } xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); @@ -739,7 +747,7 @@ xfs_iomap_write_allocate( if ((map_start_fsb + count_fsb) > last_block) { count_fsb = last_block - map_start_fsb; if (count_fsb == 0) { - error = EAGAIN; + error = -EAGAIN; goto trans_cancel; } } @@ -793,7 +801,7 @@ trans_cancel: xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); error0: xfs_iunlock(ip, XFS_ILOCK_EXCL); - return XFS_ERROR(error); + return error; } int @@ -853,7 +861,7 @@ xfs_iomap_write_unwritten( resblks, 0); if (error) { xfs_trans_cancel(tp, 0); - return XFS_ERROR(error); + return error; } xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -892,7 +900,7 @@ xfs_iomap_write_unwritten( error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); xfs_iunlock(ip, XFS_ILOCK_EXCL); if (error) - return XFS_ERROR(error); + return error; if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip))) return xfs_alert_fsblock_zero(ip, &imap); @@ -915,5 +923,5 @@ error_on_bmapi_transaction: xfs_bmap_cancel(&free_list); xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT)); xfs_iunlock(ip, XFS_ILOCK_EXCL); - return XFS_ERROR(error); + return error; } diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 205613a06068..72129493e9d3 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -72,7 +72,7 @@ xfs_initxattrs( int error = 0; for (xattr = xattr_array; xattr->name != NULL; xattr++) { - error = -xfs_attr_set(ip, xattr->name, xattr->value, + error = xfs_attr_set(ip, xattr->name, xattr->value, xattr->value_len, ATTR_SECURE); if (error < 0) break; @@ -93,7 +93,7 @@ xfs_init_security( struct inode *dir, const struct qstr *qstr) { - return -security_inode_init_security(inode, dir, qstr, + return security_inode_init_security(inode, dir, qstr, &xfs_initxattrs, NULL); } @@ 
-173,12 +173,12 @@ xfs_generic_create( #ifdef CONFIG_XFS_POSIX_ACL if (default_acl) { - error = -xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + error = xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); if (error) goto out_cleanup_inode; } if (acl) { - error = -xfs_set_acl(inode, acl, ACL_TYPE_ACCESS); + error = xfs_set_acl(inode, acl, ACL_TYPE_ACCESS); if (error) goto out_cleanup_inode; } @@ -194,7 +194,7 @@ xfs_generic_create( posix_acl_release(default_acl); if (acl) posix_acl_release(acl); - return -error; + return error; out_cleanup_inode: if (!tmpfile) @@ -248,8 +248,8 @@ xfs_vn_lookup( xfs_dentry_to_name(&name, dentry, 0); error = xfs_lookup(XFS_I(dir), &name, &cip, NULL); if (unlikely(error)) { - if (unlikely(error != ENOENT)) - return ERR_PTR(-error); + if (unlikely(error != -ENOENT)) + return ERR_PTR(error); d_add(dentry, NULL); return NULL; } @@ -275,8 +275,8 @@ xfs_vn_ci_lookup( xfs_dentry_to_name(&xname, dentry, 0); error = xfs_lookup(XFS_I(dir), &xname, &ip, &ci_name); if (unlikely(error)) { - if (unlikely(error != ENOENT)) - return ERR_PTR(-error); + if (unlikely(error != -ENOENT)) + return ERR_PTR(error); /* * call d_add(dentry, NULL) here when d_drop_negative_children * is called in xfs_vn_mknod (ie. allow negative dentries @@ -311,7 +311,7 @@ xfs_vn_link( error = xfs_link(XFS_I(dir), XFS_I(inode), &name); if (unlikely(error)) - return -error; + return error; ihold(inode); d_instantiate(dentry, inode); @@ -328,7 +328,7 @@ xfs_vn_unlink( xfs_dentry_to_name(&name, dentry, 0); - error = -xfs_remove(XFS_I(dir), &name, XFS_I(dentry->d_inode)); + error = xfs_remove(XFS_I(dir), &name, XFS_I(dentry->d_inode)); if (error) return error; @@ -375,7 +375,7 @@ xfs_vn_symlink( xfs_cleanup_inode(dir, inode, dentry); iput(inode); out: - return -error; + return error; } STATIC int @@ -392,8 +392,8 @@ xfs_vn_rename( xfs_dentry_to_name(&oname, odentry, 0); xfs_dentry_to_name(&nname, ndentry, odentry->d_inode->i_mode); - return -xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode), - XFS_I(ndir), &nname, new_inode ? + return xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode), + XFS_I(ndir), &nname, new_inode ? XFS_I(new_inode) : NULL); } @@ -414,7 +414,7 @@ xfs_vn_follow_link( if (!link) goto out_err; - error = -xfs_readlink(XFS_I(dentry->d_inode), link); + error = xfs_readlink(XFS_I(dentry->d_inode), link); if (unlikely(error)) goto out_kfree; @@ -441,7 +441,7 @@ xfs_vn_getattr( trace_xfs_getattr(ip); if (XFS_FORCED_SHUTDOWN(mp)) - return -XFS_ERROR(EIO); + return -EIO; stat->size = XFS_ISIZE(ip); stat->dev = inode->i_sb->s_dev; @@ -546,14 +546,14 @@ xfs_setattr_nonsize( /* If acls are being inherited, we already have this checked */ if (!(flags & XFS_ATTR_NOACL)) { if (mp->m_flags & XFS_MOUNT_RDONLY) - return XFS_ERROR(EROFS); + return -EROFS; if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); + return -EIO; - error = -inode_change_ok(inode, iattr); + error = inode_change_ok(inode, iattr); if (error) - return XFS_ERROR(error); + return error; } ASSERT((mask & ATTR_SIZE) == 0); @@ -703,7 +703,7 @@ xfs_setattr_nonsize( xfs_qm_dqrele(gdqp); if (error) - return XFS_ERROR(error); + return error; /* * XXX(hch): Updating the ACL entries is not atomic vs the i_mode @@ -713,9 +713,9 @@ xfs_setattr_nonsize( * Posix ACL code seems to care about this issue either. 
*/ if ((mask & ATTR_MODE) && !(flags & XFS_ATTR_NOACL)) { - error = -posix_acl_chmod(inode, inode->i_mode); + error = posix_acl_chmod(inode, inode->i_mode); if (error) - return XFS_ERROR(error); + return error; } return 0; @@ -748,14 +748,14 @@ xfs_setattr_size( trace_xfs_setattr(ip); if (mp->m_flags & XFS_MOUNT_RDONLY) - return XFS_ERROR(EROFS); + return -EROFS; if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); + return -EIO; - error = -inode_change_ok(inode, iattr); + error = inode_change_ok(inode, iattr); if (error) - return XFS_ERROR(error); + return error; ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); ASSERT(S_ISREG(ip->i_d.di_mode)); @@ -818,7 +818,7 @@ xfs_setattr_size( * care about here. */ if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) { - error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ip->i_d.di_size, newsize); if (error) return error; @@ -844,7 +844,7 @@ xfs_setattr_size( * much we can do about this, except to hope that the caller sees ENOMEM * and retries the truncate operation. */ - error = -block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks); + error = block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks); if (error) return error; truncate_setsize(inode, newsize); @@ -950,7 +950,7 @@ xfs_vn_setattr( error = xfs_setattr_nonsize(ip, iattr, 0); } - return -error; + return error; } STATIC int @@ -970,7 +970,7 @@ xfs_vn_update_time( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); if (error) { xfs_trans_cancel(tp, 0); - return -error; + return error; } xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -991,7 +991,7 @@ xfs_vn_update_time( } xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP); - return -xfs_trans_commit(tp, 0); + return xfs_trans_commit(tp, 0); } #define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) @@ -1036,7 +1036,7 @@ xfs_fiemap_format( *full = 1; /* user array now full */ } - return -error; + return error; } STATIC int @@ -1055,12 +1055,12 @@ xfs_vn_fiemap( return error; /* Set up bmap header for xfs internal routine */ - bm.bmv_offset = BTOBB(start); + bm.bmv_offset = BTOBBT(start); /* Special case for whole file */ if (length == FIEMAP_MAX_OFFSET) bm.bmv_length = -1LL; else - bm.bmv_length = BTOBB(length); + bm.bmv_length = BTOBB(start + length) - bm.bmv_offset; /* We add one because in getbmap world count includes the header */ bm.bmv_count = !fieinfo->fi_extents_max ? 
MAXEXTNUM : @@ -1075,7 +1075,7 @@ xfs_vn_fiemap( error = xfs_getbmap(ip, &bm, xfs_fiemap_format, fieinfo); if (error) - return -error; + return error; return 0; } diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index cb64f222d607..f71be9c68017 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -67,19 +67,17 @@ xfs_bulkstat_one_int( *stat = BULKSTAT_RV_NOTHING; if (!buffer || xfs_internal_inum(mp, ino)) - return XFS_ERROR(EINVAL); + return -EINVAL; buf = kmem_alloc(sizeof(*buf), KM_SLEEP | KM_MAYFAIL); if (!buf) - return XFS_ERROR(ENOMEM); + return -ENOMEM; error = xfs_iget(mp, NULL, ino, (XFS_IGET_DONTCACHE | XFS_IGET_UNTRUSTED), XFS_ILOCK_SHARED, &ip); - if (error) { - *stat = BULKSTAT_RV_NOTHING; + if (error) goto out_free; - } ASSERT(ip != NULL); ASSERT(ip->i_imap.im_blkno != 0); @@ -136,7 +134,6 @@ xfs_bulkstat_one_int( IRELE(ip); error = formatter(buffer, ubsize, ubused, buf); - if (!error) *stat = BULKSTAT_RV_DIDONE; @@ -154,9 +151,9 @@ xfs_bulkstat_one_fmt( const xfs_bstat_t *buffer) { if (ubsize < sizeof(*buffer)) - return XFS_ERROR(ENOMEM); + return -ENOMEM; if (copy_to_user(ubuffer, buffer, sizeof(*buffer))) - return XFS_ERROR(EFAULT); + return -EFAULT; if (ubused) *ubused = sizeof(*buffer); return 0; @@ -175,9 +172,170 @@ xfs_bulkstat_one( xfs_bulkstat_one_fmt, ubused, stat); } +/* + * Loop over all clusters in a chunk for a given incore inode allocation btree + * record. Do a readahead if there are any allocated inodes in that cluster. + */ +STATIC void +xfs_bulkstat_ichunk_ra( + struct xfs_mount *mp, + xfs_agnumber_t agno, + struct xfs_inobt_rec_incore *irec) +{ + xfs_agblock_t agbno; + struct blk_plug plug; + int blks_per_cluster; + int inodes_per_cluster; + int i; /* inode chunk index */ + + agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino); + blks_per_cluster = xfs_icluster_size_fsb(mp); + inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog; + + blk_start_plug(&plug); + for (i = 0; i < XFS_INODES_PER_CHUNK; + i += inodes_per_cluster, agbno += blks_per_cluster) { + if (xfs_inobt_maskn(i, inodes_per_cluster) & ~irec->ir_free) { + xfs_btree_reada_bufs(mp, agno, agbno, blks_per_cluster, + &xfs_inode_buf_ops); + } + } + blk_finish_plug(&plug); +} + +/* + * Lookup the inode chunk that the given inode lives in and then get the record + * if we found the chunk. If the inode was not the last in the chunk and there + * are some left allocated, update the data for the pointed-to record as well as + * return the count of grabbed inodes. + */ +STATIC int +xfs_bulkstat_grab_ichunk( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agino_t agino, /* starting inode of chunk */ + int *icount,/* return # of inodes grabbed */ + struct xfs_inobt_rec_incore *irec) /* btree record */ +{ + int idx; /* index into inode chunk */ + int stat; + int error = 0; + + /* Lookup the inode chunk that this inode lives in */ + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &stat); + if (error) + return error; + if (!stat) { + *icount = 0; + return error; + } + + /* Get the record, should always work */ + error = xfs_inobt_get_rec(cur, irec, &stat); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(stat == 1); + + /* Check if the record contains the inode in request */ + if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino) + return -EINVAL; + + idx = agino - irec->ir_startino + 1; + if (idx < XFS_INODES_PER_CHUNK && + (xfs_inobt_maskn(idx, XFS_INODES_PER_CHUNK - idx) & ~irec->ir_free)) { + int i; + + /* We got a right chunk with some left inodes allocated at it. 
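The new xfs_bulkstat_ichunk_ra() above brackets its cluster readahead loop with a block plug, so the individual buffer reads are queued per task and handed to the block layer as one mergeable batch instead of being dispatched one at a time. The bare pattern, as a sketch (reada_cluster() is a stand-in for xfs_btree_reada_bufs()):

    #include <linux/blkdev.h>

    static void reada_cluster(int cluster)
    {
        /* stand-in: queue an asynchronous metadata read */
    }

    static void chunk_readahead(int nclusters)
    {
        struct blk_plug plug;
        int i;

        blk_start_plug(&plug);      /* start collecting this task's I/O */
        for (i = 0; i < nclusters; i++)
            reada_cluster(i);       /* queued, not yet submitted */
        blk_finish_plug(&plug);     /* flush the whole batch at once */
    }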
+ * Grab the chunk record. Mark all the uninteresting inodes + * free -- because they're before our start point. + */ + for (i = 0; i < idx; i++) { + if (XFS_INOBT_MASK(i) & ~irec->ir_free) + irec->ir_freecount++; + } + + irec->ir_free |= xfs_inobt_maskn(0, idx); + *icount = XFS_INODES_PER_CHUNK - irec->ir_freecount; + } + + return 0; +} + #define XFS_BULKSTAT_UBLEFT(ubleft) ((ubleft) >= statstruct_size) /* + * Process inodes in chunk with a pointer to a formatter function + * that will iget the inode and fill in the appropriate structure. + */ +int +xfs_bulkstat_ag_ichunk( + struct xfs_mount *mp, + xfs_agnumber_t agno, + struct xfs_inobt_rec_incore *irbp, + bulkstat_one_pf formatter, + size_t statstruct_size, + struct xfs_bulkstat_agichunk *acp) +{ + xfs_ino_t lastino = acp->ac_lastino; + char __user **ubufp = acp->ac_ubuffer; + int ubleft = acp->ac_ubleft; + int ubelem = acp->ac_ubelem; + int chunkidx, clustidx; + int error = 0; + xfs_agino_t agino; + + for (agino = irbp->ir_startino, chunkidx = clustidx = 0; + XFS_BULKSTAT_UBLEFT(ubleft) && + irbp->ir_freecount < XFS_INODES_PER_CHUNK; + chunkidx++, clustidx++, agino++) { + int fmterror; /* bulkstat formatter result */ + int ubused; + xfs_ino_t ino = XFS_AGINO_TO_INO(mp, agno, agino); + + ASSERT(chunkidx < XFS_INODES_PER_CHUNK); + + /* Skip if this inode is free */ + if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free) { + lastino = ino; + continue; + } + + /* + * Count used inodes as free so we can tell when the + * chunk is used up. + */ + irbp->ir_freecount++; + + /* Get the inode and fill in a single buffer */ + ubused = statstruct_size; + error = formatter(mp, ino, *ubufp, ubleft, &ubused, &fmterror); + if (fmterror == BULKSTAT_RV_NOTHING) { + if (error && error != -ENOENT && error != -EINVAL) { + ubleft = 0; + break; + } + lastino = ino; + continue; + } + if (fmterror == BULKSTAT_RV_GIVEUP) { + ubleft = 0; + ASSERT(error); + break; + } + if (*ubufp) + *ubufp += ubused; + ubleft -= ubused; + ubelem++; + lastino = ino; + } + + acp->ac_lastino = lastino; + acp->ac_ubleft = ubleft; + acp->ac_ubelem = ubelem; + + return error; +} + +/* * Return stat information in bulk (by-inode) for the filesystem. 
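xfs_bulkstat_grab_ichunk() resumes a bulkstat in the middle of an inode chunk by pretending every inode below the resume index is free, which lets the ordinary skip-free-inodes path ignore them. A userspace model of that bitmask trick; maskn() stands in for xfs_inobt_maskn(), and a set bit means "free":

    #include <stdint.h>
    #include <stdio.h>

    #define INODES_PER_CHUNK 64

    static uint64_t maskn(int i, int n)  /* n bits set, starting at bit i */
    {
        uint64_t m = (n >= 64) ? ~0ULL : (1ULL << n) - 1;

        return m << i;
    }

    int main(void)
    {
        uint64_t ir_free = maskn(0, 4);  /* inodes 0-3 genuinely free */
        int freecount = 4;
        int idx = 10;                    /* resume with inode 10 */
        int i;

        /* Count allocated inodes below the resume point as free too,
         * just as the loop in the hunk bumps ir_freecount. */
        for (i = 0; i < idx; i++)
            if (maskn(i, 1) & ~ir_free)
                freecount++;
        ir_free |= maskn(0, idx);

        /* icount: inodes this call can still report */
        printf("%d\n", INODES_PER_CHUNK - freecount);  /* prints 54 */
        return 0;
    }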
*/ int /* error status */ @@ -190,13 +348,10 @@ xfs_bulkstat( char __user *ubuffer, /* buffer with inode stats */ int *done) /* 1 if there are more stats to get */ { - xfs_agblock_t agbno=0;/* allocation group block number */ xfs_buf_t *agbp; /* agi header buffer */ xfs_agi_t *agi; /* agi header data */ xfs_agino_t agino; /* inode # in allocation group */ xfs_agnumber_t agno; /* allocation group number */ - int chunkidx; /* current index into inode chunk */ - int clustidx; /* current index into inode cluster */ xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */ int end_of_ag; /* set if we've seen the ag end */ int error; /* error code */ @@ -209,8 +364,6 @@ xfs_bulkstat( xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */ xfs_inobt_rec_incore_t *irbufend; /* end of good irec buffer entries */ xfs_ino_t lastino; /* last inode number returned */ - int blks_per_cluster; /* # of blocks per cluster */ - int inodes_per_cluster;/* # of inodes per cluster */ int nirbuf; /* size of irbuf */ int rval; /* return value error code */ int tmp; /* result value from btree calls */ @@ -218,7 +371,6 @@ xfs_bulkstat( int ubleft; /* bytes left in user's buffer */ char __user *ubufp; /* pointer into user's buffer */ int ubelem; /* spaces used in user's buffer */ - int ubused; /* bytes used by formatter */ /* * Get the last inode value, see if there's nothing to do. @@ -233,20 +385,16 @@ xfs_bulkstat( *ubcountp = 0; return 0; } - if (!ubcountp || *ubcountp <= 0) { - return EINVAL; - } + ubcount = *ubcountp; /* statstruct's */ ubleft = ubcount * statstruct_size; /* bytes */ *ubcountp = ubelem = 0; *done = 0; fmterror = 0; ubufp = ubuffer; - blks_per_cluster = xfs_icluster_size_fsb(mp); - inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog; irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4); if (!irbuf) - return ENOMEM; + return -ENOMEM; nirbuf = irbsize / sizeof(*irbuf); @@ -258,14 +406,8 @@ xfs_bulkstat( while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) { cond_resched(); error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); - if (error) { - /* - * Skip this allocation group and go to the next one. - */ - agno++; - agino = 0; - continue; - } + if (error) + break; agi = XFS_BUF_TO_AGI(agbp); /* * Allocate and initialize a btree cursor for ialloc btree. @@ -275,96 +417,39 @@ xfs_bulkstat( irbp = irbuf; irbufend = irbuf + nirbuf; end_of_ag = 0; - /* - * If we're returning in the middle of an allocation group, - * we need to get the remainder of the chunk we're in. - */ + icount = 0; if (agino > 0) { - xfs_inobt_rec_incore_t r; - /* - * Lookup the inode chunk that this inode lives in. + * In the middle of an allocation group, we need to get + * the remainder of the chunk we're in. */ - error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, - &tmp); - if (!error && /* no I/O error */ - tmp && /* lookup succeeded */ - /* got the record, should always work */ - !(error = xfs_inobt_get_rec(cur, &r, &i)) && - i == 1 && - /* this is the right chunk */ - agino < r.ir_startino + XFS_INODES_PER_CHUNK && - /* lastino was not last in chunk */ - (chunkidx = agino - r.ir_startino + 1) < - XFS_INODES_PER_CHUNK && - /* there are some left allocated */ - xfs_inobt_maskn(chunkidx, - XFS_INODES_PER_CHUNK - chunkidx) & - ~r.ir_free) { - /* - * Grab the chunk record. Mark all the - * uninteresting inodes (because they're - * before our start point) free. 
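A small detail worth calling out in xfs_bulkstat() above: the irec buffer comes from kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4), which asks for the large size and settles for less under memory pressure, and the code then derives nirbuf from whatever size actually came back. A hypothetical userspace stand-in for that helper:

    #include <stdlib.h>

    /* Try big first, halve on failure, report the achieved size. */
    static void *zalloc_greedy(size_t *size, size_t minsize, size_t maxsize)
    {
        size_t sz;

        for (sz = maxsize; sz >= minsize; sz >>= 1) {
            void *p = calloc(1, sz);

            if (p) {
                *size = sz;   /* caller computes its element count from this */
                return p;
            }
        }
        return NULL;          /* not even minsize worked: -ENOMEM above */
    }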
- */ - for (i = 0; i < chunkidx; i++) { - if (XFS_INOBT_MASK(i) & ~r.ir_free) - r.ir_freecount++; - } - r.ir_free |= xfs_inobt_maskn(0, chunkidx); + struct xfs_inobt_rec_incore r; + + error = xfs_bulkstat_grab_ichunk(cur, agino, &icount, &r); + if (error) + break; + if (icount) { irbp->ir_startino = r.ir_startino; irbp->ir_freecount = r.ir_freecount; irbp->ir_free = r.ir_free; irbp++; agino = r.ir_startino + XFS_INODES_PER_CHUNK; - icount = XFS_INODES_PER_CHUNK - r.ir_freecount; - } else { - /* - * If any of those tests failed, bump the - * inode number (just in case). - */ - agino++; - icount = 0; } - /* - * In any case, increment to the next record. - */ - if (!error) - error = xfs_btree_increment(cur, 0, &tmp); + /* Increment to the next record */ + error = xfs_btree_increment(cur, 0, &tmp); } else { - /* - * Start of ag. Lookup the first inode chunk. - */ + /* Start of ag. Lookup the first inode chunk */ error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &tmp); - icount = 0; } + if (error) + break; + /* * Loop through inode btree records in this ag, * until we run out of inodes or space in the buffer. */ while (irbp < irbufend && icount < ubcount) { - xfs_inobt_rec_incore_t r; - - /* - * Loop as long as we're unable to read the - * inode btree. - */ - while (error) { - agino += XFS_INODES_PER_CHUNK; - if (XFS_AGINO_TO_AGBNO(mp, agino) >= - be32_to_cpu(agi->agi_length)) - break; - error = xfs_inobt_lookup(cur, agino, - XFS_LOOKUP_GE, &tmp); - cond_resched(); - } - /* - * If ran off the end of the ag either with an error, - * or the normal way, set end and stop collecting. - */ - if (error) { - end_of_ag = 1; - break; - } + struct xfs_inobt_rec_incore r; error = xfs_inobt_get_rec(cur, &r, &i); if (error || i == 0) { @@ -377,25 +462,7 @@ xfs_bulkstat( * Also start read-ahead now for this chunk. */ if (r.ir_freecount < XFS_INODES_PER_CHUNK) { - struct blk_plug plug; - /* - * Loop over all clusters in the next chunk. - * Do a readahead if there are any allocated - * inodes in that cluster. - */ - blk_start_plug(&plug); - agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino); - for (chunkidx = 0; - chunkidx < XFS_INODES_PER_CHUNK; - chunkidx += inodes_per_cluster, - agbno += blks_per_cluster) { - if (xfs_inobt_maskn(chunkidx, - inodes_per_cluster) & ~r.ir_free) - xfs_btree_reada_bufs(mp, agno, - agbno, blks_per_cluster, - &xfs_inode_buf_ops); - } - blk_finish_plug(&plug); + xfs_bulkstat_ichunk_ra(mp, agno, &r); irbp->ir_startino = r.ir_startino; irbp->ir_freecount = r.ir_freecount; irbp->ir_free = r.ir_free; @@ -422,57 +489,20 @@ xfs_bulkstat( irbufend = irbp; for (irbp = irbuf; irbp < irbufend && XFS_BULKSTAT_UBLEFT(ubleft); irbp++) { - /* - * Now process this chunk of inodes. - */ - for (agino = irbp->ir_startino, chunkidx = clustidx = 0; - XFS_BULKSTAT_UBLEFT(ubleft) && - irbp->ir_freecount < XFS_INODES_PER_CHUNK; - chunkidx++, clustidx++, agino++) { - ASSERT(chunkidx < XFS_INODES_PER_CHUNK); - - ino = XFS_AGINO_TO_INO(mp, agno, agino); - /* - * Skip if this inode is free. - */ - if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free) { - lastino = ino; - continue; - } - /* - * Count used inodes as free so we can tell - * when the chunk is used up. - */ - irbp->ir_freecount++; - - /* - * Get the inode and fill in a single buffer. 
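The open-coded per-inode loop being deleted here is exactly what moved into xfs_bulkstat_ag_ichunk() earlier in the diff: a formatter callback emits one stat record into the user buffer, reports how many bytes it consumed, and the loop advances the buffer pointer and the byte/element counters. A simplified, self-contained model of that accounting (all names illustrative):

    #include <string.h>

    struct chunk_ctx {
        char *ubuffer;   /* next free byte in the caller's buffer */
        int   ubleft;    /* bytes still available */
        int   ubelem;    /* records emitted so far */
    };

    /* Toy formatter: writes one fixed-size record, reports bytes used. */
    static int format_one(char *dst, int space, int recsize, int *used)
    {
        if (space < recsize)
            return -1;             /* out of user buffer */
        memset(dst, 0, recsize);   /* a real formatter copies a bstat */
        *used = recsize;
        return 0;
    }

    static int fill_chunk(struct chunk_ctx *c, int ninodes, int recsize)
    {
        while (ninodes-- > 0) {
            int used = 0;

            if (format_one(c->ubuffer, c->ubleft, recsize, &used))
                break;
            c->ubuffer += used;
            c->ubleft  -= used;
            c->ubelem++;
        }
        return c->ubelem;
    }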
- */ - ubused = statstruct_size; - error = formatter(mp, ino, ubufp, ubleft, - &ubused, &fmterror); - if (fmterror == BULKSTAT_RV_NOTHING) { - if (error && error != ENOENT && - error != EINVAL) { - ubleft = 0; - rval = error; - break; - } - lastino = ino; - continue; - } - if (fmterror == BULKSTAT_RV_GIVEUP) { - ubleft = 0; - ASSERT(error); - rval = error; - break; - } - if (ubufp) - ubufp += ubused; - ubleft -= ubused; - ubelem++; - lastino = ino; - } + struct xfs_bulkstat_agichunk ac; + + ac.ac_lastino = lastino; + ac.ac_ubuffer = &ubuffer; + ac.ac_ubleft = ubleft; + ac.ac_ubelem = ubelem; + error = xfs_bulkstat_ag_ichunk(mp, agno, irbp, + formatter, statstruct_size, &ac); + if (error) + rval = error; + + lastino = ac.ac_lastino; + ubleft = ac.ac_ubleft; + ubelem = ac.ac_ubelem; cond_resched(); } @@ -512,58 +542,10 @@ xfs_bulkstat( return rval; } -/* - * Return stat information in bulk (by-inode) for the filesystem. - * Special case for non-sequential one inode bulkstat. - */ -int /* error status */ -xfs_bulkstat_single( - xfs_mount_t *mp, /* mount point for filesystem */ - xfs_ino_t *lastinop, /* inode to return */ - char __user *buffer, /* buffer with inode stats */ - int *done) /* 1 if there are more stats to get */ -{ - int count; /* count value for bulkstat call */ - int error; /* return value */ - xfs_ino_t ino; /* filesystem inode number */ - int res; /* result from bs1 */ - - /* - * note that requesting valid inode numbers which are not allocated - * to inodes will most likely cause xfs_imap_to_bp to generate warning - * messages about bad magic numbers. This is ok. The fact that - * the inode isn't actually an inode is handled by the - * error check below. Done this way to make the usual case faster - * at the expense of the error case. - */ - - ino = *lastinop; - error = xfs_bulkstat_one(mp, ino, buffer, sizeof(xfs_bstat_t), - NULL, &res); - if (error) { - /* - * Special case way failed, do it the "long" way - * to see if that works. - */ - (*lastinop)--; - count = 1; - if (xfs_bulkstat(mp, lastinop, &count, xfs_bulkstat_one, - sizeof(xfs_bstat_t), buffer, done)) - return error; - if (count == 0 || (xfs_ino_t)*lastinop != ino) - return error == EFSCORRUPTED ? 
- XFS_ERROR(EINVAL) : error; - else - return 0; - } - *done = 0; - return 0; -} - int xfs_inumbers_fmt( void __user *ubuffer, /* buffer to write to */ - const xfs_inogrp_t *buffer, /* buffer to read from */ + const struct xfs_inogrp *buffer, /* buffer to read from */ long count, /* # of elements to read */ long *written) /* # of bytes written */ { @@ -578,127 +560,104 @@ xfs_inumbers_fmt( */ int /* error status */ xfs_inumbers( - xfs_mount_t *mp, /* mount point for filesystem */ - xfs_ino_t *lastino, /* last inode returned */ - int *count, /* size of buffer/count returned */ - void __user *ubuffer,/* buffer with inode descriptions */ - inumbers_fmt_pf formatter) + struct xfs_mount *mp,/* mount point for filesystem */ + xfs_ino_t *lastino,/* last inode returned */ + int *count,/* size of buffer/count returned */ + void __user *ubuffer,/* buffer with inode descriptions */ + inumbers_fmt_pf formatter) { - xfs_buf_t *agbp; - xfs_agino_t agino; - xfs_agnumber_t agno; - int bcount; - xfs_inogrp_t *buffer; - int bufidx; - xfs_btree_cur_t *cur; - int error; - xfs_inobt_rec_incore_t r; - int i; - xfs_ino_t ino; - int left; - int tmp; - - ino = (xfs_ino_t)*lastino; - agno = XFS_INO_TO_AGNO(mp, ino); - agino = XFS_INO_TO_AGINO(mp, ino); - left = *count; + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, *lastino); + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, *lastino); + struct xfs_btree_cur *cur = NULL; + struct xfs_buf *agbp = NULL; + struct xfs_inogrp *buffer; + int bcount; + int left = *count; + int bufidx = 0; + int error = 0; + *count = 0; + if (agno >= mp->m_sb.sb_agcount || + *lastino != XFS_AGINO_TO_INO(mp, agno, agino)) + return error; + bcount = MIN(left, (int)(PAGE_SIZE / sizeof(*buffer))); buffer = kmem_alloc(bcount * sizeof(*buffer), KM_SLEEP); - error = bufidx = 0; - cur = NULL; - agbp = NULL; - while (left > 0 && agno < mp->m_sb.sb_agcount) { - if (agbp == NULL) { + do { + struct xfs_inobt_rec_incore r; + int stat; + + if (!agbp) { error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); - if (error) { - /* - * If we can't read the AGI of this ag, - * then just skip to the next one. - */ - ASSERT(cur == NULL); - agbp = NULL; - agno++; - agino = 0; - continue; - } + if (error) + break; + cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_INO); error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_GE, - &tmp); - if (error) { - xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); - cur = NULL; - xfs_buf_relse(agbp); - agbp = NULL; - /* - * Move up the last inode in the current - * chunk. The lookup_ge will always get - * us the first inode in the next chunk. 
- */ - agino += XFS_INODES_PER_CHUNK - 1; - continue; - } - } - error = xfs_inobt_get_rec(cur, &r, &i); - if (error || i == 0) { - xfs_buf_relse(agbp); - agbp = NULL; - xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); - cur = NULL; - agno++; - agino = 0; - continue; + &stat); + if (error) + break; + if (!stat) + goto next_ag; } + + error = xfs_inobt_get_rec(cur, &r, &stat); + if (error) + break; + if (!stat) + goto next_ag; + agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1; buffer[bufidx].xi_startino = XFS_AGINO_TO_INO(mp, agno, r.ir_startino); buffer[bufidx].xi_alloccount = XFS_INODES_PER_CHUNK - r.ir_freecount; buffer[bufidx].xi_allocmask = ~r.ir_free; - bufidx++; - left--; - if (bufidx == bcount) { - long written; - if (formatter(ubuffer, buffer, bufidx, &written)) { - error = XFS_ERROR(EFAULT); + if (++bufidx == bcount) { + long written; + + error = formatter(ubuffer, buffer, bufidx, &written); + if (error) break; - } ubuffer += written; *count += bufidx; bufidx = 0; } - if (left) { - error = xfs_btree_increment(cur, 0, &tmp); - if (error) { - xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); - cur = NULL; - xfs_buf_relse(agbp); - agbp = NULL; - /* - * The agino value has already been bumped. - * Just try to skip up to it. - */ - agino += XFS_INODES_PER_CHUNK; - continue; - } - } - } + if (!--left) + break; + + error = xfs_btree_increment(cur, 0, &stat); + if (error) + break; + if (stat) + continue; + +next_ag: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + cur = NULL; + xfs_buf_relse(agbp); + agbp = NULL; + agino = 0; + } while (++agno < mp->m_sb.sb_agcount); + if (!error) { if (bufidx) { - long written; - if (formatter(ubuffer, buffer, bufidx, &written)) - error = XFS_ERROR(EFAULT); - else + long written; + + error = formatter(ubuffer, buffer, bufidx, &written); + if (!error) *count += bufidx; } *lastino = XFS_AGINO_TO_INO(mp, agno, agino); } + kmem_free(buffer); if (cur) xfs_btree_del_cursor(cur, (error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR)); if (agbp) xfs_buf_relse(agbp); + return error; } diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h index 97295d91d170..aaed08022eb9 100644 --- a/fs/xfs/xfs_itable.h +++ b/fs/xfs/xfs_itable.h @@ -30,6 +30,22 @@ typedef int (*bulkstat_one_pf)(struct xfs_mount *mp, int *ubused, int *stat); +struct xfs_bulkstat_agichunk { + xfs_ino_t ac_lastino; /* last inode returned */ + char __user **ac_ubuffer;/* pointer into user's buffer */ + int ac_ubleft; /* bytes left in user's buffer */ + int ac_ubelem; /* spaces used in user's buffer */ +}; + +int +xfs_bulkstat_ag_ichunk( + struct xfs_mount *mp, + xfs_agnumber_t agno, + struct xfs_inobt_rec_incore *irbp, + bulkstat_one_pf formatter, + size_t statstruct_size, + struct xfs_bulkstat_agichunk *acp); + /* * Values for stat return value. */ @@ -50,13 +66,6 @@ xfs_bulkstat( char __user *ubuffer,/* buffer with inode stats */ int *done); /* 1 if there are more stats to get */ -int -xfs_bulkstat_single( - xfs_mount_t *mp, - xfs_ino_t *lastinop, - char __user *buffer, - int *done); - typedef int (*bulkstat_one_fmt_pf)( /* used size in bytes or negative error */ void __user *ubuffer, /* buffer to write to */ int ubsize, /* remaining user buffer sz */ diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 825249d2dfc1..d10dc8f397c9 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -21,18 +21,6 @@ #include <linux/types.h> /* - * XFS_BIG_BLKNOS needs block layer disk addresses to be 64 bits. - * XFS_BIG_INUMS requires XFS_BIG_BLKNOS to be set. 
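The rewritten xfs_inumbers() above collapses four duplicated release-and-advance sequences into one loop with a single next_ag cleanup point that drops the btree cursor and AGI buffer before moving to the next allocation group. The control shape, abstracted into a runnable toy; this models the structure only, not the btree calls:

    #include <stdio.h>

    #define AG_COUNT 3

    static int records_in_ag(int agno)   /* fake per-AG record counts */
    {
        return agno + 1;
    }

    int main(void)
    {
        int agno = 0, have_cursor = 0, remaining = 0, consumed = 0;

        while (agno < AG_COUNT) {
            if (!have_cursor) {          /* read AGI, build cursor */
                remaining = records_in_ag(agno);
                have_cursor = 1;
            }
            if (remaining) {             /* consume one btree record */
                consumed++;
                remaining--;
                continue;
            }
            /* next_ag: the one place cursor + AGI buffer are torn down */
            have_cursor = 0;
            agno++;
        }
        printf("consumed %d records\n", consumed);  /* 1 + 2 + 3 = 6 */
        return 0;
    }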
- */ -#if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64) -# define XFS_BIG_BLKNOS 1 -# define XFS_BIG_INUMS 1 -#else -# define XFS_BIG_BLKNOS 0 -# define XFS_BIG_INUMS 0 -#endif - -/* * Kernel specific type declarations for XFS */ typedef signed char __int8_t; @@ -113,7 +101,7 @@ typedef __uint64_t __psunsigned_t; #include <asm/byteorder.h> #include <asm/unaligned.h> -#include "xfs_vnode.h" +#include "xfs_fs.h" #include "xfs_stats.h" #include "xfs_sysctl.h" #include "xfs_iops.h" @@ -191,6 +179,17 @@ typedef __uint64_t __psunsigned_t; #define MAX(a,b) (max(a,b)) #define howmany(x, y) (((x)+((y)-1))/(y)) +/* + * XFS wrapper structure for sysfs support. It depends on external data + * structures and is embedded in various internal data structures to implement + * the XFS sysfs object hierarchy. Define it here for broad access throughout + * the codebase. + */ +struct xfs_kobj { + struct kobject kobject; + struct completion complete; +}; + /* Kernel uid/gid conversion. These are used to convert to/from the on disk * uid_t/gid_t types to the kuid_t/kgid_t types that the kernel uses internally. * The conversion here is type only, the value will remain the same since we @@ -331,7 +330,7 @@ static inline __uint64_t roundup_64(__uint64_t x, __uint32_t y) { x += y - 1; do_div(x, y); - return(x * y); + return x * y; } static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 292308dede6d..ca4fd5bd8522 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -34,6 +34,7 @@ #include "xfs_trace.h" #include "xfs_fsops.h" #include "xfs_cksum.h" +#include "xfs_sysfs.h" kmem_zone_t *xfs_log_ticket_zone; @@ -283,7 +284,7 @@ xlog_grant_head_wait( return 0; shutdown: list_del_init(&tic->t_queue); - return XFS_ERROR(EIO); + return -EIO; } /* @@ -377,7 +378,7 @@ xfs_log_regrant( int error = 0; if (XLOG_FORCED_SHUTDOWN(log)) - return XFS_ERROR(EIO); + return -EIO; XFS_STATS_INC(xs_try_logspace); @@ -446,7 +447,7 @@ xfs_log_reserve( ASSERT(client == XFS_TRANSACTION || client == XFS_LOG); if (XLOG_FORCED_SHUTDOWN(log)) - return XFS_ERROR(EIO); + return -EIO; XFS_STATS_INC(xs_try_logspace); @@ -454,7 +455,7 @@ xfs_log_reserve( tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, KM_SLEEP | KM_MAYFAIL); if (!tic) - return XFS_ERROR(ENOMEM); + return -ENOMEM; tic->t_trans_type = t_type; *ticp = tic; @@ -590,7 +591,7 @@ xfs_log_release_iclog( { if (xlog_state_release_iclog(mp->m_log, iclog)) { xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); - return EIO; + return -EIO; } return 0; @@ -628,7 +629,7 @@ xfs_log_mount( mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks); if (IS_ERR(mp->m_log)) { - error = -PTR_ERR(mp->m_log); + error = PTR_ERR(mp->m_log); goto out; } @@ -652,18 +653,18 @@ xfs_log_mount( xfs_warn(mp, "Log size %d blocks too small, minimum size is %d blocks", mp->m_sb.sb_logblocks, min_logfsbs); - error = EINVAL; + error = -EINVAL; } else if (mp->m_sb.sb_logblocks > XFS_MAX_LOG_BLOCKS) { xfs_warn(mp, "Log size %d blocks too large, maximum size is %lld blocks", mp->m_sb.sb_logblocks, XFS_MAX_LOG_BLOCKS); - error = EINVAL; + error = -EINVAL; } else if (XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks) > XFS_MAX_LOG_BYTES) { xfs_warn(mp, "log size %lld bytes too large, maximum size is %lld bytes", XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks), XFS_MAX_LOG_BYTES); - error = EINVAL; + error = -EINVAL; } if (error) { if (xfs_sb_version_hascrc(&mp->m_sb)) { @@ -707,6 +708,11 @@ xfs_log_mount( } } + error = xfs_sysfs_init(&mp->m_log->l_kobj,
&xfs_log_ktype, &mp->m_kobj, + "log"); + if (error) + goto out_destroy_ail; + /* Normal transactions can now occur */ mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY; @@ -947,6 +953,9 @@ xfs_log_unmount( xfs_log_quiesce(mp); xfs_trans_ail_destroy(mp); + + xfs_sysfs_del(&mp->m_log->l_kobj); + xlog_dealloc_log(mp->m_log); } @@ -1313,7 +1322,7 @@ xlog_alloc_log( xlog_in_core_t *iclog, *prev_iclog=NULL; xfs_buf_t *bp; int i; - int error = ENOMEM; + int error = -ENOMEM; uint log2_size = 0; log = kmem_zalloc(sizeof(struct xlog), KM_MAYFAIL); @@ -1340,7 +1349,7 @@ xlog_alloc_log( xlog_grant_head_init(&log->l_reserve_head); xlog_grant_head_init(&log->l_write_head); - error = EFSCORRUPTED; + error = -EFSCORRUPTED; if (xfs_sb_version_hassector(&mp->m_sb)) { log2_size = mp->m_sb.sb_logsectlog; if (log2_size < BBSHIFT) { @@ -1369,8 +1378,14 @@ xlog_alloc_log( xlog_get_iclog_buffer_size(mp, log); - error = ENOMEM; - bp = xfs_buf_alloc(mp->m_logdev_targp, 0, BTOBB(log->l_iclog_size), 0); + /* + * Use a NULL block for the extra log buffer used during splits so that + * it will trigger errors if we ever try to do IO on it without first + * having set it up properly. + */ + error = -ENOMEM; + bp = xfs_buf_alloc(mp->m_logdev_targp, XFS_BUF_DADDR_NULL, + BTOBB(log->l_iclog_size), 0); if (!bp) goto out_free_log; @@ -1463,7 +1478,7 @@ out_free_iclog: out_free_log: kmem_free(log); out: - return ERR_PTR(-error); + return ERR_PTR(error); } /* xlog_alloc_log */ @@ -1661,7 +1676,7 @@ xlog_bdstrat( xfs_buf_lock(bp); if (iclog->ic_state & XLOG_STATE_IOERROR) { - xfs_buf_ioerror(bp, EIO); + xfs_buf_ioerror(bp, -EIO); xfs_buf_stale(bp); xfs_buf_ioend(bp, 0); /* @@ -2360,7 +2375,7 @@ xlog_write( ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags); if (!ophdr) - return XFS_ERROR(EIO); + return -EIO; xlog_write_adv_cnt(&ptr, &len, &log_offset, sizeof(struct xlog_op_header)); @@ -2859,7 +2874,7 @@ restart: spin_lock(&log->l_icloglock); if (XLOG_FORCED_SHUTDOWN(log)) { spin_unlock(&log->l_icloglock); - return XFS_ERROR(EIO); + return -EIO; } iclog = log->l_iclog; @@ -3047,7 +3062,7 @@ xlog_state_release_iclog( int sync = 0; /* do we sync? */ if (iclog->ic_state & XLOG_STATE_IOERROR) - return XFS_ERROR(EIO); + return -EIO; ASSERT(atomic_read(&iclog->ic_refcnt) > 0); if (!atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock)) @@ -3055,7 +3070,7 @@ xlog_state_release_iclog( if (iclog->ic_state & XLOG_STATE_IOERROR) { spin_unlock(&log->l_icloglock); - return XFS_ERROR(EIO); + return -EIO; } ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE || iclog->ic_state == XLOG_STATE_WANT_SYNC); @@ -3172,7 +3187,7 @@ _xfs_log_force( iclog = log->l_iclog; if (iclog->ic_state & XLOG_STATE_IOERROR) { spin_unlock(&log->l_icloglock); - return XFS_ERROR(EIO); + return -EIO; } /* If the head iclog is not active nor dirty, we just attach @@ -3210,7 +3225,7 @@ _xfs_log_force( spin_unlock(&log->l_icloglock); if (xlog_state_release_iclog(log, iclog)) - return XFS_ERROR(EIO); + return -EIO; if (log_flushed) *log_flushed = 1; @@ -3246,7 +3261,7 @@ maybe_sleep: */ if (iclog->ic_state & XLOG_STATE_IOERROR) { spin_unlock(&log->l_icloglock); - return XFS_ERROR(EIO); + return -EIO; } XFS_STATS_INC(xs_log_force_sleep); xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); @@ -3256,7 +3271,7 @@ maybe_sleep: * and the memory read should be atomic. 
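The log now registers a sysfs object at mount (xfs_sysfs_init(..., "log")) and removes it at unmount (xfs_sysfs_del()), using the new struct xfs_kobj from xfs_linux.h. The completion packed next to the kobject exists so teardown can wait until sysfs has really released the object. The helpers themselves live in the new xfs_sysfs.[ch], which this part of the diff does not show; the sketch below is the conventional shape of that pattern, not a copy of the patch:

    #include <linux/completion.h>
    #include <linux/kobject.h>

    /* ktype ->release fires once the last sysfs reference is gone */
    static void xkobj_release(struct kobject *kobject)
    {
        struct xfs_kobj *kobj =
            container_of(kobject, struct xfs_kobj, kobject);

        complete(&kobj->complete);
    }

    static int xkobj_init(struct xfs_kobj *kobj, struct kobj_type *ktype,
                          struct kobject *parent, const char *name)
    {
        init_completion(&kobj->complete);
        return kobject_init_and_add(&kobj->kobject, ktype, parent,
                                    "%s", name);
    }

    static void xkobj_del(struct xfs_kobj *kobj)
    {
        kobject_del(&kobj->kobject);
        kobject_put(&kobj->kobject);           /* drop our reference */
        wait_for_completion(&kobj->complete);  /* wait for ->release */
    }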
*/ if (iclog->ic_state & XLOG_STATE_IOERROR) - return XFS_ERROR(EIO); + return -EIO; if (log_flushed) *log_flushed = 1; } else { @@ -3324,7 +3339,7 @@ try_again: iclog = log->l_iclog; if (iclog->ic_state & XLOG_STATE_IOERROR) { spin_unlock(&log->l_icloglock); - return XFS_ERROR(EIO); + return -EIO; } do { @@ -3375,7 +3390,7 @@ try_again: xlog_state_switch_iclogs(log, iclog, 0); spin_unlock(&log->l_icloglock); if (xlog_state_release_iclog(log, iclog)) - return XFS_ERROR(EIO); + return -EIO; if (log_flushed) *log_flushed = 1; spin_lock(&log->l_icloglock); @@ -3390,7 +3405,7 @@ try_again: */ if (iclog->ic_state & XLOG_STATE_IOERROR) { spin_unlock(&log->l_icloglock); - return XFS_ERROR(EIO); + return -EIO; } XFS_STATS_INC(xs_log_force_sleep); xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); @@ -3400,7 +3415,7 @@ try_again: * and the memory read should be atomic. */ if (iclog->ic_state & XLOG_STATE_IOERROR) - return XFS_ERROR(EIO); + return -EIO; if (log_flushed) *log_flushed = 1; diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index b3425b34e3d5..f6b79e5325dd 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -78,8 +78,6 @@ xlog_cil_init_post_recovery( { log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log); log->l_cilp->xc_ctx->sequence = 1; - log->l_cilp->xc_ctx->commit_lsn = xlog_assign_lsn(log->l_curr_cycle, - log->l_curr_block); } /* @@ -634,7 +632,7 @@ out_abort_free_ticket: xfs_log_ticket_put(tic); out_abort: xlog_cil_committed(ctx, XFS_LI_ABORTED); - return XFS_ERROR(EIO); + return -EIO; } static void @@ -928,12 +926,12 @@ xlog_cil_init( cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL); if (!cil) - return ENOMEM; + return -ENOMEM; ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL); if (!ctx) { kmem_free(cil); - return ENOMEM; + return -ENOMEM; } INIT_WORK(&cil->xc_push_work, xlog_cil_push_work); diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 9bc403a9e54f..db7cbdeb2b42 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -405,6 +405,8 @@ struct xlog { struct xlog_grant_head l_reserve_head; struct xlog_grant_head l_write_head; + struct xfs_kobj l_kobj; + /* The following field are used for debugging; need to hold icloglock */ #ifdef DEBUG char *l_iclog_bak[XLOG_MAX_ICLOGS]; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 981af0f6504b..1fd5787add99 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -179,7 +179,7 @@ xlog_bread_noalign( xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", nbblks); XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); - return EFSCORRUPTED; + return -EFSCORRUPTED; } blk_no = round_down(blk_no, log->l_sectBBsize); @@ -194,7 +194,7 @@ xlog_bread_noalign( bp->b_error = 0; if (XFS_FORCED_SHUTDOWN(log->l_mp)) - return XFS_ERROR(EIO); + return -EIO; xfs_buf_iorequest(bp); error = xfs_buf_iowait(bp); @@ -268,7 +268,7 @@ xlog_bwrite( xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", nbblks); XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); - return EFSCORRUPTED; + return -EFSCORRUPTED; } blk_no = round_down(blk_no, log->l_sectBBsize); @@ -330,14 +330,14 @@ xlog_header_check_recover( xlog_header_check_dump(mp, head); XFS_ERROR_REPORT("xlog_header_check_recover(1)", XFS_ERRLEVEL_HIGH, mp); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { xfs_warn(mp, "dirty log entry has mismatched uuid - can't recover"); xlog_header_check_dump(mp, head); 
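Alongside the sign conversion, these log-recovery hunks retire an in-band signalling wart: xlog_find_verify_log_record() here, and xlog_find_zeroed() just below, used -1 to mean "found it" and positive values for errors, a scheme that cannot coexist with negative errnos. The new contract is: negative means a real failure, 0 means nothing found, 1 means found with the out-parameter valid. In caller form (a standalone sketch, not the kernel code):

    #include <errno.h>

    /* stand-in for xlog_find_zeroed(): 1 = zeroed tail found and
     * *blk_no set, 0 = log fully written, <0 = failure */
    static int find_zeroed(int devfd, int *blk_no)
    {
        if (devfd < 0)
            return -EINVAL;
        *blk_no = 42;          /* illustrative block number */
        return 1;
    }

    static int find_head(int devfd)
    {
        int first_blk = 0;
        int error = find_zeroed(devfd, &first_blk);

        if (error < 0)
            return error;      /* failure propagates unambiguously */
        if (error == 1)
            return first_blk;  /* "found" is no longer spelled -1 */
        return 0;              /* keep searching the written log */
    }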
XFS_ERROR_REPORT("xlog_header_check_recover(2)", XFS_ERRLEVEL_HIGH, mp); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } return 0; } @@ -364,7 +364,7 @@ xlog_header_check_mount( xlog_header_check_dump(mp, head); XFS_ERROR_REPORT("xlog_header_check_mount", XFS_ERRLEVEL_HIGH, mp); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } return 0; } @@ -462,7 +462,7 @@ xlog_find_verify_cycle( while (!(bp = xlog_get_bp(log, bufblks))) { bufblks >>= 1; if (bufblks < log->l_sectBBsize) - return ENOMEM; + return -ENOMEM; } for (i = start_blk; i < start_blk + nbblks; i += bufblks) { @@ -524,7 +524,7 @@ xlog_find_verify_log_record( if (!(bp = xlog_get_bp(log, num_blks))) { if (!(bp = xlog_get_bp(log, 1))) - return ENOMEM; + return -ENOMEM; smallmem = 1; } else { error = xlog_bread(log, start_blk, num_blks, bp, &offset); @@ -539,7 +539,7 @@ xlog_find_verify_log_record( xfs_warn(log->l_mp, "Log inconsistent (didn't find previous header)"); ASSERT(0); - error = XFS_ERROR(EIO); + error = -EIO; goto out; } @@ -564,7 +564,7 @@ xlog_find_verify_log_record( * will be called again for the end of the physical log. */ if (i == -1) { - error = -1; + error = 1; goto out; } @@ -628,7 +628,12 @@ xlog_find_head( int error, log_bbnum = log->l_logBBsize; /* Is the end of the log device zeroed? */ - if ((error = xlog_find_zeroed(log, &first_blk)) == -1) { + error = xlog_find_zeroed(log, &first_blk); + if (error < 0) { + xfs_warn(log->l_mp, "empty log check failed"); + return error; + } + if (error == 1) { *return_head_blk = first_blk; /* Is the whole lot zeroed? */ @@ -641,15 +646,12 @@ xlog_find_head( } return 0; - } else if (error) { - xfs_warn(log->l_mp, "empty log check failed"); - return error; } first_blk = 0; /* get cycle # of 1st block */ bp = xlog_get_bp(log, 1); if (!bp) - return ENOMEM; + return -ENOMEM; error = xlog_bread(log, 0, 1, bp, &offset); if (error) @@ -818,29 +820,29 @@ validate_head: start_blk = head_blk - num_scan_bblks; /* don't read head_blk */ /* start ptr at last block ptr before head_blk */ - if ((error = xlog_find_verify_log_record(log, start_blk, - &head_blk, 0)) == -1) { - error = XFS_ERROR(EIO); - goto bp_err; - } else if (error) + error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0); + if (error == 1) + error = -EIO; + if (error) goto bp_err; } else { start_blk = 0; ASSERT(head_blk <= INT_MAX); - if ((error = xlog_find_verify_log_record(log, start_blk, - &head_blk, 0)) == -1) { + error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0); + if (error < 0) + goto bp_err; + if (error == 1) { /* We hit the beginning of the log during our search */ start_blk = log_bbnum - (num_scan_bblks - head_blk); new_blk = log_bbnum; ASSERT(start_blk <= INT_MAX && (xfs_daddr_t) log_bbnum-start_blk >= 0); ASSERT(head_blk <= INT_MAX); - if ((error = xlog_find_verify_log_record(log, - start_blk, &new_blk, - (int)head_blk)) == -1) { - error = XFS_ERROR(EIO); - goto bp_err; - } else if (error) + error = xlog_find_verify_log_record(log, start_blk, + &new_blk, (int)head_blk); + if (error == 1) + error = -EIO; + if (error) goto bp_err; if (new_blk != log_bbnum) head_blk = new_blk; @@ -911,7 +913,7 @@ xlog_find_tail( bp = xlog_get_bp(log, 1); if (!bp) - return ENOMEM; + return -ENOMEM; if (*head_blk == 0) { /* special case */ error = xlog_bread(log, 0, 1, bp, &offset); if (error) @@ -961,7 +963,7 @@ xlog_find_tail( xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__); xlog_put_bp(bp); ASSERT(0); - return XFS_ERROR(EIO); + return -EIO; } /* find blk_no of tail of log 
*/ @@ -1092,8 +1094,8 @@ done: * * Return: * 0 => the log is completely written to - * -1 => use *blk_no as the first block of the log - * >0 => error has occurred + * 1 => use *blk_no as the first block of the log + * <0 => error has occurred */ STATIC int xlog_find_zeroed( @@ -1112,7 +1114,7 @@ xlog_find_zeroed( /* check totally zeroed log */ bp = xlog_get_bp(log, 1); if (!bp) - return ENOMEM; + return -ENOMEM; error = xlog_bread(log, 0, 1, bp, &offset); if (error) goto bp_err; @@ -1121,7 +1123,7 @@ xlog_find_zeroed( if (first_cycle == 0) { /* completely zeroed log */ *blk_no = 0; xlog_put_bp(bp); - return -1; + return 1; } /* check partially zeroed log */ @@ -1141,7 +1143,7 @@ xlog_find_zeroed( */ xfs_warn(log->l_mp, "Log inconsistent or not a log (last==0, first!=1)"); - error = XFS_ERROR(EINVAL); + error = -EINVAL; goto bp_err; } @@ -1179,19 +1181,18 @@ xlog_find_zeroed( * Potentially backup over partial log record write. We don't need * to search the end of the log because we know it is zero. */ - if ((error = xlog_find_verify_log_record(log, start_blk, - &last_blk, 0)) == -1) { - error = XFS_ERROR(EIO); - goto bp_err; - } else if (error) - goto bp_err; + error = xlog_find_verify_log_record(log, start_blk, &last_blk, 0); + if (error == 1) + error = -EIO; + if (error) + goto bp_err; *blk_no = last_blk; bp_err: xlog_put_bp(bp); if (error) return error; - return -1; + return 1; } /* @@ -1251,7 +1252,7 @@ xlog_write_log_records( while (!(bp = xlog_get_bp(log, bufblks))) { bufblks >>= 1; if (bufblks < sectbb) - return ENOMEM; + return -ENOMEM; } /* We may need to do a read at the start to fill in part of @@ -1354,7 +1355,7 @@ xlog_clear_stale_blocks( if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) { XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)", XFS_ERRLEVEL_LOW, log->l_mp); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } tail_distance = tail_block + (log->l_logBBsize - head_block); } else { @@ -1366,7 +1367,7 @@ xlog_clear_stale_blocks( if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){ XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)", XFS_ERRLEVEL_LOW, log->l_mp); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } tail_distance = tail_block - head_block; } @@ -1551,7 +1552,7 @@ xlog_recover_add_to_trans( xfs_warn(log->l_mp, "%s: bad header magic number", __func__); ASSERT(0); - return XFS_ERROR(EIO); + return -EIO; } if (len == sizeof(xfs_trans_header_t)) xlog_recover_add_item(&trans->r_itemq); @@ -1581,7 +1582,7 @@ xlog_recover_add_to_trans( in_f->ilf_size); ASSERT(0); kmem_free(ptr); - return XFS_ERROR(EIO); + return -EIO; } item->ri_total = in_f->ilf_size; @@ -1702,7 +1703,7 @@ xlog_recover_reorder_trans( */ if (!list_empty(&sort_list)) list_splice_init(&sort_list, &trans->r_itemq); - error = XFS_ERROR(EIO); + error = -EIO; goto out; } } @@ -1943,7 +1944,7 @@ xlog_recover_do_inode_buffer( item, bp); XFS_ERROR_REPORT("xlog_recover_do_inode_buf", XFS_ERRLEVEL_LOW, mp); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp, @@ -2125,6 +2126,17 @@ xlog_recover_validate_buf_type( __uint16_t magic16; __uint16_t magicda; + /* + * We can only do post recovery validation on items on CRC enabled + * filesystems as we need to know when the buffer was written to be able + * to determine if we should have replayed the item. If we replay old + * metadata over a newer buffer, then it will enter a temporarily + * inconsistent state resulting in verification failures.
Hence for now + * just avoid the verification stage for non-crc filesystems + */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + magic32 = be32_to_cpu(*(__be32 *)bp->b_addr); magic16 = be16_to_cpu(*(__be16*)bp->b_addr); magicda = be16_to_cpu(info->magic); @@ -2162,8 +2174,6 @@ xlog_recover_validate_buf_type( bp->b_ops = &xfs_agf_buf_ops; break; case XFS_BLFT_AGFL_BUF: - if (!xfs_sb_version_hascrc(&mp->m_sb)) - break; if (magic32 != XFS_AGFL_MAGIC) { xfs_warn(mp, "Bad AGFL block magic!"); ASSERT(0); @@ -2196,10 +2206,6 @@ xlog_recover_validate_buf_type( #endif break; case XFS_BLFT_DINO_BUF: - /* - * we get here with inode allocation buffers, not buffers that - * track unlinked list changes. - */ if (magic16 != XFS_DINODE_MAGIC) { xfs_warn(mp, "Bad INODE block magic!"); ASSERT(0); @@ -2279,8 +2285,6 @@ xlog_recover_validate_buf_type( bp->b_ops = &xfs_attr3_leaf_buf_ops; break; case XFS_BLFT_ATTR_RMT_BUF: - if (!xfs_sb_version_hascrc(&mp->m_sb)) - break; if (magic32 != XFS_ATTR3_RMT_MAGIC) { xfs_warn(mp, "Bad attr remote magic!"); ASSERT(0); @@ -2387,16 +2391,7 @@ xlog_recover_do_reg_buffer( /* Shouldn't be any more regions */ ASSERT(i == item->ri_total); - /* - * We can only do post recovery validation on items on CRC enabled - * fielsystems as we need to know when the buffer was written to be able - * to determine if we should have replayed the item. If we replay old - * metadata over a newer buffer, then it will enter a temporarily - * inconsistent state resulting in verification failures. Hence for now - * just avoid the verification stage for non-crc filesystems - */ - if (xfs_sb_version_hascrc(&mp->m_sb)) - xlog_recover_validate_buf_type(mp, bp, buf_f); + xlog_recover_validate_buf_type(mp, bp, buf_f); } /* @@ -2404,8 +2399,11 @@ xlog_recover_do_reg_buffer( * Simple algorithm: if we have found a QUOTAOFF log item of the same type * (ie. USR or GRP), then just toss this buffer away; don't recover it. * Else, treat it as a regular buffer and do recovery. + * + * Return false if the buffer was tossed and true if we recovered the buffer to + * indicate to the caller if the buffer needs writing. */ -STATIC void +STATIC bool xlog_recover_do_dquot_buffer( struct xfs_mount *mp, struct xlog *log, @@ -2420,9 +2418,8 @@ xlog_recover_do_dquot_buffer( /* * Filesystems are required to send in quota flags at mount time. */ - if (mp->m_qflags == 0) { - return; - } + if (!mp->m_qflags) + return false; type = 0; if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF) @@ -2435,9 +2432,10 @@ xlog_recover_do_dquot_buffer( * This type of quotas was turned off, so ignore this buffer */ if (log->l_quotaoffs_flag & type) - return; + return false; xlog_recover_do_reg_buffer(mp, item, bp, buf_f); + return true; } /* @@ -2496,7 +2494,7 @@ xlog_recover_buffer_pass2( bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, buf_flags, NULL); if (!bp) - return XFS_ERROR(ENOMEM); + return -ENOMEM; error = bp->b_error; if (error) { xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)"); @@ -2504,23 +2502,44 @@ xlog_recover_buffer_pass2( } /* - * recover the buffer only if we get an LSN from it and it's less than + * Recover the buffer only if we get an LSN from it and it's less than * the lsn of the transaction we are replaying. + * + * Note that we have to be extremely careful of readahead here. 
+ * Readahead does not attach verifiers to the buffers, so if we don't + * actually do any replay after readahead because the LSN we found + * in the buffer is more recent than the current transaction, then we + * need to attach the verifier directly. Failure to do so can lead to + * future recovery actions (e.g. EFI and unlinked list recovery) + * operating on the buffers without the verifier attached. This + * can lead to blocks on disk having the correct content but a stale + * CRC. + * + * It is safe to assume these clean buffers are currently up to date. + * If the buffer is dirtied by a later transaction being replayed, then + * the verifier will be reset to match whatever recovery turns that + * buffer into. + */ lsn = xlog_recover_get_buf_lsn(mp, bp); - if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) + if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { + xlog_recover_validate_buf_type(mp, bp, buf_f); goto out_release; + } if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); + if (error) + goto out_release; } else if (buf_f->blf_flags & (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { - xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); + bool dirty; + + dirty = xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); + if (!dirty) + goto out_release; } else { xlog_recover_do_reg_buffer(mp, item, bp, buf_f); } - if (error) - goto out_release; /* * Perform delayed write on the buffer. Asynchronous writes will be @@ -2598,7 +2617,7 @@ xfs_recover_inode_owner_change( ip = xfs_inode_alloc(mp, in_f->ilf_ino); if (!ip) - return ENOMEM; + return -ENOMEM; /* instantiate the inode */ xfs_dinode_from_disk(&ip->i_d, dip); @@ -2676,7 +2695,7 @@ xlog_recover_inode_pass2( bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0, &xfs_inode_buf_ops); if (!bp) { - error = ENOMEM; + error = -ENOMEM; goto error; } error = bp->b_error; @@ -2697,7 +2716,7 @@ xlog_recover_inode_pass2( __func__, dip, bp, in_f->ilf_ino); XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)", XFS_ERRLEVEL_LOW, mp); - error = EFSCORRUPTED; + error = -EFSCORRUPTED; goto out_release; } dicp = item->ri_buf[1].i_addr; @@ -2707,7 +2726,7 @@ xlog_recover_inode_pass2( __func__, item, in_f->ilf_ino); XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)", XFS_ERRLEVEL_LOW, mp); - error = EFSCORRUPTED; + error = -EFSCORRUPTED; goto out_release; } @@ -2764,7 +2783,7 @@ xlog_recover_inode_pass2( "%s: Bad regular inode log record, rec ptr 0x%p, " "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", __func__, item, dip, bp, in_f->ilf_ino); - error = EFSCORRUPTED; + error = -EFSCORRUPTED; goto out_release; } } else if (unlikely(S_ISDIR(dicp->di_mode))) { @@ -2777,7 +2796,7 @@ xlog_recover_inode_pass2( "%s: Bad dir inode log record, rec ptr 0x%p, " "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", __func__, item, dip, bp, in_f->ilf_ino); - error = EFSCORRUPTED; + error = -EFSCORRUPTED; goto out_release; } } @@ -2790,7 +2809,7 @@ xlog_recover_inode_pass2( __func__, item, dip, bp, in_f->ilf_ino, dicp->di_nextents + dicp->di_anextents, dicp->di_nblocks); - error = EFSCORRUPTED; + error = -EFSCORRUPTED; goto out_release; } if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) { @@ -2800,7 +2819,7 @@ xlog_recover_inode_pass2( "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, " "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__, item, dip, bp, in_f->ilf_ino, dicp->di_forkoff); - error = EFSCORRUPTED; + error = -EFSCORRUPTED; goto out_release; } isize =
xfs_icdinode_size(dicp->di_version); @@ -2810,7 +2829,7 @@ xlog_recover_inode_pass2( xfs_alert(mp, "%s: Bad inode log record length %d, rec ptr 0x%p", __func__, item->ri_buf[1].i_len, item); - error = EFSCORRUPTED; + error = -EFSCORRUPTED; goto out_release; } @@ -2898,7 +2917,7 @@ xlog_recover_inode_pass2( default: xfs_warn(log->l_mp, "%s: Invalid flag", __func__); ASSERT(0); - error = EIO; + error = -EIO; goto out_release; } } @@ -2919,7 +2938,7 @@ out_release: error: if (need_free) kmem_free(in_f); - return XFS_ERROR(error); + return error; } /* @@ -2946,7 +2965,7 @@ xlog_recover_quotaoff_pass1( if (qoff_f->qf_flags & XFS_GQUOTA_ACCT) log->l_quotaoffs_flag |= XFS_DQ_GROUP; - return (0); + return 0; } /* @@ -2971,17 +2990,17 @@ xlog_recover_dquot_pass2( * Filesystems are required to send in quota flags at mount time. */ if (mp->m_qflags == 0) - return (0); + return 0; recddq = item->ri_buf[1].i_addr; if (recddq == NULL) { xfs_alert(log->l_mp, "NULL dquot in %s.", __func__); - return XFS_ERROR(EIO); + return -EIO; } if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) { xfs_alert(log->l_mp, "dquot too small (%d) in %s.", item->ri_buf[1].i_len, __func__); - return XFS_ERROR(EIO); + return -EIO; } /* @@ -2990,7 +3009,7 @@ xlog_recover_dquot_pass2( type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP); ASSERT(type); if (log->l_quotaoffs_flag & type) - return (0); + return 0; /* * At this point we know that quota was _not_ turned off. @@ -3007,12 +3026,19 @@ xlog_recover_dquot_pass2( error = xfs_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, "xlog_recover_dquot_pass2 (log copy)"); if (error) - return XFS_ERROR(EIO); + return -EIO; ASSERT(dq_f->qlf_len == 1); + /* + * At this point we are assuming that the dquots have been allocated + * and hence the buffer has valid dquots stamped in it. It should, + * therefore, pass verifier validation. If the dquot is bad, then + * we'll return an error here, so we don't need to specifically check + * the dquot in the buffer after the verifier has run. + */ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno, XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp, - NULL); + &xfs_dquot_buf_ops); if (error) return error; @@ -3020,18 +3046,6 @@ xlog_recover_dquot_pass2( ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset); /* - * At least the magic num portion should be on disk because this - * was among a chunk of dquots created earlier, and we did some - * minimal initialization then. - */ - error = xfs_dqcheck(mp, ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, - "xlog_recover_dquot_pass2"); - if (error) { - xfs_buf_relse(bp); - return XFS_ERROR(EIO); - } - - /* * If the dquot has an LSN in it, recover the dquot only if it's less * than the lsn of the transaction we are replaying. 
*/ @@ -3178,38 +3192,38 @@ xlog_recover_do_icreate_pass2( icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr; if (icl->icl_type != XFS_LI_ICREATE) { xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type"); - return EINVAL; + return -EINVAL; } if (icl->icl_size != 1) { xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size"); - return EINVAL; + return -EINVAL; } agno = be32_to_cpu(icl->icl_ag); if (agno >= mp->m_sb.sb_agcount) { xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno"); - return EINVAL; + return -EINVAL; } agbno = be32_to_cpu(icl->icl_agbno); if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) { xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno"); - return EINVAL; + return -EINVAL; } isize = be32_to_cpu(icl->icl_isize); if (isize != mp->m_sb.sb_inodesize) { xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize"); - return EINVAL; + return -EINVAL; } count = be32_to_cpu(icl->icl_count); if (!count) { xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count"); - return EINVAL; + return -EINVAL; } length = be32_to_cpu(icl->icl_length); if (!length || length >= mp->m_sb.sb_agblocks) { xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length"); - return EINVAL; + return -EINVAL; } /* existing allocation is fixed value */ @@ -3218,7 +3232,7 @@ xlog_recover_do_icreate_pass2( if (count != mp->m_ialloc_inos || length != mp->m_ialloc_blks) { xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2"); - return EINVAL; + return -EINVAL; } /* @@ -3389,7 +3403,7 @@ xlog_recover_commit_pass1( xfs_warn(log->l_mp, "%s: invalid item type (%d)", __func__, ITEM_TYPE(item)); ASSERT(0); - return XFS_ERROR(EIO); + return -EIO; } } @@ -3425,7 +3439,7 @@ xlog_recover_commit_pass2( xfs_warn(log->l_mp, "%s: invalid item type (%d)", __func__, ITEM_TYPE(item)); ASSERT(0); - return XFS_ERROR(EIO); + return -EIO; } } @@ -3560,7 +3574,7 @@ xlog_recover_process_data( /* check the log format matches our own - else we can't recover */ if (xlog_header_check_recover(log->l_mp, rhead)) - return (XFS_ERROR(EIO)); + return -EIO; while ((dp < lp) && num_logops) { ASSERT(dp + sizeof(xlog_op_header_t) <= lp); @@ -3571,7 +3585,7 @@ xlog_recover_process_data( xfs_warn(log->l_mp, "%s: bad clientid 0x%x", __func__, ohead->oh_clientid); ASSERT(0); - return (XFS_ERROR(EIO)); + return -EIO; } tid = be32_to_cpu(ohead->oh_tid); hash = XLOG_RHASH(tid); @@ -3585,7 +3599,7 @@ xlog_recover_process_data( xfs_warn(log->l_mp, "%s: bad length 0x%x", __func__, be32_to_cpu(ohead->oh_len)); WARN_ON(1); - return (XFS_ERROR(EIO)); + return -EIO; } flags = ohead->oh_flags & ~XLOG_END_TRANS; if (flags & XLOG_WAS_CONT_TRANS) @@ -3607,7 +3621,7 @@ xlog_recover_process_data( xfs_warn(log->l_mp, "%s: bad transaction", __func__); ASSERT(0); - error = XFS_ERROR(EIO); + error = -EIO; break; case 0: case XLOG_CONTINUE_TRANS: @@ -3618,7 +3632,7 @@ xlog_recover_process_data( xfs_warn(log->l_mp, "%s: bad flag 0x%x", __func__, flags); ASSERT(0); - error = XFS_ERROR(EIO); + error = -EIO; break; } if (error) { @@ -3669,7 +3683,7 @@ xlog_recover_process_efi( */ set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); xfs_efi_release(efip, efip->efi_format.efi_nextents); - return XFS_ERROR(EIO); + return -EIO; } } @@ -3969,7 +3983,7 @@ xlog_unpack_data_crc( * CRC protection by punting an error back up the stack. 
*/ if (xfs_sb_version_hascrc(&log->l_mp->m_sb)) - return EFSCORRUPTED; + return -EFSCORRUPTED; } return 0; @@ -4018,14 +4032,14 @@ xlog_valid_rec_header( if (unlikely(rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) { XFS_ERROR_REPORT("xlog_valid_rec_header(1)", XFS_ERRLEVEL_LOW, log->l_mp); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } if (unlikely( (!rhead->h_version || (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) { xfs_warn(log->l_mp, "%s: unrecognised log version (%d).", __func__, be32_to_cpu(rhead->h_version)); - return XFS_ERROR(EIO); + return -EIO; } /* LR body must have data or it wouldn't have been written */ @@ -4033,12 +4047,12 @@ xlog_valid_rec_header( if (unlikely( hlen <= 0 || hlen > INT_MAX )) { XFS_ERROR_REPORT("xlog_valid_rec_header(2)", XFS_ERRLEVEL_LOW, log->l_mp); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) { XFS_ERROR_REPORT("xlog_valid_rec_header(3)", XFS_ERRLEVEL_LOW, log->l_mp); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } return 0; } @@ -4081,7 +4095,7 @@ xlog_do_recovery_pass( */ hbp = xlog_get_bp(log, 1); if (!hbp) - return ENOMEM; + return -ENOMEM; error = xlog_bread(log, tail_blk, 1, hbp, &offset); if (error) @@ -4110,11 +4124,11 @@ xlog_do_recovery_pass( } if (!hbp) - return ENOMEM; + return -ENOMEM; dbp = xlog_get_bp(log, BTOBB(h_size)); if (!dbp) { xlog_put_bp(hbp); - return ENOMEM; + return -ENOMEM; } memset(rhash, 0, sizeof(rhash)); @@ -4388,7 +4402,7 @@ xlog_do_recover( * If IO errors happened during recovery, bail out. */ if (XFS_FORCED_SHUTDOWN(log->l_mp)) { - return (EIO); + return -EIO; } /* @@ -4415,7 +4429,7 @@ xlog_do_recover( if (XFS_FORCED_SHUTDOWN(log->l_mp)) { xfs_buf_relse(bp); - return XFS_ERROR(EIO); + return -EIO; } xfs_buf_iorequest(bp); @@ -4492,7 +4506,7 @@ xlog_recover( "Please recover the log on a kernel that supports the unknown features.", (log->l_mp->m_sb.sb_features_log_incompat & XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)); - return EINVAL; + return -EINVAL; } xfs_notice(log->l_mp, "Starting recovery (logdev: %s)", diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 3507cd0ec400..fbf0384a466f 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -42,6 +42,7 @@ #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_dinode.h" +#include "xfs_sysfs.h" #ifdef HAVE_PERCPU_SB @@ -60,6 +61,8 @@ static DEFINE_MUTEX(xfs_uuid_table_mutex); static int xfs_uuid_table_size; static uuid_t *xfs_uuid_table; +extern struct kset *xfs_kset; + /* * See if the UUID is unique among mounted XFS filesystems. * Mount fails if UUID is nil or a FS with the same UUID is already mounted. 
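
A large share of the hunks above and below are a mechanical conversion: the old positive, XFS_ERROR()-wrapped error codes become standard negative errnos, so a value can propagate unchanged from the depths of log recovery all the way to the VFS. A minimal sketch of the convention the converted code relies on (the function name here is illustrative, not from the patch):

	/* return zero on success, a negative errno on failure */
	STATIC int
	xlog_example_check(struct xlog *log, xfs_daddr_t blkno)
	{
		if (blkno > log->l_logBBsize)
			return -EFSCORRUPTED;
		return 0;
	}

	error = xlog_example_check(log, blkno);
	if (error)		/* any non-zero return is a failure */
		return error;	/* no negation needed at the VFS boundary */

This is also why the negating wrappers in xfs_quotaops.c, such as return -xfs_qm_scall_getqstat(...), lose their leading minus elsewhere in this diff.
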
@@ -76,7 +79,7 @@ xfs_uuid_mount( if (uuid_is_nil(uuid)) { xfs_warn(mp, "Filesystem has nil UUID - can't mount"); - return XFS_ERROR(EINVAL); + return -EINVAL; } mutex_lock(&xfs_uuid_table_mutex); @@ -104,7 +107,7 @@ xfs_uuid_mount( out_duplicate: mutex_unlock(&xfs_uuid_table_mutex); xfs_warn(mp, "Filesystem has duplicate UUID %pU - can't mount", uuid); - return XFS_ERROR(EINVAL); + return -EINVAL; } STATIC void @@ -173,13 +176,9 @@ xfs_sb_validate_fsb_count( ASSERT(PAGE_SHIFT >= sbp->sb_blocklog); ASSERT(sbp->sb_blocklog >= BBSHIFT); -#if XFS_BIG_BLKNOS /* Limited by ULONG_MAX of page cache index */ + /* Limited by ULONG_MAX of page cache index */ if (nblocks >> (PAGE_CACHE_SHIFT - sbp->sb_blocklog) > ULONG_MAX) - return EFBIG; -#else /* Limited by UINT_MAX of sectors */ - if (nblocks << (sbp->sb_blocklog - BBSHIFT) > UINT_MAX) - return EFBIG; -#endif + return -EFBIG; return 0; } @@ -250,9 +249,9 @@ xfs_initialize_perag( mp->m_flags &= ~XFS_MOUNT_32BITINODES; if (mp->m_flags & XFS_MOUNT_32BITINODES) - index = xfs_set_inode32(mp); + index = xfs_set_inode32(mp, agcount); else - index = xfs_set_inode64(mp); + index = xfs_set_inode64(mp, agcount); if (maxagi) *maxagi = index; @@ -308,15 +307,15 @@ reread: if (!bp) { if (loud) xfs_warn(mp, "SB buffer read failed"); - return EIO; + return -EIO; } if (bp->b_error) { error = bp->b_error; if (loud) xfs_warn(mp, "SB validate failed with error %d.", error); /* bad CRC means corrupted metadata */ - if (error == EFSBADCRC) - error = EFSCORRUPTED; + if (error == -EFSBADCRC) + error = -EFSCORRUPTED; goto release_buf; } @@ -324,7 +323,6 @@ reread: * Initialize the mount structure from the superblock. */ xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); - xfs_sb_quota_from_disk(sbp); /* * If we haven't validated the superblock, do so now before we try @@ -333,7 +331,7 @@ reread: if (sbp->sb_magicnum != XFS_SB_MAGIC) { if (loud) xfs_warn(mp, "Invalid superblock magic number"); - error = EINVAL; + error = -EINVAL; goto release_buf; } @@ -344,7 +342,7 @@ reread: if (loud) xfs_warn(mp, "device supports %u byte sectors (not %u)", sector_size, sbp->sb_sectsize); - error = ENOSYS; + error = -ENOSYS; goto release_buf; } @@ -392,7 +390,7 @@ xfs_update_alignment(xfs_mount_t *mp) xfs_warn(mp, "alignment check failed: sunit/swidth vs. blocksize(%d)", sbp->sb_blocksize); - return XFS_ERROR(EINVAL); + return -EINVAL; } else { /* * Convert the stripe unit and width to FSBs. @@ -402,14 +400,14 @@ xfs_update_alignment(xfs_mount_t *mp) xfs_warn(mp, "alignment check failed: sunit/swidth vs. 
agsize(%d)", sbp->sb_agblocks); - return XFS_ERROR(EINVAL); + return -EINVAL; } else if (mp->m_dalign) { mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); } else { xfs_warn(mp, "alignment check failed: sunit(%d) less than bsize(%d)", mp->m_dalign, sbp->sb_blocksize); - return XFS_ERROR(EINVAL); + return -EINVAL; } } @@ -429,7 +427,7 @@ xfs_update_alignment(xfs_mount_t *mp) } else { xfs_warn(mp, "cannot change alignment: superblock does not support data alignment"); - return XFS_ERROR(EINVAL); + return -EINVAL; } } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN && xfs_sb_version_hasdalign(&mp->m_sb)) { @@ -556,14 +554,14 @@ xfs_check_sizes(xfs_mount_t *mp) d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) { xfs_warn(mp, "filesystem size mismatch detected"); - return XFS_ERROR(EFBIG); + return -EFBIG; } bp = xfs_buf_read_uncached(mp->m_ddev_targp, d - XFS_FSS_TO_BB(mp, 1), XFS_FSS_TO_BB(mp, 1), 0, NULL); if (!bp) { xfs_warn(mp, "last sector read failed"); - return EIO; + return -EIO; } xfs_buf_relse(bp); @@ -571,14 +569,14 @@ xfs_check_sizes(xfs_mount_t *mp) d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { xfs_warn(mp, "log size mismatch detected"); - return XFS_ERROR(EFBIG); + return -EFBIG; } bp = xfs_buf_read_uncached(mp->m_logdev_targp, d - XFS_FSB_TO_BB(mp, 1), XFS_FSB_TO_BB(mp, 1), 0, NULL); if (!bp) { xfs_warn(mp, "log device read failed"); - return EIO; + return -EIO; } xfs_buf_relse(bp); } @@ -731,10 +729,15 @@ xfs_mountfs( xfs_set_maxicount(mp); - error = xfs_uuid_mount(mp); + mp->m_kobj.kobject.kset = xfs_kset; + error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, NULL, mp->m_fsname); if (error) goto out; + error = xfs_uuid_mount(mp); + if (error) + goto out_remove_sysfs; + /* * Set the minimum read and write sizes */ @@ -816,7 +819,7 @@ xfs_mountfs( if (!sbp->sb_logblocks) { xfs_warn(mp, "no log defined"); XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp); - error = XFS_ERROR(EFSCORRUPTED); + error = -EFSCORRUPTED; goto out_free_perag; } @@ -855,7 +858,7 @@ xfs_mountfs( !mp->m_sb.sb_inprogress) { error = xfs_initialize_perag_data(mp, sbp->sb_agcount); if (error) - goto out_fail_wait; + goto out_log_dealloc; } /* @@ -876,7 +879,7 @@ xfs_mountfs( xfs_iunlock(rip, XFS_ILOCK_EXCL); XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW, mp); - error = XFS_ERROR(EFSCORRUPTED); + error = -EFSCORRUPTED; goto out_rele_rip; } mp->m_rootip = rip; /* save it */ @@ -927,7 +930,7 @@ xfs_mountfs( xfs_notice(mp, "resetting quota flags"); error = xfs_mount_reset_sbqflags(mp); if (error) - return error; + goto out_rtunmount; } } @@ -989,6 +992,8 @@ xfs_mountfs( xfs_da_unmount(mp); out_remove_uuid: xfs_uuid_unmount(mp); + out_remove_sysfs: + xfs_sysfs_del(&mp->m_kobj); out: return error; } @@ -1071,6 +1076,8 @@ xfs_unmountfs( xfs_errortag_clearall(mp, 0); #endif xfs_free_perag(mp); + + xfs_sysfs_del(&mp->m_kobj); } int @@ -1152,7 +1159,7 @@ xfs_mod_incore_sb_unlocked( lcounter += delta; if (lcounter < 0) { ASSERT(0); - return XFS_ERROR(EINVAL); + return -EINVAL; } mp->m_sb.sb_icount = lcounter; return 0; @@ -1161,7 +1168,7 @@ xfs_mod_incore_sb_unlocked( lcounter += delta; if (lcounter < 0) { ASSERT(0); - return XFS_ERROR(EINVAL); + return -EINVAL; } mp->m_sb.sb_ifree = lcounter; return 0; @@ -1191,7 +1198,7 @@ xfs_mod_incore_sb_unlocked( * blocks if were allowed to. 
*/ if (!rsvd) - return XFS_ERROR(ENOSPC); + return -ENOSPC; lcounter = (long long)mp->m_resblks_avail + delta; if (lcounter >= 0) { @@ -1202,7 +1209,7 @@ xfs_mod_incore_sb_unlocked( "Filesystem \"%s\": reserve blocks depleted! " "Consider increasing reserve pool size.", mp->m_fsname); - return XFS_ERROR(ENOSPC); + return -ENOSPC; } mp->m_sb.sb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp); @@ -1211,7 +1218,7 @@ xfs_mod_incore_sb_unlocked( lcounter = (long long)mp->m_sb.sb_frextents; lcounter += delta; if (lcounter < 0) { - return XFS_ERROR(ENOSPC); + return -ENOSPC; } mp->m_sb.sb_frextents = lcounter; return 0; @@ -1220,7 +1227,7 @@ xfs_mod_incore_sb_unlocked( lcounter += delta; if (lcounter < 0) { ASSERT(0); - return XFS_ERROR(EINVAL); + return -EINVAL; } mp->m_sb.sb_dblocks = lcounter; return 0; @@ -1229,7 +1236,7 @@ xfs_mod_incore_sb_unlocked( scounter += delta; if (scounter < 0) { ASSERT(0); - return XFS_ERROR(EINVAL); + return -EINVAL; } mp->m_sb.sb_agcount = scounter; return 0; @@ -1238,7 +1245,7 @@ xfs_mod_incore_sb_unlocked( scounter += delta; if (scounter < 0) { ASSERT(0); - return XFS_ERROR(EINVAL); + return -EINVAL; } mp->m_sb.sb_imax_pct = scounter; return 0; @@ -1247,7 +1254,7 @@ xfs_mod_incore_sb_unlocked( scounter += delta; if (scounter < 0) { ASSERT(0); - return XFS_ERROR(EINVAL); + return -EINVAL; } mp->m_sb.sb_rextsize = scounter; return 0; @@ -1256,7 +1263,7 @@ xfs_mod_incore_sb_unlocked( scounter += delta; if (scounter < 0) { ASSERT(0); - return XFS_ERROR(EINVAL); + return -EINVAL; } mp->m_sb.sb_rbmblocks = scounter; return 0; @@ -1265,7 +1272,7 @@ xfs_mod_incore_sb_unlocked( lcounter += delta; if (lcounter < 0) { ASSERT(0); - return XFS_ERROR(EINVAL); + return -EINVAL; } mp->m_sb.sb_rblocks = lcounter; return 0; @@ -1274,7 +1281,7 @@ xfs_mod_incore_sb_unlocked( lcounter += delta; if (lcounter < 0) { ASSERT(0); - return XFS_ERROR(EINVAL); + return -EINVAL; } mp->m_sb.sb_rextents = lcounter; return 0; @@ -1283,13 +1290,13 @@ xfs_mod_incore_sb_unlocked( scounter += delta; if (scounter < 0) { ASSERT(0); - return XFS_ERROR(EINVAL); + return -EINVAL; } mp->m_sb.sb_rextslog = scounter; return 0; default: ASSERT(0); - return XFS_ERROR(EINVAL); + return -EINVAL; } } @@ -1452,7 +1459,7 @@ xfs_dev_is_read_only( (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) { xfs_notice(mp, "%s required on read-only device.", message); xfs_notice(mp, "write access unavailable, cannot proceed."); - return EROFS; + return -EROFS; } return 0; } @@ -1995,7 +2002,7 @@ slow_path: * (e.g. lots of space just got freed). After that * we are done. 
*/ - if (ret != ENOSPC) + if (ret != -ENOSPC) xfs_icsb_balance_counter(mp, field, 0); xfs_icsb_unlock(mp); return ret; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 7295a0b7c343..b0447c86e7e2 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -166,6 +166,7 @@ typedef struct xfs_mount { on the next remount,rw */ int64_t m_low_space[XFS_LOWSP_MAX]; /* low free space thresholds */ + struct xfs_kobj m_kobj; struct workqueue_struct *m_data_workqueue; struct workqueue_struct *m_unwritten_workqueue; diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index f99b4933dc22..1eb6f3df698c 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c @@ -337,20 +337,20 @@ xfs_mru_cache_create( *mrup = NULL; if (!mrup || !grp_count || !lifetime_ms || !free_func) - return EINVAL; + return -EINVAL; if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count)) - return EINVAL; + return -EINVAL; if (!(mru = kmem_zalloc(sizeof(*mru), KM_SLEEP))) - return ENOMEM; + return -ENOMEM; /* An extra list is needed to avoid reaping up to a grp_time early. */ mru->grp_count = grp_count + 1; mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), KM_SLEEP); if (!mru->lists) { - err = ENOMEM; + err = -ENOMEM; goto exit; } @@ -434,16 +434,16 @@ xfs_mru_cache_insert( ASSERT(mru && mru->lists); if (!mru || !mru->lists) - return EINVAL; + return -EINVAL; if (radix_tree_preload(GFP_KERNEL)) - return ENOMEM; + return -ENOMEM; INIT_LIST_HEAD(&elem->list_node); elem->key = key; spin_lock(&mru->lock); - error = -radix_tree_insert(&mru->store, key, elem); + error = radix_tree_insert(&mru->store, key, elem); radix_tree_preload_end(); if (!error) _xfs_mru_cache_list_insert(mru, elem); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 6d26759c779a..10232102b4a6 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -98,18 +98,18 @@ restart: next_index = be32_to_cpu(dqp->q_core.d_id) + 1; error = execute(batch[i], data); - if (error == EAGAIN) { + if (error == -EAGAIN) { skipped++; continue; } - if (error && last_error != EFSCORRUPTED) + if (error && last_error != -EFSCORRUPTED) last_error = error; } mutex_unlock(&qi->qi_tree_lock); /* bail out if the filesystem is corrupted. */ - if (last_error == EFSCORRUPTED) { + if (last_error == -EFSCORRUPTED) { skipped = 0; break; } @@ -138,7 +138,7 @@ xfs_qm_dqpurge( xfs_dqlock(dqp); if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) { xfs_dqunlock(dqp); - return EAGAIN; + return -EAGAIN; } dqp->dq_flags |= XFS_DQ_FREEING; @@ -221,100 +221,6 @@ xfs_qm_unmount( } } - -/* - * This is called from xfs_mountfs to start quotas and initialize all - * necessary data structures like quotainfo. This is also responsible for - * running a quotacheck as necessary. We are guaranteed that the superblock - * is consistently read in at this point. - * - * If we fail here, the mount will continue with quota turned off. We don't - * need to inidicate success or failure at all. - */ -void -xfs_qm_mount_quotas( - xfs_mount_t *mp) -{ - int error = 0; - uint sbf; - - /* - * If quotas on realtime volumes is not supported, we disable - * quotas immediately. - */ - if (mp->m_sb.sb_rextents) { - xfs_notice(mp, "Cannot turn on quotas for realtime filesystem"); - mp->m_qflags = 0; - goto write_changes; - } - - ASSERT(XFS_IS_QUOTA_RUNNING(mp)); - - /* - * Allocate the quotainfo structure inside the mount struct, and - * create quotainode(s), and change/rev superblock if necessary. - */ - error = xfs_qm_init_quotainfo(mp); - if (error) { - /* - * We must turn off quotas. 
- */ - ASSERT(mp->m_quotainfo == NULL); - mp->m_qflags = 0; - goto write_changes; - } - /* - * If any of the quotas are not consistent, do a quotacheck. - */ - if (XFS_QM_NEED_QUOTACHECK(mp)) { - error = xfs_qm_quotacheck(mp); - if (error) { - /* Quotacheck failed and disabled quotas. */ - return; - } - } - /* - * If one type of quotas is off, then it will lose its - * quotachecked status, since we won't be doing accounting for - * that type anymore. - */ - if (!XFS_IS_UQUOTA_ON(mp)) - mp->m_qflags &= ~XFS_UQUOTA_CHKD; - if (!XFS_IS_GQUOTA_ON(mp)) - mp->m_qflags &= ~XFS_GQUOTA_CHKD; - if (!XFS_IS_PQUOTA_ON(mp)) - mp->m_qflags &= ~XFS_PQUOTA_CHKD; - - write_changes: - /* - * We actually don't have to acquire the m_sb_lock at all. - * This can only be called from mount, and that's single threaded. XXX - */ - spin_lock(&mp->m_sb_lock); - sbf = mp->m_sb.sb_qflags; - mp->m_sb.sb_qflags = mp->m_qflags & XFS_MOUNT_QUOTA_ALL; - spin_unlock(&mp->m_sb_lock); - - if (sbf != (mp->m_qflags & XFS_MOUNT_QUOTA_ALL)) { - if (xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS)) { - /* - * We could only have been turning quotas off. - * We aren't in very good shape actually because - * the incore structures are convinced that quotas are - * off, but the on disk superblock doesn't know that ! - */ - ASSERT(!(XFS_IS_QUOTA_RUNNING(mp))); - xfs_alert(mp, "%s: Superblock update failed!", - __func__); - } - } - - if (error) { - xfs_warn(mp, "Failed to initialize disk quotas."); - return; - } -} - /* * Called from the vfsops layer. */ @@ -671,7 +577,7 @@ xfs_qm_init_quotainfo( qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP); - error = -list_lru_init(&qinf->qi_lru); + error = list_lru_init(&qinf->qi_lru); if (error) goto out_free_qinf; @@ -995,7 +901,7 @@ xfs_qm_dqiter_bufs( * will leave a trace in the log indicating corruption has * been detected. */ - if (error == EFSCORRUPTED) { + if (error == -EFSCORRUPTED) { error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, bno), mp->m_quotainfo->qi_dqchunklen, 0, &bp, @@ -1005,6 +911,12 @@ xfs_qm_dqiter_bufs( if (error) break; + /* + * A corrupt buffer might not have a verifier attached, so + * make sure we have the correct one attached before writeback + * occurs. + */ + bp->b_ops = &xfs_dquot_buf_ops; xfs_qm_reset_dqcounts(mp, bp, firstid, type); xfs_buf_delwri_queue(bp, buffer_list); xfs_buf_relse(bp); @@ -1090,7 +1002,7 @@ xfs_qm_dqiterate( xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, rablkno), mp->m_quotainfo->qi_dqchunklen, - NULL); + &xfs_dquot_buf_ops); rablkno++; } } @@ -1138,8 +1050,8 @@ xfs_qm_quotacheck_dqadjust( /* * Shouldn't be able to turn off quotas here. */ - ASSERT(error != ESRCH); - ASSERT(error != ENOENT); + ASSERT(error != -ESRCH); + ASSERT(error != -ENOENT); return error; } @@ -1226,7 +1138,7 @@ xfs_qm_dqusage_adjust( */ if (xfs_is_quota_inode(&mp->m_sb, ino)) { *res = BULKSTAT_RV_NOTHING; - return XFS_ERROR(EINVAL); + return -EINVAL; } /* @@ -1330,7 +1242,7 @@ out_unlock: * Walk thru all the filesystem inodes and construct a consistent view * of the disk quota world. If the quotacheck fails, disable quotas. */ -int +STATIC int xfs_qm_quotacheck( xfs_mount_t *mp) { @@ -1463,7 +1375,100 @@ xfs_qm_quotacheck( } } else xfs_notice(mp, "Quotacheck: Done."); - return (error); + return error; +} + +/* + * This is called from xfs_mountfs to start quotas and initialize all + * necessary data structures like quotainfo. This is also responsible for + * running a quotacheck as necessary. 
We are guaranteed that the superblock + * is consistently read in at this point. + * + * If we fail here, the mount will continue with quota turned off. We don't + * need to inidicate success or failure at all. + */ +void +xfs_qm_mount_quotas( + struct xfs_mount *mp) +{ + int error = 0; + uint sbf; + + /* + * If quotas on realtime volumes is not supported, we disable + * quotas immediately. + */ + if (mp->m_sb.sb_rextents) { + xfs_notice(mp, "Cannot turn on quotas for realtime filesystem"); + mp->m_qflags = 0; + goto write_changes; + } + + ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + + /* + * Allocate the quotainfo structure inside the mount struct, and + * create quotainode(s), and change/rev superblock if necessary. + */ + error = xfs_qm_init_quotainfo(mp); + if (error) { + /* + * We must turn off quotas. + */ + ASSERT(mp->m_quotainfo == NULL); + mp->m_qflags = 0; + goto write_changes; + } + /* + * If any of the quotas are not consistent, do a quotacheck. + */ + if (XFS_QM_NEED_QUOTACHECK(mp)) { + error = xfs_qm_quotacheck(mp); + if (error) { + /* Quotacheck failed and disabled quotas. */ + return; + } + } + /* + * If one type of quotas is off, then it will lose its + * quotachecked status, since we won't be doing accounting for + * that type anymore. + */ + if (!XFS_IS_UQUOTA_ON(mp)) + mp->m_qflags &= ~XFS_UQUOTA_CHKD; + if (!XFS_IS_GQUOTA_ON(mp)) + mp->m_qflags &= ~XFS_GQUOTA_CHKD; + if (!XFS_IS_PQUOTA_ON(mp)) + mp->m_qflags &= ~XFS_PQUOTA_CHKD; + + write_changes: + /* + * We actually don't have to acquire the m_sb_lock at all. + * This can only be called from mount, and that's single threaded. XXX + */ + spin_lock(&mp->m_sb_lock); + sbf = mp->m_sb.sb_qflags; + mp->m_sb.sb_qflags = mp->m_qflags & XFS_MOUNT_QUOTA_ALL; + spin_unlock(&mp->m_sb_lock); + + if (sbf != (mp->m_qflags & XFS_MOUNT_QUOTA_ALL)) { + if (xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS)) { + /* + * We could only have been turning quotas off. + * We aren't in very good shape actually because + * the incore structures are convinced that quotas are + * off, but the on disk superblock doesn't know that ! 
+ */ + ASSERT(!(XFS_IS_QUOTA_RUNNING(mp))); + xfs_alert(mp, "%s: Superblock update failed!", + __func__); + } + } + + if (error) { + xfs_warn(mp, "Failed to initialize disk quotas."); + return; + } } /* @@ -1493,7 +1498,7 @@ xfs_qm_init_quotainos( error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 0, 0, &uip); if (error) - return XFS_ERROR(error); + return error; } if (XFS_IS_GQUOTA_ON(mp) && mp->m_sb.sb_gquotino != NULLFSINO) { @@ -1563,7 +1568,7 @@ error_rele: IRELE(gip); if (pip) IRELE(pip); - return XFS_ERROR(error); + return error; } STATIC void @@ -1679,7 +1684,7 @@ xfs_qm_vop_dqalloc( XFS_QMOPT_DOWARN, &uq); if (error) { - ASSERT(error != ENOENT); + ASSERT(error != -ENOENT); return error; } /* @@ -1706,7 +1711,7 @@ xfs_qm_vop_dqalloc( XFS_QMOPT_DOWARN, &gq); if (error) { - ASSERT(error != ENOENT); + ASSERT(error != -ENOENT); goto error_rele; } xfs_dqunlock(gq); @@ -1726,7 +1731,7 @@ xfs_qm_vop_dqalloc( XFS_QMOPT_DOWARN, &pq); if (error) { - ASSERT(error != ENOENT); + ASSERT(error != -ENOENT); goto error_rele; } xfs_dqunlock(pq); @@ -1895,7 +1900,7 @@ xfs_qm_vop_chown_reserve( -((xfs_qcnt_t)delblks), 0, blkflags); } - return (0); + return 0; } int diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 797fd4636273..3a07a937e232 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -157,7 +157,6 @@ struct xfs_dquot_acct { #define XFS_QM_RTBWARNLIMIT 5 extern void xfs_qm_destroy_quotainfo(struct xfs_mount *); -extern int xfs_qm_quotacheck(struct xfs_mount *); extern int xfs_qm_write_sb_changes(struct xfs_mount *, __int64_t); /* dquot stuff */ diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index e9be63abd8d2..2c61e61b0205 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -117,7 +117,7 @@ xfs_qm_newmount( (uquotaondisk ? " usrquota" : ""), (gquotaondisk ? " grpquota" : ""), (pquotaondisk ? " prjquota" : "")); - return XFS_ERROR(EPERM); + return -EPERM; } if (XFS_IS_QUOTA_ON(mp) || quotaondisk) { diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index bbc813caba4c..80f2d77d929a 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -64,10 +64,10 @@ xfs_qm_scall_quotaoff( /* * No file system can have quotas enabled on disk but not in core. * Note that quota utilities (like quotaoff) _expect_ - * errno == EEXIST here. + * errno == -EEXIST here. */ if ((mp->m_qflags & flags) == 0) - return XFS_ERROR(EEXIST); + return -EEXIST; error = 0; flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD); @@ -94,7 +94,7 @@ xfs_qm_scall_quotaoff( /* XXX what to do if error ? Revert back to old vals incore ? 
*/ error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS); - return (error); + return error; } dqtype = 0; @@ -198,7 +198,7 @@ xfs_qm_scall_quotaoff( if (mp->m_qflags == 0) { mutex_unlock(&q->qi_quotaofflock); xfs_qm_destroy_quotainfo(mp); - return (0); + return 0; } /* @@ -278,13 +278,13 @@ xfs_qm_scall_trunc_qfiles( xfs_mount_t *mp, uint flags) { - int error = EINVAL; + int error = -EINVAL; if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0 || (flags & ~XFS_DQ_ALLTYPES)) { xfs_debug(mp, "%s: flags=%x m_qflags=%x", __func__, flags, mp->m_qflags); - return XFS_ERROR(EINVAL); + return -EINVAL; } if (flags & XFS_DQ_USER) { @@ -328,7 +328,7 @@ xfs_qm_scall_quotaon( if (flags == 0) { xfs_debug(mp, "%s: zero flags, m_qflags=%x", __func__, mp->m_qflags); - return XFS_ERROR(EINVAL); + return -EINVAL; } /* No fs can turn on quotas with a delayed effect */ @@ -351,13 +351,13 @@ xfs_qm_scall_quotaon( xfs_debug(mp, "%s: Can't enforce without acct, flags=%x sbflags=%x", __func__, flags, mp->m_sb.sb_qflags); - return XFS_ERROR(EINVAL); + return -EINVAL; } /* * If everything's up to-date incore, then don't waste time. */ if ((mp->m_qflags & flags) == flags) - return XFS_ERROR(EEXIST); + return -EEXIST; /* * Change sb_qflags on disk but not incore mp->qflags @@ -372,11 +372,11 @@ xfs_qm_scall_quotaon( * There's nothing to change if it's the same. */ if ((qf & flags) == flags && sbflags == 0) - return XFS_ERROR(EEXIST); + return -EEXIST; sbflags |= XFS_SB_QFLAGS; if ((error = xfs_qm_write_sb_changes(mp, sbflags))) - return (error); + return error; /* * If we aren't trying to switch on quota enforcement, we are done. */ @@ -387,10 +387,10 @@ xfs_qm_scall_quotaon( ((mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) != (mp->m_qflags & XFS_GQUOTA_ACCT)) || (flags & XFS_ALL_QUOTA_ENFD) == 0) - return (0); + return 0; if (! XFS_IS_QUOTA_RUNNING(mp)) - return XFS_ERROR(ESRCH); + return -ESRCH; /* * Switch on quota enforcement in core. 
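
Both the quotaoff and quotaon paths above deliberately return -EEXIST rather than 0 when the requested state is already in effect, because the userspace quota utilities key off that errno. A condensed sketch of the pattern, using a hypothetical helper name, with the superblock locking and sb_qflags bookkeeping elided:

	STATIC int
	example_quota_enforce(struct xfs_mount *mp, uint flags)
	{
		int error;

		/* already enabled: utilities expect -EEXIST here, not 0 */
		if ((mp->m_qflags & flags) == flags)
			return -EEXIST;

		/* persist the change to the on-disk superblock first... */
		error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS);
		if (error)
			return error;

		/* ...then flip the incore flags, so a failed write changes nothing */
		mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD);
		return 0;
	}
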
@@ -399,7 +399,7 @@ xfs_qm_scall_quotaon( mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD); mutex_unlock(&mp->m_quotainfo->qi_quotaofflock); - return (0); + return 0; } @@ -426,7 +426,7 @@ xfs_qm_scall_getqstat( if (!xfs_sb_version_hasquota(&mp->m_sb)) { out->qs_uquota.qfs_ino = NULLFSINO; out->qs_gquota.qfs_ino = NULLFSINO; - return (0); + return 0; } out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags & @@ -514,7 +514,7 @@ xfs_qm_scall_getqstatv( out->qs_uquota.qfs_ino = NULLFSINO; out->qs_gquota.qfs_ino = NULLFSINO; out->qs_pquota.qfs_ino = NULLFSINO; - return (0); + return 0; } out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags & @@ -595,7 +595,7 @@ xfs_qm_scall_setqlim( xfs_qcnt_t hard, soft; if (newlim->d_fieldmask & ~XFS_DQ_MASK) - return EINVAL; + return -EINVAL; if ((newlim->d_fieldmask & XFS_DQ_MASK) == 0) return 0; @@ -615,7 +615,7 @@ xfs_qm_scall_setqlim( */ error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp); if (error) { - ASSERT(error != ENOENT); + ASSERT(error != -ENOENT); goto out_unlock; } xfs_dqunlock(dqp); @@ -758,7 +758,7 @@ xfs_qm_log_quotaoff_end( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_equotaoff, 0, 0); if (error) { xfs_trans_cancel(tp, 0); - return (error); + return error; } qoffi = xfs_trans_get_qoff_item(tp, startqoff, @@ -772,7 +772,7 @@ xfs_qm_log_quotaoff_end( */ xfs_trans_set_sync(tp); error = xfs_trans_commit(tp, 0); - return (error); + return error; } @@ -822,7 +822,7 @@ error0: spin_unlock(&mp->m_sb_lock); } *qoffstartp = qoffi; - return (error); + return error; } @@ -850,7 +850,7 @@ xfs_qm_scall_getquota( * our utility programs are concerned. */ if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) { - error = XFS_ERROR(ENOENT); + error = -ENOENT; goto out_put; } @@ -953,7 +953,7 @@ xfs_qm_export_flags( uflags |= FS_QUOTA_GDQ_ENFD; if (flags & XFS_PQUOTA_ENFD) uflags |= FS_QUOTA_PDQ_ENFD; - return (uflags); + return uflags; } diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index 2ad1b9822e92..b238027df987 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -51,7 +51,7 @@ xfs_fs_get_xstate( if (!XFS_IS_QUOTA_RUNNING(mp)) return -ENOSYS; - return -xfs_qm_scall_getqstat(mp, fqs); + return xfs_qm_scall_getqstat(mp, fqs); } STATIC int @@ -63,7 +63,7 @@ xfs_fs_get_xstatev( if (!XFS_IS_QUOTA_RUNNING(mp)) return -ENOSYS; - return -xfs_qm_scall_getqstatv(mp, fqs); + return xfs_qm_scall_getqstatv(mp, fqs); } STATIC int @@ -95,11 +95,11 @@ xfs_fs_set_xstate( switch (op) { case Q_XQUOTAON: - return -xfs_qm_scall_quotaon(mp, flags); + return xfs_qm_scall_quotaon(mp, flags); case Q_XQUOTAOFF: if (!XFS_IS_QUOTA_ON(mp)) return -EINVAL; - return -xfs_qm_scall_quotaoff(mp, flags); + return xfs_qm_scall_quotaoff(mp, flags); } return -EINVAL; @@ -112,7 +112,7 @@ xfs_fs_rm_xquota( { struct xfs_mount *mp = XFS_M(sb); unsigned int flags = 0; - + if (sb->s_flags & MS_RDONLY) return -EROFS; @@ -123,11 +123,11 @@ xfs_fs_rm_xquota( flags |= XFS_DQ_USER; if (uflags & FS_GROUP_QUOTA) flags |= XFS_DQ_GROUP; - if (uflags & FS_USER_QUOTA) + if (uflags & FS_PROJ_QUOTA) flags |= XFS_DQ_PROJ; - return -xfs_qm_scall_trunc_qfiles(mp, flags); -} + return xfs_qm_scall_trunc_qfiles(mp, flags); +} STATIC int xfs_fs_get_dqblk( @@ -142,7 +142,7 @@ xfs_fs_get_dqblk( if (!XFS_IS_QUOTA_ON(mp)) return -ESRCH; - return -xfs_qm_scall_getquota(mp, from_kqid(&init_user_ns, qid), + return xfs_qm_scall_getquota(mp, from_kqid(&init_user_ns, qid), xfs_quota_type(qid.type), fdq); } @@ -161,7 +161,7 @@ xfs_fs_set_dqblk( if (!XFS_IS_QUOTA_ON(mp)) return -ESRCH; - 
return -xfs_qm_scall_setqlim(mp, from_kqid(&init_user_ns, qid), + return xfs_qm_scall_setqlim(mp, from_kqid(&init_user_ns, qid), xfs_quota_type(qid.type), fdq); } diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index ec5ca65c6211..909e143b87ae 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -863,7 +863,7 @@ xfs_growfs_rt_alloc( XFS_BMAPI_METADATA, &firstblock, resblks, &map, &nmap, &flist); if (!error && nmap < 1) - error = XFS_ERROR(ENOSPC); + error = -ENOSPC; if (error) goto error_cancel; /* @@ -903,7 +903,7 @@ xfs_growfs_rt_alloc( bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, 0); if (bp == NULL) { - error = XFS_ERROR(EIO); + error = -EIO; error_cancel: xfs_trans_cancel(tp, cancelflags); goto error; @@ -944,9 +944,9 @@ xfs_growfs_rt( xfs_buf_t *bp; /* temporary buffer */ int error; /* error return value */ xfs_mount_t *nmp; /* new (fake) mount structure */ - xfs_drfsbno_t nrblocks; /* new number of realtime blocks */ + xfs_rfsblock_t nrblocks; /* new number of realtime blocks */ xfs_extlen_t nrbmblocks; /* new number of rt bitmap blocks */ - xfs_drtbno_t nrextents; /* new number of realtime extents */ + xfs_rtblock_t nrextents; /* new number of realtime extents */ uint8_t nrextslog; /* new log2 of sb_rextents */ xfs_extlen_t nrsumblocks; /* new number of summary blocks */ uint nrsumlevels; /* new rt summary levels */ @@ -962,11 +962,11 @@ xfs_growfs_rt( * Initial error checking. */ if (!capable(CAP_SYS_ADMIN)) - return XFS_ERROR(EPERM); + return -EPERM; if (mp->m_rtdev_targp == NULL || mp->m_rbmip == NULL || (nrblocks = in->newblocks) <= sbp->sb_rblocks || (sbp->sb_rblocks && (in->extsize != sbp->sb_rextsize))) - return XFS_ERROR(EINVAL); + return -EINVAL; if ((error = xfs_sb_validate_fsb_count(sbp, nrblocks))) return error; /* @@ -976,7 +976,7 @@ xfs_growfs_rt( XFS_FSB_TO_BB(mp, nrblocks - 1), XFS_FSB_TO_BB(mp, 1), 0, NULL); if (!bp) - return EIO; + return -EIO; if (bp->b_error) { error = bp->b_error; xfs_buf_relse(bp); @@ -1001,7 +1001,7 @@ xfs_growfs_rt( * since we'll log basically the whole summary file at once. */ if (nrsumblocks > (mp->m_sb.sb_logblocks >> 1)) - return XFS_ERROR(EINVAL); + return -EINVAL; /* * Get the old block counts for bitmap and summary inodes. * These can't change since other growfs callers are locked out. @@ -1208,7 +1208,7 @@ xfs_rtallocate_extent( len, &sumbp, &sb, prod, &r); break; default: - error = EIO; + error = -EIO; ASSERT(0); } if (error) @@ -1247,7 +1247,7 @@ xfs_rtmount_init( if (mp->m_rtdev_targp == NULL) { xfs_warn(mp, "Filesystem has a realtime volume, use rtdev=device option"); - return XFS_ERROR(ENODEV); + return -ENODEV; } mp->m_rsumlevels = sbp->sb_rextslog + 1; mp->m_rsumsize = @@ -1263,7 +1263,7 @@ xfs_rtmount_init( xfs_warn(mp, "realtime mount -- %llu != %llu", (unsigned long long) XFS_BB_TO_FSB(mp, d), (unsigned long long) mp->m_sb.sb_rblocks); - return XFS_ERROR(EFBIG); + return -EFBIG; } bp = xfs_buf_read_uncached(mp->m_rtdev_targp, d - XFS_FSB_TO_BB(mp, 1), @@ -1272,7 +1272,7 @@ xfs_rtmount_init( xfs_warn(mp, "realtime device size check failed"); if (bp) xfs_buf_relse(bp); - return EIO; + return -EIO; } xfs_buf_relse(bp); return 0; diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index 752b63d10300..c642795324af 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -132,7 +132,7 @@ xfs_rtmount_init( return 0; xfs_warn(mp, "Not built with CONFIG_XFS_RT"); - return ENOSYS; + return -ENOSYS; } # define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 
0 : (ENOSYS)) # define xfs_rtunmount_inodes(m) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 8f0333b3f7a0..b194652033cd 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -61,6 +61,7 @@ static const struct super_operations xfs_super_operations; static kmem_zone_t *xfs_ioend_zone; mempool_t *xfs_ioend_pool; +struct kset *xfs_kset; #define MNTOPT_LOGBUFS "logbufs" /* number of XFS log buffers */ #define MNTOPT_LOGBSIZE "logbsize" /* size of XFS log buffers */ @@ -185,7 +186,7 @@ xfs_parseargs( */ mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL); if (!mp->m_fsname) - return ENOMEM; + return -ENOMEM; mp->m_fsname_len = strlen(mp->m_fsname) + 1; /* @@ -204,9 +205,6 @@ xfs_parseargs( */ mp->m_flags |= XFS_MOUNT_BARRIER; mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; -#if !XFS_BIG_INUMS - mp->m_flags |= XFS_MOUNT_SMALL_INUMS; -#endif /* * These can be overridden by the mount option parsing. @@ -227,57 +225,57 @@ xfs_parseargs( if (!value || !*value) { xfs_warn(mp, "%s option requires an argument", this_char); - return EINVAL; + return -EINVAL; } if (kstrtoint(value, 10, &mp->m_logbufs)) - return EINVAL; + return -EINVAL; } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) { if (!value || !*value) { xfs_warn(mp, "%s option requires an argument", this_char); - return EINVAL; + return -EINVAL; } if (suffix_kstrtoint(value, 10, &mp->m_logbsize)) - return EINVAL; + return -EINVAL; } else if (!strcmp(this_char, MNTOPT_LOGDEV)) { if (!value || !*value) { xfs_warn(mp, "%s option requires an argument", this_char); - return EINVAL; + return -EINVAL; } mp->m_logname = kstrndup(value, MAXNAMELEN, GFP_KERNEL); if (!mp->m_logname) - return ENOMEM; + return -ENOMEM; } else if (!strcmp(this_char, MNTOPT_MTPT)) { xfs_warn(mp, "%s option not allowed on this system", this_char); - return EINVAL; + return -EINVAL; } else if (!strcmp(this_char, MNTOPT_RTDEV)) { if (!value || !*value) { xfs_warn(mp, "%s option requires an argument", this_char); - return EINVAL; + return -EINVAL; } mp->m_rtname = kstrndup(value, MAXNAMELEN, GFP_KERNEL); if (!mp->m_rtname) - return ENOMEM; + return -ENOMEM; } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) { if (!value || !*value) { xfs_warn(mp, "%s option requires an argument", this_char); - return EINVAL; + return -EINVAL; } if (kstrtoint(value, 10, &iosize)) - return EINVAL; + return -EINVAL; iosizelog = ffs(iosize) - 1; } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) { if (!value || !*value) { xfs_warn(mp, "%s option requires an argument", this_char); - return EINVAL; + return -EINVAL; } if (suffix_kstrtoint(value, 10, &iosize)) - return EINVAL; + return -EINVAL; iosizelog = ffs(iosize) - 1; } else if (!strcmp(this_char, MNTOPT_GRPID) || !strcmp(this_char, MNTOPT_BSDGROUPS)) { @@ -297,27 +295,22 @@ xfs_parseargs( if (!value || !*value) { xfs_warn(mp, "%s option requires an argument", this_char); - return EINVAL; + return -EINVAL; } if (kstrtoint(value, 10, &dsunit)) - return EINVAL; + return -EINVAL; } else if (!strcmp(this_char, MNTOPT_SWIDTH)) { if (!value || !*value) { xfs_warn(mp, "%s option requires an argument", this_char); - return EINVAL; + return -EINVAL; } if (kstrtoint(value, 10, &dswidth)) - return EINVAL; + return -EINVAL; } else if (!strcmp(this_char, MNTOPT_32BITINODE)) { mp->m_flags |= XFS_MOUNT_SMALL_INUMS; } else if (!strcmp(this_char, MNTOPT_64BITINODE)) { mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS; -#if !XFS_BIG_INUMS - xfs_warn(mp, "%s option not allowed on this system", - this_char); - return EINVAL; -#endif } else if (!strcmp(this_char, 
MNTOPT_NOUUID)) { mp->m_flags |= XFS_MOUNT_NOUUID; } else if (!strcmp(this_char, MNTOPT_BARRIER)) { @@ -390,7 +383,7 @@ xfs_parseargs( "irixsgid is now a sysctl(2) variable, option is deprecated."); } else { xfs_warn(mp, "unknown mount option [%s].", this_char); - return EINVAL; + return -EINVAL; } } @@ -400,32 +393,32 @@ xfs_parseargs( if ((mp->m_flags & XFS_MOUNT_NORECOVERY) && !(mp->m_flags & XFS_MOUNT_RDONLY)) { xfs_warn(mp, "no-recovery mounts must be read-only."); - return EINVAL; + return -EINVAL; } if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) { xfs_warn(mp, "sunit and swidth options incompatible with the noalign option"); - return EINVAL; + return -EINVAL; } #ifndef CONFIG_XFS_QUOTA if (XFS_IS_QUOTA_RUNNING(mp)) { xfs_warn(mp, "quota support not available in this kernel."); - return EINVAL; + return -EINVAL; } #endif if ((dsunit && !dswidth) || (!dsunit && dswidth)) { xfs_warn(mp, "sunit and swidth must be specified together"); - return EINVAL; + return -EINVAL; } if (dsunit && (dswidth % dsunit != 0)) { xfs_warn(mp, "stripe width (%d) must be a multiple of the stripe unit (%d)", dswidth, dsunit); - return EINVAL; + return -EINVAL; } done: @@ -446,7 +439,7 @@ done: mp->m_logbufs > XLOG_MAX_ICLOGS)) { xfs_warn(mp, "invalid logbufs value: %d [not %d-%d]", mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS); - return XFS_ERROR(EINVAL); + return -EINVAL; } if (mp->m_logbsize != -1 && mp->m_logbsize != 0 && @@ -456,7 +449,7 @@ done: xfs_warn(mp, "invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]", mp->m_logbsize); - return XFS_ERROR(EINVAL); + return -EINVAL; } if (iosizelog) { @@ -465,7 +458,7 @@ done: xfs_warn(mp, "invalid log iosize: %d [not %d-%d]", iosizelog, XFS_MIN_IO_LOG, XFS_MAX_IO_LOG); - return XFS_ERROR(EINVAL); + return -EINVAL; } mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE; @@ -597,15 +590,20 @@ xfs_max_file_offset( return (((__uint64_t)pagefactor) << bitshift) - 1; } +/* + * xfs_set_inode32() and xfs_set_inode64() are passed an agcount + * because in the growfs case, mp->m_sb.sb_agcount is not updated + * yet to the potentially higher ag count. 
+ */ xfs_agnumber_t -xfs_set_inode32(struct xfs_mount *mp) +xfs_set_inode32(struct xfs_mount *mp, xfs_agnumber_t agcount) { xfs_agnumber_t index = 0; xfs_agnumber_t maxagi = 0; xfs_sb_t *sbp = &mp->m_sb; xfs_agnumber_t max_metadata; - xfs_agino_t agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks -1, 0); - xfs_ino_t ino = XFS_AGINO_TO_INO(mp, sbp->sb_agcount -1, agino); + xfs_agino_t agino; + xfs_ino_t ino; xfs_perag_t *pag; /* Calculate how much should be reserved for inodes to meet @@ -620,10 +618,12 @@ xfs_set_inode32(struct xfs_mount *mp) do_div(icount, sbp->sb_agblocks); max_metadata = icount; } else { - max_metadata = sbp->sb_agcount; + max_metadata = agcount; } - for (index = 0; index < sbp->sb_agcount; index++) { + agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0); + + for (index = 0; index < agcount; index++) { ino = XFS_AGINO_TO_INO(mp, index, agino); if (ino > XFS_MAXINUMBER_32) { @@ -648,11 +648,11 @@ xfs_set_inode32(struct xfs_mount *mp) } xfs_agnumber_t -xfs_set_inode64(struct xfs_mount *mp) +xfs_set_inode64(struct xfs_mount *mp, xfs_agnumber_t agcount) { xfs_agnumber_t index = 0; - for (index = 0; index < mp->m_sb.sb_agcount; index++) { + for (index = 0; index < agcount; index++) { struct xfs_perag *pag; pag = xfs_perag_get(mp, index); @@ -686,7 +686,7 @@ xfs_blkdev_get( xfs_warn(mp, "Invalid device [%s], error=%d\n", name, error); } - return -error; + return error; } STATIC void @@ -756,7 +756,7 @@ xfs_open_devices( if (rtdev == ddev || rtdev == logdev) { xfs_warn(mp, "Cannot mount filesystem with identical rtdev and ddev/logdev."); - error = EINVAL; + error = -EINVAL; goto out_close_rtdev; } } @@ -764,7 +764,7 @@ xfs_open_devices( /* * Setup xfs_mount buffer target pointers */ - error = ENOMEM; + error = -ENOMEM; mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev); if (!mp->m_ddev_targp) goto out_close_rtdev; @@ -1188,6 +1188,7 @@ xfs_fs_remount( char *options) { struct xfs_mount *mp = XFS_M(sb); + xfs_sb_t *sbp = &mp->m_sb; substring_t args[MAX_OPT_ARGS]; char *p; int error; @@ -1208,10 +1209,10 @@ xfs_fs_remount( mp->m_flags &= ~XFS_MOUNT_BARRIER; break; case Opt_inode64: - mp->m_maxagi = xfs_set_inode64(mp); + mp->m_maxagi = xfs_set_inode64(mp, sbp->sb_agcount); break; case Opt_inode32: - mp->m_maxagi = xfs_set_inode32(mp); + mp->m_maxagi = xfs_set_inode32(mp, sbp->sb_agcount); break; default: /* @@ -1295,7 +1296,7 @@ xfs_fs_freeze( xfs_save_resvblks(mp); xfs_quiesce_attr(mp); - return -xfs_fs_log_dummy(mp); + return xfs_fs_log_dummy(mp); } STATIC int @@ -1314,7 +1315,7 @@ xfs_fs_show_options( struct seq_file *m, struct dentry *root) { - return -xfs_showargs(XFS_M(root->d_sb), m); + return xfs_showargs(XFS_M(root->d_sb), m); } /* @@ -1336,14 +1337,14 @@ xfs_finish_flags( mp->m_logbsize < mp->m_sb.sb_logsunit) { xfs_warn(mp, "logbuf size must be greater than or equal to log stripe size"); - return XFS_ERROR(EINVAL); + return -EINVAL; } } else { /* Fail a mount if the logbuf is larger than 32K */ if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) { xfs_warn(mp, "logbuf size for version 1 logs must be 16K or 32K"); - return XFS_ERROR(EINVAL); + return -EINVAL; } } @@ -1355,7 +1356,7 @@ xfs_finish_flags( xfs_warn(mp, "Cannot mount a V5 filesystem as %s. 
%s is always enabled for V5 filesystems.", MNTOPT_NOATTR2, MNTOPT_ATTR2); - return XFS_ERROR(EINVAL); + return -EINVAL; } /* @@ -1372,7 +1373,7 @@ if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) { xfs_warn(mp, "cannot mount a read-only filesystem as read-write"); - return XFS_ERROR(EROFS); + return -EROFS; } if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) && @@ -1380,7 +1381,7 @@ !xfs_sb_version_has_pquotino(&mp->m_sb)) { xfs_warn(mp, "Super block does not support project and group quota together"); - return XFS_ERROR(EINVAL); + return -EINVAL; } return 0; @@ -1394,7 +1395,7 @@ xfs_fs_fill_super( { struct inode *root; struct xfs_mount *mp = NULL; - int flags = 0, error = ENOMEM; + int flags = 0, error = -ENOMEM; mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL); if (!mp) @@ -1428,11 +1429,11 @@ xfs_fs_fill_super( if (error) goto out_free_fsname; - error = -xfs_init_mount_workqueues(mp); + error = xfs_init_mount_workqueues(mp); if (error) goto out_close_devices; - error = -xfs_icsb_init_counters(mp); + error = xfs_icsb_init_counters(mp); if (error) goto out_destroy_workqueues; @@ -1474,12 +1475,12 @@ xfs_fs_fill_super( root = igrab(VFS_I(mp->m_rootip)); if (!root) { - error = ENOENT; + error = -ENOENT; goto out_unmount; } sb->s_root = d_make_root(root); if (!sb->s_root) { - error = ENOMEM; + error = -ENOMEM; goto out_unmount; } @@ -1499,7 +1500,7 @@ out_destroy_workqueues: xfs_free_fsname(mp); kfree(mp); out: - return -error; + return error; out_unmount: xfs_filestream_unmount(mp); @@ -1761,9 +1762,15 @@ init_xfs_fs(void) if (error) goto out_cleanup_procfs; + xfs_kset = kset_create_and_add("xfs", NULL, fs_kobj); + if (!xfs_kset) { + error = -ENOMEM; + goto out_sysctl_unregister; + } + error = xfs_qm_init(); if (error) - goto out_sysctl_unregister; + goto out_kset_unregister; error = register_filesystem(&xfs_fs_type); if (error) @@ -1772,6 +1779,8 @@ out_qm_exit: xfs_qm_exit(); + out_kset_unregister: + kset_unregister(xfs_kset); out_sysctl_unregister: xfs_sysctl_unregister(); out_cleanup_procfs: @@ -1793,6 +1802,7 @@ exit_xfs_fs(void) { xfs_qm_exit(); unregister_filesystem(&xfs_fs_type); + kset_unregister(xfs_kset); xfs_sysctl_unregister(); xfs_cleanup_procfs(); xfs_buf_terminate(); diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index bbe3d15a7904..2b830c2f322e 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -44,16 +44,6 @@ extern void xfs_qm_exit(void); # define XFS_REALTIME_STRING #endif -#if XFS_BIG_BLKNOS -# if XFS_BIG_INUMS -# define XFS_BIGFS_STRING "large block/inode numbers, " -# else -# define XFS_BIGFS_STRING "large block numbers, " -# endif -#else -# define XFS_BIGFS_STRING -#endif - #ifdef DEBUG # define XFS_DBG_STRING "debug" #else @@ -64,7 +54,6 @@ extern void xfs_qm_exit(void); #define XFS_BUILD_OPTIONS XFS_ACL_STRING \ XFS_SECURITY_STRING \ XFS_REALTIME_STRING \ - XFS_BIGFS_STRING \ XFS_DBG_STRING /* DBG must be last */ struct xfs_inode; @@ -76,8 +65,8 @@ extern __uint64_t xfs_max_file_offset(unsigned int); extern void xfs_flush_inodes(struct xfs_mount *mp); extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); -extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *); -extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *); +extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *, xfs_agnumber_t agcount); +extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *, xfs_agnumber_t agcount); extern const struct export_operations xfs_export_operations; extern const struct 
xattr_handler *xfs_xattr_handlers[]; diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index d69363c833e1..6a944a2cd36f 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -76,15 +76,15 @@ xfs_readlink_bmap( bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0, &xfs_symlink_buf_ops); if (!bp) - return XFS_ERROR(ENOMEM); + return -ENOMEM; error = bp->b_error; if (error) { xfs_buf_ioerror_alert(bp, __func__); xfs_buf_relse(bp); /* bad CRC means corrupted metadata */ - if (error == EFSBADCRC) - error = EFSCORRUPTED; + if (error == -EFSBADCRC) + error = -EFSCORRUPTED; goto out; } byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); @@ -95,7 +95,7 @@ xfs_readlink_bmap( if (xfs_sb_version_hascrc(&mp->m_sb)) { if (!xfs_symlink_hdr_ok(ip->i_ino, offset, byte_cnt, bp)) { - error = EFSCORRUPTED; + error = -EFSCORRUPTED; xfs_alert(mp, "symlink header does not match required off/len/owner (0x%x/Ox%x,0x%llx)", offset, byte_cnt, ip->i_ino); @@ -135,7 +135,7 @@ xfs_readlink( trace_xfs_readlink(ip); if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); + return -EIO; xfs_ilock(ip, XFS_ILOCK_SHARED); @@ -148,7 +148,7 @@ xfs_readlink( __func__, (unsigned long long) ip->i_ino, (long long) pathlen); ASSERT(0); - error = XFS_ERROR(EFSCORRUPTED); + error = -EFSCORRUPTED; goto out; } @@ -203,14 +203,14 @@ xfs_symlink( trace_xfs_symlink(dp, link_name); if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); + return -EIO; /* * Check component lengths of the target path name. */ pathlen = strlen(target_path); if (pathlen >= MAXPATHLEN) /* total string too long */ - return XFS_ERROR(ENAMETOOLONG); + return -ENAMETOOLONG; udqp = gdqp = NULL; prid = xfs_get_initial_prid(dp); @@ -238,7 +238,7 @@ xfs_symlink( fs_blocks = xfs_symlink_blocks(mp, pathlen); resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, resblks, 0); - if (error == ENOSPC && fs_blocks == 0) { + if (error == -ENOSPC && fs_blocks == 0) { resblks = 0; error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, 0, 0); } @@ -254,7 +254,7 @@ xfs_symlink( * Check whether the directory allows new symlinks or not. */ if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) { - error = XFS_ERROR(EPERM); + error = -EPERM; goto error_return; } @@ -284,7 +284,7 @@ xfs_symlink( error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0, prid, resblks > 0, &ip, NULL); if (error) { - if (error == ENOSPC) + if (error == -ENOSPC) goto error_return; goto error1; } @@ -348,7 +348,7 @@ xfs_symlink( bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, BTOBB(byte_cnt), 0); if (!bp) { - error = ENOMEM; + error = -ENOMEM; goto error2; } bp->b_ops = &xfs_symlink_buf_ops; @@ -489,7 +489,7 @@ xfs_inactive_symlink_rmt( XFS_FSB_TO_DADDR(mp, mval[i].br_startblock), XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0); if (!bp) { - error = ENOMEM; + error = -ENOMEM; goto error_bmap_cancel; } xfs_trans_binval(tp, bp); @@ -562,7 +562,7 @@ xfs_inactive_symlink( trace_xfs_inactive_symlink(ip); if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); + return -EIO; xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -580,7 +580,7 @@ xfs_inactive_symlink( __func__, (unsigned long long)ip->i_ino, pathlen); xfs_iunlock(ip, XFS_ILOCK_EXCL); ASSERT(0); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } if (ip->i_df.if_flags & XFS_IFINLINE) { diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c new file mode 100644 index 000000000000..9835139ce1ec --- /dev/null +++ b/fs/xfs/xfs_sysfs.c @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2014 Red Hat, Inc. 
+ * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "xfs.h" +#include "xfs_sysfs.h" +#include "xfs_log_format.h" +#include "xfs_log.h" +#include "xfs_log_priv.h" + +struct xfs_sysfs_attr { + struct attribute attr; + ssize_t (*show)(char *buf, void *data); + ssize_t (*store)(const char *buf, size_t count, void *data); +}; + +static inline struct xfs_sysfs_attr * +to_attr(struct attribute *attr) +{ + return container_of(attr, struct xfs_sysfs_attr, attr); +} + +#define XFS_SYSFS_ATTR_RW(name) \ + static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_RW(name) +#define XFS_SYSFS_ATTR_RO(name) \ + static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_RO(name) + +#define ATTR_LIST(name) &xfs_sysfs_attr_##name.attr + +/* + * xfs_mount kobject. This currently has no attributes and thus no need for show + * and store helpers. The mp kobject serves as the per-mount parent object that + * is identified by the fsname under sysfs. + */ + +struct kobj_type xfs_mp_ktype = { + .release = xfs_sysfs_release, +}; + +/* xlog */ + +STATIC ssize_t +log_head_lsn_show( + char *buf, + void *data) +{ + struct xlog *log = data; + int cycle; + int block; + + spin_lock(&log->l_icloglock); + cycle = log->l_curr_cycle; + block = log->l_curr_block; + spin_unlock(&log->l_icloglock); + + return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, block); +} +XFS_SYSFS_ATTR_RO(log_head_lsn); + +STATIC ssize_t +log_tail_lsn_show( + char *buf, + void *data) +{ + struct xlog *log = data; + int cycle; + int block; + + xlog_crack_atomic_lsn(&log->l_tail_lsn, &cycle, &block); + return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, block); +} +XFS_SYSFS_ATTR_RO(log_tail_lsn); + +STATIC ssize_t +reserve_grant_head_show( + char *buf, + void *data) +{ + struct xlog *log = data; + int cycle; + int bytes; + + xlog_crack_grant_head(&log->l_reserve_head.grant, &cycle, &bytes); + return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, bytes); +} +XFS_SYSFS_ATTR_RO(reserve_grant_head); + +STATIC ssize_t +write_grant_head_show( + char *buf, + void *data) +{ + struct xlog *log = data; + int cycle; + int bytes; + + xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &bytes); + return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, bytes); +} +XFS_SYSFS_ATTR_RO(write_grant_head); + +static struct attribute *xfs_log_attrs[] = { + ATTR_LIST(log_head_lsn), + ATTR_LIST(log_tail_lsn), + ATTR_LIST(reserve_grant_head), + ATTR_LIST(write_grant_head), + NULL, +}; + +static inline struct xlog * +to_xlog(struct kobject *kobject) +{ + struct xfs_kobj *kobj = to_kobj(kobject); + return container_of(kobj, struct xlog, l_kobj); +} + +STATIC ssize_t +xfs_log_show( + struct kobject *kobject, + struct attribute *attr, + char *buf) +{ + struct xlog *log = to_xlog(kobject); + struct xfs_sysfs_attr *xfs_attr = to_attr(attr); + + return xfs_attr->show ? 
xfs_attr->show(buf, log) : 0; +} + +STATIC ssize_t +xfs_log_store( + struct kobject *kobject, + struct attribute *attr, + const char *buf, + size_t count) +{ + struct xlog *log = to_xlog(kobject); + struct xfs_sysfs_attr *xfs_attr = to_attr(attr); + + return xfs_attr->store ? xfs_attr->store(buf, count, log) : 0; +} + +static struct sysfs_ops xfs_log_ops = { + .show = xfs_log_show, + .store = xfs_log_store, +}; + +struct kobj_type xfs_log_ktype = { + .release = xfs_sysfs_release, + .sysfs_ops = &xfs_log_ops, + .default_attrs = xfs_log_attrs, +}; diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h new file mode 100644 index 000000000000..54a2091183c0 --- /dev/null +++ b/fs/xfs/xfs_sysfs.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2014 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef __XFS_SYSFS_H__ +#define __XFS_SYSFS_H__ + +extern struct kobj_type xfs_mp_ktype; /* xfs_mount */ +extern struct kobj_type xfs_log_ktype; /* xlog */ + +static inline struct xfs_kobj * +to_kobj(struct kobject *kobject) +{ + return container_of(kobject, struct xfs_kobj, kobject); +} + +static inline void +xfs_sysfs_release(struct kobject *kobject) +{ + struct xfs_kobj *kobj = to_kobj(kobject); + complete(&kobj->complete); +} + +static inline int +xfs_sysfs_init( + struct xfs_kobj *kobj, + struct kobj_type *ktype, + struct xfs_kobj *parent_kobj, + const char *name) +{ + init_completion(&kobj->complete); + return kobject_init_and_add(&kobj->kobject, ktype, + &parent_kobj->kobject, "%s", name); +} + +static inline void +xfs_sysfs_del( + struct xfs_kobj *kobj) +{ + kobject_del(&kobj->kobject); + kobject_put(&kobj->kobject); + wait_for_completion(&kobj->complete); +} + +#endif /* __XFS_SYSFS_H__ */ diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index d03932564ccb..30e8e3410955 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -190,7 +190,7 @@ xfs_trans_reserve( -((int64_t)blocks), rsvd); if (error != 0) { current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); - return (XFS_ERROR(ENOSPC)); + return -ENOSPC; } tp->t_blk_res += blocks; } @@ -241,7 +241,7 @@ xfs_trans_reserve( error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FREXTENTS, -((int64_t)rtextents), rsvd); if (error) { - error = XFS_ERROR(ENOSPC); + error = -ENOSPC; goto undo_log; } tp->t_rtx_res += rtextents; @@ -874,7 +874,7 @@ xfs_trans_commit( goto out_unreserve; if (XFS_FORCED_SHUTDOWN(mp)) { - error = XFS_ERROR(EIO); + error = -EIO; goto out_unreserve; } @@ -917,7 +917,7 @@ out_unreserve: if (tp->t_ticket) { commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, log_flags); if (commit_lsn == -1 && !error) - error = XFS_ERROR(EIO); + error = -EIO; } current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); xfs_trans_free_items(tp, NULLCOMMITLSN, error ? 
XFS_TRANS_ABORT : 0); @@ -1024,7 +1024,7 @@ xfs_trans_roll( */ error = xfs_trans_commit(trans, 0); if (error) - return (error); + return error; trans = *tpp; diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index cb0f3a84cc68..859482f53b5a 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -762,7 +762,7 @@ xfs_trans_ail_init( ailp = kmem_zalloc(sizeof(struct xfs_ail), KM_MAYFAIL); if (!ailp) - return ENOMEM; + return -ENOMEM; ailp->xa_mount = mp; INIT_LIST_HEAD(&ailp->xa_ail); @@ -781,7 +781,7 @@ xfs_trans_ail_init( out_free_ailp: kmem_free(ailp); - return ENOMEM; + return -ENOMEM; } void diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index b8eef0549f3f..96c898e7ac9a 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -166,7 +166,7 @@ xfs_trans_get_buf_map( ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_recur++; trace_xfs_trans_get_buf_recur(bip); - return (bp); + return bp; } bp = xfs_buf_get_map(target, map, nmaps, flags); @@ -178,7 +178,7 @@ xfs_trans_get_buf_map( _xfs_trans_bjoin(tp, bp, 1); trace_xfs_trans_get_buf(bp->b_fspriv); - return (bp); + return bp; } /* @@ -201,9 +201,8 @@ xfs_trans_getsb(xfs_trans_t *tp, * Default to just trying to lock the superblock buffer * if tp is NULL. */ - if (tp == NULL) { - return (xfs_getsb(mp, flags)); - } + if (tp == NULL) + return xfs_getsb(mp, flags); /* * If the superblock buffer already has this transaction @@ -218,7 +217,7 @@ xfs_trans_getsb(xfs_trans_t *tp, ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_recur++; trace_xfs_trans_getsb_recur(bip); - return (bp); + return bp; } bp = xfs_getsb(mp, flags); @@ -227,7 +226,7 @@ xfs_trans_getsb(xfs_trans_t *tp, _xfs_trans_bjoin(tp, bp, 1); trace_xfs_trans_getsb(bp->b_fspriv); - return (bp); + return bp; } #ifdef DEBUG @@ -267,7 +266,7 @@ xfs_trans_read_buf_map( bp = xfs_buf_read_map(target, map, nmaps, flags, ops); if (!bp) return (flags & XBF_TRYLOCK) ? - EAGAIN : XFS_ERROR(ENOMEM); + -EAGAIN : -ENOMEM; if (bp->b_error) { error = bp->b_error; @@ -277,8 +276,8 @@ xfs_trans_read_buf_map( xfs_buf_relse(bp); /* bad CRC means corrupted metadata */ - if (error == EFSBADCRC) - error = EFSCORRUPTED; + if (error == -EFSBADCRC) + error = -EFSCORRUPTED; return error; } #ifdef DEBUG @@ -287,7 +286,7 @@ xfs_trans_read_buf_map( if (((xfs_req_num++) % xfs_error_mod) == 0) { xfs_buf_relse(bp); xfs_debug(mp, "Returning error!"); - return XFS_ERROR(EIO); + return -EIO; } } } @@ -343,8 +342,8 @@ xfs_trans_read_buf_map( xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR); /* bad CRC means corrupted metadata */ - if (error == EFSBADCRC) - error = EFSCORRUPTED; + if (error == -EFSBADCRC) + error = -EFSCORRUPTED; return error; } } @@ -355,7 +354,7 @@ xfs_trans_read_buf_map( if (XFS_FORCED_SHUTDOWN(mp)) { trace_xfs_trans_read_buf_shut(bp, _RET_IP_); *bpp = NULL; - return XFS_ERROR(EIO); + return -EIO; } @@ -372,7 +371,7 @@ xfs_trans_read_buf_map( if (bp == NULL) { *bpp = NULL; return (flags & XBF_TRYLOCK) ? 
- 0 : XFS_ERROR(ENOMEM); + 0 : -ENOMEM; } if (bp->b_error) { error = bp->b_error; @@ -384,8 +383,8 @@ xfs_trans_read_buf_map( xfs_buf_relse(bp); /* bad CRC means corrupted metadata */ - if (error == EFSBADCRC) - error = EFSCORRUPTED; + if (error == -EFSBADCRC) + error = -EFSCORRUPTED; return error; } #ifdef DEBUG @@ -396,7 +395,7 @@ xfs_trans_read_buf_map( SHUTDOWN_META_IO_ERROR); xfs_buf_relse(bp); xfs_debug(mp, "Returning trans error!"); - return XFS_ERROR(EIO); + return -EIO; } } } @@ -414,7 +413,7 @@ shutdown_abort: trace_xfs_trans_read_buf_shut(bp, _RET_IP_); xfs_buf_relse(bp); *bpp = NULL; - return XFS_ERROR(EIO); + return -EIO; } /* diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 41172861e857..846e061c2e98 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -722,8 +722,8 @@ xfs_trans_dqresv( error_return: xfs_dqunlock(dqp); if (flags & XFS_QMOPT_ENOSPC) - return ENOSPC; - return EDQUOT; + return -ENOSPC; + return -EDQUOT; } diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h index 65c6e6650b1a..b79dc66b2ecd 100644 --- a/fs/xfs/xfs_types.h +++ b/fs/xfs/xfs_types.h @@ -38,43 +38,18 @@ typedef __int32_t xfs_tid_t; /* transaction identifier */ typedef __uint32_t xfs_dablk_t; /* dir/attr block number (in file) */ typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */ -/* - * These types are 64 bits on disk but are either 32 or 64 bits in memory. - * Disk based types: - */ -typedef __uint64_t xfs_dfsbno_t; /* blockno in filesystem (agno|agbno) */ -typedef __uint64_t xfs_drfsbno_t; /* blockno in filesystem (raw) */ -typedef __uint64_t xfs_drtbno_t; /* extent (block) in realtime area */ -typedef __uint64_t xfs_dfiloff_t; /* block number in a file */ -typedef __uint64_t xfs_dfilblks_t; /* number of blocks in a file */ - -/* - * Memory based types are conditional. - */ -#if XFS_BIG_BLKNOS typedef __uint64_t xfs_fsblock_t; /* blockno in filesystem (agno|agbno) */ typedef __uint64_t xfs_rfsblock_t; /* blockno in filesystem (raw) */ typedef __uint64_t xfs_rtblock_t; /* extent (block) in realtime area */ -typedef __int64_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */ -#else -typedef __uint32_t xfs_fsblock_t; /* blockno in filesystem (agno|agbno) */ -typedef __uint32_t xfs_rfsblock_t; /* blockno in filesystem (raw) */ -typedef __uint32_t xfs_rtblock_t; /* extent (block) in realtime area */ -typedef __int32_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */ -#endif typedef __uint64_t xfs_fileoff_t; /* block number in a file */ -typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */ typedef __uint64_t xfs_filblks_t; /* number of blocks in a file */ +typedef __int64_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */ +typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */ /* * Null values for the types. */ -#define NULLDFSBNO ((xfs_dfsbno_t)-1) -#define NULLDRFSBNO ((xfs_drfsbno_t)-1) -#define NULLDRTBNO ((xfs_drtbno_t)-1) -#define NULLDFILOFF ((xfs_dfiloff_t)-1) - #define NULLFSBLOCK ((xfs_fsblock_t)-1) #define NULLRFSBLOCK ((xfs_rfsblock_t)-1) #define NULLRTBLOCK ((xfs_rtblock_t)-1) diff --git a/fs/xfs/xfs_vnode.h b/fs/xfs/xfs_vnode.h deleted file mode 100644 index e8a77383c0d5..000000000000 --- a/fs/xfs/xfs_vnode.h +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (c) 2000-2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_VNODE_H__ -#define __XFS_VNODE_H__ - -#include "xfs_fs.h" - -struct file; -struct xfs_inode; -struct attrlist_cursor_kern; - -/* - * Flags for read/write calls - same values as IRIX - */ -#define IO_ISDIRECT 0x00004 /* bypass page cache */ -#define IO_INVIS 0x00020 /* don't update inode timestamps */ - -#define XFS_IO_FLAGS \ - { IO_ISDIRECT, "DIRECT" }, \ - { IO_INVIS, "INVIS"} - -/* - * Some useful predicates. - */ -#define VN_MAPPED(vp) mapping_mapped(vp->i_mapping) -#define VN_CACHED(vp) (vp->i_mapping->nrpages) -#define VN_DIRTY(vp) mapping_tagged(vp->i_mapping, \ - PAGECACHE_TAG_DIRTY) - - -#endif /* __XFS_VNODE_H__ */ diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 78ed92a46fdd..93455b998041 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -49,7 +49,7 @@ xfs_xattr_get(struct dentry *dentry, const char *name, value = NULL; } - error = -xfs_attr_get(ip, (unsigned char *)name, value, &asize, xflags); + error = xfs_attr_get(ip, (unsigned char *)name, value, &asize, xflags); if (error) return error; return asize; @@ -71,8 +71,8 @@ xfs_xattr_set(struct dentry *dentry, const char *name, const void *value, xflags |= ATTR_REPLACE; if (!value) - return -xfs_attr_remove(ip, (unsigned char *)name, xflags); - return -xfs_attr_set(ip, (unsigned char *)name, + return xfs_attr_remove(ip, (unsigned char *)name, xflags); + return xfs_attr_set(ip, (unsigned char *)name, (void *)value, size, xflags); }
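
The xfs_sysfs.c/xfs_sysfs.h additions above hang a per-mount kobject under sysfs and expose four read-only xlog attributes (log_head_lsn, log_tail_lsn, reserve_grant_head, write_grant_head), each formatted by snprintf() as a "cycle:block" or "cycle:bytes" pair. A minimal userspace sketch of consuming them follows, assuming the attributes surface under /sys/fs/xfs/<devname>/log/ and using the placeholder mount name "sda1"; both the path layout and the name are assumptions for illustration, not part of the patch:

/* Hypothetical reader for the log attributes added by xfs_sysfs.c.
 * The sysfs path layout and the "sda1" mount name are assumptions for
 * illustration; substitute the short device name of a real XFS mount. */
#include <stdio.h>

int main(void)
{
	static const char *attrs[] = {
		"log_head_lsn", "log_tail_lsn",
		"reserve_grant_head", "write_grant_head",
	};
	char path[128], line[64];

	for (size_t i = 0; i < sizeof(attrs) / sizeof(attrs[0]); i++) {
		snprintf(path, sizeof(path),
			 "/sys/fs/xfs/sda1/log/%s", attrs[i]);
		FILE *f = fopen(path, "r");
		if (!f) {
			perror(path);	/* not mounted, or wrong name */
			continue;
		}
		/* each attribute is a single "cycle:block\n" line */
		if (fgets(line, sizeof(line), f))
			printf("%-20s %s", attrs[i], line);
		fclose(f);
	}
	return 0;
}

Note the teardown discipline the new header enforces: xfs_sysfs_release() signals a completion from the kobject release callback, and xfs_sysfs_del() does kobject_del()/kobject_put() and then blocks in wait_for_completion(), so an unmount cannot free the backing structure while a reader still holds one of these attribute files open.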