diff options
Diffstat (limited to 'fs')
196 files changed, 3269 insertions, 2653 deletions
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index a16b0ff497ca..d8223209d4b1 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -832,6 +832,7 @@ static void v9fs_mmap_vm_close(struct vm_area_struct *vma) static const struct vm_operations_struct v9fs_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = v9fs_vm_page_mkwrite, .remap_pages = generic_file_remap_pages, }; @@ -839,6 +840,7 @@ static const struct vm_operations_struct v9fs_file_vm_ops = { static const struct vm_operations_struct v9fs_mmap_file_vm_ops = { .close = v9fs_mmap_vm_close, .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = v9fs_vm_page_mkwrite, .remap_pages = generic_file_remap_pages, }; diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 952aeb048349..9852bdf34d76 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -266,7 +266,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -static int init_inodecache(void) +static int __init init_inodecache(void) { adfs_inode_cachep = kmem_cache_create("adfs_inode_cache", sizeof(struct adfs_inode_info), diff --git a/fs/affs/affs.h b/fs/affs/affs.h index 3952121f2f28..25b23b1e7f22 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -5,14 +5,6 @@ #include <linux/mutex.h> #include <linux/workqueue.h> -/* AmigaOS allows file names with up to 30 characters length. - * Names longer than that will be silently truncated. If you - * want to disallow this, comment out the following #define. - * Creating filesystem objects with longer names will then - * result in an error (ENAMETOOLONG). - */ -/*#define AFFS_NO_TRUNCATE */ - /* Ugly macros make the code more pretty. */ #define GET_END_PTR(st,p,sz) ((st *)((char *)(p)+((sz)-sizeof(st)))) @@ -28,7 +20,6 @@ #define AFFS_CACHE_SIZE PAGE_SIZE -#define AFFS_MAX_PREALLOC 32 #define AFFS_LC_SIZE (AFFS_CACHE_SIZE/sizeof(u32)/2) #define AFFS_AC_SIZE (AFFS_CACHE_SIZE/sizeof(struct affs_ext_key)/2) #define AFFS_AC_MASK (AFFS_AC_SIZE-1) @@ -118,6 +109,7 @@ struct affs_sb_info { #define SF_OFS 0x0200 /* Old filesystem */ #define SF_PREFIX 0x0400 /* Buffer for prefix is allocated */ #define SF_VERBOSE 0x0800 /* Talk about fs when mounting */ +#define SF_NO_TRUNCATE 0x1000 /* Don't truncate filenames */ /* short cut to get to the affs specific sb data */ static inline struct affs_sb_info *AFFS_SB(struct super_block *sb) @@ -137,9 +129,13 @@ extern void affs_fix_checksum(struct super_block *sb, struct buffer_head *bh); extern void secs_to_datestamp(time_t secs, struct affs_date *ds); extern umode_t prot_to_mode(u32 prot); extern void mode_to_prot(struct inode *inode); -extern void affs_error(struct super_block *sb, const char *function, const char *fmt, ...); -extern void affs_warning(struct super_block *sb, const char *function, const char *fmt, ...); -extern int affs_check_name(const unsigned char *name, int len); +extern void affs_error(struct super_block *sb, const char *function, + const char *fmt, ...); +extern void affs_warning(struct super_block *sb, const char *function, + const char *fmt, ...); +extern bool affs_nofilenametruncate(const struct dentry *dentry); +extern int affs_check_name(const unsigned char *name, int len, + bool notruncate); extern int affs_copy_name(unsigned char *bstr, struct dentry *dentry); /* bitmap. c */ diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index d9a43674cb94..533a322c41c0 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -471,20 +471,27 @@ affs_warning(struct super_block *sb, const char *function, const char *fmt, ...) function,ErrorBuffer); } +bool +affs_nofilenametruncate(const struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + return AFFS_SB(inode->i_sb)->s_flags & SF_NO_TRUNCATE; + +} + /* Check if the name is valid for a affs object. */ int -affs_check_name(const unsigned char *name, int len) +affs_check_name(const unsigned char *name, int len, bool notruncate) { int i; - if (len > 30) -#ifdef AFFS_NO_TRUNCATE - return -ENAMETOOLONG; -#else - len = 30; -#endif - + if (len > 30) { + if (notruncate) + return -ENAMETOOLONG; + else + len = 30; + } for (i = 0; i < len; i++) { if (name[i] < ' ' || name[i] == ':' || (name[i] > 0x7e && name[i] < 0xa0)) diff --git a/fs/affs/dir.c b/fs/affs/dir.c index f1eba8c3644e..cbbda476a805 100644 --- a/fs/affs/dir.c +++ b/fs/affs/dir.c @@ -52,8 +52,10 @@ affs_readdir(struct file *file, struct dir_context *ctx) int hash_pos; int chain_pos; u32 ino; + int error = 0; - pr_debug("AFFS: readdir(ino=%lu,f_pos=%lx)\n",inode->i_ino,(unsigned long)ctx->pos); + pr_debug("AFFS: readdir(ino=%lu,f_pos=%lx)\n", + inode->i_ino, (unsigned long)ctx->pos); if (ctx->pos < 2) { file->private_data = (void *)0; @@ -72,7 +74,7 @@ affs_readdir(struct file *file, struct dir_context *ctx) } dir_bh = affs_bread(sb, inode->i_ino); if (!dir_bh) - goto readdir_out; + goto out_unlock_dir; /* If the directory hasn't changed since the last call to readdir(), * we can jump directly to where we left off. @@ -88,7 +90,8 @@ affs_readdir(struct file *file, struct dir_context *ctx) fh_bh = affs_bread(sb, ino); if (!fh_bh) { affs_error(sb, "readdir","Cannot read block %d", i); - return -EIO; + error = -EIO; + goto out_brelse_dir; } ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain); affs_brelse(fh_bh); @@ -107,29 +110,34 @@ inside: do { fh_bh = affs_bread(sb, ino); if (!fh_bh) { - affs_error(sb, "readdir","Cannot read block %d", ino); + affs_error(sb, "readdir", + "Cannot read block %d", ino); break; } namelen = min(AFFS_TAIL(sb, fh_bh)->name[0], (u8)30); name = AFFS_TAIL(sb, fh_bh)->name + 1; - pr_debug("AFFS: readdir(): filldir(\"%.*s\", ino=%u), hash=%d, f_pos=%x\n", + pr_debug("AFFS: readdir(): dir_emit(\"%.*s\", " + "ino=%u), hash=%d, f_pos=%x\n", namelen, name, ino, hash_pos, (u32)ctx->pos); + if (!dir_emit(ctx, name, namelen, ino, DT_UNKNOWN)) - goto readdir_done; + goto done; ctx->pos++; ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain); affs_brelse(fh_bh); fh_bh = NULL; } while (ino); } -readdir_done: +done: file->f_version = inode->i_version; file->private_data = (void *)(long)ino; + affs_brelse(fh_bh); -readdir_out: +out_brelse_dir: affs_brelse(dir_bh); - affs_brelse(fh_bh); + +out_unlock_dir: affs_unlock_dir(inode); - return 0; + return error; } diff --git a/fs/affs/namei.c b/fs/affs/namei.c index c36cbb4537a2..6dae1ccd176d 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -60,13 +60,13 @@ affs_get_toupper(struct super_block *sb) * Note: the dentry argument is the parent dentry. */ static inline int -__affs_hash_dentry(struct qstr *qstr, toupper_t toupper) +__affs_hash_dentry(struct qstr *qstr, toupper_t toupper, bool notruncate) { const u8 *name = qstr->name; unsigned long hash; int i; - i = affs_check_name(qstr->name, qstr->len); + i = affs_check_name(qstr->name, qstr->len, notruncate); if (i) return i; @@ -82,16 +82,22 @@ __affs_hash_dentry(struct qstr *qstr, toupper_t toupper) static int affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr) { - return __affs_hash_dentry(qstr, affs_toupper); + return __affs_hash_dentry(qstr, affs_toupper, + affs_nofilenametruncate(dentry)); + } + static int affs_intl_hash_dentry(const struct dentry *dentry, struct qstr *qstr) { - return __affs_hash_dentry(qstr, affs_intl_toupper); + return __affs_hash_dentry(qstr, affs_intl_toupper, + affs_nofilenametruncate(dentry)); + } static inline int __affs_compare_dentry(unsigned int len, - const char *str, const struct qstr *name, toupper_t toupper) + const char *str, const struct qstr *name, toupper_t toupper, + bool notruncate) { const u8 *aname = str; const u8 *bname = name->name; @@ -101,7 +107,7 @@ static inline int __affs_compare_dentry(unsigned int len, * must be valid. 'name' must be validated first. */ - if (affs_check_name(name->name, name->len)) + if (affs_check_name(name->name, name->len, notruncate)) return 1; /* @@ -126,13 +132,18 @@ static int affs_compare_dentry(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { - return __affs_compare_dentry(len, str, name, affs_toupper); + + return __affs_compare_dentry(len, str, name, affs_toupper, + affs_nofilenametruncate(parent)); } + static int affs_intl_compare_dentry(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { - return __affs_compare_dentry(len, str, name, affs_intl_toupper); + return __affs_compare_dentry(len, str, name, affs_intl_toupper, + affs_nofilenametruncate(parent)); + } /* @@ -411,7 +422,10 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry, (u32)old_dir->i_ino, (int)old_dentry->d_name.len, old_dentry->d_name.name, (u32)new_dir->i_ino, (int)new_dentry->d_name.len, new_dentry->d_name.name); - retval = affs_check_name(new_dentry->d_name.name,new_dentry->d_name.len); + retval = affs_check_name(new_dentry->d_name.name, + new_dentry->d_name.len, + affs_nofilenametruncate(old_dentry)); + if (retval) return retval; diff --git a/fs/affs/super.c b/fs/affs/super.c index 307453086c3f..6d589f28bf9b 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -128,7 +128,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -static int init_inodecache(void) +static int __init init_inodecache(void) { affs_inode_cachep = kmem_cache_create("affs_inode_cache", sizeof(struct affs_inode_info), @@ -163,7 +163,7 @@ static const struct super_operations affs_sops = { }; enum { - Opt_bs, Opt_mode, Opt_mufs, Opt_prefix, Opt_protect, + Opt_bs, Opt_mode, Opt_mufs, Opt_notruncate, Opt_prefix, Opt_protect, Opt_reserved, Opt_root, Opt_setgid, Opt_setuid, Opt_verbose, Opt_volume, Opt_ignore, Opt_err, }; @@ -172,6 +172,7 @@ static const match_table_t tokens = { {Opt_bs, "bs=%u"}, {Opt_mode, "mode=%o"}, {Opt_mufs, "mufs"}, + {Opt_notruncate, "nofilenametruncate"}, {Opt_prefix, "prefix=%s"}, {Opt_protect, "protect"}, {Opt_reserved, "reserved=%u"}, @@ -233,6 +234,9 @@ parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved, case Opt_mufs: *mount_opts |= SF_MUFS; break; + case Opt_notruncate: + *mount_opts |= SF_NO_TRUNCATE; + break; case Opt_prefix: *prefix = match_strdup(&args[0]); if (!*prefix) @@ -52,7 +52,8 @@ struct aio_ring { unsigned id; /* kernel internal index number */ unsigned nr; /* number of io_events */ - unsigned head; + unsigned head; /* Written to by userland or under ring_lock + * mutex by aio_read_events_ring(). */ unsigned tail; unsigned magic; @@ -243,6 +244,11 @@ static void aio_free_ring(struct kioctx *ctx) { int i; + /* Disconnect the kiotx from the ring file. This prevents future + * accesses to the kioctx from page migration. + */ + put_aio_ring_file(ctx); + for (i = 0; i < ctx->nr_pages; i++) { struct page *page; pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i, @@ -254,8 +260,6 @@ static void aio_free_ring(struct kioctx *ctx) put_page(page); } - put_aio_ring_file(ctx); - if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) { kfree(ctx->ring_pages); ctx->ring_pages = NULL; @@ -283,29 +287,38 @@ static int aio_migratepage(struct address_space *mapping, struct page *new, { struct kioctx *ctx; unsigned long flags; + pgoff_t idx; int rc; rc = 0; - /* Make sure the old page hasn't already been changed */ + /* mapping->private_lock here protects against the kioctx teardown. */ spin_lock(&mapping->private_lock); ctx = mapping->private_data; - if (ctx) { - pgoff_t idx; - spin_lock_irqsave(&ctx->completion_lock, flags); - idx = old->index; - if (idx < (pgoff_t)ctx->nr_pages) { - if (ctx->ring_pages[idx] != old) - rc = -EAGAIN; - } else - rc = -EINVAL; - spin_unlock_irqrestore(&ctx->completion_lock, flags); + if (!ctx) { + rc = -EINVAL; + goto out; + } + + /* The ring_lock mutex. The prevents aio_read_events() from writing + * to the ring's head, and prevents page migration from mucking in + * a partially initialized kiotx. + */ + if (!mutex_trylock(&ctx->ring_lock)) { + rc = -EAGAIN; + goto out; + } + + idx = old->index; + if (idx < (pgoff_t)ctx->nr_pages) { + /* Make sure the old page hasn't already been changed */ + if (ctx->ring_pages[idx] != old) + rc = -EAGAIN; } else rc = -EINVAL; - spin_unlock(&mapping->private_lock); if (rc != 0) - return rc; + goto out_unlock; /* Writeback must be complete */ BUG_ON(PageWriteback(old)); @@ -314,38 +327,26 @@ static int aio_migratepage(struct address_space *mapping, struct page *new, rc = migrate_page_move_mapping(mapping, new, old, NULL, mode, 1); if (rc != MIGRATEPAGE_SUCCESS) { put_page(new); - return rc; + goto out_unlock; } - /* We can potentially race against kioctx teardown here. Use the - * address_space's private data lock to protect the mapping's - * private_data. + /* Take completion_lock to prevent other writes to the ring buffer + * while the old page is copied to the new. This prevents new + * events from being lost. */ - spin_lock(&mapping->private_lock); - ctx = mapping->private_data; - if (ctx) { - pgoff_t idx; - spin_lock_irqsave(&ctx->completion_lock, flags); - migrate_page_copy(new, old); - idx = old->index; - if (idx < (pgoff_t)ctx->nr_pages) { - /* And only do the move if things haven't changed */ - if (ctx->ring_pages[idx] == old) - ctx->ring_pages[idx] = new; - else - rc = -EAGAIN; - } else - rc = -EINVAL; - spin_unlock_irqrestore(&ctx->completion_lock, flags); - } else - rc = -EBUSY; - spin_unlock(&mapping->private_lock); + spin_lock_irqsave(&ctx->completion_lock, flags); + migrate_page_copy(new, old); + BUG_ON(ctx->ring_pages[idx] != old); + ctx->ring_pages[idx] = new; + spin_unlock_irqrestore(&ctx->completion_lock, flags); - if (rc == MIGRATEPAGE_SUCCESS) - put_page(old); - else - put_page(new); + /* The old page is no longer accessible. */ + put_page(old); +out_unlock: + mutex_unlock(&ctx->ring_lock); +out: + spin_unlock(&mapping->private_lock); return rc; } #endif @@ -380,7 +381,7 @@ static int aio_setup_ring(struct kioctx *ctx) file = aio_private_file(ctx, nr_pages); if (IS_ERR(file)) { ctx->aio_ring_file = NULL; - return -EAGAIN; + return -ENOMEM; } ctx->aio_ring_file = file; @@ -415,7 +416,7 @@ static int aio_setup_ring(struct kioctx *ctx) if (unlikely(i != nr_pages)) { aio_free_ring(ctx); - return -EAGAIN; + return -ENOMEM; } ctx->mmap_size = nr_pages * PAGE_SIZE; @@ -429,7 +430,7 @@ static int aio_setup_ring(struct kioctx *ctx) if (IS_ERR((void *)ctx->mmap_base)) { ctx->mmap_size = 0; aio_free_ring(ctx); - return -EAGAIN; + return -ENOMEM; } pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base); @@ -556,6 +557,10 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) rcu_read_unlock(); spin_unlock(&mm->ioctx_lock); + /* While kioctx setup is in progress, + * we are protected from page migration + * changes ring_pages by ->ring_lock. + */ ring = kmap_atomic(ctx->ring_pages[0]); ring->id = ctx->id; kunmap_atomic(ring); @@ -640,24 +645,28 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) ctx->max_reqs = nr_events; - if (percpu_ref_init(&ctx->users, free_ioctx_users)) - goto err; - - if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs)) - goto err; - spin_lock_init(&ctx->ctx_lock); spin_lock_init(&ctx->completion_lock); mutex_init(&ctx->ring_lock); + /* Protect against page migration throughout kiotx setup by keeping + * the ring_lock mutex held until setup is complete. */ + mutex_lock(&ctx->ring_lock); init_waitqueue_head(&ctx->wait); INIT_LIST_HEAD(&ctx->active_reqs); + if (percpu_ref_init(&ctx->users, free_ioctx_users)) + goto err; + + if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs)) + goto err; + ctx->cpu = alloc_percpu(struct kioctx_cpu); if (!ctx->cpu) goto err; - if (aio_setup_ring(ctx) < 0) + err = aio_setup_ring(ctx); + if (err < 0) goto err; atomic_set(&ctx->reqs_available, ctx->nr_events - 1); @@ -683,6 +692,9 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) if (err) goto err_cleanup; + /* Release the ring_lock mutex now that all setup is complete. */ + mutex_unlock(&ctx->ring_lock); + pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", ctx, ctx->user_id, mm, ctx->nr_events); return ctx; @@ -692,6 +704,7 @@ err_cleanup: err_ctx: aio_free_ring(ctx); err: + mutex_unlock(&ctx->ring_lock); free_percpu(ctx->cpu); free_percpu(ctx->reqs.pcpu_count); free_percpu(ctx->users.pcpu_count); @@ -1024,6 +1037,7 @@ static long aio_read_events_ring(struct kioctx *ctx, mutex_lock(&ctx->ring_lock); + /* Access to ->ring_pages here is protected by ctx->ring_lock. */ ring = kmap_atomic(ctx->ring_pages[0]); head = ring->head; tail = ring->tail; diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 3182c0e68b42..232e03d4780d 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -103,6 +103,9 @@ static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *i if (tmp.size < sizeof(tmp)) return ERR_PTR(-EINVAL); + if (tmp.size > (PATH_MAX + sizeof(tmp))) + return ERR_PTR(-ENAMETOOLONG); + return memdup_user(in, tmp.size); } diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 29aa5cf6639b..7041ac35ace8 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -266,7 +266,7 @@ static void init_once(void *foo) inode_init_once(&bi->vfs_inode); } -static int init_inodecache(void) +static int __init init_inodecache(void) { bfs_inode_cachep = kmem_cache_create("bfs_inode_cache", sizeof(struct bfs_inode_info), diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 0f59799fa105..aa3cb626671e 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -584,7 +584,6 @@ static int load_elf_binary(struct linux_binprm *bprm) unsigned long start_code, end_code, start_data, end_data; unsigned long reloc_func_desc __maybe_unused = 0; int executable_stack = EXSTACK_DEFAULT; - unsigned long def_flags = 0; struct pt_regs *regs = current_pt_regs(); struct { struct elfhdr elf_ex; @@ -724,9 +723,6 @@ static int load_elf_binary(struct linux_binprm *bprm) if (retval) goto out_free_dentry; - /* OK, This is the point of no return */ - current->mm->def_flags = def_flags; - /* Do this immediately, since STACK_TOP as used in setup_arg_pages may depend on the personality. */ SET_PERSONALITY(loc->elf_ex); diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 29696b78d1f4..1c2ce0c87711 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -182,6 +182,9 @@ static int bdev_integrity_enabled(struct block_device *bdev, int rw) */ int bio_integrity_enabled(struct bio *bio) { + if (!bio_is_rw(bio)) + return 0; + /* Already protected? */ if (bio_integrity(bio)) return 0; @@ -309,10 +312,9 @@ static int bio_integrity_generate_verify(struct bio *bio, int operate) { struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); struct blk_integrity_exchg bix; - struct bio_vec bv; - struct bvec_iter iter; + struct bio_vec *bv; sector_t sector; - unsigned int sectors, ret = 0; + unsigned int sectors, ret = 0, i; void *prot_buf = bio->bi_integrity->bip_buf; if (operate) @@ -323,16 +325,16 @@ static int bio_integrity_generate_verify(struct bio *bio, int operate) bix.disk_name = bio->bi_bdev->bd_disk->disk_name; bix.sector_size = bi->sector_size; - bio_for_each_segment(bv, bio, iter) { - void *kaddr = kmap_atomic(bv.bv_page); - bix.data_buf = kaddr + bv.bv_offset; - bix.data_size = bv.bv_len; + bio_for_each_segment_all(bv, bio, i) { + void *kaddr = kmap_atomic(bv->bv_page); + bix.data_buf = kaddr + bv->bv_offset; + bix.data_size = bv->bv_len; bix.prot_buf = prot_buf; bix.sector = sector; - if (operate) { + if (operate) bi->generate_fn(&bix); - } else { + else { ret = bi->verify_fn(&bix); if (ret) { kunmap_atomic(kaddr); @@ -340,7 +342,7 @@ static int bio_integrity_generate_verify(struct bio *bio, int operate) } } - sectors = bv.bv_len / bi->sector_size; + sectors = bv->bv_len / bi->sector_size; sector += sectors; prot_buf += sectors * bi->tuple_size; @@ -1002,7 +1002,7 @@ struct bio_map_data { }; static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio, - struct sg_iovec *iov, int iov_count, + const struct sg_iovec *iov, int iov_count, int is_our_pages) { memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count); @@ -1022,7 +1022,7 @@ static struct bio_map_data *bio_alloc_map_data(int nr_segs, sizeof(struct sg_iovec) * iov_count, gfp_mask); } -static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count, +static int __bio_copy_iov(struct bio *bio, const struct sg_iovec *iov, int iov_count, int to_user, int from_user, int do_free_page) { int ret = 0, i; @@ -1120,7 +1120,7 @@ EXPORT_SYMBOL(bio_uncopy_user); */ struct bio *bio_copy_user_iov(struct request_queue *q, struct rq_map_data *map_data, - struct sg_iovec *iov, int iov_count, + const struct sg_iovec *iov, int iov_count, int write_to_vm, gfp_t gfp_mask) { struct bio_map_data *bmd; @@ -1259,7 +1259,7 @@ EXPORT_SYMBOL(bio_copy_user); static struct bio *__bio_map_user_iov(struct request_queue *q, struct block_device *bdev, - struct sg_iovec *iov, int iov_count, + const struct sg_iovec *iov, int iov_count, int write_to_vm, gfp_t gfp_mask) { int i, j; @@ -1407,7 +1407,7 @@ EXPORT_SYMBOL(bio_map_user); * device. Returns an error pointer in case of error. */ struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev, - struct sg_iovec *iov, int iov_count, + const struct sg_iovec *iov, int iov_count, int write_to_vm, gfp_t gfp_mask) { struct bio *bio; diff --git a/fs/block_dev.c b/fs/block_dev.c index ba0d2b05bb78..552a8d13bc32 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1518,7 +1518,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, BUG_ON(iocb->ki_pos != pos); blk_start_plug(&plug); - ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); + ret = __generic_file_aio_write(iocb, iov, nr_segs); if (ret > 0) { ssize_t err; diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index ecb5832c0967..5a201d81049c 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -323,6 +323,8 @@ void btrfs_destroy_workqueue(struct btrfs_workqueue *wq) void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max) { + if (!wq) + return; wq->normal->max_active = max; if (wq->high) wq->high->max_active = max; diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index aad7201ad11b..10db21fa0926 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -330,7 +330,10 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, goto out; } - root_level = btrfs_old_root_level(root, time_seq); + if (path->search_commit_root) + root_level = btrfs_header_level(root->commit_root); + else + root_level = btrfs_old_root_level(root, time_seq); if (root_level + 1 == level) { srcu_read_unlock(&fs_info->subvol_srcu, index); @@ -1099,9 +1102,9 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, * * returns 0 on success, < 0 on error. */ -int btrfs_find_all_roots(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **roots) +static int __btrfs_find_all_roots(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 time_seq, struct ulist **roots) { struct ulist *tmp; struct ulist_node *node = NULL; @@ -1137,6 +1140,20 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, return 0; } +int btrfs_find_all_roots(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 time_seq, struct ulist **roots) +{ + int ret; + + if (!trans) + down_read(&fs_info->commit_root_sem); + ret = __btrfs_find_all_roots(trans, fs_info, bytenr, time_seq, roots); + if (!trans) + up_read(&fs_info->commit_root_sem); + return ret; +} + /* * this makes the path point to (inum INODE_ITEM ioff) */ @@ -1516,6 +1533,8 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, if (IS_ERR(trans)) return PTR_ERR(trans); btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem); + } else { + down_read(&fs_info->commit_root_sem); } ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, @@ -1526,8 +1545,8 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, ULIST_ITER_INIT(&ref_uiter); while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) { - ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, - tree_mod_seq_elem.seq, &roots); + ret = __btrfs_find_all_roots(trans, fs_info, ref_node->val, + tree_mod_seq_elem.seq, &roots); if (ret) break; ULIST_ITER_INIT(&root_uiter); @@ -1549,6 +1568,8 @@ out: if (!search_commit_root) { btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); btrfs_end_transaction(trans, fs_info->extent_root); + } else { + up_read(&fs_info->commit_root_sem); } return ret; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 88d1b1eedc9c..1bcfcdb23cf4 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2769,9 +2769,13 @@ again: * the commit roots are read only * so we always do read locks */ + if (p->need_commit_sem) + down_read(&root->fs_info->commit_root_sem); b = root->commit_root; extent_buffer_get(b); level = btrfs_header_level(b); + if (p->need_commit_sem) + up_read(&root->fs_info->commit_root_sem); if (!p->skip_locking) btrfs_tree_read_lock(b); } else { @@ -5360,7 +5364,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root, { int ret; int cmp; - struct btrfs_trans_handle *trans = NULL; struct btrfs_path *left_path = NULL; struct btrfs_path *right_path = NULL; struct btrfs_key left_key; @@ -5378,9 +5381,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root, u64 right_blockptr; u64 left_gen; u64 right_gen; - u64 left_start_ctransid; - u64 right_start_ctransid; - u64 ctransid; left_path = btrfs_alloc_path(); if (!left_path) { @@ -5404,21 +5404,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root, right_path->search_commit_root = 1; right_path->skip_locking = 1; - spin_lock(&left_root->root_item_lock); - left_start_ctransid = btrfs_root_ctransid(&left_root->root_item); - spin_unlock(&left_root->root_item_lock); - - spin_lock(&right_root->root_item_lock); - right_start_ctransid = btrfs_root_ctransid(&right_root->root_item); - spin_unlock(&right_root->root_item_lock); - - trans = btrfs_join_transaction(left_root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - trans = NULL; - goto out; - } - /* * Strategy: Go to the first items of both trees. Then do * @@ -5455,6 +5440,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, * the right if possible or go up and right. */ + down_read(&left_root->fs_info->commit_root_sem); left_level = btrfs_header_level(left_root->commit_root); left_root_level = left_level; left_path->nodes[left_level] = left_root->commit_root; @@ -5464,6 +5450,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, right_root_level = right_level; right_path->nodes[right_level] = right_root->commit_root; extent_buffer_get(right_path->nodes[right_level]); + up_read(&left_root->fs_info->commit_root_sem); if (left_level == 0) btrfs_item_key_to_cpu(left_path->nodes[left_level], @@ -5482,67 +5469,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root, advance_left = advance_right = 0; while (1) { - /* - * We need to make sure the transaction does not get committed - * while we do anything on commit roots. This means, we need to - * join and leave transactions for every item that we process. - */ - if (trans && btrfs_should_end_transaction(trans, left_root)) { - btrfs_release_path(left_path); - btrfs_release_path(right_path); - - ret = btrfs_end_transaction(trans, left_root); - trans = NULL; - if (ret < 0) - goto out; - } - /* now rejoin the transaction */ - if (!trans) { - trans = btrfs_join_transaction(left_root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - trans = NULL; - goto out; - } - - spin_lock(&left_root->root_item_lock); - ctransid = btrfs_root_ctransid(&left_root->root_item); - spin_unlock(&left_root->root_item_lock); - if (ctransid != left_start_ctransid) - left_start_ctransid = 0; - - spin_lock(&right_root->root_item_lock); - ctransid = btrfs_root_ctransid(&right_root->root_item); - spin_unlock(&right_root->root_item_lock); - if (ctransid != right_start_ctransid) - right_start_ctransid = 0; - - if (!left_start_ctransid || !right_start_ctransid) { - WARN(1, KERN_WARNING - "BTRFS: btrfs_compare_tree detected " - "a change in one of the trees while " - "iterating. This is probably a " - "bug.\n"); - ret = -EIO; - goto out; - } - - /* - * the commit root may have changed, so start again - * where we stopped - */ - left_path->lowest_level = left_level; - right_path->lowest_level = right_level; - ret = btrfs_search_slot(NULL, left_root, - &left_key, left_path, 0, 0); - if (ret < 0) - goto out; - ret = btrfs_search_slot(NULL, right_root, - &right_key, right_path, 0, 0); - if (ret < 0) - goto out; - } - if (advance_left && !left_end_reached) { ret = tree_advance(left_root, left_path, &left_level, left_root_level, @@ -5672,14 +5598,6 @@ out: btrfs_free_path(left_path); btrfs_free_path(right_path); kfree(tmp_buf); - - if (trans) { - if (!ret) - ret = btrfs_end_transaction(trans, left_root); - else - btrfs_end_transaction(trans, left_root); - } - return ret; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index bc96c03dd259..4c48df572bd6 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -609,6 +609,7 @@ struct btrfs_path { unsigned int skip_locking:1; unsigned int leave_spinning:1; unsigned int search_commit_root:1; + unsigned int need_commit_sem:1; }; /* @@ -986,7 +987,8 @@ struct btrfs_dev_replace_item { #define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6) #define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7) #define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8) -#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE +#define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \ + BTRFS_SPACE_INFO_GLOBAL_RSV) enum btrfs_raid_types { BTRFS_RAID_RAID10, @@ -1018,6 +1020,12 @@ enum btrfs_raid_types { */ #define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48) +/* + * A fake block group type that is used to communicate global block reserve + * size to userspace via the SPACE_INFO ioctl. + */ +#define BTRFS_SPACE_INFO_GLOBAL_RSV (1ULL << 49) + #define BTRFS_EXTENDED_PROFILE_MASK (BTRFS_BLOCK_GROUP_PROFILE_MASK | \ BTRFS_AVAIL_ALLOC_BIT_SINGLE) @@ -1440,7 +1448,7 @@ struct btrfs_fs_info { */ struct mutex ordered_extent_flush_mutex; - struct rw_semaphore extent_commit_sem; + struct rw_semaphore commit_root_sem; struct rw_semaphore cleanup_work_sem; @@ -1711,7 +1719,6 @@ struct btrfs_root { struct btrfs_block_rsv *block_rsv; /* free ino cache stuff */ - struct mutex fs_commit_mutex; struct btrfs_free_space_ctl *free_ino_ctl; enum btrfs_caching_type cached; spinlock_t cache_lock; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index bd0f752b797b..029d46c2e170 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -329,6 +329,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, { struct extent_state *cached_state = NULL; int ret; + bool need_lock = (current->journal_info == + (void *)BTRFS_SEND_TRANS_STUB); if (!parent_transid || btrfs_header_generation(eb) == parent_transid) return 0; @@ -336,6 +338,11 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, if (atomic) return -EAGAIN; + if (need_lock) { + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + } + lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1, 0, &cached_state); if (extent_buffer_uptodate(eb) && @@ -347,10 +354,21 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, "found %llu\n", eb->start, parent_transid, btrfs_header_generation(eb)); ret = 1; - clear_extent_buffer_uptodate(eb); + + /* + * Things reading via commit roots that don't have normal protection, + * like send, can have a really old block in cache that may point at a + * block that has been free'd and re-allocated. So don't clear uptodate + * if we find an eb that is under IO (dirty/writeback) because we could + * end up reading in the stale data and then writing it back out and + * making everybody very sad. + */ + if (!extent_buffer_under_io(eb)) + clear_extent_buffer_uptodate(eb); out: unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1, &cached_state, GFP_NOFS); + btrfs_tree_read_unlock_blocking(eb); return ret; } @@ -1546,7 +1564,6 @@ int btrfs_init_fs_root(struct btrfs_root *root) root->subv_writers = writers; btrfs_init_free_ino_ctl(root); - mutex_init(&root->fs_commit_mutex); spin_lock_init(&root->cache_lock); init_waitqueue_head(&root->cache_wait); @@ -2324,7 +2341,7 @@ int open_ctree(struct super_block *sb, mutex_init(&fs_info->transaction_kthread_mutex); mutex_init(&fs_info->cleaner_mutex); mutex_init(&fs_info->volume_mutex); - init_rwsem(&fs_info->extent_commit_sem); + init_rwsem(&fs_info->commit_root_sem); init_rwsem(&fs_info->cleanup_work_sem); init_rwsem(&fs_info->subvol_sem); sema_init(&fs_info->uuid_tree_rescan_sem, 1); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c6b6a6e3e735..1306487c82cf 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -419,7 +419,7 @@ static noinline void caching_thread(struct btrfs_work *work) again: mutex_lock(&caching_ctl->mutex); /* need to make sure the commit_root doesn't disappear */ - down_read(&fs_info->extent_commit_sem); + down_read(&fs_info->commit_root_sem); next: ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); @@ -443,10 +443,10 @@ next: break; if (need_resched() || - rwsem_is_contended(&fs_info->extent_commit_sem)) { + rwsem_is_contended(&fs_info->commit_root_sem)) { caching_ctl->progress = last; btrfs_release_path(path); - up_read(&fs_info->extent_commit_sem); + up_read(&fs_info->commit_root_sem); mutex_unlock(&caching_ctl->mutex); cond_resched(); goto again; @@ -513,7 +513,7 @@ next: err: btrfs_free_path(path); - up_read(&fs_info->extent_commit_sem); + up_read(&fs_info->commit_root_sem); free_excluded_extents(extent_root, block_group); @@ -633,10 +633,10 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, return 0; } - down_write(&fs_info->extent_commit_sem); + down_write(&fs_info->commit_root_sem); atomic_inc(&caching_ctl->count); list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); - up_write(&fs_info->extent_commit_sem); + up_write(&fs_info->commit_root_sem); btrfs_get_block_group(cache); @@ -2444,7 +2444,8 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, spin_unlock(&locked_ref->lock); spin_lock(&delayed_refs->lock); spin_lock(&locked_ref->lock); - if (rb_first(&locked_ref->ref_root)) { + if (rb_first(&locked_ref->ref_root) || + locked_ref->extent_op) { spin_unlock(&locked_ref->lock); spin_unlock(&delayed_refs->lock); continue; @@ -5470,7 +5471,7 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *cache; struct btrfs_space_info *space_info; - down_write(&fs_info->extent_commit_sem); + down_write(&fs_info->commit_root_sem); list_for_each_entry_safe(caching_ctl, next, &fs_info->caching_block_groups, list) { @@ -5489,7 +5490,7 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, else fs_info->pinned_extents = &fs_info->freed_extents[0]; - up_write(&fs_info->extent_commit_sem); + up_write(&fs_info->commit_root_sem); list_for_each_entry_rcu(space_info, &fs_info->space_info, list) percpu_counter_set(&space_info->total_bytes_pinned, 0); @@ -5744,6 +5745,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu", bytenr, parent, root_objectid, owner_objectid, owner_offset); + btrfs_abort_transaction(trans, extent_root, ret); + goto out; } else { btrfs_abort_transaction(trans, extent_root, ret); goto out; @@ -8255,14 +8258,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) struct btrfs_caching_control *caching_ctl; struct rb_node *n; - down_write(&info->extent_commit_sem); + down_write(&info->commit_root_sem); while (!list_empty(&info->caching_block_groups)) { caching_ctl = list_entry(info->caching_block_groups.next, struct btrfs_caching_control, list); list_del(&caching_ctl->list); put_caching_control(caching_ctl); } - up_write(&info->extent_commit_sem); + up_write(&info->commit_root_sem); spin_lock(&info->block_group_cache_lock); while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { @@ -8336,9 +8339,15 @@ static void __link_block_group(struct btrfs_space_info *space_info, struct btrfs_block_group_cache *cache) { int index = get_block_group_index(cache); + bool first = false; down_write(&space_info->groups_sem); - if (list_empty(&space_info->block_groups[index])) { + if (list_empty(&space_info->block_groups[index])) + first = true; + list_add_tail(&cache->list, &space_info->block_groups[index]); + up_write(&space_info->groups_sem); + + if (first) { struct kobject *kobj = &space_info->block_group_kobjs[index]; int ret; @@ -8350,8 +8359,6 @@ static void __link_block_group(struct btrfs_space_info *space_info, kobject_put(&space_info->kobj); } } - list_add_tail(&cache->list, &space_info->block_groups[index]); - up_write(&space_info->groups_sem); } static struct btrfs_block_group_cache * diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ae69a00387e7..3955e475ceec 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -749,6 +749,7 @@ again: * our range starts */ node = tree_search(tree, start); +process_node: if (!node) break; @@ -769,7 +770,10 @@ again: if (start > end) break; - cond_resched_lock(&tree->lock); + if (!cond_resched_lock(&tree->lock)) { + node = rb_next(node); + goto process_node; + } } out: spin_unlock(&tree->lock); @@ -4306,7 +4310,7 @@ static void __free_extent_buffer(struct extent_buffer *eb) kmem_cache_free(extent_buffer_cache, eb); } -static int extent_buffer_under_io(struct extent_buffer *eb) +int extent_buffer_under_io(struct extent_buffer *eb) { return (atomic_read(&eb->io_pages) || test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 58b27e5ab521..c488b45237bf 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -320,6 +320,7 @@ int set_extent_buffer_dirty(struct extent_buffer *eb); int set_extent_buffer_uptodate(struct extent_buffer *eb); int clear_extent_buffer_uptodate(struct extent_buffer *eb); int extent_buffer_uptodate(struct extent_buffer *eb); +int extent_buffer_under_io(struct extent_buffer *eb); int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset, unsigned long min_len, char **map, unsigned long *map_start, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e1ffb1e22898..eb742c07e7a4 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -425,13 +425,8 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, struct page *page = prepared_pages[pg]; /* * Copy data from userspace to the current page - * - * Disable pagefault to avoid recursive lock since - * the pages are already locked */ - pagefault_disable(); copied = iov_iter_copy_from_user_atomic(page, i, offset, count); - pagefault_enable(); /* Flush processor's dcache for this page */ flush_dcache_page(page); @@ -1665,7 +1660,7 @@ again: static ssize_t __btrfs_direct_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos, - loff_t *ppos, size_t count, size_t ocount) + size_t count, size_t ocount) { struct file *file = iocb->ki_filp; struct iov_iter i; @@ -1674,7 +1669,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, loff_t endbyte; int err; - written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos, + written = generic_file_direct_write(iocb, iov, &nr_segs, pos, count, ocount); if (written < 0 || written == count) @@ -1693,7 +1688,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, if (err) goto out; written += written_buffered; - *ppos = pos + written_buffered; + iocb->ki_pos = pos + written_buffered; invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT, endbyte >> PAGE_CACHE_SHIFT); out: @@ -1725,8 +1720,8 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; - loff_t *ppos = &iocb->ki_pos; u64 start_pos; + u64 end_pos; ssize_t num_written = 0; ssize_t err = 0; size_t count, ocount; @@ -1781,7 +1776,9 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, start_pos = round_down(pos, root->sectorsize); if (start_pos > i_size_read(inode)) { - err = btrfs_cont_expand(inode, i_size_read(inode), start_pos); + /* Expand hole size to cover write data, preventing empty gap */ + end_pos = round_up(pos + iov->iov_len, root->sectorsize); + err = btrfs_cont_expand(inode, i_size_read(inode), end_pos); if (err) { mutex_unlock(&inode->i_mutex); goto out; @@ -1793,7 +1790,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, if (unlikely(file->f_flags & O_DIRECT)) { num_written = __btrfs_direct_write(iocb, iov, nr_segs, - pos, ppos, count, ocount); + pos, count, ocount); } else { struct iov_iter i; @@ -1801,7 +1798,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, num_written = __btrfs_buffered_write(file, &i, pos); if (num_written > 0) - *ppos = pos + num_written; + iocb->ki_pos = pos + num_written; } mutex_unlock(&inode->i_mutex); @@ -2025,6 +2022,7 @@ out: static const struct vm_operations_struct btrfs_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = btrfs_page_mkwrite, .remap_pages = generic_file_remap_pages, }; diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index ab485e57b6fe..cc8ca193d830 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -55,7 +55,7 @@ static int caching_kthread(void *data) key.type = BTRFS_INODE_ITEM_KEY; again: /* need to make sure the commit_root doesn't disappear */ - mutex_lock(&root->fs_commit_mutex); + down_read(&fs_info->commit_root_sem); ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) @@ -88,7 +88,7 @@ again: btrfs_item_key_to_cpu(leaf, &key, 0); btrfs_release_path(path); root->cache_progress = last; - mutex_unlock(&root->fs_commit_mutex); + up_read(&fs_info->commit_root_sem); schedule_timeout(1); goto again; } else @@ -127,7 +127,7 @@ next: btrfs_unpin_free_ino(root); out: wake_up(&root->cache_wait); - mutex_unlock(&root->fs_commit_mutex); + up_read(&fs_info->commit_root_sem); btrfs_free_path(path); @@ -223,11 +223,11 @@ again: * or the caching work is done. */ - mutex_lock(&root->fs_commit_mutex); + down_write(&root->fs_info->commit_root_sem); spin_lock(&root->cache_lock); if (root->cached == BTRFS_CACHE_FINISHED) { spin_unlock(&root->cache_lock); - mutex_unlock(&root->fs_commit_mutex); + up_write(&root->fs_info->commit_root_sem); goto again; } spin_unlock(&root->cache_lock); @@ -240,7 +240,7 @@ again: else __btrfs_add_free_space(pinned, objectid, 1); - mutex_unlock(&root->fs_commit_mutex); + up_write(&root->fs_info->commit_root_sem); } } @@ -250,7 +250,7 @@ again: * and others will just be dropped, because the commit root we were * searching has changed. * - * Must be called with root->fs_commit_mutex held + * Must be called with root->fs_info->commit_root_sem held */ void btrfs_unpin_free_ino(struct btrfs_root *root) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 06e9a4152b14..5f805bc944fa 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -394,6 +394,14 @@ static noinline int compress_file_range(struct inode *inode, (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) btrfs_add_inode_defrag(NULL, inode); + /* + * skip compression for a small file range(<=blocksize) that + * isn't an inline extent, since it dosen't save disk space at all. + */ + if ((end - start + 1) <= blocksize && + (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) + goto cleanup_and_bail_uncompressed; + actual_end = min_t(u64, isize, end + 1); again: will_compress = 0; @@ -1271,6 +1279,15 @@ next_slot: disk_bytenr += cur_offset - found_key.offset; num_bytes = min(end + 1, extent_end) - cur_offset; /* + * if there are pending snapshots for this root, + * we fall into common COW way. + */ + if (!nolock) { + err = btrfs_start_nocow_write(root); + if (!err) + goto out_check; + } + /* * force cow if csum exists in the range. * this ensure that csum for a given extent are * either valid or do not exist. @@ -1289,6 +1306,8 @@ next_slot: out_check: if (extent_end <= start) { path->slots[0]++; + if (!nolock && nocow) + btrfs_end_nocow_write(root); goto next_slot; } if (!nocow) { @@ -1306,8 +1325,11 @@ out_check: ret = cow_file_range(inode, locked_page, cow_start, found_key.offset - 1, page_started, nr_written, 1); - if (ret) + if (ret) { + if (!nolock && nocow) + btrfs_end_nocow_write(root); goto error; + } cow_start = (u64)-1; } @@ -1354,8 +1376,11 @@ out_check: BTRFS_DATA_RELOC_TREE_OBJECTID) { ret = btrfs_reloc_clone_csums(inode, cur_offset, num_bytes); - if (ret) + if (ret) { + if (!nolock && nocow) + btrfs_end_nocow_write(root); goto error; + } } extent_clear_unlock_delalloc(inode, cur_offset, @@ -1363,6 +1388,8 @@ out_check: locked_page, EXTENT_LOCKED | EXTENT_DELALLOC, PAGE_UNLOCK | PAGE_SET_PRIVATE2); + if (!nolock && nocow) + btrfs_end_nocow_write(root); cur_offset = extent_end; if (cur_offset > end) break; @@ -8476,19 +8503,20 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput, else iput(inode); ret = -ENOMEM; - break; + goto out; } list_add_tail(&work->list, &works); btrfs_queue_work(root->fs_info->flush_workers, &work->work); ret++; if (nr != -1 && ret >= nr) - break; + goto out; cond_resched(); spin_lock(&root->delalloc_lock); } spin_unlock(&root->delalloc_lock); +out: list_for_each_entry_safe(work, next, &works, list) { list_del_init(&work->list); btrfs_wait_and_free_delalloc_work(work); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0401397b5c92..e79ff6b90cb7 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1472,6 +1472,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, struct btrfs_trans_handle *trans; struct btrfs_device *device = NULL; char *sizestr; + char *retptr; char *devstr = NULL; int ret = 0; int mod = 0; @@ -1539,8 +1540,8 @@ static noinline int btrfs_ioctl_resize(struct file *file, mod = 1; sizestr++; } - new_size = memparse(sizestr, NULL); - if (new_size == 0) { + new_size = memparse(sizestr, &retptr); + if (*retptr != '\0' || new_size == 0) { ret = -EINVAL; goto out_free; } @@ -3140,8 +3141,9 @@ process_slot: new_key.offset + datal, 1); if (ret) { - btrfs_abort_transaction(trans, root, - ret); + if (ret != -EINVAL) + btrfs_abort_transaction(trans, + root, ret); btrfs_end_transaction(trans, root); goto out; } @@ -3538,6 +3540,11 @@ static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) up_read(&info->groups_sem); } + /* + * Global block reserve, exported as a space_info + */ + slot_count++; + /* space_slots == 0 means they are asking for a count */ if (space_args.space_slots == 0) { space_args.total_spaces = slot_count; @@ -3596,6 +3603,21 @@ static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) up_read(&info->groups_sem); } + /* + * Add global block reserve + */ + if (slot_count) { + struct btrfs_block_rsv *block_rsv = &root->fs_info->global_block_rsv; + + spin_lock(&block_rsv->lock); + space.total_bytes = block_rsv->size; + space.used_bytes = block_rsv->size - block_rsv->reserved; + spin_unlock(&block_rsv->lock); + space.flags = BTRFS_SPACE_INFO_GLOBAL_RSV; + memcpy(dest, &space, sizeof(space)); + space_args.total_spaces++; + } + user_dest = (struct btrfs_ioctl_space_info __user *) (arg + sizeof(struct btrfs_ioctl_space_args)); @@ -4531,9 +4553,8 @@ static long btrfs_ioctl_set_received_subvol_32(struct file *file, } args64 = kmalloc(sizeof(*args64), GFP_NOFS); - if (IS_ERR(args64)) { - ret = PTR_ERR(args64); - args64 = NULL; + if (!args64) { + ret = -ENOMEM; goto out; } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index def428a25b2a..7f92ab1daa87 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2317,7 +2317,6 @@ void free_reloc_roots(struct list_head *list) static noinline_for_stack int merge_reloc_roots(struct reloc_control *rc) { - struct btrfs_trans_handle *trans; struct btrfs_root *root; struct btrfs_root *reloc_root; u64 last_snap; @@ -2375,26 +2374,6 @@ again: list_add_tail(&reloc_root->root_list, &reloc_roots); goto out; - } else if (!ret) { - /* - * recover the last snapshot tranid to avoid - * the space balance break NOCOW. - */ - root = read_fs_root(rc->extent_root->fs_info, - objectid); - if (IS_ERR(root)) - continue; - - trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); - - /* Check if the fs/file tree was snapshoted or not. */ - if (btrfs_root_last_snapshot(&root->root_item) == - otransid - 1) - btrfs_set_root_last_snapshot(&root->root_item, - last_snap); - - btrfs_end_transaction(trans, root); } } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 93e6d7172844..0be77993378e 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2235,6 +2235,47 @@ behind_scrub_pages: return 0; } +/* + * Given a physical address, this will calculate it's + * logical offset. if this is a parity stripe, it will return + * the most left data stripe's logical offset. + * + * return 0 if it is a data stripe, 1 means parity stripe. + */ +static int get_raid56_logic_offset(u64 physical, int num, + struct map_lookup *map, u64 *offset) +{ + int i; + int j = 0; + u64 stripe_nr; + u64 last_offset; + int stripe_index; + int rot; + + last_offset = (physical - map->stripes[num].physical) * + nr_data_stripes(map); + *offset = last_offset; + for (i = 0; i < nr_data_stripes(map); i++) { + *offset = last_offset + i * map->stripe_len; + + stripe_nr = *offset; + do_div(stripe_nr, map->stripe_len); + do_div(stripe_nr, nr_data_stripes(map)); + + /* Work out the disk rotation on this stripe-set */ + rot = do_div(stripe_nr, map->num_stripes); + /* calculate which stripe this data locates */ + rot += i; + stripe_index = rot % map->num_stripes; + if (stripe_index == num) + return 0; + if (stripe_index < num) + j++; + } + *offset = last_offset + j * map->stripe_len; + return 1; +} + static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct map_lookup *map, struct btrfs_device *scrub_dev, @@ -2256,6 +2297,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, u64 physical; u64 logical; u64 logic_end; + u64 physical_end; u64 generation; int mirror_num; struct reada_control *reada1; @@ -2269,16 +2311,10 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, u64 extent_len; struct btrfs_device *extent_dev; int extent_mirror_num; - int stop_loop; - - if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6)) { - if (num >= nr_data_stripes(map)) { - return 0; - } - } + int stop_loop = 0; nstripes = length; + physical = map->stripes[num].physical; offset = 0; do_div(nstripes, map->stripe_len); if (map->type & BTRFS_BLOCK_GROUP_RAID0) { @@ -2296,6 +2332,11 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { increment = map->stripe_len; mirror_num = num % map->num_stripes + 1; + } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) { + get_raid56_logic_offset(physical, num, map, &offset); + increment = map->stripe_len * nr_data_stripes(map); + mirror_num = 1; } else { increment = map->stripe_len; mirror_num = 1; @@ -2319,7 +2360,15 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, * to not hold off transaction commits */ logical = base + offset; - + physical_end = physical + nstripes * map->stripe_len; + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) { + get_raid56_logic_offset(physical_end, num, + map, &logic_end); + logic_end += base; + } else { + logic_end = logical + increment * nstripes; + } wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); scrub_blocked_if_needed(fs_info); @@ -2328,7 +2377,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, key_start.objectid = logical; key_start.type = BTRFS_EXTENT_ITEM_KEY; key_start.offset = (u64)0; - key_end.objectid = base + offset + nstripes * increment; + key_end.objectid = logic_end; key_end.type = BTRFS_METADATA_ITEM_KEY; key_end.offset = (u64)-1; reada1 = btrfs_reada_add(root, &key_start, &key_end); @@ -2338,7 +2387,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, key_start.offset = logical; key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID; key_end.type = BTRFS_EXTENT_CSUM_KEY; - key_end.offset = base + offset + nstripes * increment; + key_end.offset = logic_end; reada2 = btrfs_reada_add(csum_root, &key_start, &key_end); if (!IS_ERR(reada1)) @@ -2356,11 +2405,17 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, /* * now find all extents for each stripe and scrub them */ - logical = base + offset; - physical = map->stripes[num].physical; - logic_end = logical + increment * nstripes; ret = 0; - while (logical < logic_end) { + while (physical < physical_end) { + /* for raid56, we skip parity stripe */ + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) { + ret = get_raid56_logic_offset(physical, num, + map, &logical); + logical += base; + if (ret) + goto skip; + } /* * canceled? */ @@ -2504,15 +2559,29 @@ again: scrub_free_csums(sctx); if (extent_logical + extent_len < key.objectid + bytes) { - logical += increment; - physical += map->stripe_len; - + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) { + /* + * loop until we find next data stripe + * or we have finished all stripes. + */ + do { + physical += map->stripe_len; + ret = get_raid56_logic_offset( + physical, num, + map, &logical); + logical += base; + } while (physical < physical_end && ret); + } else { + physical += map->stripe_len; + logical += increment; + } if (logical < key.objectid + bytes) { cond_resched(); goto again; } - if (logical >= logic_end) { + if (physical >= physical_end) { stop_loop = 1; break; } @@ -2521,6 +2590,7 @@ next: path->slots[0]++; } btrfs_release_path(path); +skip: logical += increment; physical += map->stripe_len; spin_lock(&sctx->stat_lock); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 9b6da9d55f9a..1ac3ca98c429 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -493,6 +493,7 @@ static struct btrfs_path *alloc_path_for_send(void) return NULL; path->search_commit_root = 1; path->skip_locking = 1; + path->need_commit_sem = 1; return path; } @@ -771,29 +772,22 @@ out: /* * Helper function to retrieve some fields from an inode item. */ -static int get_inode_info(struct btrfs_root *root, - u64 ino, u64 *size, u64 *gen, - u64 *mode, u64 *uid, u64 *gid, - u64 *rdev) +static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path, + u64 ino, u64 *size, u64 *gen, u64 *mode, u64 *uid, + u64 *gid, u64 *rdev) { int ret; struct btrfs_inode_item *ii; struct btrfs_key key; - struct btrfs_path *path; - - path = alloc_path_for_send(); - if (!path) - return -ENOMEM; key.objectid = ino; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; if (ret) { - ret = -ENOENT; - goto out; + if (ret > 0) + ret = -ENOENT; + return ret; } ii = btrfs_item_ptr(path->nodes[0], path->slots[0], @@ -811,7 +805,22 @@ static int get_inode_info(struct btrfs_root *root, if (rdev) *rdev = btrfs_inode_rdev(path->nodes[0], ii); -out: + return ret; +} + +static int get_inode_info(struct btrfs_root *root, + u64 ino, u64 *size, u64 *gen, + u64 *mode, u64 *uid, u64 *gid, + u64 *rdev) +{ + struct btrfs_path *path; + int ret; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + ret = __get_inode_info(root, path, ino, size, gen, mode, uid, gid, + rdev); btrfs_free_path(path); return ret; } @@ -1085,6 +1094,7 @@ out: struct backref_ctx { struct send_ctx *sctx; + struct btrfs_path *path; /* number of total found references */ u64 found; @@ -1155,8 +1165,9 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) * There are inodes that have extents that lie behind its i_size. Don't * accept clones from these extents. */ - ret = get_inode_info(found->root, ino, &i_size, NULL, NULL, NULL, NULL, - NULL); + ret = __get_inode_info(found->root, bctx->path, ino, &i_size, NULL, NULL, + NULL, NULL, NULL); + btrfs_release_path(bctx->path); if (ret < 0) return ret; @@ -1235,12 +1246,17 @@ static int find_extent_clone(struct send_ctx *sctx, if (!tmp_path) return -ENOMEM; + /* We only use this path under the commit sem */ + tmp_path->need_commit_sem = 0; + backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_NOFS); if (!backref_ctx) { ret = -ENOMEM; goto out; } + backref_ctx->path = tmp_path; + if (data_offset >= ino_size) { /* * There may be extents that lie behind the file's size. @@ -1268,8 +1284,10 @@ static int find_extent_clone(struct send_ctx *sctx, } logical = disk_byte + btrfs_file_extent_offset(eb, fi); + down_read(&sctx->send_root->fs_info->commit_root_sem); ret = extent_from_logical(sctx->send_root->fs_info, disk_byte, tmp_path, &found_key, &flags); + up_read(&sctx->send_root->fs_info->commit_root_sem); btrfs_release_path(tmp_path); if (ret < 0) @@ -4418,6 +4436,9 @@ static int send_hole(struct send_ctx *sctx, u64 end) p = fs_path_alloc(); if (!p) return -ENOMEM; + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); + if (ret < 0) + goto tlv_put_failure; memset(sctx->read_buf, 0, BTRFS_SEND_READ_SIZE); while (offset < end) { len = min_t(u64, end - offset, BTRFS_SEND_READ_SIZE); @@ -4425,9 +4446,6 @@ static int send_hole(struct send_ctx *sctx, u64 end) ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); if (ret < 0) break; - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); - if (ret < 0) - break; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len); @@ -4968,7 +4986,9 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) if (S_ISREG(sctx->cur_inode_mode)) { if (need_send_hole(sctx)) { - if (sctx->cur_inode_last_extent == (u64)-1) { + if (sctx->cur_inode_last_extent == (u64)-1 || + sctx->cur_inode_last_extent < + sctx->cur_inode_size) { ret = get_last_extent(sctx, (u64)-1); if (ret) goto out; @@ -5367,57 +5387,21 @@ out: static int full_send_tree(struct send_ctx *sctx) { int ret; - struct btrfs_trans_handle *trans = NULL; struct btrfs_root *send_root = sctx->send_root; struct btrfs_key key; struct btrfs_key found_key; struct btrfs_path *path; struct extent_buffer *eb; int slot; - u64 start_ctransid; - u64 ctransid; path = alloc_path_for_send(); if (!path) return -ENOMEM; - spin_lock(&send_root->root_item_lock); - start_ctransid = btrfs_root_ctransid(&send_root->root_item); - spin_unlock(&send_root->root_item_lock); - key.objectid = BTRFS_FIRST_FREE_OBJECTID; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; -join_trans: - /* - * We need to make sure the transaction does not get committed - * while we do anything on commit roots. Join a transaction to prevent - * this. - */ - trans = btrfs_join_transaction(send_root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - trans = NULL; - goto out; - } - - /* - * Make sure the tree has not changed after re-joining. We detect this - * by comparing start_ctransid and ctransid. They should always match. - */ - spin_lock(&send_root->root_item_lock); - ctransid = btrfs_root_ctransid(&send_root->root_item); - spin_unlock(&send_root->root_item_lock); - - if (ctransid != start_ctransid) { - WARN(1, KERN_WARNING "BTRFS: the root that you're trying to " - "send was modified in between. This is " - "probably a bug.\n"); - ret = -EIO; - goto out; - } - ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0); if (ret < 0) goto out; @@ -5425,19 +5409,6 @@ join_trans: goto out_finish; while (1) { - /* - * When someone want to commit while we iterate, end the - * joined transaction and rejoin. - */ - if (btrfs_should_end_transaction(trans, send_root)) { - ret = btrfs_end_transaction(trans, send_root); - trans = NULL; - if (ret < 0) - goto out; - btrfs_release_path(path); - goto join_trans; - } - eb = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(eb, &found_key, slot); @@ -5465,12 +5436,6 @@ out_finish: out: btrfs_free_path(path); - if (trans) { - if (!ret) - ret = btrfs_end_transaction(trans, send_root); - else - btrfs_end_transaction(trans, send_root); - } return ret; } @@ -5718,7 +5683,9 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) NULL); sort_clone_roots = 1; + current->journal_info = (void *)BTRFS_SEND_TRANS_STUB; ret = send_subvol(sctx); + current->journal_info = NULL; if (ret < 0) goto out; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 9dbf42395153..5011aadacab8 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -66,6 +66,8 @@ static const struct super_operations btrfs_super_ops; static struct file_system_type btrfs_fs_type; +static int btrfs_remount(struct super_block *sb, int *flags, char *data); + static const char *btrfs_decode_error(int errno) { char *errstr = "unknown"; @@ -1185,6 +1187,26 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags, mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, newargs); kfree(newargs); + + if (PTR_RET(mnt) == -EBUSY) { + if (flags & MS_RDONLY) { + mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY, device_name, + newargs); + } else { + int r; + mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, device_name, + newargs); + if (IS_ERR(mnt)) + return ERR_CAST(mnt); + + r = btrfs_remount(mnt->mnt_sb, &flags, NULL); + if (r < 0) { + /* FIXME: release vfsmount mnt ??*/ + return ERR_PTR(r); + } + } + } + if (IS_ERR(mnt)) return ERR_CAST(mnt); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index a04707f740d6..7579f6d0b854 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -75,10 +75,21 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) } } -static noinline void switch_commit_root(struct btrfs_root *root) +static noinline void switch_commit_roots(struct btrfs_transaction *trans, + struct btrfs_fs_info *fs_info) { - free_extent_buffer(root->commit_root); - root->commit_root = btrfs_root_node(root); + struct btrfs_root *root, *tmp; + + down_write(&fs_info->commit_root_sem); + list_for_each_entry_safe(root, tmp, &trans->switch_commits, + dirty_list) { + list_del_init(&root->dirty_list); + free_extent_buffer(root->commit_root); + root->commit_root = btrfs_root_node(root); + if (is_fstree(root->objectid)) + btrfs_unpin_free_ino(root); + } + up_write(&fs_info->commit_root_sem); } static inline void extwriter_counter_inc(struct btrfs_transaction *trans, @@ -208,6 +219,7 @@ loop: INIT_LIST_HEAD(&cur_trans->pending_snapshots); INIT_LIST_HEAD(&cur_trans->ordered_operations); INIT_LIST_HEAD(&cur_trans->pending_chunks); + INIT_LIST_HEAD(&cur_trans->switch_commits); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(&cur_trans->dirty_pages, fs_info->btree_inode->i_mapping); @@ -375,7 +387,8 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) return ERR_PTR(-EROFS); - if (current->journal_info) { + if (current->journal_info && + current->journal_info != (void *)BTRFS_SEND_TRANS_STUB) { WARN_ON(type & TRANS_EXTWRITERS); h = current->journal_info; h->use_count++; @@ -919,9 +932,6 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, return ret; } - if (root != root->fs_info->extent_root) - switch_commit_root(root); - return 0; } @@ -977,15 +987,16 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, list_del_init(next); root = list_entry(next, struct btrfs_root, dirty_list); + if (root != fs_info->extent_root) + list_add_tail(&root->dirty_list, + &trans->transaction->switch_commits); ret = update_cowonly_root(trans, root); if (ret) return ret; } - down_write(&fs_info->extent_commit_sem); - switch_commit_root(fs_info->extent_root); - up_write(&fs_info->extent_commit_sem); - + list_add_tail(&fs_info->extent_root->dirty_list, + &trans->transaction->switch_commits); btrfs_after_dev_replace_commit(fs_info); return 0; @@ -1042,11 +1053,8 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, smp_wmb(); if (root->commit_root != root->node) { - mutex_lock(&root->fs_commit_mutex); - switch_commit_root(root); - btrfs_unpin_free_ino(root); - mutex_unlock(&root->fs_commit_mutex); - + list_add_tail(&root->dirty_list, + &trans->transaction->switch_commits); btrfs_set_root_node(&root->root_item, root->node); } @@ -1857,11 +1865,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_set_root_node(&root->fs_info->tree_root->root_item, root->fs_info->tree_root->node); - switch_commit_root(root->fs_info->tree_root); + list_add_tail(&root->fs_info->tree_root->dirty_list, + &cur_trans->switch_commits); btrfs_set_root_node(&root->fs_info->chunk_root->root_item, root->fs_info->chunk_root->node); - switch_commit_root(root->fs_info->chunk_root); + list_add_tail(&root->fs_info->chunk_root->dirty_list, + &cur_trans->switch_commits); + + switch_commit_roots(cur_trans, root->fs_info); assert_qgroups_uptodate(trans); update_super_roots(root); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 6ac037e9f9f0..b57b924e8e03 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -57,6 +57,7 @@ struct btrfs_transaction { struct list_head pending_snapshots; struct list_head ordered_operations; struct list_head pending_chunks; + struct list_head switch_commits; struct btrfs_delayed_ref_root delayed_refs; int aborted; }; @@ -78,6 +79,8 @@ struct btrfs_transaction { #define TRANS_EXTWRITERS (__TRANS_USERSPACE | __TRANS_START | \ __TRANS_ATTACH) +#define BTRFS_SEND_TRANS_STUB 1 + struct btrfs_trans_handle { u64 transid; u64 bytes_reserved; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d241130a32fd..49d7fab73360 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -448,6 +448,14 @@ static void pending_bios_fn(struct btrfs_work *work) run_scheduled_bios(device); } +/* + * Add new device to list of registered devices + * + * Returns: + * 1 - first time device is seen + * 0 - device already known + * < 0 - error + */ static noinline int device_list_add(const char *path, struct btrfs_super_block *disk_super, u64 devid, struct btrfs_fs_devices **fs_devices_ret) @@ -455,6 +463,7 @@ static noinline int device_list_add(const char *path, struct btrfs_device *device; struct btrfs_fs_devices *fs_devices; struct rcu_string *name; + int ret = 0; u64 found_transid = btrfs_super_generation(disk_super); fs_devices = find_fsid(disk_super->fsid); @@ -495,6 +504,7 @@ static noinline int device_list_add(const char *path, fs_devices->num_devices++; mutex_unlock(&fs_devices->device_list_mutex); + ret = 1; device->fs_devices = fs_devices; } else if (!device->name || strcmp(device->name->str, path)) { name = rcu_string_strdup(path, GFP_NOFS); @@ -513,7 +523,8 @@ static noinline int device_list_add(const char *path, fs_devices->latest_trans = found_transid; } *fs_devices_ret = fs_devices; - return 0; + + return ret; } static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) @@ -910,17 +921,19 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, transid = btrfs_super_generation(disk_super); total_devices = btrfs_super_num_devices(disk_super); - if (disk_super->label[0]) { - if (disk_super->label[BTRFS_LABEL_SIZE - 1]) - disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; - printk(KERN_INFO "BTRFS: device label %s ", disk_super->label); - } else { - printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid); - } - - printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path); - ret = device_list_add(path, disk_super, devid, fs_devices_ret); + if (ret > 0) { + if (disk_super->label[0]) { + if (disk_super->label[BTRFS_LABEL_SIZE - 1]) + disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; + printk(KERN_INFO "BTRFS: device label %s ", disk_super->label); + } else { + printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid); + } + + printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path); + ret = 0; + } if (!ret && fs_devices_ret) (*fs_devices_ret)->total_devices = total_devices; diff --git a/fs/buffer.c b/fs/buffer.c index 8c53a2b15ecb..9ddb9fc7d923 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2114,8 +2114,8 @@ EXPORT_SYMBOL(generic_write_end); * Returns true if all buffers which correspond to a file portion * we want to read are uptodate. */ -int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, - unsigned long from) +int block_is_partially_uptodate(struct page *page, unsigned long from, + unsigned long count) { unsigned block_start, block_end, blocksize; unsigned to; @@ -2127,7 +2127,7 @@ int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, head = page_buffers(page); blocksize = head->b_size; - to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count); + to = min_t(unsigned, PAGE_CACHE_SIZE - from, count); to = from + to; if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize) return 0; diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c index 622f4696e484..5b99bafc31d1 100644 --- a/fs/cachefiles/bind.c +++ b/fs/cachefiles/bind.c @@ -124,7 +124,6 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache) /* check parameters */ ret = -EOPNOTSUPP; if (!root->d_inode || - !root->d_inode->i_op || !root->d_inode->i_op->lookup || !root->d_inode->i_op->mkdir || !root->d_inode->i_op->setxattr || diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 6494d9f673aa..c0a681705104 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -779,8 +779,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, } ret = -EPERM; - if (!subdir->d_inode->i_op || - !subdir->d_inode->i_op->setxattr || + if (!subdir->d_inode->i_op->setxattr || !subdir->d_inode->i_op->getxattr || !subdir->d_inode->i_op->lookup || !subdir->d_inode->i_op->mkdir || diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 8c44fdd4e1c3..834f9f3723fb 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -205,6 +205,7 @@ void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc, ci->fscache = fscache_acquire_cookie(fsc->fscache, &ceph_fscache_inode_object_def, ci, true); + fscache_check_consistency(ci->fscache); done: mutex_unlock(&inode->i_mutex); diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h index da95f61b7a09..5ac591bd012b 100644 --- a/fs/ceph/cache.h +++ b/fs/ceph/cache.h @@ -48,6 +48,12 @@ void ceph_readpage_to_fscache(struct inode *inode, struct page *page); void ceph_invalidate_fscache_page(struct inode* inode, struct page *page); void ceph_queue_revalidate(struct inode *inode); +static inline void ceph_fscache_update_objectsize(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + fscache_attr_changed(ci->fscache); +} + static inline void ceph_fscache_invalidate(struct inode *inode) { fscache_invalidate(ceph_inode(inode)->fscache); @@ -135,6 +141,10 @@ static inline void ceph_readpage_to_fscache(struct inode *inode, { } +static inline void ceph_fscache_update_objectsize(struct inode *inode) +{ +} + static inline void ceph_fscache_invalidate(struct inode *inode) { } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 17543383545c..2e5e648eb5c3 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -622,8 +622,10 @@ retry: if (flags & CEPH_CAP_FLAG_AUTH) { if (ci->i_auth_cap == NULL || - ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) + ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) { ci->i_auth_cap = cap; + cap->mds_wanted = wanted; + } ci->i_cap_exporting_issued = 0; } else { WARN_ON(ci->i_auth_cap == cap); @@ -885,7 +887,10 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci) cap = rb_entry(p, struct ceph_cap, ci_node); if (!__cap_is_valid(cap)) continue; - mds_wanted |= cap->mds_wanted; + if (cap == ci->i_auth_cap) + mds_wanted |= cap->mds_wanted; + else + mds_wanted |= (cap->mds_wanted & ~CEPH_CAP_ANY_FILE_WR); } return mds_wanted; } diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 6d59006bfa27..16b54aa31f08 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -93,6 +93,8 @@ static int mdsc_show(struct seq_file *s, void *p) } else if (req->r_path1) { seq_printf(s, " #%llx/%s", req->r_ino1.ino, req->r_path1); + } else { + seq_printf(s, " #%llx", req->r_ino1.ino); } if (req->r_old_dentry) { @@ -102,7 +104,8 @@ static int mdsc_show(struct seq_file *s, void *p) path = NULL; spin_lock(&req->r_old_dentry->d_lock); seq_printf(s, " #%llx/%.*s (%s)", - ceph_ino(req->r_old_dentry_dir), + req->r_old_dentry_dir ? + ceph_ino(req->r_old_dentry_dir) : 0, req->r_old_dentry->d_name.len, req->r_old_dentry->d_name.name, path ? path : ""); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 45eda6d7a40c..766410a12c2c 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -119,7 +119,8 @@ static int fpos_cmp(loff_t l, loff_t r) * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by * the MDS if/when the directory is modified). */ -static int __dcache_readdir(struct file *file, struct dir_context *ctx) +static int __dcache_readdir(struct file *file, struct dir_context *ctx, + u32 shared_gen) { struct ceph_file_info *fi = file->private_data; struct dentry *parent = file->f_dentry; @@ -133,8 +134,8 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx) last = fi->dentry; fi->dentry = NULL; - dout("__dcache_readdir %p at %llu (last %p)\n", dir, ctx->pos, - last); + dout("__dcache_readdir %p v%u at %llu (last %p)\n", + dir, shared_gen, ctx->pos, last); spin_lock(&parent->d_lock); @@ -161,7 +162,8 @@ more: goto out_unlock; } spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - if (!d_unhashed(dentry) && dentry->d_inode && + if (di->lease_shared_gen == shared_gen && + !d_unhashed(dentry) && dentry->d_inode && ceph_snap(dentry->d_inode) != CEPH_SNAPDIR && ceph_ino(dentry->d_inode) != CEPH_INO_CEPH && fpos_cmp(ctx->pos, di->offset) <= 0) @@ -190,7 +192,7 @@ more: if (last) { /* remember our position */ fi->dentry = last; - fi->next_offset = di->offset; + fi->next_offset = fpos_off(di->offset); } dput(dentry); return 0; @@ -252,8 +254,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) int err; u32 ftype; struct ceph_mds_reply_info_parsed *rinfo; - const int max_entries = fsc->mount_options->max_readdir; - const int max_bytes = fsc->mount_options->max_readdir_bytes; dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off); if (fi->flags & CEPH_F_ATEND) @@ -291,8 +291,9 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) ceph_snap(inode) != CEPH_SNAPDIR && __ceph_dir_is_complete(ci) && __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { + u32 shared_gen = ci->i_shared_gen; spin_unlock(&ci->i_ceph_lock); - err = __dcache_readdir(file, ctx); + err = __dcache_readdir(file, ctx, shared_gen); if (err != -EAGAIN) return err; } else { @@ -322,14 +323,16 @@ more: fi->last_readdir = NULL; } - /* requery frag tree, as the frag topology may have changed */ - frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL); - dout("readdir fetching %llx.%llx frag %x offset '%s'\n", ceph_vinop(inode), frag, fi->last_name); req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); + err = ceph_alloc_readdir_reply_buffer(req, inode); + if (err) { + ceph_mdsc_put_request(req); + return err; + } req->r_inode = inode; ihold(inode); req->r_dentry = dget(file->f_dentry); @@ -340,9 +343,6 @@ more: req->r_path2 = kstrdup(fi->last_name, GFP_NOFS); req->r_readdir_offset = fi->next_offset; req->r_args.readdir.frag = cpu_to_le32(frag); - req->r_args.readdir.max_entries = cpu_to_le32(max_entries); - req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes); - req->r_num_caps = max_entries + 1; err = ceph_mdsc_do_request(mdsc, NULL, req); if (err < 0) { ceph_mdsc_put_request(req); @@ -369,9 +369,9 @@ more: fi->next_offset = 0; off = fi->next_offset; } + fi->frag = frag; fi->offset = fi->next_offset; fi->last_readdir = req; - fi->frag = frag; if (req->r_reply_info.dir_end) { kfree(fi->last_name); @@ -454,7 +454,7 @@ more: return 0; } -static void reset_readdir(struct ceph_file_info *fi) +static void reset_readdir(struct ceph_file_info *fi, unsigned frag) { if (fi->last_readdir) { ceph_mdsc_put_request(fi->last_readdir); @@ -462,7 +462,10 @@ static void reset_readdir(struct ceph_file_info *fi) } kfree(fi->last_name); fi->last_name = NULL; - fi->next_offset = 2; /* compensate for . and .. */ + if (ceph_frag_is_leftmost(frag)) + fi->next_offset = 2; /* compensate for . and .. */ + else + fi->next_offset = 0; if (fi->dentry) { dput(fi->dentry); fi->dentry = NULL; @@ -474,7 +477,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) { struct ceph_file_info *fi = file->private_data; struct inode *inode = file->f_mapping->host; - loff_t old_offset = offset; + loff_t old_offset = ceph_make_fpos(fi->frag, fi->next_offset); loff_t retval; mutex_lock(&inode->i_mutex); @@ -491,7 +494,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) goto out; } - if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) { + if (offset >= 0) { if (offset != file->f_pos) { file->f_pos = offset; file->f_version = 0; @@ -504,14 +507,14 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) * seek to new frag, or seek prior to current chunk. */ if (offset == 0 || - fpos_frag(offset) != fpos_frag(old_offset) || + fpos_frag(offset) != fi->frag || fpos_off(offset) < fi->offset) { dout("dir_llseek dropping %p content\n", file); - reset_readdir(fi); + reset_readdir(fi, fpos_frag(offset)); } /* bump dir_release_count if we did a forward seek */ - if (offset > old_offset) + if (fpos_cmp(offset, old_offset) > 0) fi->dir_release_count--; } out: @@ -812,8 +815,7 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, } req->r_dentry = dget(dentry); req->r_num_caps = 2; - req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */ - req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry); + req->r_old_dentry = dget(old_dentry); req->r_locked_dir = dir; req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; @@ -911,10 +913,11 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); + ihold(old_dir); req->r_dentry = dget(new_dentry); req->r_num_caps = 2; req->r_old_dentry = dget(old_dentry); - req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry); + req->r_old_dentry_dir = old_dir; req->r_locked_dir = new_dir; req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL; diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 16796be53ca5..00d6af6a32ec 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -8,23 +8,6 @@ #include "mds_client.h" /* - * NFS export support - * - * NFS re-export of a ceph mount is, at present, only semireliable. - * The basic issue is that the Ceph architectures doesn't lend itself - * well to generating filehandles that will remain valid forever. - * - * So, we do our best. If you're lucky, your inode will be in the - * client's cache. If it's not, and you have a connectable fh, then - * the MDS server may be able to find it for you. Otherwise, you get - * ESTALE. - * - * There are ways to this more reliable, but in the non-connectable fh - * case, we won't every work perfectly, and in the connectable case, - * some changes are needed on the MDS side to work better. - */ - -/* * Basic fh */ struct ceph_nfs_fh { @@ -32,22 +15,12 @@ struct ceph_nfs_fh { } __attribute__ ((packed)); /* - * Larger 'connectable' fh that includes parent ino and name hash. - * Use this whenever possible, as it works more reliably. + * Larger fh that includes parent ino. */ struct ceph_nfs_confh { u64 ino, parent_ino; - u32 parent_name_hash; } __attribute__ ((packed)); -/* - * The presence of @parent_inode here tells us whether NFS wants a - * connectable file handle. However, we want to make a connectionable - * file handle unconditionally so that the MDS gets as much of a hint - * as possible. That means we only use @parent_dentry to indicate - * whether nfsd wants a connectable fh, and whether we should indicate - * failure from a too-small @max_len. - */ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, struct inode *parent_inode) { @@ -56,54 +29,36 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, struct ceph_nfs_confh *cfh = (void *)rawfh; int connected_handle_length = sizeof(*cfh)/4; int handle_length = sizeof(*fh)/4; - struct dentry *dentry; - struct dentry *parent; /* don't re-export snaps */ if (ceph_snap(inode) != CEPH_NOSNAP) return -EINVAL; - dentry = d_find_alias(inode); + if (parent_inode && (*max_len < connected_handle_length)) { + *max_len = connected_handle_length; + return FILEID_INVALID; + } else if (*max_len < handle_length) { + *max_len = handle_length; + return FILEID_INVALID; + } - /* if we found an alias, generate a connectable fh */ - if (*max_len >= connected_handle_length && dentry) { - dout("encode_fh %p connectable\n", dentry); - spin_lock(&dentry->d_lock); - parent = dentry->d_parent; + if (parent_inode) { + dout("encode_fh %llx with parent %llx\n", + ceph_ino(inode), ceph_ino(parent_inode)); cfh->ino = ceph_ino(inode); - cfh->parent_ino = ceph_ino(parent->d_inode); - cfh->parent_name_hash = ceph_dentry_hash(parent->d_inode, - dentry); + cfh->parent_ino = ceph_ino(parent_inode); *max_len = connected_handle_length; - type = 2; - spin_unlock(&dentry->d_lock); - } else if (*max_len >= handle_length) { - if (parent_inode) { - /* nfsd wants connectable */ - *max_len = connected_handle_length; - type = FILEID_INVALID; - } else { - dout("encode_fh %p\n", dentry); - fh->ino = ceph_ino(inode); - *max_len = handle_length; - type = 1; - } + type = FILEID_INO32_GEN_PARENT; } else { + dout("encode_fh %llx\n", ceph_ino(inode)); + fh->ino = ceph_ino(inode); *max_len = handle_length; - type = FILEID_INVALID; + type = FILEID_INO32_GEN; } - if (dentry) - dput(dentry); return type; } -/* - * convert regular fh to dentry - * - * FIXME: we should try harder by querying the mds for the ino. - */ -static struct dentry *__fh_to_dentry(struct super_block *sb, - struct ceph_nfs_fh *fh, int fh_len) +static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino) { struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; struct inode *inode; @@ -111,11 +66,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, struct ceph_vino vino; int err; - if (fh_len < sizeof(*fh) / 4) - return ERR_PTR(-ESTALE); - - dout("__fh_to_dentry %llx\n", fh->ino); - vino.ino = fh->ino; + vino.ino = ino; vino.snap = CEPH_NOSNAP; inode = ceph_find_inode(sb, vino); if (!inode) { @@ -139,139 +90,161 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, dentry = d_obtain_alias(inode); if (IS_ERR(dentry)) { - pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n", - fh->ino, inode); iput(inode); return dentry; } err = ceph_init_dentry(dentry); if (err < 0) { - iput(inode); + dput(dentry); return ERR_PTR(err); } - dout("__fh_to_dentry %llx %p dentry %p\n", fh->ino, inode, dentry); + dout("__fh_to_dentry %llx %p dentry %p\n", ino, inode, dentry); return dentry; } /* - * convert connectable fh to dentry + * convert regular fh to dentry */ -static struct dentry *__cfh_to_dentry(struct super_block *sb, - struct ceph_nfs_confh *cfh, int fh_len) +static struct dentry *ceph_fh_to_dentry(struct super_block *sb, + struct fid *fid, + int fh_len, int fh_type) +{ + struct ceph_nfs_fh *fh = (void *)fid->raw; + + if (fh_type != FILEID_INO32_GEN && + fh_type != FILEID_INO32_GEN_PARENT) + return NULL; + if (fh_len < sizeof(*fh) / 4) + return NULL; + + dout("fh_to_dentry %llx\n", fh->ino); + return __fh_to_dentry(sb, fh->ino); +} + +static struct dentry *__get_parent(struct super_block *sb, + struct dentry *child, u64 ino) { struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; + struct ceph_mds_request *req; struct inode *inode; struct dentry *dentry; - struct ceph_vino vino; int err; - if (fh_len < sizeof(*cfh) / 4) - return ERR_PTR(-ESTALE); - - dout("__cfh_to_dentry %llx (%llx/%x)\n", - cfh->ino, cfh->parent_ino, cfh->parent_name_hash); - - vino.ino = cfh->ino; - vino.snap = CEPH_NOSNAP; - inode = ceph_find_inode(sb, vino); - if (!inode) { - struct ceph_mds_request *req; - - req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH, - USE_ANY_MDS); - if (IS_ERR(req)) - return ERR_CAST(req); + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPPARENT, + USE_ANY_MDS); + if (IS_ERR(req)) + return ERR_CAST(req); - req->r_ino1 = vino; - req->r_ino2.ino = cfh->parent_ino; - req->r_ino2.snap = CEPH_NOSNAP; - req->r_path2 = kmalloc(16, GFP_NOFS); - snprintf(req->r_path2, 16, "%d", cfh->parent_name_hash); - req->r_num_caps = 1; - err = ceph_mdsc_do_request(mdsc, NULL, req); - inode = req->r_target_inode; - if (inode) - ihold(inode); - ceph_mdsc_put_request(req); - if (!inode) - return ERR_PTR(err ? err : -ESTALE); + if (child) { + req->r_inode = child->d_inode; + ihold(child->d_inode); + } else { + req->r_ino1 = (struct ceph_vino) { + .ino = ino, + .snap = CEPH_NOSNAP, + }; } + req->r_num_caps = 1; + err = ceph_mdsc_do_request(mdsc, NULL, req); + inode = req->r_target_inode; + if (inode) + ihold(inode); + ceph_mdsc_put_request(req); + if (!inode) + return ERR_PTR(-ENOENT); dentry = d_obtain_alias(inode); if (IS_ERR(dentry)) { - pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n", - cfh->ino, inode); iput(inode); return dentry; } err = ceph_init_dentry(dentry); if (err < 0) { - iput(inode); + dput(dentry); return ERR_PTR(err); } - dout("__cfh_to_dentry %llx %p dentry %p\n", cfh->ino, inode, dentry); + dout("__get_parent ino %llx parent %p ino %llx.%llx\n", + child ? ceph_ino(child->d_inode) : ino, + dentry, ceph_vinop(inode)); return dentry; } -static struct dentry *ceph_fh_to_dentry(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type) +struct dentry *ceph_get_parent(struct dentry *child) { - if (fh_type == 1) - return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw, - fh_len); - else - return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw, - fh_len); + /* don't re-export snaps */ + if (ceph_snap(child->d_inode) != CEPH_NOSNAP) + return ERR_PTR(-EINVAL); + + dout("get_parent %p ino %llx.%llx\n", + child, ceph_vinop(child->d_inode)); + return __get_parent(child->d_sb, child, 0); } /* - * get parent, if possible. - * - * FIXME: we could do better by querying the mds to discover the - * parent. + * convert regular fh to parent */ static struct dentry *ceph_fh_to_parent(struct super_block *sb, - struct fid *fid, + struct fid *fid, int fh_len, int fh_type) { struct ceph_nfs_confh *cfh = (void *)fid->raw; - struct ceph_vino vino; - struct inode *inode; struct dentry *dentry; - int err; - if (fh_type == 1) - return ERR_PTR(-ESTALE); + if (fh_type != FILEID_INO32_GEN_PARENT) + return NULL; if (fh_len < sizeof(*cfh) / 4) - return ERR_PTR(-ESTALE); + return NULL; - pr_debug("fh_to_parent %llx/%d\n", cfh->parent_ino, - cfh->parent_name_hash); + dout("fh_to_parent %llx\n", cfh->parent_ino); + dentry = __get_parent(sb, NULL, cfh->ino); + if (IS_ERR(dentry) && PTR_ERR(dentry) == -ENOENT) + dentry = __fh_to_dentry(sb, cfh->parent_ino); + return dentry; +} - vino.ino = cfh->ino; - vino.snap = CEPH_NOSNAP; - inode = ceph_find_inode(sb, vino); - if (!inode) - return ERR_PTR(-ESTALE); +static int ceph_get_name(struct dentry *parent, char *name, + struct dentry *child) +{ + struct ceph_mds_client *mdsc; + struct ceph_mds_request *req; + int err; - dentry = d_obtain_alias(inode); - if (IS_ERR(dentry)) { - pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n", - cfh->ino, inode); - iput(inode); - return dentry; - } - err = ceph_init_dentry(dentry); - if (err < 0) { - iput(inode); - return ERR_PTR(err); + mdsc = ceph_inode_to_client(child->d_inode)->mdsc; + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPNAME, + USE_ANY_MDS); + if (IS_ERR(req)) + return PTR_ERR(req); + + mutex_lock(&parent->d_inode->i_mutex); + + req->r_inode = child->d_inode; + ihold(child->d_inode); + req->r_ino2 = ceph_vino(parent->d_inode); + req->r_locked_dir = parent->d_inode; + req->r_num_caps = 2; + err = ceph_mdsc_do_request(mdsc, NULL, req); + + mutex_unlock(&parent->d_inode->i_mutex); + + if (!err) { + struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; + memcpy(name, rinfo->dname, rinfo->dname_len); + name[rinfo->dname_len] = 0; + dout("get_name %p ino %llx.%llx name %s\n", + child, ceph_vinop(child->d_inode), name); + } else { + dout("get_name %p ino %llx.%llx err %d\n", + child, ceph_vinop(child->d_inode), err); } - dout("fh_to_parent %llx %p dentry %p\n", cfh->ino, inode, dentry); - return dentry; + + ceph_mdsc_put_request(req); + return err; } const struct export_operations ceph_export_ops = { .encode_fh = ceph_encode_fh, .fh_to_dentry = ceph_fh_to_dentry, .fh_to_parent = ceph_fh_to_parent, + .get_parent = ceph_get_parent, + .get_name = ceph_get_name, }; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 596e6cc9f9c4..88a6df4cbe6d 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -210,7 +210,7 @@ int ceph_open(struct inode *inode, struct file *file) ihold(inode); req->r_num_caps = 1; - if (flags & (O_CREAT|O_TRUNC)) + if (flags & O_CREAT) parent_inode = ceph_get_dentry_parent_inode(file->f_dentry); err = ceph_mdsc_do_request(mdsc, parent_inode, req); iput(parent_inode); @@ -291,8 +291,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, } err = finish_open(file, dentry, ceph_open, opened); } - out_err: + if (!req->r_err && req->r_target_inode) + ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode); ceph_mdsc_put_request(req); dout("atomic_open result=%d\n", err); return err; @@ -600,7 +601,7 @@ ceph_sync_direct_write(struct kiocb *iocb, const struct iovec *iov, false); if (IS_ERR(req)) { ret = PTR_ERR(req); - goto out; + break; } num_pages = calc_pages_for(page_align, len); @@ -718,7 +719,7 @@ static ssize_t ceph_sync_write(struct kiocb *iocb, const struct iovec *iov, false); if (IS_ERR(req)) { ret = PTR_ERR(req); - goto out; + break; } /* @@ -970,6 +971,8 @@ retry_snap: goto retry_snap; } } else { + loff_t old_size = inode->i_size; + struct iov_iter from; /* * No need to acquire the i_truncate_mutex. Because * the MDS revokes Fwb caps before sending truncate @@ -977,9 +980,12 @@ retry_snap: * are pending vmtruncate. So write and vmtruncate * can not run at the same time */ - written = generic_file_buffered_write(iocb, iov, nr_segs, - pos, &iocb->ki_pos, - count, 0); + iov_iter_init(&from, iov, nr_segs, count, 0); + written = generic_perform_write(file, &from, pos); + if (likely(written >= 0)) + iocb->ki_pos = pos + written; + if (inode->i_size > old_size) + ceph_fscache_update_objectsize(inode); mutex_unlock(&inode->i_mutex); } diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 32d519d8a2e2..0b0728e5be2d 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -659,14 +659,6 @@ static int fill_inode(struct inode *inode, le32_to_cpu(info->time_warp_seq), &ctime, &mtime, &atime); - /* only update max_size on auth cap */ - if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && - ci->i_max_size != le64_to_cpu(info->max_size)) { - dout("max_size %lld -> %llu\n", ci->i_max_size, - le64_to_cpu(info->max_size)); - ci->i_max_size = le64_to_cpu(info->max_size); - } - ci->i_layout = info->layout; inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; @@ -755,6 +747,14 @@ static int fill_inode(struct inode *inode, ci->i_max_offset = 2; } no_change: + /* only update max_size on auth cap */ + if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && + ci->i_max_size != le64_to_cpu(info->max_size)) { + dout("max_size %lld -> %llu\n", ci->i_max_size, + le64_to_cpu(info->max_size)); + ci->i_max_size = le64_to_cpu(info->max_size); + } + spin_unlock(&ci->i_ceph_lock); /* queue truncate if we saw i_size decrease */ @@ -1044,10 +1044,59 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, session, req->r_request_started, -1, &req->r_caps_reservation); if (err < 0) - return err; + goto done; } else { WARN_ON_ONCE(1); } + + if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME) { + struct qstr dname; + struct dentry *dn, *parent; + + BUG_ON(!rinfo->head->is_target); + BUG_ON(req->r_dentry); + + parent = d_find_any_alias(dir); + BUG_ON(!parent); + + dname.name = rinfo->dname; + dname.len = rinfo->dname_len; + dname.hash = full_name_hash(dname.name, dname.len); + vino.ino = le64_to_cpu(rinfo->targeti.in->ino); + vino.snap = le64_to_cpu(rinfo->targeti.in->snapid); +retry_lookup: + dn = d_lookup(parent, &dname); + dout("d_lookup on parent=%p name=%.*s got %p\n", + parent, dname.len, dname.name, dn); + + if (!dn) { + dn = d_alloc(parent, &dname); + dout("d_alloc %p '%.*s' = %p\n", parent, + dname.len, dname.name, dn); + if (dn == NULL) { + dput(parent); + err = -ENOMEM; + goto done; + } + err = ceph_init_dentry(dn); + if (err < 0) { + dput(dn); + dput(parent); + goto done; + } + } else if (dn->d_inode && + (ceph_ino(dn->d_inode) != vino.ino || + ceph_snap(dn->d_inode) != vino.snap)) { + dout(" dn %p points to wrong inode %p\n", + dn, dn->d_inode); + d_delete(dn); + dput(dn); + goto retry_lookup; + } + + req->r_dentry = dn; + dput(parent); + } } if (rinfo->head->is_target) { @@ -1063,7 +1112,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, err = fill_inode(in, &rinfo->targeti, NULL, session, req->r_request_started, - (le32_to_cpu(rinfo->head->result) == 0) ? + (!req->r_aborted && rinfo->head->result == 0) ? req->r_fmode : -1, &req->r_caps_reservation); if (err < 0) { @@ -1616,8 +1665,6 @@ static const struct inode_operations ceph_symlink_iops = { .getxattr = ceph_getxattr, .listxattr = ceph_listxattr, .removexattr = ceph_removexattr, - .get_acl = ceph_get_acl, - .set_acl = ceph_set_acl, }; /* @@ -1627,7 +1674,6 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct inode *parent_inode; const unsigned int ia_valid = attr->ia_valid; struct ceph_mds_request *req; struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc; @@ -1819,9 +1865,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) req->r_inode_drop = release; req->r_args.setattr.mask = cpu_to_le32(mask); req->r_num_caps = 1; - parent_inode = ceph_get_dentry_parent_inode(dentry); - err = ceph_mdsc_do_request(mdsc, parent_inode, req); - iput(parent_inode); + err = ceph_mdsc_do_request(mdsc, NULL, req); } dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err, ceph_cap_string(dirtied), mask); diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index dc66c9e023e4..fdf941b44ff1 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -1,9 +1,8 @@ +#include <linux/ceph/ceph_debug.h> #include <linux/in.h> #include "super.h" #include "mds_client.h" -#include <linux/ceph/ceph_debug.h> - #include "ioctl.h" @@ -64,7 +63,6 @@ static long __validate_layout(struct ceph_mds_client *mdsc, static long ceph_ioctl_set_layout(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); - struct inode *parent_inode; struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_mds_request *req; struct ceph_ioctl_layout l; @@ -121,9 +119,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) cpu_to_le32(l.object_size); req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool); - parent_inode = ceph_get_dentry_parent_inode(file->f_dentry); - err = ceph_mdsc_do_request(mdsc, parent_inode, req); - iput(parent_inode); + err = ceph_mdsc_do_request(mdsc, NULL, req); ceph_mdsc_put_request(req); return err; } diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index ae6d14e82b0f..d94ba0df9f4d 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -2,11 +2,31 @@ #include <linux/file.h> #include <linux/namei.h> +#include <linux/random.h> #include "super.h" #include "mds_client.h" #include <linux/ceph/pagelist.h> +static u64 lock_secret; + +static inline u64 secure_addr(void *addr) +{ + u64 v = lock_secret ^ (u64)(unsigned long)addr; + /* + * Set the most significant bit, so that MDS knows the 'owner' + * is sufficient to identify the owner of lock. (old code uses + * both 'owner' and 'pid') + */ + v |= (1ULL << 63); + return v; +} + +void __init ceph_flock_init(void) +{ + get_random_bytes(&lock_secret, sizeof(lock_secret)); +} + /** * Implement fcntl and flock locking functions. */ @@ -14,11 +34,11 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, int cmd, u8 wait, struct file_lock *fl) { struct inode *inode = file_inode(file); - struct ceph_mds_client *mdsc = - ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_mds_request *req; int err; u64 length = 0; + u64 owner; req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS); if (IS_ERR(req)) @@ -32,25 +52,27 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, else length = fl->fl_end - fl->fl_start + 1; - dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " - "length: %llu, wait: %d, type: %d", (int)lock_type, - (int)operation, (u64)fl->fl_pid, fl->fl_start, - length, wait, fl->fl_type); + if (lock_type == CEPH_LOCK_FCNTL) + owner = secure_addr(fl->fl_owner); + else + owner = secure_addr(fl->fl_file); + + dout("ceph_lock_message: rule: %d, op: %d, owner: %llx, pid: %llu, " + "start: %llu, length: %llu, wait: %d, type: %d", (int)lock_type, + (int)operation, owner, (u64)fl->fl_pid, fl->fl_start, length, + wait, fl->fl_type); req->r_args.filelock_change.rule = lock_type; req->r_args.filelock_change.type = cmd; + req->r_args.filelock_change.owner = cpu_to_le64(owner); req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid); - /* This should be adjusted, but I'm not sure if - namespaces actually get id numbers*/ - req->r_args.filelock_change.pid_namespace = - cpu_to_le64((u64)(unsigned long)fl->fl_nspid); req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start); req->r_args.filelock_change.length = cpu_to_le64(length); req->r_args.filelock_change.wait = wait; err = ceph_mdsc_do_request(mdsc, inode, req); - if ( operation == CEPH_MDS_OP_GETFILELOCK){ + if (operation == CEPH_MDS_OP_GETFILELOCK) { fl->fl_pid = le64_to_cpu(req->r_reply_info.filelock_reply->pid); if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type) fl->fl_type = F_RDLCK; @@ -87,14 +109,19 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) u8 wait = 0; u16 op = CEPH_MDS_OP_SETFILELOCK; - fl->fl_nspid = get_pid(task_tgid(current)); - dout("ceph_lock, fl_pid:%d", fl->fl_pid); + if (!(fl->fl_flags & FL_POSIX)) + return -ENOLCK; + /* No mandatory locks */ + if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK) + return -ENOLCK; + + dout("ceph_lock, fl_owner: %p", fl->fl_owner); /* set wait bit as appropriate, then make command as Ceph expects it*/ - if (F_SETLKW == cmd) - wait = 1; - if (F_GETLK == cmd) + if (IS_GETLK(cmd)) op = CEPH_MDS_OP_GETFILELOCK; + else if (IS_SETLKW(cmd)) + wait = 1; if (F_RDLCK == fl->fl_type) lock_cmd = CEPH_LOCK_SHARED; @@ -105,7 +132,7 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl); if (!err) { - if ( op != CEPH_MDS_OP_GETFILELOCK ){ + if (op != CEPH_MDS_OP_GETFILELOCK) { dout("mds locked, locking locally"); err = posix_lock_file(file, fl, NULL); if (err && (CEPH_MDS_OP_SETFILELOCK == op)) { @@ -131,20 +158,22 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) { u8 lock_cmd; int err; - u8 wait = 1; - - fl->fl_nspid = get_pid(task_tgid(current)); - dout("ceph_flock, fl_pid:%d", fl->fl_pid); - - /* set wait bit, then clear it out of cmd*/ - if (cmd & LOCK_NB) - wait = 0; - cmd = cmd & (LOCK_SH | LOCK_EX | LOCK_UN); - /* set command sequence that Ceph wants to see: - shared lock, exclusive lock, or unlock */ - if (LOCK_SH == cmd) + u8 wait = 0; + + if (!(fl->fl_flags & FL_FLOCK)) + return -ENOLCK; + /* No mandatory locks */ + if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK) + return -ENOLCK; + + dout("ceph_flock, fl_file: %p", fl->fl_file); + + if (IS_SETLKW(cmd)) + wait = 1; + + if (F_RDLCK == fl->fl_type) lock_cmd = CEPH_LOCK_SHARED; - else if (LOCK_EX == cmd) + else if (F_WRLCK == fl->fl_type) lock_cmd = CEPH_LOCK_EXCL; else lock_cmd = CEPH_LOCK_UNLOCK; @@ -280,13 +309,14 @@ int lock_to_ceph_filelock(struct file_lock *lock, struct ceph_filelock *cephlock) { int err = 0; - cephlock->start = cpu_to_le64(lock->fl_start); cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1); cephlock->client = cpu_to_le64(0); - cephlock->pid = cpu_to_le64(lock->fl_pid); - cephlock->pid_namespace = - cpu_to_le64((u64)(unsigned long)lock->fl_nspid); + cephlock->pid = cpu_to_le64((u64)lock->fl_pid); + if (lock->fl_flags & FL_POSIX) + cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner)); + else + cephlock->owner = cpu_to_le64(secure_addr(lock->fl_file)); switch (lock->fl_type) { case F_RDLCK: diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index f4f050a69a48..2b4d093d0563 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -3,6 +3,7 @@ #include <linux/fs.h> #include <linux/wait.h> #include <linux/slab.h> +#include <linux/gfp.h> #include <linux/sched.h> #include <linux/debugfs.h> #include <linux/seq_file.h> @@ -165,21 +166,18 @@ static int parse_reply_info_dir(void **p, void *end, if (num == 0) goto done; - /* alloc large array */ - info->dir_nr = num; - info->dir_in = kcalloc(num, sizeof(*info->dir_in) + - sizeof(*info->dir_dname) + - sizeof(*info->dir_dname_len) + - sizeof(*info->dir_dlease), - GFP_NOFS); - if (info->dir_in == NULL) { - err = -ENOMEM; - goto out_bad; - } + BUG_ON(!info->dir_in); info->dir_dname = (void *)(info->dir_in + num); info->dir_dname_len = (void *)(info->dir_dname + num); info->dir_dlease = (void *)(info->dir_dname_len + num); + if ((unsigned long)(info->dir_dlease + num) > + (unsigned long)info->dir_in + info->dir_buf_size) { + pr_err("dir contents are larger than expected\n"); + WARN_ON(1); + goto bad; + } + info->dir_nr = num; while (num) { /* dentry */ ceph_decode_need(p, end, sizeof(u32)*2, bad); @@ -327,7 +325,9 @@ out_bad: static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info) { - kfree(info->dir_in); + if (!info->dir_in) + return; + free_pages((unsigned long)info->dir_in, get_order(info->dir_buf_size)); } @@ -512,12 +512,11 @@ void ceph_mdsc_release_request(struct kref *kref) struct ceph_mds_request *req = container_of(kref, struct ceph_mds_request, r_kref); + destroy_reply_info(&req->r_reply_info); if (req->r_request) ceph_msg_put(req->r_request); - if (req->r_reply) { + if (req->r_reply) ceph_msg_put(req->r_reply); - destroy_reply_info(&req->r_reply_info); - } if (req->r_inode) { ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); iput(req->r_inode); @@ -528,7 +527,9 @@ void ceph_mdsc_release_request(struct kref *kref) iput(req->r_target_inode); if (req->r_dentry) dput(req->r_dentry); - if (req->r_old_dentry) { + if (req->r_old_dentry) + dput(req->r_old_dentry); + if (req->r_old_dentry_dir) { /* * track (and drop pins for) r_old_dentry_dir * separately, since r_old_dentry's d_parent may have @@ -537,7 +538,6 @@ void ceph_mdsc_release_request(struct kref *kref) */ ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir), CEPH_CAP_PIN); - dput(req->r_old_dentry); iput(req->r_old_dentry_dir); } kfree(req->r_path1); @@ -1311,6 +1311,9 @@ static int trim_caps(struct ceph_mds_client *mdsc, trim_caps - session->s_trim_caps); session->s_trim_caps = 0; } + + ceph_add_cap_releases(mdsc, session); + ceph_send_cap_releases(mdsc, session); return 0; } @@ -1461,15 +1464,18 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc, dout("discard_cap_releases mds%d\n", session->s_mds); - /* zero out the in-progress message */ - msg = list_first_entry(&session->s_cap_releases, - struct ceph_msg, list_head); - head = msg->front.iov_base; - num = le32_to_cpu(head->num); - dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num); - head->num = cpu_to_le32(0); - msg->front.iov_len = sizeof(*head); - session->s_num_cap_releases += num; + if (!list_empty(&session->s_cap_releases)) { + /* zero out the in-progress message */ + msg = list_first_entry(&session->s_cap_releases, + struct ceph_msg, list_head); + head = msg->front.iov_base; + num = le32_to_cpu(head->num); + dout("discard_cap_releases mds%d %p %u\n", + session->s_mds, msg, num); + head->num = cpu_to_le32(0); + msg->front.iov_len = sizeof(*head); + session->s_num_cap_releases += num; + } /* requeue completed messages */ while (!list_empty(&session->s_cap_releases_done)) { @@ -1492,6 +1498,43 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc, * requests */ +int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, + struct inode *dir) +{ + struct ceph_inode_info *ci = ceph_inode(dir); + struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; + struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options; + size_t size = sizeof(*rinfo->dir_in) + sizeof(*rinfo->dir_dname_len) + + sizeof(*rinfo->dir_dname) + sizeof(*rinfo->dir_dlease); + int order, num_entries; + + spin_lock(&ci->i_ceph_lock); + num_entries = ci->i_files + ci->i_subdirs; + spin_unlock(&ci->i_ceph_lock); + num_entries = max(num_entries, 1); + num_entries = min(num_entries, opt->max_readdir); + + order = get_order(size * num_entries); + while (order >= 0) { + rinfo->dir_in = (void*)__get_free_pages(GFP_NOFS | __GFP_NOWARN, + order); + if (rinfo->dir_in) + break; + order--; + } + if (!rinfo->dir_in) + return -ENOMEM; + + num_entries = (PAGE_SIZE << order) / size; + num_entries = min(num_entries, opt->max_readdir); + + rinfo->dir_buf_size = PAGE_SIZE << order; + req->r_num_caps = num_entries + 1; + req->r_args.readdir.max_entries = cpu_to_le32(num_entries); + req->r_args.readdir.max_bytes = cpu_to_le32(opt->max_readdir_bytes); + return 0; +} + /* * Create an mds request. */ @@ -2053,7 +2096,7 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); if (req->r_locked_dir) ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN); - if (req->r_old_dentry) + if (req->r_old_dentry_dir) ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir), CEPH_CAP_PIN); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 68288917c737..e90cfccf93bd 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -67,6 +67,7 @@ struct ceph_mds_reply_info_parsed { /* for readdir results */ struct { struct ceph_mds_reply_dirfrag *dir_dir; + size_t dir_buf_size; int dir_nr; char **dir_dname; u32 *dir_dname_len; @@ -346,7 +347,8 @@ extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct dentry *dn); extern void ceph_invalidate_dir_request(struct ceph_mds_request *req); - +extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, + struct inode *dir); extern struct ceph_mds_request * ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode); extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c index 4440f447fd3f..51cc23e48111 100644 --- a/fs/ceph/strings.c +++ b/fs/ceph/strings.c @@ -54,6 +54,7 @@ const char *ceph_mds_op_name(int op) case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash"; case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent"; case CEPH_MDS_OP_LOOKUPINO: return "lookupino"; + case CEPH_MDS_OP_LOOKUPNAME: return "lookupname"; case CEPH_MDS_OP_GETATTR: return "getattr"; case CEPH_MDS_OP_SETXATTR: return "setxattr"; case CEPH_MDS_OP_SETATTR: return "setattr"; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 10a4ccbf38da..06150fd745ac 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -1026,6 +1026,7 @@ static int __init init_ceph(void) if (ret) goto out; + ceph_flock_init(); ceph_xattr_init(); ret = register_filesystem(&ceph_fs_type); if (ret) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index d8801a95b685..7866cd05a6bb 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -577,7 +577,7 @@ struct ceph_file_info { /* readdir: position within a frag */ unsigned offset; /* offset of last chunk, adjusted for . and .. */ - u64 next_offset; /* offset of next chunk (last_name's + 1) */ + unsigned next_offset; /* offset of next chunk (last_name's + 1) */ char *last_name; /* last entry in previous chunk */ struct dentry *dentry; /* next dentry (for dcache readdir) */ int dir_release_count; @@ -871,6 +871,7 @@ extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg); extern const struct export_operations ceph_export_ops; /* locks.c */ +extern __init void ceph_flock_init(void); extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl); extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl); extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index a55ec37378c6..c9c2b887381e 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -64,32 +64,48 @@ static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci) } static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, - size_t size) + size_t size) { int ret; struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); struct ceph_osd_client *osdc = &fsc->client->osdc; s64 pool = ceph_file_layout_pg_pool(ci->i_layout); const char *pool_name; + char buf[128]; dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode); down_read(&osdc->map_sem); pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); - if (pool_name) - ret = snprintf(val, size, - "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%s", + if (pool_name) { + size_t len = strlen(pool_name); + ret = snprintf(buf, sizeof(buf), + "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=", (unsigned long long)ceph_file_layout_su(ci->i_layout), (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), - (unsigned long long)ceph_file_layout_object_size(ci->i_layout), - pool_name); - else - ret = snprintf(val, size, + (unsigned long long)ceph_file_layout_object_size(ci->i_layout)); + if (!size) { + ret += len; + } else if (ret + len > size) { + ret = -ERANGE; + } else { + memcpy(val, buf, ret); + memcpy(val + ret, pool_name, len); + ret += len; + } + } else { + ret = snprintf(buf, sizeof(buf), "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%lld", (unsigned long long)ceph_file_layout_su(ci->i_layout), (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), (unsigned long long)ceph_file_layout_object_size(ci->i_layout), (unsigned long long)pool); - + if (size) { + if (ret <= size) + memcpy(val, buf, ret); + else + ret = -ERANGE; + } + } up_read(&osdc->map_sem); return ret; } @@ -215,7 +231,7 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = { .name_size = sizeof("ceph.dir.layout"), .getxattr_cb = ceph_vxattrcb_layout, .readonly = false, - .hidden = false, + .hidden = true, .exists_cb = ceph_vxattrcb_layout_exists, }, XATTR_LAYOUT_FIELD(dir, layout, stripe_unit), @@ -242,7 +258,7 @@ static struct ceph_vxattr ceph_file_vxattrs[] = { .name_size = sizeof("ceph.file.layout"), .getxattr_cb = ceph_vxattrcb_layout, .readonly = false, - .hidden = false, + .hidden = true, .exists_cb = ceph_vxattrcb_layout_exists, }, XATTR_LAYOUT_FIELD(file, layout, stripe_unit), @@ -842,7 +858,6 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); struct inode *inode = dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct inode *parent_inode; struct ceph_mds_request *req; struct ceph_mds_client *mdsc = fsc->mdsc; int err; @@ -893,9 +908,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, req->r_data_len = size; dout("xattr.ver (before): %lld\n", ci->i_xattrs.version); - parent_inode = ceph_get_dentry_parent_inode(dentry); - err = ceph_mdsc_do_request(mdsc, parent_inode, req); - iput(parent_inode); + err = ceph_mdsc_do_request(mdsc, NULL, req); ceph_mdsc_put_request(req); dout("xattr.ver (after): %lld\n", ci->i_xattrs.version); @@ -1019,7 +1032,6 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name) struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); struct ceph_mds_client *mdsc = fsc->mdsc; struct inode *inode = dentry->d_inode; - struct inode *parent_inode; struct ceph_mds_request *req; int err; @@ -1033,9 +1045,7 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name) req->r_num_caps = 1; req->r_path2 = kstrdup(name, GFP_NOFS); - parent_inode = ceph_get_dentry_parent_inode(dentry); - err = ceph_mdsc_do_request(mdsc, parent_inode, req); - iput(parent_inode); + err = ceph_mdsc_do_request(mdsc, NULL, req); ceph_mdsc_put_request(req); return err; } diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 2c70cbe35d39..5be1f997ecde 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -253,6 +253,11 @@ cifs_alloc_inode(struct super_block *sb) cifs_set_oplock_level(cifs_inode, 0); cifs_inode->delete_pending = false; cifs_inode->invalid_mapping = false; + clear_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cifs_inode->flags); + clear_bit(CIFS_INODE_PENDING_WRITERS, &cifs_inode->flags); + clear_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cifs_inode->flags); + spin_lock_init(&cifs_inode->writers_lock); + cifs_inode->writers = 0; cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ cifs_inode->server_eof = 0; cifs_inode->uniqueid = 0; @@ -732,19 +737,26 @@ static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { struct inode *inode = file_inode(iocb->ki_filp); + struct cifsInodeInfo *cinode = CIFS_I(inode); ssize_t written; int rc; + written = cifs_get_writer(cinode); + if (written) + return written; + written = generic_file_aio_write(iocb, iov, nr_segs, pos); if (CIFS_CACHE_WRITE(CIFS_I(inode))) - return written; + goto out; rc = filemap_fdatawrite(inode->i_mapping); if (rc) cifs_dbg(FYI, "cifs_file_aio_write: %d rc on %p inode\n", rc, inode); +out: + cifs_put_writer(cinode); return written; } @@ -850,7 +862,6 @@ const struct inode_operations cifs_file_inode_ops = { /* revalidate:cifs_revalidate, */ .setattr = cifs_setattr, .getattr = cifs_getattr, /* do we need this anymore? */ - .rename = cifs_rename, .permission = cifs_permission, #ifdef CONFIG_CIFS_XATTR .setxattr = cifs_setxattr, diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index c0f3718b77a8..30f6e9251a4a 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -228,6 +228,8 @@ struct smb_version_operations { /* verify the message */ int (*check_message)(char *, unsigned int); bool (*is_oplock_break)(char *, struct TCP_Server_Info *); + void (*downgrade_oplock)(struct TCP_Server_Info *, + struct cifsInodeInfo *, bool); /* process transaction2 response */ bool (*check_trans2)(struct mid_q_entry *, struct TCP_Server_Info *, char *, int); @@ -1113,6 +1115,12 @@ struct cifsInodeInfo { unsigned int epoch; /* used to track lease state changes */ bool delete_pending; /* DELETE_ON_CLOSE is set */ bool invalid_mapping; /* pagecache is invalid */ + unsigned long flags; +#define CIFS_INODE_PENDING_OPLOCK_BREAK (0) /* oplock break in progress */ +#define CIFS_INODE_PENDING_WRITERS (1) /* Writes in progress */ +#define CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2 (2) /* Downgrade oplock to L2 */ + spinlock_t writers_lock; + unsigned int writers; /* Number of writers on this inode */ unsigned long time; /* jiffies of last update of inode */ u64 server_eof; /* current file size on server -- protected by i_lock */ u64 uniqueid; /* server inode number */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index acc4ee8ed075..ca7980a1e303 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -127,6 +127,9 @@ extern u64 cifs_UnixTimeToNT(struct timespec); extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset); extern void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock); +extern int cifs_get_writer(struct cifsInodeInfo *cinode); +extern void cifs_put_writer(struct cifsInodeInfo *cinode); +extern void cifs_done_oplock_break(struct cifsInodeInfo *cinode); extern int cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, const unsigned int xid); extern int cifs_push_mandatory_locks(struct cifsFileInfo *cfile); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index f3264bd7a83d..6ce4e0954b98 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -6197,6 +6197,9 @@ QAllEAsRetry: cifs_dbg(FYI, "ea length %d\n", list_len); if (list_len <= 8) { cifs_dbg(FYI, "empty EA list returned from server\n"); + /* didn't find the named attribute */ + if (ea_name) + rc = -ENODATA; goto QAllEAsOut; } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 834fce759d80..5ed03e0b8b40 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2579,19 +2579,32 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov, struct cifsInodeInfo *cinode = CIFS_I(inode); struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server; ssize_t rc = -EACCES; - loff_t lock_pos = pos; + loff_t lock_pos = iocb->ki_pos; - if (file->f_flags & O_APPEND) - lock_pos = i_size_read(inode); /* * We need to hold the sem to be sure nobody modifies lock list * with a brlock that prevents writing. */ down_read(&cinode->lock_sem); + mutex_lock(&inode->i_mutex); + if (file->f_flags & O_APPEND) + lock_pos = i_size_read(inode); if (!cifs_find_lock_conflict(cfile, lock_pos, iov_length(iov, nr_segs), server->vals->exclusive_lock_type, NULL, - CIFS_WRITE_OP)) - rc = generic_file_aio_write(iocb, iov, nr_segs, pos); + CIFS_WRITE_OP)) { + rc = __generic_file_aio_write(iocb, iov, nr_segs); + mutex_unlock(&inode->i_mutex); + + if (rc > 0) { + ssize_t err; + + err = generic_write_sync(file, iocb->ki_pos - rc, rc); + if (err < 0) + rc = err; + } + } else { + mutex_unlock(&inode->i_mutex); + } up_read(&cinode->lock_sem); return rc; } @@ -2608,12 +2621,20 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); ssize_t written; + written = cifs_get_writer(cinode); + if (written) + return written; + if (CIFS_CACHE_WRITE(cinode)) { if (cap_unix(tcon->ses) && (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) - && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) - return generic_file_aio_write(iocb, iov, nr_segs, pos); - return cifs_writev(iocb, iov, nr_segs, pos); + && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) { + written = generic_file_aio_write( + iocb, iov, nr_segs, pos); + goto out; + } + written = cifs_writev(iocb, iov, nr_segs, pos); + goto out; } /* * For non-oplocked files in strict cache mode we need to write the data @@ -2633,6 +2654,8 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, inode); cinode->oplock = 0; } +out: + cifs_put_writer(cinode); return written; } @@ -2727,56 +2750,27 @@ cifs_retry_async_readv(struct cifs_readdata *rdata) /** * cifs_readdata_to_iov - copy data from pages in response to an iovec * @rdata: the readdata response with list of pages holding data - * @iov: vector in which we should copy the data - * @nr_segs: number of segments in vector - * @offset: offset into file of the first iovec - * @copied: used to return the amount of data copied to the iov + * @iter: destination for our data * * This function copies data from a list of pages in a readdata response into * an array of iovecs. It will first calculate where the data should go * based on the info in the readdata and then copy the data into that spot. */ -static ssize_t -cifs_readdata_to_iov(struct cifs_readdata *rdata, const struct iovec *iov, - unsigned long nr_segs, loff_t offset, ssize_t *copied) +static int +cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter) { - int rc = 0; - struct iov_iter ii; - size_t pos = rdata->offset - offset; - ssize_t remaining = rdata->bytes; - unsigned char *pdata; + size_t remaining = rdata->bytes; unsigned int i; - /* set up iov_iter and advance to the correct offset */ - iov_iter_init(&ii, iov, nr_segs, iov_length(iov, nr_segs), 0); - iov_iter_advance(&ii, pos); - - *copied = 0; for (i = 0; i < rdata->nr_pages; i++) { - ssize_t copy; struct page *page = rdata->pages[i]; - - /* copy a whole page or whatever's left */ - copy = min_t(ssize_t, remaining, PAGE_SIZE); - - /* ...but limit it to whatever space is left in the iov */ - copy = min_t(ssize_t, copy, iov_iter_count(&ii)); - - /* go while there's data to be copied and no errors */ - if (copy && !rc) { - pdata = kmap(page); - rc = memcpy_toiovecend(ii.iov, pdata, ii.iov_offset, - (int)copy); - kunmap(page); - if (!rc) { - *copied += copy; - remaining -= copy; - iov_iter_advance(&ii, copy); - } - } + size_t copy = min_t(size_t, remaining, PAGE_SIZE); + size_t written = copy_page_to_iter(page, 0, copy, iter); + remaining -= written; + if (written < copy && iov_iter_count(iter) > 0) + break; } - - return rc; + return remaining ? -EFAULT : 0; } static void @@ -2837,20 +2831,21 @@ cifs_uncached_read_into_pages(struct TCP_Server_Info *server, return total_read > 0 ? total_read : result; } -static ssize_t -cifs_iovec_read(struct file *file, const struct iovec *iov, - unsigned long nr_segs, loff_t *poffset) +ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) { + struct file *file = iocb->ki_filp; ssize_t rc; size_t len, cur_len; ssize_t total_read = 0; - loff_t offset = *poffset; + loff_t offset = pos; unsigned int npages; struct cifs_sb_info *cifs_sb; struct cifs_tcon *tcon; struct cifsFileInfo *open_file; struct cifs_readdata *rdata, *tmp; struct list_head rdata_list; + struct iov_iter to; pid_t pid; if (!nr_segs) @@ -2860,6 +2855,8 @@ cifs_iovec_read(struct file *file, const struct iovec *iov, if (!len) return 0; + iov_iter_init(&to, iov, nr_segs, len, 0); + INIT_LIST_HEAD(&rdata_list); cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); open_file = file->private_data; @@ -2885,7 +2882,7 @@ cifs_iovec_read(struct file *file, const struct iovec *iov, cifs_uncached_readv_complete); if (!rdata) { rc = -ENOMEM; - goto error; + break; } rc = cifs_read_allocate_pages(rdata, npages); @@ -2917,55 +2914,44 @@ error: if (!list_empty(&rdata_list)) rc = 0; + len = iov_iter_count(&to); /* the loop below should proceed in the order of increasing offsets */ -restart_loop: list_for_each_entry_safe(rdata, tmp, &rdata_list, list) { + again: if (!rc) { - ssize_t copied; - /* FIXME: freezable sleep too? */ rc = wait_for_completion_killable(&rdata->done); if (rc) rc = -EINTR; - else if (rdata->result) + else if (rdata->result) { rc = rdata->result; - else { - rc = cifs_readdata_to_iov(rdata, iov, - nr_segs, *poffset, - &copied); - total_read += copied; + /* resend call if it's a retryable error */ + if (rc == -EAGAIN) { + rc = cifs_retry_async_readv(rdata); + goto again; + } + } else { + rc = cifs_readdata_to_iov(rdata, &to); } - /* resend call if it's a retryable error */ - if (rc == -EAGAIN) { - rc = cifs_retry_async_readv(rdata); - goto restart_loop; - } } list_del_init(&rdata->list); kref_put(&rdata->refcount, cifs_uncached_readdata_release); } + total_read = len - iov_iter_count(&to); + cifs_stats_bytes_read(tcon, total_read); - *poffset += total_read; /* mask nodata case */ if (rc == -ENODATA) rc = 0; - return total_read ? total_read : rc; -} - -ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) -{ - ssize_t read; - - read = cifs_iovec_read(iocb->ki_filp, iov, nr_segs, &pos); - if (read > 0) - iocb->ki_pos = pos; - - return read; + if (total_read) { + iocb->ki_pos = pos + total_read; + return total_read; + } + return rc; } ssize_t @@ -3113,6 +3099,7 @@ cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) static struct vm_operations_struct cifs_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = cifs_page_mkwrite, .remap_pages = generic_file_remap_pages, }; @@ -3644,6 +3631,13 @@ static int cifs_launder_page(struct page *page) return rc; } +static int +cifs_pending_writers_wait(void *unused) +{ + schedule(); + return 0; +} + void cifs_oplock_break(struct work_struct *work) { struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo, @@ -3651,8 +3645,15 @@ void cifs_oplock_break(struct work_struct *work) struct inode *inode = cfile->dentry->d_inode; struct cifsInodeInfo *cinode = CIFS_I(inode); struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + struct TCP_Server_Info *server = tcon->ses->server; int rc = 0; + wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS, + cifs_pending_writers_wait, TASK_UNINTERRUPTIBLE); + + server->ops->downgrade_oplock(server, cinode, + test_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags)); + if (!CIFS_CACHE_WRITE(cinode) && CIFS_CACHE_READ(cinode) && cifs_has_mand_locks(cinode)) { cifs_dbg(FYI, "Reset oplock to None for inode=%p due to mand locks\n", @@ -3689,6 +3690,7 @@ void cifs_oplock_break(struct work_struct *work) cinode); cifs_dbg(FYI, "Oplock release rc = %d\n", rc); } + cifs_done_oplock_break(cinode); } /* diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 2f9f3790679d..3b0c62e622da 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -466,8 +466,22 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) cifs_dbg(FYI, "file id match, oplock break\n"); pCifsInode = CIFS_I(netfile->dentry->d_inode); - cifs_set_oplock_level(pCifsInode, - pSMB->OplockLevel ? OPLOCK_READ : 0); + set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, + &pCifsInode->flags); + + /* + * Set flag if the server downgrades the oplock + * to L2 else clear. + */ + if (pSMB->OplockLevel) + set_bit( + CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, + &pCifsInode->flags); + else + clear_bit( + CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, + &pCifsInode->flags); + queue_work(cifsiod_wq, &netfile->oplock_break); netfile->oplock_break_cancelled = false; @@ -551,6 +565,62 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock) cinode->oplock = 0; } +static int +cifs_oplock_break_wait(void *unused) +{ + schedule(); + return signal_pending(current) ? -ERESTARTSYS : 0; +} + +/* + * We wait for oplock breaks to be processed before we attempt to perform + * writes. + */ +int cifs_get_writer(struct cifsInodeInfo *cinode) +{ + int rc; + +start: + rc = wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_OPLOCK_BREAK, + cifs_oplock_break_wait, TASK_KILLABLE); + if (rc) + return rc; + + spin_lock(&cinode->writers_lock); + if (!cinode->writers) + set_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags); + cinode->writers++; + /* Check to see if we have started servicing an oplock break */ + if (test_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags)) { + cinode->writers--; + if (cinode->writers == 0) { + clear_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags); + wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS); + } + spin_unlock(&cinode->writers_lock); + goto start; + } + spin_unlock(&cinode->writers_lock); + return 0; +} + +void cifs_put_writer(struct cifsInodeInfo *cinode) +{ + spin_lock(&cinode->writers_lock); + cinode->writers--; + if (cinode->writers == 0) { + clear_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags); + wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS); + } + spin_unlock(&cinode->writers_lock); +} + +void cifs_done_oplock_break(struct cifsInodeInfo *cinode) +{ + clear_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags); + wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_OPLOCK_BREAK); +} + bool backup_cred(struct cifs_sb_info *cifs_sb) { diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 526fb89f9230..d1fdfa848703 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -372,6 +372,16 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr) return 0; } +static void +cifs_downgrade_oplock(struct TCP_Server_Info *server, + struct cifsInodeInfo *cinode, bool set_level2) +{ + if (set_level2) + cifs_set_oplock_level(cinode, OPLOCK_READ); + else + cifs_set_oplock_level(cinode, 0); +} + static bool cifs_check_trans2(struct mid_q_entry *mid, struct TCP_Server_Info *server, char *buf, int malformed) @@ -1019,6 +1029,7 @@ struct smb_version_operations smb1_operations = { .clear_stats = cifs_clear_stats, .print_stats = cifs_print_stats, .is_oplock_break = is_valid_oplock_break, + .downgrade_oplock = cifs_downgrade_oplock, .check_trans2 = cifs_check_trans2, .need_neg = cifs_need_neg, .negotiate = cifs_negotiate, diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index fb3966265b6e..b8021fde987d 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -575,9 +575,21 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) else cfile->oplock_break_cancelled = false; - server->ops->set_oplock_level(cinode, - rsp->OplockLevel ? SMB2_OPLOCK_LEVEL_II : 0, - 0, NULL); + set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, + &cinode->flags); + + /* + * Set flag if the server downgrades the oplock + * to L2 else clear. + */ + if (rsp->OplockLevel) + set_bit( + CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, + &cinode->flags); + else + clear_bit( + CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, + &cinode->flags); queue_work(cifsiod_wq, &cfile->oplock_break); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 192f51a12cf1..35ddc3ed119d 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -905,6 +905,17 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, } static void +smb2_downgrade_oplock(struct TCP_Server_Info *server, + struct cifsInodeInfo *cinode, bool set_level2) +{ + if (set_level2) + server->ops->set_oplock_level(cinode, SMB2_OPLOCK_LEVEL_II, + 0, NULL); + else + server->ops->set_oplock_level(cinode, 0, 0, NULL); +} + +static void smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, unsigned int epoch, bool *purge_cache) { @@ -1110,6 +1121,7 @@ struct smb_version_operations smb20_operations = { .clear_stats = smb2_clear_stats, .print_stats = smb2_print_stats, .is_oplock_break = smb2_is_valid_oplock_break, + .downgrade_oplock = smb2_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, .negotiate_wsize = smb2_negotiate_wsize, @@ -1184,6 +1196,7 @@ struct smb_version_operations smb21_operations = { .clear_stats = smb2_clear_stats, .print_stats = smb2_print_stats, .is_oplock_break = smb2_is_valid_oplock_break, + .downgrade_oplock = smb2_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, .negotiate_wsize = smb2_negotiate_wsize, @@ -1259,6 +1272,7 @@ struct smb_version_operations smb30_operations = { .print_stats = smb2_print_stats, .dump_share_caps = smb2_dump_share_caps, .is_oplock_break = smb2_is_valid_oplock_break, + .downgrade_oplock = smb2_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, .negotiate_wsize = smb2_negotiate_wsize, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 860344701067..3802f8c94acc 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1352,7 +1352,6 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid) { int rc; - char *res_key = NULL; struct compress_ioctl fsctl_input; char *ret_data = NULL; @@ -1365,7 +1364,6 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, 2 /* in data len */, &ret_data /* out data */, NULL); cifs_dbg(FYI, "set compression rc %d\n", rc); - kfree(res_key); return rc; } diff --git a/fs/coredump.c b/fs/coredump.c index e3ad709a4232..0b2528fb640e 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -73,10 +73,15 @@ static int expand_corename(struct core_name *cn, int size) static int cn_vprintf(struct core_name *cn, const char *fmt, va_list arg) { int free, need; + va_list arg_copy; again: free = cn->size - cn->used; - need = vsnprintf(cn->corename + cn->used, free, fmt, arg); + + va_copy(arg_copy, arg); + need = vsnprintf(cn->corename + cn->used, free, fmt, arg_copy); + va_end(arg_copy); + if (need < free) { cn->used += need; return 0; diff --git a/fs/dcache.c b/fs/dcache.c index 66cba5a8a346..40707d88a945 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -3144,6 +3144,7 @@ char *simple_dname(struct dentry *dentry, char *buffer, int buflen) end = ERR_PTR(-ENAMETOOLONG); return end; } +EXPORT_SYMBOL(simple_dname); /* * Write full pathname from the root of the filesystem into the buffer. diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 3190ca973dd6..1e5b45359509 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -424,7 +424,7 @@ int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len) } /* Data available on socket or listen socket received a connect */ -static void lowcomms_data_ready(struct sock *sk, int count_unused) +static void lowcomms_data_ready(struct sock *sk) { struct connection *con = sock2con(sk); if (con && !test_and_set_bit(CF_READ_PENDING, &con->flags)) diff --git a/fs/exec.c b/fs/exec.c index 25dfeba6d55f..476f3ebf437e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -26,6 +26,7 @@ #include <linux/file.h> #include <linux/fdtable.h> #include <linux/mm.h> +#include <linux/vmacache.h> #include <linux/stat.h> #include <linux/fcntl.h> #include <linux/swap.h> @@ -812,7 +813,7 @@ EXPORT_SYMBOL(kernel_read); ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len) { - ssize_t res = file->f_op->read(file, (void __user *)addr, len, &pos); + ssize_t res = vfs_read(file, (void __user *)addr, len, &pos); if (res > 0) flush_icache_range(addr, addr + len); return res; @@ -822,7 +823,7 @@ EXPORT_SYMBOL(read_code); static int exec_mmap(struct mm_struct *mm) { struct task_struct *tsk; - struct mm_struct * old_mm, *active_mm; + struct mm_struct *old_mm, *active_mm; /* Notify parent that we're no longer interested in the old VM */ tsk = current; @@ -848,6 +849,8 @@ static int exec_mmap(struct mm_struct *mm) tsk->mm = mm; tsk->active_mm = mm; activate_mm(active_mm, mm); + tsk->mm->vmacache_seqnum = 0; + vmacache_flush(tsk); task_unlock(tsk); if (old_mm) { up_read(&old_mm->mmap_sem); @@ -1043,7 +1046,7 @@ EXPORT_SYMBOL_GPL(get_task_comm); * so that a new one can be started */ -void set_task_comm(struct task_struct *tsk, char *buf) +void set_task_comm(struct task_struct *tsk, const char *buf) { task_lock(tsk); trace_task_rename(tsk, buf); @@ -1052,21 +1055,6 @@ void set_task_comm(struct task_struct *tsk, char *buf) perf_event_comm(tsk); } -static void filename_to_taskname(char *tcomm, const char *fn, unsigned int len) -{ - int i, ch; - - /* Copies the binary name from after last slash */ - for (i = 0; (ch = *(fn++)) != '\0';) { - if (ch == '/') - i = 0; /* overwrite what we wrote */ - else - if (i < len - 1) - tcomm[i++] = ch; - } - tcomm[i] = '\0'; -} - int flush_old_exec(struct linux_binprm * bprm) { int retval; @@ -1080,8 +1068,6 @@ int flush_old_exec(struct linux_binprm * bprm) goto out; set_mm_exe_file(bprm->mm, bprm->file); - - filename_to_taskname(bprm->tcomm, bprm->filename, sizeof(bprm->tcomm)); /* * Release all of the old mmap stuff */ @@ -1124,7 +1110,7 @@ void setup_new_exec(struct linux_binprm * bprm) else set_dumpable(current->mm, suid_dumpable); - set_task_comm(current, bprm->tcomm); + set_task_comm(current, kbasename(bprm->filename)); /* Set the new mm task size. We have to do that late because it may * depend on TIF_32BIT which is only updated in flush_thread() on diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c index 7682b970d0f1..4e2c032ab8a1 100644 --- a/fs/exofs/ore_raid.c +++ b/fs/exofs/ore_raid.c @@ -21,12 +21,12 @@ #undef ORE_DBGMSG2 #define ORE_DBGMSG2 ORE_DBGMSG -struct page *_raid_page_alloc(void) +static struct page *_raid_page_alloc(void) { return alloc_page(GFP_KERNEL); } -void _raid_page_free(struct page *p) +static void _raid_page_free(struct page *p) { __free_page(p); } diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 9d9763328734..ed73ed8ebbee 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -543,7 +543,7 @@ static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev, return !(odi->systemid_len || odi->osdname_len); } -int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs, +static int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs, struct exofs_dev **peds) { struct __alloc_ore_devs_and_exofs_devs { diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index 1b8001bbe947..27695e6f4e46 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -4,7 +4,6 @@ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> */ -#include <linux/capability.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/slab.h> diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 7cadd823bb31..7d66fb0e4cca 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -284,7 +284,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent) int best_ndir = inodes_per_group; int best_group = -1; - get_random_bytes(&group, sizeof(group)); + group = prandom_u32(); parent_group = (unsigned)group % ngroups; for (i = 0; i < ngroups; i++) { group = (parent_group + i) % ngroups; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index d260115c0350..3750031cfa2f 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -192,7 +192,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -static int init_inodecache(void) +static int __init init_inodecache(void) { ext2_inode_cachep = kmem_cache_create("ext2_inode_cache", sizeof(struct ext2_inode_info), diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c index cfedb2cb0d8c..c0ebc4db8849 100644 --- a/fs/ext2/xattr_security.c +++ b/fs/ext2/xattr_security.c @@ -42,8 +42,8 @@ ext2_xattr_security_set(struct dentry *dentry, const char *name, value, size, flags); } -int ext2_initxattrs(struct inode *inode, const struct xattr *xattr_array, - void *fs_info) +static int ext2_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) { const struct xattr *xattr; int err = 0; diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index 22548f56197b..158b5d4ce067 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -1727,10 +1727,7 @@ allocated: percpu_counter_sub(&sbi->s_freeblocks_counter, num); BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor"); - err = ext3_journal_dirty_metadata(handle, gdp_bh); - if (!fatal) - fatal = err; - + fatal = ext3_journal_dirty_metadata(handle, gdp_bh); if (fatal) goto out; diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index e66e4808719f..17742eed2c16 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -275,7 +275,7 @@ static inline loff_t ext3_get_htree_eof(struct file *filp) * NOTE: offsets obtained *before* ext3_set_inode_flag(dir, EXT3_INODE_INDEX) * will be invalid once the directory was converted into a dx directory */ -loff_t ext3_dir_llseek(struct file *file, loff_t offset, int whence) +static loff_t ext3_dir_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; int dx_dir = is_dx_dir(inode); diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index 082afd78b107..a1b810230cc5 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -215,7 +215,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent) int best_ndir = inodes_per_group; int best_group = -1; - get_random_bytes(&group, sizeof(group)); + group = prandom_u32(); parent_group = (unsigned)group % ngroups; for (i = 0; i < ngroups; i++) { group = (parent_group + i) % ngroups; diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index efce2bbfb5e5..f5157d0d1b43 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1559,56 +1559,17 @@ static int buffer_unmapped(handle_t *handle, struct buffer_head *bh) } /* - * Note that we always start a transaction even if we're not journalling - * data. This is to preserve ordering: any hole instantiation within - * __block_write_full_page -> ext3_get_block() should be journalled - * along with the data so we don't crash and then get metadata which + * Note that whenever we need to map blocks we start a transaction even if + * we're not journalling data. This is to preserve ordering: any hole + * instantiation within __block_write_full_page -> ext3_get_block() should be + * journalled along with the data so we don't crash and then get metadata which * refers to old data. * * In all journalling modes block_write_full_page() will start the I/O. * - * Problem: - * - * ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> - * ext3_writepage() - * - * Similar for: - * - * ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ... - * - * Same applies to ext3_get_block(). We will deadlock on various things like - * lock_journal and i_truncate_mutex. - * - * Setting PF_MEMALLOC here doesn't work - too many internal memory - * allocations fail. - * - * 16May01: If we're reentered then journal_current_handle() will be - * non-zero. We simply *return*. - * - * 1 July 2001: @@@ FIXME: - * In journalled data mode, a data buffer may be metadata against the - * current transaction. But the same file is part of a shared mapping - * and someone does a writepage() on it. - * - * We will move the buffer onto the async_data list, but *after* it has - * been dirtied. So there's a small window where we have dirty data on - * BJ_Metadata. - * - * Note that this only applies to the last partial page in the file. The - * bit which block_write_full_page() uses prepare/commit for. (That's - * broken code anyway: it's wrong for msync()). - * - * It's a rare case: affects the final partial page, for journalled data - * where the file is subject to bith write() and writepage() in the same - * transction. To fix it we'll need a custom block_write_full_page(). - * We'll probably need that anyway for journalling writepage() output. - * * We don't honour synchronous mounts for writepage(). That would be * disastrous. Any write() or metadata operation will sync the fs for * us. - * - * AKPM2: if all the page's buffers are mapped to disk and !data=journal, - * we don't need to open a transaction here. */ static int ext3_ordered_writepage(struct page *page, struct writeback_control *wbc) @@ -1673,12 +1634,9 @@ static int ext3_ordered_writepage(struct page *page, * block_write_full_page() succeeded. Otherwise they are unmapped, * and generally junk. */ - if (ret == 0) { - err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, + if (ret == 0) + ret = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL, journal_dirty_data_fn); - if (!ret) - ret = err; - } walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL, bput_one); err = ext3_journal_stop(handle); @@ -1925,6 +1883,8 @@ retry: * and pretend the write failed... */ ext3_truncate_failed_direct_write(inode); ret = PTR_ERR(handle); + if (inode->i_nlink) + ext3_orphan_del(NULL, inode); goto out; } if (inode->i_nlink) @@ -3212,21 +3172,20 @@ out_brelse: * * We are called from a few places: * - * - Within generic_file_write() for O_SYNC files. + * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files. * Here, there will be no transaction running. We wait for any running * transaction to commit. * - * - Within sys_sync(), kupdate and such. - * We wait on commit, if tol to. + * - Within flush work (for sys_sync(), kupdate and such). + * We wait on commit, if told to. * - * - Within prune_icache() (PF_MEMALLOC == true) - * Here we simply return. We can't afford to block kswapd on the - * journal commit. + * - Within iput_final() -> write_inode_now() + * We wait on commit, if told to. * * In all cases it is actually safe for us to return without doing anything, * because the inode has been copied into a raw inode buffer in - * ext3_mark_inode_dirty(). This is a correctness thing for O_SYNC and for - * knfsd. + * ext3_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL + * writeback. * * Note that we are absolutely dependent upon all inode dirtiers doing the * right thing: they *must* call mark_inode_dirty() after dirtying info in @@ -3238,13 +3197,13 @@ out_brelse: * stuff(); * inode->i_size = expr; * - * is in error because a kswapd-driven write_inode() could occur while - * `stuff()' is running, and the new i_size will be lost. Plus the inode - * will no longer be on the superblock's dirty inode list. + * is in error because write_inode() could occur while `stuff()' is running, + * and the new i_size will be lost. Plus the inode will no longer be on the + * superblock's dirty inode list. */ int ext3_write_inode(struct inode *inode, struct writeback_control *wbc) { - if (current->flags & PF_MEMALLOC) + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) return 0; if (ext3_journal_current_handle()) { @@ -3253,7 +3212,12 @@ int ext3_write_inode(struct inode *inode, struct writeback_control *wbc) return -EIO; } - if (wbc->sync_mode != WB_SYNC_ALL) + /* + * No need to force transaction in WB_SYNC_NONE mode. Also + * ext3_sync_fs() will force the commit after everything is + * written. + */ + if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync) return 0; return ext3_force_commit(inode->i_sb); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 95c6c5a6d0c5..08cdfe5461e3 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -527,7 +527,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -static int init_inodecache(void) +static int __init init_inodecache(void) { ext3_inode_cachep = kmem_cache_create("ext3_inode_cache", sizeof(struct ext3_inode_info), diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c index 3387664ad70e..722c2bf9645d 100644 --- a/fs/ext3/xattr_security.c +++ b/fs/ext3/xattr_security.c @@ -43,8 +43,9 @@ ext3_xattr_security_set(struct dentry *dentry, const char *name, name, value, size, flags); } -int ext3_initxattrs(struct inode *inode, const struct xattr *xattr_array, - void *fs_info) +static int ext3_initxattrs(struct inode *inode, + const struct xattr *xattr_array, + void *fs_info) { const struct xattr *xattr; handle_t *handle = fs_info; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index bc765591101a..063fc1538355 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -146,7 +146,7 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov, overwrite = 1; } - ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); + ret = __generic_file_aio_write(iocb, iov, nr_segs); mutex_unlock(&inode->i_mutex); if (ret > 0) { @@ -200,6 +200,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov, static const struct vm_operations_struct ext4_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = ext4_page_mkwrite, .remap_pages = generic_file_remap_pages, }; diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index fa8da4cb8c4b..e93e4ec7d165 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -174,7 +174,7 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type) retval = f2fs_getxattr(inode, name_index, "", NULL, 0); if (retval > 0) { - value = kmalloc(retval, GFP_KERNEL); + value = kmalloc(retval, GFP_F2FS_ZERO); if (!value) return ERR_PTR(-ENOMEM); retval = f2fs_getxattr(inode, name_index, "", value, retval); @@ -203,6 +203,12 @@ static int __f2fs_set_acl(struct inode *inode, int type, size_t size = 0; int error; + if (acl) { + error = posix_acl_valid(acl); + if (error < 0) + return error; + } + switch (type) { case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 293d0486a40f..4aa521aa9bc3 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -33,14 +33,12 @@ struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) struct address_space *mapping = META_MAPPING(sbi); struct page *page = NULL; repeat: - page = grab_cache_page(mapping, index); + page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); if (!page) { cond_resched(); goto repeat; } - /* We wait writeback only inside grab_meta_page() */ - wait_on_page_writeback(page); SetPageUptodate(page); return page; } @@ -75,23 +73,102 @@ out: return page; } +inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type) +{ + switch (type) { + case META_NAT: + return NM_I(sbi)->max_nid / NAT_ENTRY_PER_BLOCK; + case META_SIT: + return SIT_BLK_CNT(sbi); + case META_SSA: + case META_CP: + return 0; + default: + BUG(); + } +} + +/* + * Readahead CP/NAT/SIT/SSA pages + */ +int ra_meta_pages(struct f2fs_sb_info *sbi, int start, int nrpages, int type) +{ + block_t prev_blk_addr = 0; + struct page *page; + int blkno = start; + int max_blks = get_max_meta_blks(sbi, type); + + struct f2fs_io_info fio = { + .type = META, + .rw = READ_SYNC | REQ_META | REQ_PRIO + }; + + for (; nrpages-- > 0; blkno++) { + block_t blk_addr; + + switch (type) { + case META_NAT: + /* get nat block addr */ + if (unlikely(blkno >= max_blks)) + blkno = 0; + blk_addr = current_nat_addr(sbi, + blkno * NAT_ENTRY_PER_BLOCK); + break; + case META_SIT: + /* get sit block addr */ + if (unlikely(blkno >= max_blks)) + goto out; + blk_addr = current_sit_addr(sbi, + blkno * SIT_ENTRY_PER_BLOCK); + if (blkno != start && prev_blk_addr + 1 != blk_addr) + goto out; + prev_blk_addr = blk_addr; + break; + case META_SSA: + case META_CP: + /* get ssa/cp block addr */ + blk_addr = blkno; + break; + default: + BUG(); + } + + page = grab_cache_page(META_MAPPING(sbi), blk_addr); + if (!page) + continue; + if (PageUptodate(page)) { + mark_page_accessed(page); + f2fs_put_page(page, 1); + continue; + } + + f2fs_submit_page_mbio(sbi, page, blk_addr, &fio); + mark_page_accessed(page); + f2fs_put_page(page, 0); + } +out: + f2fs_submit_merged_bio(sbi, META, READ); + return blkno - start; +} + static int f2fs_write_meta_page(struct page *page, struct writeback_control *wbc) { struct inode *inode = page->mapping->host; struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - /* Should not write any meta pages, if any IO error was occurred */ - if (unlikely(sbi->por_doing || - is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG))) + if (unlikely(sbi->por_doing)) goto redirty_out; - if (wbc->for_reclaim) goto redirty_out; - wait_on_page_writeback(page); + /* Should not write any meta pages, if any IO error was occurred */ + if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG))) + goto no_write; + f2fs_wait_on_page_writeback(page, META); write_meta_page(sbi, page); +no_write: dec_page_count(sbi, F2FS_DIRTY_META); unlock_page(page); return 0; @@ -99,6 +176,7 @@ static int f2fs_write_meta_page(struct page *page, redirty_out: dec_page_count(sbi, F2FS_DIRTY_META); wbc->pages_skipped++; + account_page_redirty(page); set_page_dirty(page); return AOP_WRITEPAGE_ACTIVATE; } @@ -107,21 +185,23 @@ static int f2fs_write_meta_pages(struct address_space *mapping, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); - int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); - long written; - - if (wbc->for_kupdate) - return 0; + long diff, written; /* collect a number of dirty meta pages and write together */ - if (get_pages(sbi, F2FS_DIRTY_META) < nrpages) - return 0; + if (wbc->for_kupdate || + get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META)) + goto skip_write; /* if mounting is failed, skip writing node pages */ mutex_lock(&sbi->cp_mutex); - written = sync_meta_pages(sbi, META, nrpages); + diff = nr_pages_to_write(sbi, META, wbc); + written = sync_meta_pages(sbi, META, wbc->nr_to_write); mutex_unlock(&sbi->cp_mutex); - wbc->nr_to_write -= written; + wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff); + return 0; + +skip_write: + wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_META); return 0; } @@ -148,10 +228,22 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + lock_page(page); - f2fs_bug_on(page->mapping != mapping); - f2fs_bug_on(!PageDirty(page)); - clear_page_dirty_for_io(page); + + if (unlikely(page->mapping != mapping)) { +continue_unlock: + unlock_page(page); + continue; + } + if (!PageDirty(page)) { + /* someone wrote it for us */ + goto continue_unlock; + } + + if (!clear_page_dirty_for_io(page)) + goto continue_unlock; + if (f2fs_write_meta_page(page, &wbc)) { unlock_page(page); break; @@ -216,16 +308,15 @@ void release_orphan_inode(struct f2fs_sb_info *sbi) void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { - struct list_head *head, *this; - struct orphan_inode_entry *new = NULL, *orphan = NULL; + struct list_head *head; + struct orphan_inode_entry *new, *orphan; new = f2fs_kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC); new->ino = ino; spin_lock(&sbi->orphan_inode_lock); head = &sbi->orphan_inode_list; - list_for_each(this, head) { - orphan = list_entry(this, struct orphan_inode_entry, list); + list_for_each_entry(orphan, head, list) { if (orphan->ino == ino) { spin_unlock(&sbi->orphan_inode_lock); kmem_cache_free(orphan_entry_slab, new); @@ -234,14 +325,10 @@ void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) if (orphan->ino > ino) break; - orphan = NULL; } - /* add new_oentry into list which is sorted by inode number */ - if (orphan) - list_add(&new->list, this->prev); - else - list_add_tail(&new->list, head); + /* add new orphan entry into list which is sorted by inode number */ + list_add_tail(&new->list, &orphan->list); spin_unlock(&sbi->orphan_inode_lock); } @@ -255,10 +342,11 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) list_for_each_entry(orphan, head, list) { if (orphan->ino == ino) { list_del(&orphan->list); - kmem_cache_free(orphan_entry_slab, orphan); f2fs_bug_on(sbi->n_orphans == 0); sbi->n_orphans--; - break; + spin_unlock(&sbi->orphan_inode_lock); + kmem_cache_free(orphan_entry_slab, orphan); + return; } } spin_unlock(&sbi->orphan_inode_lock); @@ -285,6 +373,8 @@ void recover_orphan_inodes(struct f2fs_sb_info *sbi) start_blk = __start_cp_addr(sbi) + 1; orphan_blkaddr = __start_sum_addr(sbi) - 1; + ra_meta_pages(sbi, start_blk, orphan_blkaddr, META_CP); + for (i = 0; i < orphan_blkaddr; i++) { struct page *page = get_meta_page(sbi, start_blk + i); struct f2fs_orphan_block *orphan_blk; @@ -466,14 +556,12 @@ static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct list_head *head = &sbi->dir_inode_list; - struct list_head *this; + struct dir_inode_entry *entry; - list_for_each(this, head) { - struct dir_inode_entry *entry; - entry = list_entry(this, struct dir_inode_entry, list); + list_for_each_entry(entry, head, list) if (unlikely(entry->inode == inode)) return -EEXIST; - } + list_add_tail(&new->list, head); stat_inc_dirty_dir(sbi); return 0; @@ -483,6 +571,7 @@ void set_dirty_dir_page(struct inode *inode, struct page *page) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct dir_inode_entry *new; + int ret = 0; if (!S_ISDIR(inode->i_mode)) return; @@ -492,13 +581,13 @@ void set_dirty_dir_page(struct inode *inode, struct page *page) INIT_LIST_HEAD(&new->list); spin_lock(&sbi->dir_inode_lock); - if (__add_dirty_inode(inode, new)) - kmem_cache_free(inode_entry_slab, new); - - inc_page_count(sbi, F2FS_DIRTY_DENTS); + ret = __add_dirty_inode(inode, new); inode_inc_dirty_dents(inode); SetPagePrivate(page); spin_unlock(&sbi->dir_inode_lock); + + if (ret) + kmem_cache_free(inode_entry_slab, new); } void add_dirty_dir_inode(struct inode *inode) @@ -506,44 +595,47 @@ void add_dirty_dir_inode(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct dir_inode_entry *new = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); + int ret = 0; new->inode = inode; INIT_LIST_HEAD(&new->list); spin_lock(&sbi->dir_inode_lock); - if (__add_dirty_inode(inode, new)) - kmem_cache_free(inode_entry_slab, new); + ret = __add_dirty_inode(inode, new); spin_unlock(&sbi->dir_inode_lock); + + if (ret) + kmem_cache_free(inode_entry_slab, new); } void remove_dirty_dir_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - - struct list_head *this, *head; + struct list_head *head; + struct dir_inode_entry *entry; if (!S_ISDIR(inode->i_mode)) return; spin_lock(&sbi->dir_inode_lock); - if (atomic_read(&F2FS_I(inode)->dirty_dents)) { + if (get_dirty_dents(inode)) { spin_unlock(&sbi->dir_inode_lock); return; } head = &sbi->dir_inode_list; - list_for_each(this, head) { - struct dir_inode_entry *entry; - entry = list_entry(this, struct dir_inode_entry, list); + list_for_each_entry(entry, head, list) { if (entry->inode == inode) { list_del(&entry->list); - kmem_cache_free(inode_entry_slab, entry); stat_dec_dirty_dir(sbi); - break; + spin_unlock(&sbi->dir_inode_lock); + kmem_cache_free(inode_entry_slab, entry); + goto done; } } spin_unlock(&sbi->dir_inode_lock); +done: /* Only from the recovery routine */ if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) { clear_inode_flag(F2FS_I(inode), FI_DELAY_IPUT); @@ -554,15 +646,14 @@ void remove_dirty_dir_inode(struct inode *inode) struct inode *check_dirty_dir_inode(struct f2fs_sb_info *sbi, nid_t ino) { - struct list_head *this, *head; + struct list_head *head; struct inode *inode = NULL; + struct dir_inode_entry *entry; spin_lock(&sbi->dir_inode_lock); head = &sbi->dir_inode_list; - list_for_each(this, head) { - struct dir_inode_entry *entry; - entry = list_entry(this, struct dir_inode_entry, list); + list_for_each_entry(entry, head, list) { if (entry->inode->i_ino == ino) { inode = entry->inode; break; @@ -589,7 +680,7 @@ retry: inode = igrab(entry->inode); spin_unlock(&sbi->dir_inode_lock); if (inode) { - filemap_flush(inode->i_mapping); + filemap_fdatawrite(inode->i_mapping); iput(inode); } else { /* @@ -824,6 +915,7 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) unblock_operations(sbi); mutex_unlock(&sbi->cp_mutex); + stat_inc_cp_count(sbi->stat_info); trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish checkpoint"); } @@ -845,11 +937,11 @@ void init_orphan_info(struct f2fs_sb_info *sbi) int __init create_checkpoint_caches(void) { orphan_entry_slab = f2fs_kmem_cache_create("f2fs_orphan_entry", - sizeof(struct orphan_inode_entry), NULL); + sizeof(struct orphan_inode_entry)); if (!orphan_entry_slab) return -ENOMEM; inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry", - sizeof(struct dir_inode_entry), NULL); + sizeof(struct dir_inode_entry)); if (!inode_entry_slab) { kmem_cache_destroy(orphan_entry_slab); return -ENOMEM; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 2261ccdd0b5f..45abd60e2bff 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -45,7 +45,7 @@ static void f2fs_read_end_io(struct bio *bio, int err) static void f2fs_write_end_io(struct bio *bio, int err) { - struct f2fs_sb_info *sbi = F2FS_SB(bio->bi_io_vec->bv_page->mapping->host->i_sb); + struct f2fs_sb_info *sbi = bio->bi_private; struct bio_vec *bvec; int i; @@ -55,15 +55,16 @@ static void f2fs_write_end_io(struct bio *bio, int err) if (unlikely(err)) { SetPageError(page); set_bit(AS_EIO, &page->mapping->flags); - set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); - sbi->sb->s_flags |= MS_RDONLY; + f2fs_stop_checkpoint(sbi); } end_page_writeback(page); dec_page_count(sbi, F2FS_WRITEBACK); } - if (bio->bi_private) - complete(bio->bi_private); + if (sbi->wait_io) { + complete(sbi->wait_io); + sbi->wait_io = NULL; + } if (!get_pages(sbi, F2FS_WRITEBACK) && !list_empty(&sbi->cp_wait.task_list)) @@ -86,6 +87,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, bio->bi_bdev = sbi->sb->s_bdev; bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io; + bio->bi_private = sbi; return bio; } @@ -113,7 +115,7 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) */ if (fio->type == META_FLUSH) { DECLARE_COMPLETION_ONSTACK(wait); - io->bio->bi_private = &wait; + io->sbi->wait_io = &wait; submit_bio(rw, io->bio); wait_for_completion(&wait); } else { @@ -132,7 +134,7 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, io = is_read_io(rw) ? &sbi->read_io : &sbi->write_io[btype]; - mutex_lock(&io->io_mutex); + down_write(&io->io_rwsem); /* change META to META_FLUSH in the checkpoint procedure */ if (type >= META_FLUSH) { @@ -140,7 +142,7 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO; } __submit_merged_bio(io); - mutex_unlock(&io->io_mutex); + up_write(&io->io_rwsem); } /* @@ -178,7 +180,7 @@ void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page, verify_block_addr(sbi, blk_addr); - mutex_lock(&io->io_mutex); + down_write(&io->io_rwsem); if (!is_read) inc_page_count(sbi, F2FS_WRITEBACK); @@ -202,7 +204,7 @@ alloc_new: io->last_block_in_bio = blk_addr; - mutex_unlock(&io->io_mutex); + up_write(&io->io_rwsem); trace_f2fs_submit_page_mbio(page, fio->rw, fio->type, blk_addr); } @@ -797,48 +799,36 @@ static int f2fs_write_data_page(struct page *page, */ offset = i_size & (PAGE_CACHE_SIZE - 1); if ((page->index >= end_index + 1) || !offset) { - if (S_ISDIR(inode->i_mode)) { - dec_page_count(sbi, F2FS_DIRTY_DENTS); - inode_dec_dirty_dents(inode); - } + inode_dec_dirty_dents(inode); goto out; } zero_user_segment(page, offset, PAGE_CACHE_SIZE); write: - if (unlikely(sbi->por_doing)) { - err = AOP_WRITEPAGE_ACTIVATE; + if (unlikely(sbi->por_doing)) goto redirty_out; - } /* Dentry blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode)) { - dec_page_count(sbi, F2FS_DIRTY_DENTS); inode_dec_dirty_dents(inode); err = do_write_data_page(page, &fio); - } else { - f2fs_lock_op(sbi); - - if (f2fs_has_inline_data(inode) || f2fs_may_inline(inode)) { - err = f2fs_write_inline_data(inode, page, offset); - f2fs_unlock_op(sbi); - goto out; - } else { - err = do_write_data_page(page, &fio); - } + goto done; + } - f2fs_unlock_op(sbi); + if (!wbc->for_reclaim) need_balance_fs = true; - } - if (err == -ENOENT) - goto out; - else if (err) + else if (has_not_enough_free_secs(sbi, 0)) goto redirty_out; - if (wbc->for_reclaim) { - f2fs_submit_merged_bio(sbi, DATA, WRITE); - need_balance_fs = false; - } + f2fs_lock_op(sbi); + if (f2fs_has_inline_data(inode) || f2fs_may_inline(inode)) + err = f2fs_write_inline_data(inode, page, offset); + else + err = do_write_data_page(page, &fio); + f2fs_unlock_op(sbi); +done: + if (err && err != -ENOENT) + goto redirty_out; clear_cold_data(page); out: @@ -849,12 +839,11 @@ out: redirty_out: wbc->pages_skipped++; + account_page_redirty(page); set_page_dirty(page); - return err; + return AOP_WRITEPAGE_ACTIVATE; } -#define MAX_DESIRED_PAGES_WP 4096 - static int __f2fs_writepage(struct page *page, struct writeback_control *wbc, void *data) { @@ -871,17 +860,17 @@ static int f2fs_write_data_pages(struct address_space *mapping, struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); bool locked = false; int ret; - long excess_nrtw = 0, desired_nrtw; + long diff; /* deal with chardevs and other special file */ if (!mapping->a_ops->writepage) return 0; - if (wbc->nr_to_write < MAX_DESIRED_PAGES_WP) { - desired_nrtw = MAX_DESIRED_PAGES_WP; - excess_nrtw = desired_nrtw - wbc->nr_to_write; - wbc->nr_to_write = desired_nrtw; - } + if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE && + get_dirty_dents(inode) < nr_pages_to_skip(sbi, DATA)) + goto skip_write; + + diff = nr_pages_to_write(sbi, DATA, wbc); if (!S_ISDIR(inode->i_mode)) { mutex_lock(&sbi->writepages); @@ -895,8 +884,12 @@ static int f2fs_write_data_pages(struct address_space *mapping, remove_dirty_dir_inode(inode); - wbc->nr_to_write -= excess_nrtw; + wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); return ret; + +skip_write: + wbc->pages_skipped += get_dirty_dents(inode); + return 0; } static int f2fs_write_begin(struct file *file, struct address_space *mapping, @@ -949,13 +942,19 @@ inline_data: if (dn.data_blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_CACHE_SIZE); } else { - if (f2fs_has_inline_data(inode)) + if (f2fs_has_inline_data(inode)) { err = f2fs_read_inline_data(inode, page); - else + if (err) { + page_cache_release(page); + return err; + } + } else { err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, READ_SYNC); - if (err) - return err; + if (err) + return err; + } + lock_page(page); if (unlikely(!PageUptodate(page))) { f2fs_put_page(page, 1); @@ -1031,11 +1030,8 @@ static void f2fs_invalidate_data_page(struct page *page, unsigned int offset, unsigned int length) { struct inode *inode = page->mapping->host; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - if (S_ISDIR(inode->i_mode) && PageDirty(page)) { - dec_page_count(sbi, F2FS_DIRTY_DENTS); + if (PageDirty(page)) inode_dec_dirty_dents(inode); - } ClearPagePrivate(page); } diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 3de9d20d0c14..b52c12cf5873 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -86,7 +86,6 @@ static void update_sit_info(struct f2fs_sb_info *sbi) { struct f2fs_stat_info *si = F2FS_STAT(sbi); unsigned int blks_per_sec, hblks_per_sec, total_vblocks, bimodal, dist; - struct sit_info *sit_i = SIT_I(sbi); unsigned int segno, vblocks; int ndirty = 0; @@ -94,7 +93,6 @@ static void update_sit_info(struct f2fs_sb_info *sbi) total_vblocks = 0; blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg); hblks_per_sec = blks_per_sec / 2; - mutex_lock(&sit_i->sentry_lock); for (segno = 0; segno < TOTAL_SEGS(sbi); segno += sbi->segs_per_sec) { vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); dist = abs(vblocks - hblks_per_sec); @@ -105,7 +103,6 @@ static void update_sit_info(struct f2fs_sb_info *sbi) ndirty++; } } - mutex_unlock(&sit_i->sentry_lock); dist = TOTAL_SECS(sbi) * hblks_per_sec * hblks_per_sec / 100; si->bimodal = bimodal / dist; if (si->dirty_count) @@ -236,6 +233,7 @@ static int stat_show(struct seq_file *s, void *v) si->dirty_count); seq_printf(s, " - Prefree: %d\n - Free: %d (%d)\n\n", si->prefree_count, si->free_segs, si->free_secs); + seq_printf(s, "CP calls: %d\n", si->cp_count); seq_printf(s, "GC calls: %d (BG: %d)\n", si->call_count, si->bg_gc); seq_printf(s, " - data segments : %d\n", si->data_segs); @@ -252,10 +250,10 @@ static int stat_show(struct seq_file *s, void *v) si->ndirty_dent, si->ndirty_dirs); seq_printf(s, " - meta: %4d in %4d\n", si->ndirty_meta, si->meta_pages); - seq_printf(s, " - NATs: %5d > %lu\n", - si->nats, NM_WOUT_THRESHOLD); - seq_printf(s, " - SITs: %5d\n - free_nids: %5d\n", - si->sits, si->fnids); + seq_printf(s, " - NATs: %9d\n - SITs: %9d\n", + si->nats, si->sits); + seq_printf(s, " - free_nids: %9d\n", + si->fnids); seq_puts(s, "\nDistribution of User Blocks:"); seq_puts(s, " [ valid | invalid | free ]\n"); seq_puts(s, " ["); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 2b7c255bcbdf..972fd0ef230f 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -21,12 +21,12 @@ static unsigned long dir_blocks(struct inode *inode) >> PAGE_CACHE_SHIFT; } -static unsigned int dir_buckets(unsigned int level) +static unsigned int dir_buckets(unsigned int level, int dir_level) { if (level < MAX_DIR_HASH_DEPTH / 2) - return 1 << level; + return 1 << (level + dir_level); else - return 1 << ((MAX_DIR_HASH_DEPTH / 2) - 1); + return 1 << ((MAX_DIR_HASH_DEPTH / 2 + dir_level) - 1); } static unsigned int bucket_blocks(unsigned int level) @@ -65,13 +65,14 @@ static void set_de_type(struct f2fs_dir_entry *de, struct inode *inode) de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; } -static unsigned long dir_block_index(unsigned int level, unsigned int idx) +static unsigned long dir_block_index(unsigned int level, + int dir_level, unsigned int idx) { unsigned long i; unsigned long bidx = 0; for (i = 0; i < level; i++) - bidx += dir_buckets(i) * bucket_blocks(i); + bidx += dir_buckets(i, dir_level) * bucket_blocks(i); bidx += idx * bucket_blocks(level); return bidx; } @@ -93,16 +94,21 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, f2fs_hash_t namehash, struct page **res_page) { struct f2fs_dir_entry *de; - unsigned long bit_pos, end_pos, next_pos; + unsigned long bit_pos = 0; struct f2fs_dentry_block *dentry_blk = kmap(dentry_page); - int slots; + const void *dentry_bits = &dentry_blk->dentry_bitmap; + int max_len = 0; - bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, - NR_DENTRY_IN_BLOCK, 0); while (bit_pos < NR_DENTRY_IN_BLOCK) { + if (!test_bit_le(bit_pos, dentry_bits)) { + if (bit_pos == 0) + max_len = 1; + else if (!test_bit_le(bit_pos - 1, dentry_bits)) + max_len++; + bit_pos++; + continue; + } de = &dentry_blk->dentry[bit_pos]; - slots = GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); - if (early_match_name(name, namelen, namehash, de)) { if (!memcmp(dentry_blk->filename[bit_pos], name, namelen)) { @@ -110,20 +116,18 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, goto found; } } - next_pos = bit_pos + slots; - bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, - NR_DENTRY_IN_BLOCK, next_pos); - if (bit_pos >= NR_DENTRY_IN_BLOCK) - end_pos = NR_DENTRY_IN_BLOCK; - else - end_pos = bit_pos; - if (*max_slots < end_pos - next_pos) - *max_slots = end_pos - next_pos; + if (max_len > *max_slots) { + *max_slots = max_len; + max_len = 0; + } + bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); } de = NULL; kunmap(dentry_page); found: + if (max_len > *max_slots) + *max_slots = max_len; return de; } @@ -141,10 +145,11 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, f2fs_bug_on(level > MAX_DIR_HASH_DEPTH); - nbucket = dir_buckets(level); + nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); nblock = bucket_blocks(level); - bidx = dir_block_index(level, le32_to_cpu(namehash) % nbucket); + bidx = dir_block_index(level, F2FS_I(dir)->i_dir_level, + le32_to_cpu(namehash) % nbucket); end_block = bidx + nblock; for (; bidx < end_block; bidx++) { @@ -248,7 +253,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, struct page *page, struct inode *inode) { lock_page(page); - wait_on_page_writeback(page); + f2fs_wait_on_page_writeback(page, DATA); de->ino = cpu_to_le32(inode->i_ino); set_de_type(de, inode); kunmap(page); @@ -347,14 +352,11 @@ static struct page *init_inode_metadata(struct inode *inode, err = f2fs_init_security(inode, dir, name, page); if (err) goto put_error; - - wait_on_page_writeback(page); } else { page = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino); if (IS_ERR(page)) return page; - wait_on_page_writeback(page); set_cold_node(inode, page); } @@ -372,6 +374,10 @@ static struct page *init_inode_metadata(struct inode *inode, put_error: f2fs_put_page(page, 1); + /* once the failed inode becomes a bad inode, i_mode is S_IFREG */ + truncate_inode_pages(&inode->i_data, 0); + truncate_blocks(inode, 0); + remove_dirty_dir_inode(inode); error: remove_inode_page(inode); return ERR_PTR(err); @@ -395,9 +401,6 @@ static void update_parent_metadata(struct inode *dir, struct inode *inode, set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); } - if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) - update_inode_page(dir); - if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) clear_inode_flag(F2FS_I(inode), FI_INC_LINK); } @@ -464,10 +467,11 @@ start: if (level == current_depth) ++current_depth; - nbucket = dir_buckets(level); + nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); nblock = bucket_blocks(level); - bidx = dir_block_index(level, (le32_to_cpu(dentry_hash) % nbucket)); + bidx = dir_block_index(level, F2FS_I(dir)->i_dir_level, + (le32_to_cpu(dentry_hash) % nbucket)); for (block = bidx; block <= (bidx + nblock - 1); block++) { dentry_page = get_new_data_page(dir, NULL, block, true); @@ -487,8 +491,9 @@ start: ++level; goto start; add_dentry: - wait_on_page_writeback(dentry_page); + f2fs_wait_on_page_writeback(dentry_page, DATA); + down_write(&F2FS_I(inode)->i_sem); page = init_inode_metadata(inode, dir, name); if (IS_ERR(page)) { err = PTR_ERR(page); @@ -511,7 +516,12 @@ add_dentry: update_parent_metadata(dir, inode, current_depth); fail: - clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); + up_write(&F2FS_I(inode)->i_sem); + + if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) { + update_inode_page(dir); + clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); + } kunmap(dentry_page); f2fs_put_page(dentry_page, 1); return err; @@ -528,13 +538,12 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, unsigned int bit_pos; struct address_space *mapping = page->mapping; struct inode *dir = mapping->host; - struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); void *kaddr = page_address(page); int i; lock_page(page); - wait_on_page_writeback(page); + f2fs_wait_on_page_writeback(page, DATA); dentry_blk = (struct f2fs_dentry_block *)kaddr; bit_pos = dentry - (struct f2fs_dir_entry *)dentry_blk->dentry; @@ -551,6 +560,10 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, dir->i_ctime = dir->i_mtime = CURRENT_TIME; if (inode) { + struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); + + down_write(&F2FS_I(inode)->i_sem); + if (S_ISDIR(inode->i_mode)) { drop_nlink(dir); update_inode_page(dir); @@ -561,6 +574,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, drop_nlink(inode); i_size_write(inode, 0); } + up_write(&F2FS_I(inode)->i_sem); update_inode_page(inode); if (inode->i_nlink == 0) @@ -573,7 +587,6 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, truncate_hole(dir, page->index, page->index + 1); clear_page_dirty_for_io(page); ClearPageUptodate(page); - dec_page_count(sbi, F2FS_DIRTY_DENTS); inode_dec_dirty_dents(dir); } f2fs_put_page(page, 1); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index fc3c558cb4f3..2ecac8312359 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -40,6 +40,7 @@ #define F2FS_MOUNT_DISABLE_EXT_IDENTIFY 0x00000040 #define F2FS_MOUNT_INLINE_XATTR 0x00000080 #define F2FS_MOUNT_INLINE_DATA 0x00000100 +#define F2FS_MOUNT_FLUSH_MERGE 0x00000200 #define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) @@ -88,6 +89,16 @@ enum { SIT_BITMAP }; +/* + * For CP/NAT/SIT/SSA readahead + */ +enum { + META_CP, + META_NAT, + META_SIT, + META_SSA +}; + /* for the list of orphan inodes */ struct orphan_inode_entry { struct list_head list; /* list head */ @@ -187,16 +198,20 @@ struct extent_info { #define FADVISE_COLD_BIT 0x01 #define FADVISE_LOST_PINO_BIT 0x02 +#define DEF_DIR_LEVEL 0 + struct f2fs_inode_info { struct inode vfs_inode; /* serve a vfs inode */ unsigned long i_flags; /* keep an inode flags for ioctl */ unsigned char i_advise; /* use to give file attribute hints */ + unsigned char i_dir_level; /* use for dentry level for large dir */ unsigned int i_current_depth; /* use only in directory structure */ unsigned int i_pino; /* parent inode number */ umode_t i_acl_mode; /* keep file acl mode temporarily */ /* Use below internally in f2fs*/ unsigned long flags; /* use to pass per-file flags */ + struct rw_semaphore i_sem; /* protect fi info */ atomic_t dirty_dents; /* # of dirty dentry pages */ f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ @@ -229,6 +244,7 @@ struct f2fs_nm_info { block_t nat_blkaddr; /* base disk address of NAT */ nid_t max_nid; /* maximum possible node ids */ nid_t next_scan_nid; /* the next nid to be scanned */ + unsigned int ram_thresh; /* control the memory footprint */ /* NAT cache management */ struct radix_tree_root nat_root;/* root of the nat entry cache */ @@ -238,6 +254,7 @@ struct f2fs_nm_info { struct list_head dirty_nat_entries; /* cached nat entry list (dirty) */ /* free node ids management */ + struct radix_tree_root free_nid_root;/* root of the free_nid cache */ struct list_head free_nid_list; /* a list for free nids */ spinlock_t free_nid_list_lock; /* protect free nid list */ unsigned int fcnt; /* the number of free node id */ @@ -300,6 +317,12 @@ enum { NO_CHECK_TYPE }; +struct flush_cmd { + struct flush_cmd *next; + struct completion wait; + int ret; +}; + struct f2fs_sm_info { struct sit_info *sit_info; /* whole segment information */ struct free_segmap_info *free_info; /* free segment information */ @@ -328,6 +351,14 @@ struct f2fs_sm_info { unsigned int ipu_policy; /* in-place-update policy */ unsigned int min_ipu_util; /* in-place-update threshold */ + + /* for flush command control */ + struct task_struct *f2fs_issue_flush; /* flush thread */ + wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */ + struct flush_cmd *issue_list; /* list for command issue */ + struct flush_cmd *dispatch_list; /* list for command dispatch */ + spinlock_t issue_lock; /* for issue list lock */ + struct flush_cmd *issue_tail; /* list tail of issue list */ }; /* @@ -378,7 +409,7 @@ struct f2fs_bio_info { struct bio *bio; /* bios to merge */ sector_t last_block_in_bio; /* last block number */ struct f2fs_io_info fio; /* store buffered io info. */ - struct mutex io_mutex; /* mutex for bio */ + struct rw_semaphore io_rwsem; /* blocking op for bio */ }; struct f2fs_sb_info { @@ -398,6 +429,7 @@ struct f2fs_sb_info { /* for bio operations */ struct f2fs_bio_info read_io; /* for read bios */ struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */ + struct completion *wait_io; /* for completion bios */ /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ @@ -407,7 +439,6 @@ struct f2fs_sb_info { struct mutex node_write; /* locking node writes */ struct mutex writepages; /* mutex for writepages() */ bool por_doing; /* recovery is doing or not */ - bool on_build_free_nids; /* build_free_nids is doing */ wait_queue_head_t cp_wait; /* for orphan inode management */ @@ -436,6 +467,7 @@ struct f2fs_sb_info { unsigned int total_valid_node_count; /* valid node block count */ unsigned int total_valid_inode_count; /* valid inode count */ int active_logs; /* # of active logs */ + int dir_level; /* directory level */ block_t user_block_count; /* # of user blocks */ block_t total_valid_block_count; /* # of valid blocks */ @@ -622,6 +654,11 @@ static inline int F2FS_HAS_BLOCKS(struct inode *inode) return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS; } +static inline bool f2fs_has_xattr_block(unsigned int ofs) +{ + return ofs == XATTR_NODE_OFFSET; +} + static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, struct inode *inode, blkcnt_t count) { @@ -661,6 +698,7 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) static inline void inode_inc_dirty_dents(struct inode *inode) { + inc_page_count(F2FS_SB(inode->i_sb), F2FS_DIRTY_DENTS); atomic_inc(&F2FS_I(inode)->dirty_dents); } @@ -671,6 +709,10 @@ static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) static inline void inode_dec_dirty_dents(struct inode *inode) { + if (!S_ISDIR(inode->i_mode)) + return; + + dec_page_count(F2FS_SB(inode->i_sb), F2FS_DIRTY_DENTS); atomic_dec(&F2FS_I(inode)->dirty_dents); } @@ -679,6 +721,11 @@ static inline int get_pages(struct f2fs_sb_info *sbi, int count_type) return atomic_read(&sbi->nr_pages[count_type]); } +static inline int get_dirty_dents(struct inode *inode) +{ + return atomic_read(&F2FS_I(inode)->dirty_dents); +} + static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) { unsigned int pages_per_sec = sbi->segs_per_sec * @@ -689,11 +736,7 @@ static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi) { - block_t ret; - spin_lock(&sbi->stat_lock); - ret = sbi->total_valid_block_count; - spin_unlock(&sbi->stat_lock); - return ret; + return sbi->total_valid_block_count; } static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag) @@ -789,11 +832,7 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi) { - unsigned int ret; - spin_lock(&sbi->stat_lock); - ret = sbi->total_valid_node_count; - spin_unlock(&sbi->stat_lock); - return ret; + return sbi->total_valid_node_count; } static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi) @@ -814,11 +853,7 @@ static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi) static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi) { - unsigned int ret; - spin_lock(&sbi->stat_lock); - ret = sbi->total_valid_inode_count; - spin_unlock(&sbi->stat_lock); - return ret; + return sbi->total_valid_inode_count; } static inline void f2fs_put_page(struct page *page, int unlock) @@ -844,9 +879,9 @@ static inline void f2fs_put_dnode(struct dnode_of_data *dn) } static inline struct kmem_cache *f2fs_kmem_cache_create(const char *name, - size_t size, void (*ctor)(void *)) + size_t size) { - return kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, ctor); + return kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, NULL); } static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep, @@ -983,24 +1018,28 @@ static inline void set_raw_inline(struct f2fs_inode_info *fi, ri->i_inline |= F2FS_INLINE_DATA; } +static inline int f2fs_has_inline_xattr(struct inode *inode) +{ + return is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR); +} + static inline unsigned int addrs_per_inode(struct f2fs_inode_info *fi) { - if (is_inode_flag_set(fi, FI_INLINE_XATTR)) + if (f2fs_has_inline_xattr(&fi->vfs_inode)) return DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS; return DEF_ADDRS_PER_INODE; } static inline void *inline_xattr_addr(struct page *page) { - struct f2fs_inode *ri; - ri = (struct f2fs_inode *)page_address(page); + struct f2fs_inode *ri = F2FS_INODE(page); return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS]); } static inline int inline_xattr_size(struct inode *inode) { - if (is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR)) + if (f2fs_has_inline_xattr(inode)) return F2FS_INLINE_XATTR_ADDRS << 2; else return 0; @@ -1013,8 +1052,7 @@ static inline int f2fs_has_inline_data(struct inode *inode) static inline void *inline_data_addr(struct page *page) { - struct f2fs_inode *ri; - ri = (struct f2fs_inode *)page_address(page); + struct f2fs_inode *ri = F2FS_INODE(page); return (void *)&(ri->i_addr[1]); } @@ -1023,6 +1061,12 @@ static inline int f2fs_readonly(struct super_block *sb) return sb->s_flags & MS_RDONLY; } +static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi) +{ + set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); + sbi->sb->s_flags |= MS_RDONLY; +} + #define get_inode_mode(i) \ ((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) @@ -1048,7 +1092,7 @@ void f2fs_set_inode_flags(struct inode *); struct inode *f2fs_iget(struct super_block *, unsigned long); int try_to_free_nats(struct f2fs_sb_info *, int); void update_inode(struct inode *, struct page *); -int update_inode_page(struct inode *); +void update_inode_page(struct inode *); int f2fs_write_inode(struct inode *, struct writeback_control *); void f2fs_evict_inode(struct inode *); @@ -1097,6 +1141,7 @@ struct dnode_of_data; struct node_info; int is_checkpointed_node(struct f2fs_sb_info *, nid_t); +bool fsync_mark_done(struct f2fs_sb_info *, nid_t); void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *); int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); int truncate_inode_blocks(struct inode *, pgoff_t); @@ -1115,6 +1160,7 @@ void alloc_nid_done(struct f2fs_sb_info *, nid_t); void alloc_nid_failed(struct f2fs_sb_info *, nid_t); void recover_node_page(struct f2fs_sb_info *, struct page *, struct f2fs_summary *, struct node_info *, block_t); +bool recover_xattr_data(struct inode *, struct page *, block_t); int recover_inode_page(struct f2fs_sb_info *, struct page *); int restore_node_summary(struct f2fs_sb_info *, unsigned int, struct f2fs_summary_block *); @@ -1129,7 +1175,9 @@ void destroy_node_manager_caches(void); */ void f2fs_balance_fs(struct f2fs_sb_info *); void f2fs_balance_fs_bg(struct f2fs_sb_info *); +int f2fs_issue_flush(struct f2fs_sb_info *); void invalidate_blocks(struct f2fs_sb_info *, block_t); +void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); void clear_prefree_segments(struct f2fs_sb_info *); int npages_for_summary_flush(struct f2fs_sb_info *); void allocate_new_segments(struct f2fs_sb_info *); @@ -1162,6 +1210,7 @@ void destroy_segment_manager_caches(void); */ struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); +int ra_meta_pages(struct f2fs_sb_info *, int, int, int); long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); int acquire_orphan_inode(struct f2fs_sb_info *); void release_orphan_inode(struct f2fs_sb_info *); @@ -1231,7 +1280,7 @@ struct f2fs_stat_info { int util_free, util_valid, util_invalid; int rsvd_segs, overp_segs; int dirty_count, node_pages, meta_pages; - int prefree_count, call_count; + int prefree_count, call_count, cp_count; int tot_segs, node_segs, data_segs, free_segs, free_secs; int tot_blks, data_blks, node_blks; int curseg[NR_CURSEG_TYPE]; @@ -1248,6 +1297,7 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) return (struct f2fs_stat_info *)sbi->stat_info; } +#define stat_inc_cp_count(si) ((si)->cp_count++) #define stat_inc_call_count(si) ((si)->call_count++) #define stat_inc_bggc_count(sbi) ((sbi)->bg_gc++) #define stat_inc_dirty_dir(sbi) ((sbi)->n_dirty_dirs++) @@ -1302,6 +1352,7 @@ void f2fs_destroy_stats(struct f2fs_sb_info *); void __init f2fs_create_root_stats(void); void f2fs_destroy_root_stats(void); #else +#define stat_inc_cp_count(si) #define stat_inc_call_count(si) #define stat_inc_bggc_count(si) #define stat_inc_dirty_dir(sbi) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 0dfcef53a6ed..60e7d5448a1d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -76,7 +76,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, trace_f2fs_vm_page_mkwrite(page, DATA); mapped: /* fill the page */ - wait_on_page_writeback(page); + f2fs_wait_on_page_writeback(page, DATA); out: sb_end_pagefault(inode->i_sb); return block_page_mkwrite_return(err); @@ -84,6 +84,7 @@ out: static const struct vm_operations_struct f2fs_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = f2fs_vm_page_mkwrite, .remap_pages = generic_file_remap_pages, }; @@ -111,11 +112,12 @@ static int get_parent_ino(struct inode *inode, nid_t *pino) int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; + struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); int ret = 0; bool need_cp = false; struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE, + .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, .for_reclaim = 0, }; @@ -133,7 +135,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) /* guarantee free sections for fsync */ f2fs_balance_fs(sbi); - mutex_lock(&inode->i_mutex); + down_read(&fi->i_sem); /* * Both of fdatasync() and fsync() are able to be recovered from @@ -150,25 +152,33 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi))) need_cp = true; + up_read(&fi->i_sem); + if (need_cp) { nid_t pino; - F2FS_I(inode)->xattr_ver = 0; - /* all the dirty node pages should be flushed for POR */ ret = f2fs_sync_fs(inode->i_sb, 1); + + down_write(&fi->i_sem); + F2FS_I(inode)->xattr_ver = 0; if (file_wrong_pino(inode) && inode->i_nlink == 1 && get_parent_ino(inode, &pino)) { F2FS_I(inode)->i_pino = pino; file_got_pino(inode); + up_write(&fi->i_sem); mark_inode_dirty_sync(inode); ret = f2fs_write_inode(inode, NULL); if (ret) goto out; + } else { + up_write(&fi->i_sem); } } else { /* if there is no written node page, write its inode page */ while (!sync_node_pages(sbi, inode->i_ino, &wbc)) { + if (fsync_mark_done(sbi, inode->i_ino)) + goto out; mark_inode_dirty_sync(inode); ret = f2fs_write_inode(inode, NULL); if (ret) @@ -177,10 +187,9 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = wait_on_node_pages_writeback(sbi, inode->i_ino); if (ret) goto out; - ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + ret = f2fs_issue_flush(F2FS_SB(inode->i_sb)); } out: - mutex_unlock(&inode->i_mutex); trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); return ret; } @@ -245,7 +254,7 @@ static void truncate_partial_data_page(struct inode *inode, u64 from) f2fs_put_page(page, 1); return; } - wait_on_page_writeback(page); + f2fs_wait_on_page_writeback(page, DATA); zero_user(page, offset, PAGE_CACHE_SIZE - offset); set_page_dirty(page); f2fs_put_page(page, 1); @@ -422,7 +431,7 @@ static void fill_zero(struct inode *inode, pgoff_t index, f2fs_unlock_op(sbi); if (!IS_ERR(page)) { - wait_on_page_writeback(page); + f2fs_wait_on_page_writeback(page, DATA); zero_user(page, start, len); set_page_dirty(page); f2fs_put_page(page, 1); @@ -560,6 +569,8 @@ static long f2fs_fallocate(struct file *file, int mode, if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return -EOPNOTSUPP; + mutex_lock(&inode->i_mutex); + if (mode & FALLOC_FL_PUNCH_HOLE) ret = punch_hole(inode, offset, len); else @@ -569,6 +580,9 @@ static long f2fs_fallocate(struct file *file, int mode, inode->i_mtime = inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); } + + mutex_unlock(&inode->i_mutex); + trace_f2fs_fallocate(inode, mode, offset, len, ret); return ret; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index ea0371e854b4..b90dbe55403a 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -531,15 +531,10 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type) set_page_dirty(page); set_cold_data(page); } else { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - f2fs_wait_on_page_writeback(page, DATA); - if (clear_page_dirty_for_io(page) && - S_ISDIR(inode->i_mode)) { - dec_page_count(sbi, F2FS_DIRTY_DENTS); + if (clear_page_dirty_for_io(page)) inode_dec_dirty_dents(inode); - } set_cold_data(page); do_write_data_page(page, &fio); clear_cold_data(page); @@ -701,6 +696,8 @@ int f2fs_gc(struct f2fs_sb_info *sbi) gc_more: if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) goto stop; + if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG))) + goto stop; if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) { gc_type = FG_GC; @@ -711,6 +708,11 @@ gc_more: goto stop; ret = 0; + /* readahead multi ssa blocks those have contiguous address */ + if (sbi->segs_per_sec > 1) + ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), sbi->segs_per_sec, + META_SSA); + for (i = 0; i < sbi->segs_per_sec; i++) do_garbage_collect(sbi, segno + i, &ilist, gc_type); @@ -740,7 +742,7 @@ void build_gc_manager(struct f2fs_sb_info *sbi) int __init create_gc_caches(void) { winode_slab = f2fs_kmem_cache_create("f2fs_gc_inodes", - sizeof(struct inode_entry), NULL); + sizeof(struct inode_entry)); if (!winode_slab) return -ENOMEM; return 0; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 31ee5b164ff9..383db1fabcf4 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -45,8 +45,10 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page) } ipage = get_node_page(sbi, inode->i_ino); - if (IS_ERR(ipage)) + if (IS_ERR(ipage)) { + unlock_page(page); return PTR_ERR(ipage); + } zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 28cea76d78c6..ee829d360468 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -107,6 +107,7 @@ static int do_read_inode(struct inode *inode) fi->flags = 0; fi->i_advise = ri->i_advise; fi->i_pino = le32_to_cpu(ri->i_pino); + fi->i_dir_level = ri->i_dir_level; get_extent_info(&fi->ext, ri->i_ext); get_inline_info(fi, ri); @@ -204,6 +205,7 @@ void update_inode(struct inode *inode, struct page *node_page) ri->i_flags = cpu_to_le32(F2FS_I(inode)->i_flags); ri->i_pino = cpu_to_le32(F2FS_I(inode)->i_pino); ri->i_generation = cpu_to_le32(inode->i_generation); + ri->i_dir_level = F2FS_I(inode)->i_dir_level; __set_inode_rdev(inode, ri); set_cold_node(inode, node_page); @@ -212,24 +214,29 @@ void update_inode(struct inode *inode, struct page *node_page) clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE); } -int update_inode_page(struct inode *inode) +void update_inode_page(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct page *node_page; - +retry: node_page = get_node_page(sbi, inode->i_ino); - if (IS_ERR(node_page)) - return PTR_ERR(node_page); - + if (IS_ERR(node_page)) { + int err = PTR_ERR(node_page); + if (err == -ENOMEM) { + cond_resched(); + goto retry; + } else if (err != -ENOENT) { + f2fs_stop_checkpoint(sbi); + } + return; + } update_inode(inode, node_page); f2fs_put_page(node_page, 1); - return 0; } int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - int ret; if (inode->i_ino == F2FS_NODE_INO(sbi) || inode->i_ino == F2FS_META_INO(sbi)) @@ -243,13 +250,13 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) * during the urgent cleaning time when runing out of free sections. */ f2fs_lock_op(sbi); - ret = update_inode_page(inode); + update_inode_page(inode); f2fs_unlock_op(sbi); if (wbc) f2fs_balance_fs(sbi); - return ret; + return 0; } /* @@ -266,7 +273,7 @@ void f2fs_evict_inode(struct inode *inode) inode->i_ino == F2FS_META_INO(sbi)) goto no_delete; - f2fs_bug_on(atomic_read(&F2FS_I(inode)->dirty_dents)); + f2fs_bug_on(get_dirty_dents(inode)); remove_dirty_dir_inode(inode); if (inode->i_nlink || is_bad_inode(inode)) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 397d459e97bf..a9409d19dfd4 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -207,6 +207,8 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, inode = f2fs_iget(dir->i_sb, ino); if (IS_ERR(inode)) return ERR_CAST(inode); + + stat_inc_inline_inode(inode); } return d_splice_alias(inode, dentry); @@ -424,12 +426,17 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, } f2fs_set_link(new_dir, new_entry, new_page, old_inode); + down_write(&F2FS_I(old_inode)->i_sem); F2FS_I(old_inode)->i_pino = new_dir->i_ino; + up_write(&F2FS_I(old_inode)->i_sem); new_inode->i_ctime = CURRENT_TIME; + down_write(&F2FS_I(new_inode)->i_sem); if (old_dir_entry) drop_nlink(new_inode); drop_nlink(new_inode); + up_write(&F2FS_I(new_inode)->i_sem); + mark_inode_dirty(new_inode); if (!new_inode->i_nlink) @@ -459,7 +466,9 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (old_dir != new_dir) { f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); + down_write(&F2FS_I(old_inode)->i_sem); F2FS_I(old_inode)->i_pino = new_dir->i_ino; + up_write(&F2FS_I(old_inode)->i_sem); update_inode_page(old_inode); } else { kunmap(old_dir_page); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b0649b76eb4f..a161e955c4c8 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -21,9 +21,27 @@ #include "segment.h" #include <trace/events/f2fs.h> +#define on_build_free_nids(nmi) mutex_is_locked(&nm_i->build_lock) + static struct kmem_cache *nat_entry_slab; static struct kmem_cache *free_nid_slab; +static inline bool available_free_memory(struct f2fs_nm_info *nm_i, int type) +{ + struct sysinfo val; + unsigned long mem_size = 0; + + si_meminfo(&val); + if (type == FREE_NIDS) + mem_size = nm_i->fcnt * sizeof(struct free_nid); + else if (type == NAT_ENTRIES) + mem_size += nm_i->nat_cnt * sizeof(struct nat_entry); + mem_size >>= 12; + + /* give 50:50 memory for free nids and nat caches respectively */ + return (mem_size < ((val.totalram * nm_i->ram_thresh) >> 11)); +} + static void clear_node_page_dirty(struct page *page) { struct address_space *mapping = page->mapping; @@ -82,42 +100,6 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) return dst_page; } -/* - * Readahead NAT pages - */ -static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid) -{ - struct address_space *mapping = META_MAPPING(sbi); - struct f2fs_nm_info *nm_i = NM_I(sbi); - struct page *page; - pgoff_t index; - int i; - struct f2fs_io_info fio = { - .type = META, - .rw = READ_SYNC | REQ_META | REQ_PRIO - }; - - - for (i = 0; i < FREE_NID_PAGES; i++, nid += NAT_ENTRY_PER_BLOCK) { - if (unlikely(nid >= nm_i->max_nid)) - nid = 0; - index = current_nat_addr(sbi, nid); - - page = grab_cache_page(mapping, index); - if (!page) - continue; - if (PageUptodate(page)) { - mark_page_accessed(page); - f2fs_put_page(page, 1); - continue; - } - f2fs_submit_page_mbio(sbi, page, index, &fio); - mark_page_accessed(page); - f2fs_put_page(page, 0); - } - f2fs_submit_merged_bio(sbi, META, READ); -} - static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n) { return radix_tree_lookup(&nm_i->nat_root, n); @@ -151,6 +133,20 @@ int is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) return is_cp; } +bool fsync_mark_done(struct f2fs_sb_info *sbi, nid_t nid) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct nat_entry *e; + bool fsync_done = false; + + read_lock(&nm_i->nat_tree_lock); + e = __lookup_nat_cache(nm_i, nid); + if (e) + fsync_done = e->fsync_done; + read_unlock(&nm_i->nat_tree_lock); + return fsync_done; +} + static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid) { struct nat_entry *new; @@ -164,6 +160,7 @@ static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid) } memset(new, 0, sizeof(struct nat_entry)); nat_set_nid(new, nid); + new->checkpointed = true; list_add_tail(&new->list, &nm_i->nat_entries); nm_i->nat_cnt++; return new; @@ -185,13 +182,12 @@ retry: nat_set_blkaddr(e, le32_to_cpu(ne->block_addr)); nat_set_ino(e, le32_to_cpu(ne->ino)); nat_set_version(e, ne->version); - e->checkpointed = true; } write_unlock(&nm_i->nat_tree_lock); } static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, - block_t new_blkaddr) + block_t new_blkaddr, bool fsync_done) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; @@ -205,7 +201,6 @@ retry: goto retry; } e->ni = *ni; - e->checkpointed = true; f2fs_bug_on(ni->blk_addr == NEW_ADDR); } else if (new_blkaddr == NEW_ADDR) { /* @@ -217,9 +212,6 @@ retry: f2fs_bug_on(ni->blk_addr != NULL_ADDR); } - if (new_blkaddr == NEW_ADDR) - e->checkpointed = false; - /* sanity check */ f2fs_bug_on(nat_get_blkaddr(e) != ni->blk_addr); f2fs_bug_on(nat_get_blkaddr(e) == NULL_ADDR && @@ -239,6 +231,11 @@ retry: /* change address */ nat_set_blkaddr(e, new_blkaddr); __set_nat_cache_dirty(nm_i, e); + + /* update fsync_mark if its inode nat entry is still alive */ + e = __lookup_nat_cache(nm_i, ni->ino); + if (e) + e->fsync_done = fsync_done; write_unlock(&nm_i->nat_tree_lock); } @@ -246,7 +243,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) { struct f2fs_nm_info *nm_i = NM_I(sbi); - if (nm_i->nat_cnt <= NM_WOUT_THRESHOLD) + if (available_free_memory(nm_i, NAT_ENTRIES)) return 0; write_lock(&nm_i->nat_tree_lock); @@ -505,7 +502,7 @@ static void truncate_node(struct dnode_of_data *dn) /* Deallocate node address */ invalidate_blocks(sbi, ni.blk_addr); dec_valid_node_count(sbi, dn->inode); - set_node_addr(sbi, &ni, NULL_ADDR); + set_node_addr(sbi, &ni, NULL_ADDR, false); if (dn->nid == dn->inode->i_ino) { remove_orphan_inode(sbi, dn->nid); @@ -763,7 +760,7 @@ skip_partial: f2fs_put_page(page, 1); goto restart; } - wait_on_page_writeback(page); + f2fs_wait_on_page_writeback(page, NODE); ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0; set_page_dirty(page); unlock_page(page); @@ -852,7 +849,8 @@ struct page *new_node_page(struct dnode_of_data *dn, if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) return ERR_PTR(-EPERM); - page = grab_cache_page(NODE_MAPPING(sbi), dn->nid); + page = grab_cache_page_write_begin(NODE_MAPPING(sbi), + dn->nid, AOP_FLAG_NOFS); if (!page) return ERR_PTR(-ENOMEM); @@ -867,14 +865,14 @@ struct page *new_node_page(struct dnode_of_data *dn, f2fs_bug_on(old_ni.blk_addr != NULL_ADDR); new_ni = old_ni; new_ni.ino = dn->inode->i_ino; - set_node_addr(sbi, &new_ni, NEW_ADDR); + set_node_addr(sbi, &new_ni, NEW_ADDR, false); fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true); set_cold_node(dn->inode, page); SetPageUptodate(page); set_page_dirty(page); - if (ofs == XATTR_NODE_OFFSET) + if (f2fs_has_xattr_block(ofs)) F2FS_I(dn->inode)->i_xattr_nid = dn->nid; dn->node_page = page; @@ -948,7 +946,8 @@ struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) struct page *page; int err; repeat: - page = grab_cache_page(NODE_MAPPING(sbi), nid); + page = grab_cache_page_write_begin(NODE_MAPPING(sbi), + nid, AOP_FLAG_NOFS); if (!page) return ERR_PTR(-ENOMEM); @@ -959,7 +958,7 @@ repeat: goto got_it; lock_page(page); - if (unlikely(!PageUptodate(page))) { + if (unlikely(!PageUptodate(page) || nid != nid_of_node(page))) { f2fs_put_page(page, 1); return ERR_PTR(-EIO); } @@ -968,7 +967,6 @@ repeat: goto repeat; } got_it: - f2fs_bug_on(nid != nid_of_node(page)); mark_page_accessed(page); return page; } @@ -1168,7 +1166,7 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) continue; if (ino && ino_of_node(page) == ino) { - wait_on_page_writeback(page); + f2fs_wait_on_page_writeback(page, NODE); if (TestClearPageError(page)) ret = -EIO; } @@ -1201,7 +1199,7 @@ static int f2fs_write_node_page(struct page *page, if (unlikely(sbi->por_doing)) goto redirty_out; - wait_on_page_writeback(page); + f2fs_wait_on_page_writeback(page, NODE); /* get old block addr of this node page */ nid = nid_of_node(page); @@ -1222,7 +1220,7 @@ static int f2fs_write_node_page(struct page *page, mutex_lock(&sbi->node_write); set_page_writeback(page); write_node_page(sbi, page, &fio, nid, ni.blk_addr, &new_addr); - set_node_addr(sbi, &ni, new_addr); + set_node_addr(sbi, &ni, new_addr, is_fsync_dnode(page)); dec_page_count(sbi, F2FS_DIRTY_NODES); mutex_unlock(&sbi->node_write); unlock_page(page); @@ -1231,35 +1229,32 @@ static int f2fs_write_node_page(struct page *page, redirty_out: dec_page_count(sbi, F2FS_DIRTY_NODES); wbc->pages_skipped++; + account_page_redirty(page); set_page_dirty(page); return AOP_WRITEPAGE_ACTIVATE; } -/* - * It is very important to gather dirty pages and write at once, so that we can - * submit a big bio without interfering other data writes. - * Be default, 512 pages (2MB) * 3 node types, is more reasonable. - */ -#define COLLECT_DIRTY_NODES 1536 static int f2fs_write_node_pages(struct address_space *mapping, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); - long nr_to_write = wbc->nr_to_write; + long diff; /* balancing f2fs's metadata in background */ f2fs_balance_fs_bg(sbi); /* collect a number of dirty node pages and write together */ - if (get_pages(sbi, F2FS_DIRTY_NODES) < COLLECT_DIRTY_NODES) - return 0; + if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE)) + goto skip_write; - /* if mounting is failed, skip writing node pages */ - wbc->nr_to_write = 3 * max_hw_blocks(sbi); + diff = nr_pages_to_write(sbi, NODE, wbc); wbc->sync_mode = WB_SYNC_NONE; sync_node_pages(sbi, 0, wbc); - wbc->nr_to_write = nr_to_write - (3 * max_hw_blocks(sbi) - - wbc->nr_to_write); + wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); + return 0; + +skip_write: + wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_NODES); return 0; } @@ -1307,22 +1302,17 @@ const struct address_space_operations f2fs_node_aops = { .releasepage = f2fs_release_node_page, }; -static struct free_nid *__lookup_free_nid_list(nid_t n, struct list_head *head) +static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, + nid_t n) { - struct list_head *this; - struct free_nid *i; - list_for_each(this, head) { - i = list_entry(this, struct free_nid, list); - if (i->nid == n) - return i; - } - return NULL; + return radix_tree_lookup(&nm_i->free_nid_root, n); } -static void __del_from_free_nid_list(struct free_nid *i) +static void __del_from_free_nid_list(struct f2fs_nm_info *nm_i, + struct free_nid *i) { list_del(&i->list); - kmem_cache_free(free_nid_slab, i); + radix_tree_delete(&nm_i->free_nid_root, i->nid); } static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build) @@ -1331,7 +1321,7 @@ static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build) struct nat_entry *ne; bool allocated = false; - if (nm_i->fcnt > 2 * MAX_FREE_NIDS) + if (!available_free_memory(nm_i, FREE_NIDS)) return -1; /* 0 nid should not be used */ @@ -1342,7 +1332,8 @@ static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build) /* do not add allocated nids */ read_lock(&nm_i->nat_tree_lock); ne = __lookup_nat_cache(nm_i, nid); - if (ne && nat_get_blkaddr(ne) != NULL_ADDR) + if (ne && + (!ne->checkpointed || nat_get_blkaddr(ne) != NULL_ADDR)) allocated = true; read_unlock(&nm_i->nat_tree_lock); if (allocated) @@ -1354,7 +1345,7 @@ static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build) i->state = NID_NEW; spin_lock(&nm_i->free_nid_list_lock); - if (__lookup_free_nid_list(nid, &nm_i->free_nid_list)) { + if (radix_tree_insert(&nm_i->free_nid_root, i->nid, i)) { spin_unlock(&nm_i->free_nid_list_lock); kmem_cache_free(free_nid_slab, i); return 0; @@ -1368,13 +1359,19 @@ static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build) static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid) { struct free_nid *i; + bool need_free = false; + spin_lock(&nm_i->free_nid_list_lock); - i = __lookup_free_nid_list(nid, &nm_i->free_nid_list); + i = __lookup_free_nid_list(nm_i, nid); if (i && i->state == NID_NEW) { - __del_from_free_nid_list(i); + __del_from_free_nid_list(nm_i, i); nm_i->fcnt--; + need_free = true; } spin_unlock(&nm_i->free_nid_list_lock); + + if (need_free) + kmem_cache_free(free_nid_slab, i); } static void scan_nat_page(struct f2fs_nm_info *nm_i, @@ -1413,7 +1410,7 @@ static void build_free_nids(struct f2fs_sb_info *sbi) return; /* readahead nat pages to be scanned */ - ra_nat_pages(sbi, nid); + ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT); while (1) { struct page *page = get_current_nat_page(sbi, nid); @@ -1454,7 +1451,6 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i = NULL; - struct list_head *this; retry: if (unlikely(sbi->total_valid_node_count + 1 >= nm_i->max_nid)) return false; @@ -1462,13 +1458,11 @@ retry: spin_lock(&nm_i->free_nid_list_lock); /* We should not use stale free nids created by build_free_nids */ - if (nm_i->fcnt && !sbi->on_build_free_nids) { + if (nm_i->fcnt && !on_build_free_nids(nm_i)) { f2fs_bug_on(list_empty(&nm_i->free_nid_list)); - list_for_each(this, &nm_i->free_nid_list) { - i = list_entry(this, struct free_nid, list); + list_for_each_entry(i, &nm_i->free_nid_list, list) if (i->state == NID_NEW) break; - } f2fs_bug_on(i->state != NID_NEW); *nid = i->nid; @@ -1481,9 +1475,7 @@ retry: /* Let's scan nat pages and its caches to get free nids */ mutex_lock(&nm_i->build_lock); - sbi->on_build_free_nids = true; build_free_nids(sbi); - sbi->on_build_free_nids = false; mutex_unlock(&nm_i->build_lock); goto retry; } @@ -1497,10 +1489,12 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) struct free_nid *i; spin_lock(&nm_i->free_nid_list_lock); - i = __lookup_free_nid_list(nid, &nm_i->free_nid_list); + i = __lookup_free_nid_list(nm_i, nid); f2fs_bug_on(!i || i->state != NID_ALLOC); - __del_from_free_nid_list(i); + __del_from_free_nid_list(nm_i, i); spin_unlock(&nm_i->free_nid_list_lock); + + kmem_cache_free(free_nid_slab, i); } /* @@ -1510,20 +1504,25 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; + bool need_free = false; if (!nid) return; spin_lock(&nm_i->free_nid_list_lock); - i = __lookup_free_nid_list(nid, &nm_i->free_nid_list); + i = __lookup_free_nid_list(nm_i, nid); f2fs_bug_on(!i || i->state != NID_ALLOC); - if (nm_i->fcnt > 2 * MAX_FREE_NIDS) { - __del_from_free_nid_list(i); + if (!available_free_memory(nm_i, FREE_NIDS)) { + __del_from_free_nid_list(nm_i, i); + need_free = true; } else { i->state = NID_NEW; nm_i->fcnt++; } spin_unlock(&nm_i->free_nid_list_lock); + + if (need_free) + kmem_cache_free(free_nid_slab, i); } void recover_node_page(struct f2fs_sb_info *sbi, struct page *page, @@ -1531,10 +1530,83 @@ void recover_node_page(struct f2fs_sb_info *sbi, struct page *page, block_t new_blkaddr) { rewrite_node_page(sbi, page, sum, ni->blk_addr, new_blkaddr); - set_node_addr(sbi, ni, new_blkaddr); + set_node_addr(sbi, ni, new_blkaddr, false); clear_node_page_dirty(page); } +void recover_inline_xattr(struct inode *inode, struct page *page) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + void *src_addr, *dst_addr; + size_t inline_size; + struct page *ipage; + struct f2fs_inode *ri; + + if (!f2fs_has_inline_xattr(inode)) + return; + + if (!IS_INODE(page)) + return; + + ri = F2FS_INODE(page); + if (!(ri->i_inline & F2FS_INLINE_XATTR)) + return; + + ipage = get_node_page(sbi, inode->i_ino); + f2fs_bug_on(IS_ERR(ipage)); + + dst_addr = inline_xattr_addr(ipage); + src_addr = inline_xattr_addr(page); + inline_size = inline_xattr_size(inode); + + memcpy(dst_addr, src_addr, inline_size); + + update_inode(inode, ipage); + f2fs_put_page(ipage, 1); +} + +bool recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid; + nid_t new_xnid = nid_of_node(page); + struct node_info ni; + + recover_inline_xattr(inode, page); + + if (!f2fs_has_xattr_block(ofs_of_node(page))) + return false; + + /* 1: invalidate the previous xattr nid */ + if (!prev_xnid) + goto recover_xnid; + + /* Deallocate node address */ + get_node_info(sbi, prev_xnid, &ni); + f2fs_bug_on(ni.blk_addr == NULL_ADDR); + invalidate_blocks(sbi, ni.blk_addr); + dec_valid_node_count(sbi, inode); + set_node_addr(sbi, &ni, NULL_ADDR, false); + +recover_xnid: + /* 2: allocate new xattr nid */ + if (unlikely(!inc_valid_node_count(sbi, inode))) + f2fs_bug_on(1); + + remove_free_nid(NM_I(sbi), new_xnid); + get_node_info(sbi, new_xnid, &ni); + ni.ino = inode->i_ino; + set_node_addr(sbi, &ni, NEW_ADDR, false); + F2FS_I(inode)->i_xattr_nid = new_xnid; + + /* 3: update xattr blkaddr */ + refresh_sit_entry(sbi, NEW_ADDR, blkaddr); + set_node_addr(sbi, &ni, blkaddr, false); + + update_inode_page(inode); + return true; +} + int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) { struct f2fs_inode *src, *dst; @@ -1567,7 +1639,7 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) if (unlikely(!inc_valid_node_count(sbi, NULL))) WARN_ON(1); - set_node_addr(sbi, &new_ni, NEW_ADDR); + set_node_addr(sbi, &new_ni, NEW_ADDR, false); inc_valid_inode_count(sbi); f2fs_put_page(ipage, 1); return 0; @@ -1590,15 +1662,8 @@ static int ra_sum_pages(struct f2fs_sb_info *sbi, struct list_head *pages, for (; page_idx < start + nrpages; page_idx++) { /* alloc temporal page for read node summary info*/ page = alloc_page(GFP_F2FS_ZERO); - if (!page) { - struct page *tmp; - list_for_each_entry_safe(page, tmp, pages, lru) { - list_del(&page->lru); - unlock_page(page); - __free_pages(page, 0); - } - return -ENOMEM; - } + if (!page) + break; lock_page(page); page->index = page_idx; @@ -1609,7 +1674,8 @@ static int ra_sum_pages(struct f2fs_sb_info *sbi, struct list_head *pages, f2fs_submit_page_mbio(sbi, page, page->index, &fio); f2fs_submit_merged_bio(sbi, META, READ); - return 0; + + return page_idx - start; } int restore_node_summary(struct f2fs_sb_info *sbi, @@ -1628,15 +1694,17 @@ int restore_node_summary(struct f2fs_sb_info *sbi, addr = START_BLOCK(sbi, segno); sum_entry = &sum->entries[0]; - for (i = 0; i < last_offset; i += nrpages, addr += nrpages) { + for (i = 0; !err && i < last_offset; i += nrpages, addr += nrpages) { nrpages = min(last_offset - i, bio_blocks); /* read ahead node pages */ - err = ra_sum_pages(sbi, &page_list, addr, nrpages); - if (err) - return err; + nrpages = ra_sum_pages(sbi, &page_list, addr, nrpages); + if (!nrpages) + return -ENOMEM; list_for_each_entry_safe(page, tmp, &page_list, lru) { + if (err) + goto skip; lock_page(page); if (unlikely(!PageUptodate(page))) { @@ -1648,9 +1716,9 @@ int restore_node_summary(struct f2fs_sb_info *sbi, sum_entry->ofs_in_node = 0; sum_entry++; } - - list_del(&page->lru); unlock_page(page); +skip: + list_del(&page->lru); __free_pages(page, 0); } } @@ -1709,7 +1777,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); struct f2fs_summary_block *sum = curseg->sum_blk; - struct list_head *cur, *n; + struct nat_entry *ne, *cur; struct page *page = NULL; struct f2fs_nat_block *nat_blk = NULL; nid_t start_nid = 0, end_nid = 0; @@ -1721,18 +1789,17 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) mutex_lock(&curseg->curseg_mutex); /* 1) flush dirty nat caches */ - list_for_each_safe(cur, n, &nm_i->dirty_nat_entries) { - struct nat_entry *ne; + list_for_each_entry_safe(ne, cur, &nm_i->dirty_nat_entries, list) { nid_t nid; struct f2fs_nat_entry raw_ne; int offset = -1; block_t new_blkaddr; - ne = list_entry(cur, struct nat_entry, list); - nid = nat_get_nid(ne); - if (nat_get_blkaddr(ne) == NEW_ADDR) continue; + + nid = nat_get_nid(ne); + if (flushed) goto to_nat_page; @@ -1783,16 +1850,12 @@ flush_now: } else { write_lock(&nm_i->nat_tree_lock); __clear_nat_cache_dirty(nm_i, ne); - ne->checkpointed = true; write_unlock(&nm_i->nat_tree_lock); } } if (!flushed) mutex_unlock(&curseg->curseg_mutex); f2fs_put_page(page, 1); - - /* 2) shrink nat caches if necessary */ - try_to_free_nats(sbi, nm_i->nat_cnt - NM_WOUT_THRESHOLD); } static int init_node_manager(struct f2fs_sb_info *sbi) @@ -1807,10 +1870,14 @@ static int init_node_manager(struct f2fs_sb_info *sbi) /* segment_count_nat includes pair segment so divide to 2. */ nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1; nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg); - nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks; + + /* not used nids: 0, node, meta, (and root counted as valid node) */ + nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks - 3; nm_i->fcnt = 0; nm_i->nat_cnt = 0; + nm_i->ram_thresh = DEF_RAM_THRESHOLD; + INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); INIT_LIST_HEAD(&nm_i->free_nid_list); INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC); INIT_LIST_HEAD(&nm_i->nat_entries); @@ -1864,8 +1931,11 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) spin_lock(&nm_i->free_nid_list_lock); list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) { f2fs_bug_on(i->state == NID_ALLOC); - __del_from_free_nid_list(i); + __del_from_free_nid_list(nm_i, i); nm_i->fcnt--; + spin_unlock(&nm_i->free_nid_list_lock); + kmem_cache_free(free_nid_slab, i); + spin_lock(&nm_i->free_nid_list_lock); } f2fs_bug_on(nm_i->fcnt); spin_unlock(&nm_i->free_nid_list_lock); @@ -1875,11 +1945,9 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) while ((found = __gang_lookup_nat_cache(nm_i, nid, NATVEC_SIZE, natvec))) { unsigned idx; - for (idx = 0; idx < found; idx++) { - struct nat_entry *e = natvec[idx]; - nid = nat_get_nid(e) + 1; - __del_from_nat_cache(nm_i, e); - } + nid = nat_get_nid(natvec[found - 1]) + 1; + for (idx = 0; idx < found; idx++) + __del_from_nat_cache(nm_i, natvec[idx]); } f2fs_bug_on(nm_i->nat_cnt); write_unlock(&nm_i->nat_tree_lock); @@ -1892,12 +1960,12 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) int __init create_node_manager_caches(void) { nat_entry_slab = f2fs_kmem_cache_create("nat_entry", - sizeof(struct nat_entry), NULL); + sizeof(struct nat_entry)); if (!nat_entry_slab) return -ENOMEM; free_nid_slab = f2fs_kmem_cache_create("free_nid", - sizeof(struct free_nid), NULL); + sizeof(struct free_nid)); if (!free_nid_slab) { kmem_cache_destroy(nat_entry_slab); return -ENOMEM; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index c4c79885c993..5decc1a375f0 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -17,14 +17,11 @@ /* # of pages to perform readahead before building free nids */ #define FREE_NID_PAGES 4 -/* maximum # of free node ids to produce during build_free_nids */ -#define MAX_FREE_NIDS (NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES) - /* maximum readahead size for node during getting data blocks */ #define MAX_RA_NODE 128 -/* maximum cached nat entries to manage memory footprint */ -#define NM_WOUT_THRESHOLD (64 * NAT_ENTRY_PER_BLOCK) +/* control the memory footprint threshold (10MB per 1GB ram) */ +#define DEF_RAM_THRESHOLD 10 /* vector size for gang look-up from nat cache that consists of radix tree */ #define NATVEC_SIZE 64 @@ -45,6 +42,7 @@ struct node_info { struct nat_entry { struct list_head list; /* for clean or dirty nat list */ bool checkpointed; /* whether it is checkpointed or not */ + bool fsync_done; /* whether the latest node has fsync mark */ struct node_info ni; /* in-memory node information */ }; @@ -58,9 +56,15 @@ struct nat_entry { #define nat_set_version(nat, v) (nat->ni.version = v) #define __set_nat_cache_dirty(nm_i, ne) \ - list_move_tail(&ne->list, &nm_i->dirty_nat_entries); + do { \ + ne->checkpointed = false; \ + list_move_tail(&ne->list, &nm_i->dirty_nat_entries); \ + } while (0); #define __clear_nat_cache_dirty(nm_i, ne) \ - list_move_tail(&ne->list, &nm_i->nat_entries); + do { \ + ne->checkpointed = true; \ + list_move_tail(&ne->list, &nm_i->nat_entries); \ + } while (0); #define inc_node_version(version) (++version) static inline void node_info_from_raw_nat(struct node_info *ni, @@ -71,6 +75,11 @@ static inline void node_info_from_raw_nat(struct node_info *ni, ni->version = raw_ne->version; } +enum nid_type { + FREE_NIDS, /* indicates the free nid list */ + NAT_ENTRIES /* indicates the cached nat entry */ +}; + /* * For free nid mangement */ @@ -236,7 +245,7 @@ static inline bool IS_DNODE(struct page *node_page) { unsigned int ofs = ofs_of_node(node_page); - if (ofs == XATTR_NODE_OFFSET) + if (f2fs_has_xattr_block(ofs)) return false; if (ofs == 3 || ofs == 4 + NIDS_PER_BLOCK || diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 976a7a934db5..b1ae89f0f44e 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -27,14 +27,12 @@ bool space_for_roll_forward(struct f2fs_sb_info *sbi) static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, nid_t ino) { - struct list_head *this; struct fsync_inode_entry *entry; - list_for_each(this, head) { - entry = list_entry(this, struct fsync_inode_entry, list); + list_for_each_entry(entry, head, list) if (entry->inode->i_ino == ino) return entry; - } + return NULL; } @@ -136,7 +134,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) /* get node pages in the current segment */ curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); - blkaddr = START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff; + blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); /* read node page */ page = alloc_page(GFP_F2FS_ZERO); @@ -218,13 +216,12 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, { struct seg_entry *sentry; unsigned int segno = GET_SEGNO(sbi, blkaddr); - unsigned short blkoff = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) & - (sbi->blocks_per_seg - 1); + unsigned short blkoff = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); + struct f2fs_summary_block *sum_node; struct f2fs_summary sum; + struct page *sum_page, *node_page; nid_t ino, nid; - void *kaddr; struct inode *inode; - struct page *node_page; unsigned int offset; block_t bidx; int i; @@ -238,18 +235,15 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, struct curseg_info *curseg = CURSEG_I(sbi, i); if (curseg->segno == segno) { sum = curseg->sum_blk->entries[blkoff]; - break; + goto got_it; } } - if (i > CURSEG_COLD_DATA) { - struct page *sum_page = get_sum_page(sbi, segno); - struct f2fs_summary_block *sum_node; - kaddr = page_address(sum_page); - sum_node = (struct f2fs_summary_block *)kaddr; - sum = sum_node->entries[blkoff]; - f2fs_put_page(sum_page, 1); - } + sum_page = get_sum_page(sbi, segno); + sum_node = (struct f2fs_summary_block *)page_address(sum_page); + sum = sum_node->entries[blkoff]; + f2fs_put_page(sum_page, 1); +got_it: /* Use the locked dnode page and inode */ nid = le32_to_cpu(sum.nid); if (dn->inode->i_ino == nid) { @@ -301,6 +295,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, if (recover_inline_data(inode, page)) goto out; + if (recover_xattr_data(inode, page, blkaddr)) + goto out; + start = start_bidx_of_node(ofs_of_node(page), fi); if (IS_INODE(page)) end = start + ADDRS_PER_INODE(fi); @@ -317,7 +314,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, goto out; } - wait_on_page_writeback(dn.node_page); + f2fs_wait_on_page_writeback(dn.node_page, NODE); get_node_info(sbi, dn.nid, &ni); f2fs_bug_on(ni.ino != ino_of_node(page)); @@ -437,7 +434,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) bool need_writecp = false; fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", - sizeof(struct fsync_inode_entry), NULL); + sizeof(struct fsync_inode_entry)); if (!fsync_entry_slab) return -ENOMEM; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 7caac5f2ca9e..085f548be7a3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -13,6 +13,7 @@ #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/prefetch.h> +#include <linux/kthread.h> #include <linux/vmalloc.h> #include <linux/swap.h> @@ -24,6 +25,7 @@ #define __reverse_ffz(x) __reverse_ffs(~(x)) static struct kmem_cache *discard_entry_slab; +static struct kmem_cache *flush_cmd_slab; /* * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since @@ -195,6 +197,73 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) f2fs_sync_fs(sbi->sb, true); } +static int issue_flush_thread(void *data) +{ + struct f2fs_sb_info *sbi = data; + struct f2fs_sm_info *sm_i = SM_I(sbi); + wait_queue_head_t *q = &sm_i->flush_wait_queue; +repeat: + if (kthread_should_stop()) + return 0; + + spin_lock(&sm_i->issue_lock); + if (sm_i->issue_list) { + sm_i->dispatch_list = sm_i->issue_list; + sm_i->issue_list = sm_i->issue_tail = NULL; + } + spin_unlock(&sm_i->issue_lock); + + if (sm_i->dispatch_list) { + struct bio *bio = bio_alloc(GFP_NOIO, 0); + struct flush_cmd *cmd, *next; + int ret; + + bio->bi_bdev = sbi->sb->s_bdev; + ret = submit_bio_wait(WRITE_FLUSH, bio); + + for (cmd = sm_i->dispatch_list; cmd; cmd = next) { + cmd->ret = ret; + next = cmd->next; + complete(&cmd->wait); + } + sm_i->dispatch_list = NULL; + } + + wait_event_interruptible(*q, kthread_should_stop() || sm_i->issue_list); + goto repeat; +} + +int f2fs_issue_flush(struct f2fs_sb_info *sbi) +{ + struct f2fs_sm_info *sm_i = SM_I(sbi); + struct flush_cmd *cmd; + int ret; + + if (!test_opt(sbi, FLUSH_MERGE)) + return blkdev_issue_flush(sbi->sb->s_bdev, GFP_KERNEL, NULL); + + cmd = f2fs_kmem_cache_alloc(flush_cmd_slab, GFP_ATOMIC); + cmd->next = NULL; + cmd->ret = 0; + init_completion(&cmd->wait); + + spin_lock(&sm_i->issue_lock); + if (sm_i->issue_list) + sm_i->issue_tail->next = cmd; + else + sm_i->issue_list = cmd; + sm_i->issue_tail = cmd; + spin_unlock(&sm_i->issue_lock); + + if (!sm_i->dispatch_list) + wake_up(&sm_i->flush_wait_queue); + + wait_for_completion(&cmd->wait); + ret = cmd->ret; + kmem_cache_free(flush_cmd_slab, cmd); + return ret; +} + static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, enum dirty_type dirty_type) { @@ -340,8 +409,7 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) void clear_prefree_segments(struct f2fs_sb_info *sbi) { struct list_head *head = &(SM_I(sbi)->discard_list); - struct list_head *this, *next; - struct discard_entry *entry; + struct discard_entry *entry, *this; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; unsigned int total_segs = TOTAL_SEGS(sbi); @@ -370,8 +438,7 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi) mutex_unlock(&dirty_i->seglist_lock); /* send small discards */ - list_for_each_safe(this, next, head) { - entry = list_entry(this, struct discard_entry, list); + list_for_each_entry_safe(entry, this, head, list) { f2fs_issue_discard(sbi, entry->blkaddr, entry->len); list_del(&entry->list); SM_I(sbi)->nr_discards -= entry->len; @@ -405,7 +472,7 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) se = get_seg_entry(sbi, segno); new_vblocks = se->valid_blocks + del; - offset = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) & (sbi->blocks_per_seg - 1); + offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); f2fs_bug_on((new_vblocks >> (sizeof(unsigned short) << 3) || (new_vblocks > sbi->blocks_per_seg))); @@ -434,12 +501,14 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) get_sec_entry(sbi, segno)->valid_blocks += del; } -static void refresh_sit_entry(struct f2fs_sb_info *sbi, - block_t old_blkaddr, block_t new_blkaddr) +void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new) { - update_sit_entry(sbi, new_blkaddr, 1); - if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) - update_sit_entry(sbi, old_blkaddr, -1); + update_sit_entry(sbi, new, 1); + if (GET_SEGNO(sbi, old) != NULL_SEGNO) + update_sit_entry(sbi, old, -1); + + locate_dirty_segment(sbi, GET_SEGNO(sbi, old)); + locate_dirty_segment(sbi, GET_SEGNO(sbi, new)); } void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) @@ -881,17 +950,15 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, stat_inc_block_count(sbi, curseg); + if (!__has_curseg_space(sbi, type)) + sit_i->s_ops->allocate_segment(sbi, type, false); /* * SIT information should be updated before segment allocation, * since SSR needs latest valid block information. */ refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr); - - if (!__has_curseg_space(sbi, type)) - sit_i->s_ops->allocate_segment(sbi, type, false); - locate_dirty_segment(sbi, old_cursegno); - locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); + mutex_unlock(&sit_i->sentry_lock); if (page && IS_NODESEG(type)) @@ -987,14 +1054,11 @@ void recover_data_page(struct f2fs_sb_info *sbi, change_curseg(sbi, type, true); } - curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) & - (sbi->blocks_per_seg - 1); + curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); __add_sum_entry(sbi, type, sum); refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); - locate_dirty_segment(sbi, old_cursegno); - locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); mutex_unlock(&sit_i->sentry_lock); mutex_unlock(&curseg->curseg_mutex); @@ -1028,8 +1092,7 @@ void rewrite_node_page(struct f2fs_sb_info *sbi, curseg->next_segno = segno; change_curseg(sbi, type, true); } - curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) & - (sbi->blocks_per_seg - 1); + curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); __add_sum_entry(sbi, type, sum); /* change the current log to the next block addr in advance */ @@ -1037,28 +1100,50 @@ void rewrite_node_page(struct f2fs_sb_info *sbi, curseg->next_segno = next_segno; change_curseg(sbi, type, true); } - curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, next_blkaddr) & - (sbi->blocks_per_seg - 1); + curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, next_blkaddr); /* rewrite node page */ set_page_writeback(page); f2fs_submit_page_mbio(sbi, page, new_blkaddr, &fio); f2fs_submit_merged_bio(sbi, NODE, WRITE); refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); - locate_dirty_segment(sbi, old_cursegno); - locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); mutex_unlock(&sit_i->sentry_lock); mutex_unlock(&curseg->curseg_mutex); } +static inline bool is_merged_page(struct f2fs_sb_info *sbi, + struct page *page, enum page_type type) +{ + enum page_type btype = PAGE_TYPE_OF_BIO(type); + struct f2fs_bio_info *io = &sbi->write_io[btype]; + struct bio_vec *bvec; + int i; + + down_read(&io->io_rwsem); + if (!io->bio) + goto out; + + bio_for_each_segment_all(bvec, io->bio, i) { + if (page == bvec->bv_page) { + up_read(&io->io_rwsem); + return true; + } + } + +out: + up_read(&io->io_rwsem); + return false; +} + void f2fs_wait_on_page_writeback(struct page *page, enum page_type type) { struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); if (PageWriteback(page)) { - f2fs_submit_merged_bio(sbi, type, WRITE); + if (is_merged_page(sbi, page, type)) + f2fs_submit_merged_bio(sbi, type, WRITE); wait_on_page_writeback(page); } } @@ -1167,9 +1252,12 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) ns->ofs_in_node = 0; } } else { - if (restore_node_summary(sbi, segno, sum)) { + int err; + + err = restore_node_summary(sbi, segno, sum); + if (err) { f2fs_put_page(new, 1); - return -EINVAL; + return err; } } } @@ -1190,6 +1278,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) static int restore_curseg_summaries(struct f2fs_sb_info *sbi) { int type = CURSEG_HOT_DATA; + int err; if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) { /* restore for compacted data summary */ @@ -1198,9 +1287,12 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) type = CURSEG_HOT_NODE; } - for (; type <= CURSEG_COLD_NODE; type++) - if (read_normal_summaries(sbi, type)) - return -EINVAL; + for (; type <= CURSEG_COLD_NODE; type++) { + err = read_normal_summaries(sbi, type); + if (err) + return err; + } + return 0; } @@ -1583,47 +1675,6 @@ static int build_curseg(struct f2fs_sb_info *sbi) return restore_curseg_summaries(sbi); } -static int ra_sit_pages(struct f2fs_sb_info *sbi, int start, int nrpages) -{ - struct address_space *mapping = META_MAPPING(sbi); - struct page *page; - block_t blk_addr, prev_blk_addr = 0; - int sit_blk_cnt = SIT_BLK_CNT(sbi); - int blkno = start; - struct f2fs_io_info fio = { - .type = META, - .rw = READ_SYNC | REQ_META | REQ_PRIO - }; - - for (; blkno < start + nrpages && blkno < sit_blk_cnt; blkno++) { - - blk_addr = current_sit_addr(sbi, blkno * SIT_ENTRY_PER_BLOCK); - - if (blkno != start && prev_blk_addr + 1 != blk_addr) - break; - prev_blk_addr = blk_addr; -repeat: - page = grab_cache_page(mapping, blk_addr); - if (!page) { - cond_resched(); - goto repeat; - } - if (PageUptodate(page)) { - mark_page_accessed(page); - f2fs_put_page(page, 1); - continue; - } - - f2fs_submit_page_mbio(sbi, page, blk_addr, &fio); - - mark_page_accessed(page); - f2fs_put_page(page, 0); - } - - f2fs_submit_merged_bio(sbi, META, READ); - return blkno - start; -} - static void build_sit_entries(struct f2fs_sb_info *sbi) { struct sit_info *sit_i = SIT_I(sbi); @@ -1635,7 +1686,7 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); do { - readed = ra_sit_pages(sbi, start_blk, nrpages); + readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT); start = start_blk * sit_i->sents_per_block; end = (start_blk + readed) * sit_i->sents_per_block; @@ -1781,6 +1832,7 @@ int build_segment_manager(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + dev_t dev = sbi->sb->s_bdev->bd_dev; struct f2fs_sm_info *sm_info; int err; @@ -1799,7 +1851,8 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->ovp_segments = le32_to_cpu(ckpt->overprov_segment_count); sm_info->main_segments = le32_to_cpu(raw_super->segment_count_main); sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); - sm_info->rec_prefree_segments = DEF_RECLAIM_PREFREE_SEGMENTS; + sm_info->rec_prefree_segments = sm_info->main_segments * + DEF_RECLAIM_PREFREE_SEGMENTS / 100; sm_info->ipu_policy = F2FS_IPU_DISABLE; sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; @@ -1807,6 +1860,16 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->nr_discards = 0; sm_info->max_discards = 0; + if (test_opt(sbi, FLUSH_MERGE)) { + spin_lock_init(&sm_info->issue_lock); + init_waitqueue_head(&sm_info->flush_wait_queue); + + sm_info->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, + "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); + if (IS_ERR(sm_info->f2fs_issue_flush)) + return PTR_ERR(sm_info->f2fs_issue_flush); + } + err = build_sit_info(sbi); if (err) return err; @@ -1915,6 +1978,8 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi) struct f2fs_sm_info *sm_info = SM_I(sbi); if (!sm_info) return; + if (sm_info->f2fs_issue_flush) + kthread_stop(sm_info->f2fs_issue_flush); destroy_dirty_segmap(sbi); destroy_curseg(sbi); destroy_free_segmap(sbi); @@ -1926,13 +1991,20 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi) int __init create_segment_manager_caches(void) { discard_entry_slab = f2fs_kmem_cache_create("discard_entry", - sizeof(struct discard_entry), NULL); + sizeof(struct discard_entry)); if (!discard_entry_slab) return -ENOMEM; + flush_cmd_slab = f2fs_kmem_cache_create("flush_command", + sizeof(struct flush_cmd)); + if (!flush_cmd_slab) { + kmem_cache_destroy(discard_entry_slab); + return -ENOMEM; + } return 0; } void destroy_segment_manager_caches(void) { kmem_cache_destroy(discard_entry_slab); + kmem_cache_destroy(flush_cmd_slab); } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 5731682d7516..7091204680f4 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -14,7 +14,7 @@ #define NULL_SEGNO ((unsigned int)(~0)) #define NULL_SECNO ((unsigned int)(~0)) -#define DEF_RECLAIM_PREFREE_SEGMENTS 100 /* 200MB of prefree segments */ +#define DEF_RECLAIM_PREFREE_SEGMENTS 5 /* 5% over total segments */ /* L: Logical segment # in volume, R: Relative segment # in main area */ #define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) @@ -57,6 +57,9 @@ ((blk_addr) - SM_I(sbi)->seg0_blkaddr) #define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \ (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg) +#define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \ + (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & (sbi->blocks_per_seg - 1)) + #define GET_SEGNO(sbi, blk_addr) \ (((blk_addr == NULL_ADDR) || (blk_addr == NEW_ADDR)) ? \ NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \ @@ -377,26 +380,12 @@ static inline void get_sit_bitmap(struct f2fs_sb_info *sbi, static inline block_t written_block_count(struct f2fs_sb_info *sbi) { - struct sit_info *sit_i = SIT_I(sbi); - block_t vblocks; - - mutex_lock(&sit_i->sentry_lock); - vblocks = sit_i->written_valid_blocks; - mutex_unlock(&sit_i->sentry_lock); - - return vblocks; + return SIT_I(sbi)->written_valid_blocks; } static inline unsigned int free_segments(struct f2fs_sb_info *sbi) { - struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int free_segs; - - read_lock(&free_i->segmap_lock); - free_segs = free_i->free_segments; - read_unlock(&free_i->segmap_lock); - - return free_segs; + return FREE_I(sbi)->free_segments; } static inline int reserved_segments(struct f2fs_sb_info *sbi) @@ -406,14 +395,7 @@ static inline int reserved_segments(struct f2fs_sb_info *sbi) static inline unsigned int free_sections(struct f2fs_sb_info *sbi) { - struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int free_secs; - - read_lock(&free_i->segmap_lock); - free_secs = free_i->free_sections; - read_unlock(&free_i->segmap_lock); - - return free_secs; + return FREE_I(sbi)->free_sections; } static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi) @@ -682,3 +664,46 @@ static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi) struct request_queue *q = bdev_get_queue(bdev); return SECTOR_TO_BLOCK(sbi, queue_max_sectors(q)); } + +/* + * It is very important to gather dirty pages and write at once, so that we can + * submit a big bio without interfering other data writes. + * By default, 512 pages for directory data, + * 512 pages (2MB) * 3 for three types of nodes, and + * max_bio_blocks for meta are set. + */ +static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) +{ + if (type == DATA) + return sbi->blocks_per_seg; + else if (type == NODE) + return 3 * sbi->blocks_per_seg; + else if (type == META) + return MAX_BIO_BLOCKS(max_hw_blocks(sbi)); + else + return 0; +} + +/* + * When writing pages, it'd better align nr_to_write for segment size. + */ +static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, + struct writeback_control *wbc) +{ + long nr_to_write, desired; + + if (wbc->sync_mode != WB_SYNC_NONE) + return 0; + + nr_to_write = wbc->nr_to_write; + + if (type == DATA) + desired = 4096; + else if (type == NODE) + desired = 3 * max_hw_blocks(sbi); + else + desired = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); + + wbc->nr_to_write = desired; + return desired - nr_to_write; +} diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 856bdf994c0a..c756923a7302 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -51,6 +51,7 @@ enum { Opt_disable_ext_identify, Opt_inline_xattr, Opt_inline_data, + Opt_flush_merge, Opt_err, }; @@ -67,6 +68,7 @@ static match_table_t f2fs_tokens = { {Opt_disable_ext_identify, "disable_ext_identify"}, {Opt_inline_xattr, "inline_xattr"}, {Opt_inline_data, "inline_data"}, + {Opt_flush_merge, "flush_merge"}, {Opt_err, NULL}, }; @@ -74,6 +76,7 @@ static match_table_t f2fs_tokens = { enum { GC_THREAD, /* struct f2fs_gc_thread */ SM_INFO, /* struct f2fs_sm_info */ + NM_INFO, /* struct f2fs_nm_info */ F2FS_SBI, /* struct f2fs_sb_info */ }; @@ -92,6 +95,8 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) return (unsigned char *)sbi->gc_thread; else if (struct_type == SM_INFO) return (unsigned char *)SM_I(sbi); + else if (struct_type == NM_INFO) + return (unsigned char *)NM_I(sbi); else if (struct_type == F2FS_SBI) return (unsigned char *)sbi; return NULL; @@ -183,7 +188,9 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -196,6 +203,8 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), ATTR_LIST(max_victim_search), + ATTR_LIST(dir_level), + ATTR_LIST(ram_thresh), NULL, }; @@ -256,9 +265,9 @@ static int parse_options(struct super_block *sb, char *options) if (!name) return -ENOMEM; - if (!strncmp(name, "on", 2)) + if (strlen(name) == 2 && !strncmp(name, "on", 2)) set_opt(sbi, BG_GC); - else if (!strncmp(name, "off", 3)) + else if (strlen(name) == 3 && !strncmp(name, "off", 3)) clear_opt(sbi, BG_GC); else { kfree(name); @@ -327,6 +336,9 @@ static int parse_options(struct super_block *sb, char *options) case Opt_inline_data: set_opt(sbi, INLINE_DATA); break; + case Opt_flush_merge: + set_opt(sbi, FLUSH_MERGE); + break; default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -353,12 +365,16 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) fi->i_current_depth = 1; fi->i_advise = 0; rwlock_init(&fi->ext.ext_lock); + init_rwsem(&fi->i_sem); set_inode_flag(fi, FI_NEW_INODE); if (test_opt(F2FS_SB(sb), INLINE_XATTR)) set_inode_flag(fi, FI_INLINE_XATTR); + /* Will be used by directory only */ + fi->i_dir_level = F2FS_SB(sb)->dir_level; + return &fi->vfs_inode; } @@ -526,6 +542,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",disable_ext_identify"); if (test_opt(sbi, INLINE_DATA)) seq_puts(seq, ",inline_data"); + if (test_opt(sbi, FLUSH_MERGE)) + seq_puts(seq, ",flush_merge"); seq_printf(seq, ",active_logs=%u", sbi->active_logs); return 0; @@ -539,13 +557,22 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset) le32_to_cpu(sbi->raw_super->segment_count_main); int i; + seq_puts(seq, "format: segment_type|valid_blocks\n" + "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); + for (i = 0; i < total_segs; i++) { - seq_printf(seq, "%u", get_valid_blocks(sbi, i, 1)); - if (i != 0 && (i % 10) == 0) - seq_puts(seq, "\n"); + struct seg_entry *se = get_seg_entry(sbi, i); + + if ((i % 10) == 0) + seq_printf(seq, "%-5d", i); + seq_printf(seq, "%d|%-3u", se->type, + get_valid_blocks(sbi, i, 1)); + if ((i % 10) == 9 || i == (total_segs - 1)) + seq_putc(seq, '\n'); else - seq_puts(seq, " "); + seq_putc(seq, ' '); } + return 0; } @@ -640,6 +667,8 @@ static struct inode *f2fs_nfs_get_inode(struct super_block *sb, if (unlikely(ino < F2FS_ROOT_INO(sbi))) return ERR_PTR(-ESTALE); + if (unlikely(ino >= NM_I(sbi)->max_nid)) + return ERR_PTR(-ESTALE); /* * f2fs_iget isn't quite right if the inode is currently unallocated! @@ -787,6 +816,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi) for (i = 0; i < NR_COUNT_TYPE; i++) atomic_set(&sbi->nr_pages[i], 0); + + sbi->dir_level = DEF_DIR_LEVEL; } /* @@ -898,11 +929,11 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) sbi->por_doing = false; spin_lock_init(&sbi->stat_lock); - mutex_init(&sbi->read_io.io_mutex); + init_rwsem(&sbi->read_io.io_rwsem); sbi->read_io.sbi = sbi; sbi->read_io.bio = NULL; for (i = 0; i < NR_PAGE_TYPE; i++) { - mutex_init(&sbi->write_io[i].io_mutex); + init_rwsem(&sbi->write_io[i].io_rwsem); sbi->write_io[i].sbi = sbi; sbi->write_io[i].bio = NULL; } @@ -991,28 +1022,9 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) goto free_root_inode; } - /* recover fsynced data */ - if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { - err = recover_fsync_data(sbi); - if (err) - f2fs_msg(sb, KERN_ERR, - "Cannot recover all fsync data errno=%ld", err); - } - - /* - * If filesystem is not mounted as read-only then - * do start the gc_thread. - */ - if (!(sb->s_flags & MS_RDONLY)) { - /* After POR, we can run background GC thread.*/ - err = start_gc_thread(sbi); - if (err) - goto free_gc; - } - err = f2fs_build_stats(sbi); if (err) - goto free_gc; + goto free_root_inode; if (f2fs_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); @@ -1034,17 +1046,36 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) err = kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL, "%s", sb->s_id); if (err) - goto fail; + goto free_proc; + + /* recover fsynced data */ + if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { + err = recover_fsync_data(sbi); + if (err) + f2fs_msg(sb, KERN_ERR, + "Cannot recover all fsync data errno=%ld", err); + } + /* + * If filesystem is not mounted as read-only then + * do start the gc_thread. + */ + if (!(sb->s_flags & MS_RDONLY)) { + /* After POR, we can run background GC thread.*/ + err = start_gc_thread(sbi); + if (err) + goto free_kobj; + } return 0; -fail: + +free_kobj: + kobject_del(&sbi->s_kobj); +free_proc: if (sbi->s_proc) { remove_proc_entry("segment_info", sbi->s_proc); remove_proc_entry(sb->s_id, f2fs_proc_root); } f2fs_destroy_stats(sbi); -free_gc: - stop_gc_thread(sbi); free_root_inode: dput(sb->s_root); sb->s_root = NULL; @@ -1084,7 +1115,7 @@ MODULE_ALIAS_FS("f2fs"); static int __init init_inodecache(void) { f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache", - sizeof(struct f2fs_inode_info), NULL); + sizeof(struct f2fs_inode_info)); if (!f2fs_inode_cachep) return -ENOMEM; return 0; diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 89d0422a91a8..503c2451131e 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -275,7 +275,7 @@ static void *read_all_xattrs(struct inode *inode, struct page *ipage) inline_size = inline_xattr_size(inode); - txattr_addr = kzalloc(inline_size + size, GFP_KERNEL); + txattr_addr = kzalloc(inline_size + size, GFP_F2FS_ZERO); if (!txattr_addr) return NULL; @@ -407,6 +407,8 @@ int f2fs_getxattr(struct inode *inode, int name_index, const char *name, if (name == NULL) return -EINVAL; name_len = strlen(name); + if (name_len > F2FS_NAME_LEN) + return -ERANGE; base_addr = read_all_xattrs(inode, NULL); if (!base_addr) @@ -590,7 +592,10 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, f2fs_balance_fs(sbi); f2fs_lock_op(sbi); + /* protect xattr_ver */ + down_write(&F2FS_I(inode)->i_sem); err = __f2fs_setxattr(inode, name_index, name, value, value_len, ipage); + up_write(&F2FS_I(inode)->i_sem); f2fs_unlock_op(sbi); return err; diff --git a/fs/file.c b/fs/file.c index b61293badfb1..8f294cfac697 100644 --- a/fs/file.c +++ b/fs/file.c @@ -25,7 +25,10 @@ int sysctl_nr_open __read_mostly = 1024*1024; int sysctl_nr_open_min = BITS_PER_LONG; -int sysctl_nr_open_max = 1024 * 1024; /* raised later */ +/* our max() is unusable in constant expressions ;-/ */ +#define __const_max(x, y) ((x) < (y) ? (x) : (y)) +int sysctl_nr_open_max = __const_max(INT_MAX, ~(size_t)0/sizeof(void *)) & + -BITS_PER_LONG; static void *alloc_fdmem(size_t size) { @@ -429,12 +432,6 @@ void exit_files(struct task_struct *tsk) } } -void __init files_defer_init(void) -{ - sysctl_nr_open_max = min((size_t)INT_MAX, ~(size_t)0/sizeof(void *)) & - -BITS_PER_LONG; -} - struct files_struct init_files = { .count = ATOMIC_INIT(1), .fdt = &init_files.fdtab, diff --git a/fs/file_table.c b/fs/file_table.c index 01071c4d752e..a374f5033e97 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -52,7 +52,6 @@ static void file_free_rcu(struct rcu_head *head) static inline void file_free(struct file *f) { percpu_counter_dec(&nr_files); - file_check_state(f); call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); } @@ -178,47 +177,12 @@ struct file *alloc_file(struct path *path, fmode_t mode, file->f_mapping = path->dentry->d_inode->i_mapping; file->f_mode = mode; file->f_op = fop; - - /* - * These mounts don't really matter in practice - * for r/o bind mounts. They aren't userspace- - * visible. We do this for consistency, and so - * that we can do debugging checks at __fput() - */ - if ((mode & FMODE_WRITE) && !special_file(path->dentry->d_inode->i_mode)) { - file_take_write(file); - WARN_ON(mnt_clone_write(path->mnt)); - } if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) i_readcount_inc(path->dentry->d_inode); return file; } EXPORT_SYMBOL(alloc_file); -/** - * drop_file_write_access - give up ability to write to a file - * @file: the file to which we will stop writing - * - * This is a central place which will give up the ability - * to write to @file, along with access to write through - * its vfsmount. - */ -static void drop_file_write_access(struct file *file) -{ - struct vfsmount *mnt = file->f_path.mnt; - struct dentry *dentry = file->f_path.dentry; - struct inode *inode = dentry->d_inode; - - put_write_access(inode); - - if (special_file(inode->i_mode)) - return; - if (file_check_writeable(file) != 0) - return; - __mnt_drop_write(mnt); - file_release_write(file); -} - /* the real guts of fput() - releasing the last reference to file */ static void __fput(struct file *file) @@ -253,8 +217,10 @@ static void __fput(struct file *file) put_pid(file->f_owner.pid); if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) i_readcount_dec(inode); - if (file->f_mode & FMODE_WRITE) - drop_file_write_access(file); + if (file->f_mode & FMODE_WRITER) { + put_write_access(inode); + __mnt_drop_write(mnt); + } file->f_path.dentry = NULL; file->f_path.mnt = NULL; file->f_inode = NULL; @@ -359,6 +325,5 @@ void __init files_init(unsigned long mempages) n = (mempages * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = max_t(unsigned long, n, NR_FILE); - files_defer_init(); percpu_counter_init(&nr_files, 0); } diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 23e363f38302..13b691a8a7d2 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -569,7 +569,7 @@ static ssize_t cuse_class_waiting_show(struct device *dev, return sprintf(buf, "%d\n", atomic_read(&cc->fc.num_waiting)); } -static DEVICE_ATTR(waiting, S_IFREG | 0400, cuse_class_waiting_show, NULL); +static DEVICE_ATTR(waiting, 0400, cuse_class_waiting_show, NULL); static ssize_t cuse_class_abort_store(struct device *dev, struct device_attribute *attr, @@ -580,7 +580,7 @@ static ssize_t cuse_class_abort_store(struct device *dev, fuse_abort_conn(&cc->fc); return count; } -static DEVICE_ATTR(abort, S_IFREG | 0200, NULL, cuse_class_abort_store); +static DEVICE_ATTR(abort, 0200, NULL, cuse_class_abort_store); static struct attribute *cuse_class_dev_attrs[] = { &dev_attr_waiting.attr, diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 0a648bb455ae..aac71ce373e4 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -667,15 +667,15 @@ static void fuse_copy_finish(struct fuse_copy_state *cs) struct pipe_buffer *buf = cs->currbuf; if (!cs->write) { - buf->ops->unmap(cs->pipe, buf, cs->mapaddr); + kunmap_atomic(cs->mapaddr); } else { - kunmap(buf->page); + kunmap_atomic(cs->mapaddr); buf->len = PAGE_SIZE - cs->len; } cs->currbuf = NULL; cs->mapaddr = NULL; } else if (cs->mapaddr) { - kunmap(cs->pg); + kunmap_atomic(cs->mapaddr); if (cs->write) { flush_dcache_page(cs->pg); set_page_dirty_lock(cs->pg); @@ -706,7 +706,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) BUG_ON(!cs->nr_segs); cs->currbuf = buf; - cs->mapaddr = buf->ops->map(cs->pipe, buf, 0); + cs->mapaddr = kmap_atomic(buf->page); cs->len = buf->len; cs->buf = cs->mapaddr + buf->offset; cs->pipebufs++; @@ -726,7 +726,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) buf->len = 0; cs->currbuf = buf; - cs->mapaddr = kmap(page); + cs->mapaddr = kmap_atomic(page); cs->buf = cs->mapaddr; cs->len = PAGE_SIZE; cs->pipebufs++; @@ -745,7 +745,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) return err; BUG_ON(err != 1); offset = cs->addr % PAGE_SIZE; - cs->mapaddr = kmap(cs->pg); + cs->mapaddr = kmap_atomic(cs->pg); cs->buf = cs->mapaddr + offset; cs->len = min(PAGE_SIZE - offset, cs->seglen); cs->seglen -= cs->len; @@ -874,7 +874,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) out_fallback_unlock: unlock_page(newpage); out_fallback: - cs->mapaddr = buf->ops->map(cs->pipe, buf, 1); + cs->mapaddr = kmap_atomic(buf->page); cs->buf = cs->mapaddr + buf->offset; err = lock_request(cs->fc, cs->req); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 65df7d8be4f5..13f8bdec5110 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1086,9 +1086,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, if (mapping_writably_mapped(mapping)) flush_dcache_page(page); - pagefault_disable(); tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes); - pagefault_enable(); flush_dcache_page(page); mark_page_accessed(page); @@ -1237,8 +1235,7 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, goto out; if (file->f_flags & O_DIRECT) { - written = generic_file_direct_write(iocb, iov, &nr_segs, - pos, &iocb->ki_pos, + written = generic_file_direct_write(iocb, iov, &nr_segs, pos, count, ocount); if (written < 0 || written == count) goto out; @@ -2117,6 +2114,7 @@ static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) static const struct vm_operations_struct fuse_file_vm_ops = { .close = fuse_vma_close, .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = fuse_page_mkwrite, .remap_pages = generic_file_remap_pages, }; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 6c794085abac..80d67253623c 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -494,6 +494,7 @@ out: static const struct vm_operations_struct gfs2_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = gfs2_page_mkwrite, .remap_pages = generic_file_remap_pages, }; diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 6af66ee56390..4556ce1af5b0 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -93,7 +93,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -static int init_inodecache(void) +static int __init init_inodecache(void) { isofs_inode_cachep = kmem_cache_create("isofs_inode_cache", sizeof(struct iso_inode_info), diff --git a/fs/jffs2/compr_rtime.c b/fs/jffs2/compr_rtime.c index 16a5047903a6..406d9cc84ba8 100644 --- a/fs/jffs2/compr_rtime.c +++ b/fs/jffs2/compr_rtime.c @@ -33,7 +33,7 @@ static int jffs2_rtime_compress(unsigned char *data_in, unsigned char *cpage_out, uint32_t *sourcelen, uint32_t *dstlen) { - short positions[256]; + unsigned short positions[256]; int outpos = 0; int pos=0; @@ -74,7 +74,7 @@ static int jffs2_rtime_decompress(unsigned char *data_in, unsigned char *cpage_out, uint32_t srclen, uint32_t destlen) { - short positions[256]; + unsigned short positions[256]; int outpos = 0; int pos=0; diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index f73991522672..601afd1afddf 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -457,12 +457,14 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r The umask is only applied if there's no default ACL */ ret = jffs2_init_acl_pre(dir_i, inode, &mode); if (ret) { - make_bad_inode(inode); - iput(inode); - return ERR_PTR(ret); + mutex_unlock(&f->sem); + make_bad_inode(inode); + iput(inode); + return ERR_PTR(ret); } ret = jffs2_do_new_inode (c, f, mode, ri); if (ret) { + mutex_unlock(&f->sem); make_bad_inode(inode); iput(inode); return ERR_PTR(ret); @@ -479,6 +481,7 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r inode->i_size = 0; if (insert_inode_locked(inode) < 0) { + mutex_unlock(&f->sem); make_bad_inode(inode); iput(inode); return ERR_PTR(-EINVAL); diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h index e4619b00f7c5..fa35ff79ab35 100644 --- a/fs/jffs2/nodelist.h +++ b/fs/jffs2/nodelist.h @@ -231,7 +231,7 @@ struct jffs2_tmp_dnode_info uint32_t version; uint32_t data_crc; uint32_t partial_crc; - uint16_t csize; + uint32_t csize; uint16_t overlapped; }; diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c index 03310721712f..b6bd4affd9ad 100644 --- a/fs/jffs2/nodemgmt.c +++ b/fs/jffs2/nodemgmt.c @@ -179,6 +179,7 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, spin_unlock(&c->erase_completion_lock); schedule(); + remove_wait_queue(&c->erase_wait, &wait); } else spin_unlock(&c->erase_completion_lock); } else if (ret) @@ -211,20 +212,25 @@ out: int jffs2_reserve_space_gc(struct jffs2_sb_info *c, uint32_t minsize, uint32_t *len, uint32_t sumsize) { - int ret = -EAGAIN; + int ret; minsize = PAD(minsize); jffs2_dbg(1, "%s(): Requested 0x%x bytes\n", __func__, minsize); - spin_lock(&c->erase_completion_lock); - while(ret == -EAGAIN) { + while (true) { + spin_lock(&c->erase_completion_lock); ret = jffs2_do_reserve_space(c, minsize, len, sumsize); if (ret) { jffs2_dbg(1, "%s(): looping, ret is %d\n", __func__, ret); } + spin_unlock(&c->erase_completion_lock); + + if (ret == -EAGAIN) + cond_resched(); + else + break; } - spin_unlock(&c->erase_completion_lock); if (!ret) ret = jffs2_prealloc_raw_node_refs(c, c->nextblock, 1); diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index abb0f1f53d93..985217626e66 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -48,14 +48,18 @@ void __init kernfs_inode_init(void) static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn) { + static DEFINE_MUTEX(iattr_mutex); + struct kernfs_iattrs *ret; struct iattr *iattrs; + mutex_lock(&iattr_mutex); + if (kn->iattr) - return kn->iattr; + goto out_unlock; kn->iattr = kzalloc(sizeof(struct kernfs_iattrs), GFP_KERNEL); if (!kn->iattr) - return NULL; + goto out_unlock; iattrs = &kn->iattr->ia_iattr; /* assign default attributes */ @@ -65,8 +69,10 @@ static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn) iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME; simple_xattrs_init(&kn->iattr->xattrs); - - return kn->iattr; +out_unlock: + ret = kn->iattr; + mutex_unlock(&iattr_mutex); + return ret; } static int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 10d6c41aecad..6bf06a07f3e0 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -235,6 +235,7 @@ out_err: if (warned++ == 0) printk(KERN_WARNING "lockd_up: makesock failed, error=%d\n", err); + svc_shutdown_net(serv, net); return err; } diff --git a/fs/mount.h b/fs/mount.h index b29e42f05f34..d55297f2fa05 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -10,7 +10,7 @@ struct mnt_namespace { struct user_namespace *user_ns; u64 seq; /* Sequence number to prevent loops */ wait_queue_head_t poll; - int event; + u64 event; }; struct mnt_pcp { @@ -104,6 +104,9 @@ struct proc_mounts { struct mnt_namespace *ns; struct path root; int (*show)(struct seq_file *, struct vfsmount *); + void *cached_mount; + u64 cached_event; + loff_t cached_index; }; #define proc_mounts(p) (container_of((p), struct proc_mounts, m)) diff --git a/fs/namei.c b/fs/namei.c index 88339f59efb5..c6157c894fce 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -358,6 +358,7 @@ int generic_permission(struct inode *inode, int mask) return -EACCES; } +EXPORT_SYMBOL(generic_permission); /* * We _really_ want to just do "generic_permission()" without @@ -455,6 +456,7 @@ int inode_permission(struct inode *inode, int mask) return retval; return __inode_permission(inode, mask); } +EXPORT_SYMBOL(inode_permission); /** * path_get - get a reference to a path @@ -924,6 +926,7 @@ int follow_up(struct path *path) path->mnt = &parent->mnt; return 1; } +EXPORT_SYMBOL(follow_up); /* * Perform an automount @@ -1085,6 +1088,7 @@ int follow_down_one(struct path *path) } return 0; } +EXPORT_SYMBOL(follow_down_one); static inline bool managed_dentry_might_block(struct dentry *dentry) { @@ -1223,6 +1227,7 @@ int follow_down(struct path *path) } return 0; } +EXPORT_SYMBOL(follow_down); /* * Skip to top of mountpoint pile in refwalk mode for follow_dotdot() @@ -2025,6 +2030,7 @@ int kern_path(const char *name, unsigned int flags, struct path *path) *path = nd.path; return res; } +EXPORT_SYMBOL(kern_path); /** * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair @@ -2049,6 +2055,7 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, *path = nd.path; return err; } +EXPORT_SYMBOL(vfs_path_lookup); /* * Restricted form of lookup. Doesn't follow links, single-component only, @@ -2111,6 +2118,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) return __lookup_hash(&this, base, 0); } +EXPORT_SYMBOL(lookup_one_len); int user_path_at_empty(int dfd, const char __user *name, unsigned flags, struct path *path, int *empty) @@ -2135,6 +2143,7 @@ int user_path_at(int dfd, const char __user *name, unsigned flags, { return user_path_at_empty(dfd, name, flags, path, NULL); } +EXPORT_SYMBOL(user_path_at); /* * NB: most callers don't do anything directly with the reference to the @@ -2477,6 +2486,7 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD); return NULL; } +EXPORT_SYMBOL(lock_rename); void unlock_rename(struct dentry *p1, struct dentry *p2) { @@ -2486,6 +2496,7 @@ void unlock_rename(struct dentry *p1, struct dentry *p2) mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex); } } +EXPORT_SYMBOL(unlock_rename); int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool want_excl) @@ -2506,6 +2517,7 @@ int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, fsnotify_create(dir, dentry); return error; } +EXPORT_SYMBOL(vfs_create); static int may_open(struct path *path, int acc_mode, int flag) { @@ -3375,6 +3387,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) fsnotify_create(dir, dentry); return error; } +EXPORT_SYMBOL(vfs_mknod); static int may_mknod(umode_t mode) { @@ -3464,6 +3477,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) fsnotify_mkdir(dir, dentry); return error; } +EXPORT_SYMBOL(vfs_mkdir); SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode) { @@ -3518,6 +3532,7 @@ void dentry_unhash(struct dentry *dentry) __d_drop(dentry); spin_unlock(&dentry->d_lock); } +EXPORT_SYMBOL(dentry_unhash); int vfs_rmdir(struct inode *dir, struct dentry *dentry) { @@ -3555,6 +3570,7 @@ out: d_delete(dentry); return error; } +EXPORT_SYMBOL(vfs_rmdir); static long do_rmdir(int dfd, const char __user *pathname) { @@ -3672,6 +3688,7 @@ out: return error; } +EXPORT_SYMBOL(vfs_unlink); /* * Make sure that the actual truncation of the file will occur outside its @@ -3785,6 +3802,7 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) fsnotify_create(dir, dentry); return error; } +EXPORT_SYMBOL(vfs_symlink); SYSCALL_DEFINE3(symlinkat, const char __user *, oldname, int, newdfd, const char __user *, newname) @@ -3893,6 +3911,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de fsnotify_link(dir, inode, new_dentry); return error; } +EXPORT_SYMBOL(vfs_link); /* * Hardlinks are often used in delicate situations. We avoid @@ -4152,6 +4171,7 @@ out: return error; } +EXPORT_SYMBOL(vfs_rename); SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, int, newdfd, const char __user *, newname, unsigned int, flags) @@ -4304,11 +4324,9 @@ SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newna return sys_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0); } -int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const char *link) +int readlink_copy(char __user *buffer, int buflen, const char *link) { - int len; - - len = PTR_ERR(link); + int len = PTR_ERR(link); if (IS_ERR(link)) goto out; @@ -4320,6 +4338,7 @@ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const c out: return len; } +EXPORT_SYMBOL(readlink_copy); /* * A helper for ->readlink(). This should be used *ONLY* for symlinks that @@ -4337,11 +4356,12 @@ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen) if (IS_ERR(cookie)) return PTR_ERR(cookie); - res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd)); + res = readlink_copy(buffer, buflen, nd_get_link(&nd)); if (dentry->d_inode->i_op->put_link) dentry->d_inode->i_op->put_link(dentry, &nd, cookie); return res; } +EXPORT_SYMBOL(generic_readlink); /* get the link contents into pagecache */ static char *page_getlink(struct dentry * dentry, struct page **ppage) @@ -4361,14 +4381,14 @@ static char *page_getlink(struct dentry * dentry, struct page **ppage) int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) { struct page *page = NULL; - char *s = page_getlink(dentry, &page); - int res = vfs_readlink(dentry,buffer,buflen,s); + int res = readlink_copy(buffer, buflen, page_getlink(dentry, &page)); if (page) { kunmap(page); page_cache_release(page); } return res; } +EXPORT_SYMBOL(page_readlink); void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd) { @@ -4376,6 +4396,7 @@ void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd) nd_set_link(nd, page_getlink(dentry, &page)); return page; } +EXPORT_SYMBOL(page_follow_link_light); void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) { @@ -4386,6 +4407,7 @@ void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) page_cache_release(page); } } +EXPORT_SYMBOL(page_put_link); /* * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS @@ -4423,45 +4445,18 @@ retry: fail: return err; } +EXPORT_SYMBOL(__page_symlink); int page_symlink(struct inode *inode, const char *symname, int len) { return __page_symlink(inode, symname, len, !(mapping_gfp_mask(inode->i_mapping) & __GFP_FS)); } +EXPORT_SYMBOL(page_symlink); const struct inode_operations page_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, .put_link = page_put_link, }; - -EXPORT_SYMBOL(user_path_at); -EXPORT_SYMBOL(follow_down_one); -EXPORT_SYMBOL(follow_down); -EXPORT_SYMBOL(follow_up); -EXPORT_SYMBOL(get_write_access); /* nfsd */ -EXPORT_SYMBOL(lock_rename); -EXPORT_SYMBOL(lookup_one_len); -EXPORT_SYMBOL(page_follow_link_light); -EXPORT_SYMBOL(page_put_link); -EXPORT_SYMBOL(page_readlink); -EXPORT_SYMBOL(__page_symlink); -EXPORT_SYMBOL(page_symlink); EXPORT_SYMBOL(page_symlink_inode_operations); -EXPORT_SYMBOL(kern_path); -EXPORT_SYMBOL(vfs_path_lookup); -EXPORT_SYMBOL(inode_permission); -EXPORT_SYMBOL(unlock_rename); -EXPORT_SYMBOL(vfs_create); -EXPORT_SYMBOL(vfs_link); -EXPORT_SYMBOL(vfs_mkdir); -EXPORT_SYMBOL(vfs_mknod); -EXPORT_SYMBOL(generic_permission); -EXPORT_SYMBOL(vfs_readlink); -EXPORT_SYMBOL(vfs_rename); -EXPORT_SYMBOL(vfs_rmdir); -EXPORT_SYMBOL(vfs_symlink); -EXPORT_SYMBOL(vfs_unlink); -EXPORT_SYMBOL(dentry_unhash); -EXPORT_SYMBOL(generic_readlink); diff --git a/fs/namespace.c b/fs/namespace.c index 2ffc5a2905d4..182bc41cd887 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -52,7 +52,7 @@ static int __init set_mphash_entries(char *str) } __setup("mphash_entries=", set_mphash_entries); -static int event; +static u64 event; static DEFINE_IDA(mnt_id_ida); static DEFINE_IDA(mnt_group_ida); static DEFINE_SPINLOCK(mnt_id_lock); @@ -414,9 +414,7 @@ EXPORT_SYMBOL_GPL(mnt_clone_write); */ int __mnt_want_write_file(struct file *file) { - struct inode *inode = file_inode(file); - - if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode)) + if (!(file->f_mode & FMODE_WRITER)) return __mnt_want_write(file->f_path.mnt); else return mnt_clone_write(file->f_path.mnt); @@ -570,13 +568,17 @@ int sb_prepare_remount_readonly(struct super_block *sb) static void free_vfsmnt(struct mount *mnt) { kfree(mnt->mnt_devname); - mnt_free_id(mnt); #ifdef CONFIG_SMP free_percpu(mnt->mnt_pcp); #endif kmem_cache_free(mnt_cache, mnt); } +static void delayed_free_vfsmnt(struct rcu_head *head) +{ + free_vfsmnt(container_of(head, struct mount, mnt_rcu)); +} + /* call under rcu_read_lock */ bool legitimize_mnt(struct vfsmount *bastard, unsigned seq) { @@ -848,6 +850,7 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void root = mount_fs(type, flags, name, data); if (IS_ERR(root)) { + mnt_free_id(mnt); free_vfsmnt(mnt); return ERR_CAST(root); } @@ -885,7 +888,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, goto out_free; } - mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~MNT_WRITE_HOLD; + mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED); /* Don't allow unprivileged users to change mount flags */ if ((flag & CL_UNPRIVILEGED) && (mnt->mnt.mnt_flags & MNT_READONLY)) mnt->mnt.mnt_flags |= MNT_LOCK_READONLY; @@ -928,20 +931,11 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, return mnt; out_free: + mnt_free_id(mnt); free_vfsmnt(mnt); return ERR_PTR(err); } -static void delayed_free(struct rcu_head *head) -{ - struct mount *mnt = container_of(head, struct mount, mnt_rcu); - kfree(mnt->mnt_devname); -#ifdef CONFIG_SMP - free_percpu(mnt->mnt_pcp); -#endif - kmem_cache_free(mnt_cache, mnt); -} - static void mntput_no_expire(struct mount *mnt) { put_again: @@ -991,7 +985,7 @@ put_again: dput(mnt->mnt.mnt_root); deactivate_super(mnt->mnt.mnt_sb); mnt_free_id(mnt); - call_rcu(&mnt->mnt_rcu, delayed_free); + call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt); } void mntput(struct vfsmount *mnt) @@ -1100,14 +1094,29 @@ static void *m_start(struct seq_file *m, loff_t *pos) struct proc_mounts *p = proc_mounts(m); down_read(&namespace_sem); - return seq_list_start(&p->ns->list, *pos); + if (p->cached_event == p->ns->event) { + void *v = p->cached_mount; + if (*pos == p->cached_index) + return v; + if (*pos == p->cached_index + 1) { + v = seq_list_next(v, &p->ns->list, &p->cached_index); + return p->cached_mount = v; + } + } + + p->cached_event = p->ns->event; + p->cached_mount = seq_list_start(&p->ns->list, *pos); + p->cached_index = *pos; + return p->cached_mount; } static void *m_next(struct seq_file *m, void *v, loff_t *pos) { struct proc_mounts *p = proc_mounts(m); - return seq_list_next(v, &p->ns->list, pos); + p->cached_mount = seq_list_next(v, &p->ns->list, pos); + p->cached_index = *pos; + return p->cached_mount; } static void m_stop(struct seq_file *m, void *v) @@ -1661,9 +1670,9 @@ static int attach_recursive_mnt(struct mount *source_mnt, if (err) goto out; err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list); + lock_mount_hash(); if (err) goto out_cleanup_ids; - lock_mount_hash(); for (p = source_mnt; p; p = next_mnt(p, source_mnt)) set_mnt_shared(p); } else { @@ -1690,6 +1699,11 @@ static int attach_recursive_mnt(struct mount *source_mnt, return 0; out_cleanup_ids: + while (!hlist_empty(&tree_list)) { + child = hlist_entry(tree_list.first, struct mount, mnt_hash); + umount_tree(child, 0); + } + unlock_mount_hash(); cleanup_group_ids(source_mnt, NULL); out: return err; @@ -2044,7 +2058,7 @@ static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags) struct mount *parent; int err; - mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | MNT_DOOMED | MNT_SYNC_UMOUNT); + mnt_flags &= ~MNT_INTERNAL_FLAGS; mp = lock_mount(path); if (IS_ERR(mp)) diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index c320ac52353e..08b8ea8c353e 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -339,7 +339,7 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags) if (val) goto finished; - DDPRINTK("ncp_lookup_validate: %pd2 not valid, age=%ld, server lookup\n", + ncp_dbg(2, "%pd2 not valid, age=%ld, server lookup\n", dentry, NCP_GET_AGE(dentry)); len = sizeof(__name); @@ -358,7 +358,7 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags) res = ncp_obtain_info(server, dir, __name, &(finfo.i)); } finfo.volume = finfo.i.volNumber; - DDPRINTK("ncp_lookup_validate: looked for %pd/%s, res=%d\n", + ncp_dbg(2, "looked for %pd/%s, res=%d\n", dentry->d_parent, __name, res); /* * If we didn't find it, or if it has a different dirEntNum to @@ -372,14 +372,14 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags) ncp_new_dentry(dentry); val=1; } else - DDPRINTK("ncp_lookup_validate: found, but dirEntNum changed\n"); + ncp_dbg(2, "found, but dirEntNum changed\n"); ncp_update_inode2(inode, &finfo); mutex_unlock(&inode->i_mutex); } finished: - DDPRINTK("ncp_lookup_validate: result=%d\n", val); + ncp_dbg(2, "result=%d\n", val); dput(parent); return val; } @@ -453,8 +453,7 @@ static int ncp_readdir(struct file *file, struct dir_context *ctx) ctl.page = NULL; ctl.cache = NULL; - DDPRINTK("ncp_readdir: reading %pD2, pos=%d\n", file, - (int) ctx->pos); + ncp_dbg(2, "reading %pD2, pos=%d\n", file, (int)ctx->pos); result = -EIO; /* Do not generate '.' and '..' when server is dead. */ @@ -697,8 +696,7 @@ ncp_read_volume_list(struct file *file, struct dir_context *ctx, struct ncp_entry_info entry; int i; - DPRINTK("ncp_read_volume_list: pos=%ld\n", - (unsigned long) ctx->pos); + ncp_dbg(1, "pos=%ld\n", (unsigned long)ctx->pos); for (i = 0; i < NCP_NUMBER_OF_VOLUMES; i++) { int inval_dentry; @@ -708,12 +706,11 @@ ncp_read_volume_list(struct file *file, struct dir_context *ctx, if (!strlen(info.volume_name)) continue; - DPRINTK("ncp_read_volume_list: found vol: %s\n", - info.volume_name); + ncp_dbg(1, "found vol: %s\n", info.volume_name); if (ncp_lookup_volume(server, info.volume_name, &entry.i)) { - DPRINTK("ncpfs: could not lookup vol %s\n", + ncp_dbg(1, "could not lookup vol %s\n", info.volume_name); continue; } @@ -738,14 +735,13 @@ ncp_do_readdir(struct file *file, struct dir_context *ctx, int more; size_t bufsize; - DPRINTK("ncp_do_readdir: %pD2, fpos=%ld\n", file, - (unsigned long) ctx->pos); - PPRINTK("ncp_do_readdir: init %pD, volnum=%d, dirent=%u\n", - file, NCP_FINFO(dir)->volNumber, NCP_FINFO(dir)->dirEntNum); + ncp_dbg(1, "%pD2, fpos=%ld\n", file, (unsigned long)ctx->pos); + ncp_vdbg("init %pD, volnum=%d, dirent=%u\n", + file, NCP_FINFO(dir)->volNumber, NCP_FINFO(dir)->dirEntNum); err = ncp_initialize_search(server, dir, &seq); if (err) { - DPRINTK("ncp_do_readdir: init failed, err=%d\n", err); + ncp_dbg(1, "init failed, err=%d\n", err); return; } /* We MUST NOT use server->buffer_size handshaked with server if we are @@ -808,8 +804,7 @@ int ncp_conn_logged_in(struct super_block *sb) goto out; result = -ENOENT; if (ncp_get_volume_root(server, __name, &volNumber, &dirEntNum, &DosDirNum)) { - PPRINTK("ncp_conn_logged_in: %s not found\n", - server->m.mounted_vol); + ncp_vdbg("%s not found\n", server->m.mounted_vol); goto out; } dent = sb->s_root; @@ -822,10 +817,10 @@ int ncp_conn_logged_in(struct super_block *sb) NCP_FINFO(ino)->DosDirNum = DosDirNum; result = 0; } else { - DPRINTK("ncpfs: sb->s_root->d_inode == NULL!\n"); + ncp_dbg(1, "sb->s_root->d_inode == NULL!\n"); } } else { - DPRINTK("ncpfs: sb->s_root == NULL!\n"); + ncp_dbg(1, "sb->s_root == NULL!\n"); } } else result = 0; @@ -846,7 +841,7 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, unsig if (!ncp_conn_valid(server)) goto finished; - PPRINTK("ncp_lookup: server lookup for %pd2\n", dentry); + ncp_vdbg("server lookup for %pd2\n", dentry); len = sizeof(__name); if (ncp_is_server_root(dir)) { @@ -854,15 +849,15 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, unsig dentry->d_name.len, 1); if (!res) res = ncp_lookup_volume(server, __name, &(finfo.i)); - if (!res) - ncp_update_known_namespace(server, finfo.i.volNumber, NULL); + if (!res) + ncp_update_known_namespace(server, finfo.i.volNumber, NULL); } else { res = ncp_io2vol(server, __name, &len, dentry->d_name.name, dentry->d_name.len, !ncp_preserve_case(dir)); if (!res) res = ncp_obtain_info(server, dir, __name, &(finfo.i)); } - PPRINTK("ncp_lookup: looked for %pd2, res=%d\n", dentry, res); + ncp_vdbg("looked for %pd2, res=%d\n", dentry, res); /* * If we didn't find an entry, make a negative dentry. */ @@ -886,7 +881,7 @@ add_entry: } finished: - PPRINTK("ncp_lookup: result=%d\n", error); + ncp_vdbg("result=%d\n", error); return ERR_PTR(error); } @@ -909,7 +904,7 @@ out: return error; out_close: - PPRINTK("ncp_instantiate: %pd2 failed, closing file\n", dentry); + ncp_vdbg("%pd2 failed, closing file\n", dentry); ncp_close_file(NCP_SERVER(dir), finfo->file_handle); goto out; } @@ -923,7 +918,7 @@ int ncp_create_new(struct inode *dir, struct dentry *dentry, umode_t mode, int opmode; __u8 __name[NCP_MAXPATHLEN + 1]; - PPRINTK("ncp_create_new: creating %pd2, mode=%hx\n", dentry, mode); + ncp_vdbg("creating %pd2, mode=%hx\n", dentry, mode); ncp_age_dentry(server, dentry); len = sizeof(__name); @@ -952,7 +947,7 @@ int ncp_create_new(struct inode *dir, struct dentry *dentry, umode_t mode, error = -ENAMETOOLONG; else if (result < 0) error = result; - DPRINTK("ncp_create: %pd2 failed\n", dentry); + ncp_dbg(1, "%pd2 failed\n", dentry); goto out; } opmode = O_WRONLY; @@ -985,7 +980,7 @@ static int ncp_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) int error, len; __u8 __name[NCP_MAXPATHLEN + 1]; - DPRINTK("ncp_mkdir: making %pd2\n", dentry); + ncp_dbg(1, "making %pd2\n", dentry); ncp_age_dentry(server, dentry); len = sizeof(__name); @@ -1022,7 +1017,7 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry) int error, result, len; __u8 __name[NCP_MAXPATHLEN + 1]; - DPRINTK("ncp_rmdir: removing %pd2\n", dentry); + ncp_dbg(1, "removing %pd2\n", dentry); len = sizeof(__name); error = ncp_io2vol(server, __name, &len, dentry->d_name.name, @@ -1067,13 +1062,13 @@ static int ncp_unlink(struct inode *dir, struct dentry *dentry) int error; server = NCP_SERVER(dir); - DPRINTK("ncp_unlink: unlinking %pd2\n", dentry); + ncp_dbg(1, "unlinking %pd2\n", dentry); /* * Check whether to close the file ... */ if (inode) { - PPRINTK("ncp_unlink: closing file\n"); + ncp_vdbg("closing file\n"); ncp_make_closed(inode); } @@ -1087,7 +1082,7 @@ static int ncp_unlink(struct inode *dir, struct dentry *dentry) #endif switch (error) { case 0x00: - DPRINTK("ncp: removed %pd2\n", dentry); + ncp_dbg(1, "removed %pd2\n", dentry); break; case 0x85: case 0x8A: @@ -1120,7 +1115,7 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry, int old_len, new_len; __u8 __old_name[NCP_MAXPATHLEN + 1], __new_name[NCP_MAXPATHLEN + 1]; - DPRINTK("ncp_rename: %pd2 to %pd2\n", old_dentry, new_dentry); + ncp_dbg(1, "%pd2 to %pd2\n", old_dentry, new_dentry); ncp_age_dentry(server, old_dentry); ncp_age_dentry(server, new_dentry); @@ -1150,8 +1145,8 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry, #endif switch (error) { case 0x00: - DPRINTK("ncp renamed %pd -> %pd.\n", - old_dentry, new_dentry); + ncp_dbg(1, "renamed %pd -> %pd\n", + old_dentry, new_dentry); break; case 0x9E: error = -ENAMETOOLONG; @@ -1173,7 +1168,7 @@ static int ncp_mknod(struct inode * dir, struct dentry *dentry, if (!new_valid_dev(rdev)) return -EINVAL; if (ncp_is_nfs_extras(NCP_SERVER(dir), NCP_FINFO(dir)->volNumber)) { - DPRINTK(KERN_DEBUG "ncp_mknod: mode = 0%ho\n", mode); + ncp_dbg(1, "mode = 0%ho\n", mode); return ncp_create_new(dir, dentry, mode, rdev, 0); } return -EPERM; /* Strange, but true */ diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c index 8f5074e1ecb9..77640a8bfb87 100644 --- a/fs/ncpfs/file.c +++ b/fs/ncpfs/file.c @@ -6,6 +6,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <asm/uaccess.h> #include <linux/time.h> @@ -34,11 +36,11 @@ int ncp_make_open(struct inode *inode, int right) error = -EINVAL; if (!inode) { - printk(KERN_ERR "ncp_make_open: got NULL inode\n"); + pr_err("%s: got NULL inode\n", __func__); goto out; } - DPRINTK("ncp_make_open: opened=%d, volume # %u, dir entry # %u\n", + ncp_dbg(1, "opened=%d, volume # %u, dir entry # %u\n", atomic_read(&NCP_FINFO(inode)->opened), NCP_FINFO(inode)->volNumber, NCP_FINFO(inode)->dirEntNum); @@ -71,7 +73,7 @@ int ncp_make_open(struct inode *inode, int right) break; } if (result) { - PPRINTK("ncp_make_open: failed, result=%d\n", result); + ncp_vdbg("failed, result=%d\n", result); goto out_unlock; } /* @@ -83,7 +85,7 @@ int ncp_make_open(struct inode *inode, int right) } access = NCP_FINFO(inode)->access; - PPRINTK("ncp_make_open: file open, access=%x\n", access); + ncp_vdbg("file open, access=%x\n", access); if (access == right || access == O_RDWR) { atomic_inc(&NCP_FINFO(inode)->opened); error = 0; @@ -107,7 +109,7 @@ ncp_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) void* freepage; size_t freelen; - DPRINTK("ncp_file_read: enter %pd2\n", dentry); + ncp_dbg(1, "enter %pd2\n", dentry); pos = *ppos; @@ -124,7 +126,7 @@ ncp_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) error = ncp_make_open(inode, O_RDONLY); if (error) { - DPRINTK(KERN_ERR "ncp_file_read: open failed, error=%d\n", error); + ncp_dbg(1, "open failed, error=%d\n", error); return error; } @@ -165,7 +167,7 @@ ncp_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) file_accessed(file); - DPRINTK("ncp_file_read: exit %pd2\n", dentry); + ncp_dbg(1, "exit %pd2\n", dentry); outrel: ncp_inode_close(inode); return already_read ? already_read : error; @@ -182,7 +184,7 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t * int errno; void* bouncebuffer; - DPRINTK("ncp_file_write: enter %pd2\n", dentry); + ncp_dbg(1, "enter %pd2\n", dentry); if ((ssize_t) count < 0) return -EINVAL; pos = *ppos; @@ -211,7 +213,7 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t * return 0; errno = ncp_make_open(inode, O_WRONLY); if (errno) { - DPRINTK(KERN_ERR "ncp_file_write: open failed, error=%d\n", errno); + ncp_dbg(1, "open failed, error=%d\n", errno); return errno; } bufsize = NCP_SERVER(inode)->buffer_size; @@ -261,7 +263,7 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t * i_size_write(inode, pos); mutex_unlock(&inode->i_mutex); } - DPRINTK("ncp_file_write: exit %pd2\n", dentry); + ncp_dbg(1, "exit %pd2\n", dentry); outrel: ncp_inode_close(inode); return already_written ? already_written : errno; @@ -269,7 +271,7 @@ outrel: static int ncp_release(struct inode *inode, struct file *file) { if (ncp_make_closed(inode)) { - DPRINTK("ncp_release: failed to close\n"); + ncp_dbg(1, "failed to close\n"); } return 0; } diff --git a/fs/ncpfs/getopt.c b/fs/ncpfs/getopt.c index 0af3349de851..03ffde1f44d6 100644 --- a/fs/ncpfs/getopt.c +++ b/fs/ncpfs/getopt.c @@ -2,6 +2,8 @@ * getopt.c */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/string.h> @@ -46,8 +48,8 @@ int ncp_getopt(const char *caller, char **options, const struct ncp_option *opts if (opts->has_arg & OPT_NOPARAM) { return opts->val; } - printk(KERN_INFO "%s: the %s option requires an argument\n", - caller, token); + pr_info("%s: the %s option requires an argument\n", + caller, token); return -EINVAL; } if (opts->has_arg & OPT_INT) { @@ -57,18 +59,18 @@ int ncp_getopt(const char *caller, char **options, const struct ncp_option *opts if (!*v) { return opts->val; } - printk(KERN_INFO "%s: invalid numeric value in %s=%s\n", + pr_info("%s: invalid numeric value in %s=%s\n", caller, token, val); return -EDOM; } if (opts->has_arg & OPT_STRING) { return opts->val; } - printk(KERN_INFO "%s: unexpected argument %s to the %s option\n", + pr_info("%s: unexpected argument %s to the %s option\n", caller, val, token); return -EINVAL; } } - printk(KERN_INFO "%s: Unrecognized mount option %s\n", caller, token); + pr_info("%s: Unrecognized mount option %s\n", caller, token); return -EOPNOTSUPP; } diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 647d86d2db39..e31e589369a4 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -9,6 +9,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <asm/uaccess.h> @@ -133,7 +135,7 @@ void ncp_update_inode(struct inode *inode, struct ncp_entry_info *nwinfo) NCP_FINFO(inode)->access = nwinfo->access; memcpy(NCP_FINFO(inode)->file_handle, nwinfo->file_handle, sizeof(nwinfo->file_handle)); - DPRINTK("ncp_update_inode: updated %s, volnum=%d, dirent=%u\n", + ncp_dbg(1, "updated %s, volnum=%d, dirent=%u\n", nwinfo->i.entryName, NCP_FINFO(inode)->volNumber, NCP_FINFO(inode)->dirEntNum); } @@ -141,8 +143,7 @@ void ncp_update_inode(struct inode *inode, struct ncp_entry_info *nwinfo) static void ncp_update_dates(struct inode *inode, struct nw_info_struct *nwi) { /* NFS namespace mode overrides others if it's set. */ - DPRINTK(KERN_DEBUG "ncp_update_dates_and_mode: (%s) nfs.mode=0%o\n", - nwi->entryName, nwi->nfs.mode); + ncp_dbg(1, "(%s) nfs.mode=0%o\n", nwi->entryName, nwi->nfs.mode); if (nwi->nfs.mode) { /* XXX Security? */ inode->i_mode = nwi->nfs.mode; @@ -230,7 +231,7 @@ static void ncp_set_attr(struct inode *inode, struct ncp_entry_info *nwinfo) ncp_update_attrs(inode, nwinfo); - DDPRINTK("ncp_read_inode: inode->i_mode = %u\n", inode->i_mode); + ncp_dbg(2, "inode->i_mode = %u\n", inode->i_mode); set_nlink(inode, 1); inode->i_uid = server->m.uid; @@ -258,7 +259,7 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info) struct inode *inode; if (info == NULL) { - printk(KERN_ERR "ncp_iget: info is NULL\n"); + pr_err("%s: info is NULL\n", __func__); return NULL; } @@ -290,7 +291,7 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info) } insert_inode_hash(inode); } else - printk(KERN_ERR "ncp_iget: iget failed!\n"); + pr_err("%s: iget failed!\n", __func__); return inode; } @@ -301,12 +302,12 @@ ncp_evict_inode(struct inode *inode) clear_inode(inode); if (S_ISDIR(inode->i_mode)) { - DDPRINTK("ncp_evict_inode: put directory %ld\n", inode->i_ino); + ncp_dbg(2, "put directory %ld\n", inode->i_ino); } if (ncp_make_closed(inode) != 0) { /* We can't do anything but complain. */ - printk(KERN_ERR "ncp_evict_inode: could not close\n"); + pr_err("%s: could not close\n", __func__); } } @@ -469,9 +470,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) { struct ncp_mount_data_kernel data; struct ncp_server *server; - struct file *ncp_filp; struct inode *root_inode; - struct inode *sock_inode; struct socket *sock; int error; int default_bufsize; @@ -540,18 +539,10 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) if (!uid_valid(data.mounted_uid) || !uid_valid(data.uid) || !gid_valid(data.gid)) goto out; - error = -EBADF; - ncp_filp = fget(data.ncp_fd); - if (!ncp_filp) - goto out; - error = -ENOTSOCK; - sock_inode = file_inode(ncp_filp); - if (!S_ISSOCK(sock_inode->i_mode)) - goto out_fput; - sock = SOCKET_I(sock_inode); + sock = sockfd_lookup(data.ncp_fd, &error); if (!sock) - goto out_fput; - + goto out; + if (sock->type == SOCK_STREAM) default_bufsize = 0xF000; else @@ -573,27 +564,16 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) if (error) goto out_fput; - server->ncp_filp = ncp_filp; server->ncp_sock = sock; if (data.info_fd != -1) { - struct socket *info_sock; - - error = -EBADF; - server->info_filp = fget(data.info_fd); - if (!server->info_filp) - goto out_bdi; - error = -ENOTSOCK; - sock_inode = file_inode(server->info_filp); - if (!S_ISSOCK(sock_inode->i_mode)) - goto out_fput2; - info_sock = SOCKET_I(sock_inode); + struct socket *info_sock = sockfd_lookup(data.info_fd, &error); if (!info_sock) - goto out_fput2; + goto out_bdi; + server->info_sock = info_sock; error = -EBADFD; if (info_sock->type != SOCK_STREAM) goto out_fput2; - server->info_sock = info_sock; } /* server->lock = 0; */ @@ -621,7 +601,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) now because of PATH_MAX changes.. */ if (server->m.time_out < 1) { server->m.time_out = 10; - printk(KERN_INFO "You need to recompile your ncpfs utils..\n"); + pr_info("You need to recompile your ncpfs utils..\n"); } server->m.time_out = server->m.time_out * HZ / 100; server->m.file_mode = (server->m.file_mode & S_IRWXUGO) | S_IFREG; @@ -682,7 +662,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) ncp_unlock_server(server); if (error < 0) goto out_rxbuf; - DPRINTK("ncp_fill_super: NCP_SBP(sb) = %x\n", (int) NCP_SBP(sb)); + ncp_dbg(1, "NCP_SBP(sb) = %p\n", NCP_SBP(sb)); error = -EMSGSIZE; /* -EREMOTESIDEINCOMPATIBLE */ #ifdef CONFIG_NCPFS_PACKET_SIGNING @@ -710,7 +690,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) if (ncp_negotiate_buffersize(server, default_bufsize, &(server->buffer_size)) != 0) goto out_disconnect; - DPRINTK("ncpfs: bufsize = %d\n", server->buffer_size); + ncp_dbg(1, "bufsize = %d\n", server->buffer_size); memset(&finfo, 0, sizeof(finfo)); finfo.i.attributes = aDIR; @@ -739,7 +719,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) root_inode = ncp_iget(sb, &finfo); if (!root_inode) goto out_disconnect; - DPRINTK("ncp_fill_super: root vol=%d\n", NCP_FINFO(root_inode)->volNumber); + ncp_dbg(1, "root vol=%d\n", NCP_FINFO(root_inode)->volNumber); sb->s_root = d_make_root(root_inode); if (!sb->s_root) goto out_disconnect; @@ -765,17 +745,12 @@ out_nls: mutex_destroy(&server->root_setup_lock); mutex_destroy(&server->mutex); out_fput2: - if (server->info_filp) - fput(server->info_filp); + if (server->info_sock) + sockfd_put(server->info_sock); out_bdi: bdi_destroy(&server->bdi); out_fput: - /* 23/12/1998 Marcin Dalecki <dalecki@cs.net.pl>: - * - * The previously used put_filp(ncp_filp); was bogus, since - * it doesn't perform proper unlocking. - */ - fput(ncp_filp); + sockfd_put(sock); out: put_pid(data.wdog_pid); sb->s_fs_info = NULL; @@ -808,9 +783,9 @@ static void ncp_put_super(struct super_block *sb) mutex_destroy(&server->root_setup_lock); mutex_destroy(&server->mutex); - if (server->info_filp) - fput(server->info_filp); - fput(server->ncp_filp); + if (server->info_sock) + sockfd_put(server->info_sock); + sockfd_put(server->ncp_sock); kill_pid(server->m.wdog_pid, SIGTERM, 1); put_pid(server->m.wdog_pid); @@ -985,8 +960,7 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr) if ((attr->ia_valid & ATTR_SIZE) != 0) { int written; - DPRINTK("ncpfs: trying to change size to %ld\n", - attr->ia_size); + ncp_dbg(1, "trying to change size to %llu\n", attr->ia_size); if ((result = ncp_make_open(inode, O_WRONLY)) < 0) { result = -EACCES; @@ -1072,7 +1046,7 @@ MODULE_ALIAS_FS("ncpfs"); static int __init init_ncp_fs(void) { int err; - DPRINTK("ncpfs: init_ncp_fs called\n"); + ncp_dbg(1, "called\n"); err = init_inodecache(); if (err) @@ -1089,7 +1063,7 @@ out1: static void __exit exit_ncp_fs(void) { - DPRINTK("ncpfs: exit_ncp_fs called\n"); + ncp_dbg(1, "called\n"); unregister_filesystem(&ncp_fs_type); destroy_inodecache(); } diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index 60426ccb3b65..d5659d96ee7f 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c @@ -41,7 +41,7 @@ ncp_get_fs_info(struct ncp_server * server, struct inode *inode, return -EFAULT; if (info.version != NCP_GET_FS_INFO_VERSION) { - DPRINTK("info.version invalid: %d\n", info.version); + ncp_dbg(1, "info.version invalid: %d\n", info.version); return -EINVAL; } /* TODO: info.addr = server->m.serv_addr; */ @@ -66,7 +66,7 @@ ncp_get_fs_info_v2(struct ncp_server * server, struct inode *inode, return -EFAULT; if (info2.version != NCP_GET_FS_INFO_VERSION_V2) { - DPRINTK("info.version invalid: %d\n", info2.version); + ncp_dbg(1, "info.version invalid: %d\n", info2.version); return -EINVAL; } info2.mounted_uid = from_kuid_munged(current_user_ns(), server->m.mounted_uid); @@ -132,7 +132,7 @@ ncp_get_compat_fs_info_v2(struct ncp_server * server, struct inode *inode, return -EFAULT; if (info2.version != NCP_GET_FS_INFO_VERSION_V2) { - DPRINTK("info.version invalid: %d\n", info2.version); + ncp_dbg(1, "info.version invalid: %d\n", info2.version); return -EINVAL; } info2.mounted_uid = from_kuid_munged(current_user_ns(), server->m.mounted_uid); @@ -308,8 +308,7 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg else result = server->reply_size; ncp_unlock_server(server); - DPRINTK("ncp_ioctl: copy %d bytes\n", - result); + ncp_dbg(1, "copy %d bytes\n", result); if (result >= 0) if (copy_to_user(request.data, bouncebuffer, result)) result = -EFAULT; @@ -385,9 +384,9 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg sr.namespace = server->name_space[sr.volNumber]; result = 0; } else - DPRINTK("ncpfs: s_root->d_inode==NULL\n"); + ncp_dbg(1, "s_root->d_inode==NULL\n"); } else - DPRINTK("ncpfs: s_root==NULL\n"); + ncp_dbg(1, "s_root==NULL\n"); } else { sr.volNumber = -1; sr.namespace = 0; @@ -440,11 +439,11 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg NCP_FINFO(s_inode)->DosDirNum = dosde; server->root_setuped = 1; } else { - DPRINTK("ncpfs: s_root->d_inode==NULL\n"); + ncp_dbg(1, "s_root->d_inode==NULL\n"); result = -EIO; } } else { - DPRINTK("ncpfs: s_root==NULL\n"); + ncp_dbg(1, "s_root==NULL\n"); result = -EIO; } } diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c index 3c5dd55d284c..b359d12eb359 100644 --- a/fs/ncpfs/mmap.c +++ b/fs/ncpfs/mmap.c @@ -107,7 +107,7 @@ int ncp_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); - DPRINTK("ncp_mmap: called\n"); + ncp_dbg(1, "called\n"); if (!ncp_conn_valid(NCP_SERVER(inode))) return -EIO; diff --git a/fs/ncpfs/ncp_fs.h b/fs/ncpfs/ncp_fs.h index 31831afe1c3b..b9f69e1b1f43 100644 --- a/fs/ncpfs/ncp_fs.h +++ b/fs/ncpfs/ncp_fs.h @@ -2,30 +2,32 @@ #include "ncp_fs_i.h" #include "ncp_fs_sb.h" -/* define because it is easy to change PRINTK to {*}PRINTK */ -#define PRINTK(format, args...) printk(KERN_DEBUG format , ## args) - #undef NCPFS_PARANOIA #ifdef NCPFS_PARANOIA -#define PPRINTK(format, args...) PRINTK(format , ## args) +#define ncp_vdbg(fmt, ...) \ + pr_debug(fmt, ##__VA_ARGS__) #else -#define PPRINTK(format, args...) +#define ncp_vdbg(fmt, ...) \ +do { \ + if (0) \ + pr_debug(fmt, ##__VA_ARGS__); \ +} while (0) #endif #ifndef DEBUG_NCP #define DEBUG_NCP 0 #endif -#if DEBUG_NCP > 0 -#define DPRINTK(format, args...) PRINTK(format , ## args) -#else -#define DPRINTK(format, args...) -#endif -#if DEBUG_NCP > 1 -#define DDPRINTK(format, args...) PRINTK(format , ## args) -#else -#define DDPRINTK(format, args...) + +#if DEBUG_NCP > 0 && !defined(DEBUG) +#define DEBUG #endif +#define ncp_dbg(level, fmt, ...) \ +do { \ + if (level <= DEBUG_NCP) \ + pr_debug(fmt, ##__VA_ARGS__); \ +} while (0) + #define NCP_MAX_RPC_TIMEOUT (6*HZ) diff --git a/fs/ncpfs/ncp_fs_sb.h b/fs/ncpfs/ncp_fs_sb.h index b81e97adc5a9..55e26fd80886 100644 --- a/fs/ncpfs/ncp_fs_sb.h +++ b/fs/ncpfs/ncp_fs_sb.h @@ -45,9 +45,7 @@ struct ncp_server { __u8 name_space[NCP_NUMBER_OF_VOLUMES + 2]; - struct file *ncp_filp; /* File pointer to ncp socket */ struct socket *ncp_sock;/* ncp socket */ - struct file *info_filp; struct socket *info_sock; u8 sequence; @@ -111,7 +109,7 @@ struct ncp_server { spinlock_t requests_lock; /* Lock accesses to tx.requests, tx.creq and rcv.creq when STREAM mode */ - void (*data_ready)(struct sock* sk, int len); + void (*data_ready)(struct sock* sk); void (*error_report)(struct sock* sk); void (*write_space)(struct sock* sk); /* STREAM mode only */ struct { @@ -153,7 +151,7 @@ extern void ncp_tcp_tx_proc(struct work_struct *work); extern void ncpdgram_rcv_proc(struct work_struct *work); extern void ncpdgram_timeout_proc(struct work_struct *work); extern void ncpdgram_timeout_call(unsigned long server); -extern void ncp_tcp_data_ready(struct sock* sk, int len); +extern void ncp_tcp_data_ready(struct sock* sk); extern void ncp_tcp_write_space(struct sock* sk); extern void ncp_tcp_error_report(struct sock* sk); diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c index 981a95617fc9..482387532f54 100644 --- a/fs/ncpfs/ncplib_kernel.c +++ b/fs/ncpfs/ncplib_kernel.c @@ -9,14 +9,14 @@ * */ - +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "ncp_fs.h" static inline void assert_server_locked(struct ncp_server *server) { if (server->lock == 0) { - DPRINTK("ncpfs: server not locked!\n"); + ncp_dbg(1, "server not locked!\n"); } } @@ -75,7 +75,7 @@ static void ncp_add_pstring(struct ncp_server *server, const char *s) int len = strlen(s); assert_server_locked(server); if (len > 255) { - DPRINTK("ncpfs: string too long: %s\n", s); + ncp_dbg(1, "string too long: %s\n", s); len = 255; } ncp_add_byte(server, len); @@ -225,7 +225,7 @@ int ncp_get_volume_info_with_number(struct ncp_server* server, result = -EIO; len = ncp_reply_byte(server, 29); if (len > NCP_VOLNAME_LEN) { - DPRINTK("ncpfs: volume name too long: %d\n", len); + ncp_dbg(1, "volume name too long: %d\n", len); goto out; } memcpy(&(target->volume_name), ncp_reply_data(server, 30), len); @@ -259,7 +259,7 @@ int ncp_get_directory_info(struct ncp_server* server, __u8 n, result = -EIO; len = ncp_reply_byte(server, 21); if (len > NCP_VOLNAME_LEN) { - DPRINTK("ncpfs: volume name too long: %d\n", len); + ncp_dbg(1, "volume name too long: %d\n", len); goto out; } memcpy(&(target->volume_name), ncp_reply_data(server, 22), len); @@ -295,9 +295,9 @@ ncp_make_closed(struct inode *inode) err = ncp_close_file(NCP_SERVER(inode), NCP_FINFO(inode)->file_handle); if (!err) - PPRINTK("ncp_make_closed: volnum=%d, dirent=%u, error=%d\n", - NCP_FINFO(inode)->volNumber, - NCP_FINFO(inode)->dirEntNum, err); + ncp_vdbg("volnum=%d, dirent=%u, error=%d\n", + NCP_FINFO(inode)->volNumber, + NCP_FINFO(inode)->dirEntNum, err); } mutex_unlock(&NCP_FINFO(inode)->open_mutex); return err; @@ -394,8 +394,7 @@ int ncp_obtain_nfs_info(struct ncp_server *server, if ((result = ncp_request(server, 87)) == 0) { ncp_extract_nfs_info(ncp_reply_data(server, 0), &target->nfs); - DPRINTK(KERN_DEBUG - "ncp_obtain_nfs_info: (%s) mode=0%o, rdev=0x%x\n", + ncp_dbg(1, "(%s) mode=0%o, rdev=0x%x\n", target->entryName, target->nfs.mode, target->nfs.rdev); } else { @@ -425,7 +424,7 @@ int ncp_obtain_info(struct ncp_server *server, struct inode *dir, const char *pa int result; if (target == NULL) { - printk(KERN_ERR "ncp_obtain_info: invalid call\n"); + pr_err("%s: invalid call\n", __func__); return -EINVAL; } ncp_init_request(server); @@ -498,7 +497,7 @@ ncp_get_known_namespace(struct ncp_server *server, __u8 volume) namespace = ncp_reply_data(server, 2); while (no_namespaces > 0) { - DPRINTK("get_namespaces: found %d on %d\n", *namespace, volume); + ncp_dbg(1, "found %d on %d\n", *namespace, volume); #ifdef CONFIG_NCPFS_NFS_NS if ((*namespace == NW_NS_NFS) && !(server->m.flags&NCP_MOUNT_NO_NFS)) @@ -531,8 +530,7 @@ ncp_update_known_namespace(struct ncp_server *server, __u8 volume, int *ret_ns) if (ret_ns) *ret_ns = ns; - DPRINTK("lookup_vol: namespace[%d] = %d\n", - volume, server->name_space[volume]); + ncp_dbg(1, "namespace[%d] = %d\n", volume, server->name_space[volume]); if (server->name_space[volume] == ns) return 0; @@ -596,7 +594,7 @@ ncp_get_volume_root(struct ncp_server *server, { int result; - DPRINTK("ncp_get_volume_root: looking up vol %s\n", volname); + ncp_dbg(1, "looking up vol %s\n", volname); ncp_init_request(server); ncp_add_byte(server, 22); /* Subfunction: Generate dir handle */ diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c index 3a1587222c8a..471bc3d1139e 100644 --- a/fs/ncpfs/sock.c +++ b/fs/ncpfs/sock.c @@ -8,6 +8,7 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/time.h> #include <linux/errno.h> @@ -96,11 +97,11 @@ static void ncp_req_put(struct ncp_request_reply *req) kfree(req); } -void ncp_tcp_data_ready(struct sock *sk, int len) +void ncp_tcp_data_ready(struct sock *sk) { struct ncp_server *server = sk->sk_user_data; - server->data_ready(sk, len); + server->data_ready(sk); schedule_work(&server->rcv.tq); } @@ -231,7 +232,7 @@ static void __ncptcp_try_send(struct ncp_server *server) return; if (result < 0) { - printk(KERN_ERR "ncpfs: tcp: Send failed: %d\n", result); + pr_err("tcp: Send failed: %d\n", result); __ncp_abort_request(server, rq, result); return; } @@ -332,7 +333,7 @@ static int ncp_add_request(struct ncp_server *server, struct ncp_request_reply * mutex_lock(&server->rcv.creq_mutex); if (!ncp_conn_valid(server)) { mutex_unlock(&server->rcv.creq_mutex); - printk(KERN_ERR "ncpfs: tcp: Server died\n"); + pr_err("tcp: Server died\n"); return -EIO; } ncp_req_get(req); @@ -405,15 +406,15 @@ void ncpdgram_rcv_proc(struct work_struct *work) } result = _recv(sock, buf, sizeof(buf), MSG_DONTWAIT); if (result < 0) { - DPRINTK("recv failed with %d\n", result); + ncp_dbg(1, "recv failed with %d\n", result); continue; } if (result < 10) { - DPRINTK("too short (%u) watchdog packet\n", result); + ncp_dbg(1, "too short (%u) watchdog packet\n", result); continue; } if (buf[9] != '?') { - DPRINTK("bad signature (%02X) in watchdog packet\n", buf[9]); + ncp_dbg(1, "bad signature (%02X) in watchdog packet\n", buf[9]); continue; } buf[9] = 'Y'; @@ -448,7 +449,7 @@ void ncpdgram_rcv_proc(struct work_struct *work) result -= 8; hdrl = sock->sk->sk_family == AF_INET ? 8 : 6; if (sign_verify_reply(server, server->rxbuf + hdrl, result - hdrl, cpu_to_le32(result), server->rxbuf + result)) { - printk(KERN_INFO "ncpfs: Signature violation\n"); + pr_info("Signature violation\n"); result = -EIO; } } @@ -524,7 +525,7 @@ static int do_tcp_rcv(struct ncp_server *server, void *buffer, size_t len) return result; } if (result > len) { - printk(KERN_ERR "ncpfs: tcp: bug in recvmsg (%u > %Zu)\n", result, len); + pr_err("tcp: bug in recvmsg (%u > %Zu)\n", result, len); return -EIO; } return result; @@ -552,9 +553,9 @@ static int __ncptcp_rcv_proc(struct ncp_server *server) __ncptcp_abort(server); } if (result < 0) { - printk(KERN_ERR "ncpfs: tcp: error in recvmsg: %d\n", result); + pr_err("tcp: error in recvmsg: %d\n", result); } else { - DPRINTK(KERN_ERR "ncpfs: tcp: EOF\n"); + ncp_dbg(1, "tcp: EOF\n"); } return -EIO; } @@ -566,20 +567,20 @@ static int __ncptcp_rcv_proc(struct ncp_server *server) switch (server->rcv.state) { case 0: if (server->rcv.buf.magic != htonl(NCP_TCP_RCVD_MAGIC)) { - printk(KERN_ERR "ncpfs: tcp: Unexpected reply type %08X\n", ntohl(server->rcv.buf.magic)); + pr_err("tcp: Unexpected reply type %08X\n", ntohl(server->rcv.buf.magic)); __ncptcp_abort(server); return -EIO; } datalen = ntohl(server->rcv.buf.len) & 0x0FFFFFFF; if (datalen < 10) { - printk(KERN_ERR "ncpfs: tcp: Unexpected reply len %d\n", datalen); + pr_err("tcp: Unexpected reply len %d\n", datalen); __ncptcp_abort(server); return -EIO; } #ifdef CONFIG_NCPFS_PACKET_SIGNING if (server->sign_active) { if (datalen < 18) { - printk(KERN_ERR "ncpfs: tcp: Unexpected reply len %d\n", datalen); + pr_err("tcp: Unexpected reply len %d\n", datalen); __ncptcp_abort(server); return -EIO; } @@ -604,7 +605,7 @@ cont:; server->rcv.len = datalen - 10; break; } - DPRINTK("ncpfs: tcp: Unexpected NCP type %02X\n", type); + ncp_dbg(1, "tcp: Unexpected NCP type %02X\n", type); skipdata2:; server->rcv.state = 2; skipdata:; @@ -614,11 +615,11 @@ skipdata:; } req = server->rcv.creq; if (!req) { - DPRINTK(KERN_ERR "ncpfs: Reply without appropriate request\n"); + ncp_dbg(1, "Reply without appropriate request\n"); goto skipdata2; } if (datalen > req->datalen + 8) { - printk(KERN_ERR "ncpfs: tcp: Unexpected reply len %d (expected at most %Zd)\n", datalen, req->datalen + 8); + pr_err("tcp: Unexpected reply len %d (expected at most %Zd)\n", datalen, req->datalen + 8); server->rcv.state = 3; goto skipdata; } @@ -638,12 +639,12 @@ skipdata:; req = server->rcv.creq; if (req->tx_type != NCP_ALLOC_SLOT_REQUEST) { if (((struct ncp_reply_header*)server->rxbuf)->sequence != server->sequence) { - printk(KERN_ERR "ncpfs: tcp: Bad sequence number\n"); + pr_err("tcp: Bad sequence number\n"); __ncp_abort_request(server, req, -EIO); return -EIO; } if ((((struct ncp_reply_header*)server->rxbuf)->conn_low | (((struct ncp_reply_header*)server->rxbuf)->conn_high << 8)) != server->connection) { - printk(KERN_ERR "ncpfs: tcp: Connection number mismatch\n"); + pr_err("tcp: Connection number mismatch\n"); __ncp_abort_request(server, req, -EIO); return -EIO; } @@ -651,7 +652,7 @@ skipdata:; #ifdef CONFIG_NCPFS_PACKET_SIGNING if (server->sign_active && req->tx_type != NCP_DEALLOC_SLOT_REQUEST) { if (sign_verify_reply(server, server->rxbuf + 6, req->datalen - 6, cpu_to_be32(req->datalen + 16), &server->rcv.buf.type)) { - printk(KERN_ERR "ncpfs: tcp: Signature violation\n"); + pr_err("tcp: Signature violation\n"); __ncp_abort_request(server, req, -EIO); return -EIO; } @@ -742,7 +743,7 @@ static int ncp_do_request(struct ncp_server *server, int size, int result; if (server->lock == 0) { - printk(KERN_ERR "ncpfs: Server not locked!\n"); + pr_err("Server not locked!\n"); return -EIO; } if (!ncp_conn_valid(server)) { @@ -781,7 +782,7 @@ static int ncp_do_request(struct ncp_server *server, int size, spin_unlock_irqrestore(¤t->sighand->siglock, flags); } - DDPRINTK("do_ncp_rpc_call returned %d\n", result); + ncp_dbg(2, "do_ncp_rpc_call returned %d\n", result); return result; } @@ -811,7 +812,7 @@ int ncp_request2(struct ncp_server *server, int function, result = ncp_do_request(server, server->current_size, reply, size); if (result < 0) { - DPRINTK("ncp_request_error: %d\n", result); + ncp_dbg(1, "ncp_request_error: %d\n", result); goto out; } server->completion = reply->completion_code; @@ -822,7 +823,7 @@ int ncp_request2(struct ncp_server *server, int function, result = reply->completion_code; if (result != 0) - PPRINTK("ncp_request: completion code=%x\n", result); + ncp_vdbg("completion code=%x\n", result); out: return result; } @@ -865,14 +866,14 @@ void ncp_lock_server(struct ncp_server *server) { mutex_lock(&server->mutex); if (server->lock) - printk(KERN_WARNING "ncp_lock_server: was locked!\n"); + pr_warn("%s: was locked!\n", __func__); server->lock = 1; } void ncp_unlock_server(struct ncp_server *server) { if (!server->lock) { - printk(KERN_WARNING "ncp_unlock_server: was not locked!\n"); + pr_warn("%s: was not locked!\n", __func__); return; } server->lock = 0; diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c index 52439ddc8de0..1a63bfdb4a65 100644 --- a/fs/ncpfs/symlink.c +++ b/fs/ncpfs/symlink.c @@ -112,7 +112,7 @@ int ncp_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { __le32 attr; unsigned int hdr; - DPRINTK("ncp_symlink(dir=%p,dentry=%p,symname=%s)\n",dir,dentry,symname); + ncp_dbg(1, "dir=%p, dentry=%p, symname=%s\n", dir, dentry, symname); if (ncp_is_nfs_extras(NCP_SERVER(dir), NCP_FINFO(dir)->volNumber)) kludge = 0; diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index ae2e87b95453..41db5258e7a7 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -112,7 +112,8 @@ out: * TODO: keep track of all layouts (and delegations) in a hash table * hashed by filehandle. */ -static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, struct nfs_fh *fh) +static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, + struct nfs_fh *fh, nfs4_stateid *stateid) { struct nfs_server *server; struct inode *ino; @@ -120,17 +121,19 @@ static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { list_for_each_entry(lo, &server->layouts, plh_layouts) { + if (!nfs4_stateid_match_other(&lo->plh_stateid, stateid)) + continue; if (nfs_compare_fh(fh, &NFS_I(lo->plh_inode)->fh)) continue; ino = igrab(lo->plh_inode); if (!ino) - continue; + break; spin_lock(&ino->i_lock); /* Is this layout in the process of being freed? */ if (NFS_I(ino)->layout != lo) { spin_unlock(&ino->i_lock); iput(ino); - continue; + break; } pnfs_get_layout_hdr(lo); spin_unlock(&ino->i_lock); @@ -141,13 +144,14 @@ static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, return NULL; } -static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp, struct nfs_fh *fh) +static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp, + struct nfs_fh *fh, nfs4_stateid *stateid) { struct pnfs_layout_hdr *lo; spin_lock(&clp->cl_lock); rcu_read_lock(); - lo = get_layout_by_fh_locked(clp, fh); + lo = get_layout_by_fh_locked(clp, fh, stateid); rcu_read_unlock(); spin_unlock(&clp->cl_lock); @@ -162,9 +166,9 @@ static u32 initiate_file_draining(struct nfs_client *clp, u32 rv = NFS4ERR_NOMATCHING_LAYOUT; LIST_HEAD(free_me_list); - lo = get_layout_by_fh(clp, &args->cbl_fh); + lo = get_layout_by_fh(clp, &args->cbl_fh, &args->cbl_stateid); if (!lo) - return NFS4ERR_NOMATCHING_LAYOUT; + goto out; ino = lo->plh_inode; spin_lock(&ino->i_lock); @@ -179,6 +183,7 @@ static u32 initiate_file_draining(struct nfs_client *clp, pnfs_free_lseg_list(&free_me_list); pnfs_put_layout_hdr(lo); iput(ino); +out: return rv; } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 4a48fe4b84b6..d9f3d067cd15 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -69,21 +69,28 @@ const struct address_space_operations nfs_dir_aops = { static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir, struct rpc_cred *cred) { + struct nfs_inode *nfsi = NFS_I(dir); struct nfs_open_dir_context *ctx; ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); if (ctx != NULL) { ctx->duped = 0; - ctx->attr_gencount = NFS_I(dir)->attr_gencount; + ctx->attr_gencount = nfsi->attr_gencount; ctx->dir_cookie = 0; ctx->dup_cookie = 0; ctx->cred = get_rpccred(cred); + spin_lock(&dir->i_lock); + list_add(&ctx->list, &nfsi->open_files); + spin_unlock(&dir->i_lock); return ctx; } return ERR_PTR(-ENOMEM); } -static void put_nfs_open_dir_context(struct nfs_open_dir_context *ctx) +static void put_nfs_open_dir_context(struct inode *dir, struct nfs_open_dir_context *ctx) { + spin_lock(&dir->i_lock); + list_del(&ctx->list); + spin_unlock(&dir->i_lock); put_rpccred(ctx->cred); kfree(ctx); } @@ -126,7 +133,7 @@ out: static int nfs_closedir(struct inode *inode, struct file *filp) { - put_nfs_open_dir_context(filp->private_data); + put_nfs_open_dir_context(filp->f_path.dentry->d_inode, filp->private_data); return 0; } @@ -306,10 +313,9 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des if (printk_ratelimit()) { pr_notice("NFS: directory %pD2 contains a readdir loop." "Please contact your server vendor. " - "The file: %s has duplicate cookie %llu\n", - desc->file, - array->array[i].string.name, - *desc->dir_cookie); + "The file: %.*s has duplicate cookie %llu\n", + desc->file, array->array[i].string.len, + array->array[i].string.name, *desc->dir_cookie); } status = -ELOOP; goto out; @@ -437,6 +443,22 @@ void nfs_advise_use_readdirplus(struct inode *dir) set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags); } +/* + * This function is mainly for use by nfs_getattr(). + * + * If this is an 'ls -l', we want to force use of readdirplus. + * Do this by checking if there is an active file descriptor + * and calling nfs_advise_use_readdirplus, then forcing a + * cache flush. + */ +void nfs_force_use_readdirplus(struct inode *dir) +{ + if (!list_empty(&NFS_I(dir)->open_files)) { + nfs_advise_use_readdirplus(dir); + nfs_zap_mapping(dir, dir->i_mapping); + } +} + static void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) { @@ -815,6 +837,17 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc) goto out; } +static bool nfs_dir_mapping_need_revalidate(struct inode *dir) +{ + struct nfs_inode *nfsi = NFS_I(dir); + + if (nfs_attribute_cache_expired(dir)) + return true; + if (nfsi->cache_validity & NFS_INO_INVALID_DATA) + return true; + return false; +} + /* The file offset position represents the dirent entry number. A last cookie cache takes care of the common case of reading the whole directory. @@ -847,7 +880,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) desc->plus = nfs_use_readdirplus(inode, ctx) ? 1 : 0; nfs_block_sillyrename(dentry); - if (ctx->pos == 0 || nfs_attribute_cache_expired(inode)) + if (ctx->pos == 0 || nfs_dir_mapping_need_revalidate(inode)) res = nfs_revalidate_mapping(inode, file->f_mapping); if (res < 0) goto out; @@ -1911,6 +1944,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *old_inode = old_dentry->d_inode; struct inode *new_inode = new_dentry->d_inode; struct dentry *dentry = NULL, *rehash = NULL; + struct rpc_task *task; int error = -EBUSY; dfprintk(VFS, "NFS: rename(%pd2 -> %pd2, ct=%d)\n", @@ -1958,8 +1992,16 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (new_inode != NULL) NFS_PROTO(new_inode)->return_delegation(new_inode); - error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name, - new_dir, &new_dentry->d_name); + task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL); + if (IS_ERR(task)) { + error = PTR_ERR(task); + goto out; + } + + error = rpc_wait_for_completion_task(task); + if (error == 0) + error = task->tk_status; + rpc_put_task(task); nfs_mark_for_revalidate(old_inode); out: if (rehash) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 5bb790a69c71..284ca901fe16 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -617,6 +617,7 @@ out: static const struct vm_operations_struct nfs_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = nfs_vm_page_mkwrite, .remap_pages = generic_file_remap_pages, }; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index c4702baa22b8..0c438973f3c8 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -588,6 +588,25 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr) } EXPORT_SYMBOL_GPL(nfs_setattr_update_inode); +static void nfs_request_parent_use_readdirplus(struct dentry *dentry) +{ + struct dentry *parent; + + parent = dget_parent(dentry); + nfs_force_use_readdirplus(parent->d_inode); + dput(parent); +} + +static bool nfs_need_revalidate_inode(struct inode *inode) +{ + if (NFS_I(inode)->cache_validity & + (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL)) + return true; + if (nfs_attribute_cache_expired(inode)) + return true; + return false; +} + int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct inode *inode = dentry->d_inode; @@ -616,10 +635,13 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) need_atime = 0; - if (need_atime) - err = __nfs_revalidate_inode(NFS_SERVER(inode), inode); - else - err = nfs_revalidate_inode(NFS_SERVER(inode), inode); + if (need_atime || nfs_need_revalidate_inode(inode)) { + struct nfs_server *server = NFS_SERVER(inode); + + if (server->caps & NFS_CAP_READDIRPLUS) + nfs_request_parent_use_readdirplus(dentry); + err = __nfs_revalidate_inode(server, inode); + } if (!err) { generic_fillattr(inode, stat); stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); @@ -961,9 +983,7 @@ int nfs_attribute_cache_expired(struct inode *inode) */ int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) { - if (!(NFS_I(inode)->cache_validity & - (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL)) - && !nfs_attribute_cache_expired(inode)) + if (!nfs_need_revalidate_inode(inode)) return NFS_STALE(inode) ? -ESTALE : 0; return __nfs_revalidate_inode(server, inode); } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index b46cf5a67329..dd8bfc2e2464 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -301,6 +301,7 @@ extern struct nfs_client *nfs_init_client(struct nfs_client *clp, const char *ip_addr); /* dir.c */ +extern void nfs_force_use_readdirplus(struct inode *dir); extern unsigned long nfs_access_cache_count(struct shrinker *shrink, struct shrink_control *sc); extern unsigned long nfs_access_cache_scan(struct shrinker *shrink, @@ -474,6 +475,13 @@ extern int nfs_migrate_page(struct address_space *, #define nfs_migrate_page NULL #endif +/* unlink.c */ +extern struct rpc_task * +nfs_async_rename(struct inode *old_dir, struct inode *new_dir, + struct dentry *old_dentry, struct dentry *new_dentry, + void (*complete)(struct rpc_task *, struct nfs_renamedata *)); +extern int nfs_sillyrename(struct inode *dir, struct dentry *dentry); + /* direct.c */ void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo, struct nfs_direct_req *dreq); diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index a462ef0fb5d6..db60149c4579 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -479,41 +479,6 @@ nfs3_proc_rename_done(struct rpc_task *task, struct inode *old_dir, } static int -nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name, - struct inode *new_dir, struct qstr *new_name) -{ - struct nfs_renameargs arg = { - .old_dir = NFS_FH(old_dir), - .old_name = old_name, - .new_dir = NFS_FH(new_dir), - .new_name = new_name, - }; - struct nfs_renameres res; - struct rpc_message msg = { - .rpc_proc = &nfs3_procedures[NFS3PROC_RENAME], - .rpc_argp = &arg, - .rpc_resp = &res, - }; - int status = -ENOMEM; - - dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name); - - res.old_fattr = nfs_alloc_fattr(); - res.new_fattr = nfs_alloc_fattr(); - if (res.old_fattr == NULL || res.new_fattr == NULL) - goto out; - - status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0); - nfs_post_op_update_inode(old_dir, res.old_fattr); - nfs_post_op_update_inode(new_dir, res.new_fattr); -out: - nfs_free_fattr(res.old_fattr); - nfs_free_fattr(res.new_fattr); - dprintk("NFS reply rename: %d\n", status); - return status; -} - -static int nfs3_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) { struct nfs3_linkargs arg = { @@ -968,7 +933,6 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .unlink_setup = nfs3_proc_unlink_setup, .unlink_rpc_prepare = nfs3_proc_unlink_rpc_prepare, .unlink_done = nfs3_proc_unlink_done, - .rename = nfs3_proc_rename, .rename_setup = nfs3_proc_rename_setup, .rename_rpc_prepare = nfs3_proc_rename_rpc_prepare, .rename_done = nfs3_proc_rename_done, diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index a5b27c2d9689..e1d1badbe53c 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -427,6 +427,7 @@ extern void nfs4_close_sync(struct nfs4_state *, fmode_t); extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t); extern void nfs_inode_find_state_and_recover(struct inode *inode, const nfs4_stateid *stateid); +extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *, struct nfs4_state *); extern void nfs4_schedule_lease_recovery(struct nfs_client *); extern int nfs4_wait_clnt_recover(struct nfs_client *clp); extern int nfs4_client_recover_expired_lease(struct nfs_client *clp); @@ -500,6 +501,16 @@ static inline bool nfs4_stateid_match(const nfs4_stateid *dst, const nfs4_statei return memcmp(dst, src, sizeof(*dst)) == 0; } +static inline bool nfs4_stateid_match_other(const nfs4_stateid *dst, const nfs4_stateid *src) +{ + return memcmp(dst->other, src->other, NFS4_STATEID_OTHER_SIZE) == 0; +} + +static inline bool nfs4_stateid_is_newer(const nfs4_stateid *s1, const nfs4_stateid *s2) +{ + return (s32)(be32_to_cpu(s1->seqid) - be32_to_cpu(s2->seqid)) > 0; +} + static inline bool nfs4_valid_open_stateid(const struct nfs4_state *state) { return test_bit(NFS_STATE_RECOVERY_FAILED, &state->flags) == 0; diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 0e46d3d1b6cc..aa9ef4876046 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -531,6 +531,13 @@ int nfs40_walk_client_list(struct nfs_client *new, *result = pos; dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n", __func__, pos, atomic_read(&pos->cl_count)); + goto out; + case -ERESTARTSYS: + case -ETIMEDOUT: + /* The callback path may have been inadvertently + * changed. Schedule recovery! + */ + nfs4_schedule_path_down_recovery(pos); default: goto out; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 450bfedbe2f4..397be39c6dc8 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1068,6 +1068,7 @@ static void nfs4_opendata_free(struct kref *kref) dput(p->dentry); nfs_sb_deactive(sb); nfs_fattr_free_names(&p->f_attr); + kfree(p->f_attr.mdsthreshold); kfree(p); } @@ -1137,12 +1138,71 @@ static void update_open_stateflags(struct nfs4_state *state, fmode_t fmode) nfs4_state_set_mode_locked(state, state->state | fmode); } -static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) +static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state) +{ + struct nfs_client *clp = state->owner->so_server->nfs_client; + bool need_recover = false; + + if (test_and_clear_bit(NFS_O_RDONLY_STATE, &state->flags) && state->n_rdonly) + need_recover = true; + if (test_and_clear_bit(NFS_O_WRONLY_STATE, &state->flags) && state->n_wronly) + need_recover = true; + if (test_and_clear_bit(NFS_O_RDWR_STATE, &state->flags) && state->n_rdwr) + need_recover = true; + if (need_recover) + nfs4_state_mark_reclaim_nograce(clp, state); +} + +static bool nfs_need_update_open_stateid(struct nfs4_state *state, + nfs4_stateid *stateid) +{ + if (test_and_set_bit(NFS_OPEN_STATE, &state->flags) == 0) + return true; + if (!nfs4_stateid_match_other(stateid, &state->open_stateid)) { + nfs_test_and_clear_all_open_stateid(state); + return true; + } + if (nfs4_stateid_is_newer(stateid, &state->open_stateid)) + return true; + return false; +} + +static void nfs_clear_open_stateid_locked(struct nfs4_state *state, + nfs4_stateid *stateid, fmode_t fmode) { + clear_bit(NFS_O_RDWR_STATE, &state->flags); + switch (fmode & (FMODE_READ|FMODE_WRITE)) { + case FMODE_WRITE: + clear_bit(NFS_O_RDONLY_STATE, &state->flags); + break; + case FMODE_READ: + clear_bit(NFS_O_WRONLY_STATE, &state->flags); + break; + case 0: + clear_bit(NFS_O_RDONLY_STATE, &state->flags); + clear_bit(NFS_O_WRONLY_STATE, &state->flags); + clear_bit(NFS_OPEN_STATE, &state->flags); + } + if (stateid == NULL) + return; + if (!nfs_need_update_open_stateid(state, stateid)) + return; if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) nfs4_stateid_copy(&state->stateid, stateid); nfs4_stateid_copy(&state->open_stateid, stateid); - set_bit(NFS_OPEN_STATE, &state->flags); +} + +static void nfs_clear_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) +{ + write_seqlock(&state->seqlock); + nfs_clear_open_stateid_locked(state, stateid, fmode); + write_sequnlock(&state->seqlock); + if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) + nfs4_schedule_state_manager(state->owner->so_server->nfs_client); +} + +static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) +{ switch (fmode) { case FMODE_READ: set_bit(NFS_O_RDONLY_STATE, &state->flags); @@ -1153,13 +1213,11 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid * case FMODE_READ|FMODE_WRITE: set_bit(NFS_O_RDWR_STATE, &state->flags); } -} - -static void nfs_set_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) -{ - write_seqlock(&state->seqlock); - nfs_set_open_stateid_locked(state, stateid, fmode); - write_sequnlock(&state->seqlock); + if (!nfs_need_update_open_stateid(state, stateid)) + return; + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) + nfs4_stateid_copy(&state->stateid, stateid); + nfs4_stateid_copy(&state->open_stateid, stateid); } static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, const nfs4_stateid *deleg_stateid, fmode_t fmode) @@ -1217,6 +1275,8 @@ no_delegation: __update_open_stateid(state, open_stateid, NULL, fmode); ret = 1; } + if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) + nfs4_schedule_state_manager(state->owner->so_server->nfs_client); return ret; } @@ -1450,12 +1510,15 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state * struct nfs4_state *newstate; int ret; + /* Don't trigger recovery in nfs_test_and_clear_all_open_stateid */ + clear_bit(NFS_O_RDWR_STATE, &state->flags); + clear_bit(NFS_O_WRONLY_STATE, &state->flags); + clear_bit(NFS_O_RDONLY_STATE, &state->flags); /* memory barrier prior to reading state->n_* */ clear_bit(NFS_DELEGATED_STATE, &state->flags); clear_bit(NFS_OPEN_STATE, &state->flags); smp_rmb(); if (state->n_rdwr != 0) { - clear_bit(NFS_O_RDWR_STATE, &state->flags); ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE, &newstate); if (ret != 0) return ret; @@ -1463,7 +1526,6 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state * return -ESTALE; } if (state->n_wronly != 0) { - clear_bit(NFS_O_WRONLY_STATE, &state->flags); ret = nfs4_open_recover_helper(opendata, FMODE_WRITE, &newstate); if (ret != 0) return ret; @@ -1471,7 +1533,6 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state * return -ESTALE; } if (state->n_rdonly != 0) { - clear_bit(NFS_O_RDONLY_STATE, &state->flags); ret = nfs4_open_recover_helper(opendata, FMODE_READ, &newstate); if (ret != 0) return ret; @@ -2244,10 +2305,12 @@ static int _nfs4_do_open(struct inode *dir, } } - if (ctx_th && server->attr_bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD) { - opendata->f_attr.mdsthreshold = pnfs_mdsthreshold_alloc(); - if (!opendata->f_attr.mdsthreshold) - goto err_free_label; + if (server->attr_bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD) { + if (!opendata->f_attr.mdsthreshold) { + opendata->f_attr.mdsthreshold = pnfs_mdsthreshold_alloc(); + if (!opendata->f_attr.mdsthreshold) + goto err_free_label; + } opendata->o_arg.open_bitmap = &nfs4_pnfs_open_bitmap[0]; } if (dentry->d_inode != NULL) @@ -2275,11 +2338,10 @@ static int _nfs4_do_open(struct inode *dir, if (opendata->file_created) *opened |= FILE_CREATED; - if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server)) + if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server)) { *ctx_th = opendata->f_attr.mdsthreshold; - else - kfree(opendata->f_attr.mdsthreshold); - opendata->f_attr.mdsthreshold = NULL; + opendata->f_attr.mdsthreshold = NULL; + } nfs4_label_free(olabel); @@ -2289,7 +2351,6 @@ static int _nfs4_do_open(struct inode *dir, err_free_label: nfs4_label_free(olabel); err_opendata_put: - kfree(opendata->f_attr.mdsthreshold); nfs4_opendata_put(opendata); err_put_state_owner: nfs4_put_state_owner(sp); @@ -2479,26 +2540,6 @@ static void nfs4_free_closedata(void *data) kfree(calldata); } -static void nfs4_close_clear_stateid_flags(struct nfs4_state *state, - fmode_t fmode) -{ - spin_lock(&state->owner->so_lock); - clear_bit(NFS_O_RDWR_STATE, &state->flags); - switch (fmode & (FMODE_READ|FMODE_WRITE)) { - case FMODE_WRITE: - clear_bit(NFS_O_RDONLY_STATE, &state->flags); - break; - case FMODE_READ: - clear_bit(NFS_O_WRONLY_STATE, &state->flags); - break; - case 0: - clear_bit(NFS_O_RDONLY_STATE, &state->flags); - clear_bit(NFS_O_WRONLY_STATE, &state->flags); - clear_bit(NFS_OPEN_STATE, &state->flags); - } - spin_unlock(&state->owner->so_lock); -} - static void nfs4_close_done(struct rpc_task *task, void *data) { struct nfs4_closedata *calldata = data; @@ -2517,9 +2558,9 @@ static void nfs4_close_done(struct rpc_task *task, void *data) if (calldata->roc) pnfs_roc_set_barrier(state->inode, calldata->roc_barrier); - nfs_set_open_stateid(state, &calldata->res.stateid, 0); + nfs_clear_open_stateid(state, &calldata->res.stateid, 0); renew_lease(server, calldata->timestamp); - break; + goto out_release; case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_OLD_STATEID: @@ -2533,7 +2574,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data) goto out_release; } } - nfs4_close_clear_stateid_flags(state, calldata->arg.fmode); + nfs_clear_open_stateid(state, NULL, calldata->arg.fmode); out_release: nfs_release_seqid(calldata->arg.seqid); nfs_refresh_inode(calldata->inode, calldata->res.fattr); @@ -3507,49 +3548,6 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir, return 1; } -static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name, - struct inode *new_dir, struct qstr *new_name) -{ - struct nfs_server *server = NFS_SERVER(old_dir); - struct nfs_renameargs arg = { - .old_dir = NFS_FH(old_dir), - .new_dir = NFS_FH(new_dir), - .old_name = old_name, - .new_name = new_name, - }; - struct nfs_renameres res = { - .server = server, - }; - struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME], - .rpc_argp = &arg, - .rpc_resp = &res, - }; - int status = -ENOMEM; - - status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); - if (!status) { - update_changeattr(old_dir, &res.old_cinfo); - update_changeattr(new_dir, &res.new_cinfo); - } - return status; -} - -static int nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name, - struct inode *new_dir, struct qstr *new_name) -{ - struct nfs4_exception exception = { }; - int err; - do { - err = _nfs4_proc_rename(old_dir, old_name, - new_dir, new_name); - trace_nfs4_rename(old_dir, old_name, new_dir, new_name, err); - err = nfs4_handle_exception(NFS_SERVER(old_dir), err, - &exception); - } while (exception.retry); - return err; -} - static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) { struct nfs_server *server = NFS_SERVER(inode); @@ -4884,6 +4882,20 @@ nfs4_init_uniform_client_string(const struct nfs_client *clp, nodename); } +/* + * nfs4_callback_up_net() starts only "tcp" and "tcp6" callback + * services. Advertise one based on the address family of the + * clientaddr. + */ +static unsigned int +nfs4_init_callback_netid(const struct nfs_client *clp, char *buf, size_t len) +{ + if (strchr(clp->cl_ipaddr, ':') != NULL) + return scnprintf(buf, len, "tcp6"); + else + return scnprintf(buf, len, "tcp"); +} + /** * nfs4_proc_setclientid - Negotiate client ID * @clp: state data structure @@ -4925,12 +4937,10 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, setclientid.sc_name, sizeof(setclientid.sc_name)); /* cb_client4 */ - rcu_read_lock(); - setclientid.sc_netid_len = scnprintf(setclientid.sc_netid, - sizeof(setclientid.sc_netid), "%s", - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_NETID)); - rcu_read_unlock(); + setclientid.sc_netid_len = + nfs4_init_callback_netid(clp, + setclientid.sc_netid, + sizeof(setclientid.sc_netid)); setclientid.sc_uaddr_len = scnprintf(setclientid.sc_uaddr, sizeof(setclientid.sc_uaddr), "%s.%u.%u", clp->cl_ipaddr, port >> 8, port & 255); @@ -8408,7 +8418,6 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .unlink_setup = nfs4_proc_unlink_setup, .unlink_rpc_prepare = nfs4_proc_unlink_rpc_prepare, .unlink_done = nfs4_proc_unlink_done, - .rename = nfs4_proc_rename, .rename_setup = nfs4_proc_rename_setup, .rename_rpc_prepare = nfs4_proc_rename_rpc_prepare, .rename_done = nfs4_proc_rename_done, diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 0deb32105ccf..2349518eef2c 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1316,7 +1316,7 @@ static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_st return 1; } -static int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state) +int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state) { set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags); clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); @@ -2075,8 +2075,10 @@ again: switch (status) { case 0: break; - case -NFS4ERR_DELAY: case -ETIMEDOUT: + if (clnt->cl_softrtry) + break; + case -NFS4ERR_DELAY: case -EAGAIN: ssleep(1); case -NFS4ERR_STALE_CLIENTID: diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 72f3bf1754ef..73ce8d4fe2c8 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -203,8 +203,7 @@ static int nfs4_stat_to_errno(int); 2 + encode_verifier_maxsz + 5 + \ nfs4_label_maxsz) #define decode_readdir_maxsz (op_decode_hdr_maxsz + \ - decode_verifier_maxsz + \ - nfs4_label_maxsz + nfs4_fattr_maxsz) + decode_verifier_maxsz) #define encode_readlink_maxsz (op_encode_hdr_maxsz) #define decode_readlink_maxsz (op_decode_hdr_maxsz + 1) #define encode_write_maxsz (op_encode_hdr_maxsz + \ diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 4755858e37a0..cb53d450ae32 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -662,7 +662,18 @@ pnfs_destroy_all_layouts(struct nfs_client *clp) */ static bool pnfs_seqid_is_newer(u32 s1, u32 s2) { - return (s32)s1 - (s32)s2 > 0; + return (s32)(s1 - s2) > 0; +} + +static void +pnfs_verify_layout_stateid(struct pnfs_layout_hdr *lo, + const nfs4_stateid *new, + struct list_head *free_me_list) +{ + if (nfs4_stateid_match_other(&lo->plh_stateid, new)) + return; + /* Layout is new! Kill existing layout segments */ + pnfs_mark_matching_lsegs_invalid(lo, free_me_list, NULL); } /* update lo->plh_stateid with new if is more recent */ @@ -1315,6 +1326,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) struct nfs4_layoutget_res *res = &lgp->res; struct pnfs_layout_segment *lseg; struct inode *ino = lo->plh_inode; + LIST_HEAD(free_me); int status = 0; /* Inject layout blob into I/O device driver */ @@ -1341,6 +1353,8 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) goto out_forget_reply; } + /* Check that the new stateid matches the old stateid */ + pnfs_verify_layout_stateid(lo, &res->stateid, &free_me); /* Done processing layoutget. Set the layout stateid */ pnfs_set_layout_stateid(lo, &res->stateid, false); @@ -1355,6 +1369,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) } spin_unlock(&ino->i_lock); + pnfs_free_lseg_list(&free_me); return lseg; out: return ERR_PTR(status); diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index fddbba2d9eff..e55ce9e8b034 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -357,30 +357,6 @@ nfs_proc_rename_done(struct rpc_task *task, struct inode *old_dir, } static int -nfs_proc_rename(struct inode *old_dir, struct qstr *old_name, - struct inode *new_dir, struct qstr *new_name) -{ - struct nfs_renameargs arg = { - .old_dir = NFS_FH(old_dir), - .old_name = old_name, - .new_dir = NFS_FH(new_dir), - .new_name = new_name, - }; - struct rpc_message msg = { - .rpc_proc = &nfs_procedures[NFSPROC_RENAME], - .rpc_argp = &arg, - }; - int status; - - dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name); - status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0); - nfs_mark_for_revalidate(old_dir); - nfs_mark_for_revalidate(new_dir); - dprintk("NFS reply rename: %d\n", status); - return status; -} - -static int nfs_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) { struct nfs_linkargs arg = { @@ -745,7 +721,6 @@ const struct nfs_rpc_ops nfs_v2_clientops = { .unlink_setup = nfs_proc_unlink_setup, .unlink_rpc_prepare = nfs_proc_unlink_rpc_prepare, .unlink_done = nfs_proc_unlink_done, - .rename = nfs_proc_rename, .rename_setup = nfs_proc_rename_setup, .rename_rpc_prepare = nfs_proc_rename_rpc_prepare, .rename_done = nfs_proc_rename_done, diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 11d78944de79..de54129336c6 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -14,6 +14,7 @@ #include <linux/sched.h> #include <linux/wait.h> #include <linux/namei.h> +#include <linux/fsnotify.h> #include "internal.h" #include "nfs4_fs.h" @@ -353,8 +354,8 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata) return; } - if (task->tk_status != 0) - nfs_cancel_async_unlink(old_dentry); + if (data->complete) + data->complete(task, data); } /** @@ -399,9 +400,10 @@ static const struct rpc_call_ops nfs_rename_ops = { * * It's expected that valid references to the dentries and inodes are held */ -static struct rpc_task * +struct rpc_task * nfs_async_rename(struct inode *old_dir, struct inode *new_dir, - struct dentry *old_dentry, struct dentry *new_dentry) + struct dentry *old_dentry, struct dentry *new_dentry, + void (*complete)(struct rpc_task *, struct nfs_renamedata *)) { struct nfs_renamedata *data; struct rpc_message msg = { }; @@ -438,6 +440,7 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir, data->new_dentry = dget(new_dentry); nfs_fattr_init(&data->old_fattr); nfs_fattr_init(&data->new_fattr); + data->complete = complete; /* set up nfs_renameargs */ data->args.old_dir = NFS_FH(old_dir); @@ -456,6 +459,27 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir, return rpc_run_task(&task_setup_data); } +/* + * Perform tasks needed when a sillyrename is done such as cancelling the + * queued async unlink if it failed. + */ +static void +nfs_complete_sillyrename(struct rpc_task *task, struct nfs_renamedata *data) +{ + struct dentry *dentry = data->old_dentry; + + if (task->tk_status != 0) { + nfs_cancel_async_unlink(dentry); + return; + } + + /* + * vfs_unlink and the like do not issue this when a file is + * sillyrenamed, so do it here. + */ + fsnotify_nameremove(dentry, 0); +} + #define SILLYNAME_PREFIX ".nfs" #define SILLYNAME_PREFIX_LEN ((unsigned)sizeof(SILLYNAME_PREFIX) - 1) #define SILLYNAME_FILEID_LEN ((unsigned)sizeof(u64) << 1) @@ -548,7 +572,8 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) } /* run the rename task, undo unlink if it fails */ - task = nfs_async_rename(dir, dir, dentry, sdentry); + task = nfs_async_rename(dir, dir, dentry, sdentry, + nfs_complete_sillyrename); if (IS_ERR(task)) { error = -EBUSY; nfs_cancel_async_unlink(dentry); diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h index a812fd1b92a4..b481e1f5eecc 100644 --- a/fs/nfsd/acl.h +++ b/fs/nfsd/acl.h @@ -39,9 +39,13 @@ struct nfs4_acl; struct svc_fh; struct svc_rqst; -/* Maximum ACL we'll accept from client; chosen (somewhat arbitrarily) to - * fit in a page: */ -#define NFS4_ACL_MAX 170 +/* + * Maximum ACL we'll accept from a client; chosen (somewhat + * arbitrarily) so that kmalloc'ing the ACL shouldn't require a + * high-order allocation. This allows 204 ACEs on x86_64: + */ +#define NFS4_ACL_MAX ((PAGE_SIZE - sizeof(struct nfs4_acl)) \ + / sizeof(struct nfs4_ace)) struct nfs4_acl *nfs4_acl_new(int); int nfs4_acl_get_whotype(char *, u32); diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index d190e33d0ec2..6f3f392d48af 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -542,7 +542,10 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags) * up setting a 3-element effective posix ACL with all * permissions zero. */ - nace = 4 + state->users->n + state->groups->n; + if (!state->users->n && !state->groups->n) + nace = 3; + else /* Note we also include a MASK ACE in this case: */ + nace = 4 + state->users->n + state->groups->n; pacl = posix_acl_alloc(nace, GFP_KERNEL); if (!pacl) return ERR_PTR(-ENOMEM); @@ -586,9 +589,11 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags) add_to_mask(state, &state->groups->aces[i].perms); } - pace++; - pace->e_tag = ACL_MASK; - low_mode_from_nfs4(state->mask.allow, &pace->e_perm, flags); + if (!state->users->n && !state->groups->n) { + pace++; + pace->e_tag = ACL_MASK; + low_mode_from_nfs4(state->mask.allow, &pace->e_perm, flags); + } pace++; pace->e_tag = ACL_OTHER; diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 7f05cd140de3..39c8ef875f91 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -32,6 +32,7 @@ */ #include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/xprt.h> #include <linux/sunrpc/svc_xprt.h> #include <linux/slab.h> #include "nfsd.h" @@ -635,6 +636,22 @@ static struct rpc_cred *get_backchannel_cred(struct nfs4_client *clp, struct rpc } } +static struct rpc_clnt *create_backchannel_client(struct rpc_create_args *args) +{ + struct rpc_xprt *xprt; + + if (args->protocol != XPRT_TRANSPORT_BC_TCP) + return rpc_create(args); + + xprt = args->bc_xprt->xpt_bc_xprt; + if (xprt) { + xprt_get(xprt); + return rpc_create_xprt(args, xprt); + } + + return rpc_create(args); +} + static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses) { struct rpc_timeout timeparms = { @@ -674,7 +691,7 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c args.authflavor = ses->se_cb_sec.flavor; } /* Create RPC client */ - client = rpc_create(&args); + client = create_backchannel_client(&args); if (IS_ERR(client)) { dprintk("NFSD: couldn't create callback client: %ld\n", PTR_ERR(client)); diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 82189b208af3..d543222babf3 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1273,6 +1273,8 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, struct nfsd4_op *op; struct nfsd4_operation *opdesc; struct nfsd4_compound_state *cstate = &resp->cstate; + struct svc_fh *current_fh = &cstate->current_fh; + struct svc_fh *save_fh = &cstate->save_fh; int slack_bytes; u32 plen = 0; __be32 status; @@ -1288,11 +1290,11 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, resp->tag = args->tag; resp->opcnt = 0; resp->rqstp = rqstp; - resp->cstate.minorversion = args->minorversion; - resp->cstate.replay_owner = NULL; - resp->cstate.session = NULL; - fh_init(&resp->cstate.current_fh, NFS4_FHSIZE); - fh_init(&resp->cstate.save_fh, NFS4_FHSIZE); + cstate->minorversion = args->minorversion; + cstate->replay_owner = NULL; + cstate->session = NULL; + fh_init(current_fh, NFS4_FHSIZE); + fh_init(save_fh, NFS4_FHSIZE); /* * Don't use the deferral mechanism for NFSv4; compounds make it * too hard to avoid non-idempotency problems. @@ -1345,20 +1347,28 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, opdesc = OPDESC(op); - if (!cstate->current_fh.fh_dentry) { + if (!current_fh->fh_dentry) { if (!(opdesc->op_flags & ALLOWED_WITHOUT_FH)) { op->status = nfserr_nofilehandle; goto encode_op; } - } else if (cstate->current_fh.fh_export->ex_fslocs.migrated && + } else if (current_fh->fh_export->ex_fslocs.migrated && !(opdesc->op_flags & ALLOWED_ON_ABSENT_FS)) { op->status = nfserr_moved; goto encode_op; } + fh_clear_wcc(current_fh); + /* If op is non-idempotent */ if (opdesc->op_flags & OP_MODIFIES_SOMETHING) { plen = opdesc->op_rsize_bop(rqstp, op); + /* + * If there's still another operation, make sure + * we'll have space to at least encode an error: + */ + if (resp->opcnt < args->opcnt) + plen += COMPOUND_ERR_SLACK_SPACE; op->status = nfsd4_check_resp_size(resp, plen); } @@ -1377,12 +1387,12 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, clear_current_stateid(cstate); if (need_wrongsec_check(rqstp)) - op->status = check_nfsd_access(cstate->current_fh.fh_export, rqstp); + op->status = check_nfsd_access(current_fh->fh_export, rqstp); } encode_op: /* Only from SEQUENCE */ - if (resp->cstate.status == nfserr_replay_cache) { + if (cstate->status == nfserr_replay_cache) { dprintk("%s NFS4.1 replay from cache\n", __func__); status = op->status; goto out; @@ -1411,10 +1421,10 @@ encode_op: nfsd4_increment_op_stats(op->opnum); } - resp->cstate.status = status; - fh_put(&resp->cstate.current_fh); - fh_put(&resp->cstate.save_fh); - BUG_ON(resp->cstate.replay_owner); + cstate->status = status; + fh_put(current_fh); + fh_put(save_fh); + BUG_ON(cstate->replay_owner); out: /* Reset deferral mechanism for RPC deferrals */ rqstp->rq_usedeferral = 1; @@ -1523,7 +1533,8 @@ static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *o static inline u32 nfsd4_setclientid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) { - return (op_encode_hdr_size + 2 + 1024) * sizeof(__be32); + return (op_encode_hdr_size + 2 + XDR_QUADLEN(NFS4_VERIFIER_SIZE)) * + sizeof(__be32); } static inline u32 nfsd4_write_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index d5d070fbeb35..3ba65979a3cd 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1538,7 +1538,7 @@ out_err: } /* - * Cache a reply. nfsd4_check_drc_limit() has bounded the cache size. + * Cache a reply. nfsd4_check_resp_size() has bounded the cache size. */ void nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) @@ -1596,7 +1596,7 @@ nfsd4_enc_sequence_replay(struct nfsd4_compoundargs *args, * The sequence operation is not cached because we can use the slot and * session values. */ -__be32 +static __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, struct nfsd4_sequence *seq) { @@ -1605,9 +1605,8 @@ nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, dprintk("--> %s slot %p\n", __func__, slot); - /* Either returns 0 or nfserr_retry_uncached */ status = nfsd4_enc_sequence_replay(resp->rqstp->rq_argp, resp); - if (status == nfserr_retry_uncached_rep) + if (status) return status; /* The sequence operation has been encoded, cstate->datap set. */ @@ -2287,7 +2286,8 @@ out: if (!list_empty(&clp->cl_revoked)) seq->status_flags |= SEQ4_STATUS_RECALLABLE_STATE_REVOKED; out_no_session: - kfree(conn); + if (conn) + free_conn(conn); spin_unlock(&nn->client_lock); return status; out_put_session: @@ -3627,8 +3627,11 @@ static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask, return nfserr_bad_stateid; status = lookup_clientid(&stateid->si_opaque.so_clid, sessions, nn, &cl); - if (status == nfserr_stale_clientid) + if (status == nfserr_stale_clientid) { + if (sessions) + return nfserr_bad_stateid; return nfserr_stale_stateid; + } if (status) return status; *s = find_stateid_by_type(cl, stateid, typemask); @@ -5062,7 +5065,6 @@ nfs4_state_destroy_net(struct net *net) int i; struct nfs4_client *clp = NULL; struct nfsd_net *nn = net_generic(net, nfsd_net_id); - struct rb_node *node, *tmp; for (i = 0; i < CLIENT_HASH_SIZE; i++) { while (!list_empty(&nn->conf_id_hashtbl[i])) { @@ -5071,13 +5073,11 @@ nfs4_state_destroy_net(struct net *net) } } - node = rb_first(&nn->unconf_name_tree); - while (node != NULL) { - tmp = node; - node = rb_next(tmp); - clp = rb_entry(tmp, struct nfs4_client, cl_namenode); - rb_erase(tmp, &nn->unconf_name_tree); - destroy_client(clp); + for (i = 0; i < CLIENT_HASH_SIZE; i++) { + while (!list_empty(&nn->unconf_id_hashtbl[i])) { + clp = list_entry(nn->unconf_id_hashtbl[i].next, struct nfs4_client, cl_idhash); + destroy_client(clp); + } } kfree(nn->sessionid_hashtbl); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 63f2395c57ed..2723c1badd01 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -294,7 +294,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, READ32(nace); if (nace > NFS4_ACL_MAX) - return nfserr_resource; + return nfserr_fbig; *acl = nfs4_acl_new(nace); if (*acl == NULL) @@ -1222,7 +1222,6 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) } write->wr_head.iov_base = p; write->wr_head.iov_len = avail; - WARN_ON(avail != (XDR_QUADLEN(avail) << 2)); write->wr_pagelist = argp->pagelist; len = XDR_QUADLEN(write->wr_buflen) << 2; @@ -2483,6 +2482,8 @@ out_acl: goto out; } if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { + if ((buflen -= 16) < 0) + goto out_resource; WRITE32(3); WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0); WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD1); @@ -2499,8 +2500,10 @@ out: security_release_secctx(context, contextlen); #endif /* CONFIG_NFSD_V4_SECURITY_LABEL */ kfree(acl); - if (tempfh) + if (tempfh) { fh_put(tempfh); + kfree(tempfh); + } return status; out_nfserr: status = nfserrno(err); @@ -3471,6 +3474,9 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_test_stateid_id *stateid, *next; __be32 *p; + if (nfserr) + return nfserr; + RESERVE_SPACE(4 + (4 * test_stateid->ts_num_ids)); *p++ = htonl(test_stateid->ts_num_ids); @@ -3579,8 +3585,6 @@ __be32 nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 pad) return 0; session = resp->cstate.session; - if (session == NULL) - return 0; if (xb->page_len == 0) { length = (char *)resp->p - (char *)xb->head[0].iov_base + pad; @@ -3620,9 +3624,17 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) || !nfsd4_enc_ops[op->opnum]); op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u); - /* nfsd4_check_drc_limit guarantees enough room for error status */ + /* nfsd4_check_resp_size guarantees enough room for error status */ if (!op->status) op->status = nfsd4_check_resp_size(resp, 0); + if (op->status == nfserr_resource && nfsd4_has_session(&resp->cstate)) { + struct nfsd4_slot *slot = resp->cstate.slot; + + if (slot->sl_flags & NFSD4_SLOT_CACHETHIS) + op->status = nfserr_rep_too_big_to_cache; + else + op->status = nfserr_rep_too_big; + } if (so) { so->so_replay.rp_status = op->status; so->so_replay.rp_buflen = (char *)resp->p - (char *)(statp+1); @@ -3691,6 +3703,12 @@ int nfsd4_release_compoundargs(void *rq, __be32 *p, void *resp) int nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compoundargs *args) { + if (rqstp->rq_arg.head[0].iov_len % 4) { + /* client is nuts */ + dprintk("%s: compound not properly padded! (peeraddr=%pISc xid=0x%x)", + __func__, svc_addr(rqstp), be32_to_cpu(rqstp->rq_xid)); + return 0; + } args->p = p; args->end = rqstp->rq_arg.head[0].iov_base + rqstp->rq_arg.head[0].iov_len; args->pagelist = rqstp->rq_arg.pages; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 7f555179bf81..f34d9de802ab 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -699,6 +699,11 @@ static ssize_t __write_ports_addfd(char *buf, struct net *net) if (err != 0 || fd < 0) return -EINVAL; + if (svc_alien_sock(net, fd)) { + printk(KERN_ERR "%s: socket net is different to NFSd's one\n", __func__); + return -EINVAL; + } + err = nfsd_create_serv(net); if (err != 0) return err; diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 30f34ab02137..479eb681c27c 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -282,7 +282,7 @@ void nfsd_lockd_shutdown(void); * reason. */ #define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */ -#define COMPOUND_ERR_SLACK_SPACE 12 /* OP_SETATTR */ +#define COMPOUND_ERR_SLACK_SPACE 16 /* OP_SETATTR */ #define NFSD_LAUNDROMAT_MINTIMEOUT 1 /* seconds */ diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index 4775bc4896c8..ad67964d0bb1 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -133,6 +133,17 @@ fh_init(struct svc_fh *fhp, int maxsize) #ifdef CONFIG_NFSD_V3 /* + * The wcc data stored in current_fh should be cleared + * between compound ops. + */ +static inline void +fh_clear_wcc(struct svc_fh *fhp) +{ + fhp->fh_post_saved = 0; + fhp->fh_pre_saved = 0; +} + +/* * Fill in the pre_op attr for the wcc data */ static inline void @@ -152,7 +163,8 @@ fill_pre_wcc(struct svc_fh *fhp) extern void fill_post_wcc(struct svc_fh *); #else -#define fill_pre_wcc(ignored) +#define fh_clear_wcc(ignored) +#define fill_pre_wcc(ignored) #define fill_post_wcc(notused) #endif /* CONFIG_NFSD_V3 */ diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index b17d93214d01..9c769a47ac5a 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -152,7 +152,7 @@ encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, type = (stat->mode & S_IFMT); *p++ = htonl(nfs_ftypes[type >> 12]); - *p++ = htonl((u32) (stat->mode & S_IALLUGO)); + *p++ = htonl((u32) stat->mode); *p++ = htonl((u32) stat->nlink); *p++ = htonl((u32) from_kuid(&init_user_ns, stat->uid)); *p++ = htonl((u32) from_kgid(&init_user_ns, stat->gid)); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 915808b36df7..16f0673a423c 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -404,6 +404,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, umode_t ftype = 0; __be32 err; int host_err; + bool get_write_count; int size_change = 0; if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE)) @@ -411,10 +412,18 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, if (iap->ia_valid & ATTR_SIZE) ftype = S_IFREG; + /* Callers that do fh_verify should do the fh_want_write: */ + get_write_count = !fhp->fh_dentry; + /* Get inode */ err = fh_verify(rqstp, fhp, ftype, accmode); if (err) goto out; + if (get_write_count) { + host_err = fh_want_write(fhp); + if (host_err) + return nfserrno(host_err); + } dentry = fhp->fh_dentry; inode = dentry->d_inode; @@ -1706,10 +1715,10 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, dput(odentry); out_nfserr: err = nfserrno(host_err); - - /* we cannot reply on fh_unlock on the two filehandles, + /* + * We cannot rely on fh_unlock on the two filehandles, * as that would do the wrong thing if the two directories - * were the same, so again we do it by hand + * were the same, so again we do it by hand. */ fill_post_wcc(ffhp); fill_post_wcc(tfhp); diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index d278a0d03496..5ea7df305083 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -574,8 +574,6 @@ extern __be32 nfsd4_setclientid_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *, struct nfsd4_setclientid_confirm *setclientid_confirm); extern void nfsd4_store_cache_entry(struct nfsd4_compoundres *resp); -extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, - struct nfsd4_sequence *seq); extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *, struct nfsd4_exchange_id *); extern __be32 nfsd4_backchannel_ctl(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_backchannel_ctl *); diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 08fdb77852ac..f3a82fbcae02 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -134,6 +134,7 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) static const struct vm_operations_struct nilfs_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = nilfs_page_mkwrite, .remap_pages = generic_file_remap_pages, }; diff --git a/fs/ntfs/debug.c b/fs/ntfs/debug.c index 807150e2c2b9..dd6103cc93c1 100644 --- a/fs/ntfs/debug.c +++ b/fs/ntfs/debug.c @@ -18,16 +18,9 @@ * distribution in the file COPYING); if not, write to the Free Software * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ - +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "debug.h" -/* - * A static buffer to hold the error string being displayed and a spinlock - * to protect concurrent accesses to it. - */ -static char err_buf[1024]; -static DEFINE_SPINLOCK(err_buf_lock); - /** * __ntfs_warning - output a warning to the syslog * @function: name of function outputting the warning @@ -50,6 +43,7 @@ static DEFINE_SPINLOCK(err_buf_lock); void __ntfs_warning(const char *function, const struct super_block *sb, const char *fmt, ...) { + struct va_format vaf; va_list args; int flen = 0; @@ -59,17 +53,15 @@ void __ntfs_warning(const char *function, const struct super_block *sb, #endif if (function) flen = strlen(function); - spin_lock(&err_buf_lock); va_start(args, fmt); - vsnprintf(err_buf, sizeof(err_buf), fmt, args); - va_end(args); + vaf.fmt = fmt; + vaf.va = &args; if (sb) - printk(KERN_ERR "NTFS-fs warning (device %s): %s(): %s\n", - sb->s_id, flen ? function : "", err_buf); + pr_warn("(device %s): %s(): %pV\n", + sb->s_id, flen ? function : "", &vaf); else - printk(KERN_ERR "NTFS-fs warning: %s(): %s\n", - flen ? function : "", err_buf); - spin_unlock(&err_buf_lock); + pr_warn("%s(): %pV\n", flen ? function : "", &vaf); + va_end(args); } /** @@ -94,6 +86,7 @@ void __ntfs_warning(const char *function, const struct super_block *sb, void __ntfs_error(const char *function, const struct super_block *sb, const char *fmt, ...) { + struct va_format vaf; va_list args; int flen = 0; @@ -103,17 +96,15 @@ void __ntfs_error(const char *function, const struct super_block *sb, #endif if (function) flen = strlen(function); - spin_lock(&err_buf_lock); va_start(args, fmt); - vsnprintf(err_buf, sizeof(err_buf), fmt, args); - va_end(args); + vaf.fmt = fmt; + vaf.va = &args; if (sb) - printk(KERN_ERR "NTFS-fs error (device %s): %s(): %s\n", - sb->s_id, flen ? function : "", err_buf); + pr_err("(device %s): %s(): %pV\n", + sb->s_id, flen ? function : "", &vaf); else - printk(KERN_ERR "NTFS-fs error: %s(): %s\n", - flen ? function : "", err_buf); - spin_unlock(&err_buf_lock); + pr_err("%s(): %pV\n", flen ? function : "", &vaf); + va_end(args); } #ifdef DEBUG @@ -124,6 +115,7 @@ int debug_msgs = 0; void __ntfs_debug (const char *file, int line, const char *function, const char *fmt, ...) { + struct va_format vaf; va_list args; int flen = 0; @@ -131,13 +123,11 @@ void __ntfs_debug (const char *file, int line, const char *function, return; if (function) flen = strlen(function); - spin_lock(&err_buf_lock); va_start(args, fmt); - vsnprintf(err_buf, sizeof(err_buf), fmt, args); + vaf.fmt = fmt; + vaf.va = &args; + pr_debug("(%s, %d): %s(): %pV", file, line, flen ? function : "", &vaf); va_end(args); - printk(KERN_DEBUG "NTFS-fs DEBUG (%s, %d): %s(): %s\n", file, line, - flen ? function : "", err_buf); - spin_unlock(&err_buf_lock); } /* Dump a runlist. Caller has to provide synchronisation for @rl. */ @@ -149,12 +139,12 @@ void ntfs_debug_dump_runlist(const runlist_element *rl) if (!debug_msgs) return; - printk(KERN_DEBUG "NTFS-fs DEBUG: Dumping runlist (values in hex):\n"); + pr_debug("Dumping runlist (values in hex):\n"); if (!rl) { - printk(KERN_DEBUG "Run list not present.\n"); + pr_debug("Run list not present.\n"); return; } - printk(KERN_DEBUG "VCN LCN Run length\n"); + pr_debug("VCN LCN Run length\n"); for (i = 0; ; i++) { LCN lcn = (rl + i)->lcn; @@ -163,13 +153,13 @@ void ntfs_debug_dump_runlist(const runlist_element *rl) if (index > -LCN_ENOENT - 1) index = 3; - printk(KERN_DEBUG "%-16Lx %s %-16Lx%s\n", + pr_debug("%-16Lx %s %-16Lx%s\n", (long long)(rl + i)->vcn, lcn_str[index], (long long)(rl + i)->length, (rl + i)->length ? "" : " (runlist end)"); } else - printk(KERN_DEBUG "%-16Lx %-16Lx %-16Lx%s\n", + pr_debug("%-16Lx %-16Lx %-16Lx%s\n", (long long)(rl + i)->vcn, (long long)(rl + i)->lcn, (long long)(rl + i)->length, diff --git a/fs/ntfs/debug.h b/fs/ntfs/debug.h index 53c27eaf2307..61bf091e32a8 100644 --- a/fs/ntfs/debug.h +++ b/fs/ntfs/debug.h @@ -48,7 +48,12 @@ extern void ntfs_debug_dump_runlist(const runlist_element *rl); #else /* !DEBUG */ -#define ntfs_debug(f, a...) do {} while (0) +#define ntfs_debug(fmt, ...) \ +do { \ + if (0) \ + no_printk(fmt, ##__VA_ARGS__); \ +} while (0) + #define ntfs_debug_dump_runlist(rl) do {} while (0) #endif /* !DEBUG */ diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 9d8153ebacfb..f47af5e6e230 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -1704,8 +1704,6 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi) iput(bvi); skip_large_index_stuff: /* Setup the operations for this index inode. */ - vi->i_op = NULL; - vi->i_fop = NULL; vi->i_mapping->a_ops = &ntfs_mst_aops; vi->i_blocks = ni->allocated_size >> 9; /* diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index bd5610d48242..9de2491f2926 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -19,6 +19,7 @@ * distribution in the file COPYING); if not, write to the Free Software * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/stddef.h> #include <linux/init.h> @@ -1896,7 +1897,7 @@ get_ctx_vol_failed: vol->minor_ver = vi->minor_ver; ntfs_attr_put_search_ctx(ctx); unmap_mft_record(NTFS_I(vol->vol_ino)); - printk(KERN_INFO "NTFS volume version %i.%i.\n", vol->major_ver, + pr_info("volume version %i.%i.\n", vol->major_ver, vol->minor_ver); if (vol->major_ver < 3 && NVolSparseEnabled(vol)) { ntfs_warning(vol->sb, "Disabling sparse support due to NTFS " @@ -3095,7 +3096,7 @@ static int __init init_ntfs_fs(void) int err = 0; /* This may be ugly but it results in pretty output so who cares. (-8 */ - printk(KERN_INFO "NTFS driver " NTFS_VERSION " [Flags: R/" + pr_info("driver " NTFS_VERSION " [Flags: R/" #ifdef NTFS_RW "W" #else @@ -3115,16 +3116,15 @@ static int __init init_ntfs_fs(void) sizeof(ntfs_index_context), 0 /* offset */, SLAB_HWCACHE_ALIGN, NULL /* ctor */); if (!ntfs_index_ctx_cache) { - printk(KERN_CRIT "NTFS: Failed to create %s!\n", - ntfs_index_ctx_cache_name); + pr_crit("Failed to create %s!\n", ntfs_index_ctx_cache_name); goto ictx_err_out; } ntfs_attr_ctx_cache = kmem_cache_create(ntfs_attr_ctx_cache_name, sizeof(ntfs_attr_search_ctx), 0 /* offset */, SLAB_HWCACHE_ALIGN, NULL /* ctor */); if (!ntfs_attr_ctx_cache) { - printk(KERN_CRIT "NTFS: Failed to create %s!\n", - ntfs_attr_ctx_cache_name); + pr_crit("NTFS: Failed to create %s!\n", + ntfs_attr_ctx_cache_name); goto actx_err_out; } @@ -3132,8 +3132,7 @@ static int __init init_ntfs_fs(void) (NTFS_MAX_NAME_LEN+1) * sizeof(ntfschar), 0, SLAB_HWCACHE_ALIGN, NULL); if (!ntfs_name_cache) { - printk(KERN_CRIT "NTFS: Failed to create %s!\n", - ntfs_name_cache_name); + pr_crit("Failed to create %s!\n", ntfs_name_cache_name); goto name_err_out; } @@ -3141,8 +3140,7 @@ static int __init init_ntfs_fs(void) sizeof(ntfs_inode), 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); if (!ntfs_inode_cache) { - printk(KERN_CRIT "NTFS: Failed to create %s!\n", - ntfs_inode_cache_name); + pr_crit("Failed to create %s!\n", ntfs_inode_cache_name); goto inode_err_out; } @@ -3151,15 +3149,14 @@ static int __init init_ntfs_fs(void) SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, ntfs_big_inode_init_once); if (!ntfs_big_inode_cache) { - printk(KERN_CRIT "NTFS: Failed to create %s!\n", - ntfs_big_inode_cache_name); + pr_crit("Failed to create %s!\n", ntfs_big_inode_cache_name); goto big_inode_err_out; } /* Register the ntfs sysctls. */ err = ntfs_sysctl(1); if (err) { - printk(KERN_CRIT "NTFS: Failed to register NTFS sysctls!\n"); + pr_crit("Failed to register NTFS sysctls!\n"); goto sysctl_err_out; } @@ -3168,7 +3165,7 @@ static int __init init_ntfs_fs(void) ntfs_debug("NTFS driver registered successfully."); return 0; /* Success! */ } - printk(KERN_CRIT "NTFS: Failed to register NTFS filesystem driver!\n"); + pr_crit("Failed to register NTFS filesystem driver!\n"); /* Unregister the ntfs sysctls. */ ntfs_sysctl(0); @@ -3184,8 +3181,7 @@ actx_err_out: kmem_cache_destroy(ntfs_index_ctx_cache); ictx_err_out: if (!err) { - printk(KERN_CRIT "NTFS: Aborting NTFS filesystem driver " - "registration...\n"); + pr_crit("Aborting NTFS filesystem driver registration...\n"); err = -ENOMEM; } return err; diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c index a4b07730b2e1..b7f57271d49c 100644 --- a/fs/ocfs2/cluster/sys.c +++ b/fs/ocfs2/cluster/sys.c @@ -41,7 +41,7 @@ static ssize_t version_show(struct kobject *kobj, struct kobj_attribute *attr, return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION); } static struct kobj_attribute attr_version = - __ATTR(interface_revision, S_IFREG | S_IRUGO, version_show, NULL); + __ATTR(interface_revision, S_IRUGO, version_show, NULL); static struct attribute *o2cb_attrs[] = { &attr_version.attr, diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index eb649d23a4de..c6b90e670389 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -137,7 +137,7 @@ static int o2net_sys_err_translations[O2NET_ERR_MAX] = static void o2net_sc_connect_completed(struct work_struct *work); static void o2net_rx_until_empty(struct work_struct *work); static void o2net_shutdown_sc(struct work_struct *work); -static void o2net_listen_data_ready(struct sock *sk, int bytes); +static void o2net_listen_data_ready(struct sock *sk); static void o2net_sc_send_keep_req(struct work_struct *work); static void o2net_idle_timer(unsigned long data); static void o2net_sc_postpone_idle(struct o2net_sock_container *sc); @@ -597,9 +597,9 @@ static void o2net_set_nn_state(struct o2net_node *nn, } /* see o2net_register_callbacks() */ -static void o2net_data_ready(struct sock *sk, int bytes) +static void o2net_data_ready(struct sock *sk) { - void (*ready)(struct sock *sk, int bytes); + void (*ready)(struct sock *sk); read_lock(&sk->sk_callback_lock); if (sk->sk_user_data) { @@ -613,7 +613,7 @@ static void o2net_data_ready(struct sock *sk, int bytes) } read_unlock(&sk->sk_callback_lock); - ready(sk, bytes); + ready(sk); } /* see o2net_register_callbacks() */ @@ -916,57 +916,30 @@ static struct o2net_msg_handler *o2net_handler_get(u32 msg_type, u32 key) static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len) { - int ret; - mm_segment_t oldfs; - struct kvec vec = { - .iov_len = len, - .iov_base = data, - }; - struct msghdr msg = { - .msg_iovlen = 1, - .msg_iov = (struct iovec *)&vec, - .msg_flags = MSG_DONTWAIT, - }; - - oldfs = get_fs(); - set_fs(get_ds()); - ret = sock_recvmsg(sock, &msg, len, msg.msg_flags); - set_fs(oldfs); - - return ret; + struct kvec vec = { .iov_len = len, .iov_base = data, }; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT, }; + return kernel_recvmsg(sock, &msg, &vec, 1, len, msg.msg_flags); } static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec, size_t veclen, size_t total) { int ret; - mm_segment_t oldfs; - struct msghdr msg = { - .msg_iov = (struct iovec *)vec, - .msg_iovlen = veclen, - }; + struct msghdr msg; if (sock == NULL) { ret = -EINVAL; goto out; } - oldfs = get_fs(); - set_fs(get_ds()); - ret = sock_sendmsg(sock, &msg, total); - set_fs(oldfs); - if (ret != total) { - mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret, - total); - if (ret >= 0) - ret = -EPIPE; /* should be smarter, I bet */ - goto out; - } - - ret = 0; + ret = kernel_sendmsg(sock, &msg, vec, veclen, total); + if (likely(ret == total)) + return 0; + mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret, total); + if (ret >= 0) + ret = -EPIPE; /* should be smarter, I bet */ out: - if (ret < 0) - mlog(0, "returning error: %d\n", ret); + mlog(0, "returning error: %d\n", ret); return ret; } @@ -1953,9 +1926,9 @@ static void o2net_accept_many(struct work_struct *work) cond_resched(); } -static void o2net_listen_data_ready(struct sock *sk, int bytes) +static void o2net_listen_data_ready(struct sock *sk) { - void (*ready)(struct sock *sk, int bytes); + void (*ready)(struct sock *sk); read_lock(&sk->sk_callback_lock); ready = sk->sk_user_data; @@ -1978,7 +1951,6 @@ static void o2net_listen_data_ready(struct sock *sk, int bytes) */ if (sk->sk_state == TCP_LISTEN) { - mlog(ML_TCP, "bytes: %d\n", bytes); queue_work(o2net_wq, &o2net_listen_work); } else { ready = NULL; @@ -1987,7 +1959,7 @@ static void o2net_listen_data_ready(struct sock *sk, int bytes) out: read_unlock(&sk->sk_callback_lock); if (ready != NULL) - ready(sk, bytes); + ready(sk); } static int o2net_open_listening_sock(__be32 addr, __be16 port) diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index 4cbcb65784a3..dc024367110a 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -165,7 +165,7 @@ struct o2net_sock_container { /* original handlers for the sockets */ void (*sc_state_change)(struct sock *sk); - void (*sc_data_ready)(struct sock *sk, int bytes); + void (*sc_data_ready)(struct sock *sk); u32 sc_msg_key; u16 sc_msg_type; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index ff33c5ef87f2..8970dcf74de5 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2367,15 +2367,18 @@ relock: if (direct_io) { written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, - ppos, count, ocount); + count, ocount); if (written < 0) { ret = written; goto out_dio; } } else { + struct iov_iter from; + iov_iter_init(&from, iov, nr_segs, count, 0); current->backing_dev_info = file->f_mapping->backing_dev_info; - written = generic_file_buffered_write(iocb, iov, nr_segs, *ppos, - ppos, count, 0); + written = generic_perform_write(file, &from, *ppos); + if (likely(written >= 0)) + iocb->ki_pos = *ppos + written; current->backing_dev_info = NULL; } diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 5c8343fe7438..83f1a665ae97 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -496,7 +496,7 @@ static ssize_t ocfs2_max_locking_protocol_show(struct kobject *kobj, } static struct kobj_attribute ocfs2_attr_max_locking_protocol = - __ATTR(max_locking_protocol, S_IFREG | S_IRUGO, + __ATTR(max_locking_protocol, S_IRUGO, ocfs2_max_locking_protocol_show, NULL); static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj, @@ -528,7 +528,7 @@ static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj, } static struct kobj_attribute ocfs2_attr_loaded_cluster_plugins = - __ATTR(loaded_cluster_plugins, S_IFREG | S_IRUGO, + __ATTR(loaded_cluster_plugins, S_IRUGO, ocfs2_loaded_cluster_plugins_show, NULL); static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj, @@ -550,7 +550,7 @@ static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj, } static struct kobj_attribute ocfs2_attr_active_cluster_plugin = - __ATTR(active_cluster_plugin, S_IFREG | S_IRUGO, + __ATTR(active_cluster_plugin, S_IRUGO, ocfs2_active_cluster_plugin_show, NULL); static ssize_t ocfs2_cluster_stack_show(struct kobject *kobj, @@ -599,7 +599,7 @@ static ssize_t ocfs2_cluster_stack_store(struct kobject *kobj, static struct kobj_attribute ocfs2_attr_cluster_stack = - __ATTR(cluster_stack, S_IFREG | S_IRUGO | S_IWUSR, + __ATTR(cluster_stack, S_IRUGO | S_IWUSR, ocfs2_cluster_stack_show, ocfs2_cluster_stack_store); diff --git a/fs/open.c b/fs/open.c index 7b823daa6a93..9d64679cec73 100644 --- a/fs/open.c +++ b/fs/open.c @@ -652,35 +652,6 @@ out: return error; } -/* - * You have to be very careful that these write - * counts get cleaned up in error cases and - * upon __fput(). This should probably never - * be called outside of __dentry_open(). - */ -static inline int __get_file_write_access(struct inode *inode, - struct vfsmount *mnt) -{ - int error; - error = get_write_access(inode); - if (error) - return error; - /* - * Do not take mount writer counts on - * special files since no writes to - * the mount itself will occur. - */ - if (!special_file(inode->i_mode)) { - /* - * Balanced in __fput() - */ - error = __mnt_want_write(mnt); - if (error) - put_write_access(inode); - } - return error; -} - int open_check_o_direct(struct file *f) { /* NB: we're sure to have correct a_ops only after f_op->open */ @@ -705,26 +676,28 @@ static int do_dentry_open(struct file *f, f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; - if (unlikely(f->f_flags & O_PATH)) - f->f_mode = FMODE_PATH; - path_get(&f->f_path); inode = f->f_inode = f->f_path.dentry->d_inode; - if (f->f_mode & FMODE_WRITE) { - error = __get_file_write_access(inode, f->f_path.mnt); - if (error) - goto cleanup_file; - if (!special_file(inode->i_mode)) - file_take_write(f); - } - f->f_mapping = inode->i_mapping; - if (unlikely(f->f_mode & FMODE_PATH)) { + if (unlikely(f->f_flags & O_PATH)) { + f->f_mode = FMODE_PATH; f->f_op = &empty_fops; return 0; } + if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) { + error = get_write_access(inode); + if (unlikely(error)) + goto cleanup_file; + error = __mnt_want_write(f->f_path.mnt); + if (unlikely(error)) { + put_write_access(inode); + goto cleanup_file; + } + f->f_mode |= FMODE_WRITER; + } + /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */ if (S_ISREG(inode->i_mode)) f->f_mode |= FMODE_ATOMIC_POS; @@ -761,18 +734,9 @@ static int do_dentry_open(struct file *f, cleanup_all: fops_put(f->f_op); - if (f->f_mode & FMODE_WRITE) { + if (f->f_mode & FMODE_WRITER) { put_write_access(inode); - if (!special_file(inode->i_mode)) { - /* - * We don't consider this a real - * mnt_want/drop_write() pair - * because it all happenend right - * here, so just reset the state. - */ - file_reset_write(f); - __mnt_drop_write(f->f_path.mnt); - } + __mnt_drop_write(f->f_path.mnt); } cleanup_file: path_put(&f->f_path); diff --git a/fs/pipe.c b/fs/pipe.c index 78fd0d0788db..034bffac3f97 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -142,55 +142,6 @@ pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len, return 0; } -static int -pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len, - int atomic) -{ - unsigned long copy; - - while (len > 0) { - while (!iov->iov_len) - iov++; - copy = min_t(unsigned long, len, iov->iov_len); - - if (atomic) { - if (__copy_to_user_inatomic(iov->iov_base, from, copy)) - return -EFAULT; - } else { - if (copy_to_user(iov->iov_base, from, copy)) - return -EFAULT; - } - from += copy; - len -= copy; - iov->iov_base += copy; - iov->iov_len -= copy; - } - return 0; -} - -/* - * Attempt to pre-fault in the user memory, so we can use atomic copies. - * Returns the number of bytes not faulted in. - */ -static int iov_fault_in_pages_write(struct iovec *iov, unsigned long len) -{ - while (!iov->iov_len) - iov++; - - while (len > 0) { - unsigned long this_len; - - this_len = min_t(unsigned long, len, iov->iov_len); - if (fault_in_pages_writeable(iov->iov_base, this_len)) - break; - - len -= this_len; - iov++; - } - - return len; -} - /* * Pre-fault in the user memory, so we can use atomic copies. */ @@ -226,52 +177,6 @@ static void anon_pipe_buf_release(struct pipe_inode_info *pipe, } /** - * generic_pipe_buf_map - virtually map a pipe buffer - * @pipe: the pipe that the buffer belongs to - * @buf: the buffer that should be mapped - * @atomic: whether to use an atomic map - * - * Description: - * This function returns a kernel virtual address mapping for the - * pipe_buffer passed in @buf. If @atomic is set, an atomic map is provided - * and the caller has to be careful not to fault before calling - * the unmap function. - * - * Note that this function calls kmap_atomic() if @atomic != 0. - */ -void *generic_pipe_buf_map(struct pipe_inode_info *pipe, - struct pipe_buffer *buf, int atomic) -{ - if (atomic) { - buf->flags |= PIPE_BUF_FLAG_ATOMIC; - return kmap_atomic(buf->page); - } - - return kmap(buf->page); -} -EXPORT_SYMBOL(generic_pipe_buf_map); - -/** - * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer - * @pipe: the pipe that the buffer belongs to - * @buf: the buffer that should be unmapped - * @map_data: the data that the mapping function returned - * - * Description: - * This function undoes the mapping that ->map() provided. - */ -void generic_pipe_buf_unmap(struct pipe_inode_info *pipe, - struct pipe_buffer *buf, void *map_data) -{ - if (buf->flags & PIPE_BUF_FLAG_ATOMIC) { - buf->flags &= ~PIPE_BUF_FLAG_ATOMIC; - kunmap_atomic(map_data); - } else - kunmap(buf->page); -} -EXPORT_SYMBOL(generic_pipe_buf_unmap); - -/** * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer * @pipe: the pipe that the buffer belongs to * @buf: the buffer to attempt to steal @@ -351,8 +256,6 @@ EXPORT_SYMBOL(generic_pipe_buf_release); static const struct pipe_buf_operations anon_pipe_buf_ops = { .can_merge = 1, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = anon_pipe_buf_release, .steal = generic_pipe_buf_steal, @@ -361,8 +264,6 @@ static const struct pipe_buf_operations anon_pipe_buf_ops = { static const struct pipe_buf_operations packet_pipe_buf_ops = { .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = anon_pipe_buf_release, .steal = generic_pipe_buf_steal, @@ -379,12 +280,15 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov, ssize_t ret; struct iovec *iov = (struct iovec *)_iov; size_t total_len; + struct iov_iter iter; total_len = iov_length(iov, nr_segs); /* Null read succeeds. */ if (unlikely(total_len == 0)) return 0; + iov_iter_init(&iter, iov, nr_segs, total_len, 0); + do_wakeup = 0; ret = 0; __pipe_lock(pipe); @@ -394,9 +298,9 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov, int curbuf = pipe->curbuf; struct pipe_buffer *buf = pipe->bufs + curbuf; const struct pipe_buf_operations *ops = buf->ops; - void *addr; size_t chars = buf->len; - int error, atomic; + size_t written; + int error; if (chars > total_len) chars = total_len; @@ -408,21 +312,10 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov, break; } - atomic = !iov_fault_in_pages_write(iov, chars); -redo: - addr = ops->map(pipe, buf, atomic); - error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic); - ops->unmap(pipe, buf, addr); - if (unlikely(error)) { - /* - * Just retry with the slow path if we failed. - */ - if (atomic) { - atomic = 0; - goto redo; - } + written = copy_page_to_iter(buf->page, buf->offset, chars, &iter); + if (unlikely(written < chars)) { if (!ret) - ret = error; + ret = -EFAULT; break; } ret += chars; @@ -538,10 +431,16 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov, iov_fault_in_pages_read(iov, chars); redo1: - addr = ops->map(pipe, buf, atomic); + if (atomic) + addr = kmap_atomic(buf->page); + else + addr = kmap(buf->page); error = pipe_iov_copy_from_user(offset + addr, iov, chars, atomic); - ops->unmap(pipe, buf, addr); + if (atomic) + kunmap_atomic(addr); + else + kunmap(buf->page); ret = error; do_wakeup = 1; if (error) { diff --git a/fs/pnode.c b/fs/pnode.c index 88396df725b4..302bf22c4a30 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -164,46 +164,94 @@ static struct mount *propagation_next(struct mount *m, } } -/* - * return the source mount to be used for cloning - * - * @dest the current destination mount - * @last_dest the last seen destination mount - * @last_src the last seen source mount - * @type return CL_SLAVE if the new mount has to be - * cloned as a slave. - */ -static struct mount *get_source(struct mount *dest, - struct mount *last_dest, - struct mount *last_src, - int *type) +static struct mount *next_group(struct mount *m, struct mount *origin) { - struct mount *p_last_src = NULL; - struct mount *p_last_dest = NULL; - - while (last_dest != dest->mnt_master) { - p_last_dest = last_dest; - p_last_src = last_src; - last_dest = last_dest->mnt_master; - last_src = last_src->mnt_master; + while (1) { + while (1) { + struct mount *next; + if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list)) + return first_slave(m); + next = next_peer(m); + if (m->mnt_group_id == origin->mnt_group_id) { + if (next == origin) + return NULL; + } else if (m->mnt_slave.next != &next->mnt_slave) + break; + m = next; + } + /* m is the last peer */ + while (1) { + struct mount *master = m->mnt_master; + if (m->mnt_slave.next != &master->mnt_slave_list) + return next_slave(m); + m = next_peer(master); + if (master->mnt_group_id == origin->mnt_group_id) + break; + if (master->mnt_slave.next == &m->mnt_slave) + break; + m = master; + } + if (m == origin) + return NULL; } +} - if (p_last_dest) { - do { - p_last_dest = next_peer(p_last_dest); - } while (IS_MNT_NEW(p_last_dest)); - /* is that a peer of the earlier? */ - if (dest == p_last_dest) { - *type = CL_MAKE_SHARED; - return p_last_src; +/* all accesses are serialized by namespace_sem */ +static struct user_namespace *user_ns; +static struct mount *last_dest, *last_source, *dest_master; +static struct mountpoint *mp; +static struct hlist_head *list; + +static int propagate_one(struct mount *m) +{ + struct mount *child; + int type; + /* skip ones added by this propagate_mnt() */ + if (IS_MNT_NEW(m)) + return 0; + /* skip if mountpoint isn't covered by it */ + if (!is_subdir(mp->m_dentry, m->mnt.mnt_root)) + return 0; + if (m->mnt_group_id == last_dest->mnt_group_id) { + type = CL_MAKE_SHARED; + } else { + struct mount *n, *p; + for (n = m; ; n = p) { + p = n->mnt_master; + if (p == dest_master || IS_MNT_MARKED(p)) { + while (last_dest->mnt_master != p) { + last_source = last_source->mnt_master; + last_dest = last_source->mnt_parent; + } + if (n->mnt_group_id != last_dest->mnt_group_id) { + last_source = last_source->mnt_master; + last_dest = last_source->mnt_parent; + } + break; + } } + type = CL_SLAVE; + /* beginning of peer group among the slaves? */ + if (IS_MNT_SHARED(m)) + type |= CL_MAKE_SHARED; } - /* slave of the earlier, then */ - *type = CL_SLAVE; - /* beginning of peer group among the slaves? */ - if (IS_MNT_SHARED(dest)) - *type |= CL_MAKE_SHARED; - return last_src; + + /* Notice when we are propagating across user namespaces */ + if (m->mnt_ns->user_ns != user_ns) + type |= CL_UNPRIVILEGED; + child = copy_tree(last_source, last_source->mnt.mnt_root, type); + if (IS_ERR(child)) + return PTR_ERR(child); + mnt_set_mountpoint(m, mp, child); + last_dest = m; + last_source = child; + if (m->mnt_master != dest_master) { + read_seqlock_excl(&mount_lock); + SET_MNT_MARK(m->mnt_master); + read_sequnlock_excl(&mount_lock); + } + hlist_add_head(&child->mnt_hash, list); + return 0; } /* @@ -222,56 +270,48 @@ static struct mount *get_source(struct mount *dest, int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp, struct mount *source_mnt, struct hlist_head *tree_list) { - struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; - struct mount *m, *child; + struct mount *m, *n; int ret = 0; - struct mount *prev_dest_mnt = dest_mnt; - struct mount *prev_src_mnt = source_mnt; - HLIST_HEAD(tmp_list); - - for (m = propagation_next(dest_mnt, dest_mnt); m; - m = propagation_next(m, dest_mnt)) { - int type; - struct mount *source; - - if (IS_MNT_NEW(m)) - continue; - - source = get_source(m, prev_dest_mnt, prev_src_mnt, &type); - - /* Notice when we are propagating across user namespaces */ - if (m->mnt_ns->user_ns != user_ns) - type |= CL_UNPRIVILEGED; - - child = copy_tree(source, source->mnt.mnt_root, type); - if (IS_ERR(child)) { - ret = PTR_ERR(child); - tmp_list = *tree_list; - tmp_list.first->pprev = &tmp_list.first; - INIT_HLIST_HEAD(tree_list); + + /* + * we don't want to bother passing tons of arguments to + * propagate_one(); everything is serialized by namespace_sem, + * so globals will do just fine. + */ + user_ns = current->nsproxy->mnt_ns->user_ns; + last_dest = dest_mnt; + last_source = source_mnt; + mp = dest_mp; + list = tree_list; + dest_master = dest_mnt->mnt_master; + + /* all peers of dest_mnt, except dest_mnt itself */ + for (n = next_peer(dest_mnt); n != dest_mnt; n = next_peer(n)) { + ret = propagate_one(n); + if (ret) goto out; - } + } - if (is_subdir(dest_mp->m_dentry, m->mnt.mnt_root)) { - mnt_set_mountpoint(m, dest_mp, child); - hlist_add_head(&child->mnt_hash, tree_list); - } else { - /* - * This can happen if the parent mount was bind mounted - * on some subdirectory of a shared/slave mount. - */ - hlist_add_head(&child->mnt_hash, &tmp_list); - } - prev_dest_mnt = m; - prev_src_mnt = child; + /* all slave groups */ + for (m = next_group(dest_mnt, dest_mnt); m; + m = next_group(m, dest_mnt)) { + /* everything in that slave group */ + n = m; + do { + ret = propagate_one(n); + if (ret) + goto out; + n = next_peer(n); + } while (n != m); } out: - lock_mount_hash(); - while (!hlist_empty(&tmp_list)) { - child = hlist_entry(tmp_list.first, struct mount, mnt_hash); - umount_tree(child, 0); + read_seqlock_excl(&mount_lock); + hlist_for_each_entry(n, tree_list, mnt_hash) { + m = n->mnt_parent; + if (m->mnt_master != dest_mnt->mnt_master) + CLEAR_MNT_MARK(m->mnt_master); } - unlock_mount_hash(); + read_sequnlock_excl(&mount_lock); return ret; } diff --git a/fs/pnode.h b/fs/pnode.h index fc28a27fa892..4a246358b031 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -16,6 +16,9 @@ #define IS_MNT_NEW(m) (!(m)->mnt_ns) #define CLEAR_MNT_SHARED(m) ((m)->mnt.mnt_flags &= ~MNT_SHARED) #define IS_MNT_UNBINDABLE(m) ((m)->mnt.mnt_flags & MNT_UNBINDABLE) +#define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED) +#define SET_MNT_MARK(m) ((m)->mnt.mnt_flags |= MNT_MARKED) +#define CLEAR_MNT_MARK(m) ((m)->mnt.mnt_flags &= ~MNT_MARKED) #define CL_EXPIRE 0x01 #define CL_SLAVE 0x02 diff --git a/fs/proc/array.c b/fs/proc/array.c index 656e401794de..64db2bceac59 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -138,8 +138,8 @@ static const char * const task_state_array[] = { "D (disk sleep)", /* 2 */ "T (stopped)", /* 4 */ "t (tracing stop)", /* 8 */ - "Z (zombie)", /* 16 */ - "X (dead)", /* 32 */ + "X (dead)", /* 16 */ + "Z (zombie)", /* 32 */ }; static inline const char *get_task_state(struct task_struct *tsk) diff --git a/fs/proc/base.c b/fs/proc/base.c index b9760628e1fd..2d696b0c93bf 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -200,41 +200,9 @@ static int proc_root_link(struct dentry *dentry, struct path *path) return result; } -static int proc_pid_cmdline(struct task_struct *task, char * buffer) +static int proc_pid_cmdline(struct task_struct *task, char *buffer) { - int res = 0; - unsigned int len; - struct mm_struct *mm = get_task_mm(task); - if (!mm) - goto out; - if (!mm->arg_end) - goto out_mm; /* Shh! No looking before we're done */ - - len = mm->arg_end - mm->arg_start; - - if (len > PAGE_SIZE) - len = PAGE_SIZE; - - res = access_process_vm(task, mm->arg_start, buffer, len, 0); - - // If the nul at the end of args has been overwritten, then - // assume application is using setproctitle(3). - if (res > 0 && buffer[res-1] != '\0' && len < PAGE_SIZE) { - len = strnlen(buffer, res); - if (len < res) { - res = len; - } else { - len = mm->env_end - mm->env_start; - if (len > PAGE_SIZE - res) - len = PAGE_SIZE - res; - res += access_process_vm(task, mm->env_start, buffer+res, len, 0); - res = strnlen(buffer, res); - } - } -out_mm: - mmput(mm); -out: - return res; + return get_cmdline(task, buffer, PAGE_SIZE); } static int proc_pid_auxv(struct task_struct *task, char *buffer) @@ -1236,6 +1204,9 @@ static ssize_t proc_fault_inject_write(struct file * file, make_it_fail = simple_strtol(strstrip(buffer), &end, 0); if (*end) return -EINVAL; + if (make_it_fail < 0 || make_it_fail > 1) + return -EINVAL; + task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; @@ -2588,7 +2559,7 @@ static const struct pid_entry tgid_base_stuff[] = { REG("environ", S_IRUSR, proc_environ_operations), INF("auxv", S_IRUSR, proc_pid_auxv), ONE("status", S_IRUGO, proc_pid_status), - ONE("personality", S_IRUGO, proc_pid_personality), + ONE("personality", S_IRUSR, proc_pid_personality), INF("limits", S_IRUGO, proc_pid_limits), #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), @@ -2598,7 +2569,7 @@ static const struct pid_entry tgid_base_stuff[] = { #endif REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK - INF("syscall", S_IRUGO, proc_pid_syscall), + INF("syscall", S_IRUSR, proc_pid_syscall), #endif INF("cmdline", S_IRUGO, proc_pid_cmdline), ONE("stat", S_IRUGO, proc_tgid_stat), @@ -2617,7 +2588,7 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_PROC_PAGE_MONITOR REG("clear_refs", S_IWUSR, proc_clear_refs_operations), REG("smaps", S_IRUGO, proc_pid_smaps_operations), - REG("pagemap", S_IRUGO, proc_pagemap_operations), + REG("pagemap", S_IRUSR, proc_pagemap_operations), #endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), @@ -2626,7 +2597,7 @@ static const struct pid_entry tgid_base_stuff[] = { INF("wchan", S_IRUGO, proc_pid_wchan), #endif #ifdef CONFIG_STACKTRACE - ONE("stack", S_IRUGO, proc_pid_stack), + ONE("stack", S_IRUSR, proc_pid_stack), #endif #ifdef CONFIG_SCHEDSTATS INF("schedstat", S_IRUGO, proc_pid_schedstat), @@ -2927,14 +2898,14 @@ static const struct pid_entry tid_base_stuff[] = { REG("environ", S_IRUSR, proc_environ_operations), INF("auxv", S_IRUSR, proc_pid_auxv), ONE("status", S_IRUGO, proc_pid_status), - ONE("personality", S_IRUGO, proc_pid_personality), + ONE("personality", S_IRUSR, proc_pid_personality), INF("limits", S_IRUGO, proc_pid_limits), #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK - INF("syscall", S_IRUGO, proc_pid_syscall), + INF("syscall", S_IRUSR, proc_pid_syscall), #endif INF("cmdline", S_IRUGO, proc_pid_cmdline), ONE("stat", S_IRUGO, proc_tid_stat), @@ -2955,7 +2926,7 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_PROC_PAGE_MONITOR REG("clear_refs", S_IWUSR, proc_clear_refs_operations), REG("smaps", S_IRUGO, proc_tid_smaps_operations), - REG("pagemap", S_IRUGO, proc_pagemap_operations), + REG("pagemap", S_IRUSR, proc_pagemap_operations), #endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), @@ -2964,7 +2935,7 @@ static const struct pid_entry tid_base_stuff[] = { INF("wchan", S_IRUGO, proc_pid_wchan), #endif #ifdef CONFIG_STACKTRACE - ONE("stack", S_IRUGO, proc_pid_stack), + ONE("stack", S_IRUSR, proc_pid_stack), #endif #ifdef CONFIG_SCHEDSTATS INF("schedstat", S_IRUGO, proc_pid_schedstat), diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 985ea881b5bc..0788d093f5d8 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -11,6 +11,7 @@ #include <linux/proc_fs.h> +#include "../mount.h" #include "internal.h" #include "fd.h" @@ -48,8 +49,9 @@ static int seq_show(struct seq_file *m, void *v) } if (!ret) { - seq_printf(m, "pos:\t%lli\nflags:\t0%o\n", - (long long)file->f_pos, f_flags); + seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\n", + (long long)file->f_pos, f_flags, + real_mount(file->f_path.mnt)->mnt_id); if (file->f_op->show_fdinfo) ret = file->f_op->show_fdinfo(m, file); fput(file); diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 8f20e3404fd2..0adbc02d60e3 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -47,7 +47,7 @@ static void proc_evict_inode(struct inode *inode) pde_put(de); head = PROC_I(inode)->sysctl; if (head) { - rcu_assign_pointer(PROC_I(inode)->sysctl, NULL); + RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL); sysctl_head_put(head); } /* Release any associated namespace */ diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 136e548d9567..7445af0b1aa3 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -73,7 +73,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) available += pagecache; /* - * Part of the reclaimable swap consists of items that are in use, + * Part of the reclaimable slab consists of items that are in use, * and cannot be freed. Cap this estimate at the low watermark. */ available += global_page_state(NR_SLAB_RECLAIMABLE) - diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 9ae46b87470d..89026095f2b5 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -146,7 +146,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl struct task_struct *task; void *ns; char name[50]; - int len = -EACCES; + int res = -EACCES; task = get_proc_task(inode); if (!task) @@ -155,24 +155,18 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl if (!ptrace_may_access(task, PTRACE_MODE_READ)) goto out_put_task; - len = -ENOENT; + res = -ENOENT; ns = ns_ops->get(task); if (!ns) goto out_put_task; snprintf(name, sizeof(name), "%s:[%u]", ns_ops->name, ns_ops->inum(ns)); - len = strlen(name); - - if (len > buflen) - len = buflen; - if (copy_to_user(buffer, name, len)) - len = -EFAULT; - + res = readlink_copy(buffer, buflen, name); ns_ops->put(ns); out_put_task: put_task_struct(task); out: - return len; + return res; } static const struct inode_operations proc_ns_link_inode_operations = { diff --git a/fs/proc/self.c b/fs/proc/self.c index ffeb202ec942..4348bb8907c2 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -16,7 +16,7 @@ static int proc_self_readlink(struct dentry *dentry, char __user *buffer, if (!tgid) return -ENOENT; sprintf(tmp, "%d", tgid); - return vfs_readlink(dentry,buffer,buflen,tmp); + return readlink_copy(buffer, buflen, tmp); } static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index fb52b548080d..442177b1119a 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1,4 +1,5 @@ #include <linux/mm.h> +#include <linux/vmacache.h> #include <linux/hugetlb.h> #include <linux/huge_mm.h> #include <linux/mount.h> @@ -152,7 +153,7 @@ static void *m_start(struct seq_file *m, loff_t *pos) /* * We remember last_addr rather than next_addr to hit with - * mmap_cache most of the time. We have zero last_addr at + * vmacache most of the time. We have zero last_addr at * the beginning and also after lseek. We will have -1 last_addr * after the end of the vmas. */ diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 88d4585b30f1..6a8e785b29da 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -484,7 +484,6 @@ static int __init update_note_header_size_elf64(const Elf64_Ehdr *ehdr_ptr) phdr_ptr->p_memsz = real_sz; if (real_sz == 0) { pr_warn("Warning: Zero PT_NOTE entries found\n"); - return -EINVAL; } } @@ -671,7 +670,6 @@ static int __init update_note_header_size_elf32(const Elf32_Ehdr *ehdr_ptr) phdr_ptr->p_memsz = real_sz; if (real_sz == 0) { pr_warn("Warning: Zero PT_NOTE entries found\n"); - return -EINVAL; } } @@ -1118,4 +1116,3 @@ void vmcore_cleanup(void) } free_elfcorebuf(); } -EXPORT_SYMBOL_GPL(vmcore_cleanup); diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 7be26f03a3f5..1a81373947f3 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -267,6 +267,7 @@ static int mounts_open_common(struct inode *inode, struct file *file, p->root = root; p->m.poll_event = ns->event; p->show = show; + p->cached_event = ~0ULL; return 0; diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig index 880fd9884366..c51df1dd237e 100644 --- a/fs/quota/Kconfig +++ b/fs/quota/Kconfig @@ -8,9 +8,10 @@ config QUOTA help If you say Y here, you will be able to set per user limits for disk usage (also called disk quotas). Currently, it works for the - ext2, ext3, and reiserfs file system. ext3 also supports journalled - quotas for which you don't need to run quotacheck(8) after an unclean - shutdown. + ext2, ext3, ext4, jfs, ocfs2 and reiserfs file systems. + Note that gfs2 and xfs use their own quota system. + Ext3, ext4 and reiserfs also support journaled quotas for which + you don't need to run quotacheck(8) after an unclean shutdown. For further details, read the Quota mini-HOWTO, available from <http://www.tldp.org/docs.html#howto>, or the documentation provided with the quota tools. Probably the quota support is only useful for diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index 1fd2051109a3..af677353a3f5 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c @@ -125,6 +125,7 @@ int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx) int d_reclen; char *d_name; ino_t d_ino; + loff_t cur_pos = deh_offset(deh); if (!de_visible(deh)) /* it is hidden entry */ @@ -196,8 +197,9 @@ int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx) if (local_buf != small_buf) { kfree(local_buf); } - // next entry should be looked for with such offset - next_pos = deh_offset(deh) + 1; + + /* deh_offset(deh) may be invalid now. */ + next_pos = cur_pos + 1; if (item_moved(&tmp_ih, &path_to_entry)) { set_cpu_key_k_offset(&pos_key, diff --git a/fs/splice.c b/fs/splice.c index 12028fa41def..9bc07d2b53cf 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -136,8 +136,6 @@ error: const struct pipe_buf_operations page_cache_pipe_buf_ops = { .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = page_cache_pipe_buf_confirm, .release = page_cache_pipe_buf_release, .steal = page_cache_pipe_buf_steal, @@ -156,8 +154,6 @@ static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe, static const struct pipe_buf_operations user_page_pipe_buf_ops = { .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = page_cache_pipe_buf_release, .steal = user_page_pipe_buf_steal, @@ -547,8 +543,6 @@ EXPORT_SYMBOL(generic_file_splice_read); static const struct pipe_buf_operations default_pipe_buf_ops = { .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = generic_pipe_buf_release, .steal = generic_pipe_buf_steal, @@ -564,8 +558,6 @@ static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe, /* Pipe buffer operations for a socket and similar. */ const struct pipe_buf_operations nosteal_pipe_buf_ops = { .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = generic_pipe_buf_release, .steal = generic_pipe_buf_nosteal, @@ -767,13 +759,13 @@ int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, goto out; if (buf->page != page) { - char *src = buf->ops->map(pipe, buf, 1); + char *src = kmap_atomic(buf->page); char *dst = kmap_atomic(page); memcpy(dst + offset, src + buf->offset, this_len); flush_dcache_page(page); kunmap_atomic(dst); - buf->ops->unmap(pipe, buf, src); + kunmap_atomic(src); } ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len, page, fsdata); @@ -1067,9 +1059,9 @@ static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf, void *data; loff_t tmp = sd->pos; - data = buf->ops->map(pipe, buf, 0); + data = kmap(buf->page); ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp); - buf->ops->unmap(pipe, buf, data); + kunmap(buf->page); return ret; } @@ -1528,116 +1520,48 @@ static int get_iovec_page_array(const struct iovec __user *iov, static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct splice_desc *sd) { - char *src; - int ret; - - /* - * See if we can use the atomic maps, by prefaulting in the - * pages and doing an atomic copy - */ - if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) { - src = buf->ops->map(pipe, buf, 1); - ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset, - sd->len); - buf->ops->unmap(pipe, buf, src); - if (!ret) { - ret = sd->len; - goto out; - } - } - - /* - * No dice, use slow non-atomic map and copy - */ - src = buf->ops->map(pipe, buf, 0); - - ret = sd->len; - if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len)) - ret = -EFAULT; - - buf->ops->unmap(pipe, buf, src); -out: - if (ret > 0) - sd->u.userptr += ret; - return ret; + int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data); + return n == sd->len ? n : -EFAULT; } /* * For lack of a better implementation, implement vmsplice() to userspace * as a simple copy of the pipes pages to the user iov. */ -static long vmsplice_to_user(struct file *file, const struct iovec __user *iov, +static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov, unsigned long nr_segs, unsigned int flags) { struct pipe_inode_info *pipe; struct splice_desc sd; - ssize_t size; - int error; long ret; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + struct iov_iter iter; + ssize_t count = 0; pipe = get_pipe_info(file); if (!pipe) return -EBADF; - pipe_lock(pipe); - - error = ret = 0; - while (nr_segs) { - void __user *base; - size_t len; - - /* - * Get user address base and length for this iovec. - */ - error = get_user(base, &iov->iov_base); - if (unlikely(error)) - break; - error = get_user(len, &iov->iov_len); - if (unlikely(error)) - break; - - /* - * Sanity check this iovec. 0 read succeeds. - */ - if (unlikely(!len)) - break; - if (unlikely(!base)) { - error = -EFAULT; - break; - } - - if (unlikely(!access_ok(VERIFY_WRITE, base, len))) { - error = -EFAULT; - break; - } - - sd.len = 0; - sd.total_len = len; - sd.flags = flags; - sd.u.userptr = base; - sd.pos = 0; - - size = __splice_from_pipe(pipe, &sd, pipe_to_user); - if (size < 0) { - if (!ret) - ret = size; - - break; - } - - ret += size; + ret = rw_copy_check_uvector(READ, uiov, nr_segs, + ARRAY_SIZE(iovstack), iovstack, &iov); + if (ret <= 0) + return ret; - if (size < len) - break; + iov_iter_init(&iter, iov, nr_segs, count, 0); - nr_segs--; - iov++; - } + sd.len = 0; + sd.total_len = count; + sd.flags = flags; + sd.u.data = &iter; + sd.pos = 0; + pipe_lock(pipe); + ret = __splice_from_pipe(pipe, &sd, pipe_to_user); pipe_unlock(pipe); - if (!ret) - ret = error; + if (iov != iovstack) + kfree(iov); return ret; } diff --git a/fs/super.c b/fs/super.c index e9dc3c3fe159..48377f7463c0 100644 --- a/fs/super.c +++ b/fs/super.c @@ -800,7 +800,10 @@ void emergency_remount(void) static DEFINE_IDA(unnamed_dev_ida); static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */ -static int unnamed_dev_start = 0; /* don't bother trying below it */ +/* Many userspace utilities consider an FSID of 0 invalid. + * Always return at least 1 from get_anon_bdev. + */ +static int unnamed_dev_start = 1; int get_anon_bdev(dev_t *p) { diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 1b8b91b67fdb..28cc1acd5439 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -453,95 +453,3 @@ void sysfs_remove_bin_file(struct kobject *kobj, kernfs_remove_by_name(kobj->sd, attr->attr.name); } EXPORT_SYMBOL_GPL(sysfs_remove_bin_file); - -struct sysfs_schedule_callback_struct { - struct list_head workq_list; - struct kobject *kobj; - void (*func)(void *); - void *data; - struct module *owner; - struct work_struct work; -}; - -static struct workqueue_struct *sysfs_workqueue; -static DEFINE_MUTEX(sysfs_workq_mutex); -static LIST_HEAD(sysfs_workq); -static void sysfs_schedule_callback_work(struct work_struct *work) -{ - struct sysfs_schedule_callback_struct *ss = container_of(work, - struct sysfs_schedule_callback_struct, work); - - (ss->func)(ss->data); - kobject_put(ss->kobj); - module_put(ss->owner); - mutex_lock(&sysfs_workq_mutex); - list_del(&ss->workq_list); - mutex_unlock(&sysfs_workq_mutex); - kfree(ss); -} - -/** - * sysfs_schedule_callback - helper to schedule a callback for a kobject - * @kobj: object we're acting for. - * @func: callback function to invoke later. - * @data: argument to pass to @func. - * @owner: module owning the callback code - * - * sysfs attribute methods must not unregister themselves or their parent - * kobject (which would amount to the same thing). Attempts to do so will - * deadlock, since unregistration is mutually exclusive with driver - * callbacks. - * - * Instead methods can call this routine, which will attempt to allocate - * and schedule a workqueue request to call back @func with @data as its - * argument in the workqueue's process context. @kobj will be pinned - * until @func returns. - * - * Returns 0 if the request was submitted, -ENOMEM if storage could not - * be allocated, -ENODEV if a reference to @owner isn't available, - * -EAGAIN if a callback has already been scheduled for @kobj. - */ -int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *), - void *data, struct module *owner) -{ - struct sysfs_schedule_callback_struct *ss, *tmp; - - if (!try_module_get(owner)) - return -ENODEV; - - mutex_lock(&sysfs_workq_mutex); - list_for_each_entry_safe(ss, tmp, &sysfs_workq, workq_list) - if (ss->kobj == kobj) { - module_put(owner); - mutex_unlock(&sysfs_workq_mutex); - return -EAGAIN; - } - mutex_unlock(&sysfs_workq_mutex); - - if (sysfs_workqueue == NULL) { - sysfs_workqueue = create_singlethread_workqueue("sysfsd"); - if (sysfs_workqueue == NULL) { - module_put(owner); - return -ENOMEM; - } - } - - ss = kmalloc(sizeof(*ss), GFP_KERNEL); - if (!ss) { - module_put(owner); - return -ENOMEM; - } - kobject_get(kobj); - ss->kobj = kobj; - ss->func = func; - ss->data = data; - ss->owner = owner; - INIT_WORK(&ss->work, sysfs_schedule_callback_work); - INIT_LIST_HEAD(&ss->workq_list); - mutex_lock(&sysfs_workq_mutex); - list_add_tail(&ss->workq_list, &sysfs_workq); - mutex_unlock(&sysfs_workq_mutex); - queue_work(sysfs_workqueue, &ss->work); - return 0; -} -EXPORT_SYMBOL_GPL(sysfs_schedule_callback); diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 123c79b7261e..4f34dbae823d 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1538,6 +1538,7 @@ out_unlock: static const struct vm_operations_struct ubifs_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = ubifs_vm_page_mkwrite, .remap_pages = generic_file_remap_pages, }; diff --git a/fs/udf/file.c b/fs/udf/file.c index 1037637957c7..d2c170f8b035 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -171,7 +171,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, } else up_write(&iinfo->i_data_sem); - retval = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); + retval = __generic_file_aio_write(iocb, iov, nr_segs); mutex_unlock(&inode->i_mutex); if (retval > 0) { diff --git a/fs/udf/super.c b/fs/udf/super.c index 64f2b7334d08..3286db047a40 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -175,7 +175,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -static int init_inodecache(void) +static int __init init_inodecache(void) { udf_inode_cachep = kmem_cache_create("udf_inode_cache", sizeof(struct udf_inode_info), @@ -505,6 +505,7 @@ static int udf_parse_options(char *options, struct udf_options *uopt, while ((p = strsep(&options, ",")) != NULL) { substring_t args[MAX_OPT_ARGS]; int token; + unsigned n; if (!*p) continue; @@ -516,7 +517,10 @@ static int udf_parse_options(char *options, struct udf_options *uopt, case Opt_bs: if (match_int(&args[0], &option)) return 0; - uopt->blocksize = option; + n = option; + if (n != 512 && n != 1024 && n != 2048 && n != 4096) + return 0; + uopt->blocksize = n; uopt->flags |= (1 << UDF_FLAG_BLOCKSIZE_SET); break; case Opt_unhide: diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index a7ea492ae660..0ab1de4b39a5 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -38,7 +38,6 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) { struct super_block * sb; struct ufs_sb_private_info * uspi; - struct ufs_super_block_first * usb1; struct ufs_cg_private_info * ucpi; struct ufs_cylinder_group * ucg; unsigned cgno, bit, end_bit, bbase, blkmap, i; @@ -46,7 +45,6 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; - usb1 = ubh_get_usb_first(uspi); UFSD("ENTER, fragment %llu, count %u\n", (unsigned long long)fragment, count); @@ -135,7 +133,6 @@ void ufs_free_blocks(struct inode *inode, u64 fragment, unsigned count) { struct super_block * sb; struct ufs_sb_private_info * uspi; - struct ufs_super_block_first * usb1; struct ufs_cg_private_info * ucpi; struct ufs_cylinder_group * ucg; unsigned overflow, cgno, bit, end_bit, i; @@ -143,7 +140,6 @@ void ufs_free_blocks(struct inode *inode, u64 fragment, unsigned count) sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; - usb1 = ubh_get_usb_first(uspi); UFSD("ENTER, fragment %llu, count %u\n", (unsigned long long)fragment, count); @@ -499,7 +495,6 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment, { struct super_block * sb; struct ufs_sb_private_info * uspi; - struct ufs_super_block_first * usb1; struct ufs_cg_private_info * ucpi; struct ufs_cylinder_group * ucg; unsigned cgno, fragno, fragoff, count, fragsize, i; @@ -509,7 +504,6 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment, sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; - usb1 = ubh_get_usb_first (uspi); count = newcount - oldcount; cgno = ufs_dtog(uspi, fragment); @@ -577,7 +571,6 @@ static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno, { struct super_block * sb; struct ufs_sb_private_info * uspi; - struct ufs_super_block_first * usb1; struct ufs_cg_private_info * ucpi; struct ufs_cylinder_group * ucg; unsigned oldcg, i, j, k, allocsize; @@ -588,7 +581,6 @@ static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno, sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; - usb1 = ubh_get_usb_first(uspi); oldcg = cgno; /* @@ -690,7 +682,6 @@ static u64 ufs_alloccg_block(struct inode *inode, { struct super_block * sb; struct ufs_sb_private_info * uspi; - struct ufs_super_block_first * usb1; struct ufs_cylinder_group * ucg; u64 result, blkno; @@ -698,7 +689,6 @@ static u64 ufs_alloccg_block(struct inode *inode, sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; - usb1 = ubh_get_usb_first(uspi); ucg = ubh_get_ucg(UCPI_UBH(ucpi)); if (goal == 0) { @@ -794,7 +784,6 @@ static u64 ufs_bitmap_search(struct super_block *sb, 0x0, 0x2, 0x6, 0xe, 0x1e, 0x3e, 0x7e, 0xfe, 0x1fe }; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; - struct ufs_super_block_first *usb1; struct ufs_cylinder_group *ucg; unsigned start, length, loc; unsigned pos, want, blockmap, mask, end; @@ -803,7 +792,6 @@ static u64 ufs_bitmap_search(struct super_block *sb, UFSD("ENTER, cg %u, goal %llu, count %u\n", ucpi->c_cgx, (unsigned long long)goal, count); - usb1 = ubh_get_usb_first (uspi); ucg = ubh_get_ucg(UCPI_UBH(ucpi)); if (goal) diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index d0426d74817b..98f7211599ff 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -57,7 +57,6 @@ void ufs_free_inode (struct inode * inode) { struct super_block * sb; struct ufs_sb_private_info * uspi; - struct ufs_super_block_first * usb1; struct ufs_cg_private_info * ucpi; struct ufs_cylinder_group * ucg; int is_directory; @@ -67,7 +66,6 @@ void ufs_free_inode (struct inode * inode) sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; - usb1 = ubh_get_usb_first(uspi); ino = inode->i_ino; @@ -175,7 +173,6 @@ struct inode *ufs_new_inode(struct inode *dir, umode_t mode) struct super_block * sb; struct ufs_sb_info * sbi; struct ufs_sb_private_info * uspi; - struct ufs_super_block_first * usb1; struct ufs_cg_private_info * ucpi; struct ufs_cylinder_group * ucg; struct inode * inode; @@ -195,7 +192,6 @@ struct inode *ufs_new_inode(struct inode *dir, umode_t mode) ufsi = UFS_I(inode); sbi = UFS_SB(sb); uspi = sbi->s_uspi; - usb1 = ubh_get_usb_first(uspi); mutex_lock(&sbi->s_lock); diff --git a/fs/ufs/super.c b/fs/ufs/super.c index b8c6791f046f..c1183f9f69dc 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -524,11 +524,9 @@ static int ufs_read_cylinder_structures(struct super_block *sb) struct ufs_buffer_head * ubh; unsigned char * base, * space; unsigned size, blks, i; - struct ufs_super_block_third *usb3; UFSD("ENTER\n"); - usb3 = ubh_get_usb_third(uspi); /* * Read cs structures from (usually) first data block * on the device. @@ -1390,15 +1388,11 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf) struct super_block *sb = dentry->d_sb; struct ufs_sb_private_info *uspi= UFS_SB(sb)->s_uspi; unsigned flags = UFS_SB(sb)->s_flags; - struct ufs_super_block_first *usb1; - struct ufs_super_block_second *usb2; struct ufs_super_block_third *usb3; u64 id = huge_encode_dev(sb->s_bdev->bd_dev); lock_ufs(sb); - usb1 = ubh_get_usb_first(uspi); - usb2 = ubh_get_usb_second(uspi); usb3 = ubh_get_usb_third(uspi); if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) { @@ -1454,7 +1448,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -static int init_inodecache(void) +static int __init init_inodecache(void) { ufs_inode_cachep = kmem_cache_create("ufs_inode_cache", sizeof(struct ufs_inode_info), diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 75df77d09f75..0479c32c5eb1 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1344,6 +1344,14 @@ __xfs_get_blocks( /* * If this is O_DIRECT or the mpage code calling tell them how large * the mapping is, so that we can avoid repeated get_blocks calls. + * + * If the mapping spans EOF, then we have to break the mapping up as the + * mapping for blocks beyond EOF must be marked new so that sub block + * regions can be correctly zeroed. We can't do this for mappings within + * EOF unless the mapping was just allocated or is unwritten, otherwise + * the callers would overwrite existing data with zeros. Hence we have + * to split the mapping into a range up to and including EOF, and a + * second mapping for beyond EOF. */ if (direct || size > (1 << inode->i_blkbits)) { xfs_off_t mapping_size; @@ -1354,6 +1362,12 @@ __xfs_get_blocks( ASSERT(mapping_size > 0); if (mapping_size > size) mapping_size = size; + if (offset < i_size_read(inode) && + offset + mapping_size >= i_size_read(inode)) { + /* limit mapping to block that spans EOF */ + mapping_size = roundup_64(i_size_read(inode) - offset, + 1 << inode->i_blkbits); + } if (mapping_size > LONG_MAX) mapping_size = LONG_MAX; @@ -1566,6 +1580,16 @@ xfs_vm_write_failed( xfs_vm_kill_delalloc_range(inode, block_offset, block_offset + bh->b_size); + + /* + * This buffer does not contain data anymore. make sure anyone + * who finds it knows that for certain. + */ + clear_buffer_delay(bh); + clear_buffer_uptodate(bh); + clear_buffer_mapped(bh); + clear_buffer_new(bh); + clear_buffer_dirty(bh); } } @@ -1599,12 +1623,21 @@ xfs_vm_write_begin( status = __block_write_begin(page, pos, len, xfs_get_blocks); if (unlikely(status)) { struct inode *inode = mapping->host; + size_t isize = i_size_read(inode); xfs_vm_write_failed(inode, page, pos, len); unlock_page(page); - if (pos + len > i_size_read(inode)) - truncate_pagecache(inode, i_size_read(inode)); + /* + * If the write is beyond EOF, we only want to kill blocks + * allocated in this write, not blocks that were previously + * written successfully. + */ + if (pos + len > isize) { + ssize_t start = max_t(ssize_t, pos, isize); + + truncate_pagecache_range(inode, start, pos + len); + } page_cache_release(page); page = NULL; @@ -1615,9 +1648,12 @@ xfs_vm_write_begin( } /* - * On failure, we only need to kill delalloc blocks beyond EOF because they - * will never be written. For blocks within EOF, generic_write_end() zeros them - * so they are safe to leave alone and be written with all the other valid data. + * On failure, we only need to kill delalloc blocks beyond EOF in the range of + * this specific write because they will never be written. Previous writes + * beyond EOF where block allocation succeeded do not need to be trashed, so + * only new blocks from this write should be trashed. For blocks within + * EOF, generic_write_end() zeros them so they are safe to leave alone and be + * written with all the other valid data. */ STATIC int xfs_vm_write_end( @@ -1640,8 +1676,11 @@ xfs_vm_write_end( loff_t to = pos + len; if (to > isize) { - truncate_pagecache(inode, isize); + /* only kill blocks in this write beyond EOF */ + if (pos > isize) + isize = pos; xfs_vm_kill_delalloc_range(inode, isize, to); + truncate_pagecache_range(inode, isize, to); } } return ret; diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 5b6092ef51ef..f0efc7e970ef 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -5413,6 +5413,7 @@ xfs_bmap_shift_extents( int whichfork = XFS_DATA_FORK; int logflags; xfs_filblks_t blockcount = 0; + int total_extents; if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && @@ -5429,7 +5430,6 @@ xfs_bmap_shift_extents( ASSERT(current_ext != NULL); ifp = XFS_IFORK_PTR(ip, whichfork); - if (!(ifp->if_flags & XFS_IFEXTENTS)) { /* Read in all the extents */ error = xfs_iread_extents(tp, ip, whichfork); @@ -5456,7 +5456,6 @@ xfs_bmap_shift_extents( /* We are going to change core inode */ logflags = XFS_ILOG_CORE; - if (ifp->if_flags & XFS_IFBROOT) { cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); cur->bc_private.b.firstblock = *firstblock; @@ -5467,8 +5466,14 @@ xfs_bmap_shift_extents( logflags |= XFS_ILOG_DEXT; } - while (nexts++ < num_exts && - *current_ext < XFS_IFORK_NEXTENTS(ip, whichfork)) { + /* + * There may be delalloc extents in the data fork before the range we + * are collapsing out, so we cannot + * use the count of real extents here. Instead we have to calculate it + * from the incore fork. + */ + total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + while (nexts++ < num_exts && *current_ext < total_extents) { gotp = xfs_iext_get_ext(ifp, *current_ext); xfs_bmbt_get_all(gotp, &got); @@ -5556,10 +5561,11 @@ xfs_bmap_shift_extents( } (*current_ext)++; + total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); } /* Check if we are done */ - if (*current_ext == XFS_IFORK_NEXTENTS(ip, whichfork)) + if (*current_ext == total_extents) *done = 1; del_cursor: @@ -5568,6 +5574,5 @@ del_cursor: error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); xfs_trans_log_inode(tp, ip, logflags); - return error; } diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 01f6a646caa1..296160b8e78c 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1418,6 +1418,8 @@ xfs_zero_file_space( xfs_off_t end_boundary; int error; + trace_xfs_zero_file_space(ip); + granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); /* @@ -1432,9 +1434,18 @@ xfs_zero_file_space( ASSERT(end_boundary <= offset + len); if (start_boundary < end_boundary - 1) { - /* punch out the page cache over the conversion range */ + /* + * punch out delayed allocation blocks and the page cache over + * the conversion range + */ + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_bmap_punch_delalloc_range(ip, + XFS_B_TO_FSBT(mp, start_boundary), + XFS_B_TO_FSB(mp, end_boundary - start_boundary)); + xfs_iunlock(ip, XFS_ILOCK_EXCL); truncate_pagecache_range(VFS_I(ip), start_boundary, end_boundary - 1); + /* convert the blocks */ error = xfs_alloc_file_space(ip, start_boundary, end_boundary - start_boundary - 1, diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 107f2fdfe41f..cb10a0aaab3a 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1372,21 +1372,29 @@ xfs_buf_iorequest( xfs_buf_wait_unpin(bp); xfs_buf_hold(bp); - /* Set the count to 1 initially, this will stop an I/O + /* + * Set the count to 1 initially, this will stop an I/O * completion callout which happens before we have started * all the I/O from calling xfs_buf_ioend too early. */ atomic_set(&bp->b_io_remaining, 1); _xfs_buf_ioapply(bp); - _xfs_buf_ioend(bp, 1); + /* + * If _xfs_buf_ioapply failed, we'll get back here with + * only the reference we took above. _xfs_buf_ioend will + * drop it to zero, so we'd better not queue it for later, + * or we'll free it before it's done. + */ + _xfs_buf_ioend(bp, bp->b_error ? 0 : 1); xfs_buf_rele(bp); } /* * Waits for I/O to complete on the buffer supplied. It returns immediately if - * no I/O is pending or there is already a pending error on the buffer. It - * returns the I/O error code, if any, or 0 if there was no error. + * no I/O is pending or there is already a pending error on the buffer, in which + * case nothing will ever complete. It returns the I/O error code, if any, or + * 0 if there was no error. */ int xfs_buf_iowait( diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 3cb528c4f27c..951a2321ee01 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -679,7 +679,7 @@ xfs_file_dio_aio_write( goto out; if (mapping->nrpages) { - ret = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, pos, -1); if (ret) goto out; @@ -699,7 +699,7 @@ xfs_file_dio_aio_write( trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); ret = generic_file_direct_write(iocb, iovp, - &nr_segs, pos, &iocb->ki_pos, count, ocount); + &nr_segs, pos, count, ocount); out: xfs_rw_iunlock(ip, iolock); @@ -715,7 +715,7 @@ xfs_file_buffered_aio_write( const struct iovec *iovp, unsigned long nr_segs, loff_t pos, - size_t ocount) + size_t count) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -724,7 +724,7 @@ xfs_file_buffered_aio_write( ssize_t ret; int enospc = 0; int iolock = XFS_IOLOCK_EXCL; - size_t count = ocount; + struct iov_iter from; xfs_rw_ilock(ip, iolock); @@ -732,14 +732,15 @@ xfs_file_buffered_aio_write( if (ret) goto out; + iov_iter_init(&from, iovp, nr_segs, count, 0); /* We can write back this queue in page reclaim */ current->backing_dev_info = mapping->backing_dev_info; write_retry: trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0); - ret = generic_file_buffered_write(iocb, iovp, nr_segs, - pos, &iocb->ki_pos, count, 0); - + ret = generic_perform_write(file, &from, pos); + if (likely(ret >= 0)) + iocb->ki_pos = pos + ret; /* * If we just got an ENOSPC, try to write back all dirty inodes to * convert delalloc space to free up some of the excess reserved @@ -1491,6 +1492,7 @@ const struct file_operations xfs_dir_file_operations = { static const struct vm_operations_struct xfs_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = xfs_vm_page_mkwrite, .remap_pages = generic_file_remap_pages, }; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 5e7a38fa6ee6..768087bedbac 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1334,7 +1334,8 @@ int xfs_create_tmpfile( struct xfs_inode *dp, struct dentry *dentry, - umode_t mode) + umode_t mode, + struct xfs_inode **ipp) { struct xfs_mount *mp = dp->i_mount; struct xfs_inode *ip = NULL; @@ -1402,7 +1403,6 @@ xfs_create_tmpfile( xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); ip->i_d.di_nlink--; - d_tmpfile(dentry, VFS_I(ip)); error = xfs_iunlink(tp, ip); if (error) goto out_trans_abort; @@ -1415,6 +1415,7 @@ xfs_create_tmpfile( xfs_qm_dqrele(gdqp); xfs_qm_dqrele(pdqp); + *ipp = ip; return 0; out_trans_abort: diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 396cc1fafd0d..f2fcde52b66d 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -334,7 +334,7 @@ int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, int xfs_create(struct xfs_inode *dp, struct xfs_name *name, umode_t mode, xfs_dev_t rdev, struct xfs_inode **ipp); int xfs_create_tmpfile(struct xfs_inode *dp, struct dentry *dentry, - umode_t mode); + umode_t mode, struct xfs_inode **ipp); int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode *ip); int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index bcfe61202115..0b18776b075e 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -271,32 +271,6 @@ xfs_open_by_handle( return error; } -/* - * This is a copy from fs/namei.c:vfs_readlink(), except for removing it's - * unused first argument. - */ -STATIC int -do_readlink( - char __user *buffer, - int buflen, - const char *link) -{ - int len; - - len = PTR_ERR(link); - if (IS_ERR(link)) - goto out; - - len = strlen(link); - if (len > (unsigned) buflen) - len = buflen; - if (copy_to_user(buffer, link, len)) - len = -EFAULT; - out: - return len; -} - - int xfs_readlink_by_handle( struct file *parfilp, @@ -334,7 +308,7 @@ xfs_readlink_by_handle( error = -xfs_readlink(XFS_I(dentry->d_inode), link); if (error) goto out_kfree; - error = do_readlink(hreq->ohandle, olen, link); + error = readlink_copy(hreq->ohandle, olen, link); if (error) goto out_kfree; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 89b07e43ca28..ef1ca010f417 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1053,11 +1053,25 @@ xfs_vn_tmpfile( struct dentry *dentry, umode_t mode) { - int error; + int error; + struct xfs_inode *ip; + struct inode *inode; - error = xfs_create_tmpfile(XFS_I(dir), dentry, mode); + error = xfs_create_tmpfile(XFS_I(dir), dentry, mode, &ip); + if (unlikely(error)) + return -error; - return -error; + inode = VFS_I(ip); + + error = xfs_init_security(inode, dir, &dentry->d_name); + if (unlikely(error)) { + iput(inode); + return -error; + } + + d_tmpfile(dentry, inode); + + return 0; } static const struct inode_operations xfs_inode_operations = { diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 8497a00e399d..08624dc67317 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1181,11 +1181,14 @@ xlog_iodone(xfs_buf_t *bp) /* log I/O is always issued ASYNC */ ASSERT(XFS_BUF_ISASYNC(bp)); xlog_state_done_syncing(iclog, aborted); + /* - * do not reference the buffer (bp) here as we could race - * with it being freed after writing the unmount record to the - * log. + * drop the buffer lock now that we are done. Nothing references + * the buffer after this, so an unmount waiting on this lock can now + * tear it down safely. As such, it is unsafe to reference the buffer + * (bp) after the unlock as we could race with it being freed. */ + xfs_buf_unlock(bp); } /* @@ -1368,8 +1371,16 @@ xlog_alloc_log( bp = xfs_buf_alloc(mp->m_logdev_targp, 0, BTOBB(log->l_iclog_size), 0); if (!bp) goto out_free_log; - bp->b_iodone = xlog_iodone; + + /* + * The iclogbuf buffer locks are held over IO but we are not going to do + * IO yet. Hence unlock the buffer so that the log IO path can grab it + * when appropriately. + */ ASSERT(xfs_buf_islocked(bp)); + xfs_buf_unlock(bp); + + bp->b_iodone = xlog_iodone; log->l_xbuf = bp; spin_lock_init(&log->l_icloglock); @@ -1398,6 +1409,9 @@ xlog_alloc_log( if (!bp) goto out_free_iclog; + ASSERT(xfs_buf_islocked(bp)); + xfs_buf_unlock(bp); + bp->b_iodone = xlog_iodone; iclog->ic_bp = bp; iclog->ic_data = bp->b_addr; @@ -1422,7 +1436,6 @@ xlog_alloc_log( iclog->ic_callback_tail = &(iclog->ic_callback); iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize; - ASSERT(xfs_buf_islocked(iclog->ic_bp)); init_waitqueue_head(&iclog->ic_force_wait); init_waitqueue_head(&iclog->ic_write_wait); @@ -1631,6 +1644,12 @@ xlog_cksum( * we transition the iclogs to IOERROR state *after* flushing all existing * iclogs to disk. This is because we don't want anymore new transactions to be * started or completed afterwards. + * + * We lock the iclogbufs here so that we can serialise against IO completion + * during unmount. We might be processing a shutdown triggered during unmount, + * and that can occur asynchronously to the unmount thread, and hence we need to + * ensure that completes before tearing down the iclogbufs. Hence we need to + * hold the buffer lock across the log IO to acheive that. */ STATIC int xlog_bdstrat( @@ -1638,6 +1657,7 @@ xlog_bdstrat( { struct xlog_in_core *iclog = bp->b_fspriv; + xfs_buf_lock(bp); if (iclog->ic_state & XLOG_STATE_IOERROR) { xfs_buf_ioerror(bp, EIO); xfs_buf_stale(bp); @@ -1645,7 +1665,8 @@ xlog_bdstrat( /* * It would seem logical to return EIO here, but we rely on * the log state machine to propagate I/O errors instead of - * doing it here. + * doing it here. Similarly, IO completion will unlock the + * buffer, so we don't do it here. */ return 0; } @@ -1847,14 +1868,28 @@ xlog_dealloc_log( xlog_cil_destroy(log); /* - * always need to ensure that the extra buffer does not point to memory - * owned by another log buffer before we free it. + * Cycle all the iclogbuf locks to make sure all log IO completion + * is done before we tear down these buffers. */ + iclog = log->l_iclog; + for (i = 0; i < log->l_iclog_bufs; i++) { + xfs_buf_lock(iclog->ic_bp); + xfs_buf_unlock(iclog->ic_bp); + iclog = iclog->ic_next; + } + + /* + * Always need to ensure that the extra buffer does not point to memory + * owned by another log buffer before we free it. Also, cycle the lock + * first to ensure we've completed IO on it. + */ + xfs_buf_lock(log->l_xbuf); + xfs_buf_unlock(log->l_xbuf); xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size)); xfs_buf_free(log->l_xbuf); iclog = log->l_iclog; - for (i=0; i<log->l_iclog_bufs; i++) { + for (i = 0; i < log->l_iclog_bufs; i++) { xfs_buf_free(iclog->ic_bp); next_iclog = iclog->ic_next; kmem_free(iclog); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index a4ae41c179a8..65d8c793a25c 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -603,6 +603,7 @@ DEFINE_INODE_EVENT(xfs_readlink); DEFINE_INODE_EVENT(xfs_inactive_symlink); DEFINE_INODE_EVENT(xfs_alloc_file_space); DEFINE_INODE_EVENT(xfs_free_file_space); +DEFINE_INODE_EVENT(xfs_zero_file_space); DEFINE_INODE_EVENT(xfs_collapse_file_space); DEFINE_INODE_EVENT(xfs_readdir); #ifdef CONFIG_XFS_POSIX_ACL |