diff options
Diffstat (limited to 'fs')
68 files changed, 1016 insertions, 582 deletions
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index d5c1401f0031..d34896cfb19f 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -980,19 +980,11 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp, } } -static DEFINE_MUTEX(autofs4_ioctl_mutex); - static long autofs4_root_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { - long ret; struct inode *inode = filp->f_dentry->d_inode; - - mutex_lock(&autofs4_ioctl_mutex); - ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg); - mutex_unlock(&autofs4_ioctl_mutex); - - return ret; + return autofs4_root_ioctl_unlocked(inode, filp, cmd, arg); } #ifdef CONFIG_COMPAT @@ -1002,13 +994,11 @@ static long autofs4_root_compat_ioctl(struct file *filp, struct inode *inode = filp->f_path.dentry->d_inode; int ret; - mutex_lock(&autofs4_ioctl_mutex); if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL) ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg); else ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, (unsigned long)compat_ptr(arg)); - mutex_unlock(&autofs4_ioctl_mutex); return ret; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c547cca26a26..51d2e4de34eb 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -696,6 +696,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, __btree_submit_bio_done); } +#ifdef CONFIG_MIGRATION static int btree_migratepage(struct address_space *mapping, struct page *newpage, struct page *page) { @@ -712,12 +713,9 @@ static int btree_migratepage(struct address_space *mapping, if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return -EAGAIN; -#ifdef CONFIG_MIGRATION return migrate_page(mapping, newpage, page); -#else - return -ENOSYS; -#endif } +#endif static int btree_writepage(struct page *page, struct writeback_control *wbc) { @@ -1009,7 +1007,10 @@ static int find_and_setup_root(struct btrfs_root *tree_root, blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), blocksize, generation); - BUG_ON(!root->node); + if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) { + free_extent_buffer(root->node); + return -EIO; + } root->commit_root = btrfs_root_node(root); return 0; } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index bcd59c7dfb57..227e5815d838 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -429,6 +429,7 @@ err: static int cache_block_group(struct btrfs_block_group_cache *cache, struct btrfs_trans_handle *trans, + struct btrfs_root *root, int load_cache_only) { struct btrfs_fs_info *fs_info = cache->fs_info; @@ -442,9 +443,12 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, /* * We can't do the read from on-disk cache during a commit since we need - * to have the normal tree locking. + * to have the normal tree locking. Also if we are currently trying to + * allocate blocks for the tree root we can't do the fast caching since + * we likely hold important locks. */ - if (!trans->transaction->in_commit) { + if (!trans->transaction->in_commit && + (root && root != root->fs_info->tree_root)) { spin_lock(&cache->lock); if (cache->cached != BTRFS_CACHE_NO) { spin_unlock(&cache->lock); @@ -2741,6 +2745,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group, struct btrfs_root *root = block_group->fs_info->tree_root; struct inode *inode = NULL; u64 alloc_hint = 0; + int dcs = BTRFS_DC_ERROR; int num_pages = 0; int retries = 0; int ret = 0; @@ -2795,6 +2800,8 @@ again: spin_lock(&block_group->lock); if (block_group->cached != BTRFS_CACHE_FINISHED) { + /* We're not cached, don't bother trying to write stuff out */ + dcs = BTRFS_DC_WRITTEN; spin_unlock(&block_group->lock); goto out_put; } @@ -2821,6 +2828,8 @@ again: ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, num_pages, num_pages, &alloc_hint); + if (!ret) + dcs = BTRFS_DC_SETUP; btrfs_free_reserved_data_space(inode, num_pages); out_put: iput(inode); @@ -2828,10 +2837,7 @@ out_free: btrfs_release_path(root, path); out: spin_lock(&block_group->lock); - if (ret) - block_group->disk_cache_state = BTRFS_DC_ERROR; - else - block_group->disk_cache_state = BTRFS_DC_SETUP; + block_group->disk_cache_state = dcs; spin_unlock(&block_group->lock); return ret; @@ -3037,7 +3043,13 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) { - u64 num_devices = root->fs_info->fs_devices->rw_devices; + /* + * we add in the count of missing devices because we want + * to make sure that any RAID levels on a degraded FS + * continue to be honored. + */ + u64 num_devices = root->fs_info->fs_devices->rw_devices + + root->fs_info->fs_devices->missing_devices; if (num_devices == 1) flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); @@ -4080,7 +4092,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, * space back to the block group, otherwise we will leak space. */ if (!alloc && cache->cached == BTRFS_CACHE_NO) - cache_block_group(cache, trans, 1); + cache_block_group(cache, trans, NULL, 1); byte_in_group = bytenr - cache->key.objectid; WARN_ON(byte_in_group > cache->key.offset); @@ -4930,11 +4942,31 @@ search: btrfs_get_block_group(block_group); search_start = block_group->key.objectid; + /* + * this can happen if we end up cycling through all the + * raid types, but we want to make sure we only allocate + * for the proper type. + */ + if (!block_group_bits(block_group, data)) { + u64 extra = BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10; + + /* + * if they asked for extra copies and this block group + * doesn't provide them, bail. This does allow us to + * fill raid0 from raid1. + */ + if ((data & extra) && !(block_group->flags & extra)) + goto loop; + } + have_block_group: if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { u64 free_percent; - ret = cache_block_group(block_group, trans, 1); + ret = cache_block_group(block_group, trans, + orig_root, 1); if (block_group->cached == BTRFS_CACHE_FINISHED) goto have_block_group; @@ -4958,7 +4990,8 @@ have_block_group: if (loop > LOOP_CACHING_NOWAIT || (loop > LOOP_FIND_IDEAL && atomic_read(&space_info->caching_threads) < 2)) { - ret = cache_block_group(block_group, trans, 0); + ret = cache_block_group(block_group, trans, + orig_root, 0); BUG_ON(ret); } found_uncached_bg = true; @@ -5515,7 +5548,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, u64 num_bytes = ins->offset; block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); - cache_block_group(block_group, trans, 0); + cache_block_group(block_group, trans, NULL, 0); caching_ctl = get_caching_control(block_group); if (!caching_ctl) { @@ -6300,9 +6333,13 @@ int btrfs_drop_snapshot(struct btrfs_root *root, NULL, NULL); BUG_ON(ret < 0); if (ret > 0) { - ret = btrfs_del_orphan_item(trans, tree_root, - root->root_key.objectid); - BUG_ON(ret); + /* if we fail to delete the orphan item this time + * around, it'll get picked up the next time. + * + * The most common failure here is just -ENOENT. + */ + btrfs_del_orphan_item(trans, tree_root, + root->root_key.objectid); } } @@ -7878,7 +7915,14 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; - num_devices = root->fs_info->fs_devices->rw_devices; + /* + * we add in the count of missing devices because we want + * to make sure that any RAID levels on a degraded FS + * continue to be honored. + */ + num_devices = root->fs_info->fs_devices->rw_devices + + root->fs_info->fs_devices->missing_devices; + if (num_devices == 1) { stripped |= BTRFS_BLOCK_GROUP_DUP; stripped = flags & ~stripped; @@ -8247,7 +8291,6 @@ int btrfs_read_block_groups(struct btrfs_root *root) break; if (ret != 0) goto error; - leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); cache = kzalloc(sizeof(*cache), GFP_NOFS); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index c1faded5fca0..66836d85763b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -48,30 +48,34 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, struct page **prepared_pages, struct iov_iter *i) { - size_t copied; + size_t copied = 0; int pg = 0; int offset = pos & (PAGE_CACHE_SIZE - 1); + int total_copied = 0; while (write_bytes > 0) { size_t count = min_t(size_t, PAGE_CACHE_SIZE - offset, write_bytes); struct page *page = prepared_pages[pg]; -again: - if (unlikely(iov_iter_fault_in_readable(i, count))) - return -EFAULT; - - /* Copy data from userspace to the current page */ - copied = iov_iter_copy_from_user(page, i, offset, count); + /* + * Copy data from userspace to the current page + * + * Disable pagefault to avoid recursive lock since + * the pages are already locked + */ + pagefault_disable(); + copied = iov_iter_copy_from_user_atomic(page, i, offset, count); + pagefault_enable(); /* Flush processor's dcache for this page */ flush_dcache_page(page); iov_iter_advance(i, copied); write_bytes -= copied; + total_copied += copied; + /* Return to btrfs_file_aio_write to fault page */ if (unlikely(copied == 0)) { - count = min_t(size_t, PAGE_CACHE_SIZE - offset, - iov_iter_single_seg_count(i)); - goto again; + break; } if (unlikely(copied < PAGE_CACHE_SIZE - offset)) { @@ -81,7 +85,7 @@ again: offset = 0; } } - return 0; + return total_copied; } /* @@ -854,6 +858,8 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, unsigned long last_index; int will_write; int buffered = 0; + int copied = 0; + int dirty_pages = 0; will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) || (file->f_flags & O_DIRECT)); @@ -970,7 +976,17 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, WARN_ON(num_pages > nrptrs); memset(pages, 0, sizeof(struct page *) * nrptrs); - ret = btrfs_delalloc_reserve_space(inode, write_bytes); + /* + * Fault pages before locking them in prepare_pages + * to avoid recursive lock + */ + if (unlikely(iov_iter_fault_in_readable(&i, write_bytes))) { + ret = -EFAULT; + goto out; + } + + ret = btrfs_delalloc_reserve_space(inode, + num_pages << PAGE_CACHE_SHIFT); if (ret) goto out; @@ -978,37 +994,49 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, pos, first_index, last_index, write_bytes); if (ret) { - btrfs_delalloc_release_space(inode, write_bytes); + btrfs_delalloc_release_space(inode, + num_pages << PAGE_CACHE_SHIFT); goto out; } - ret = btrfs_copy_from_user(pos, num_pages, + copied = btrfs_copy_from_user(pos, num_pages, write_bytes, pages, &i); - if (ret == 0) { + dirty_pages = (copied + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + + if (num_pages > dirty_pages) { + if (copied > 0) + atomic_inc( + &BTRFS_I(inode)->outstanding_extents); + btrfs_delalloc_release_space(inode, + (num_pages - dirty_pages) << + PAGE_CACHE_SHIFT); + } + + if (copied > 0) { dirty_and_release_pages(NULL, root, file, pages, - num_pages, pos, write_bytes); + dirty_pages, pos, copied); } btrfs_drop_pages(pages, num_pages); - if (ret) { - btrfs_delalloc_release_space(inode, write_bytes); - goto out; - } - if (will_write) { - filemap_fdatawrite_range(inode->i_mapping, pos, - pos + write_bytes - 1); - } else { - balance_dirty_pages_ratelimited_nr(inode->i_mapping, - num_pages); - if (num_pages < - (root->leafsize >> PAGE_CACHE_SHIFT) + 1) - btrfs_btree_balance_dirty(root, 1); - btrfs_throttle(root); + if (copied > 0) { + if (will_write) { + filemap_fdatawrite_range(inode->i_mapping, pos, + pos + copied - 1); + } else { + balance_dirty_pages_ratelimited_nr( + inode->i_mapping, + dirty_pages); + if (dirty_pages < + (root->leafsize >> PAGE_CACHE_SHIFT) + 1) + btrfs_btree_balance_dirty(root, 1); + btrfs_throttle(root); + } } - pos += write_bytes; - num_written += write_bytes; + pos += copied; + num_written += copied; cond_resched(); } diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 22ee0dc2e6b8..60d684266959 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -290,7 +290,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, (unsigned long long)BTRFS_I(inode)->generation, (unsigned long long)generation, (unsigned long long)block_group->key.objectid); - goto out; + goto free_cache; } if (!num_entries) @@ -524,6 +524,12 @@ int btrfs_write_out_cache(struct btrfs_root *root, return 0; } + node = rb_first(&block_group->free_space_offset); + if (!node) { + iput(inode); + return 0; + } + last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; filemap_write_and_wait(inode->i_mapping); btrfs_wait_ordered_range(inode, inode->i_size & @@ -543,10 +549,6 @@ int btrfs_write_out_cache(struct btrfs_root *root, */ first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64); - node = rb_first(&block_group->free_space_offset); - if (!node) - goto out_free; - /* * Lock all pages first so we can lock the extent safely. * diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8039390bd6a6..72f31ecb5c90 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -495,7 +495,7 @@ again: add_async_extent(async_cow, start, num_bytes, total_compressed, pages, nr_pages_ret); - if (start + num_bytes < end && start + num_bytes < actual_end) { + if (start + num_bytes < end) { start += num_bytes; pages = NULL; cond_resched(); @@ -5712,9 +5712,9 @@ static void btrfs_end_dio_bio(struct bio *bio, int err) if (err) { printk(KERN_ERR "btrfs direct IO failed ino %lu rw %lu " - "disk_bytenr %lu len %u err no %d\n", - dip->inode->i_ino, bio->bi_rw, bio->bi_sector, - bio->bi_size, err); + "sector %#Lx len %u err no %d\n", + dip->inode->i_ino, bio->bi_rw, + (unsigned long long)bio->bi_sector, bio->bi_size, err); dip->errors = 1; /* @@ -5934,8 +5934,7 @@ free_ordered: */ if (write) { struct btrfs_ordered_extent *ordered; - ordered = btrfs_lookup_ordered_extent(inode, - dip->logical_offset); + ordered = btrfs_lookup_ordered_extent(inode, file_offset); if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) && !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) btrfs_free_reserved_extent(root, ordered->start, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f1c9bb4079ed..f87552a1d7ea 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -947,23 +947,42 @@ out: static noinline int btrfs_ioctl_snap_create(struct file *file, void __user *arg, int subvol, - int async) + int v2) { struct btrfs_ioctl_vol_args *vol_args = NULL; - struct btrfs_ioctl_async_vol_args *async_vol_args = NULL; + struct btrfs_ioctl_vol_args_v2 *vol_args_v2 = NULL; char *name; u64 fd; - u64 transid = 0; int ret; - if (async) { - async_vol_args = memdup_user(arg, sizeof(*async_vol_args)); - if (IS_ERR(async_vol_args)) - return PTR_ERR(async_vol_args); + if (v2) { + u64 transid = 0; + u64 *ptr = NULL; - name = async_vol_args->name; - fd = async_vol_args->fd; - async_vol_args->name[BTRFS_SNAPSHOT_NAME_MAX] = '\0'; + vol_args_v2 = memdup_user(arg, sizeof(*vol_args_v2)); + if (IS_ERR(vol_args_v2)) + return PTR_ERR(vol_args_v2); + + if (vol_args_v2->flags & ~BTRFS_SUBVOL_CREATE_ASYNC) { + ret = -EINVAL; + goto out; + } + + name = vol_args_v2->name; + fd = vol_args_v2->fd; + vol_args_v2->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; + + if (vol_args_v2->flags & BTRFS_SUBVOL_CREATE_ASYNC) + ptr = &transid; + + ret = btrfs_ioctl_snap_create_transid(file, name, fd, + subvol, ptr); + + if (ret == 0 && ptr && + copy_to_user(arg + + offsetof(struct btrfs_ioctl_vol_args_v2, + transid), ptr, sizeof(*ptr))) + ret = -EFAULT; } else { vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) @@ -971,20 +990,13 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, name = vol_args->name; fd = vol_args->fd; vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - } - - ret = btrfs_ioctl_snap_create_transid(file, name, fd, - subvol, &transid); - if (!ret && async) { - if (copy_to_user(arg + - offsetof(struct btrfs_ioctl_async_vol_args, - transid), &transid, sizeof(transid))) - return -EFAULT; + ret = btrfs_ioctl_snap_create_transid(file, name, fd, + subvol, NULL); } - +out: kfree(vol_args); - kfree(async_vol_args); + kfree(vol_args_v2); return ret; } @@ -2246,7 +2258,7 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_getversion(file, argp); case BTRFS_IOC_SNAP_CREATE: return btrfs_ioctl_snap_create(file, argp, 0, 0); - case BTRFS_IOC_SNAP_CREATE_ASYNC: + case BTRFS_IOC_SNAP_CREATE_V2: return btrfs_ioctl_snap_create(file, argp, 0, 1); case BTRFS_IOC_SUBVOL_CREATE: return btrfs_ioctl_snap_create(file, argp, 1, 0); diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 17c99ebdf960..c344d12c646b 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -30,11 +30,15 @@ struct btrfs_ioctl_vol_args { char name[BTRFS_PATH_NAME_MAX + 1]; }; -#define BTRFS_SNAPSHOT_NAME_MAX 4079 -struct btrfs_ioctl_async_vol_args { +#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) + +#define BTRFS_SUBVOL_NAME_MAX 4039 +struct btrfs_ioctl_vol_args_v2 { __s64 fd; __u64 transid; - char name[BTRFS_SNAPSHOT_NAME_MAX + 1]; + __u64 flags; + __u64 unused[4]; + char name[BTRFS_SUBVOL_NAME_MAX + 1]; }; #define BTRFS_INO_LOOKUP_PATH_MAX 4080 @@ -187,6 +191,6 @@ struct btrfs_ioctl_space_args { struct btrfs_ioctl_space_args) #define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64) #define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64) -#define BTRFS_IOC_SNAP_CREATE_ASYNC _IOW(BTRFS_IOCTL_MAGIC, 23, \ - struct btrfs_ioctl_async_vol_args) +#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \ + struct btrfs_ioctl_vol_args_v2) #endif diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c index 79cba5fbc28e..f8be250963a0 100644 --- a/fs/btrfs/orphan.c +++ b/fs/btrfs/orphan.c @@ -56,8 +56,12 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, return -ENOMEM; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret) + if (ret < 0) goto out; + if (ret) { + ret = -ENOENT; + goto out; + } ret = btrfs_del_item(trans, root, path); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index dbb51ea7a13c..883c6fa1367e 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -685,9 +685,9 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, mutex_unlock(&root->d_inode->i_mutex); if (IS_ERR(new_root)) { + dput(root); deactivate_locked_super(s); error = PTR_ERR(new_root); - dput(root); goto error_free_subvol_name; } if (!new_root->d_inode) { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index cc04dc1445d6..6b9884507837 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -412,12 +412,16 @@ static noinline int device_list_add(const char *path, device->fs_devices = fs_devices; fs_devices->num_devices++; - } else if (strcmp(device->name, path)) { + } else if (!device->name || strcmp(device->name, path)) { name = kstrdup(path, GFP_NOFS); if (!name) return -ENOMEM; kfree(device->name); device->name = name; + if (device->missing) { + fs_devices->missing_devices--; + device->missing = 0; + } } if (found_transid > fs_devices->latest_trans) { @@ -1236,6 +1240,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) device->fs_devices->num_devices--; + if (device->missing) + root->fs_info->fs_devices->missing_devices--; + next_device = list_entry(root->fs_info->fs_devices->devices.next, struct btrfs_device, dev_list); if (device->bdev == root->fs_info->sb->s_bdev) @@ -3080,7 +3087,9 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root, device->devid = devid; device->work.func = pending_bios_fn; device->fs_devices = fs_devices; + device->missing = 1; fs_devices->num_devices++; + fs_devices->missing_devices++; spin_lock_init(&device->io_lock); INIT_LIST_HEAD(&device->dev_alloc_list); memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE); @@ -3278,6 +3287,15 @@ static int read_one_dev(struct btrfs_root *root, device = add_missing_dev(root, devid, dev_uuid); if (!device) return -ENOMEM; + } else if (!device->missing) { + /* + * this happens when a device that was properly setup + * in the device info lists suddenly goes bad. + * device->bdev is NULL, and so we have to set + * device->missing to one here + */ + root->fs_info->fs_devices->missing_devices++; + device->missing = 1; } } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 2b638b6e4eea..2740db49eb04 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -44,6 +44,7 @@ struct btrfs_device { int writeable; int in_fs_metadata; + int missing; spinlock_t io_lock; @@ -93,6 +94,7 @@ struct btrfs_fs_devices { u64 num_devices; u64 open_devices; u64 rw_devices; + u64 missing_devices; u64 total_rw_bytes; struct block_device *latest_bdev; diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 7d447af84ec4..158c700fdca5 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -114,8 +114,8 @@ static int __dcache_readdir(struct file *filp, spin_lock(&dcache_lock); /* start at beginning? */ - if (filp->f_pos == 2 || (last && - filp->f_pos < ceph_dentry(last)->offset)) { + if (filp->f_pos == 2 || last == NULL || + filp->f_pos < ceph_dentry(last)->offset) { if (list_empty(&parent->d_subdirs)) goto out_unlock; p = parent->d_subdirs.prev; diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h index a6ce54e94eb5..52e8fd74d450 100644 --- a/fs/ceph/ioctl.h +++ b/fs/ceph/ioctl.h @@ -4,7 +4,7 @@ #include <linux/ioctl.h> #include <linux/types.h> -#define CEPH_IOCTL_MAGIC 0x98 +#define CEPH_IOCTL_MAGIC 0x97 /* just use u64 to align sanely on all archs */ struct ceph_ioctl_layout { diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 40abde93c345..476b329867d4 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -11,40 +11,68 @@ * Implement fcntl and flock locking functions. */ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, - u64 pid, u64 pid_ns, - int cmd, u64 start, u64 length, u8 wait) + int cmd, u8 wait, struct file_lock *fl) { struct inode *inode = file->f_dentry->d_inode; struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_mds_request *req; int err; + u64 length = 0; req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); req->r_inode = igrab(inode); + /* mds requires start and length rather than start and end */ + if (LLONG_MAX == fl->fl_end) + length = 0; + else + length = fl->fl_end - fl->fl_start + 1; + dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " "length: %llu, wait: %d, type`: %d", (int)lock_type, - (int)operation, pid, start, length, wait, cmd); + (int)operation, (u64)fl->fl_pid, fl->fl_start, + length, wait, fl->fl_type); + req->r_args.filelock_change.rule = lock_type; req->r_args.filelock_change.type = cmd; - req->r_args.filelock_change.pid = cpu_to_le64(pid); + req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid); /* This should be adjusted, but I'm not sure if namespaces actually get id numbers*/ req->r_args.filelock_change.pid_namespace = - cpu_to_le64((u64)pid_ns); - req->r_args.filelock_change.start = cpu_to_le64(start); + cpu_to_le64((u64)(unsigned long)fl->fl_nspid); + req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start); req->r_args.filelock_change.length = cpu_to_le64(length); req->r_args.filelock_change.wait = wait; err = ceph_mdsc_do_request(mdsc, inode, req); + + if ( operation == CEPH_MDS_OP_GETFILELOCK){ + fl->fl_pid = le64_to_cpu(req->r_reply_info.filelock_reply->pid); + if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type) + fl->fl_type = F_RDLCK; + else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type) + fl->fl_type = F_WRLCK; + else + fl->fl_type = F_UNLCK; + + fl->fl_start = le64_to_cpu(req->r_reply_info.filelock_reply->start); + length = le64_to_cpu(req->r_reply_info.filelock_reply->start) + + le64_to_cpu(req->r_reply_info.filelock_reply->length); + if (length >= 1) + fl->fl_end = length -1; + else + fl->fl_end = 0; + + } ceph_mdsc_put_request(req); dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " - "length: %llu, wait: %d, type`: %d err code %d", (int)lock_type, - (int)operation, pid, start, length, wait, cmd, err); + "length: %llu, wait: %d, type`: %d, err code %d", (int)lock_type, + (int)operation, (u64)fl->fl_pid, fl->fl_start, + length, wait, fl->fl_type, err); return err; } @@ -54,7 +82,6 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, */ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) { - u64 length; u8 lock_cmd; int err; u8 wait = 0; @@ -76,29 +103,20 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) else lock_cmd = CEPH_LOCK_UNLOCK; - if (LLONG_MAX == fl->fl_end) - length = 0; - else - length = fl->fl_end - fl->fl_start + 1; - - err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, - (u64)fl->fl_pid, - (u64)(unsigned long)fl->fl_nspid, - lock_cmd, fl->fl_start, - length, wait); + err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl); if (!err) { - dout("mds locked, locking locally"); - err = posix_lock_file(file, fl, NULL); - if (err && (CEPH_MDS_OP_SETFILELOCK == op)) { - /* undo! This should only happen if the kernel detects - * local deadlock. */ - ceph_lock_message(CEPH_LOCK_FCNTL, op, file, - (u64)fl->fl_pid, - (u64)(unsigned long)fl->fl_nspid, - CEPH_LOCK_UNLOCK, fl->fl_start, - length, 0); - dout("got %d on posix_lock_file, undid lock", err); + if ( op != CEPH_MDS_OP_GETFILELOCK ){ + dout("mds locked, locking locally"); + err = posix_lock_file(file, fl, NULL); + if (err && (CEPH_MDS_OP_SETFILELOCK == op)) { + /* undo! This should only happen if the kernel detects + * local deadlock. */ + ceph_lock_message(CEPH_LOCK_FCNTL, op, file, + CEPH_LOCK_UNLOCK, 0, fl); + dout("got %d on posix_lock_file, undid lock", err); + } } + } else { dout("mds returned error code %d", err); } @@ -107,7 +125,6 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) int ceph_flock(struct file *file, int cmd, struct file_lock *fl) { - u64 length; u8 lock_cmd; int err; u8 wait = 1; @@ -127,26 +144,15 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) lock_cmd = CEPH_LOCK_EXCL; else lock_cmd = CEPH_LOCK_UNLOCK; - /* mds requires start and length rather than start and end */ - if (LLONG_MAX == fl->fl_end) - length = 0; - else - length = fl->fl_end - fl->fl_start + 1; err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, - file, (u64)fl->fl_pid, - (u64)(unsigned long)fl->fl_nspid, - lock_cmd, fl->fl_start, - length, wait); + file, lock_cmd, wait, fl); if (!err) { err = flock_lock_file_wait(file, fl); if (err) { ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, - file, (u64)fl->fl_pid, - (u64)(unsigned long)fl->fl_nspid, - CEPH_LOCK_UNLOCK, fl->fl_start, - length, 0); + file, CEPH_LOCK_UNLOCK, 0, fl); dout("got %d on flock_lock_file_wait, undid lock", err); } } else { diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 098b18508479..38800eaa81d0 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -202,6 +202,38 @@ out_bad: } /* + * parse fcntl F_GETLK results + */ +static int parse_reply_info_filelock(void **p, void *end, + struct ceph_mds_reply_info_parsed *info) +{ + if (*p + sizeof(*info->filelock_reply) > end) + goto bad; + + info->filelock_reply = *p; + *p += sizeof(*info->filelock_reply); + + if (unlikely(*p != end)) + goto bad; + return 0; + +bad: + return -EIO; +} + +/* + * parse extra results + */ +static int parse_reply_info_extra(void **p, void *end, + struct ceph_mds_reply_info_parsed *info) +{ + if (info->head->op == CEPH_MDS_OP_GETFILELOCK) + return parse_reply_info_filelock(p, end, info); + else + return parse_reply_info_dir(p, end, info); +} + +/* * parse entire mds reply */ static int parse_reply_info(struct ceph_msg *msg, @@ -223,10 +255,10 @@ static int parse_reply_info(struct ceph_msg *msg, goto out_bad; } - /* dir content */ + /* extra */ ceph_decode_32_safe(&p, end, len, bad); if (len > 0) { - err = parse_reply_info_dir(&p, p+len, info); + err = parse_reply_info_extra(&p, p+len, info); if (err < 0) goto out_bad; } @@ -2074,7 +2106,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) mutex_lock(&session->s_mutex); if (err < 0) { - pr_err("mdsc_handle_reply got corrupt reply mds%d\n", mds); + pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid); ceph_msg_dump(msg); goto out_err; } @@ -2094,7 +2126,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) mutex_lock(&req->r_fill_mutex); err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session); if (err == 0) { - if (result == 0 && rinfo->dir_nr) + if (result == 0 && req->r_op != CEPH_MDS_OP_GETFILELOCK && + rinfo->dir_nr) ceph_readdir_prepopulate(req, req->r_session); ceph_unreserve_caps(mdsc, &req->r_caps_reservation); } diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 9341fd4f1432..aabe563b54db 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -42,26 +42,37 @@ struct ceph_mds_reply_info_in { }; /* - * parsed info about an mds reply, including information about the - * target inode and/or its parent directory and dentry, and directory - * contents (for readdir results). + * parsed info about an mds reply, including information about + * either: 1) the target inode and/or its parent directory and dentry, + * and directory contents (for readdir results), or + * 2) the file range lock info (for fcntl F_GETLK results). */ struct ceph_mds_reply_info_parsed { struct ceph_mds_reply_head *head; + /* trace */ struct ceph_mds_reply_info_in diri, targeti; struct ceph_mds_reply_dirfrag *dirfrag; char *dname; u32 dname_len; struct ceph_mds_reply_lease *dlease; - struct ceph_mds_reply_dirfrag *dir_dir; - int dir_nr; - char **dir_dname; - u32 *dir_dname_len; - struct ceph_mds_reply_lease **dir_dlease; - struct ceph_mds_reply_info_in *dir_in; - u8 dir_complete, dir_end; + /* extra */ + union { + /* for fcntl F_GETLK results */ + struct ceph_filelock *filelock_reply; + + /* for readdir results */ + struct { + struct ceph_mds_reply_dirfrag *dir_dir; + int dir_nr; + char **dir_dname; + u32 *dir_dname_len; + struct ceph_mds_reply_lease **dir_dlease; + struct ceph_mds_reply_info_in *dir_in; + u8 dir_complete, dir_end; + }; + }; /* encoded blob describing snapshot contexts for certain operations (e.g., open) */ diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index 0ed213970ced..ee45648b0d1a 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -4,6 +4,7 @@ config CIFS select NLS select CRYPTO select CRYPTO_MD5 + select CRYPTO_HMAC select CRYPTO_ARC4 help This is the client VFS module for the Common Internet File System @@ -143,6 +144,13 @@ config CIFS_FSCACHE to be cached locally on disk through the general filesystem cache manager. If unsure, say N. +config CIFS_ACL + bool "Provide CIFS ACL support (EXPERIMENTAL)" + depends on EXPERIMENTAL && CIFS_XATTR + help + Allows to fetch CIFS/NTFS ACL from the server. The DACL blob + is handed over to the application/caller. + config CIFS_EXPERIMENTAL bool "CIFS Experimental Features (EXPERIMENTAL)" depends on CIFS && EXPERIMENTAL diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile index adefa60a9bdc..43b19dd39191 100644 --- a/fs/cifs/Makefile +++ b/fs/cifs/Makefile @@ -6,7 +6,9 @@ obj-$(CONFIG_CIFS) += cifs.o cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \ link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \ md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o \ - readdir.o ioctl.o sess.o export.o cifsacl.o + readdir.o ioctl.o sess.o export.o + +cifs-$(CONFIG_CIFS_ACL) += cifsacl.o cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o diff --git a/fs/cifs/README b/fs/cifs/README index ee68d1036544..46af99ab3614 100644 --- a/fs/cifs/README +++ b/fs/cifs/README @@ -337,6 +337,15 @@ A partial list of the supported mount options follows: wsize default write size (default 57344) maximum wsize currently allowed by CIFS is 57344 (fourteen 4096 byte pages) + actimeo=n attribute cache timeout in seconds (default 1 second). + After this timeout, the cifs client requests fresh attribute + information from the server. This option allows to tune the + attribute cache timeout to suit the workload needs. Shorter + timeouts mean better the cache coherency, but increased number + of calls to the server. Longer timeouts mean reduced number + of calls to the server at the expense of less stricter cache + coherency checks (i.e. incorrect attribute cache for a short + period of time). rw mount the network share read-write (note that the server may still consider the share read-only) ro mount network share read-only diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index e9a393c9c2ca..7852cd677051 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -48,6 +48,7 @@ struct cifs_sb_info { struct nls_table *local_nls; unsigned int rsize; unsigned int wsize; + unsigned long actimeo; /* attribute cache timeout (jiffies) */ atomic_t active; uid_t mnt_uid; gid_t mnt_gid; diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index c9b4792ae825..a437ec391a01 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -30,8 +30,6 @@ #include "cifs_debug.h" -#ifdef CONFIG_CIFS_EXPERIMENTAL - static struct cifs_wksid wksidarr[NUM_WK_SIDS] = { {{1, 0, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0} }, "null user"}, {{1, 1, {0, 0, 0, 0, 0, 1}, {0, 0, 0, 0, 0} }, "nobody"}, @@ -560,7 +558,7 @@ static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) - return NULL; + return ERR_CAST(tlink); xid = GetXid(); rc = CIFSSMBGetCIFSACL(xid, tlink_tcon(tlink), fid, &pntsd, pacllen); @@ -568,7 +566,9 @@ static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, cifs_put_tlink(tlink); - cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen); + cFYI(1, "%s: rc = %d ACL len %d", __func__, rc, *pacllen); + if (rc) + return ERR_PTR(rc); return pntsd; } @@ -583,7 +583,7 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) - return NULL; + return ERR_CAST(tlink); tcon = tlink_tcon(tlink); xid = GetXid(); @@ -591,23 +591,22 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL, 0, &fid, &oplock, NULL, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - if (rc) { - cERROR(1, "Unable to open file to get ACL"); - goto out; + if (!rc) { + rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen); + CIFSSMBClose(xid, tcon, fid); } - rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen); - cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen); - - CIFSSMBClose(xid, tcon, fid); - out: cifs_put_tlink(tlink); FreeXid(xid); + + cFYI(1, "%s: rc = %d ACL len %d", __func__, rc, *pacllen); + if (rc) + return ERR_PTR(rc); return pntsd; } /* Retrieve an ACL from the server */ -static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb, +struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb, struct inode *inode, const char *path, u32 *pacllen) { @@ -695,7 +694,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, } /* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */ -void +int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, struct inode *inode, const char *path, const __u16 *pfid) { @@ -711,17 +710,21 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, pntsd = get_cifs_acl(cifs_sb, inode, path, &acllen); /* if we can retrieve the ACL, now parse Access Control Entries, ACEs */ - if (pntsd) + if (IS_ERR(pntsd)) { + rc = PTR_ERR(pntsd); + cERROR(1, "%s: error %d getting sec desc", __func__, rc); + } else { rc = parse_sec_desc(pntsd, acllen, fattr); - if (rc) - cFYI(1, "parse sec desc failed rc = %d", rc); + kfree(pntsd); + if (rc) + cERROR(1, "parse sec desc failed rc = %d", rc); + } - kfree(pntsd); - return; + return rc; } /* Convert mode bits to an ACL so we can update the ACL on the server */ -int mode_to_acl(struct inode *inode, const char *path, __u64 nmode) +int mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode) { int rc = 0; __u32 secdesclen = 0; @@ -736,7 +739,10 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode) /* Add three ACEs for owner, group, everyone getting rid of other ACEs as chmod disables ACEs and set the security descriptor */ - if (pntsd) { + if (IS_ERR(pntsd)) { + rc = PTR_ERR(pntsd); + cERROR(1, "%s: error %d getting sec desc", __func__, rc); + } else { /* allocate memory for the smb header, set security descriptor request security descriptor parameters, and secuirty descriptor itself */ @@ -766,4 +772,3 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode) return rc; } -#endif /* CONFIG_CIFS_EXPERIMENTAL */ diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h index 6c8096cf5155..c4ae7d036563 100644 --- a/fs/cifs/cifsacl.h +++ b/fs/cifs/cifsacl.h @@ -74,11 +74,7 @@ struct cifs_wksid { char sidname[SIDNAMELENGTH]; } __attribute__((packed)); -#ifdef CONFIG_CIFS_EXPERIMENTAL - extern int match_sid(struct cifs_sid *); extern int compare_sids(const struct cifs_sid *, const struct cifs_sid *); -#endif /* CONFIG_CIFS_EXPERIMENTAL */ - #endif /* _CIFSACL_H */ diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 9c3789762ab7..3936aa7f2c22 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -458,9 +458,13 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m) seq_printf(s, ",acl"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) seq_printf(s, ",mfsymlinks"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) + seq_printf(s, ",fsc"); seq_printf(s, ",rsize=%d", cifs_sb->rsize); seq_printf(s, ",wsize=%d", cifs_sb->wsize); + /* convert actimeo and display it in seconds */ + seq_printf(s, ",actimeo=%lu", cifs_sb->actimeo / HZ); return 0; } @@ -933,7 +937,6 @@ init_cifs(void) GlobalCurrentXid = 0; GlobalTotalActiveXid = 0; GlobalMaxActiveXid = 0; - memset(Local_System_Name, 0, 15); spin_lock_init(&cifs_tcp_ses_lock); spin_lock_init(&cifs_file_list_lock); spin_lock_init(&GlobalMid_Lock); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index b577bf0a1bb3..7136c0c3e2f9 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -45,6 +45,16 @@ #define CIFS_MIN_RCV_POOL 4 /* + * default attribute cache timeout (jiffies) + */ +#define CIFS_DEF_ACTIMEO (1 * HZ) + +/* + * max attribute cache timeout (jiffies) - 2^30 + */ +#define CIFS_MAX_ACTIMEO (1 << 30) + +/* * MAX_REQ is the maximum number of requests that WE will send * on one socket concurrently. It also matches the most common * value of max multiplex returned by servers. We may @@ -746,8 +756,6 @@ GLOBAL_EXTERN unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Sem */ GLOBAL_EXTERN unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Sem */ GLOBAL_EXTERN spinlock_t GlobalMid_Lock; /* protects above & list operations */ /* on midQ entries */ -GLOBAL_EXTERN char Local_System_Name[15]; - /* * Global counters, updated atomically */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 7ed69b6b5fe6..e6d1481b16c1 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -54,7 +54,8 @@ do { \ __func__, curr_xid, (int)rc); \ } while (0) extern char *build_path_from_dentry(struct dentry *); -extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb); +extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb, + struct cifsTconInfo *tcon); extern char *build_wildcard_path_from_dentry(struct dentry *direntry); extern char *cifs_compose_mount_options(const char *sb_mountdata, const char *fullpath, const struct dfs_info3_param *ref, @@ -79,9 +80,7 @@ extern bool is_valid_oplock_break(struct smb_hdr *smb, struct TCP_Server_Info *); extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof); extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool); -#ifdef CONFIG_CIFS_EXPERIMENTAL extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool); -#endif extern unsigned int smbCalcSize(struct smb_hdr *ptr); extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr); extern int decode_negTokenInit(unsigned char *security_blob, int length, @@ -130,10 +129,12 @@ extern int cifs_get_file_info_unix(struct file *filp); extern int cifs_get_inode_info_unix(struct inode **pinode, const unsigned char *search_path, struct super_block *sb, int xid); -extern void cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, +extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, struct inode *inode, const char *path, const __u16 *pfid); -extern int mode_to_acl(struct inode *inode, const char *path, __u64); +extern int mode_to_cifs_acl(struct inode *inode, const char *path, __u64); +extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *, + const char *, u32 *); extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *, const char *); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 2f2632b6df5a..67acfb3acad2 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -2478,95 +2478,6 @@ querySymLinkRetry: } #ifdef CONFIG_CIFS_EXPERIMENTAL -/* Initialize NT TRANSACT SMB into small smb request buffer. - This assumes that all NT TRANSACTS that we init here have - total parm and data under about 400 bytes (to fit in small cifs - buffer size), which is the case so far, it easily fits. NB: - Setup words themselves and ByteCount - MaxSetupCount (size of returned setup area) and - MaxParameterCount (returned parms size) must be set by caller */ -static int -smb_init_nttransact(const __u16 sub_command, const int setup_count, - const int parm_len, struct cifsTconInfo *tcon, - void **ret_buf) -{ - int rc; - __u32 temp_offset; - struct smb_com_ntransact_req *pSMB; - - rc = small_smb_init(SMB_COM_NT_TRANSACT, 19 + setup_count, tcon, - (void **)&pSMB); - if (rc) - return rc; - *ret_buf = (void *)pSMB; - pSMB->Reserved = 0; - pSMB->TotalParameterCount = cpu_to_le32(parm_len); - pSMB->TotalDataCount = 0; - pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf - - MAX_CIFS_HDR_SIZE) & 0xFFFFFF00); - pSMB->ParameterCount = pSMB->TotalParameterCount; - pSMB->DataCount = pSMB->TotalDataCount; - temp_offset = offsetof(struct smb_com_ntransact_req, Parms) + - (setup_count * 2) - 4 /* for rfc1001 length itself */; - pSMB->ParameterOffset = cpu_to_le32(temp_offset); - pSMB->DataOffset = cpu_to_le32(temp_offset + parm_len); - pSMB->SetupCount = setup_count; /* no need to le convert byte fields */ - pSMB->SubCommand = cpu_to_le16(sub_command); - return 0; -} - -static int -validate_ntransact(char *buf, char **ppparm, char **ppdata, - __u32 *pparmlen, __u32 *pdatalen) -{ - char *end_of_smb; - __u32 data_count, data_offset, parm_count, parm_offset; - struct smb_com_ntransact_rsp *pSMBr; - - *pdatalen = 0; - *pparmlen = 0; - - if (buf == NULL) - return -EINVAL; - - pSMBr = (struct smb_com_ntransact_rsp *)buf; - - /* ByteCount was converted from little endian in SendReceive */ - end_of_smb = 2 /* sizeof byte count */ + pSMBr->ByteCount + - (char *)&pSMBr->ByteCount; - - data_offset = le32_to_cpu(pSMBr->DataOffset); - data_count = le32_to_cpu(pSMBr->DataCount); - parm_offset = le32_to_cpu(pSMBr->ParameterOffset); - parm_count = le32_to_cpu(pSMBr->ParameterCount); - - *ppparm = (char *)&pSMBr->hdr.Protocol + parm_offset; - *ppdata = (char *)&pSMBr->hdr.Protocol + data_offset; - - /* should we also check that parm and data areas do not overlap? */ - if (*ppparm > end_of_smb) { - cFYI(1, "parms start after end of smb"); - return -EINVAL; - } else if (parm_count + *ppparm > end_of_smb) { - cFYI(1, "parm end after end of smb"); - return -EINVAL; - } else if (*ppdata > end_of_smb) { - cFYI(1, "data starts after end of smb"); - return -EINVAL; - } else if (data_count + *ppdata > end_of_smb) { - cFYI(1, "data %p + count %d (%p) past smb end %p start %p", - *ppdata, data_count, (data_count + *ppdata), - end_of_smb, pSMBr); - return -EINVAL; - } else if (parm_count + data_count > pSMBr->ByteCount) { - cFYI(1, "parm count and data count larger than SMB"); - return -EINVAL; - } - *pdatalen = data_count; - *pparmlen = parm_count; - return 0; -} - int CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon, const unsigned char *searchName, @@ -3056,7 +2967,97 @@ GetExtAttrOut: #endif /* CONFIG_POSIX */ -#ifdef CONFIG_CIFS_EXPERIMENTAL +#ifdef CONFIG_CIFS_ACL +/* + * Initialize NT TRANSACT SMB into small smb request buffer. This assumes that + * all NT TRANSACTS that we init here have total parm and data under about 400 + * bytes (to fit in small cifs buffer size), which is the case so far, it + * easily fits. NB: Setup words themselves and ByteCount MaxSetupCount (size of + * returned setup area) and MaxParameterCount (returned parms size) must be set + * by caller + */ +static int +smb_init_nttransact(const __u16 sub_command, const int setup_count, + const int parm_len, struct cifsTconInfo *tcon, + void **ret_buf) +{ + int rc; + __u32 temp_offset; + struct smb_com_ntransact_req *pSMB; + + rc = small_smb_init(SMB_COM_NT_TRANSACT, 19 + setup_count, tcon, + (void **)&pSMB); + if (rc) + return rc; + *ret_buf = (void *)pSMB; + pSMB->Reserved = 0; + pSMB->TotalParameterCount = cpu_to_le32(parm_len); + pSMB->TotalDataCount = 0; + pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf - + MAX_CIFS_HDR_SIZE) & 0xFFFFFF00); + pSMB->ParameterCount = pSMB->TotalParameterCount; + pSMB->DataCount = pSMB->TotalDataCount; + temp_offset = offsetof(struct smb_com_ntransact_req, Parms) + + (setup_count * 2) - 4 /* for rfc1001 length itself */; + pSMB->ParameterOffset = cpu_to_le32(temp_offset); + pSMB->DataOffset = cpu_to_le32(temp_offset + parm_len); + pSMB->SetupCount = setup_count; /* no need to le convert byte fields */ + pSMB->SubCommand = cpu_to_le16(sub_command); + return 0; +} + +static int +validate_ntransact(char *buf, char **ppparm, char **ppdata, + __u32 *pparmlen, __u32 *pdatalen) +{ + char *end_of_smb; + __u32 data_count, data_offset, parm_count, parm_offset; + struct smb_com_ntransact_rsp *pSMBr; + + *pdatalen = 0; + *pparmlen = 0; + + if (buf == NULL) + return -EINVAL; + + pSMBr = (struct smb_com_ntransact_rsp *)buf; + + /* ByteCount was converted from little endian in SendReceive */ + end_of_smb = 2 /* sizeof byte count */ + pSMBr->ByteCount + + (char *)&pSMBr->ByteCount; + + data_offset = le32_to_cpu(pSMBr->DataOffset); + data_count = le32_to_cpu(pSMBr->DataCount); + parm_offset = le32_to_cpu(pSMBr->ParameterOffset); + parm_count = le32_to_cpu(pSMBr->ParameterCount); + + *ppparm = (char *)&pSMBr->hdr.Protocol + parm_offset; + *ppdata = (char *)&pSMBr->hdr.Protocol + data_offset; + + /* should we also check that parm and data areas do not overlap? */ + if (*ppparm > end_of_smb) { + cFYI(1, "parms start after end of smb"); + return -EINVAL; + } else if (parm_count + *ppparm > end_of_smb) { + cFYI(1, "parm end after end of smb"); + return -EINVAL; + } else if (*ppdata > end_of_smb) { + cFYI(1, "data starts after end of smb"); + return -EINVAL; + } else if (data_count + *ppdata > end_of_smb) { + cFYI(1, "data %p + count %d (%p) past smb end %p start %p", + *ppdata, data_count, (data_count + *ppdata), + end_of_smb, pSMBr); + return -EINVAL; + } else if (parm_count + data_count > pSMBr->ByteCount) { + cFYI(1, "parm count and data count larger than SMB"); + return -EINVAL; + } + *pdatalen = data_count; + *pparmlen = parm_count; + return 0; +} + /* Get Security Descriptor (by handle) from remote server for a file or dir */ int CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid, @@ -3214,7 +3215,7 @@ setCifsAclRetry: return (rc); } -#endif /* CONFIG_CIFS_EXPERIMENTAL */ +#endif /* CONFIG_CIFS_ACL */ /* Legacy Query Path Information call for lookup to old servers such as Win9x/WinME */ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 251a17c03545..cc1a8604a790 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -105,6 +105,7 @@ struct smb_vol { unsigned int wsize; bool sockopt_tcp_nodelay:1; unsigned short int port; + unsigned long actimeo; /* attribute cache timeout (jiffies) */ char *prepath; struct sockaddr_storage srcaddr; /* allow binding to a local IP */ struct nls_table *local_nls; @@ -806,23 +807,20 @@ cifs_parse_mount_options(char *options, const char *devname, short int override_gid = -1; bool uid_specified = false; bool gid_specified = false; + char *nodename = utsname()->nodename; separator[0] = ','; separator[1] = 0; - if (Local_System_Name[0] != 0) - memcpy(vol->source_rfc1001_name, Local_System_Name, 15); - else { - char *nodename = utsname()->nodename; - int n = strnlen(nodename, 15); - memset(vol->source_rfc1001_name, 0x20, 15); - for (i = 0; i < n; i++) { - /* does not have to be perfect mapping since field is - informational, only used for servers that do not support - port 445 and it can be overridden at mount time */ - vol->source_rfc1001_name[i] = toupper(nodename[i]); - } - } + /* + * does not have to be perfect mapping since field is + * informational, only used for servers that do not support + * port 445 and it can be overridden at mount time + */ + memset(vol->source_rfc1001_name, 0x20, 15); + for (i = 0; i < strnlen(nodename, 15); i++) + vol->source_rfc1001_name[i] = toupper(nodename[i]); + vol->source_rfc1001_name[15] = 0; /* null target name indicates to use *SMBSERVR default called name if we end up sending RFC1001 session initialize */ @@ -840,6 +838,8 @@ cifs_parse_mount_options(char *options, const char *devname, /* default to using server inode numbers where available */ vol->server_ino = 1; + vol->actimeo = CIFS_DEF_ACTIMEO; + if (!options) return 1; @@ -1214,6 +1214,16 @@ cifs_parse_mount_options(char *options, const char *devname, printk(KERN_WARNING "CIFS: server net" "biosname longer than 15 truncated.\n"); } + } else if (strnicmp(data, "actimeo", 7) == 0) { + if (value && *value) { + vol->actimeo = HZ * simple_strtoul(value, + &value, 0); + if (vol->actimeo > CIFS_MAX_ACTIMEO) { + cERROR(1, "CIFS: attribute cache" + "timeout too large"); + return 1; + } + } } else if (strnicmp(data, "credentials", 4) == 0) { /* ignore */ } else if (strnicmp(data, "version", 3) == 0) { @@ -1352,6 +1362,11 @@ cifs_parse_mount_options(char *options, const char *devname, "supported. Instead set " "/proc/fs/cifs/LookupCacheEnabled to 0\n"); } else if (strnicmp(data, "fsc", 3) == 0) { +#ifndef CONFIG_CIFS_FSCACHE + cERROR(1, "FS-Cache support needs CONFIG_CIFS_FSCACHE" + "kernel config option set"); + return 1; +#endif vol->fsc = true; } else if (strnicmp(data, "mfsymlinks", 10) == 0) { vol->mfsymlinks = true; @@ -2566,6 +2581,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info, cFYI(1, "file mode: 0x%x dir mode: 0x%x", cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode); + cifs_sb->actimeo = pvolume_info->actimeo; + if (pvolume_info->noperm) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM; if (pvolume_info->setuids) @@ -2816,13 +2833,13 @@ remote_path_check: /* check if a whole path (including prepath) is not remote */ if (!rc && cifs_sb->prepathlen && tcon) { /* build_path_to_root works only when we have a valid tcon */ - full_path = cifs_build_path_to_root(cifs_sb); + full_path = cifs_build_path_to_root(cifs_sb, tcon); if (full_path == NULL) { rc = -ENOMEM; goto mount_fail_check; } rc = is_path_accessible(xid, tcon, cifs_sb, full_path); - if (rc != -EREMOTE) { + if (rc != 0 && rc != -EREMOTE) { kfree(full_path); goto mount_fail_check; } diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c index 0eb87026cad3..548f06230a6d 100644 --- a/fs/cifs/dns_resolve.c +++ b/fs/cifs/dns_resolve.c @@ -66,7 +66,7 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr) /* Search for server name delimiter */ sep = memchr(hostname, '\\', len); if (sep) - len = sep - unc; + len = sep - hostname; else cFYI(1, "%s: probably server name is whole unc: %s", __func__, unc); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 06c3e83fa387..5a28660ca2b5 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1108,7 +1108,6 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file, return total_written; } -#ifdef CONFIG_CIFS_EXPERIMENTAL struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only) { @@ -1142,7 +1141,6 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, spin_unlock(&cifs_file_list_lock); return NULL; } -#endif struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only) @@ -2271,8 +2269,10 @@ void cifs_oplock_break_get(struct cifsFileInfo *cfile) void cifs_oplock_break_put(struct cifsFileInfo *cfile) { + struct super_block *sb = cfile->dentry->d_sb; + cifsFileInfo_put(cfile); - cifs_sb_deactive(cfile->dentry->d_sb); + cifs_sb_deactive(sb); } const struct address_space_operations cifs_addr_ops = { diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c index a2ad94efcfe6..297a43d0ff7f 100644 --- a/fs/cifs/fscache.c +++ b/fs/cifs/fscache.c @@ -2,7 +2,7 @@ * fs/cifs/fscache.c - CIFS filesystem cache interface * * Copyright (c) 2010 Novell, Inc. - * Author(s): Suresh Jayaraman (sjayaraman@suse.de> + * Author(s): Suresh Jayaraman <sjayaraman@suse.de> * * This library is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published @@ -67,10 +67,12 @@ static void cifs_fscache_enable_inode_cookie(struct inode *inode) if (cifsi->fscache) return; - cifsi->fscache = fscache_acquire_cookie(tcon->fscache, + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) { + cifsi->fscache = fscache_acquire_cookie(tcon->fscache, &cifs_fscache_inode_object_def, cifsi); - cFYI(1, "CIFS: got FH cookie (0x%p/0x%p)", tcon->fscache, + cFYI(1, "CIFS: got FH cookie (0x%p/0x%p)", tcon->fscache, cifsi->fscache); + } } void cifs_fscache_release_inode_cookie(struct inode *inode) @@ -101,10 +103,8 @@ void cifs_fscache_set_inode_cookie(struct inode *inode, struct file *filp) { if ((filp->f_flags & O_ACCMODE) != O_RDONLY) cifs_fscache_disable_inode_cookie(inode); - else { + else cifs_fscache_enable_inode_cookie(inode); - cFYI(1, "CIFS: fscache inode cookie set"); - } } void cifs_fscache_reset_inode_cookie(struct inode *inode) diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index ef3a55bf86b6..589f3e3f6e00 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -686,13 +686,18 @@ int cifs_get_inode_info(struct inode **pinode, cFYI(1, "cifs_sfu_type failed: %d", tmprc); } -#ifdef CONFIG_CIFS_EXPERIMENTAL +#ifdef CONFIG_CIFS_ACL /* fill in 0777 bits from ACL */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { - cFYI(1, "Getting mode bits from ACL"); - cifs_acl_to_fattr(cifs_sb, &fattr, *pinode, full_path, pfid); + rc = cifs_acl_to_fattr(cifs_sb, &fattr, *pinode, full_path, + pfid); + if (rc) { + cFYI(1, "%s: Getting ACL failed with error: %d", + __func__, rc); + goto cgii_exit; + } } -#endif +#endif /* CONFIG_CIFS_ACL */ /* fill in remaining high mode bits e.g. SUID, VTX */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) @@ -723,12 +728,12 @@ static const struct inode_operations cifs_ipc_inode_ops = { .lookup = cifs_lookup, }; -char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb) +char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb, + struct cifsTconInfo *tcon) { int pplen = cifs_sb->prepathlen; int dfsplen; char *full_path = NULL; - struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb); /* if no prefix path, simply set path to the root of share to "" */ if (pplen == 0) { @@ -870,7 +875,7 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino) char *full_path; struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb); - full_path = cifs_build_path_to_root(cifs_sb); + full_path = cifs_build_path_to_root(cifs_sb, tcon); if (full_path == NULL) return ERR_PTR(-ENOMEM); @@ -881,8 +886,10 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino) rc = cifs_get_inode_info(&inode, full_path, NULL, sb, xid, NULL); - if (!inode) - return ERR_PTR(rc); + if (!inode) { + inode = ERR_PTR(rc); + goto out; + } #ifdef CONFIG_CIFS_FSCACHE /* populate tcon->resource_id */ @@ -898,13 +905,11 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino) inode->i_uid = cifs_sb->mnt_uid; inode->i_gid = cifs_sb->mnt_gid; } else if (rc) { - kfree(full_path); - _FreeXid(xid); iget_failed(inode); - return ERR_PTR(rc); + inode = ERR_PTR(rc); } - +out: kfree(full_path); /* can not call macro FreeXid here since in a void func * TODO: This is no longer true @@ -1648,6 +1653,7 @@ static bool cifs_inode_needs_reval(struct inode *inode) { struct cifsInodeInfo *cifs_i = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); if (cifs_i->clientCanCacheRead) return false; @@ -1658,19 +1664,21 @@ cifs_inode_needs_reval(struct inode *inode) if (cifs_i->time == 0) return true; - /* FIXME: the actimeo should be tunable */ - if (time_after_eq(jiffies, cifs_i->time + HZ)) + if (!time_in_range(jiffies, cifs_i->time, + cifs_i->time + cifs_sb->actimeo)) return true; /* hardlinked files w/ noserverino get "special" treatment */ - if (!(CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) && + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) && S_ISREG(inode->i_mode) && inode->i_nlink != 1) return true; return false; } -/* check invalid_mapping flag and zap the cache if it's set */ +/* + * Zap the cache. Called when invalid_mapping flag is set. + */ static void cifs_invalidate_mapping(struct inode *inode) { @@ -2114,11 +2122,16 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) if (attrs->ia_valid & ATTR_MODE) { rc = 0; -#ifdef CONFIG_CIFS_EXPERIMENTAL - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) - rc = mode_to_acl(inode, full_path, mode); - else -#endif +#ifdef CONFIG_CIFS_ACL + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { + rc = mode_to_cifs_acl(inode, full_path, mode); + if (rc) { + cFYI(1, "%s: Setting ACL failed with error: %d", + __func__, rc); + goto cifs_setattr_exit; + } + } else +#endif /* CONFIG_CIFS_ACL */ if (((mode & S_IWUGO) == 0) && (cifsInode->cifsAttrs & ATTR_READONLY) == 0) { diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index ef7bb7b50f58..a73eb9f4bdaf 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -226,26 +226,29 @@ static int initiate_cifs_search(const int xid, struct file *file) char *full_path = NULL; struct cifsFileInfo *cifsFile; struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); - struct tcon_link *tlink; + struct tcon_link *tlink = NULL; struct cifsTconInfo *pTcon; - tlink = cifs_sb_tlink(cifs_sb); - if (IS_ERR(tlink)) - return PTR_ERR(tlink); - pTcon = tlink_tcon(tlink); - - if (file->private_data == NULL) - file->private_data = - kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); if (file->private_data == NULL) { - rc = -ENOMEM; - goto error_exit; + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + + cifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); + if (cifsFile == NULL) { + rc = -ENOMEM; + goto error_exit; + } + file->private_data = cifsFile; + cifsFile->tlink = cifs_get_tlink(tlink); + pTcon = tlink_tcon(tlink); + } else { + cifsFile = file->private_data; + pTcon = tlink_tcon(cifsFile->tlink); } - cifsFile = file->private_data; cifsFile->invalidHandle = true; cifsFile->srch_inf.endOfSearch = false; - cifsFile->tlink = cifs_get_tlink(tlink); full_path = build_path_from_dentry(file->f_path.dentry); if (full_path == NULL) { @@ -756,18 +759,6 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir, rc = filldir(direntry, qstring.name, qstring.len, file->f_pos, ino, fattr.cf_dtype); - /* - * we can not return filldir errors to the caller since they are - * "normal" when the stat blocksize is too small - we return remapped - * error instead - * - * FIXME: This looks bogus. filldir returns -EOVERFLOW in the above - * case already. Why should we be clobbering other errors from it? - */ - if (rc) { - cFYI(1, "filldir rc = %d", rc); - rc = -EOVERFLOW; - } dput(tmp_dentry); return rc; } diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index a264b744bb41..eae2a1491608 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -30,10 +30,11 @@ #define MAX_EA_VALUE_SIZE 65535 #define CIFS_XATTR_DOS_ATTRIB "user.DosAttrib" +#define CIFS_XATTR_CIFS_ACL "system.cifs_acl" #define CIFS_XATTR_USER_PREFIX "user." #define CIFS_XATTR_SYSTEM_PREFIX "system." #define CIFS_XATTR_OS2_PREFIX "os2." -#define CIFS_XATTR_SECURITY_PREFIX ".security" +#define CIFS_XATTR_SECURITY_PREFIX "security." #define CIFS_XATTR_TRUSTED_PREFIX "trusted." #define XATTR_TRUSTED_PREFIX_LEN 8 #define XATTR_SECURITY_PREFIX_LEN 9 @@ -277,29 +278,8 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); -#ifdef CONFIG_CIFS_EXPERIMENTAL - else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { - __u16 fid; - int oplock = 0; - struct cifs_ntsd *pacl = NULL; - __u32 buflen = 0; - if (experimEnabled) - rc = CIFSSMBOpen(xid, pTcon, full_path, - FILE_OPEN, GENERIC_READ, 0, &fid, - &oplock, NULL, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - /* else rc is EOPNOTSUPP from above */ - - if (rc == 0) { - rc = CIFSSMBGetCIFSACL(xid, pTcon, fid, &pacl, - &buflen); - CIFSSMBClose(xid, pTcon, fid); - } - } -#endif /* EXPERIMENTAL */ #else - cFYI(1, "query POSIX ACL not supported yet"); + cFYI(1, "Query POSIX ACL not supported yet"); #endif /* CONFIG_CIFS_POSIX */ } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT, strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) { @@ -311,8 +291,33 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); #else - cFYI(1, "query POSIX default ACL not supported yet"); -#endif + cFYI(1, "Query POSIX default ACL not supported yet"); +#endif /* CONFIG_CIFS_POSIX */ + } else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL, + strlen(CIFS_XATTR_CIFS_ACL)) == 0) { +#ifdef CONFIG_CIFS_ACL + u32 acllen; + struct cifs_ntsd *pacl; + + pacl = get_cifs_acl(cifs_sb, direntry->d_inode, + full_path, &acllen); + if (IS_ERR(pacl)) { + rc = PTR_ERR(pacl); + cERROR(1, "%s: error %zd getting sec desc", + __func__, rc); + } else { + if (ea_value) { + if (acllen > buf_size) + acllen = -ERANGE; + else + memcpy(ea_value, pacl, acllen); + } + rc = acllen; + kfree(pacl); + } +#else + cFYI(1, "Query CIFS ACL not supported yet"); +#endif /* CONFIG_CIFS_ACL */ } else if (strncmp(ea_name, CIFS_XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) { cFYI(1, "Trusted xattr namespace not supported yet"); diff --git a/fs/compat.c b/fs/compat.c index c580c322fa6b..eb1740ac8c0a 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1350,6 +1350,10 @@ static int compat_count(compat_uptr_t __user *argv, int max) argv++; if (i++ >= max) return -E2BIG; + + if (fatal_signal_pending(current)) + return -ERESTARTNOHAND; + cond_resched(); } } return i; @@ -1391,6 +1395,12 @@ static int compat_copy_strings(int argc, compat_uptr_t __user *argv, while (len > 0) { int offset, bytes_to_copy; + if (fatal_signal_pending(current)) { + ret = -ERESTARTNOHAND; + goto out; + } + cond_resched(); + offset = pos % PAGE_SIZE; if (offset == 0) offset = PAGE_SIZE; @@ -1407,18 +1417,8 @@ static int compat_copy_strings(int argc, compat_uptr_t __user *argv, if (!kmapped_page || kpos != (pos & PAGE_MASK)) { struct page *page; -#ifdef CONFIG_STACK_GROWSUP - ret = expand_stack_downwards(bprm->vma, pos); - if (ret < 0) { - /* We've exceed the stack rlimit. */ - ret = -E2BIG; - goto out; - } -#endif - ret = get_user_pages(current, bprm->mm, pos, - 1, 1, 1, &page, NULL); - if (ret <= 0) { - /* We've exceed the stack rlimit. */ + page = get_arg_page(bprm, pos, 1); + if (!page) { ret = -E2BIG; goto out; } @@ -1539,8 +1539,10 @@ int compat_do_execve(char * filename, return retval; out: - if (bprm->mm) + if (bprm->mm) { + acct_arg_size(bprm, 0); mmput(bprm->mm); + } out_file: if (bprm->file) { diff --git a/fs/exec.c b/fs/exec.c index 99d33a1371e9..c62efcb959c7 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -164,7 +164,26 @@ out: #ifdef CONFIG_MMU -static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, +void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) +{ + struct mm_struct *mm = current->mm; + long diff = (long)(pages - bprm->vma_pages); + + if (!mm || !diff) + return; + + bprm->vma_pages = pages; + +#ifdef SPLIT_RSS_COUNTING + add_mm_counter(mm, MM_ANONPAGES, diff); +#else + spin_lock(&mm->page_table_lock); + add_mm_counter(mm, MM_ANONPAGES, diff); + spin_unlock(&mm->page_table_lock); +#endif +} + +struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, int write) { struct page *page; @@ -186,6 +205,8 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start; struct rlimit *rlim; + acct_arg_size(bprm, size / PAGE_SIZE); + /* * We've historically supported up to 32 pages (ARG_MAX) * of argument strings even with small stacks @@ -254,6 +275,11 @@ static int __bprm_mm_init(struct linux_binprm *bprm) vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); INIT_LIST_HEAD(&vma->anon_vma_chain); + + err = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1); + if (err) + goto err; + err = insert_vm_struct(mm, vma); if (err) goto err; @@ -276,7 +302,11 @@ static bool valid_arg_len(struct linux_binprm *bprm, long len) #else -static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, +void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) +{ +} + +struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, int write) { struct page *page; @@ -1003,6 +1033,7 @@ int flush_old_exec(struct linux_binprm * bprm) /* * Release all of the old mmap stuff */ + acct_arg_size(bprm, 0); retval = exec_mmap(bprm->mm); if (retval) goto out; @@ -1426,8 +1457,10 @@ int do_execve(const char * filename, return retval; out: - if (bprm->mm) - mmput (bprm->mm); + if (bprm->mm) { + acct_arg_size(bprm, 0); + mmput(bprm->mm); + } out_file: if (bprm->file) { diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6a5edea2d70b..94ce3d7a1c4b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -910,6 +910,7 @@ struct ext4_inode_info { #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ +#define EXT4_MOUNT_MBLK_IO_SUBMIT 0x4000000 /* multi-block io submits */ #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index bdbe69902207..e659597b690b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2125,9 +2125,12 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, */ if (unlikely(journal_data && PageChecked(page))) err = __ext4_journalled_writepage(page, len); - else + else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT)) err = ext4_bio_write_page(&io_submit, page, len, mpd->wbc); + else + err = block_write_full_page(page, + noalloc_get_block_write, mpd->wbc); if (!err) mpd->pages_written++; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 92203b8a099f..dc40e75cba88 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -872,7 +872,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, if (namelen > EXT4_NAME_LEN) return NULL; if ((namelen <= 2) && (name[0] == '.') && - (name[1] == '.' || name[1] == '0')) { + (name[1] == '.' || name[1] == '\0')) { /* * "." or ".." will only be in the first block * NFS may look up ".."; "." should be handled by the VFS diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e32195d6aac3..fb15c9c0be74 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1026,6 +1026,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) !(def_mount_opts & EXT4_DEFM_NODELALLOC)) seq_puts(seq, ",nodelalloc"); + if (test_opt(sb, MBLK_IO_SUBMIT)) + seq_puts(seq, ",mblk_io_submit"); if (sbi->s_stripe) seq_printf(seq, ",stripe=%lu", sbi->s_stripe); /* @@ -1239,8 +1241,8 @@ enum { Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version, - Opt_stripe, Opt_delalloc, Opt_nodelalloc, - Opt_block_validity, Opt_noblock_validity, + Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit, + Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, Opt_inode_readahead_blks, Opt_journal_ioprio, Opt_dioread_nolock, Opt_dioread_lock, Opt_discard, Opt_nodiscard, @@ -1304,6 +1306,8 @@ static const match_table_t tokens = { {Opt_resize, "resize"}, {Opt_delalloc, "delalloc"}, {Opt_nodelalloc, "nodelalloc"}, + {Opt_mblk_io_submit, "mblk_io_submit"}, + {Opt_nomblk_io_submit, "nomblk_io_submit"}, {Opt_block_validity, "block_validity"}, {Opt_noblock_validity, "noblock_validity"}, {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, @@ -1725,6 +1729,12 @@ set_qf_format: case Opt_nodelalloc: clear_opt(sbi->s_mount_opt, DELALLOC); break; + case Opt_mblk_io_submit: + set_opt(sbi->s_mount_opt, MBLK_IO_SUBMIT); + break; + case Opt_nomblk_io_submit: + clear_opt(sbi->s_mount_opt, MBLK_IO_SUBMIT); + break; case Opt_stripe: if (match_int(&args[0], &option)) return 0; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 9242d294fe90..8b984a2cebbd 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -13,6 +13,7 @@ #include <linux/kernel.h> #include <linux/sched.h> #include <linux/module.h> +#include <linux/compat.h> static const struct file_operations fuse_direct_io_file_operations; @@ -1628,6 +1629,58 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov, } /* + * CUSE servers compiled on 32bit broke on 64bit kernels because the + * ABI was defined to be 'struct iovec' which is different on 32bit + * and 64bit. Fortunately we can determine which structure the server + * used from the size of the reply. + */ +static int fuse_copy_ioctl_iovec(struct iovec *dst, void *src, + size_t transferred, unsigned count, + bool is_compat) +{ +#ifdef CONFIG_COMPAT + if (count * sizeof(struct compat_iovec) == transferred) { + struct compat_iovec *ciov = src; + unsigned i; + + /* + * With this interface a 32bit server cannot support + * non-compat (i.e. ones coming from 64bit apps) ioctl + * requests + */ + if (!is_compat) + return -EINVAL; + + for (i = 0; i < count; i++) { + dst[i].iov_base = compat_ptr(ciov[i].iov_base); + dst[i].iov_len = ciov[i].iov_len; + } + return 0; + } +#endif + + if (count * sizeof(struct iovec) != transferred) + return -EIO; + + memcpy(dst, src, transferred); + return 0; +} + +/* Make sure iov_length() won't overflow */ +static int fuse_verify_ioctl_iov(struct iovec *iov, size_t count) +{ + size_t n; + u32 max = FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT; + + for (n = 0; n < count; n++) { + if (iov->iov_len > (size_t) max) + return -ENOMEM; + max -= iov->iov_len; + } + return 0; +} + +/* * For ioctls, there is no generic way to determine how much memory * needs to be read and/or written. Furthermore, ioctls are allowed * to dereference the passed pointer, so the parameter requires deep @@ -1808,18 +1861,25 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV) goto out; - err = -EIO; - if ((in_iovs + out_iovs) * sizeof(struct iovec) != transferred) - goto out; - - /* okay, copy in iovs and retry */ vaddr = kmap_atomic(pages[0], KM_USER0); - memcpy(page_address(iov_page), vaddr, transferred); + err = fuse_copy_ioctl_iovec(page_address(iov_page), vaddr, + transferred, in_iovs + out_iovs, + (flags & FUSE_IOCTL_COMPAT) != 0); kunmap_atomic(vaddr, KM_USER0); + if (err) + goto out; in_iov = page_address(iov_page); out_iov = in_iov + in_iovs; + err = fuse_verify_ioctl_iov(in_iov, in_iovs); + if (err) + goto out; + + err = fuse_verify_ioctl_iov(out_iov, out_iovs); + if (err) + goto out; + goto retry; } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 8ea4a4180a87..996dd8989a91 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -57,7 +57,7 @@ static int nfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); static int nfs_fsync_dir(struct file *, int); static loff_t nfs_llseek_dir(struct file *, loff_t, int); -static int nfs_readdir_clear_array(struct page*, gfp_t); +static void nfs_readdir_clear_array(struct page*); const struct file_operations nfs_dir_operations = { .llseek = nfs_llseek_dir, @@ -83,8 +83,8 @@ const struct inode_operations nfs_dir_inode_operations = { .setattr = nfs_setattr, }; -const struct address_space_operations nfs_dir_addr_space_ops = { - .releasepage = nfs_readdir_clear_array, +const struct address_space_operations nfs_dir_aops = { + .freepage = nfs_readdir_clear_array, }; #ifdef CONFIG_NFS_V3 @@ -178,6 +178,7 @@ typedef struct { struct page *page; unsigned long page_index; u64 *dir_cookie; + u64 last_cookie; loff_t current_index; decode_dirent_t decode; @@ -213,17 +214,15 @@ void nfs_readdir_release_array(struct page *page) * we are freeing strings created by nfs_add_to_readdir_array() */ static -int nfs_readdir_clear_array(struct page *page, gfp_t mask) +void nfs_readdir_clear_array(struct page *page) { - struct nfs_cache_array *array = nfs_readdir_get_array(page); + struct nfs_cache_array *array; int i; - if (IS_ERR(array)) - return PTR_ERR(array); + array = kmap_atomic(page, KM_USER0); for (i = 0; i < array->size; i++) kfree(array->array[i].string.name); - nfs_readdir_release_array(page); - return 0; + kunmap_atomic(array, KM_USER0); } /* @@ -272,7 +271,7 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) goto out; array->last_cookie = entry->cookie; array->size++; - if (entry->eof == 1) + if (entry->eof != 0) array->eof_index = array->size; out: nfs_readdir_release_array(page); @@ -312,15 +311,14 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des for (i = 0; i < array->size; i++) { if (array->array[i].cookie == *desc->dir_cookie) { desc->cache_entry_index = i; - status = 0; - goto out; + return 0; } } - if (i == array->eof_index) { - desc->eof = 1; + if (array->eof_index >= 0) { status = -EBADCOOKIE; + if (*desc->dir_cookie == array->last_cookie) + desc->eof = 1; } -out: return status; } @@ -328,10 +326,7 @@ static int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc) { struct nfs_cache_array *array; - int status = -EBADCOOKIE; - - if (desc->dir_cookie == NULL) - goto out; + int status; array = nfs_readdir_get_array(desc->page); if (IS_ERR(array)) { @@ -344,6 +339,10 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc) else status = nfs_readdir_search_for_cookie(array, desc); + if (status == -EAGAIN) { + desc->last_cookie = array->last_cookie; + desc->page_index++; + } nfs_readdir_release_array(desc->page); out: return status; @@ -395,13 +394,9 @@ int xdr_decode(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, struct x static int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry) { - struct nfs_inode *node; if (dentry->d_inode == NULL) goto different; - node = NFS_I(dentry->d_inode); - if (node->fh.size != entry->fh->size) - goto different; - if (strncmp(node->fh.data, entry->fh->data, node->fh.size) != 0) + if (nfs_compare_fh(entry->fh, NFS_FH(dentry->d_inode)) != 0) goto different; return 1; different: @@ -494,7 +489,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en count++; - if (desc->plus == 1) + if (desc->plus != 0) nfs_prime_dcache(desc->file->f_path.dentry, entry); status = nfs_readdir_add_to_array(entry, page); @@ -502,7 +497,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en break; } while (!entry->eof); - if (count == 0 || (status == -EBADCOOKIE && entry->eof == 1)) { + if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) { array = nfs_readdir_get_array(page); if (!IS_ERR(array)) { array->eof_index = array->size; @@ -567,7 +562,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, unsigned int array_size = ARRAY_SIZE(pages); entry.prev_cookie = 0; - entry.cookie = *desc->dir_cookie; + entry.cookie = desc->last_cookie; entry.eof = 0; entry.fh = nfs_alloc_fhandle(); entry.fattr = nfs_alloc_fattr(); @@ -640,6 +635,8 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page) static void cache_page_release(nfs_readdir_descriptor_t *desc) { + if (!desc->page->mapping) + nfs_readdir_clear_array(desc->page); page_cache_release(desc->page); desc->page = NULL; } @@ -664,9 +661,8 @@ int find_cache_page(nfs_readdir_descriptor_t *desc) return PTR_ERR(desc->page); res = nfs_readdir_search_array(desc); - if (res == 0) - return 0; - cache_page_release(desc); + if (res != 0) + cache_page_release(desc); return res; } @@ -676,22 +672,16 @@ int readdir_search_pagecache(nfs_readdir_descriptor_t *desc) { int res; - if (desc->page_index == 0) + if (desc->page_index == 0) { desc->current_index = 0; - while (1) { - res = find_cache_page(desc); - if (res != -EAGAIN) - break; - desc->page_index++; + desc->last_cookie = 0; } + do { + res = find_cache_page(desc); + } while (res == -EAGAIN); return res; } -static inline unsigned int dt_type(struct inode *inode) -{ - return (inode->i_mode >> 12) & 15; -} - /* * Once we've found the start of the dirent within a page: fill 'er up... */ @@ -721,13 +711,12 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent, break; } file->f_pos++; - desc->cache_entry_index = i; if (i < (array->size-1)) *desc->dir_cookie = array->array[i+1].cookie; else *desc->dir_cookie = array->last_cookie; } - if (i == array->eof_index) + if (array->eof_index >= 0) desc->eof = 1; nfs_readdir_release_array(desc->page); @@ -768,6 +757,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, } desc->page_index = 0; + desc->last_cookie = *desc->dir_cookie; desc->page = page; status = nfs_readdir_xdr_to_array(desc, page, inode); @@ -795,7 +785,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) struct inode *inode = dentry->d_inode; nfs_readdir_descriptor_t my_desc, *desc = &my_desc; - int res = -ENOMEM; + int res; dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", dentry->d_parent->d_name.name, dentry->d_name.name, @@ -820,7 +810,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) if (res < 0) goto out; - while (desc->eof != 1) { + do { res = readdir_search_pagecache(desc); if (res == -EBADCOOKIE) { @@ -848,7 +838,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) res = nfs_do_filldir(desc, dirent, filldir); if (res < 0) break; - } + } while (!desc->eof); out: nfs_unblock_sillyrename(dentry); if (res > 0) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 60677f9f1311..7bf029ef4084 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -693,6 +693,7 @@ do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) { struct inode *inode = filp->f_mapping->host; int status = 0; + unsigned int saved_type = fl->fl_type; /* Try local locking first */ posix_test_lock(filp, fl); @@ -700,6 +701,7 @@ do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) /* found a conflict */ goto out; } + fl->fl_type = saved_type; if (nfs_have_delegation(inode, FMODE_READ)) goto out_noconflict; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 314f57164602..e67e31c73416 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -289,6 +289,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) } else if (S_ISDIR(inode->i_mode)) { inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops; inode->i_fop = &nfs_dir_operations; + inode->i_data.a_ops = &nfs_dir_aops; if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS)) set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); /* Deal with crossing mountpoints */ diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index eceafe74f473..4f981f1f6689 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -505,13 +505,13 @@ static struct rpc_procinfo mnt3_procedures[] = { static struct rpc_version mnt_version1 = { .number = 1, - .nrprocs = 2, + .nrprocs = ARRAY_SIZE(mnt_procedures), .procs = mnt_procedures, }; static struct rpc_version mnt_version3 = { .number = 3, - .nrprocs = 2, + .nrprocs = ARRAY_SIZE(mnt3_procedures), .procs = mnt3_procedures, }; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 6a653ffd8e4e..4435e5e1f904 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3361,6 +3361,8 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen) ret = nfs_revalidate_inode(server, inode); if (ret < 0) return ret; + if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL) + nfs_zap_acl_cache(inode); ret = nfs4_read_cached_acl(inode, buf, buflen); if (ret != -ENOENT) return ret; @@ -3389,6 +3391,13 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl nfs_inode_return_delegation(inode); buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase); ret = nfs4_call_sync(server, &msg, &arg, &res, 1); + /* + * Acl update can result in inode attribute update. + * so mark the attribute cache invalid. + */ + spin_lock(&inode->i_lock); + NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR; + spin_unlock(&inode->i_lock); nfs_access_zap_cache(inode); nfs_zap_acl_cache(inode); return ret; diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 137b549e63db..b68536cc9046 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -115,7 +115,7 @@ int nfs_set_page_tag_locked(struct nfs_page *req) { if (!nfs_lock_request_dontget(req)) return 0; - if (req->wb_page != NULL) + if (test_bit(PG_MAPPED, &req->wb_flags)) radix_tree_tag_set(&NFS_I(req->wb_context->path.dentry->d_inode)->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); return 1; } @@ -125,7 +125,7 @@ int nfs_set_page_tag_locked(struct nfs_page *req) */ void nfs_clear_page_tag_locked(struct nfs_page *req) { - if (req->wb_page != NULL) { + if (test_bit(PG_MAPPED, &req->wb_flags)) { struct inode *inode = req->wb_context->path.dentry->d_inode; struct nfs_inode *nfsi = NFS_I(inode); diff --git a/fs/nfs/read.c b/fs/nfs/read.c index e4b62c6f5a6e..aedcaa7f291f 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -152,7 +152,6 @@ static void nfs_readpage_release(struct nfs_page *req) (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), req->wb_bytes, (long long)req_offset(req)); - nfs_clear_request(req); nfs_release_request(req); } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 3c045044fca2..4100630c9a5b 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1069,12 +1069,10 @@ static int nfs_parse_mount_options(char *raw, mnt->flags |= NFS_MOUNT_VER3; mnt->version = 3; break; -#ifdef CONFIG_NFS_V4 case Opt_v4: mnt->flags &= ~NFS_MOUNT_VER3; mnt->version = 4; break; -#endif case Opt_udp: mnt->flags &= ~NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; @@ -1286,12 +1284,10 @@ static int nfs_parse_mount_options(char *raw, mnt->flags |= NFS_MOUNT_VER3; mnt->version = 3; break; -#ifdef CONFIG_NFS_V4 case NFS4_VERSION: mnt->flags &= ~NFS_MOUNT_VER3; mnt->version = 4; break; -#endif default: goto out_invalid_value; } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 4c14c17a5276..10d648ea128b 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -390,6 +390,7 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req) if (nfs_have_delegation(inode, FMODE_WRITE)) nfsi->change_attr++; } + set_bit(PG_MAPPED, &req->wb_flags); SetPagePrivate(req->wb_page); set_page_private(req->wb_page, (unsigned long)req); nfsi->npages++; @@ -415,6 +416,7 @@ static void nfs_inode_remove_request(struct nfs_page *req) spin_lock(&inode->i_lock); set_page_private(req->wb_page, 0); ClearPagePrivate(req->wb_page); + clear_bit(PG_MAPPED, &req->wb_flags); radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index); nfsi->npages--; if (!nfsi->npages) { @@ -422,7 +424,6 @@ static void nfs_inode_remove_request(struct nfs_page *req) iput(inode); } else spin_unlock(&inode->i_lock); - nfs_clear_request(req); nfs_release_request(req); } diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 2a533a0af2a9..7e84a852cdae 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -260,9 +260,11 @@ void fill_post_wcc(struct svc_fh *fhp) err = vfs_getattr(fhp->fh_export->ex_path.mnt, fhp->fh_dentry, &fhp->fh_post_attr); fhp->fh_post_change = fhp->fh_dentry->d_inode->i_version; - if (err) + if (err) { fhp->fh_post_saved = 0; - else + /* Grab the ctime anyway - set_change_info might use it */ + fhp->fh_post_attr.ctime = fhp->fh_dentry->d_inode->i_ctime; + } else fhp->fh_post_saved = 1; } diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 4d476ff08ae6..60fce3dc5cb5 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -484,18 +484,17 @@ static inline bool nfsd4_not_cached(struct nfsd4_compoundres *resp) static inline void set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp) { - BUG_ON(!fhp->fh_pre_saved || !fhp->fh_post_saved); - cinfo->atomic = 1; + BUG_ON(!fhp->fh_pre_saved); + cinfo->atomic = fhp->fh_post_saved; cinfo->change_supported = IS_I_VERSION(fhp->fh_dentry->d_inode); - if (cinfo->change_supported) { - cinfo->before_change = fhp->fh_pre_change; - cinfo->after_change = fhp->fh_post_change; - } else { - cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec; - cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec; - cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec; - cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec; - } + + cinfo->before_change = fhp->fh_pre_change; + cinfo->after_change = fhp->fh_post_change; + cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec; + cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec; + cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec; + cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec; + } int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *, void *); diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 52c7557f3e25..9f26ac9be2a4 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1964,8 +1964,10 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g if (reg == NULL) return ERR_PTR(-ENOMEM); - if (strlen(name) > O2HB_MAX_REGION_NAME_LEN) - return ERR_PTR(-ENAMETOOLONG); + if (strlen(name) > O2HB_MAX_REGION_NAME_LEN) { + ret = -ENAMETOOLONG; + goto free; + } spin_lock(&o2hb_live_lock); reg->hr_region_num = 0; @@ -1974,7 +1976,8 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g O2NM_MAX_REGIONS); if (reg->hr_region_num >= O2NM_MAX_REGIONS) { spin_unlock(&o2hb_live_lock); - return ERR_PTR(-EFBIG); + ret = -EFBIG; + goto free; } set_bit(reg->hr_region_num, o2hb_region_bitmap); } @@ -1986,10 +1989,13 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g ret = o2hb_debug_region_init(reg, o2hb_debug_dir); if (ret) { config_item_put(®->hr_item); - return ERR_PTR(ret); + goto free; } return ®->hr_item; +free: + kfree(reg); + return ERR_PTR(ret); } static void o2hb_heartbeat_group_drop_item(struct config_group *group, diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index edaded48e7e9..895532ac4d98 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -476,7 +476,6 @@ static void ocfs2_dentry_iput(struct dentry *dentry, struct inode *inode) out: iput(inode); - ocfs2_dentry_attach_gen(dentry); } /* diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 58a93b953735..cc2aaa96cfe5 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -959,7 +959,7 @@ static int dlm_match_regions(struct dlm_ctxt *dlm, r += O2HB_MAX_REGION_NAME_LEN; } - local = kmalloc(sizeof(qr->qr_regions), GFP_KERNEL); + local = kmalloc(sizeof(qr->qr_regions), GFP_ATOMIC); if (!local) { status = -ENOMEM; goto bail; diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 1efea3615589..70dd3b1798f1 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -159,9 +159,9 @@ struct ocfs2_lock_res { char l_name[OCFS2_LOCK_ID_MAX_LEN]; unsigned int l_ro_holders; unsigned int l_ex_holders; - char l_level; - char l_requested; - char l_blocking; + signed char l_level; + signed char l_requested; + signed char l_blocking; /* Data packed - type enum ocfs2_lock_type */ unsigned char l_type; diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 252e7c82f929..a5ebe421195f 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -190,7 +190,7 @@ static struct ocfs2_live_connection *ocfs2_connection_find(const char *name) return c; } - return c; + return NULL; } /* diff --git a/fs/proc/base.c b/fs/proc/base.c index f3d02ca461ec..182845147fe4 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1574,7 +1574,7 @@ static int do_proc_readlink(struct path *path, char __user *buffer, int buflen) if (!tmp) return -ENOMEM; - pathname = d_path_with_unreachable(path, tmp, PAGE_SIZE); + pathname = d_path(path, tmp, PAGE_SIZE); len = PTR_ERR(pathname); if (IS_ERR(pathname)) goto out; diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 536d697a8a28..90d2fcb67a31 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -472,7 +472,9 @@ int reiserfs_acl_chmod(struct inode *inode) struct reiserfs_transaction_handle th; size_t size = reiserfs_xattr_nblocks(inode, reiserfs_acl_size(clone->a_count)); - reiserfs_write_lock(inode->i_sb); + int depth; + + depth = reiserfs_write_lock_once(inode->i_sb); error = journal_begin(&th, inode->i_sb, size * 2); if (!error) { int error2; @@ -482,7 +484,7 @@ int reiserfs_acl_chmod(struct inode *inode) if (error2) error = error2; } - reiserfs_write_unlock(inode->i_sb); + reiserfs_write_unlock_once(inode->i_sb, depth); } posix_acl_release(clone); return error; diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 7d287afccde5..691f61223ed6 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -934,7 +934,6 @@ xfs_aops_discard_page( struct xfs_inode *ip = XFS_I(inode); struct buffer_head *bh, *head; loff_t offset = page_offset(page); - ssize_t len = 1 << inode->i_blkbits; if (!xfs_is_delayed_page(page, IO_DELAY)) goto out_invalidate; @@ -949,58 +948,14 @@ xfs_aops_discard_page( xfs_ilock(ip, XFS_ILOCK_EXCL); bh = head = page_buffers(page); do { - int done; - xfs_fileoff_t offset_fsb; - xfs_bmbt_irec_t imap; - int nimaps = 1; int error; - xfs_fsblock_t firstblock; - xfs_bmap_free_t flist; + xfs_fileoff_t start_fsb; if (!buffer_delay(bh)) goto next_buffer; - offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); - - /* - * Map the range first and check that it is a delalloc extent - * before trying to unmap the range. Otherwise we will be - * trying to remove a real extent (which requires a - * transaction) or a hole, which is probably a bad idea... - */ - error = xfs_bmapi(NULL, ip, offset_fsb, 1, - XFS_BMAPI_ENTIRE, NULL, 0, &imap, - &nimaps, NULL); - - if (error) { - /* something screwed, just bail */ - if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { - xfs_fs_cmn_err(CE_ALERT, ip->i_mount, - "page discard failed delalloc mapping lookup."); - } - break; - } - if (!nimaps) { - /* nothing there */ - goto next_buffer; - } - if (imap.br_startblock != DELAYSTARTBLOCK) { - /* been converted, ignore */ - goto next_buffer; - } - WARN_ON(imap.br_blockcount == 0); - - /* - * Note: while we initialise the firstblock/flist pair, they - * should never be used because blocks should never be - * allocated or freed for a delalloc extent and hence we need - * don't cancel or finish them after the xfs_bunmapi() call. - */ - xfs_bmap_init(&flist, &firstblock); - error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock, - &flist, &done); - - ASSERT(!flist.xbf_count && !flist.xbf_first); + start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); + error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1); if (error) { /* something screwed, just bail */ if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { @@ -1010,7 +965,7 @@ xfs_aops_discard_page( break; } next_buffer: - offset += len; + offset += 1 << inode->i_blkbits; } while ((bh = bh->b_this_page) != head); @@ -1505,11 +1460,42 @@ xfs_vm_write_failed( struct inode *inode = mapping->host; if (to > inode->i_size) { - struct iattr ia = { - .ia_valid = ATTR_SIZE | ATTR_FORCE, - .ia_size = inode->i_size, - }; - xfs_setattr(XFS_I(inode), &ia, XFS_ATTR_NOLOCK); + /* + * punch out the delalloc blocks we have already allocated. We + * don't call xfs_setattr() to do this as we may be in the + * middle of a multi-iovec write and so the vfs inode->i_size + * will not match the xfs ip->i_size and so it will zero too + * much. Hence we jus truncate the page cache to zero what is + * necessary and punch the delalloc blocks directly. + */ + struct xfs_inode *ip = XFS_I(inode); + xfs_fileoff_t start_fsb; + xfs_fileoff_t end_fsb; + int error; + + truncate_pagecache(inode, to, inode->i_size); + + /* + * Check if there are any blocks that are outside of i_size + * that need to be trimmed back. + */ + start_fsb = XFS_B_TO_FSB(ip->i_mount, inode->i_size) + 1; + end_fsb = XFS_B_TO_FSB(ip->i_mount, to); + if (end_fsb <= start_fsb) + return; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_bmap_punch_delalloc_range(ip, start_fsb, + end_fsb - start_fsb); + if (error) { + /* something screwed, just bail */ + if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_fs_cmn_err(CE_ALERT, ip->i_mount, + "xfs_vm_write_failed: unable to clean up ino %lld", + ip->i_ino); + } + } + xfs_iunlock(ip, XFS_ILOCK_EXCL); } } diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index aa1d353def29..4c5deb6e9e31 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -488,29 +488,16 @@ found: spin_unlock(&pag->pag_buf_lock); xfs_perag_put(pag); - /* Attempt to get the semaphore without sleeping, - * if this does not work then we need to drop the - * spinlock and do a hard attempt on the semaphore. - */ - if (down_trylock(&bp->b_sema)) { + if (xfs_buf_cond_lock(bp)) { + /* failed, so wait for the lock if requested. */ if (!(flags & XBF_TRYLOCK)) { - /* wait for buffer ownership */ xfs_buf_lock(bp); XFS_STATS_INC(xb_get_locked_waited); } else { - /* We asked for a trylock and failed, no need - * to look at file offset and length here, we - * know that this buffer at least overlaps our - * buffer and is locked, therefore our buffer - * either does not exist, or is this buffer. - */ xfs_buf_rele(bp); XFS_STATS_INC(xb_busy_locked); return NULL; } - } else { - /* trylock worked */ - XB_SET_OWNER(bp); } if (bp->b_flags & XBF_STALE) { @@ -876,10 +863,18 @@ xfs_buf_rele( */ /* - * Locks a buffer object, if it is not already locked. - * Note that this in no way locks the underlying pages, so it is only - * useful for synchronizing concurrent use of buffer objects, not for - * synchronizing independent access to the underlying pages. + * Locks a buffer object, if it is not already locked. Note that this in + * no way locks the underlying pages, so it is only useful for + * synchronizing concurrent use of buffer objects, not for synchronizing + * independent access to the underlying pages. + * + * If we come across a stale, pinned, locked buffer, we know that we are + * being asked to lock a buffer that has been reallocated. Because it is + * pinned, we know that the log has not been pushed to disk and hence it + * will still be locked. Rather than continuing to have trylock attempts + * fail until someone else pushes the log, push it ourselves before + * returning. This means that the xfsaild will not get stuck trying + * to push on stale inode buffers. */ int xfs_buf_cond_lock( @@ -890,6 +885,8 @@ xfs_buf_cond_lock( locked = down_trylock(&bp->b_sema) == 0; if (locked) XB_SET_OWNER(bp); + else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) + xfs_log_force(bp->b_target->bt_mount, 0); trace_xfs_buf_cond_lock(bp, _RET_IP_); return locked ? 0 : -EBUSY; diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 8abd12e32e13..4111cd3966c7 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -5471,8 +5471,13 @@ xfs_getbmap( if (error) goto out_unlock_iolock; } - - ASSERT(ip->i_delayed_blks == 0); + /* + * even after flushing the inode, there can still be delalloc + * blocks on the inode beyond EOF due to speculative + * preallocation. These are not removed until the release + * function is called or the inode is inactivated. Hence we + * cannot assert here that ip->i_delayed_blks == 0. + */ } lock = xfs_ilock_map_shared(ip); @@ -6070,3 +6075,79 @@ xfs_bmap_disk_count_leaves( *count += xfs_bmbt_disk_get_blockcount(frp); } } + +/* + * dead simple method of punching delalyed allocation blocks from a range in + * the inode. Walks a block at a time so will be slow, but is only executed in + * rare error cases so the overhead is not critical. This will alays punch out + * both the start and end blocks, even if the ranges only partially overlap + * them, so it is up to the caller to ensure that partial blocks are not + * passed in. + */ +int +xfs_bmap_punch_delalloc_range( + struct xfs_inode *ip, + xfs_fileoff_t start_fsb, + xfs_fileoff_t length) +{ + xfs_fileoff_t remaining = length; + int error = 0; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + do { + int done; + xfs_bmbt_irec_t imap; + int nimaps = 1; + xfs_fsblock_t firstblock; + xfs_bmap_free_t flist; + + /* + * Map the range first and check that it is a delalloc extent + * before trying to unmap the range. Otherwise we will be + * trying to remove a real extent (which requires a + * transaction) or a hole, which is probably a bad idea... + */ + error = xfs_bmapi(NULL, ip, start_fsb, 1, + XFS_BMAPI_ENTIRE, NULL, 0, &imap, + &nimaps, NULL); + + if (error) { + /* something screwed, just bail */ + if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_fs_cmn_err(CE_ALERT, ip->i_mount, + "Failed delalloc mapping lookup ino %lld fsb %lld.", + ip->i_ino, start_fsb); + } + break; + } + if (!nimaps) { + /* nothing there */ + goto next_block; + } + if (imap.br_startblock != DELAYSTARTBLOCK) { + /* been converted, ignore */ + goto next_block; + } + WARN_ON(imap.br_blockcount == 0); + + /* + * Note: while we initialise the firstblock/flist pair, they + * should never be used because blocks should never be + * allocated or freed for a delalloc extent and hence we need + * don't cancel or finish them after the xfs_bunmapi() call. + */ + xfs_bmap_init(&flist, &firstblock); + error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock, + &flist, &done); + if (error) + break; + + ASSERT(!flist.xbf_count && !flist.xbf_first); +next_block: + start_fsb++; + remaining--; + } while(remaining > 0); + + return error; +} diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 71ec9b6ecdfc..3651191daea1 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -394,6 +394,11 @@ xfs_bmap_count_blocks( int whichfork, int *count); +int +xfs_bmap_punch_delalloc_range( + struct xfs_inode *ip, + xfs_fileoff_t start_fsb, + xfs_fileoff_t length); #endif /* __KERNEL__ */ #endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 3b9582c60a22..e60490bc00a6 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -377,6 +377,19 @@ xfs_swap_extents( ip->i_d.di_format = tip->i_d.di_format; tip->i_d.di_format = tmp; + /* + * The extents in the source inode could still contain speculative + * preallocation beyond EOF (e.g. the file is open but not modified + * while defrag is in progress). In that case, we need to copy over the + * number of delalloc blocks the data fork in the source inode is + * tracking beyond EOF so that when the fork is truncated away when the + * temporary inode is unlinked we don't underrun the i_delayed_blks + * counter on that inode. + */ + ASSERT(tip->i_delayed_blks == 0); + tip->i_delayed_blks = ip->i_delayed_blks; + ip->i_delayed_blks = 0; + ilf_fields = XFS_ILOG_CORE; switch(ip->i_d.di_format) { diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index ed9990267661..c78cc6a3d87c 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -58,6 +58,7 @@ xfs_error_trap(int e) int xfs_etest[XFS_NUM_INJECT_ERROR]; int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR]; char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR]; +int xfs_error_test_active; int xfs_error_test(int error_tag, int *fsidp, char *expression, @@ -108,6 +109,7 @@ xfs_errortag_add(int error_tag, xfs_mount_t *mp) len = strlen(mp->m_fsname); xfs_etest_fsname[i] = kmem_alloc(len + 1, KM_SLEEP); strcpy(xfs_etest_fsname[i], mp->m_fsname); + xfs_error_test_active++; return 0; } } @@ -137,6 +139,7 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud) xfs_etest_fsid[i] = 0LL; kmem_free(xfs_etest_fsname[i]); xfs_etest_fsname[i] = NULL; + xfs_error_test_active--; } } diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index c2c1a072bb82..f338847f80b8 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -127,13 +127,14 @@ extern void xfs_corruption_error(const char *tag, int level, #define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT #ifdef DEBUG +extern int xfs_error_test_active; extern int xfs_error_test(int, int *, char *, int, char *, unsigned long); #define XFS_NUM_INJECT_ERROR 10 #define XFS_TEST_ERROR(expr, mp, tag, rf) \ - ((expr) || \ + ((expr) || (xfs_error_test_active && \ xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ - (rf))) + (rf)))) extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp); extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index c7ac020705df..7c8d30c453c3 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -657,18 +657,37 @@ xfs_inode_item_unlock( } /* - * This is called to find out where the oldest active copy of the - * inode log item in the on disk log resides now that the last log - * write of it completed at the given lsn. Since we always re-log - * all dirty data in an inode, the latest copy in the on disk log - * is the only one that matters. Therefore, simply return the - * given lsn. + * This is called to find out where the oldest active copy of the inode log + * item in the on disk log resides now that the last log write of it completed + * at the given lsn. Since we always re-log all dirty data in an inode, the + * latest copy in the on disk log is the only one that matters. Therefore, + * simply return the given lsn. + * + * If the inode has been marked stale because the cluster is being freed, we + * don't want to (re-)insert this inode into the AIL. There is a race condition + * where the cluster buffer may be unpinned before the inode is inserted into + * the AIL during transaction committed processing. If the buffer is unpinned + * before the inode item has been committed and inserted, then it is possible + * for the buffer to be written and IO completions before the inode is inserted + * into the AIL. In that case, we'd be inserting a clean, stale inode into the + * AIL which will never get removed. It will, however, get reclaimed which + * triggers an assert in xfs_inode_free() complaining about freein an inode + * still in the AIL. + * + * To avoid this, return a lower LSN than the one passed in so that the + * transaction committed code will not move the inode forward in the AIL but + * will still unpin it properly. */ STATIC xfs_lsn_t xfs_inode_item_committed( struct xfs_log_item *lip, xfs_lsn_t lsn) { + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + + if (xfs_iflags_test(ip, XFS_ISTALE)) + return lsn - 1; return lsn; } diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c index d2af0a8381a6..77a59891734e 100644 --- a/fs/xfs/xfs_rename.c +++ b/fs/xfs/xfs_rename.c @@ -297,6 +297,7 @@ xfs_rename( * it and some incremental backup programs won't work without it. */ xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE); /* * Adjust the link count on src_dp. This is necessary when |