From 357b66fdc8ad4cea6e6336956a70742f961f0a4d Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Mon, 4 Mar 2013 00:34:34 -0500 Subject: ext4: ext4_split_extent should take care of extent zeroout When ext4_split_extent_at() ends up doing zeroout & conversion to initialized instead of split & conversion, ext4_split_extent() gets confused and can wrongly mark the extent back as uninitialized resulting in end IO code getting confused from large unwritten extents and may result in data loss. The example of problematic behavior is: lblk len lblk len ext4_split_extent() (ex=[1000,30,uninit], map=[1010,10]) ext4_split_extent_at() (split [1000,30,uninit] at 1020) ext4_ext_insert_extent() -> ENOSPC ext4_ext_zeroout() -> extent [1000,30] is now initialized ext4_split_extent_at() (split [1000,30,init] at 1010, MARK_UNINIT1 | MARK_UNINIT2) -> extent is split and parts marked as uninitialized Fix the problem by rechecking extent type after the first ext4_split_extent_at() returns. None of split_flags can not be applied to initialized extent so this patch also add BUG_ON to prevent similar issues in future. TESTCASE: https://github.com/dmonakhov/xfstests/commit/b8a55eb5ce28c6ff29e620ab090902fcd5833597 Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" Reviewed-by: Jan Kara --- fs/ext4/extents.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 372b2cbee07e..bef194a14437 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2943,6 +2943,10 @@ static int ext4_split_extent_at(handle_t *handle, newblock = split - ee_block + ext4_ext_pblock(ex); BUG_ON(split < ee_block || split >= (ee_block + ee_len)); + BUG_ON(!ext4_ext_is_uninitialized(ex) && + split_flag & (EXT4_EXT_MAY_ZEROOUT | + EXT4_EXT_MARK_UNINIT1 | + EXT4_EXT_MARK_UNINIT2)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) @@ -3061,19 +3065,26 @@ static int ext4_split_extent(handle_t *handle, if (err) goto out; } - + /* + * Update path is required because previous ext4_split_extent_at() may + * result in split of original leaf or extent zeroout. + */ ext4_ext_drop_refs(path); path = ext4_ext_find_extent(inode, map->m_lblk, path); if (IS_ERR(path)) return PTR_ERR(path); + depth = ext_depth(inode); + ex = path[depth].p_ext; + uninitialized = ext4_ext_is_uninitialized(ex); + split_flag1 = 0; if (map->m_lblk >= ee_block) { - split_flag1 = split_flag & (EXT4_EXT_MAY_ZEROOUT | - EXT4_EXT_DATA_VALID2); - if (uninitialized) + split_flag1 = split_flag & EXT4_EXT_DATA_VALID2; + if (uninitialized) { split_flag1 |= EXT4_EXT_MARK_UNINIT1; - if (split_flag & EXT4_EXT_MARK_UNINIT2) - split_flag1 |= EXT4_EXT_MARK_UNINIT2; + split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT | + EXT4_EXT_MARK_UNINIT2); + } err = ext4_split_extent_at(handle, inode, path, map->m_lblk, split_flag1, flags); if (err) -- cgit v1.2.3 From ec22ba8edb507395c95fbc617eea26a6b2d98797 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Mon, 4 Mar 2013 00:36:06 -0500 Subject: ext4: disable merging of uninitialized extents Derived from Jan's patch:http://permalink.gmane.org/gmane.comp.file-systems.ext4/36470 Merging of uninitialized extents creates all sorts of interesting race possibilities when writeback / DIO races with fallocate. Thus ext4_convert_unwritten_extents_endio() has to deal with a case where extent to be converted needs to be split out first. That isn't nice for two reasons: 1) It may need allocation of extent tree block so ENOSPC is possible. 2) It complicates end_io handling code So we disable merging of uninitialized extents which allows us to simplify the code. Extents will get merged after they are converted to initialized ones. Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" Reviewed-by: Jan Kara --- fs/ext4/extents.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index bef194a14437..60818ed1f6a9 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1584,10 +1584,12 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, unsigned short ext1_ee_len, ext2_ee_len, max_len; /* - * Make sure that either both extents are uninitialized, or - * both are _not_. + * Make sure that both extents are initialized. We don't merge + * uninitialized extents so that we can be sure that end_io code has + * the extent that was written properly split out and conversion to + * initialized is trivial. */ - if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2)) + if (ext4_ext_is_uninitialized(ex1) || ext4_ext_is_uninitialized(ex2)) return 0; if (ext4_ext_is_uninitialized(ex1)) -- cgit v1.2.3 From ff95ec22cd7faa0d8b58dcc4207f21502df7b00b Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Mon, 4 Mar 2013 00:41:05 -0500 Subject: ext4: add warning to ext4_convert_unwritten_extents_endio Splitting extents inside endio is a bad thing, but unfortunately it is still possible. In fact we are pretty close to the moment when all related issues will be fixed. Let's warn developer if it still the case. Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" Reviewed-by: Jan Kara --- fs/ext4/extents.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 60818ed1f6a9..265cb0e50c51 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3387,8 +3387,19 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, "block %llu, max_blocks %u\n", inode->i_ino, (unsigned long long)ee_block, ee_len); - /* If extent is larger than requested then split is required */ + /* If extent is larger than requested it is a clear sign that we still + * have some extent state machine issues left. So extent_split is still + * required. + * TODO: Once all related issues will be fixed this situation should be + * illegal. + */ if (ee_block != map->m_lblk || ee_len > map->m_len) { +#ifdef EXT4_DEBUG + ext4_warning("Inode (%ld) finished: extent logical block %llu," + " len %u; IO logical block %llu, len %u\n", + inode->i_ino, (unsigned long long)ee_block, ee_len, + (unsigned long long)map->m_lblk, map->m_len); +#endif err = ext4_split_unwritten_extents(handle, inode, map, path, EXT4_GET_BLOCKS_CONVERT); if (err < 0) -- cgit v1.2.3 From de99fcce1da7933a90198b80a2e896754ea3bdc8 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 4 Mar 2013 00:43:32 -0500 Subject: ext4: remove unnecessary wait for extent conversion in ext4_fallocate() Now that we don't merge uninitialized extents anymore, ext4_fallocate() is free to operate on the inode while there are still some extent conversions pending - it won't disturb them in any way. Reviewed-by: Zheng Liu Reviewed-by: Dmitry Monakhov Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 265cb0e50c51..25c86aaa38d6 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4392,8 +4392,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (len <= EXT_UNINIT_MAX_LEN << blkbits) flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; - /* Prevent race condition between unwritten */ - ext4_flush_unwritten_io(inode); retry: while (ret >= 0 && ret < max_blocks) { map.m_lblk = map.m_lblk + ret; -- cgit v1.2.3 From 6ca470d7b5e7639b7925b3202e796282703b6d5d Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Mon, 4 Mar 2013 00:50:47 -0500 Subject: ext4: invalidate extent status tree during extent migration mext_replace_branches() will change inode's extents layout so we have to drop corresponding cache. TESTCASE: 301'th xfstest was not yet accepted to official xfstest's branch and can be found here: https://github.com/dmonakhov/xfstests/commit/7b7efeee30a41109201e2040034e71db9b66ddc0 Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" Reviewed-by: Jan Kara --- fs/ext4/move_extent.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index d78c33eed7e5..c1f15b203e98 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -666,6 +666,14 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, int replaced_count = 0; int dext_alen; + *err = ext4_es_remove_extent(orig_inode, from, count); + if (*err) + goto out; + + *err = ext4_es_remove_extent(donor_inode, from, count); + if (*err) + goto out; + /* Get the original extent for the block "orig_off" */ *err = get_ext_path(orig_inode, orig_off, &orig_path); if (*err) -- cgit v1.2.3 From bd384364c1185ecb01f3b8242c915ccb5921c60d Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Sun, 10 Mar 2013 20:48:59 -0400 Subject: ext4: avoid a potential overflow in ext4_es_can_be_merged() Check the length of an extent to avoid a potential overflow in ext4_es_can_be_merged(). Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" Cc: Dmitry Monakhov --- fs/ext4/extents_status.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 95796a1b7522..37f9a2d8fd04 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -333,17 +333,27 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) static int ext4_es_can_be_merged(struct extent_status *es1, struct extent_status *es2) { - if (es1->es_lblk + es1->es_len != es2->es_lblk) + if (ext4_es_status(es1) != ext4_es_status(es2)) return 0; - if (ext4_es_status(es1) != ext4_es_status(es2)) + if (((__u64) es1->es_len) + es2->es_len > 0xFFFFFFFFULL) return 0; - if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) && - (ext4_es_pblock(es1) + es1->es_len != ext4_es_pblock(es2))) + if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk) return 0; - return 1; + if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) && + (ext4_es_pblock(es1) + es1->es_len == ext4_es_pblock(es2))) + return 1; + + if (ext4_es_is_hole(es1)) + return 1; + + /* we need to check delayed extent is without unwritten status */ + if (ext4_es_is_delayed(es1) && !ext4_es_is_unwritten(es1)) + return 1; + + return 0; } static struct extent_status * -- cgit v1.2.3 From 921f266bc6bfe6ebb599c559f10443af314c19ec Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Sun, 10 Mar 2013 21:01:03 -0400 Subject: ext4: add self-testing infrastructure to do a sanity check This commit adds a self-testing infrastructure like extent tree does to do a sanity check for extent status tree. After status tree is as a extent cache, we'd better to make sure that it caches right result. After applied this commit, we will get a lot of messages when we run xfstests as below. ... kernel: ES len assertation failed for inode: 230 retval 1 != map->m_len 3 in ext4_map_blocks (allocation) ... kernel: ES cache assertation failed for inode: 230 es_cached ex [974/2/4781/20] != found ex [974/1/4781/1000] ... kernel: ES insert assertation failed for inode: 635 ex_status [0/45/21388/w] != es_status [44/1/21432/u] ... Signed-off-by: Dmitry Monakhov Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents_status.c | 175 +++++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/extents_status.h | 6 ++ fs/ext4/inode.c | 96 ++++++++++++++++++++++++++ 3 files changed, 277 insertions(+) (limited to 'fs') diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 37f9a2d8fd04..d2a8cb74676b 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -399,6 +399,179 @@ ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es) return es; } +#ifdef ES_AGGRESSIVE_TEST +static void ext4_es_insert_extent_ext_check(struct inode *inode, + struct extent_status *es) +{ + struct ext4_ext_path *path = NULL; + struct ext4_extent *ex; + ext4_lblk_t ee_block; + ext4_fsblk_t ee_start; + unsigned short ee_len; + int depth, ee_status, es_status; + + path = ext4_ext_find_extent(inode, es->es_lblk, NULL); + if (IS_ERR(path)) + return; + + depth = ext_depth(inode); + ex = path[depth].p_ext; + + if (ex) { + + ee_block = le32_to_cpu(ex->ee_block); + ee_start = ext4_ext_pblock(ex); + ee_len = ext4_ext_get_actual_len(ex); + + ee_status = ext4_ext_is_uninitialized(ex) ? 1 : 0; + es_status = ext4_es_is_unwritten(es) ? 1 : 0; + + /* + * Make sure ex and es are not overlap when we try to insert + * a delayed/hole extent. + */ + if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) { + if (in_range(es->es_lblk, ee_block, ee_len)) { + pr_warn("ES insert assertation failed for " + "inode: %lu we can find an extent " + "at block [%d/%d/%llu/%c], but we " + "want to add an delayed/hole extent " + "[%d/%d/%llu/%llx]\n", + inode->i_ino, ee_block, ee_len, + ee_start, ee_status ? 'u' : 'w', + es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + } + goto out; + } + + /* + * We don't check ee_block == es->es_lblk, etc. because es + * might be a part of whole extent, vice versa. + */ + if (es->es_lblk < ee_block || + ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) { + pr_warn("ES insert assertation failed for inode: %lu " + "ex_status [%d/%d/%llu/%c] != " + "es_status [%d/%d/%llu/%c]\n", inode->i_ino, + ee_block, ee_len, ee_start, + ee_status ? 'u' : 'w', es->es_lblk, es->es_len, + ext4_es_pblock(es), es_status ? 'u' : 'w'); + goto out; + } + + if (ee_status ^ es_status) { + pr_warn("ES insert assertation failed for inode: %lu " + "ex_status [%d/%d/%llu/%c] != " + "es_status [%d/%d/%llu/%c]\n", inode->i_ino, + ee_block, ee_len, ee_start, + ee_status ? 'u' : 'w', es->es_lblk, es->es_len, + ext4_es_pblock(es), es_status ? 'u' : 'w'); + } + } else { + /* + * We can't find an extent on disk. So we need to make sure + * that we don't want to add an written/unwritten extent. + */ + if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) { + pr_warn("ES insert assertation failed for inode: %lu " + "can't find an extent at block %d but we want " + "to add an written/unwritten extent " + "[%d/%d/%llu/%llx]\n", inode->i_ino, + es->es_lblk, es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + } + } +out: + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + } +} + +static void ext4_es_insert_extent_ind_check(struct inode *inode, + struct extent_status *es) +{ + struct ext4_map_blocks map; + int retval; + + /* + * Here we call ext4_ind_map_blocks to lookup a block mapping because + * 'Indirect' structure is defined in indirect.c. So we couldn't + * access direct/indirect tree from outside. It is too dirty to define + * this function in indirect.c file. + */ + + map.m_lblk = es->es_lblk; + map.m_len = es->es_len; + + retval = ext4_ind_map_blocks(NULL, inode, &map, 0); + if (retval > 0) { + if (ext4_es_is_delayed(es) || ext4_es_is_hole(es)) { + /* + * We want to add a delayed/hole extent but this + * block has been allocated. + */ + pr_warn("ES insert assertation failed for inode: %lu " + "We can find blocks but we want to add a " + "delayed/hole extent [%d/%d/%llu/%llx]\n", + inode->i_ino, es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + return; + } else if (ext4_es_is_written(es)) { + if (retval != es->es_len) { + pr_warn("ES insert assertation failed for " + "inode: %lu retval %d != es_len %d\n", + inode->i_ino, retval, es->es_len); + return; + } + if (map.m_pblk != ext4_es_pblock(es)) { + pr_warn("ES insert assertation failed for " + "inode: %lu m_pblk %llu != " + "es_pblk %llu\n", + inode->i_ino, map.m_pblk, + ext4_es_pblock(es)); + return; + } + } else { + /* + * We don't need to check unwritten extent because + * indirect-based file doesn't have it. + */ + BUG_ON(1); + } + } else if (retval == 0) { + if (ext4_es_is_written(es)) { + pr_warn("ES insert assertation failed for inode: %lu " + "We can't find the block but we want to add " + "an written extent [%d/%d/%llu/%llx]\n", + inode->i_ino, es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + return; + } + } +} + +static inline void ext4_es_insert_extent_check(struct inode *inode, + struct extent_status *es) +{ + /* + * We don't need to worry about the race condition because + * caller takes i_data_sem locking. + */ + BUG_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem)); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + ext4_es_insert_extent_ext_check(inode, es); + else + ext4_es_insert_extent_ind_check(inode, es); +} +#else +static inline void ext4_es_insert_extent_check(struct inode *inode, + struct extent_status *es) +{ +} +#endif + static int __es_insert_extent(struct inode *inode, struct extent_status *newes) { struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; @@ -481,6 +654,8 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ext4_es_store_status(&newes, status); trace_ext4_es_insert_extent(inode, &newes); + ext4_es_insert_extent_check(inode, &newes); + write_lock(&EXT4_I(inode)->i_es_lock); err = __es_remove_extent(inode, lblk, end); if (err != 0) diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index f190dfe969da..56140ad4150b 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -20,6 +20,12 @@ #define es_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif +/* + * With ES_AGGRESSIVE_TEST defined, the result of es caching will be + * checked with old map_block's result. + */ +#define ES_AGGRESSIVE_TEST__ + /* * These flags live in the high bits of extent_status.es_pblk */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 95a0c62c5683..3186a43fa4b0 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -482,6 +482,58 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, return num; } +#ifdef ES_AGGRESSIVE_TEST +static void ext4_map_blocks_es_recheck(handle_t *handle, + struct inode *inode, + struct ext4_map_blocks *es_map, + struct ext4_map_blocks *map, + int flags) +{ + int retval; + + map->m_flags = 0; + /* + * There is a race window that the result is not the same. + * e.g. xfstests #223 when dioread_nolock enables. The reason + * is that we lookup a block mapping in extent status tree with + * out taking i_data_sem. So at the time the unwritten extent + * could be converted. + */ + if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) + down_read((&EXT4_I(inode)->i_data_sem)); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + retval = ext4_ext_map_blocks(handle, inode, map, flags & + EXT4_GET_BLOCKS_KEEP_SIZE); + } else { + retval = ext4_ind_map_blocks(handle, inode, map, flags & + EXT4_GET_BLOCKS_KEEP_SIZE); + } + if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) + up_read((&EXT4_I(inode)->i_data_sem)); + /* + * Clear EXT4_MAP_FROM_CLUSTER and EXT4_MAP_BOUNDARY flag + * because it shouldn't be marked in es_map->m_flags. + */ + map->m_flags &= ~(EXT4_MAP_FROM_CLUSTER | EXT4_MAP_BOUNDARY); + + /* + * We don't check m_len because extent will be collpased in status + * tree. So the m_len might not equal. + */ + if (es_map->m_lblk != map->m_lblk || + es_map->m_flags != map->m_flags || + es_map->m_pblk != map->m_pblk) { + printk("ES cache assertation failed for inode: %lu " + "es_cached ex [%d/%d/%llu/%x] != " + "found ex [%d/%d/%llu/%x] retval %d flags %x\n", + inode->i_ino, es_map->m_lblk, es_map->m_len, + es_map->m_pblk, es_map->m_flags, map->m_lblk, + map->m_len, map->m_pblk, map->m_flags, + retval, flags); + } +} +#endif /* ES_AGGRESSIVE_TEST */ + /* * The ext4_map_blocks() function tries to look up the requested blocks, * and returns if the blocks are already mapped. @@ -509,6 +561,11 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, { struct extent_status es; int retval; +#ifdef ES_AGGRESSIVE_TEST + struct ext4_map_blocks orig_map; + + memcpy(&orig_map, map, sizeof(*map)); +#endif map->m_flags = 0; ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u," @@ -531,6 +588,10 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, } else { BUG_ON(1); } +#ifdef ES_AGGRESSIVE_TEST + ext4_map_blocks_es_recheck(handle, inode, map, + &orig_map, flags); +#endif goto found; } @@ -551,6 +612,15 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, int ret; unsigned long long status; +#ifdef ES_AGGRESSIVE_TEST + if (retval != map->m_len) { + printk("ES len assertation failed for inode: %lu " + "retval %d != map->m_len %d " + "in %s (lookup)\n", inode->i_ino, retval, + map->m_len, __func__); + } +#endif + status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && @@ -643,6 +713,15 @@ found: int ret; unsigned long long status; +#ifdef ES_AGGRESSIVE_TEST + if (retval != map->m_len) { + printk("ES len assertation failed for inode: %lu " + "retval %d != map->m_len %d " + "in %s (allocation)\n", inode->i_ino, retval, + map->m_len, __func__); + } +#endif + status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && @@ -1768,6 +1847,11 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, struct extent_status es; int retval; sector_t invalid_block = ~((sector_t) 0xffff); +#ifdef ES_AGGRESSIVE_TEST + struct ext4_map_blocks orig_map; + + memcpy(&orig_map, map, sizeof(*map)); +#endif if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) invalid_block = ~0; @@ -1809,6 +1893,9 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, else BUG_ON(1); +#ifdef ES_AGGRESSIVE_TEST + ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0); +#endif return retval; } @@ -1873,6 +1960,15 @@ add_delayed: int ret; unsigned long long status; +#ifdef ES_AGGRESSIVE_TEST + if (retval != map->m_len) { + printk("ES len assertation failed for inode: %lu " + "retval %d != map->m_len %d " + "in %s (lookup)\n", inode->i_ino, retval, + map->m_len, __func__); + } +#endif + status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, -- cgit v1.2.3 From cdee78433c138c2f2018a6884673739af2634787 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Sun, 10 Mar 2013 21:08:52 -0400 Subject: ext4: fix wrong m_len value after unwritten extent conversion The ext4_ext_handle_uninitialized_extents() function was assuming the return value of ext4_ext_map_blocks() is equal to map->m_len. This incorrect assumption was harmless until we started use status tree as a extent cache because we need to update status tree according to 'm_len' value. Meanwhile this commit marks EXT4_MAP_MAPPED flag after unwritten extent conversion. It shouldn't cause a bug because we update status tree according to checking EXT4_MAP_UNWRITTEN flag. But it should be fixed. After applied this commit, the following error message from self-testing infrastructure disappears. ... kernel: ES len assertation failed for inode: 230 retval 1 != map->m_len 3 in ext4_map_blocks (allocation) ... Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" Cc: Dmitry Monakhov --- fs/ext4/extents.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 25c86aaa38d6..110e85a1f82a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3650,6 +3650,10 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, path, map->m_len); } else err = ret; + map->m_flags |= EXT4_MAP_MAPPED; + if (allocated > map->m_len) + allocated = map->m_len; + map->m_len = allocated; goto out2; } /* buffered IO case */ -- cgit v1.2.3 From adb2355104b2109e06ba5276485d187d023b2fd2 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Sun, 10 Mar 2013 21:13:05 -0400 Subject: ext4: update extent status tree after an extent is zeroed out When we try to split an extent, this extent could be zeroed out and mark as initialized. But we don't know this in ext4_map_blocks because it only returns a length of allocated extent. Meanwhile we will mark this extent as uninitialized because we only check m_flags. This commit update extent status tree when we try to split an unwritten extent. We don't need to worry about the status of this extent because we always mark it as initialized. Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" Cc: Dmitry Monakhov --- fs/ext4/extents.c | 35 +++++++++++++++++++++++++++++++---- fs/ext4/extents_status.c | 17 +++++++++++++++++ fs/ext4/extents_status.h | 3 +++ fs/ext4/inode.c | 10 ++++++++++ 4 files changed, 61 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 110e85a1f82a..7e37018d1753 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2925,7 +2925,7 @@ static int ext4_split_extent_at(handle_t *handle, { ext4_fsblk_t newblock; ext4_lblk_t ee_block; - struct ext4_extent *ex, newex, orig_ex; + struct ext4_extent *ex, newex, orig_ex, zero_ex; struct ext4_extent *ex2 = NULL; unsigned int ee_len, depth; int err = 0; @@ -2996,12 +2996,26 @@ static int ext4_split_extent_at(handle_t *handle, err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) { if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) { - if (split_flag & EXT4_EXT_DATA_VALID1) + if (split_flag & EXT4_EXT_DATA_VALID1) { err = ext4_ext_zeroout(inode, ex2); - else + zero_ex.ee_block = ex2->ee_block; + zero_ex.ee_len = ext4_ext_get_actual_len(ex2); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(ex2)); + } else { err = ext4_ext_zeroout(inode, ex); - } else + zero_ex.ee_block = ex->ee_block; + zero_ex.ee_len = ext4_ext_get_actual_len(ex); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(ex)); + } + } else { err = ext4_ext_zeroout(inode, &orig_ex); + zero_ex.ee_block = orig_ex.ee_block; + zero_ex.ee_len = ext4_ext_get_actual_len(&orig_ex); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(&orig_ex)); + } if (err) goto fix_extent_len; @@ -3009,6 +3023,12 @@ static int ext4_split_extent_at(handle_t *handle, ex->ee_len = cpu_to_le16(ee_len); ext4_ext_try_to_merge(handle, inode, path, ex); err = ext4_ext_dirty(handle, inode, path + path->p_depth); + if (err) + goto fix_extent_len; + + /* update extent status tree */ + err = ext4_es_zeroout(inode, &zero_ex); + goto out; } else if (err) goto fix_extent_len; @@ -3150,6 +3170,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); allocated = ee_len - (map->m_lblk - ee_block); + zero_ex.ee_len = 0; trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); @@ -3247,6 +3268,9 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, err = ext4_ext_zeroout(inode, ex); if (err) goto out; + zero_ex.ee_block = ex->ee_block; + zero_ex.ee_len = ext4_ext_get_actual_len(ex); + ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) @@ -3305,6 +3329,9 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, err = allocated; out: + /* If we have gotten a failure, don't zero out status tree */ + if (!err) + err = ext4_es_zeroout(inode, &zero_ex); return err ? err : allocated; } diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index d2a8cb74676b..fe3337a85ede 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -854,6 +854,23 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, return err; } +int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex) +{ + ext4_lblk_t ee_block; + ext4_fsblk_t ee_pblock; + unsigned int ee_len; + + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + ee_pblock = ext4_ext_pblock(ex); + + if (ee_len == 0) + return 0; + + return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock, + EXTENT_STATUS_WRITTEN); +} + static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) { struct ext4_sb_info *sbi = container_of(shrink, diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 56140ad4150b..d8e2d4dc311e 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -39,6 +39,8 @@ EXTENT_STATUS_DELAYED | \ EXTENT_STATUS_HOLE) +struct ext4_extent; + struct extent_status { struct rb_node rb_node; ext4_lblk_t es_lblk; /* first logical block extent covers */ @@ -64,6 +66,7 @@ extern void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk, struct extent_status *es); extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, struct extent_status *es); +extern int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex); static inline int ext4_es_is_written(struct extent_status *es) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3186a43fa4b0..4f1d54a88d8c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -722,6 +722,15 @@ found: } #endif + /* + * If the extent has been zeroed out, we don't need to update + * extent status tree. + */ + if ((flags & EXT4_GET_BLOCKS_PRE_IO) && + ext4_es_lookup_extent(inode, map->m_lblk, &es)) { + if (ext4_es_is_written(&es)) + goto has_zeroout; + } status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && @@ -734,6 +743,7 @@ found: retval = ret; } +has_zeroout: up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { int ret = check_block_validity(inode, map); -- cgit v1.2.3 From 3a2256702e47f68f921dfad41b1764d05c572329 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Sun, 10 Mar 2013 21:20:23 -0400 Subject: ext4: fix the wrong number of the allocated blocks in ext4_split_extent() This commit fixes a wrong return value of the number of the allocated blocks in ext4_split_extent. When the length of blocks we want to allocate is greater than the length of the current extent, we return a wrong number. Let's see what happens in the following case when we call ext4_split_extent(). map: [48, 72] ex: [32, 64, u] 'ex' will be split into two parts: ex1: [32, 47, u] ex2: [48, 64, w] 'map->m_len' is returned from this function, and the value is 24. But the real length is 16. So it should be fixed. Meanwhile in this commit we use right length of the allocated blocks when get_reserved_cluster_alloc in ext4_ext_handle_uninitialized_extents is called. Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" Cc: Dmitry Monakhov Cc: stable@vger.kernel.org --- fs/ext4/extents.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 7e37018d1753..69df02ff96aa 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3067,6 +3067,7 @@ static int ext4_split_extent(handle_t *handle, int err = 0; int uninitialized; int split_flag1, flags1; + int allocated = map->m_len; depth = ext_depth(inode); ex = path[depth].p_ext; @@ -3086,6 +3087,8 @@ static int ext4_split_extent(handle_t *handle, map->m_lblk + map->m_len, split_flag1, flags1); if (err) goto out; + } else { + allocated = ee_len - (map->m_lblk - ee_block); } /* * Update path is required because previous ext4_split_extent_at() may @@ -3115,7 +3118,7 @@ static int ext4_split_extent(handle_t *handle, ext4_ext_show_leaf(inode, path); out: - return err ? err : map->m_len; + return err ? err : allocated; } /* @@ -3730,6 +3733,7 @@ out: allocated - map->m_len); allocated = map->m_len; } + map->m_len = allocated; /* * If we have done fallocate with the offset that is already -- cgit v1.2.3 From e1c36595bedc2e1b4112f01256cb30f4d9f9ae46 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 10 Mar 2013 22:19:00 -0400 Subject: ext4: fix WARN_ON from ext4_releasepage() ext4_releasepage() warns when it is passed a page with PageChecked set. However this can correctly happen when invalidate_inode_pages2_range() invalidates pages - and we should fail the release in that case. Since the page was dirty anyway, it won't be discarded and no harm has happened but it's good to be safe. Also remove bogus page_has_buffers() check - we are guaranteed page has buffers in this function. Reported-by: Zheng Liu Tested-by: Zheng Liu Signed-off-by: "Theodore Ts'o" Signed-off-by: Jan Kara --- fs/ext4/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4f1d54a88d8c..117a9e7aa4a0 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3018,8 +3018,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait) trace_ext4_releasepage(page); - WARN_ON(PageChecked(page)); - if (!page_has_buffers(page)) + /* Page has dirty journalled data -> cannot release */ + if (PageChecked(page)) return 0; if (journal) return jbd2_journal_try_to_free_buffers(journal, page, wait); -- cgit v1.2.3 From e3d85c366089015805f175324bb1780249f44669 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Sun, 10 Mar 2013 22:21:49 -0400 Subject: ext4: remove unused variable in ext4_free_blocks() Remove unused variable 'freed' in ext4_free_blocks(). Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 7bb713a46fe4..75e05f3a730f 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4464,7 +4464,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bitmap_bh = NULL; struct super_block *sb = inode->i_sb; struct ext4_group_desc *gdp; - unsigned long freed = 0; unsigned int overflow; ext4_grpblk_t bit; struct buffer_head *gd_bh; @@ -4672,8 +4671,6 @@ do_more: ext4_mb_unload_buddy(&e4b); - freed += count; - if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); -- cgit v1.2.3 From bb8b20ed94bc69120e31399c43cb336300dea109 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Sun, 10 Mar 2013 22:28:09 -0400 Subject: ext4: do not use yield() Using yield() is strongly discouraged (see sched/core.c) especially since we can just use cond_resched(). Replace all use of yield() with cond_resched(). Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 2 +- fs/ext4/mballoc.c | 8 ++------ 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 117a9e7aa4a0..48fc023ab0a2 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1352,7 +1352,7 @@ repeat: ei->i_da_metadata_calc_last_lblock = save_last_lblock; spin_unlock(&ei->i_block_reservation_lock); if (ext4_should_retry_alloc(inode->i_sb, &retries)) { - yield(); + cond_resched(); goto repeat; } dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 75e05f3a730f..8b2ea9f75004 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3692,11 +3692,7 @@ repeat: if (free < needed && busy) { busy = 0; ext4_unlock_group(sb, group); - /* - * Yield the CPU here so that we don't get soft lockup - * in non preempt case. - */ - yield(); + cond_resched(); goto repeat; } @@ -4246,7 +4242,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, ext4_claim_free_clusters(sbi, ar->len, ar->flags)) { /* let others to free the space */ - yield(); + cond_resched(); ar->len = ar->len >> 1; } if (!ar->len) { -- cgit v1.2.3 From 232ec8720d4e45405e37144c67053042c6b886d3 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Sun, 10 Mar 2013 22:46:30 -0400 Subject: ext4: update reserved space after the 'correction' Currently in ext4_ext_map_blocks() in delayed allocation writeback we would update the reservation and after that check whether we claimed cluster outside of the range of the allocation and if so, we'll give the block back to the reservation pool. However this also means that if the number of reserved data block dropped to zero before the correction, we would release all the metadata reservation as well, however we might still need it because the we're not done with the delayed allocation and there might be more blocks to come. This will result in error messages such as: EXT4-fs warning (device sdb): ext4_da_update_reserve_space:361: ino 12, allocated 1 with only 0 reserved metadata blocks (releasing 1 blocks with reserved 1 data blocks) This will only happen on bigalloc file system and it can be easily reproduced using fiemap-tester from xfstests like this: ./src/fiemap-tester -m DHDHDHDHD -S -p0 /mnt/test/file Or using xfstests such as 225. Fix this by doing the correction first and updating the reservation after that so that we do not accidentally decrease i_reserved_data_blocks to zero. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 69df02ff96aa..bd69e906bd91 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4165,9 +4165,6 @@ got_allocated_blocks: } } else { BUG_ON(allocated_clusters < reserved_clusters); - /* We will claim quota for all newly allocated blocks.*/ - ext4_da_update_reserve_space(inode, allocated_clusters, - 1); if (reserved_clusters < allocated_clusters) { struct ext4_inode_info *ei = EXT4_I(inode); int reservation = allocated_clusters - @@ -4218,6 +4215,15 @@ got_allocated_blocks: ei->i_reserved_data_blocks += reservation; spin_unlock(&ei->i_block_reservation_lock); } + /* + * We will claim quota for all newly allocated blocks. + * We're updating the reserved space *after* the + * correction above so we do not accidentally free + * all the metadata reservation because we might + * actually need it later on. + */ + ext4_da_update_reserve_space(inode, allocated_clusters, + 1); } } -- cgit v1.2.3 From 386ad67c9ac043890121c066186883d1640348a4 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Sun, 10 Mar 2013 22:50:00 -0400 Subject: ext4: reserve metadata block for every delayed write Currently we only reserve space (data+metadata) in delayed allocation if we're allocating from new cluster (which is always in non-bigalloc file system) which is ok for data blocks, because we reserve the whole cluster. However we have to reserve metadata for every delayed block we're going to write because every block could potentially require metedata block when we need to grow the extent tree. Signed-off-by: Lukas Czerner --- fs/ext4/inode.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 61 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 48fc023ab0a2..65bbc9339aca 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1304,6 +1304,55 @@ static int ext4_journalled_write_end(struct file *file, return ret ? ret : copied; } +/* + * Reserve a metadata for a single block located at lblock + */ +static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock) +{ + int retries = 0; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned int md_needed; + ext4_lblk_t save_last_lblock; + int save_len; + + /* + * recalculate the amount of metadata blocks to reserve + * in order to allocate nrblocks + * worse case is one extent per block + */ +repeat: + spin_lock(&ei->i_block_reservation_lock); + /* + * ext4_calc_metadata_amount() has side effects, which we have + * to be prepared undo if we fail to claim space. + */ + save_len = ei->i_da_metadata_calc_len; + save_last_lblock = ei->i_da_metadata_calc_last_lblock; + md_needed = EXT4_NUM_B2C(sbi, + ext4_calc_metadata_amount(inode, lblock)); + trace_ext4_da_reserve_space(inode, md_needed); + + /* + * We do still charge estimated metadata to the sb though; + * we cannot afford to run out of free blocks. + */ + if (ext4_claim_free_clusters(sbi, md_needed, 0)) { + ei->i_da_metadata_calc_len = save_len; + ei->i_da_metadata_calc_last_lblock = save_last_lblock; + spin_unlock(&ei->i_block_reservation_lock); + if (ext4_should_retry_alloc(inode->i_sb, &retries)) { + cond_resched(); + goto repeat; + } + return -ENOSPC; + } + ei->i_reserved_meta_blocks += md_needed; + spin_unlock(&ei->i_block_reservation_lock); + + return 0; /* success */ +} + /* * Reserve a single cluster located at lblock */ @@ -1940,8 +1989,11 @@ add_delayed: * XXX: __block_prepare_write() unmaps passed block, * is it OK? */ - /* If the block was allocated from previously allocated cluster, - * then we dont need to reserve it again. */ + /* + * If the block was allocated from previously allocated cluster, + * then we don't need to reserve it again. However we still need + * to reserve metadata for every block we're going to write. + */ if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) { ret = ext4_da_reserve_space(inode, iblock); if (ret) { @@ -1949,6 +2001,13 @@ add_delayed: retval = ret; goto out_unlock; } + } else { + ret = ext4_da_reserve_metadata(inode, iblock); + if (ret) { + /* not enough space to reserve */ + retval = ret; + goto out_unlock; + } } ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, -- cgit v1.2.3 From ad56edad089b56300fd13bb9eeb7d0424d978239 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 11 Mar 2013 13:24:56 -0400 Subject: jbd2: fix use after free in jbd2_journal_dirty_metadata() jbd2_journal_dirty_metadata() didn't get a reference to journal_head it was working with. This is OK in most of the cases since the journal head should be attached to a transaction but in rare occasions when we are journalling data, __ext4_journalled_writepage() can race with jbd2_journal_invalidatepage() stripping buffers from a page and thus journal head can be freed under hands of jbd2_journal_dirty_metadata(). Fix the problem by getting own journal head reference in jbd2_journal_dirty_metadata() (and also in jbd2_journal_set_triggers() which can possibly have the same issue). Reported-by: Zheng Liu Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org --- fs/jbd2/transaction.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index d6ee5aed56b1..325bc019ed88 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1065,9 +1065,12 @@ out: void jbd2_journal_set_triggers(struct buffer_head *bh, struct jbd2_buffer_trigger_type *type) { - struct journal_head *jh = bh2jh(bh); + struct journal_head *jh = jbd2_journal_grab_journal_head(bh); + if (WARN_ON(!jh)) + return; jh->b_triggers = type; + jbd2_journal_put_journal_head(jh); } void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data, @@ -1119,17 +1122,18 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) { transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; - struct journal_head *jh = bh2jh(bh); + struct journal_head *jh; int ret = 0; - jbd_debug(5, "journal_head %p\n", jh); - JBUFFER_TRACE(jh, "entry"); if (is_handle_aborted(handle)) goto out; - if (!buffer_jbd(bh)) { + jh = jbd2_journal_grab_journal_head(bh); + if (!jh) { ret = -EUCLEAN; goto out; } + jbd_debug(5, "journal_head %p\n", jh); + JBUFFER_TRACE(jh, "entry"); jbd_lock_bh_state(bh); @@ -1220,6 +1224,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) spin_unlock(&journal->j_list_lock); out_unlock_bh: jbd_unlock_bh_state(bh); + jbd2_journal_put_journal_head(jh); out: JBUFFER_TRACE(jh, "exit"); WARN_ON(ret); /* All errors are bugs, so dump the stack */ -- cgit v1.2.3 From 90ba983f6889e65a3b506b30dc606aa9d1d46cd2 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 11 Mar 2013 23:39:59 -0400 Subject: ext4: use atomic64_t for the per-flexbg free_clusters count A user who was using a 8TB+ file system and with a very large flexbg size (> 65536) could cause the atomic_t used in the struct flex_groups to overflow. This was detected by PaX security patchset: http://forums.grsecurity.net/viewtopic.php?f=3&t=3289&p=12551#p12551 This bug was introduced in commit 9f24e4208f7e, so it's been around since 2.6.30. :-( Fix this by using an atomic64_t for struct orlav_stats's free_clusters. Signed-off-by: "Theodore Ts'o" Reviewed-by: Lukas Czerner Cc: stable@vger.kernel.org --- fs/ext4/ext4.h | 6 +++--- fs/ext4/ialloc.c | 4 ++-- fs/ext4/mballoc.c | 12 ++++++------ fs/ext4/resize.c | 4 ++-- fs/ext4/super.c | 4 ++-- 5 files changed, 15 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 4a01ba315262..167ff564bbfa 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -335,9 +335,9 @@ struct ext4_group_desc */ struct flex_groups { - atomic_t free_inodes; - atomic_t free_clusters; - atomic_t used_dirs; + atomic64_t free_clusters; + atomic_t free_inodes; + atomic_t used_dirs; }; #define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 32fd2b9075dd..6c5bb8d993fe 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -324,8 +324,8 @@ error_return: } struct orlov_stats { + __u64 free_clusters; __u32 free_inodes; - __u32 free_clusters; __u32 used_dirs; }; @@ -342,7 +342,7 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g, if (flex_size > 1) { stats->free_inodes = atomic_read(&flex_group[g].free_inodes); - stats->free_clusters = atomic_read(&flex_group[g].free_clusters); + stats->free_clusters = atomic64_read(&flex_group[g].free_clusters); stats->used_dirs = atomic_read(&flex_group[g].used_dirs); return; } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 8b2ea9f75004..ee6614bdb639 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2804,8 +2804,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, ac->ac_b_ex.fe_group); - atomic_sub(ac->ac_b_ex.fe_len, - &sbi->s_flex_groups[flex_group].free_clusters); + atomic64_sub(ac->ac_b_ex.fe_len, + &sbi->s_flex_groups[flex_group].free_clusters); } err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); @@ -4661,8 +4661,8 @@ do_more: if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic_add(count_clusters, - &sbi->s_flex_groups[flex_group].free_clusters); + atomic64_add(count_clusters, + &sbi->s_flex_groups[flex_group].free_clusters); } ext4_mb_unload_buddy(&e4b); @@ -4804,8 +4804,8 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic_add(EXT4_NUM_B2C(sbi, blocks_freed), - &sbi->s_flex_groups[flex_group].free_clusters); + atomic64_add(EXT4_NUM_B2C(sbi, blocks_freed), + &sbi->s_flex_groups[flex_group].free_clusters); } ext4_mb_unload_buddy(&e4b); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index b2c8ee56eb98..c169477a62c9 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1360,8 +1360,8 @@ static void ext4_update_super(struct super_block *sb, sbi->s_log_groups_per_flex) { ext4_group_t flex_group; flex_group = ext4_flex_group(sbi, group_data[0].group); - atomic_add(EXT4_NUM_B2C(sbi, free_blocks), - &sbi->s_flex_groups[flex_group].free_clusters); + atomic64_add(EXT4_NUM_B2C(sbi, free_blocks), + &sbi->s_flex_groups[flex_group].free_clusters); atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count, &sbi->s_flex_groups[flex_group].free_inodes); } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9379b7fbfd92..d1ee6a84338a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1923,8 +1923,8 @@ static int ext4_fill_flex_info(struct super_block *sb) flex_group = ext4_flex_group(sbi, i); atomic_add(ext4_free_inodes_count(sb, gdp), &sbi->s_flex_groups[flex_group].free_inodes); - atomic_add(ext4_free_group_clusters(sb, gdp), - &sbi->s_flex_groups[flex_group].free_clusters); + atomic64_add(ext4_free_group_clusters(sb, gdp), + &sbi->s_flex_groups[flex_group].free_clusters); atomic_add(ext4_used_dirs_count(sb, gdp), &sbi->s_flex_groups[flex_group].used_dirs); } -- cgit v1.2.3 From 4f42f80a8f08d4c3f52c4267361241885d5dee3a Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Tue, 12 Mar 2013 12:40:04 -0400 Subject: ext4: use s_extent_max_zeroout_kb value as number of kb Currently when converting extent to initialized, we have to decide whether to zeroout part/all of the uninitialized extent in order to avoid extent tree growing rapidly. The decision is made by comparing the size of the extent with the configurable value s_extent_max_zeroout_kb which is in kibibytes units. However when converting it to number of blocks we currently use it as it was in bytes. This is obviously bug and it will result in ext4 _never_ zeroout extents, but rather always split and convert parts to initialized while leaving the rest uninitialized in default setting. Fix this by using s_extent_max_zeroout_kb as kibibytes. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org --- fs/ext4/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index bd69e906bd91..e2bb929bea93 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3264,7 +3264,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, if (EXT4_EXT_MAY_ZEROOUT & split_flag) max_zeroout = sbi->s_extent_max_zeroout_kb >> - inode->i_sb->s_blocksize_bits; + (inode->i_sb->s_blocksize_bits - 10); /* If extent is less than s_max_zeroout_kb, zeroout directly */ if (max_zeroout && (ee_len <= max_zeroout)) { -- cgit v1.2.3 From 47c78f4a70d791ff44cab3254b489605a52e3181 Mon Sep 17 00:00:00 2001 From: Sachin Prabhu Date: Mon, 11 Mar 2013 13:08:49 +0000 Subject: cifs: map NT_STATUS_SHARING_VIOLATION to EBUSY instead of ETXTBSY NT_SHARING_VIOLATION errors are mapped to ETXTBSY which is unexpected for operations such as unlink where we can hit these errors. The patch maps the error NT_SHARING_VIOLATION to EBUSY instead. The patch also replaces all instances of ETXTBSY in cifs_rename_pending_delete() with EBUSY. Signed-off-by: Sachin Prabhu Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/inode.c | 10 ++++------ fs/cifs/netmisc.c | 2 +- 2 files changed, 5 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 0079696305c9..20887bf63121 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1043,7 +1043,7 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (rc != 0) { - rc = -ETXTBSY; + rc = -EBUSY; goto undo_setattr; } @@ -1062,7 +1062,7 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, if (rc == -ENOENT) rc = 0; else if (rc != 0) { - rc = -ETXTBSY; + rc = -EBUSY; goto undo_rename; } cifsInode->delete_pending = true; @@ -1169,15 +1169,13 @@ psx_del_no_retry: cifs_drop_nlink(inode); } else if (rc == -ENOENT) { d_drop(dentry); - } else if (rc == -ETXTBSY) { + } else if (rc == -EBUSY) { if (server->ops->rename_pending_delete) { rc = server->ops->rename_pending_delete(full_path, dentry, xid); if (rc == 0) cifs_drop_nlink(inode); } - if (rc == -ETXTBSY) - rc = -EBUSY; } else if ((rc == -EACCES) && (dosattr == 0) && inode) { attrs = kzalloc(sizeof(*attrs), GFP_KERNEL); if (attrs == NULL) { @@ -1518,7 +1516,7 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry, * source. Note that cross directory moves do not work with * rename by filehandle to various Windows servers. */ - if (rc == 0 || rc != -ETXTBSY) + if (rc == 0 || rc != -EBUSY) goto do_rename_exit; /* open-file renames don't work across directories */ diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index a82bc51fdc82..c0b25b28be6c 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -62,7 +62,7 @@ static const struct smb_to_posix_error mapping_table_ERRDOS[] = { {ERRdiffdevice, -EXDEV}, {ERRnofiles, -ENOENT}, {ERRwriteprot, -EROFS}, - {ERRbadshare, -ETXTBSY}, + {ERRbadshare, -EBUSY}, {ERRlock, -EACCES}, {ERRunsup, -EINVAL}, {ERRnosuchshare, -ENXIO}, -- cgit v1.2.3 From 24261fc23db950951760d00c188ba63cc756b932 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Fri, 8 Mar 2013 16:30:03 +0100 Subject: cifs: delay super block destruction until all cifsFileInfo objects are gone cifsFileInfo objects hold references to dentries and it is possible that these will still be around in workqueues when VFS decides to kill super block during unmount. This results in panics like this one: BUG: Dentry ffff88001f5e76c0{i=66b4a,n=1M-2} still in use (1) [unmount of cifs cifs] ------------[ cut here ]------------ kernel BUG at fs/dcache.c:943! [..] Process umount (pid: 1781, threadinfo ffff88003d6e8000, task ffff880035eeaec0) [..] Call Trace: [] shrink_dcache_for_umount+0x33/0x60 [] generic_shutdown_super+0x2c/0xe0 [] kill_anon_super+0x16/0x30 [] cifs_kill_sb+0x1a/0x30 [cifs] [] deactivate_locked_super+0x57/0x80 [] deactivate_super+0x4e/0x70 [] mntput_no_expire+0xd7/0x130 [] sys_umount+0x9c/0x3c0 [] system_call_fastpath+0x16/0x1b Fix this by making each cifsFileInfo object hold a reference to cifs super block, which implicitly keeps VFS super block around as well. Signed-off-by: Mateusz Guzik Reviewed-by: Jeff Layton Cc: Reported-and-Tested-by: Ben Greear Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 24 ++++++++++++++++++++++++ fs/cifs/cifsfs.h | 4 ++++ fs/cifs/file.c | 6 +++++- 3 files changed, 33 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 1a052c0eee8e..054b90b682a7 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -91,6 +91,30 @@ struct workqueue_struct *cifsiod_wq; __u8 cifs_client_guid[SMB2_CLIENT_GUID_SIZE]; #endif +/* + * Bumps refcount for cifs super block. + * Note that it should be only called if a referece to VFS super block is + * already held, e.g. in open-type syscalls context. Otherwise it can race with + * atomic_dec_and_test in deactivate_locked_super. + */ +void +cifs_sb_active(struct super_block *sb) +{ + struct cifs_sb_info *server = CIFS_SB(sb); + + if (atomic_inc_return(&server->active) == 1) + atomic_inc(&sb->s_active); +} + +void +cifs_sb_deactive(struct super_block *sb) +{ + struct cifs_sb_info *server = CIFS_SB(sb); + + if (atomic_dec_and_test(&server->active)) + deactivate_super(sb); +} + static int cifs_read_super(struct super_block *sb) { diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 7163419cecd9..0e32c3446ce9 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -41,6 +41,10 @@ extern struct file_system_type cifs_fs_type; extern const struct address_space_operations cifs_addr_ops; extern const struct address_space_operations cifs_addr_ops_smallbuf; +/* Functions related to super block operations */ +extern void cifs_sb_active(struct super_block *sb); +extern void cifs_sb_deactive(struct super_block *sb); + /* Functions related to inodes */ extern const struct inode_operations cifs_dir_inode_ops; extern struct inode *cifs_root_iget(struct super_block *); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 8c0d85577314..7a0dd99e4507 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -300,6 +300,8 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, INIT_WORK(&cfile->oplock_break, cifs_oplock_break); mutex_init(&cfile->fh_mutex); + cifs_sb_active(inode->i_sb); + /* * If the server returned a read oplock and we have mandatory brlocks, * set oplock level to None. @@ -349,7 +351,8 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink); struct TCP_Server_Info *server = tcon->ses->server; struct cifsInodeInfo *cifsi = CIFS_I(inode); - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct super_block *sb = inode->i_sb; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct cifsLockInfo *li, *tmp; struct cifs_fid fid; struct cifs_pending_open open; @@ -414,6 +417,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) cifs_put_tlink(cifs_file->tlink); dput(cifs_file->dentry); + cifs_sb_deactive(sb); kfree(cifs_file); } -- cgit v1.2.3 From 0e401101db49959f5783f6ee9e676124b5a183ac Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Mon, 18 Mar 2013 11:40:19 -0400 Subject: ext4: fix memory leakage in mext_check_coverage Regression was introduced by following commit 8c854473 TESTCASE (git://oss.sgi.com/xfs/cmds/xfstests.git): #while true;do ./check 301 || break ;done Also fix potential memory leakage in get_ext_path() once ext4_ext_find_extent() have failed. Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" --- fs/ext4/move_extent.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index c1f15b203e98..bbae4ed15c3d 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -32,16 +32,18 @@ */ static inline int get_ext_path(struct inode *inode, ext4_lblk_t lblock, - struct ext4_ext_path **path) + struct ext4_ext_path **orig_path) { int ret = 0; + struct ext4_ext_path *path; - *path = ext4_ext_find_extent(inode, lblock, *path); - if (IS_ERR(*path)) { - ret = PTR_ERR(*path); - *path = NULL; - } else if ((*path)[ext_depth(inode)].p_ext == NULL) + path = ext4_ext_find_extent(inode, lblock, *orig_path); + if (IS_ERR(path)) + ret = PTR_ERR(path); + else if (path[ext_depth(inode)].p_ext == NULL) ret = -ENODATA; + else + *orig_path = path; return ret; } @@ -611,24 +613,25 @@ mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count, { struct ext4_ext_path *path = NULL; struct ext4_extent *ext; + int ret = 0; ext4_lblk_t last = from + count; while (from < last) { *err = get_ext_path(inode, from, &path); if (*err) - return 0; + goto out; ext = path[ext_depth(inode)].p_ext; - if (!ext) { - ext4_ext_drop_refs(path); - return 0; - } - if (uninit != ext4_ext_is_uninitialized(ext)) { - ext4_ext_drop_refs(path); - return 0; - } + if (uninit != ext4_ext_is_uninitialized(ext)) + goto out; from += ext4_ext_get_actual_len(ext); ext4_ext_drop_refs(path); } - return 1; + ret = 1; +out: + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + } + return ret; } /** -- cgit v1.2.3 From 83cdadd8b0559c93728d065d23ca3485fa567e54 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 22 Feb 2013 13:32:56 -0500 Subject: xfs: fix potential infinite loop in xfs_iomap_prealloc_size() If freesp == 0, we could end up in an infinite loop while squashing the preallocation. Break the loop when we've killed the prealloc entirely. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Ben Myers (cherry picked from commit e78c420bfc2608bb5f9a0b9165b1071c1e31166a) --- fs/xfs/xfs_iomap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 912d83d8860a..b0b0f448e843 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -413,7 +413,7 @@ xfs_iomap_prealloc_size( * have a large file on a small filesystem and the above * lowspace thresholds are smaller than MAXEXTLEN. */ - while (alloc_blocks >= freesp) + while (alloc_blocks && alloc_blocks >= freesp) alloc_blocks >>= 4; } -- cgit v1.2.3 From 3325beed46d8d14d873e94d89ea57ee900dec942 Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Sun, 24 Feb 2013 13:04:37 -0600 Subject: xfs: fix xfs_iomap_eof_prealloc_initial_size type Fix the return type of xfs_iomap_eof_prealloc_initial_size() to xfs_fsblock_t to reflect the fact that the return value may be an unsigned 64 bits if XFS_BIG_BLKNOS is defined. Signed-off-by: Mark Tinguely Reviewed-by: Dave Chinner Signed-off-by: Ben Myers (cherry picked from commit e8108cedb1c5d1dc359690d18ca997e97a0061d2) --- fs/xfs/xfs_iomap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index b0b0f448e843..5a30dd899d2b 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -325,7 +325,7 @@ xfs_iomap_eof_want_preallocate( * rather than falling short due to things like stripe unit/width alignment of * real extents. */ -STATIC int +STATIC xfs_fsblock_t xfs_iomap_eof_prealloc_initial_size( struct xfs_mount *mp, struct xfs_inode *ip, -- cgit v1.2.3 From e001873853d87674dd5b3cfa2851885023616695 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 12 Mar 2013 23:30:34 +1100 Subject: xfs: ensure we capture IO errors correctly Failed buffer readahead can leave the buffer in the cache marked with an error. Most callers that then issue a subsequent read on the buffer do not zero the b_error field out, and so we may incorectly detect an error during IO completion due to the stale error value left on the buffer. Avoid this problem by zeroing the error before IO submission. This ensures that the only IO errors that are detected those captured from are those captured from bio submission or completion. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers (cherry picked from commit c163f9a1760229a95d04e37b332de7d5c1c225cd) --- fs/xfs/xfs_buf.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 4e8f0df82d02..8459b5d8cb71 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1334,6 +1334,12 @@ _xfs_buf_ioapply( int size; int i; + /* + * Make sure we capture only current IO errors rather than stale errors + * left over from previous use of the buffer (e.g. failed readahead). + */ + bp->b_error = 0; + if (bp->b_flags & XBF_WRITE) { if (bp->b_flags & XBF_SYNCIO) rw = WRITE_SYNC; -- cgit v1.2.3 From a517b608fa3d9b65930ef53ffe4a2f9800e10f7d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 18 Mar 2013 10:49:07 -0400 Subject: nfsd: only unhash DRC entries that are in the hashtable It's not safe to call hlist_del() on a newly initialized hlist_node. That leads to a NULL pointer dereference. Only do that if the entry is hashed. Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/nfscache.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 62c1ee128aeb..18509bd6f587 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -102,7 +102,8 @@ nfsd_reply_cache_free_locked(struct svc_cacherep *rp) { if (rp->c_type == RC_REPLBUFF) kfree(rp->c_replvec.iov_base); - hlist_del(&rp->c_hash); + if (!hlist_unhashed(&rp->c_hash)) + hlist_del(&rp->c_hash); list_del(&rp->c_lru); --num_drc_entries; kmem_cache_free(drc_slab, rp); -- cgit v1.2.3 From ac534ff2d5508bdff1358a55d88053da729ff46b Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 15 Mar 2013 09:16:29 -0400 Subject: nfsd: fix startup order in nfsd_reply_cache_init If we end up doing "goto out_nomem" in this function, we'll call nfsd_reply_cache_shutdown. That will attempt to walk the LRU list and free entries, but that list may not be initialized yet if the server is starting up for the first time. It's also possible for the shrinker to kick in before we've initialized the LRU list. Rearrange the initialization so that the LRU list_head and cache size are initialized before doing any of the allocations that might fail. Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/nfscache.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 18509bd6f587..ca05f6dc3544 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -119,6 +119,10 @@ nfsd_reply_cache_free(struct svc_cacherep *rp) int nfsd_reply_cache_init(void) { + INIT_LIST_HEAD(&lru_head); + max_drc_entries = nfsd_cache_size_limit(); + num_drc_entries = 0; + register_shrinker(&nfsd_reply_cache_shrinker); drc_slab = kmem_cache_create("nfsd_drc", sizeof(struct svc_cacherep), 0, 0, NULL); @@ -129,10 +133,6 @@ int nfsd_reply_cache_init(void) if (!cache_hash) goto out_nomem; - INIT_LIST_HEAD(&lru_head); - max_drc_entries = nfsd_cache_size_limit(); - num_drc_entries = 0; - return 0; out_nomem: printk(KERN_ERR "nfsd: failed to allocate reply cache\n"); -- cgit v1.2.3 From 1ada47d9468fe3907f7f9e00179168f5e2f90803 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 20 Mar 2013 09:39:42 -0400 Subject: ext4: fix ext4_evict_inode() racing against workqueue processing code Commit 84c17543ab56 (ext4: move work from io_end to inode) triggered a regression when running xfstest #270 when the file system is mounted with dioread_nolock. The problem is that after ext4_evict_inode() calls ext4_ioend_wait(), this guarantees that last io_end structure has been freed, but it does not guarantee that the workqueue structure, which was moved into the inode by commit 84c17543ab56, is actually finished. Once ext4_flush_completed_IO() calls ext4_free_io_end() on CPU #1, this will allow ext4_ioend_wait() to return on CPU #2, at which point the evict_inode() codepath can race against the workqueue code on CPU #1 accessing EXT4_I(inode)->i_unwritten_work to find the next item of work to do. Fix this by calling cancel_work_sync() in ext4_ioend_wait(), which will be renamed ext4_ioend_shutdown(), since it is only used by ext4_evict_inode(). Also, move the call to ext4_ioend_shutdown() until after truncate_inode_pages() and filemap_write_and_wait() are called, to make sure all dirty pages have been written back and flushed from the page cache first. BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] cwq_activate_delayed_work+0x3b/0x7e *pdpt = 0000000030bc3001 *pde = 0000000000000000 Oops: 0000 [#1] SMP DEBUG_PAGEALLOC Modules linked in: Pid: 6, comm: kworker/u:0 Not tainted 3.8.0-rc3-00013-g84c1754-dirty #91 Bochs Bochs EIP: 0060:[] EFLAGS: 00010046 CPU: 0 EIP is at cwq_activate_delayed_work+0x3b/0x7e EAX: 00000000 EBX: 00000000 ECX: f505fe54 EDX: 00000000 ESI: ed5b697c EDI: 00000006 EBP: f64b7e8c ESP: f64b7e84 DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 CR0: 8005003b CR2: 00000000 CR3: 30bc2000 CR4: 000006f0 DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000 DR6: ffff0ff0 DR7: 00000400 Process kworker/u:0 (pid: 6, ti=f64b6000 task=f64b4160 task.ti=f64b6000) Stack: f505fe00 00000006 f64b7e9c c01de3d7 f6435540 00000003 f64b7efc c01def1d f6435540 00000002 00000000 0000008a c16d0808 c040a10b c16d07d8 c16d08b0 f505fe00 c16d0780 00000000 00000000 ee153df4 c1ce4a30 c17d0e30 00000000 Call Trace: [] cwq_dec_nr_in_flight+0x71/0xfb [] process_one_work+0x5d8/0x637 [] ? ext4_end_bio+0x300/0x300 [] worker_thread+0x249/0x3ef [] kthread+0xd8/0xeb [] ? manage_workers+0x4bb/0x4bb [] ? trace_hardirqs_on+0x27/0x37 [] ret_from_kernel_thread+0x1b/0x28 [] ? __init_kthread_worker+0x71/0x71 Code: 01 83 15 ac ff 6c c1 00 31 db 89 c6 8b 00 a8 04 74 12 89 c3 30 db 83 05 b0 ff 6c c1 01 83 15 b4 ff 6c c1 00 89 f0 e8 42 ff ff ff <8b> 13 89 f0 83 05 b8 ff 6c c1 6c c1 00 31 c9 83 EIP: [] cwq_activate_delayed_work+0x3b/0x7e SS:ESP 0068:f64b7e84 CR2: 0000000000000000 ---[ end trace a1923229da53d8a4 ]--- Signed-off-by: "Theodore Ts'o" Cc: Jan Kara --- fs/ext4/ext4.h | 2 +- fs/ext4/inode.c | 4 ++-- fs/ext4/page-io.c | 12 +++++++++++- 3 files changed, 14 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 167ff564bbfa..3b83cd604796 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2617,7 +2617,7 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, extern int __init ext4_init_pageio(void); extern void ext4_add_complete_io(ext4_io_end_t *io_end); extern void ext4_exit_pageio(void); -extern void ext4_ioend_wait(struct inode *); +extern void ext4_ioend_shutdown(struct inode *); extern void ext4_free_io_end(ext4_io_end_t *io); extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); extern void ext4_end_io_work(struct work_struct *work); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 65bbc9339aca..ea5f24ffa60c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -185,8 +185,6 @@ void ext4_evict_inode(struct inode *inode) trace_ext4_evict_inode(inode); - ext4_ioend_wait(inode); - if (inode->i_nlink) { /* * When journalling data dirty buffers are tracked only in the @@ -216,6 +214,7 @@ void ext4_evict_inode(struct inode *inode) filemap_write_and_wait(&inode->i_data); } truncate_inode_pages(&inode->i_data, 0); + ext4_ioend_shutdown(inode); goto no_delete; } @@ -225,6 +224,7 @@ void ext4_evict_inode(struct inode *inode) if (ext4_should_order_data(inode)) ext4_begin_ordered_truncate(inode, 0); truncate_inode_pages(&inode->i_data, 0); + ext4_ioend_shutdown(inode); if (is_bad_inode(inode)) goto no_delete; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 809b31003ecc..047a6de04a0a 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -50,11 +50,21 @@ void ext4_exit_pageio(void) kmem_cache_destroy(io_page_cachep); } -void ext4_ioend_wait(struct inode *inode) +/* + * This function is called by ext4_evict_inode() to make sure there is + * no more pending I/O completion work left to do. + */ +void ext4_ioend_shutdown(struct inode *inode) { wait_queue_head_t *wq = ext4_ioend_wq(inode); wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0)); + /* + * We need to make sure the work structure is finished being + * used before we let the inode get destroyed. + */ + if (work_pending(&EXT4_I(inode)->i_unwritten_work)) + cancel_work_sync(&EXT4_I(inode)->i_unwritten_work); } static void put_io_page(struct ext4_io_page *io_page) -- cgit v1.2.3 From 2b405bfa84063bfa35621d2d6879f52693c614b0 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 20 Mar 2013 09:42:11 -0400 Subject: ext4: fix data=journal fast mount/umount hang In data=journal mode, if we unmount the file system before a transaction has a chance to complete, when the journal inode is being evicted, we can end up calling into jbd2_log_wait_commit() for the last transaction, after the journalling machinery has been shut down. Arguably we should adjust ext4_should_journal_data() to return FALSE for the journal inode, but the only place it matters is ext4_evict_inode(), and so to save a bit of CPU time, and to make the patch much more obviously correct by inspection(tm), we'll fix it by explicitly not trying to waiting for a journal commit when we are evicting the journal inode, since it's guaranteed to never succeed in this case. This can be easily replicated via: mount -t ext4 -o data=journal /dev/vdb /vdb ; umount /vdb ------------[ cut here ]------------ WARNING: at /usr/projects/linux/ext4/fs/jbd2/journal.c:542 __jbd2_log_start_commit+0xba/0xcd() Hardware name: Bochs JBD2: bad log_start_commit: 3005630206 3005630206 0 0 Modules linked in: Pid: 2909, comm: umount Not tainted 3.8.0-rc3 #1020 Call Trace: [] warn_slowpath_common+0x68/0x7d [] ? __jbd2_log_start_commit+0xba/0xcd [] warn_slowpath_fmt+0x2b/0x2f [] __jbd2_log_start_commit+0xba/0xcd [] jbd2_log_start_commit+0x24/0x34 [] ext4_evict_inode+0x71/0x2e3 [] evict+0x94/0x135 [] iput+0x10a/0x110 [] jbd2_journal_destroy+0x190/0x1ce [] ? bit_waitqueue+0x50/0x50 [] ext4_put_super+0x52/0x294 [] generic_shutdown_super+0x48/0xb4 [] kill_block_super+0x22/0x60 [] deactivate_locked_super+0x22/0x49 [] deactivate_super+0x30/0x33 [] mntput_no_expire+0x107/0x10c [] sys_umount+0x2cf/0x2e0 [] sys_oldumount+0x12/0x14 [] syscall_call+0x7/0xb ---[ end trace 6a954cc790501c1f ]--- jbd2_log_wait_commit: error: j_commit_request=-1289337090, tid=0 Signed-off-by: "Theodore Ts'o" Reviewed-by: Jan Kara Cc: stable@vger.kernel.org --- fs/ext4/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ea5f24ffa60c..85e41a2a39ad 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -205,7 +205,8 @@ void ext4_evict_inode(struct inode *inode) * don't use page cache. */ if (ext4_should_journal_data(inode) && - (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) { + (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) && + inode->i_ino != EXT4_JOURNAL_INO) { journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; -- cgit v1.2.3 From cf4ab538f1516606d3ae730dce15d6f33d96b7e1 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 8 Mar 2013 12:56:37 -0500 Subject: NFSv4: Fix the string length returned by the idmapper Functions like nfs_map_uid_to_name() and nfs_map_gid_to_group() are expected to return a string without any terminating NUL character. Regression introduced by commit 57e62324e469e092ecc6c94a7a86fe4bd6ac5172 (NFS: Store the legacy idmapper result in the keyring). Reported-by: Dave Chiluk Signed-off-by: Trond Myklebust Cc: Bryan Schumaker Cc: stable@vger.kernel.org [>=3.4] --- fs/nfs/idmap.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index dc0f98dfa717..c516da5873fd 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -726,9 +726,9 @@ out1: return ret; } -static int nfs_idmap_instantiate(struct key *key, struct key *authkey, char *data) +static int nfs_idmap_instantiate(struct key *key, struct key *authkey, char *data, size_t datalen) { - return key_instantiate_and_link(key, data, strlen(data) + 1, + return key_instantiate_and_link(key, data, datalen, id_resolver_cache->thread_keyring, authkey); } @@ -738,6 +738,7 @@ static int nfs_idmap_read_and_verify_message(struct idmap_msg *im, struct key *key, struct key *authkey) { char id_str[NFS_UINT_MAXLEN]; + size_t len; int ret = -ENOKEY; /* ret = -ENOKEY */ @@ -747,13 +748,15 @@ static int nfs_idmap_read_and_verify_message(struct idmap_msg *im, case IDMAP_CONV_NAMETOID: if (strcmp(upcall->im_name, im->im_name) != 0) break; - sprintf(id_str, "%d", im->im_id); - ret = nfs_idmap_instantiate(key, authkey, id_str); + /* Note: here we store the NUL terminator too */ + len = sprintf(id_str, "%d", im->im_id) + 1; + ret = nfs_idmap_instantiate(key, authkey, id_str, len); break; case IDMAP_CONV_IDTONAME: if (upcall->im_id != im->im_id) break; - ret = nfs_idmap_instantiate(key, authkey, im->im_name); + len = strlen(im->im_name); + ret = nfs_idmap_instantiate(key, authkey, im->im_name, len); break; default: ret = -EINVAL; -- cgit v1.2.3 From 991f76f837bf22c5bb07261cfd86525a0a96650c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 20 Mar 2013 23:25:24 +0800 Subject: sysfs: fix race between readdir and lseek While readdir() is running, lseek() may set filp->f_pos as zero, then may leave filp->private_data pointing to one sysfs_dirent object without holding its reference counter, so the sysfs_dirent object may be used after free in next readdir(). This patch holds inode->i_mutex to avoid the problem since the lock is always held in readdir path. Reported-by: Dave Jones Tested-by: Sasha Levin Cc: Signed-off-by: Ming Lei Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 2fbdff6be25c..c9e16608f486 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -1058,10 +1058,21 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) return 0; } +static loff_t sysfs_dir_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file_inode(file); + loff_t ret; + + mutex_lock(&inode->i_mutex); + ret = generic_file_llseek(file, offset, whence); + mutex_unlock(&inode->i_mutex); + + return ret; +} const struct file_operations sysfs_dir_operations = { .read = generic_read_dir, .readdir = sysfs_readdir, .release = sysfs_dir_release, - .llseek = generic_file_llseek, + .llseek = sysfs_dir_llseek, }; -- cgit v1.2.3 From e5110f411d2ee35bf8d202ccca2e89c633060dca Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 20 Mar 2013 23:25:25 +0800 Subject: sysfs: handle failure path correctly for readdir() In case of 'if (filp->f_pos == 0 or 1)' of sysfs_readdir(), the failure from filldir() isn't handled, and the reference counter of the sysfs_dirent object pointed by filp->private_data will be released without clearing filp->private_data, so use after free bug will be triggered later. This patch returns immeadiately under the situation for fixing the bug, and it is reasonable to return from readdir() when filldir() fails. Reported-by: Dave Jones Tested-by: Sasha Levin Cc: Signed-off-by: Ming Lei Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index c9e16608f486..e14512678c9b 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -1020,6 +1020,8 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) ino = parent_sd->s_ino; if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) == 0) filp->f_pos++; + else + return 0; } if (filp->f_pos == 1) { if (parent_sd->s_parent) @@ -1028,6 +1030,8 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) ino = parent_sd->s_ino; if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) == 0) filp->f_pos++; + else + return 0; } mutex_lock(&sysfs_mutex); for (pos = sysfs_dir_pos(ns, parent_sd, filp->f_pos, pos); -- cgit v1.2.3 From 4376c94618c26225e69e17b7c91169c45a90b292 Mon Sep 17 00:00:00 2001 From: fanchaoting Date: Thu, 21 Mar 2013 09:15:30 +0800 Subject: pnfs-block: removing DM device maybe cause oops when call dev_remove when pnfs block using device mapper,if umounting later,it maybe cause oops. we apply "1 + sizeof(bl_umount_request)" memory for msg->data, the memory maybe overflow when we do "memcpy(&dataptr [sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request))", because the size of bl_msg is more than 1 byte. Signed-off-by: fanchaoting Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayoutdm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c index 737d839bc17b..6fc7b5cae92b 100644 --- a/fs/nfs/blocklayout/blocklayoutdm.c +++ b/fs/nfs/blocklayout/blocklayoutdm.c @@ -55,7 +55,8 @@ static void dev_remove(struct net *net, dev_t dev) bl_pipe_msg.bl_wq = &nn->bl_wq; memset(msg, 0, sizeof(*msg)); - msg->data = kzalloc(1 + sizeof(bl_umount_request), GFP_NOFS); + msg->len = sizeof(bl_msg) + bl_msg.totallen; + msg->data = kzalloc(msg->len, GFP_NOFS); if (!msg->data) goto out; @@ -66,7 +67,6 @@ static void dev_remove(struct net *net, dev_t dev) memcpy(msg->data, &bl_msg, sizeof(bl_msg)); dataptr = (uint8_t *) msg->data; memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request)); - msg->len = sizeof(bl_msg) + bl_msg.totallen; add_wait_queue(&nn->bl_wq, &wq); if (rpc_queue_upcall(nn->bl_device_pipe, msg) < 0) { -- cgit v1.2.3 From a073dbff359f4741013ae4b8395f5364c5e00b48 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 20 Mar 2013 12:34:32 -0400 Subject: NFSv4.1: Fix a race in pNFS layoutcommit We need to clear the NFS_LSEG_LAYOUTCOMMIT bits atomically with the NFS_INO_LAYOUTCOMMIT bit, otherwise we may end up with situations where the two are out of sync. The first half of the problem is to ensure that pnfs_layoutcommit_inode clears the NFS_LSEG_LAYOUTCOMMIT bit through pnfs_list_write_lseg. We still need to keep the reference to those segments until the RPC call is finished, so in order to make it clear _where_ those references come from, we add a helper pnfs_list_write_lseg_done() that cleans up after pnfs_list_write_lseg. Signed-off-by: Trond Myklebust Acked-by: Benny Halevy Cc: stable@vger.kernel.org --- fs/nfs/nfs4proc.c | 14 -------------- fs/nfs/pnfs.c | 19 ++++++++++++++++++- 2 files changed, 18 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index b2671cb0f901..6ccdd4fd9b59 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6416,22 +6416,8 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) static void nfs4_layoutcommit_release(void *calldata) { struct nfs4_layoutcommit_data *data = calldata; - struct pnfs_layout_segment *lseg, *tmp; - unsigned long *bitlock = &NFS_I(data->args.inode)->flags; pnfs_cleanup_layoutcommit(data); - /* Matched by references in pnfs_set_layoutcommit */ - list_for_each_entry_safe(lseg, tmp, &data->lseg_list, pls_lc_list) { - list_del_init(&lseg->pls_lc_list); - if (test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, - &lseg->pls_flags)) - pnfs_put_lseg(lseg); - } - - clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock); - smp_mb__after_clear_bit(); - wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING); - put_rpccred(data->cred); kfree(data); } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 48ac5aad6258..3d900916fd41 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1746,11 +1746,27 @@ static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp) list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) { if (lseg->pls_range.iomode == IOMODE_RW && - test_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) + test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) list_add(&lseg->pls_lc_list, listp); } } +static void pnfs_list_write_lseg_done(struct inode *inode, struct list_head *listp) +{ + struct pnfs_layout_segment *lseg, *tmp; + unsigned long *bitlock = &NFS_I(inode)->flags; + + /* Matched by references in pnfs_set_layoutcommit */ + list_for_each_entry_safe(lseg, tmp, listp, pls_lc_list) { + list_del_init(&lseg->pls_lc_list); + pnfs_put_lseg(lseg); + } + + clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock); + smp_mb__after_clear_bit(); + wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING); +} + void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg) { pnfs_layout_io_set_failed(lseg->pls_layout, lseg->pls_range.iomode); @@ -1795,6 +1811,7 @@ void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data) if (nfss->pnfs_curr_ld->cleanup_layoutcommit) nfss->pnfs_curr_ld->cleanup_layoutcommit(data); + pnfs_list_write_lseg_done(data->args.inode, &data->lseg_list); } /* -- cgit v1.2.3 From 24956804349ca0eadcdde032d65e8c00b4214096 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 20 Mar 2013 13:03:00 -0400 Subject: NFSv4.1: Always clear the NFS_INO_LAYOUTCOMMIT in layoutreturn Note that clearing NFS_INO_LAYOUTCOMMIT is tricky, since it requires you to also clear the NFS_LSEG_LAYOUTCOMMIT bits from the layout segments. The only two sites that need to do this are the ones that call pnfs_return_layout() without first doing a layout commit. Signed-off-by: Trond Myklebust Acked-by: Benny Halevy Cc: stable@vger.kernel.org --- fs/nfs/nfs4filelayout.c | 1 - fs/nfs/pnfs.c | 35 +++++++++++++++++++++++++++-------- 2 files changed, 27 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 49eeb044c109..4fb234d3aefb 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -129,7 +129,6 @@ static void filelayout_fenceme(struct inode *inode, struct pnfs_layout_hdr *lo) { if (!test_and_clear_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) return; - clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(inode)->flags); pnfs_return_layout(inode); } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 3d900916fd41..5044142c1216 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -417,6 +417,16 @@ should_free_lseg(struct pnfs_layout_range *lseg_range, lo_seg_intersecting(lseg_range, recall_range); } +static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg, + struct list_head *tmp_list) +{ + if (!atomic_dec_and_test(&lseg->pls_refcount)) + return false; + pnfs_layout_remove_lseg(lseg->pls_layout, lseg); + list_add(&lseg->pls_list, tmp_list); + return true; +} + /* Returns 1 if lseg is removed from list, 0 otherwise */ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, struct list_head *tmp_list) @@ -430,11 +440,8 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, */ dprintk("%s: lseg %p ref %d\n", __func__, lseg, atomic_read(&lseg->pls_refcount)); - if (atomic_dec_and_test(&lseg->pls_refcount)) { - pnfs_layout_remove_lseg(lseg->pls_layout, lseg); - list_add(&lseg->pls_list, tmp_list); + if (pnfs_lseg_dec_and_remove_zero(lseg, tmp_list)) rv = 1; - } } return rv; } @@ -777,6 +784,21 @@ send_layoutget(struct pnfs_layout_hdr *lo, return lseg; } +static void pnfs_clear_layoutcommit(struct inode *inode, + struct list_head *head) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct pnfs_layout_segment *lseg, *tmp; + + if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) + return; + list_for_each_entry_safe(lseg, tmp, &nfsi->layout->plh_segs, pls_list) { + if (!test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) + continue; + pnfs_lseg_dec_and_remove_zero(lseg, head); + } +} + /* * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr * when the layout segment list is empty. @@ -808,6 +830,7 @@ _pnfs_return_layout(struct inode *ino) /* Reference matched in nfs4_layoutreturn_release */ pnfs_get_layout_hdr(lo); empty = list_empty(&lo->plh_segs); + pnfs_clear_layoutcommit(ino, &tmp_list); pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL); /* Don't send a LAYOUTRETURN if list was initially empty */ if (empty) { @@ -820,8 +843,6 @@ _pnfs_return_layout(struct inode *ino) spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&tmp_list); - WARN_ON(test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)); - lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); if (unlikely(lrp == NULL)) { status = -ENOMEM; @@ -1458,7 +1479,6 @@ static void pnfs_ld_handle_write_error(struct nfs_write_data *data) dprintk("pnfs write error = %d\n", hdr->pnfs_error); if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags & PNFS_LAYOUTRET_ON_ERROR) { - clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(hdr->inode)->flags); pnfs_return_layout(hdr->inode); } if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) @@ -1613,7 +1633,6 @@ static void pnfs_ld_handle_read_error(struct nfs_read_data *data) dprintk("pnfs read error = %d\n", hdr->pnfs_error); if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags & PNFS_LAYOUTRET_ON_ERROR) { - clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(hdr->inode)->flags); pnfs_return_layout(hdr->inode); } if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) -- cgit v1.2.3 From 240286725d854331422cb15957f8d9bf2741d4e3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 20 Mar 2013 13:23:33 -0400 Subject: NFSv4.1: Add a helper pnfs_commit_and_return_layout In order to be able to safely return the layout in nfs4_proc_setattr, we need to block new uses of the layout, wait for all outstanding users of the layout to complete, commit the layout and then return it. This patch adds a helper in order to do all this safely. Signed-off-by: Trond Myklebust Cc: Boaz Harrosh --- fs/nfs/nfs4proc.c | 2 +- fs/nfs/pnfs.c | 27 +++++++++++++++++++++++++++ fs/nfs/pnfs.h | 6 ++++++ 3 files changed, 34 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 6ccdd4fd9b59..26431cf62ddb 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2632,7 +2632,7 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, int status; if (pnfs_ld_layoutret_on_setattr(inode)) - pnfs_return_layout(inode); + pnfs_commit_and_return_layout(inode); nfs_fattr_init(fattr); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 5044142c1216..4bdffe0ba025 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -866,6 +866,33 @@ out: } EXPORT_SYMBOL_GPL(_pnfs_return_layout); +int +pnfs_commit_and_return_layout(struct inode *inode) +{ + struct pnfs_layout_hdr *lo; + int ret; + + spin_lock(&inode->i_lock); + lo = NFS_I(inode)->layout; + if (lo == NULL) { + spin_unlock(&inode->i_lock); + return 0; + } + pnfs_get_layout_hdr(lo); + /* Block new layoutgets and read/write to ds */ + lo->plh_block_lgets++; + spin_unlock(&inode->i_lock); + filemap_fdatawait(inode->i_mapping); + ret = pnfs_layoutcommit_inode(inode, true); + if (ret == 0) + ret = _pnfs_return_layout(inode); + spin_lock(&inode->i_lock); + lo->plh_block_lgets--; + spin_unlock(&inode->i_lock); + pnfs_put_layout_hdr(lo); + return ret; +} + bool pnfs_roc(struct inode *ino) { struct pnfs_layout_hdr *lo; diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 94ba80417748..f5f8a470a647 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -219,6 +219,7 @@ void pnfs_set_layoutcommit(struct nfs_write_data *wdata); void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); int pnfs_layoutcommit_inode(struct inode *inode, bool sync); int _pnfs_return_layout(struct inode *); +int pnfs_commit_and_return_layout(struct inode *); void pnfs_ld_write_done(struct nfs_write_data *); void pnfs_ld_read_done(struct nfs_read_data *); struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, @@ -407,6 +408,11 @@ static inline int pnfs_return_layout(struct inode *ino) return 0; } +static inline int pnfs_commit_and_return_layout(struct inode *inode) +{ + return 0; +} + static inline bool pnfs_ld_layoutret_on_setattr(struct inode *inode) { -- cgit v1.2.3 From 06ae43f34bcc07a0b6be8bf78a1c895bcd12c839 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 20 Mar 2013 13:19:30 -0400 Subject: Don't bother with redoing rw_verify_area() from default_file_splice_from() default_file_splice_from() ends up calling vfs_write() (via very convoluted callchain). It's an overkill, since we already have done rw_verify_area() in the caller by the time we call vfs_write() we are under set_fs(KERNEL_DS), so access_ok() is also pointless. Add a new helper (__kernel_write()), use it instead of kernel_write() in there. Signed-off-by: Al Viro --- fs/internal.h | 5 +++++ fs/read_write.c | 25 +++++++++++++++++++++++++ fs/splice.c | 4 +++- 3 files changed, 33 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/internal.h b/fs/internal.h index 507141fceb99..4be78237d896 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -125,3 +125,8 @@ extern int invalidate_inodes(struct super_block *, bool); * dcache.c */ extern struct dentry *__d_alloc(struct super_block *, const struct qstr *); + +/* + * read_write.c + */ +extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *); diff --git a/fs/read_write.c b/fs/read_write.c index a698eff457fb..f7b5a23b804b 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -17,6 +17,7 @@ #include #include #include "read_write.h" +#include "internal.h" #include #include @@ -417,6 +418,30 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof EXPORT_SYMBOL(do_sync_write); +ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos) +{ + mm_segment_t old_fs; + const char __user *p; + ssize_t ret; + + old_fs = get_fs(); + set_fs(get_ds()); + p = (__force const char __user *)buf; + if (count > MAX_RW_COUNT) + count = MAX_RW_COUNT; + if (file->f_op->write) + ret = file->f_op->write(file, p, count, pos); + else + ret = do_sync_write(file, p, count, pos); + set_fs(old_fs); + if (ret > 0) { + fsnotify_modify(file); + add_wchar(current, ret); + } + inc_syscw(current); + return ret; +} + ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { ssize_t ret; diff --git a/fs/splice.c b/fs/splice.c index 718bd0056384..29e394e49ddd 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -31,6 +31,7 @@ #include #include #include +#include "internal.h" /* * Attempt to steal a page from a pipe buffer. This should perhaps go into @@ -1048,9 +1049,10 @@ static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf, { int ret; void *data; + loff_t tmp = sd->pos; data = buf->ops->map(pipe, buf, 0); - ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos); + ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp); buf->ops->unmap(pipe, buf, data); return ret; -- cgit v1.2.3 From f853c616883a8de966873a1dab283f1369e275a1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 11 Mar 2013 09:52:19 -0400 Subject: cifs: ignore everything in SPNEGO blob after mechTypes We've had several reports of people attempting to mount Windows 8 shares and getting failures with a return code of -EINVAL. The default sec= mode changed recently to sec=ntlmssp. With that, we expect and parse a SPNEGO blob from the server in the NEGOTIATE reply. The current decode_negTokenInit function first parses all of the mechTypes and then tries to parse the rest of the negTokenInit reply. The parser however currently expects a mechListMIC or nothing to follow the mechTypes, but Windows 8 puts a mechToken field there instead to carry some info for the new NegoEx stuff. In practice, we don't do anything with the fields after the mechTypes anyway so I don't see any real benefit in continuing to parse them. This patch just has the kernel ignore the fields after the mechTypes. We'll probably need to reinstate some of this if we ever want to support NegoEx. Reported-by: Jason Burgess Reported-by: Yan Li Signed-off-by: Jeff Layton Cc: Signed-off-by: Steve French --- fs/cifs/asn1.c | 53 +++++------------------------------------------------ 1 file changed, 5 insertions(+), 48 deletions(-) (limited to 'fs') diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c index cfd1ce34e0bc..1d36db114772 100644 --- a/fs/cifs/asn1.c +++ b/fs/cifs/asn1.c @@ -614,53 +614,10 @@ decode_negTokenInit(unsigned char *security_blob, int length, } } - /* mechlistMIC */ - if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - /* Check if we have reached the end of the blob, but with - no mechListMic (e.g. NTLMSSP instead of KRB5) */ - if (ctx.error == ASN1_ERR_DEC_EMPTY) - goto decode_negtoken_exit; - cFYI(1, "Error decoding last part negTokenInit exit3"); - return 0; - } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) { - /* tag = 3 indicating mechListMIC */ - cFYI(1, "Exit 4 cls = %d con = %d tag = %d end = %p (%d)", - cls, con, tag, end, *end); - return 0; - } - - /* sequence */ - if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - cFYI(1, "Error decoding last part negTokenInit exit5"); - return 0; - } else if ((cls != ASN1_UNI) || (con != ASN1_CON) - || (tag != ASN1_SEQ)) { - cFYI(1, "cls = %d con = %d tag = %d end = %p (%d)", - cls, con, tag, end, *end); - } - - /* sequence of */ - if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - cFYI(1, "Error decoding last part negTokenInit exit 7"); - return 0; - } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) { - cFYI(1, "Exit 8 cls = %d con = %d tag = %d end = %p (%d)", - cls, con, tag, end, *end); - return 0; - } - - /* general string */ - if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - cFYI(1, "Error decoding last part negTokenInit exit9"); - return 0; - } else if ((cls != ASN1_UNI) || (con != ASN1_PRI) - || (tag != ASN1_GENSTR)) { - cFYI(1, "Exit10 cls = %d con = %d tag = %d end = %p (%d)", - cls, con, tag, end, *end); - return 0; - } - cFYI(1, "Need to call asn1_octets_decode() function for %s", - ctx.pointer); /* is this UTF-8 or ASCII? */ -decode_negtoken_exit: + /* + * We currently ignore anything at the end of the SPNEGO blob after + * the mechTypes have been parsed, since none of that info is + * used at the moment. + */ return 1; } -- cgit v1.2.3 From d763448286377b8a0e3f179372e9e292bef3c337 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 11 Mar 2013 09:20:00 +0000 Subject: Btrfs: update to use fs_state bit Now that we use bit operation to check fs_state, update btrfs_free_fs_root()'s checker, otherwise we get back to memory leak case. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 7d84651e850b..127b23e8323b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3253,7 +3253,7 @@ void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) if (btrfs_root_refs(&root->root_item) == 0) synchronize_srcu(&fs_info->subvol_srcu); - if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { btrfs_free_log(NULL, root); btrfs_free_log_root_tree(NULL, fs_info); } -- cgit v1.2.3 From 835d974fabfa9bff4d173ad03c054ac2f673263f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 19 Mar 2013 12:13:25 -0400 Subject: Btrfs: handle a bogus chunk tree nicely If you restore a btrfs-image file system and try to mount that file system we'll panic. That's because btrfs-image restores and just makes one big chunk to envelope the whole disk, since they are really only meant to be messed with by our btrfs-progs. So fix up btrfs_rmap_block and the callers of it for mount so that we no longer panic but instead just return an error and fail to mount. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 35 ++++++++++++++++++++++++++++++----- fs/btrfs/volumes.c | 13 ++++++++++++- 2 files changed, 42 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 350b9b18140c..a8ff25aedca1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -257,7 +257,8 @@ static int exclude_super_stripes(struct btrfs_root *root, cache->bytes_super += stripe_len; ret = add_excluded_extent(root, cache->key.objectid, stripe_len); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + return ret; } for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { @@ -265,13 +266,17 @@ static int exclude_super_stripes(struct btrfs_root *root, ret = btrfs_rmap_block(&root->fs_info->mapping_tree, cache->key.objectid, bytenr, 0, &logical, &nr, &stripe_len); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + return ret; while (nr--) { cache->bytes_super += stripe_len; ret = add_excluded_extent(root, logical[nr], stripe_len); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + kfree(logical); + return ret; + } } kfree(logical); @@ -7964,7 +7969,17 @@ int btrfs_read_block_groups(struct btrfs_root *root) * info has super bytes accounted for, otherwise we'll think * we have more space than we actually do. */ - exclude_super_stripes(root, cache); + ret = exclude_super_stripes(root, cache); + if (ret) { + /* + * We may have excluded something, so call this just in + * case. + */ + free_excluded_extents(root, cache); + kfree(cache->free_space_ctl); + kfree(cache); + goto error; + } /* * check for two cases, either we are full, and therefore @@ -8106,7 +8121,17 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; - exclude_super_stripes(root, cache); + ret = exclude_super_stripes(root, cache); + if (ret) { + /* + * We may have excluded something, so call this just in + * case. + */ + free_excluded_extents(root, cache); + kfree(cache->free_space_ctl); + kfree(cache); + return ret; + } add_new_free_space(cache, root->fs_info, chunk_offset, chunk_offset + size); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5989a92236f7..2854c824ab64 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4935,7 +4935,18 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, em = lookup_extent_mapping(em_tree, chunk_start, 1); read_unlock(&em_tree->lock); - BUG_ON(!em || em->start != chunk_start); + if (!em) { + printk(KERN_ERR "btrfs: couldn't find em for chunk %Lu\n", + chunk_start); + return -EIO; + } + + if (em->start != chunk_start) { + printk(KERN_ERR "btrfs: bad chunk start, em=%Lu, wanted=%Lu\n", + em->start, chunk_start); + free_extent_map(em); + return -EIO; + } map = (struct map_lookup *)em->bdev; length = em->len; -- cgit v1.2.3 From 6113077cd319e747875ec71227d2b5cb54e08c76 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Tue, 19 Mar 2013 10:57:14 +0000 Subject: Btrfs: fix missing qgroup reservation before fallocating Steps to reproduce: mkfs.btrfs mount btrfs quota enable btrfs sub create /subv btrfs qgroup limit 10M /subv fallocate --length 20M /subv/data For the above example, fallocating will return successfully which is not expected, we try to fix it by doing qgroup reservation before fallocating. Signed-off-by: Wang Shilong Reviewed-by: Miao Xie Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/file.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 7bdb47faa12e..1be25b92d63c 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2142,6 +2142,7 @@ static long btrfs_fallocate(struct file *file, int mode, { struct inode *inode = file->f_path.dentry->d_inode; struct extent_state *cached_state = NULL; + struct btrfs_root *root = BTRFS_I(inode)->root; u64 cur_offset; u64 last_byte; u64 alloc_start; @@ -2169,6 +2170,11 @@ static long btrfs_fallocate(struct file *file, int mode, ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); if (ret) return ret; + if (root->fs_info->quota_enabled) { + ret = btrfs_qgroup_reserve(root, alloc_end - alloc_start); + if (ret) + goto out_reserve_fail; + } /* * wait for ordered IO before we have any locks. We'll loop again @@ -2272,6 +2278,9 @@ static long btrfs_fallocate(struct file *file, int mode, &cached_state, GFP_NOFS); out: mutex_unlock(&inode->i_mutex); + if (root->fs_info->quota_enabled) + btrfs_qgroup_free(root, alloc_end - alloc_start); +out_reserve_fail: /* Let go of our reservation. */ btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); return ret; -- cgit v1.2.3 From d9abbf1c3131b679379762700201ae69367f3f62 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Wed, 20 Mar 2013 13:49:48 +0000 Subject: Btrfs: fix locking on ROOT_REPLACE operations in tree mod log To resolve backrefs, ROOT_REPLACE operations in the tree mod log are required to be tied to at least one KEY_REMOVE_WHILE_FREEING operation. Therefore, those operations must be enclosed by tree_mod_log_write_lock() and tree_mod_log_write_unlock() calls. Those calls are private to the tree_mod_log_* functions, which means that removal of the elements of an old root node must be logged from tree_mod_log_insert_root. This partly reverts and corrects commit ba1bfbd5 (Btrfs: fix a tree mod logging issue for root replacement operations). This fixes the brand-new version of xfstest 276 as of commit cfe73f71. Cc: stable@vger.kernel.org Signed-off-by: Jan Schmidt Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index ecd25a1b4e51..ca9d8f1a3bb6 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -651,6 +651,8 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, if (tree_mod_dont_log(fs_info, NULL)) return 0; + __tree_mod_log_free_eb(fs_info, old_root); + ret = tree_mod_alloc(fs_info, flags, &tm); if (ret < 0) goto out; @@ -736,7 +738,7 @@ tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq) static noinline void tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, struct extent_buffer *src, unsigned long dst_offset, - unsigned long src_offset, int nr_items) + unsigned long src_offset, int nr_items, int log_removal) { int ret; int i; @@ -750,10 +752,12 @@ tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, } for (i = 0; i < nr_items; i++) { - ret = tree_mod_log_insert_key_locked(fs_info, src, - i + src_offset, - MOD_LOG_KEY_REMOVE); - BUG_ON(ret < 0); + if (log_removal) { + ret = tree_mod_log_insert_key_locked(fs_info, src, + i + src_offset, + MOD_LOG_KEY_REMOVE); + BUG_ON(ret < 0); + } ret = tree_mod_log_insert_key_locked(fs_info, dst, i + dst_offset, MOD_LOG_KEY_ADD); @@ -927,7 +931,6 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, ret = btrfs_dec_ref(trans, root, buf, 1, 1); BUG_ON(ret); /* -ENOMEM */ } - tree_mod_log_free_eb(root->fs_info, buf); clean_tree_block(trans, root, buf); *last_ref = 1; } @@ -1046,6 +1049,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, btrfs_set_node_ptr_generation(parent, parent_slot, trans->transid); btrfs_mark_buffer_dirty(parent); + tree_mod_log_free_eb(root->fs_info, buf); btrfs_free_tree_block(trans, root, buf, parent_start, last_ref); } @@ -1750,7 +1754,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, goto enospc; } - tree_mod_log_free_eb(root->fs_info, root->node); tree_mod_log_set_root_pointer(root, child); rcu_assign_pointer(root->node, child); @@ -2995,7 +2998,7 @@ static int push_node_left(struct btrfs_trans_handle *trans, push_items = min(src_nritems - 8, push_items); tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0, - push_items); + push_items, 1); copy_extent_buffer(dst, src, btrfs_node_key_ptr_offset(dst_nritems), btrfs_node_key_ptr_offset(0), @@ -3066,7 +3069,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans, sizeof(struct btrfs_key_ptr)); tree_mod_log_eb_copy(root->fs_info, dst, src, 0, - src_nritems - push_items, push_items); + src_nritems - push_items, push_items, 1); copy_extent_buffer(dst, src, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(src_nritems - push_items), @@ -3218,12 +3221,18 @@ static noinline int split_node(struct btrfs_trans_handle *trans, int mid; int ret; u32 c_nritems; + int tree_mod_log_removal = 1; c = path->nodes[level]; WARN_ON(btrfs_header_generation(c) != trans->transid); if (c == root->node) { /* trying to split the root, lets make a new one */ ret = insert_new_root(trans, root, path, level + 1); + /* + * removal of root nodes has been logged by + * tree_mod_log_set_root_pointer due to locking + */ + tree_mod_log_removal = 0; if (ret) return ret; } else { @@ -3261,7 +3270,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans, (unsigned long)btrfs_header_chunk_tree_uuid(split), BTRFS_UUID_SIZE); - tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid); + tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid, + tree_mod_log_removal); copy_extent_buffer(split, c, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(mid), -- cgit v1.2.3 From 1dd05682b3ef6e70409e130bfd83e91770801589 Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Thu, 21 Mar 2013 04:32:32 +0000 Subject: Btrfs: fix memory leak in btrfs_create_tree() We should free leaf and root before returning from the error handling code. Signed-off-by: Tsutomu Itoh Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 127b23e8323b..6d19a0a554aa 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1291,6 +1291,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, 0, objectid, NULL, 0, 0, 0); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); + leaf = NULL; goto fail; } @@ -1334,11 +1335,16 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, btrfs_tree_unlock(leaf); + return root; + fail: - if (ret) - return ERR_PTR(ret); + if (leaf) { + btrfs_tree_unlock(leaf); + free_extent_buffer(leaf); + } + kfree(root); - return root; + return ERR_PTR(ret); } static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, -- cgit v1.2.3 From 51f0885e5415b4cc6535e9cdcc5145bfbc134353 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 22 Mar 2013 11:44:04 -0700 Subject: vfs,proc: guarantee unique inodes in /proc Dave Jones found another /proc issue with his Trinity tool: thanks to the namespace model, we can have multiple /proc dentries that point to the same inode, aliasing directories in /proc//net/ for example. This ends up being a total disaster, because it acts like hardlinked directories, and causes locking problems. We rely on the topological sort of the inodes pointed to by dentries, and if we have aliased directories, that odering becomes unreliable. In short: don't do this. Multiple dentries with the same (directory) inode is just a bad idea, and the namespace code should never have exposed things this way. But we're kind of stuck with it. This solves things by just always allocating a new inode during /proc dentry lookup, instead of using "iget_locked()" to look up existing inodes by superblock and number. That actually simplies the code a bit, at the cost of potentially doing more inode [de]allocations. That said, the inode lookup wasn't free either (and did a lot of locking of inodes), so it is probably not that noticeable. We could easily keep the old lookup model for non-directory entries, but rather than try to be excessively clever this just implements the minimal and simplest workaround for the problem. Reported-and-tested-by: Dave Jones Analyzed-by: Al Viro Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- fs/proc/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/proc/inode.c b/fs/proc/inode.c index a86aebc9ba7c..869116c2afbe 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -446,9 +446,10 @@ static const struct file_operations proc_reg_file_ops_no_compat = { struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) { - struct inode *inode = iget_locked(sb, de->low_ino); + struct inode *inode = new_inode_pseudo(sb); - if (inode && (inode->i_state & I_NEW)) { + if (inode) { + inode->i_ino = de->low_ino; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; PROC_I(inode)->pde = de; @@ -476,7 +477,6 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) inode->i_fop = de->proc_fops; } } - unlock_new_inode(inode); } else pde_put(de); return inode; -- cgit v1.2.3 From e49dbbf3e770aa590a8a464ac4978a09027060b9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 22 Mar 2013 11:18:24 -0700 Subject: nfsd: fix bad offset use vfs_writev() updates the offset argument - but the code then passes the offset to vfs_fsync_range(). Since offset now points to the offset after what was just written, this is probably not what was intended Introduced by face15025ffdf664de95e86ae831544154d26c9c "nfsd: use vfs_fsync_range(), not O_SYNC, for stable writes". Signed-off-by: Kent Overstreet Cc: Al Viro Cc: "Eric W. Biederman" Cc: stable@vger.kernel.org Reviewed-by: Zach Brown Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 2a7eb536de0b..2b2e2396a869 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1013,6 +1013,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, int host_err; int stable = *stablep; int use_wgather; + loff_t pos = offset; dentry = file->f_path.dentry; inode = dentry->d_inode; @@ -1025,7 +1026,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, /* Write the data. */ oldfs = get_fs(); set_fs(KERNEL_DS); - host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset); + host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &pos); set_fs(oldfs); if (host_err < 0) goto out_nfserr; -- cgit v1.2.3 From 4adaa611020fa6ac65b0ac8db78276af4ec04e63 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 26 Mar 2013 13:07:00 -0400 Subject: Btrfs: fix race between mmap writes and compression Btrfs uses page_mkwrite to ensure stable pages during crc calculations and mmap workloads. We call clear_page_dirty_for_io before we do any crcs, and this forces any application with the file mapped to wait for the crc to finish before it is allowed to change the file. With compression on, the clear_page_dirty_for_io step is happening after we've compressed the pages. This means the applications might be changing the pages while we are compressing them, and some of those modifications might not hit the disk. This commit adds the clear_page_dirty_for_io before compression starts and makes sure to redirty the page if we have to fallback to uncompressed IO as well. Signed-off-by: Chris Mason Reported-by: Alexandre Oliva cc: stable@vger.kernel.org --- fs/btrfs/extent_io.c | 33 +++++++++++++++++++++++++++++++++ fs/btrfs/extent_io.h | 2 ++ fs/btrfs/inode.c | 14 ++++++++++++++ 3 files changed, 49 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f173c5af6461..cdee391fc7bf 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1257,6 +1257,39 @@ int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end) GFP_NOFS); } +int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(inode->i_mapping, index); + BUG_ON(!page); /* Pages should be in the extent_io_tree */ + clear_page_dirty_for_io(page); + page_cache_release(page); + index++; + } + return 0; +} + +int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(inode->i_mapping, index); + BUG_ON(!page); /* Pages should be in the extent_io_tree */ + account_page_redirty(page); + __set_page_dirty_nobuffers(page); + page_cache_release(page); + index++; + } + return 0; +} + /* * helper function to set both pages and extents in the tree writeback */ diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 6068a1985560..258c92156857 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -325,6 +325,8 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset, unsigned long *map_len); int extent_range_uptodate(struct extent_io_tree *tree, u64 start, u64 end); +int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); +int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); int extent_clear_unlock_delalloc(struct inode *inode, struct extent_io_tree *tree, u64 start, u64 end, struct page *locked_page, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1f26888825e2..6a6e13c53086 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -353,6 +353,7 @@ static noinline int compress_file_range(struct inode *inode, int i; int will_compress; int compress_type = root->fs_info->compress_type; + int redirty = 0; /* if this is a small write inside eof, kick off a defrag */ if ((end - start + 1) < 16 * 1024 && @@ -415,6 +416,17 @@ again: if (BTRFS_I(inode)->force_compress) compress_type = BTRFS_I(inode)->force_compress; + /* + * we need to call clear_page_dirty_for_io on each + * page in the range. Otherwise applications with the file + * mmap'd can wander in and change the page contents while + * we are compressing them. + * + * If the compression fails for any reason, we set the pages + * dirty again later on. + */ + extent_range_clear_dirty_for_io(inode, start, end); + redirty = 1; ret = btrfs_compress_pages(compress_type, inode->i_mapping, start, total_compressed, pages, @@ -554,6 +566,8 @@ cleanup_and_bail_uncompressed: __set_page_dirty_nobuffers(locked_page); /* unlocked later on in the async handlers */ } + if (redirty) + extent_range_redirty_for_io(inode, start, end); add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0, BTRFS_COMPRESS_NONE); *num_added += 1; -- cgit v1.2.3 From 7ea600b5314529f9d1b9d6d3c41cb26fce6a7a4a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 26 Mar 2013 18:25:57 -0400 Subject: Nest rename_lock inside vfsmount_lock ... lest we get livelocks between path_is_under() and d_path() and friends. The thing is, wrt fairness lglocks are more similar to rwsems than to rwlocks; it is possible to have thread B spin on attempt to take lock shared while thread A is already holding it shared, if B is on lower-numbered CPU than A and there's a thread C spinning on attempt to take the same lock exclusive. As the result, we need consistent ordering between vfsmount_lock (lglock) and rename_lock (seq_lock), even though everything that takes both is going to take vfsmount_lock only shared. Spotted-by: Brad Spengler Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/dcache.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index fbfae008ba44..e8bc3420d63e 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2542,7 +2542,6 @@ static int prepend_path(const struct path *path, bool slash = false; int error = 0; - br_read_lock(&vfsmount_lock); while (dentry != root->dentry || vfsmnt != root->mnt) { struct dentry * parent; @@ -2572,8 +2571,6 @@ static int prepend_path(const struct path *path, if (!error && !slash) error = prepend(buffer, buflen, "/", 1); -out: - br_read_unlock(&vfsmount_lock); return error; global_root: @@ -2590,7 +2587,7 @@ global_root: error = prepend(buffer, buflen, "/", 1); if (!error) error = is_mounted(vfsmnt) ? 1 : 2; - goto out; + return error; } /** @@ -2617,9 +2614,11 @@ char *__d_path(const struct path *path, int error; prepend(&res, &buflen, "\0", 1); + br_read_lock(&vfsmount_lock); write_seqlock(&rename_lock); error = prepend_path(path, root, &res, &buflen); write_sequnlock(&rename_lock); + br_read_unlock(&vfsmount_lock); if (error < 0) return ERR_PTR(error); @@ -2636,9 +2635,11 @@ char *d_absolute_path(const struct path *path, int error; prepend(&res, &buflen, "\0", 1); + br_read_lock(&vfsmount_lock); write_seqlock(&rename_lock); error = prepend_path(path, &root, &res, &buflen); write_sequnlock(&rename_lock); + br_read_unlock(&vfsmount_lock); if (error > 1) error = -EINVAL; @@ -2702,11 +2703,13 @@ char *d_path(const struct path *path, char *buf, int buflen) return path->dentry->d_op->d_dname(path->dentry, buf, buflen); get_fs_root(current->fs, &root); + br_read_lock(&vfsmount_lock); write_seqlock(&rename_lock); error = path_with_deleted(path, &root, &res, &buflen); + write_sequnlock(&rename_lock); + br_read_unlock(&vfsmount_lock); if (error < 0) res = ERR_PTR(error); - write_sequnlock(&rename_lock); path_put(&root); return res; } @@ -2830,6 +2833,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) get_fs_root_and_pwd(current->fs, &root, &pwd); error = -ENOENT; + br_read_lock(&vfsmount_lock); write_seqlock(&rename_lock); if (!d_unlinked(pwd.dentry)) { unsigned long len; @@ -2839,6 +2843,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) prepend(&cwd, &buflen, "\0", 1); error = prepend_path(&pwd, &root, &cwd, &buflen); write_sequnlock(&rename_lock); + br_read_unlock(&vfsmount_lock); if (error < 0) goto out; @@ -2859,6 +2864,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) } } else { write_sequnlock(&rename_lock); + br_read_unlock(&vfsmount_lock); } out: -- cgit v1.2.3 From 3151527ee007b73a0ebd296010f1c0454a919c7d Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 15 Mar 2013 01:45:51 -0700 Subject: userns: Don't allow creation if the user is chrooted Guarantee that the policy of which files may be access that is established by setting the root directory will not be violated by user namespaces by verifying that the root directory points to the root of the mount namespace at the time of user namespace creation. Changing the root is a privileged operation, and as a matter of policy it serves to limit unprivileged processes to files below the current root directory. For reasons of simplicity and comprehensibility the privilege to change the root directory is gated solely on the CAP_SYS_CHROOT capability in the user namespace. Therefore when creating a user namespace we must ensure that the policy of which files may be access can not be violated by changing the root directory. Anyone who runs a processes in a chroot and would like to use user namespace can setup the same view of filesystems with a mount namespace instead. With this result that this is not a practical limitation for using user namespaces. Cc: stable@vger.kernel.org Acked-by: Serge Hallyn Reported-by: Andy Lutomirski Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 24 ++++++++++++++++++++++++ include/linux/fs_struct.h | 2 ++ kernel/user_namespace.c | 9 +++++++++ 3 files changed, 35 insertions(+) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 50ca17d3cb45..a3035223d421 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2732,6 +2732,30 @@ bool our_mnt(struct vfsmount *mnt) return check_mnt(real_mount(mnt)); } +bool current_chrooted(void) +{ + /* Does the current process have a non-standard root */ + struct path ns_root; + struct path fs_root; + bool chrooted; + + /* Find the namespace root */ + ns_root.mnt = ¤t->nsproxy->mnt_ns->root->mnt; + ns_root.dentry = ns_root.mnt->mnt_root; + path_get(&ns_root); + while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root)) + ; + + get_fs_root(current->fs, &fs_root); + + chrooted = !path_equal(&fs_root, &ns_root); + + path_put(&fs_root); + path_put(&ns_root); + + return chrooted; +} + static void *mntns_get(struct task_struct *task) { struct mnt_namespace *ns = NULL; diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h index 729eded4b24f..2b93a9a5a1e6 100644 --- a/include/linux/fs_struct.h +++ b/include/linux/fs_struct.h @@ -50,4 +50,6 @@ static inline void get_fs_root_and_pwd(struct fs_struct *fs, struct path *root, spin_unlock(&fs->lock); } +extern bool current_chrooted(void); + #endif /* _LINUX_FS_STRUCT_H */ diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index b14f4d342043..0f1e42884577 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -61,6 +61,15 @@ int create_user_ns(struct cred *new) kgid_t group = new->egid; int ret; + /* + * Verify that we can not violate the policy of which files + * may be accessed that is specified by the root directory, + * by verifing that the root directory is at the root of the + * mount namespace which allows all files to be accessed. + */ + if (current_chrooted()) + return -EPERM; + /* The creator needs a mapping in the parent user namespace * or else we won't be able to reasonably tell userspace who * created a user_namespace. -- cgit v1.2.3 From 90563b198e4c6674c63672fae1923da467215f45 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 22 Mar 2013 03:10:15 -0700 Subject: vfs: Add a mount flag to lock read only bind mounts When a read-only bind mount is copied from mount namespace in a higher privileged user namespace to a mount namespace in a lesser privileged user namespace, it should not be possible to remove the the read-only restriction. Add a MNT_LOCK_READONLY mount flag to indicate that a mount must remain read-only. CC: stable@vger.kernel.org Acked-by: Serge Hallyn Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 3 +++ include/linux/mount.h | 2 ++ 2 files changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index a3035223d421..8505b5ece5de 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1713,6 +1713,9 @@ static int change_mount_flags(struct vfsmount *mnt, int ms_flags) if (readonly_request == __mnt_is_readonly(mnt)) return 0; + if (mnt->mnt_flags & MNT_LOCK_READONLY) + return -EPERM; + if (readonly_request) error = mnt_make_readonly(real_mount(mnt)); else diff --git a/include/linux/mount.h b/include/linux/mount.h index d7029f4a191a..73005f9957ea 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -47,6 +47,8 @@ struct mnt_namespace; #define MNT_INTERNAL 0x4000 +#define MNT_LOCK_READONLY 0x400000 + struct vfsmount { struct dentry *mnt_root; /* root of the mounted tree */ struct super_block *mnt_sb; /* pointer to superblock */ -- cgit v1.2.3 From 132c94e31b8bca8ea921f9f96a57d684fa4ae0a9 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 22 Mar 2013 04:08:05 -0700 Subject: vfs: Carefully propogate mounts across user namespaces As a matter of policy MNT_READONLY should not be changable if the original mounter had more privileges than creator of the mount namespace. Add the flag CL_UNPRIVILEGED to note when we are copying a mount from a mount namespace that requires more privileges to a mount namespace that requires fewer privileges. When the CL_UNPRIVILEGED flag is set cause clone_mnt to set MNT_NO_REMOUNT if any of the mnt flags that should never be changed are set. This protects both mount propagation and the initial creation of a less privileged mount namespace. Cc: stable@vger.kernel.org Acked-by: Serge Hallyn Reported-by: Andy Lutomirski Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 6 +++++- fs/pnode.c | 6 ++++++ fs/pnode.h | 1 + 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 8505b5ece5de..968d4c5eae03 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -798,6 +798,10 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, } mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~MNT_WRITE_HOLD; + /* Don't allow unprivileged users to change mount flags */ + if ((flag & CL_UNPRIVILEGED) && (mnt->mnt.mnt_flags & MNT_READONLY)) + mnt->mnt.mnt_flags |= MNT_LOCK_READONLY; + atomic_inc(&sb->s_active); mnt->mnt.mnt_sb = sb; mnt->mnt.mnt_root = dget(root); @@ -2342,7 +2346,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, /* First pass: copy the tree topology */ copy_flags = CL_COPY_ALL | CL_EXPIRE; if (user_ns != mnt_ns->user_ns) - copy_flags |= CL_SHARED_TO_SLAVE; + copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED; new = copy_tree(old, old->mnt.mnt_root, copy_flags); if (IS_ERR(new)) { up_write(&namespace_sem); diff --git a/fs/pnode.c b/fs/pnode.c index 3e000a51ac0d..8b29d2164da6 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "internal.h" #include "pnode.h" @@ -220,6 +221,7 @@ static struct mount *get_source(struct mount *dest, int propagate_mnt(struct mount *dest_mnt, struct dentry *dest_dentry, struct mount *source_mnt, struct list_head *tree_list) { + struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; struct mount *m, *child; int ret = 0; struct mount *prev_dest_mnt = dest_mnt; @@ -237,6 +239,10 @@ int propagate_mnt(struct mount *dest_mnt, struct dentry *dest_dentry, source = get_source(m, prev_dest_mnt, prev_src_mnt, &type); + /* Notice when we are propagating across user namespaces */ + if (m->mnt_ns->user_ns != user_ns) + type |= CL_UNPRIVILEGED; + child = copy_tree(source, source->mnt.mnt_root, type); if (IS_ERR(child)) { ret = PTR_ERR(child); diff --git a/fs/pnode.h b/fs/pnode.h index 19b853a3445c..a0493d5ebfbf 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -23,6 +23,7 @@ #define CL_MAKE_SHARED 0x08 #define CL_PRIVATE 0x10 #define CL_SHARED_TO_SLAVE 0x20 +#define CL_UNPRIVILEGED 0x40 static inline void set_mnt_shared(struct mount *mnt) { -- cgit v1.2.3 From 87a8ebd637dafc255070f503909a053cf0d98d3f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 24 Mar 2013 14:28:27 -0700 Subject: userns: Restrict when proc and sysfs can be mounted Only allow unprivileged mounts of proc and sysfs if they are already mounted when the user namespace is created. proc and sysfs are interesting because they have content that is per namespace, and so fresh mounts are needed when new namespaces are created while at the same time proc and sysfs have content that is shared between every instance. Respect the policy of who may see the shared content of proc and sysfs by only allowing new mounts if there was an existing mount at the time the user namespace was created. In practice there are only two interesting cases: proc and sysfs are mounted at their usual places, proc and sysfs are not mounted at all (some form of mount namespace jail). Cc: stable@vger.kernel.org Acked-by: Serge Hallyn Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 21 +++++++++++++++++++++ fs/proc/root.c | 4 ++++ fs/sysfs/mount.c | 4 ++++ include/linux/user_namespace.h | 4 ++++ kernel/user.c | 2 ++ kernel/user_namespace.c | 2 ++ 6 files changed, 37 insertions(+) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 968d4c5eae03..d581e45c0a9f 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2763,6 +2763,27 @@ bool current_chrooted(void) return chrooted; } +void update_mnt_policy(struct user_namespace *userns) +{ + struct mnt_namespace *ns = current->nsproxy->mnt_ns; + struct mount *mnt; + + down_read(&namespace_sem); + list_for_each_entry(mnt, &ns->list, mnt_list) { + switch (mnt->mnt.mnt_sb->s_magic) { + case SYSFS_MAGIC: + userns->may_mount_sysfs = true; + break; + case PROC_SUPER_MAGIC: + userns->may_mount_proc = true; + break; + } + if (userns->may_mount_sysfs && userns->may_mount_proc) + break; + } + up_read(&namespace_sem); +} + static void *mntns_get(struct task_struct *task) { struct mnt_namespace *ns = NULL; diff --git a/fs/proc/root.c b/fs/proc/root.c index c6e9fac26bac..9c7fab1d23f0 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -108,6 +109,9 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, } else { ns = task_active_pid_ns(current); options = data; + + if (!current_user_ns()->may_mount_proc) + return ERR_PTR(-EPERM); } sb = sget(fs_type, proc_test_super, proc_set_super, flags, ns); diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 8d924b5ec733..afd83273e6ce 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "sysfs.h" @@ -111,6 +112,9 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, struct super_block *sb; int error; + if (!(flags & MS_KERNMOUNT) && !current_user_ns()->may_mount_sysfs) + return ERR_PTR(-EPERM); + info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return ERR_PTR(-ENOMEM); diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 4ce009324933..b6b215f13b45 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -26,6 +26,8 @@ struct user_namespace { kuid_t owner; kgid_t group; unsigned int proc_inum; + bool may_mount_sysfs; + bool may_mount_proc; }; extern struct user_namespace init_user_ns; @@ -82,4 +84,6 @@ static inline void put_user_ns(struct user_namespace *ns) #endif +void update_mnt_policy(struct user_namespace *userns); + #endif /* _LINUX_USER_H */ diff --git a/kernel/user.c b/kernel/user.c index e81978e8c03b..8e635a18ab52 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -51,6 +51,8 @@ struct user_namespace init_user_ns = { .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, .proc_inum = PROC_USER_INIT_INO, + .may_mount_sysfs = true, + .may_mount_proc = true, }; EXPORT_SYMBOL_GPL(init_user_ns); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 0f1e42884577..a54f26f82eb2 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -96,6 +96,8 @@ int create_user_ns(struct cred *new) set_cred_user_ns(new, ns); + update_mnt_policy(ns); + return 0; } -- cgit v1.2.3 From 3e84f48edfd33b2e209a117c11fb9ce637cc9b67 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 27 Mar 2013 15:20:30 +0000 Subject: vfs/splice: Fix missed checks in new __kernel_write() helper Commit 06ae43f34bcc ("Don't bother with redoing rw_verify_area() from default_file_splice_from()") lost the checks to test existence of the write/aio_write methods. My apologies ;-/ Eventually, we want that in fs/splice.c side of things (no point repeating it for every buffer, after all), but for now this is the obvious minimal fix. Reported-by: Dave Jones Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- fs/read_write.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/read_write.c b/fs/read_write.c index f7b5a23b804b..e6ddc8dceb96 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -424,6 +424,9 @@ ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t const char __user *p; ssize_t ret; + if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write)) + return -EINVAL; + old_fs = get_fs(); set_fs(get_ds()); p = (__force const char __user *)buf; -- cgit v1.2.3 From adaa4b8e4d47eeb114513c2f7a172929154b94bd Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Thu, 21 Mar 2013 14:30:23 +0000 Subject: Btrfs: fix EIO from btrfs send in is_extent_unchanged for punched holes When you take a snapshot, punch a hole where there has been data, then take another snapshot and try to send an incremental stream, btrfs send would give you EIO. That is because is_extent_unchanged had no support for holes being punched. With this patch, instead of returning EIO we just return 0 (== the extent is not unchanged) and we're good. Signed-off-by: Jan Schmidt Cc: Alexander Block Signed-off-by: Josef Bacik --- fs/btrfs/send.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 68da757615ae..ed897dc11356 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -3945,12 +3945,10 @@ static int is_extent_unchanged(struct send_ctx *sctx, found_key.type != key.type) { key.offset += right_len; break; - } else { - if (found_key.offset != key.offset + right_len) { - /* Should really not happen */ - ret = -EIO; - goto out; - } + } + if (found_key.offset != key.offset + right_len) { + ret = 0; + goto out; } key = found_key; } -- cgit v1.2.3 From f4881bc7a83eff263789dd524b7c269d138d4af5 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 25 Mar 2013 16:03:35 -0400 Subject: Btrfs: fix space leak when we fail to reserve metadata space Dave reported a warning when running xfstest 275. We have been leaking delalloc metadata space when our reservations fail. This is because we were improperly calculating how much space to free for our checksum reservations. The problem is we would sometimes free up space that had already been freed in another thread and we would end up with negative usage for the delalloc space. This patch fixes the problem by calculating how much space the other threads would have already freed, and then calculate how much space we need to free had we not done the reservation at all, and then freeing any excess space. This makes xfstests 275 no longer have leaked space. Thanks Cc: stable@vger.kernel.org Reported-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 47 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 41 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a8ff25aedca1..a22b5cc921ad 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4815,14 +4815,49 @@ out_fail: * If the inodes csum_bytes is the same as the original * csum_bytes then we know we haven't raced with any free()ers * so we can just reduce our inodes csum bytes and carry on. - * Otherwise we have to do the normal free thing to account for - * the case that the free side didn't free up its reserve - * because of this outstanding reservation. */ - if (BTRFS_I(inode)->csum_bytes == csum_bytes) + if (BTRFS_I(inode)->csum_bytes == csum_bytes) { calc_csum_metadata_size(inode, num_bytes, 0); - else - to_free = calc_csum_metadata_size(inode, num_bytes, 0); + } else { + u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes; + u64 bytes; + + /* + * This is tricky, but first we need to figure out how much we + * free'd from any free-ers that occured during this + * reservation, so we reset ->csum_bytes to the csum_bytes + * before we dropped our lock, and then call the free for the + * number of bytes that were freed while we were trying our + * reservation. + */ + bytes = csum_bytes - BTRFS_I(inode)->csum_bytes; + BTRFS_I(inode)->csum_bytes = csum_bytes; + to_free = calc_csum_metadata_size(inode, bytes, 0); + + + /* + * Now we need to see how much we would have freed had we not + * been making this reservation and our ->csum_bytes were not + * artificially inflated. + */ + BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes; + bytes = csum_bytes - orig_csum_bytes; + bytes = calc_csum_metadata_size(inode, bytes, 0); + + /* + * Now reset ->csum_bytes to what it should be. If bytes is + * more than to_free then we would have free'd more space had we + * not had an artificially high ->csum_bytes, so we need to free + * the remainder. If bytes is the same or less then we don't + * need to do anything, the other free-ers did the correct + * thing. + */ + BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes; + if (bytes > to_free) + to_free = bytes - to_free; + else + to_free = 0; + } spin_unlock(&BTRFS_I(inode)->lock); if (dropped) to_free += btrfs_calc_trans_metadata_size(root, dropped); -- cgit v1.2.3 From 6e137ed3f30574f314733d4b7a86ea6523232b14 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 26 Mar 2013 15:26:55 -0400 Subject: Btrfs: fix space accounting for unlink and rename We are way over-reserving for unlink and rename. Rename is just some random huge number and unlink accounts for tree log operations that don't actually happen during unlink, not to mention the tree log doesn't take from the trans block rsv anyway so it's completely useless. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6a6e13c53086..8cab424c75f8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3693,11 +3693,9 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, * 1 for the dir item * 1 for the dir index * 1 for the inode ref - * 1 for the inode ref in the tree log - * 2 for the dir entries in the log * 1 for the inode */ - trans = btrfs_start_transaction(root, 8); + trans = btrfs_start_transaction(root, 5); if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) return trans; @@ -8141,7 +8139,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items * should cover the worst case number of items we'll modify. */ - trans = btrfs_start_transaction(root, 20); + trans = btrfs_start_transaction(root, 11); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_notrans; -- cgit v1.2.3 From db1d607d3ca5cbb283cbb17d648cd7e8dc67cc7b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 26 Mar 2013 15:29:11 -0400 Subject: Btrfs: hold the ordered operations mutex when waiting on ordered extents We need to hold the ordered_operations mutex while waiting on ordered extents since we splice and run the ordered extents list. We need to make sure anybody else who wants to wait on ordered extents does actually wait for them to be completed. This will keep us from bailing out of flushing in case somebody is already waiting on ordered extents to complete. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ordered-data.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index dc08d77b717e..005c45db699e 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -557,6 +557,7 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) INIT_LIST_HEAD(&splice); INIT_LIST_HEAD(&works); + mutex_lock(&root->fs_info->ordered_operations_mutex); spin_lock(&root->fs_info->ordered_extent_lock); list_splice_init(&root->fs_info->ordered_extents, &splice); while (!list_empty(&splice)) { @@ -600,6 +601,7 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) cond_resched(); } + mutex_unlock(&root->fs_info->ordered_operations_mutex); } /* -- cgit v1.2.3 From fdf30d1c1b386e1b73116cc7e0fb14e962b763b0 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 26 Mar 2013 15:31:45 -0400 Subject: Btrfs: limit the global reserve to 512mb A user reported a problem where he was getting early ENOSPC with hundreds of gigs of free data space and 6 gigs of free metadata space. This is because the global block reserve was taking up the entire free metadata space. This is ridiculous, we have infrastructure in place to throttle if we start using too much of the global reserve, so instead of letting it get this huge just limit it to 512mb so that users can still get work done. This allowed the user to complete his rsync without issues. Thanks Cc: stable@vger.kernel.org Reported-and-tested-by: Stefan Priebe Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a22b5cc921ad..0d8478700d78 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4460,7 +4460,7 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) spin_lock(&sinfo->lock); spin_lock(&block_rsv->lock); - block_rsv->size = num_bytes; + block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024); num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + sinfo->bytes_reserved + sinfo->bytes_readonly + -- cgit v1.2.3 From a7975026ff9ddf91ba190ae2b71699dd156395e3 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Mon, 25 Mar 2013 11:08:23 +0000 Subject: Btrfs: fix double free in the btrfs_qgroup_account_ref() The function btrfs_find_all_roots is responsible to allocate memory for 'roots' and free it if errors happen,so the caller should not free it again since the work has been done. Besides,'tmp' is allocated after the function btrfs_find_all_roots, so we can return directly if btrfs_find_all_roots() fails. Signed-off-by: Wang Shilong Reviewed-by: Miao Xie Reviewed-by: Jan Schmidt Signed-off-by: Josef Bacik --- fs/btrfs/qgroup.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 5471e47d6559..b44124dd2370 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1153,7 +1153,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, ret = btrfs_find_all_roots(trans, fs_info, node->bytenr, sgn > 0 ? node->seq - 1 : node->seq, &roots); if (ret < 0) - goto out; + return ret; spin_lock(&fs_info->qgroup_lock); quota_root = fs_info->quota_root; @@ -1275,7 +1275,6 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, ret = 0; unlock: spin_unlock(&fs_info->qgroup_lock); -out: ulist_free(roots); ulist_free(tmp); -- cgit v1.2.3 From 39847c4d3d91f487f9ab3d083ee5d0f8419f105c Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 28 Mar 2013 08:08:20 +0000 Subject: Btrfs: fix wrong reservation of csums We reserve the space for csums only when we write data into a file, in the other cases, such as tree log, log replay, we don't do reservation, so we can use the reservation of the transaction handle just for the former. And for the latter, we should use the tree's own reservation. But the function - btrfs_csum_file_blocks() didn't differentiate between these two types of the cases, fix it. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/file-item.c | 2 -- fs/btrfs/inode.c | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index ec160202be3e..b7e529d2860f 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -728,7 +728,6 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, return -ENOMEM; sector_sum = sums->sums; - trans->adding_csums = 1; again: next_offset = (u64)-1; found_next = 0; @@ -899,7 +898,6 @@ next_sector: goto again; } out: - trans->adding_csums = 0; btrfs_free_path(path); return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8cab424c75f8..b88381582dab 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1757,8 +1757,10 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum *sum; list_for_each_entry(sum, list, list) { + trans->adding_csums = 1; btrfs_csum_file_blocks(trans, BTRFS_I(inode)->root->fs_info->csum_root, sum); + trans->adding_csums = 0; } return 0; } -- cgit v1.2.3 From 82d130ff390be67d980d8b6f39e921c0b1d8d8e0 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 28 Mar 2013 08:12:15 +0000 Subject: Btrfs: fix wrong return value of btrfs_lookup_csum() If we don't find the expected csum item, but find a csum item which is adjacent to the specified extent, we should return -EFBIG, or we should return -ENOENT. But btrfs_lookup_csum() return -EFBIG even the csum item is not adjacent to the specified extent. Fix it. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/file-item.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index b7e529d2860f..c4628a201cb3 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -118,9 +118,11 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]); csums_in_item /= csum_size; - if (csum_offset >= csums_in_item) { + if (csum_offset == csums_in_item) { ret = -EFBIG; goto fail; + } else if (csum_offset > csums_in_item) { + goto fail; } } item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); -- cgit v1.2.3 From d8fe29e9dea8d7d61fd140d8779326856478fc62 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 29 Mar 2013 08:09:34 -0600 Subject: Btrfs: don't drop path when printing out tree errors in scrub A user reported a panic where we were panicing somewhere in tree_backref_for_extent from scrub_print_warning. He only captured the trace but looking at scrub_print_warning we drop the path right before we mess with the extent buffer to print out a bunch of stuff, which isn't right. So fix this by dropping the path after we use the eb if we need to. Thanks, Cc: stable@vger.kernel.org Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/scrub.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 53c3501fa4ca..85e072b956d5 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -542,7 +542,6 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) eb = path->nodes[0]; ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); item_size = btrfs_item_size_nr(eb, path->slots[0]); - btrfs_release_path(path); if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { do { @@ -558,7 +557,9 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) ret < 0 ? -1 : ref_level, ret < 0 ? -1 : ref_root); } while (ret != 1); + btrfs_release_path(path); } else { + btrfs_release_path(path); swarn.path = path; swarn.dev = dev; iterate_extent_inodes(fs_info, found_key.objectid, -- cgit v1.2.3 From 839a8e8660b6777e7fe4e80af1a048aebe2b5977 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 1 Apr 2013 19:08:06 -0700 Subject: writeback: replace custom worker pool implementation with unbound workqueue Writeback implements its own worker pool - each bdi can be associated with a worker thread which is created and destroyed dynamically. The worker thread for the default bdi is always present and serves as the "forker" thread which forks off worker threads for other bdis. there's no reason for writeback to implement its own worker pool when using unbound workqueue instead is much simpler and more efficient. This patch replaces custom worker pool implementation in writeback with an unbound workqueue. The conversion isn't too complicated but the followings are worth mentioning. * bdi_writeback->last_active, task and wakeup_timer are removed. delayed_work ->dwork is added instead. Explicit timer handling is no longer necessary. Everything works by either queueing / modding / flushing / canceling the delayed_work item. * bdi_writeback_thread() becomes bdi_writeback_workfn() which runs off bdi_writeback->dwork. On each execution, it processes bdi->work_list and reschedules itself if there are more things to do. The function also handles low-mem condition, which used to be handled by the forker thread. If the function is running off a rescuer thread, it only writes out limited number of pages so that the rescuer can serve other bdis too. This preserves the flusher creation failure behavior of the forker thread. * INIT_LIST_HEAD(&bdi->bdi_list) is used to tell bdi_writeback_workfn() about on-going bdi unregistration so that it always drains work_list even if it's running off the rescuer. Note that the original code was broken in this regard. Under memory pressure, a bdi could finish unregistration with non-empty work_list. * The default bdi is no longer special. It now is treated the same as any other bdi and bdi_cap_flush_forker() is removed. * BDI_pending is no longer used. Removed. * Some tracepoints become non-applicable. The following TPs are removed - writeback_nothread, writeback_wake_thread, writeback_wake_forker_thread, writeback_thread_start, writeback_thread_stop. Everything, including devices coming and going away and rescuer operation under simulated memory pressure, seems to work fine in my test setup. Signed-off-by: Tejun Heo Reviewed-by: Jan Kara Cc: Jens Axboe Cc: Fengguang Wu Cc: Jeff Moyer --- fs/fs-writeback.c | 102 +++++----------- include/linux/backing-dev.h | 15 +-- include/trace/events/writeback.h | 5 - mm/backing-dev.c | 255 +++++---------------------------------- 4 files changed, 65 insertions(+), 312 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 21f46fb3a101..8067d3719e94 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include @@ -88,20 +87,6 @@ static inline struct inode *wb_inode(struct list_head *head) #define CREATE_TRACE_POINTS #include -/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */ -static void bdi_wakeup_flusher(struct backing_dev_info *bdi) -{ - if (bdi->wb.task) { - wake_up_process(bdi->wb.task); - } else { - /* - * The bdi thread isn't there, wake up the forker thread which - * will create and run it. - */ - wake_up_process(default_backing_dev_info.wb.task); - } -} - static void bdi_queue_work(struct backing_dev_info *bdi, struct wb_writeback_work *work) { @@ -109,10 +94,9 @@ static void bdi_queue_work(struct backing_dev_info *bdi, spin_lock_bh(&bdi->wb_lock); list_add_tail(&work->list, &bdi->work_list); - if (!bdi->wb.task) - trace_writeback_nothread(bdi, work); - bdi_wakeup_flusher(bdi); spin_unlock_bh(&bdi->wb_lock); + + mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); } static void @@ -127,10 +111,8 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, */ work = kzalloc(sizeof(*work), GFP_ATOMIC); if (!work) { - if (bdi->wb.task) { - trace_writeback_nowork(bdi); - wake_up_process(bdi->wb.task); - } + trace_writeback_nowork(bdi); + mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); return; } @@ -177,9 +159,7 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi) * writeback as soon as there is no other work to do. */ trace_writeback_wake_background(bdi); - spin_lock_bh(&bdi->wb_lock); - bdi_wakeup_flusher(bdi); - spin_unlock_bh(&bdi->wb_lock); + mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); } /* @@ -1020,66 +1000,48 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) /* * Handle writeback of dirty data for the device backed by this bdi. Also - * wakes up periodically and does kupdated style flushing. + * reschedules periodically and does kupdated style flushing. */ -int bdi_writeback_thread(void *data) +void bdi_writeback_workfn(struct work_struct *work) { - struct bdi_writeback *wb = data; + struct bdi_writeback *wb = container_of(to_delayed_work(work), + struct bdi_writeback, dwork); struct backing_dev_info *bdi = wb->bdi; long pages_written; current->flags |= PF_SWAPWRITE; - set_freezable(); - wb->last_active = jiffies; - - /* - * Our parent may run at a different priority, just set us to normal - */ - set_user_nice(current, 0); - - trace_writeback_thread_start(bdi); - while (!kthread_freezable_should_stop(NULL)) { + if (likely(!current_is_workqueue_rescuer() || + list_empty(&bdi->bdi_list))) { /* - * Remove own delayed wake-up timer, since we are already awake - * and we'll take care of the periodic write-back. + * The normal path. Keep writing back @bdi until its + * work_list is empty. Note that this path is also taken + * if @bdi is shutting down even when we're running off the + * rescuer as work_list needs to be drained. */ - del_timer(&wb->wakeup_timer); - - pages_written = wb_do_writeback(wb, 0); - + do { + pages_written = wb_do_writeback(wb, 0); + trace_writeback_pages_written(pages_written); + } while (!list_empty(&bdi->work_list)); + } else { + /* + * bdi_wq can't get enough workers and we're running off + * the emergency worker. Don't hog it. Hopefully, 1024 is + * enough for efficient IO. + */ + pages_written = writeback_inodes_wb(&bdi->wb, 1024, + WB_REASON_FORKER_THREAD); trace_writeback_pages_written(pages_written); - - if (pages_written) - wb->last_active = jiffies; - - set_current_state(TASK_INTERRUPTIBLE); - if (!list_empty(&bdi->work_list) || kthread_should_stop()) { - __set_current_state(TASK_RUNNING); - continue; - } - - if (wb_has_dirty_io(wb) && dirty_writeback_interval) - schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); - else { - /* - * We have nothing to do, so can go sleep without any - * timeout and save power. When a work is queued or - * something is made dirty - we will be woken up. - */ - schedule(); - } } - /* Flush any work that raced with us exiting */ - if (!list_empty(&bdi->work_list)) - wb_do_writeback(wb, 1); + if (!list_empty(&bdi->work_list) || + (wb_has_dirty_io(wb) && dirty_writeback_interval)) + queue_delayed_work(bdi_wq, &wb->dwork, + msecs_to_jiffies(dirty_writeback_interval * 10)); - trace_writeback_thread_stop(bdi); - return 0; + current->flags &= ~PF_SWAPWRITE; } - /* * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back * the whole world. diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index a5ef27f5411a..c3881553f7d1 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -18,6 +18,7 @@ #include #include #include +#include struct page; struct device; @@ -27,7 +28,6 @@ struct dentry; * Bits in backing_dev_info.state */ enum bdi_state { - BDI_pending, /* On its way to being activated */ BDI_wb_alloc, /* Default embedded wb allocated */ BDI_async_congested, /* The async (write) queue is getting full */ BDI_sync_congested, /* The sync queue is getting full */ @@ -53,10 +53,8 @@ struct bdi_writeback { unsigned int nr; unsigned long last_old_flush; /* last old data flush */ - unsigned long last_active; /* last time bdi thread was active */ - struct task_struct *task; /* writeback thread */ - struct timer_list wakeup_timer; /* used for delayed bdi thread wakeup */ + struct delayed_work dwork; /* work item used for writeback */ struct list_head b_dirty; /* dirty inodes */ struct list_head b_io; /* parked for writeback */ struct list_head b_more_io; /* parked for more writeback */ @@ -123,7 +121,7 @@ int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int); void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, enum wb_reason reason); void bdi_start_background_writeback(struct backing_dev_info *bdi); -int bdi_writeback_thread(void *data); +void bdi_writeback_workfn(struct work_struct *work); int bdi_has_dirty_io(struct backing_dev_info *bdi); void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi); void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2); @@ -131,6 +129,8 @@ void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2); extern spinlock_t bdi_lock; extern struct list_head bdi_list; +extern struct workqueue_struct *bdi_wq; + static inline int wb_has_dirty_io(struct bdi_writeback *wb) { return !list_empty(&wb->b_dirty) || @@ -335,11 +335,6 @@ static inline bool bdi_cap_swap_backed(struct backing_dev_info *bdi) return bdi->capabilities & BDI_CAP_SWAP_BACKED; } -static inline bool bdi_cap_flush_forker(struct backing_dev_info *bdi) -{ - return bdi == &default_backing_dev_info; -} - static inline bool mapping_cap_writeback_dirty(struct address_space *mapping) { return bdi_cap_writeback_dirty(mapping->backing_dev_info); diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 6a16fd2e70ed..464ea82e10db 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -183,7 +183,6 @@ DECLARE_EVENT_CLASS(writeback_work_class, DEFINE_EVENT(writeback_work_class, name, \ TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), \ TP_ARGS(bdi, work)) -DEFINE_WRITEBACK_WORK_EVENT(writeback_nothread); DEFINE_WRITEBACK_WORK_EVENT(writeback_queue); DEFINE_WRITEBACK_WORK_EVENT(writeback_exec); DEFINE_WRITEBACK_WORK_EVENT(writeback_start); @@ -222,12 +221,8 @@ DEFINE_EVENT(writeback_class, name, \ DEFINE_WRITEBACK_EVENT(writeback_nowork); DEFINE_WRITEBACK_EVENT(writeback_wake_background); -DEFINE_WRITEBACK_EVENT(writeback_wake_thread); -DEFINE_WRITEBACK_EVENT(writeback_wake_forker_thread); DEFINE_WRITEBACK_EVENT(writeback_bdi_register); DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister); -DEFINE_WRITEBACK_EVENT(writeback_thread_start); -DEFINE_WRITEBACK_EVENT(writeback_thread_stop); DECLARE_EVENT_CLASS(wbc_class, TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 657569b3fcf6..2857d4f6bca4 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -37,6 +37,9 @@ static struct class *bdi_class; DEFINE_SPINLOCK(bdi_lock); LIST_HEAD(bdi_list); +/* bdi_wq serves all asynchronous writeback tasks */ +struct workqueue_struct *bdi_wq; + void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) { if (wb1 < wb2) { @@ -255,6 +258,11 @@ static int __init default_bdi_init(void) { int err; + bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_FREEZABLE | + WQ_UNBOUND, 0); + if (!bdi_wq) + return -ENOMEM; + err = bdi_init(&default_backing_dev_info); if (!err) bdi_register(&default_backing_dev_info, NULL, "default"); @@ -269,26 +277,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi) return wb_has_dirty_io(&bdi->wb); } -static void wakeup_timer_fn(unsigned long data) -{ - struct backing_dev_info *bdi = (struct backing_dev_info *)data; - - spin_lock_bh(&bdi->wb_lock); - if (bdi->wb.task) { - trace_writeback_wake_thread(bdi); - wake_up_process(bdi->wb.task); - } else if (bdi->dev) { - /* - * When bdi tasks are inactive for long time, they are killed. - * In this case we have to wake-up the forker thread which - * should create and run the bdi thread. - */ - trace_writeback_wake_forker_thread(bdi); - wake_up_process(default_backing_dev_info.wb.task); - } - spin_unlock_bh(&bdi->wb_lock); -} - /* * This function is used when the first inode for this bdi is marked dirty. It * wakes-up the corresponding bdi thread which should then take care of the @@ -305,176 +293,7 @@ void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi) unsigned long timeout; timeout = msecs_to_jiffies(dirty_writeback_interval * 10); - mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout); -} - -/* - * Calculate the longest interval (jiffies) bdi threads are allowed to be - * inactive. - */ -static unsigned long bdi_longest_inactive(void) -{ - unsigned long interval; - - interval = msecs_to_jiffies(dirty_writeback_interval * 10); - return max(5UL * 60 * HZ, interval); -} - -/* - * Clear pending bit and wakeup anybody waiting for flusher thread creation or - * shutdown - */ -static void bdi_clear_pending(struct backing_dev_info *bdi) -{ - clear_bit(BDI_pending, &bdi->state); - smp_mb__after_clear_bit(); - wake_up_bit(&bdi->state, BDI_pending); -} - -static int bdi_forker_thread(void *ptr) -{ - struct bdi_writeback *me = ptr; - - current->flags |= PF_SWAPWRITE; - set_freezable(); - - /* - * Our parent may run at a different priority, just set us to normal - */ - set_user_nice(current, 0); - - for (;;) { - struct task_struct *task = NULL; - struct backing_dev_info *bdi; - enum { - NO_ACTION, /* Nothing to do */ - FORK_THREAD, /* Fork bdi thread */ - KILL_THREAD, /* Kill inactive bdi thread */ - } action = NO_ACTION; - - /* - * Temporary measure, we want to make sure we don't see - * dirty data on the default backing_dev_info - */ - if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) { - del_timer(&me->wakeup_timer); - wb_do_writeback(me, 0); - } - - spin_lock_bh(&bdi_lock); - /* - * In the following loop we are going to check whether we have - * some work to do without any synchronization with tasks - * waking us up to do work for them. Set the task state here - * so that we don't miss wakeups after verifying conditions. - */ - set_current_state(TASK_INTERRUPTIBLE); - - list_for_each_entry(bdi, &bdi_list, bdi_list) { - bool have_dirty_io; - - if (!bdi_cap_writeback_dirty(bdi) || - bdi_cap_flush_forker(bdi)) - continue; - - WARN(!test_bit(BDI_registered, &bdi->state), - "bdi %p/%s is not registered!\n", bdi, bdi->name); - - have_dirty_io = !list_empty(&bdi->work_list) || - wb_has_dirty_io(&bdi->wb); - - /* - * If the bdi has work to do, but the thread does not - * exist - create it. - */ - if (!bdi->wb.task && have_dirty_io) { - /* - * Set the pending bit - if someone will try to - * unregister this bdi - it'll wait on this bit. - */ - set_bit(BDI_pending, &bdi->state); - action = FORK_THREAD; - break; - } - - spin_lock(&bdi->wb_lock); - - /* - * If there is no work to do and the bdi thread was - * inactive long enough - kill it. The wb_lock is taken - * to make sure no-one adds more work to this bdi and - * wakes the bdi thread up. - */ - if (bdi->wb.task && !have_dirty_io && - time_after(jiffies, bdi->wb.last_active + - bdi_longest_inactive())) { - task = bdi->wb.task; - bdi->wb.task = NULL; - spin_unlock(&bdi->wb_lock); - set_bit(BDI_pending, &bdi->state); - action = KILL_THREAD; - break; - } - spin_unlock(&bdi->wb_lock); - } - spin_unlock_bh(&bdi_lock); - - /* Keep working if default bdi still has things to do */ - if (!list_empty(&me->bdi->work_list)) - __set_current_state(TASK_RUNNING); - - switch (action) { - case FORK_THREAD: - __set_current_state(TASK_RUNNING); - task = kthread_create(bdi_writeback_thread, &bdi->wb, - "flush-%s", dev_name(bdi->dev)); - if (IS_ERR(task)) { - /* - * If thread creation fails, force writeout of - * the bdi from the thread. Hopefully 1024 is - * large enough for efficient IO. - */ - writeback_inodes_wb(&bdi->wb, 1024, - WB_REASON_FORKER_THREAD); - } else { - /* - * The spinlock makes sure we do not lose - * wake-ups when racing with 'bdi_queue_work()'. - * And as soon as the bdi thread is visible, we - * can start it. - */ - spin_lock_bh(&bdi->wb_lock); - bdi->wb.task = task; - spin_unlock_bh(&bdi->wb_lock); - wake_up_process(task); - } - bdi_clear_pending(bdi); - break; - - case KILL_THREAD: - __set_current_state(TASK_RUNNING); - kthread_stop(task); - bdi_clear_pending(bdi); - break; - - case NO_ACTION: - if (!wb_has_dirty_io(me) || !dirty_writeback_interval) - /* - * There are no dirty data. The only thing we - * should now care about is checking for - * inactive bdi threads and killing them. Thus, - * let's sleep for longer time, save energy and - * be friendly for battery-driven devices. - */ - schedule_timeout(bdi_longest_inactive()); - else - schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); - try_to_freeze(); - break; - } - } - - return 0; + mod_delayed_work(bdi_wq, &bdi->wb.dwork, timeout); } /* @@ -487,6 +306,9 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi) spin_unlock_bh(&bdi_lock); synchronize_rcu_expedited(); + + /* bdi_list is now unused, clear it to mark @bdi dying */ + INIT_LIST_HEAD(&bdi->bdi_list); } int bdi_register(struct backing_dev_info *bdi, struct device *parent, @@ -506,20 +328,6 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, bdi->dev = dev; - /* - * Just start the forker thread for our default backing_dev_info, - * and add other bdi's to the list. They will get a thread created - * on-demand when they need it. - */ - if (bdi_cap_flush_forker(bdi)) { - struct bdi_writeback *wb = &bdi->wb; - - wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s", - dev_name(dev)); - if (IS_ERR(wb->task)) - return PTR_ERR(wb->task); - } - bdi_debug_register(bdi, dev_name(dev)); set_bit(BDI_registered, &bdi->state); @@ -543,8 +351,6 @@ EXPORT_SYMBOL(bdi_register_dev); */ static void bdi_wb_shutdown(struct backing_dev_info *bdi) { - struct task_struct *task; - if (!bdi_cap_writeback_dirty(bdi)) return; @@ -554,22 +360,20 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi) bdi_remove_from_list(bdi); /* - * If setup is pending, wait for that to complete first + * Drain work list and shutdown the delayed_work. At this point, + * @bdi->bdi_list is empty telling bdi_Writeback_workfn() that @bdi + * is dying and its work_list needs to be drained no matter what. */ - wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait, - TASK_UNINTERRUPTIBLE); + mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); + flush_delayed_work(&bdi->wb.dwork); + WARN_ON(!list_empty(&bdi->work_list)); /* - * Finally, kill the kernel thread. We don't need to be RCU - * safe anymore, since the bdi is gone from visibility. + * This shouldn't be necessary unless @bdi for some reason has + * unflushed dirty IO after work_list is drained. Do it anyway + * just in case. */ - spin_lock_bh(&bdi->wb_lock); - task = bdi->wb.task; - bdi->wb.task = NULL; - spin_unlock_bh(&bdi->wb_lock); - - if (task) - kthread_stop(task); + cancel_delayed_work_sync(&bdi->wb.dwork); } /* @@ -595,10 +399,8 @@ void bdi_unregister(struct backing_dev_info *bdi) bdi_set_min_ratio(bdi, 0); trace_writeback_bdi_unregister(bdi); bdi_prune_sb(bdi); - del_timer_sync(&bdi->wb.wakeup_timer); - if (!bdi_cap_flush_forker(bdi)) - bdi_wb_shutdown(bdi); + bdi_wb_shutdown(bdi); bdi_debug_unregister(bdi); spin_lock_bh(&bdi->wb_lock); @@ -620,7 +422,7 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) INIT_LIST_HEAD(&wb->b_io); INIT_LIST_HEAD(&wb->b_more_io); spin_lock_init(&wb->list_lock); - setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); + INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn); } /* @@ -693,12 +495,11 @@ void bdi_destroy(struct backing_dev_info *bdi) bdi_unregister(bdi); /* - * If bdi_unregister() had already been called earlier, the - * wakeup_timer could still be armed because bdi_prune_sb() - * can race with the bdi_wakeup_thread_delayed() calls from - * __mark_inode_dirty(). + * If bdi_unregister() had already been called earlier, the dwork + * could still be pending because bdi_prune_sb() can race with the + * bdi_wakeup_thread_delayed() calls from __mark_inode_dirty(). */ - del_timer_sync(&bdi->wb.wakeup_timer); + cancel_delayed_work_sync(&bdi->wb.dwork); for (i = 0; i < NR_BDI_STAT_ITEMS; i++) percpu_counter_destroy(&bdi->bdi_stat[i]); -- cgit v1.2.3