From 55b02d74e126df70fb6b92a0e4e4bed2f0a13fe4 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Fri, 25 Nov 2005 16:41:33 +1100 Subject: [XFS] Fix potential overflow in xfs_iomap_t delta for very large extents SGI-PV: 945311 SGI-Modid: xfs-linux-melb:xfs-kern:201708a Signed-off-by: Eric Sandeen Signed-off-by: Nathan Scott --- fs/xfs/xfs_iomap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index fcd6d63bb68b..3ce204a524b0 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -69,7 +69,7 @@ typedef struct xfs_iomap { xfs_buftarg_t *iomap_target; xfs_off_t iomap_offset; /* offset of mapping, bytes */ xfs_off_t iomap_bsize; /* size of mapping, bytes */ - size_t iomap_delta; /* offset into mapping, bytes */ + xfs_off_t iomap_delta; /* offset into mapping, bytes */ iomap_flags_t iomap_flags; } xfs_iomap_t; -- cgit v1.2.3 From f33c6797bccc695c4c85885f2c676ad4c8fed98d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 25 Nov 2005 16:41:47 +1100 Subject: [XFS] handle error returns from freeze_bdev SGI-PV: 945483 SGI-Modid: xfs-linux-melb:xfs-kern:201884a Signed-off-by: Christoph Hellwig Signed-off-by: Nathan Scott --- fs/xfs/xfs_fsops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 7ceabd0e2d9d..d1236d6f4045 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -550,7 +550,7 @@ xfs_fs_goingdown( struct vfs *vfsp = XFS_MTOVFS(mp); struct super_block *sb = freeze_bdev(vfsp->vfs_super->s_bdev); - if (sb) { + if (sb && !IS_ERR(sb)) { xfs_force_shutdown(mp, XFS_FORCE_UMOUNT); thaw_bdev(sb->s_bdev, sb); } -- cgit v1.2.3 From a4656391b76ed93faed724c5963f033164ee477e Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Fri, 25 Nov 2005 16:41:57 +1100 Subject: [XFS] Fix a 32 bit value wraparound when providing a mapping for a large direct write. 
SGI-PV: 944820 SGI-Modid: xfs-linux-melb:xfs-kern:24351a Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_aops.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index c6108971b4e6..94d3cdfbf9b8 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -941,13 +941,12 @@ __linvfs_get_block( int retpbbm = 1; int error; - if (blocks) { - offset = blocks << inode->i_blkbits; /* 64 bit goodness */ - size = (ssize_t) min_t(xfs_off_t, offset, LONG_MAX); - } else { - size = 1 << inode->i_blkbits; - } offset = (xfs_off_t)iblock << inode->i_blkbits; + if (blocks) + size = (ssize_t) min_t(xfs_off_t, LONG_MAX, + (xfs_off_t)blocks << inode->i_blkbits); + else + size = 1 << inode->i_blkbits; VOP_BMAP(vp, offset, size, create ? flags : BMAPI_READ, &iomap, &retpbbm, error); @@ -1007,7 +1006,7 @@ __linvfs_get_block( ASSERT(iomap.iomap_bsize - iomap.iomap_delta > 0); offset = min_t(xfs_off_t, iomap.iomap_bsize - iomap.iomap_delta, - blocks << inode->i_blkbits); + (xfs_off_t)blocks << inode->i_blkbits); bh_result->b_size = (u32) min_t(xfs_off_t, UINT_MAX, offset); } -- cgit v1.2.3 From 6b2cf618cc8445a03640d1e5e36829352e297017 Mon Sep 17 00:00:00 2001 From: Felix Blyakher Date: Fri, 25 Nov 2005 16:42:13 +1100 Subject: [XFS] Tight loop in xfs_finish_reclaim_all prevented xfslogd from running its queue of IO completion callbacks, thus creating a deadlock between umount and xfslogd. Breaking the loop solves the problem. 
SGI-PV: 943821 SGI-Modid: xfs-linux-melb:xfs-kern:202363a Signed-off-by: Felix Blyakher Signed-off-by: Nathan Scott --- fs/xfs/xfs_vnodeops.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 7c1f74531463..e03fa2a3d5ed 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -3958,8 +3958,9 @@ xfs_finish_reclaim_all(xfs_mount_t *mp, int noblock) } } XFS_MOUNT_IUNLOCK(mp); - xfs_finish_reclaim(ip, noblock, - XFS_IFLUSH_DELWRI_ELSE_ASYNC); + if (xfs_finish_reclaim(ip, noblock, + XFS_IFLUSH_DELWRI_ELSE_ASYNC)) + delay(1); purged = 1; break; } -- cgit v1.2.3 From e0144ca55391556d781cb1c90fd6f00bb0c20760 Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Fri, 25 Nov 2005 16:42:22 +1100 Subject: [XFS] Fix a case where attr2 format was being used unconditionally. SGI-PV: 941645 SGI-Modid: xfs-linux-melb:xfs-kern:24566a Signed-off-by: Nathan Scott --- fs/xfs/xfs_attr_leaf.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 35e557b00db2..1c7421840c18 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -310,7 +310,8 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) * Fix up the start offset of the attribute fork */ totsize -= size; - if (totsize == sizeof(xfs_attr_sf_hdr_t) && !args->addname) { + if (totsize == sizeof(xfs_attr_sf_hdr_t) && !args->addname && + !(mp->m_flags & XFS_MOUNT_COMPAT_ATTR)) { /* * Last attribute now removed, revert to original * inode format making all literal area available @@ -328,7 +329,8 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize); ASSERT(dp->i_d.di_forkoff); - ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) || args->addname); + ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) || args->addname || + (mp->m_flags & 
XFS_MOUNT_COMPAT_ATTR)); dp->i_afp->if_ext_max = XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); dp->i_df.if_ext_max = @@ -737,7 +739,8 @@ xfs_attr_shortform_allfit(xfs_dabuf_t *bp, xfs_inode_t *dp) + name_loc->namelen + INT_GET(name_loc->valuelen, ARCH_CONVERT); } - if (bytes == sizeof(struct xfs_attr_sf_hdr)) + if (!(dp->i_mount->m_flags & XFS_MOUNT_COMPAT_ATTR) && + (bytes == sizeof(struct xfs_attr_sf_hdr))) return(-1); return(xfs_attr_shortform_bytesfit(dp, bytes)); } @@ -775,6 +778,8 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff) goto out; if (forkoff == -1) { + ASSERT(!(dp->i_mount->m_flags & XFS_MOUNT_COMPAT_ATTR)); + /* * Last attribute was removed, revert to original * inode format making all literal area available -- cgit v1.2.3 From 551c81e2d32c5867fb592091365d8c37e1509dce Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Fri, 25 Nov 2005 16:42:28 +1100 Subject: [XFS] Resolve the xlog_grant_log_space hang, revert inline to macro. SGI-PV: 946205 SGI-Modid: xfs-linux-melb:xfs-kern:24567a Signed-off-by: Nathan Scott --- fs/xfs/xfs_log_priv.h | 36 ++++++++++++------------------------ 1 file changed, 12 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 8f285149681f..4518b188ade6 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -494,10 +494,8 @@ typedef struct log { #define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) -#define XLOG_GRANT_SUB_SPACE(log,bytes,type) \ - xlog_grant_sub_space(log,bytes,type) -static inline void xlog_grant_sub_space(struct log *log, int bytes, int type) -{ +#define XLOG_GRANT_SUB_SPACE(log,bytes,type) \ + { \ if (type == 'w') { \ (log)->l_grant_write_bytes -= (bytes); \ if ((log)->l_grant_write_bytes < 0) { \ @@ -511,13 +509,9 @@ static inline void xlog_grant_sub_space(struct log *log, int bytes, int type) (log)->l_grant_reserve_cycle--; \ } \ } \ -} - -#define XLOG_GRANT_ADD_SPACE(log,bytes,type) \ - 
xlog_grant_add_space(log,bytes,type) -static inline void -xlog_grant_add_space(struct log *log, int bytes, int type) -{ + } +#define XLOG_GRANT_ADD_SPACE(log,bytes,type) \ + { \ if (type == 'w') { \ (log)->l_grant_write_bytes += (bytes); \ if ((log)->l_grant_write_bytes > (log)->l_logsize) { \ @@ -531,12 +525,9 @@ xlog_grant_add_space(struct log *log, int bytes, int type) (log)->l_grant_reserve_cycle++; \ } \ } \ -} - -#define XLOG_INS_TICKETQ(q, tic) xlog_ins_ticketq(q, tic) -static inline void -xlog_ins_ticketq(struct xlog_ticket *q, struct xlog_ticket *tic) -{ \ + } +#define XLOG_INS_TICKETQ(q, tic) \ + { \ if (q) { \ (tic)->t_next = (q); \ (tic)->t_prev = (q)->t_prev; \ @@ -547,12 +538,9 @@ xlog_ins_ticketq(struct xlog_ticket *q, struct xlog_ticket *tic) (q) = (tic); \ } \ (tic)->t_flags |= XLOG_TIC_IN_Q; \ -} - -#define XLOG_DEL_TICKETQ(q, tic) xlog_del_ticketq(q, tic) -static inline void -xlog_del_ticketq(struct xlog_ticket *q, struct xlog_ticket *tic) -{ \ + } +#define XLOG_DEL_TICKETQ(q, tic) \ + { \ if ((tic) == (tic)->t_next) { \ (q) = NULL; \ } else { \ @@ -562,7 +550,7 @@ xlog_del_ticketq(struct xlog_ticket *q, struct xlog_ticket *tic) } \ (tic)->t_next = (tic)->t_prev = NULL; \ (tic)->t_flags &= ~XLOG_TIC_IN_Q; \ -} + } /* common routines */ extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); -- cgit v1.2.3 From 36f20c6df75d599393d79c7feb6283b20913e3d5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 25 Nov 2005 17:09:57 -0500 Subject: NFSv4: Fix buggy nfs_wait_on_sequence() Signed-off-by: Trond Myklebust --- fs/nfs/nfs4state.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 0675f3215e0a..5ef4c57618fe 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -644,12 +644,15 @@ void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t f struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter) { + 
struct rpc_sequence *sequence = counter->sequence; struct nfs_seqid *new; new = kmalloc(sizeof(*new), GFP_KERNEL); if (new != NULL) { new->sequence = counter; - INIT_LIST_HEAD(&new->list); + spin_lock(&sequence->lock); + list_add_tail(&new->list, &sequence->list); + spin_unlock(&sequence->lock); } return new; } @@ -658,12 +661,10 @@ void nfs_free_seqid(struct nfs_seqid *seqid) { struct rpc_sequence *sequence = seqid->sequence->sequence; - if (!list_empty(&seqid->list)) { - spin_lock(&sequence->lock); - list_del(&seqid->list); - spin_unlock(&sequence->lock); - } - rpc_wake_up_next(&sequence->wait); + spin_lock(&sequence->lock); + list_del(&seqid->list); + spin_unlock(&sequence->lock); + rpc_wake_up(&sequence->wait); kfree(seqid); } @@ -722,11 +723,10 @@ int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task) if (sequence->list.next == &seqid->list) goto out; spin_lock(&sequence->lock); - if (!list_empty(&sequence->list)) { + if (sequence->list.next != &seqid->list) { rpc_sleep_on(&sequence->wait, task, NULL, NULL); status = -EAGAIN; - } else - list_add(&seqid->list, &sequence->list); + } spin_unlock(&sequence->lock); out: return status; -- cgit v1.2.3 From ff6040667ad5a21fa1090e02941ecefb94ebe32c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 25 Nov 2005 17:10:01 -0500 Subject: NFSv4: Fix typo in lock caching When caching locks due to holding a file delegation, we must always check against local locks before sending anything to the server. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 21482b2518f6..60e0dd800cc3 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3071,15 +3071,15 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock struct nfs4_client *clp = state->owner->so_client; int status; - down_read(&clp->cl_sem); /* Is this a delegated open? 
*/ - if (test_bit(NFS_DELEGATED_STATE, &state->flags)) { + if (NFS_I(state->inode)->delegation_state != 0) { /* Yes: cache locks! */ status = do_vfs_lock(request->fl_file, request); /* ...but avoid races with delegation recall... */ if (status < 0 || test_bit(NFS_DELEGATED_STATE, &state->flags)) - goto out; + return status; } + down_read(&clp->cl_sem); status = nfs4_set_lock_state(state, request); if (status != 0) goto out; -- cgit v1.2.3 From b37b03b7051493c9f9a6b336c9c0f81334885b7d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 25 Nov 2005 17:10:06 -0500 Subject: NFS: Fix a spinlock recursion inside nfs_update_inode() In cases where the server has gone insane, nfs_update_inode() may end up calling nfs_invalidate_inode(), which again calls stuff that takes the inode->i_lock that we're already holding. In addition, given the sort of things we have in NFS these days that need to be cleaned up on inode release, I'm not sure we should ever be calling make_bad_inode(). Fix up spinlock recursion, and limit nfs_invalidate_inode() to clearing the caches, and marking the inode as being stale. Thanks to Steve Dickson for spotting this. 
Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 6391d8964214..aaab1a5ac461 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -643,14 +643,11 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt) /* * Invalidate the local caches */ -void -nfs_zap_caches(struct inode *inode) +static void nfs_zap_caches_locked(struct inode *inode) { struct nfs_inode *nfsi = NFS_I(inode); int mode = inode->i_mode; - spin_lock(&inode->i_lock); - NFS_ATTRTIMEO(inode) = NFS_MINATTRTIMEO(inode); NFS_ATTRTIMEO_UPDATE(inode) = jiffies; @@ -659,7 +656,12 @@ nfs_zap_caches(struct inode *inode) nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE; else nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE; +} +void nfs_zap_caches(struct inode *inode) +{ + spin_lock(&inode->i_lock); + nfs_zap_caches_locked(inode); spin_unlock(&inode->i_lock); } @@ -676,16 +678,13 @@ static void nfs_zap_acl_cache(struct inode *inode) } /* - * Invalidate, but do not unhash, the inode + * Invalidate, but do not unhash, the inode. + * NB: must be called with inode->i_lock held! 
*/ -static void -nfs_invalidate_inode(struct inode *inode) +static void nfs_invalidate_inode(struct inode *inode) { - umode_t save_mode = inode->i_mode; - - make_bad_inode(inode); - inode->i_mode = save_mode; - nfs_zap_caches(inode); + set_bit(NFS_INO_STALE, &NFS_FLAGS(inode)); + nfs_zap_caches_locked(inode); } struct nfs_find_desc { @@ -1528,14 +1527,13 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr, unsign printk(KERN_DEBUG "%s: inode %ld mode changed, %07o to %07o\n", __FUNCTION__, inode->i_ino, inode->i_mode, fattr->mode); #endif + out_err: /* * No need to worry about unhashing the dentry, as the * lookup validation will know that the inode is bad. * (But we fall through to invalidate the caches.) */ nfs_invalidate_inode(inode); - out_err: - set_bit(NFS_INO_STALE, &NFS_FLAGS(inode)); return -ESTALE; } -- cgit v1.2.3 From 3abb92722ab1784b419dadb5444daf8ea9636905 Mon Sep 17 00:00:00 2001 From: Steve French Date: Mon, 28 Nov 2005 08:16:13 -0800 Subject: [CIFS] When file is deleted locally but later recreated on the server fix cifs negative dentries so they are freed faster (not requiring umount or readdir e.g.) so the client recognizes the new file on the server more quickly. Signed-off-by: Steve French --- fs/cifs/CHANGES | 8 +++++--- fs/cifs/dir.c | 25 ++++++++++++++++--------- fs/cifs/inode.c | 22 ++++++++++++++-------- 3 files changed, 35 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index 6bded10c0d50..c40bd0df80ad 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -1,10 +1,12 @@ Version 1.39 ------------ -Defer close of a file handle slightly if pending writes depend on that file handle +Defer close of a file handle slightly if pending writes depend on that handle (this reduces the EBADF bad file handle errors that can be logged under heavy stress on writes). 
Modify cifs Kconfig options to expose CONFIG_CIFS_STATS2 -Fix SFU style symlinks and mknod needed for servers which do not support the CIFS -Unix Extensions. Fix setfacl/getfacl on bigendian. +Fix SFU style symlinks and mknod needed for servers which do not support the +CIFS Unix Extensions. Fix setfacl/getfacl on bigendian. Timeout negative +dentries so files that the client sees as deleted but that later get created +on the server will be recognized. Version 1.38 ------------ diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 16b21522e8fe..aa4ea965b329 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -465,12 +465,20 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, struct name direntry->d_op = &cifs_dentry_ops; d_add(direntry, newInode); - /* since paths are not looked up by component - the parent directories are presumed to be good here */ + /* since paths are not looked up by component - the parent + directories are presumed to be good here */ renew_parental_timestamps(direntry); } else if (rc == -ENOENT) { rc = 0; + direntry->d_time = jiffies; + if (pTcon->nocase) + direntry->d_op = &cifs_ci_dentry_ops; + else + direntry->d_op = &cifs_dentry_ops; d_add(direntry, NULL); + /* if it was once a directory (but how can we tell?) 
we could do + shrink_dcache_parent(direntry); */ } else { cERROR(1,("Error 0x%x on cifs_get_inode_info in lookup of %s", rc,full_path)); @@ -489,21 +497,20 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd) { int isValid = 1; -/* lock_kernel(); *//* surely we do not want to lock the kernel for a whole network round trip which could take seconds */ - if (direntry->d_inode) { if (cifs_revalidate(direntry)) { - /* unlock_kernel(); */ return 0; } } else { - cFYI(1, - ("In cifs_d_revalidate with no inode but name = %s and dentry 0x%p", - direntry->d_name.name, direntry)); + cFYI(1, ("neg dentry 0x%p name = %s", + direntry, direntry->d_name.name)); + if(time_after(jiffies, direntry->d_time + HZ) || + !lookupCacheEnabled) { + d_drop(direntry); + isValid = 0; + } } -/* unlock_kernel(); */ - return isValid; } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 05b525812adb..d34325c887c4 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1039,14 +1039,20 @@ int cifs_revalidate(struct dentry *direntry) filemap_fdatawrite(direntry->d_inode->i_mapping); } if (invalidate_inode) { - if (direntry->d_inode->i_mapping) - filemap_fdatawait(direntry->d_inode->i_mapping); - /* may eventually have to do this for open files too */ - if (list_empty(&(cifsInode->openFileList))) { - /* Has changed on server - flush read ahead pages */ - cFYI(1, ("Invalidating read ahead data on " - "closed file")); - invalidate_remote_inode(direntry->d_inode); + /* shrink_dcache not necessary now that cifs dentry ops + are exported for negative dentries */ +/* if(S_ISDIR(direntry->d_inode->i_mode)) + shrink_dcache_parent(direntry); */ + if (S_ISREG(direntry->d_inode->i_mode)) { + if (direntry->d_inode->i_mapping) + filemap_fdatawait(direntry->d_inode->i_mapping); + /* may eventually have to do this for open files too */ + if (list_empty(&(cifsInode->openFileList))) { + /* changed on server - flush read ahead pages */ + cFYI(1, ("Invalidating read ahead data on " + "closed file")); + 
invalidate_remote_inode(direntry->d_inode); + } } } /* up(&direntry->d_inode->i_sem); */ -- cgit v1.2.3 From 6aab341e0a28aff100a09831c5300a2994b8b986 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 28 Nov 2005 14:34:23 -0800 Subject: mm: re-architect the VM_UNPAGED logic This replaces the (in my opinion horrible) VM_UNPAGED logic with very explicit support for a "remapped page range" aka VM_PFNMAP. It allows a VM area to contain an arbitrary range of page table entries that the VM never touches, and never considers to be normal pages. Any user of "remap_pfn_range()" automatically gets this new functionality, and doesn't even have to mark the pages reserved or indeed mark them any other way. It just works. As a side effect, doing mmap() on /dev/mem works for arbitrary ranges. Sparc update from David in the next commit. Signed-off-by: Linus Torvalds --- arch/powerpc/kernel/vdso.c | 6 +- drivers/char/mem.c | 2 +- fs/proc/task_mmu.c | 7 +- include/linux/mm.h | 5 +- mm/fremap.c | 22 ++---- mm/madvise.c | 2 +- mm/memory.c | 189 ++++++++++++++++++++++++--------------------- mm/mempolicy.c | 12 +-- mm/msync.c | 12 +-- mm/nommu.c | 2 +- mm/rmap.c | 14 +--- 11 files changed, 127 insertions(+), 146 deletions(-) (limited to 'fs') diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index b44b36e0c293..f0c47dab0903 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -145,8 +145,7 @@ static void dump_vdso_pages(struct vm_area_struct * vma) struct page *pg = virt_to_page(vdso32_kbase + i*PAGE_SIZE); struct page *upg = (vma && vma->vm_mm) ? - follow_page(vma->vm_mm, vma->vm_start + - i*PAGE_SIZE, 0) + follow_page(vma, vma->vm_start + i*PAGE_SIZE, 0) : NULL; dump_one_vdso_page(pg, upg); } @@ -157,8 +156,7 @@ static void dump_vdso_pages(struct vm_area_struct * vma) struct page *pg = virt_to_page(vdso64_kbase + i*PAGE_SIZE); struct page *upg = (vma && vma->vm_mm) ? 
- follow_page(vma->vm_mm, vma->vm_start + - i*PAGE_SIZE, 0) + follow_page(vma, vma->vm_start + i*PAGE_SIZE, 0) : NULL; dump_one_vdso_page(pg, upg); } diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 29c3b631445a..91dd669273e0 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -591,7 +591,7 @@ static inline size_t read_zero_pagealigned(char __user * buf, size_t size) if (vma->vm_start > addr || (vma->vm_flags & VM_WRITE) == 0) goto out_up; - if (vma->vm_flags & (VM_SHARED | VM_HUGETLB | VM_UNPAGED)) + if (vma->vm_flags & (VM_SHARED | VM_HUGETLB)) break; count = vma->vm_end - addr; if (count > size) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 9ab97cef0daa..50bd5a8f0446 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -402,12 +402,11 @@ struct numa_maps { /* * Calculate numa node maps for a vma */ -static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma) +static struct numa_maps *get_numa_maps(struct vm_area_struct *vma) { + int i; struct page *page; unsigned long vaddr; - struct mm_struct *mm = vma->vm_mm; - int i; struct numa_maps *md = kmalloc(sizeof(struct numa_maps), GFP_KERNEL); if (!md) @@ -420,7 +419,7 @@ static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma) md->node[i] =0; for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) { - page = follow_page(mm, vaddr, 0); + page = follow_page(vma, vaddr, 0); if (page) { int count = page_mapcount(page); diff --git a/include/linux/mm.h b/include/linux/mm.h index f0cdfd18db55..6a75a7a78bf1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -145,7 +145,7 @@ extern unsigned int kobjsize(const void *objp); #define VM_GROWSDOWN 0x00000100 /* general info on the segment */ #define VM_GROWSUP 0x00000200 #define VM_SHM 0x00000000 /* Means nothing: delete it later */ -#define VM_UNPAGED 0x00000400 /* Pages managed without map count */ +#define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ 
#define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ #define VM_EXECUTABLE 0x00001000 @@ -664,6 +664,7 @@ struct zap_details { unsigned long truncate_count; /* Compare vm_truncate_count */ }; +struct page *vm_normal_page(struct vm_area_struct *, unsigned long, pte_t); unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *); unsigned long unmap_vmas(struct mmu_gather **tlb, @@ -953,7 +954,7 @@ unsigned long vmalloc_to_pfn(void *addr); int remap_pfn_range(struct vm_area_struct *, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t); -struct page *follow_page(struct mm_struct *, unsigned long address, +struct page *follow_page(struct vm_area_struct *, unsigned long address, unsigned int foll_flags); #define FOLL_WRITE 0x01 /* check pte is writable */ #define FOLL_TOUCH 0x02 /* mark page accessed */ diff --git a/mm/fremap.c b/mm/fremap.c index 007cbad9331e..f851775e09c2 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -27,24 +27,20 @@ static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page = NULL; if (pte_present(pte)) { - unsigned long pfn = pte_pfn(pte); - flush_cache_page(vma, addr, pfn); + flush_cache_page(vma, addr, pte_pfn(pte)); pte = ptep_clear_flush(vma, addr, ptep); - if (unlikely(!pfn_valid(pfn))) { - print_bad_pte(vma, pte, addr); - goto out; + page = vm_normal_page(vma, addr, pte); + if (page) { + if (pte_dirty(pte)) + set_page_dirty(page); + page_remove_rmap(page); + page_cache_release(page); } - page = pfn_to_page(pfn); - if (pte_dirty(pte)) - set_page_dirty(page); - page_remove_rmap(page); - page_cache_release(page); } else { if (!pte_file(pte)) free_swap_and_cache(pte_to_swp_entry(pte)); pte_clear(mm, addr, ptep); } -out: return !!page; } @@ -65,8 +61,6 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma, pte_t pte_val; spinlock_t *ptl; - BUG_ON(vma->vm_flags & VM_UNPAGED); - pgd = pgd_offset(mm, addr); pud = 
pud_alloc(mm, pgd, addr); if (!pud) @@ -122,8 +116,6 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, pte_t pte_val; spinlock_t *ptl; - BUG_ON(vma->vm_flags & VM_UNPAGED); - pgd = pgd_offset(mm, addr); pud = pud_alloc(mm, pgd, addr); if (!pud) diff --git a/mm/madvise.c b/mm/madvise.c index 328a3bcce527..2b7cf0400a21 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -126,7 +126,7 @@ static long madvise_dontneed(struct vm_area_struct * vma, unsigned long start, unsigned long end) { *prev = vma; - if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_UNPAGED)) + if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) return -EINVAL; if (unlikely(vma->vm_flags & VM_NONLINEAR)) { diff --git a/mm/memory.c b/mm/memory.c index d1f46f4e4c8a..b57fbc636058 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -333,9 +333,9 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) } /* - * This function is called to print an error when a pte in a - * !VM_UNPAGED region is found pointing to an invalid pfn (which - * is an error. + * This function is called to print an error when a bad pte + * is found. For example, we might have a PFN-mapped pte in + * a region that doesn't allow it. * * The calling function must still handle the error. */ @@ -350,19 +350,56 @@ void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr) } /* - * page_is_anon applies strict checks for an anonymous page belonging to - * this vma at this address. It is used on VM_UNPAGED vmas, which are - * usually populated with shared originals (which must not be counted), - * but occasionally contain private COWed copies (when !VM_SHARED, or - * perhaps via ptrace when VM_SHARED). An mmap of /dev/mem might window - * free pages, pages from other processes, or from other parts of this: - * it's tricky, but try not to be deceived by foreign anonymous pages. + * This function gets the "struct page" associated with a pte. + * + * NOTE! 
Some mappings do not have "struct pages". A raw PFN mapping + * will have each page table entry just pointing to a raw page frame + * number, and as far as the VM layer is concerned, those do not have + * pages associated with them - even if the PFN might point to memory + * that otherwise is perfectly fine and has a "struct page". + * + * The way we recognize those mappings is through the rules set up + * by "remap_pfn_range()": the vma will have the VM_PFNMAP bit set, + * and the vm_pgoff will point to the first PFN mapped: thus every + * page that is a raw mapping will always honor the rule + * + * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT) + * + * and if that isn't true, the page has been COW'ed (in which case it + * _does_ have a "struct page" associated with it even if it is in a + * VM_PFNMAP range). */ -static inline int page_is_anon(struct page *page, - struct vm_area_struct *vma, unsigned long addr) +struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte) { - return page && PageAnon(page) && page_mapped(page) && - page_address_in_vma(page, vma) == addr; + unsigned long pfn = pte_pfn(pte); + + if (vma->vm_flags & VM_PFNMAP) { + unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT; + if (pfn == vma->vm_pgoff + off) + return NULL; + } + + /* + * Add some anal sanity checks for now. Eventually, + * we should just do "return pfn_to_page(pfn)", but + * in the meantime we check that we get a valid pfn, + * and that the resulting page looks ok. + * + * Remove this test eventually! + */ + if (unlikely(!pfn_valid(pfn))) { + print_bad_pte(vma, pte, addr); + return NULL; + } + + /* + * NOTE! We still have PageReserved() pages in the page + * tables. + * + * The PAGE_ZERO() pages and various VDSO mappings can + * cause them to exist. 
+ */ + return pfn_to_page(pfn); } /* @@ -379,7 +416,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, unsigned long vm_flags = vma->vm_flags; pte_t pte = *src_pte; struct page *page; - unsigned long pfn; /* pte contains position in swap or file, so copy. */ if (unlikely(!pte_present(pte))) { @@ -397,22 +433,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, goto out_set_pte; } - pfn = pte_pfn(pte); - page = pfn_valid(pfn)? pfn_to_page(pfn): NULL; - - if (unlikely(vm_flags & VM_UNPAGED)) - if (!page_is_anon(page, vma, addr)) - goto out_set_pte; - - /* - * If the pte points outside of valid memory but - * the region is not VM_UNPAGED, we have a problem. - */ - if (unlikely(!page)) { - print_bad_pte(vma, pte, addr); - goto out_set_pte; /* try to do something sane */ - } - /* * If it's a COW mapping, write protect it both * in the parent and the child @@ -429,9 +449,13 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (vm_flags & VM_SHARED) pte = pte_mkclean(pte); pte = pte_mkold(pte); - get_page(page); - page_dup_rmap(page); - rss[!!PageAnon(page)]++; + + page = vm_normal_page(vma, addr, pte); + if (page) { + get_page(page); + page_dup_rmap(page); + rss[!!PageAnon(page)]++; + } out_set_pte: set_pte_at(dst_mm, addr, dst_pte, pte); @@ -543,7 +567,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, * readonly mappings. The tradeoff is that copy_page_range is more * efficient than faulting. */ - if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_UNPAGED))) { + if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP))) { if (!vma->anon_vma) return 0; } @@ -584,19 +608,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, } if (pte_present(ptent)) { struct page *page; - unsigned long pfn; (*zap_work) -= PAGE_SIZE; - pfn = pte_pfn(ptent); - page = pfn_valid(pfn)? 
pfn_to_page(pfn): NULL; - - if (unlikely(vma->vm_flags & VM_UNPAGED)) { - if (!page_is_anon(page, vma, addr)) - page = NULL; - } else if (unlikely(!page)) - print_bad_pte(vma, ptent, addr); - + page = vm_normal_page(vma, addr, ptent); if (unlikely(details) && page) { /* * unmap_shared_mapping_pages() wants to @@ -852,7 +867,7 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, /* * Do a quick page-table lookup for a single page. */ -struct page *follow_page(struct mm_struct *mm, unsigned long address, +struct page *follow_page(struct vm_area_struct *vma, unsigned long address, unsigned int flags) { pgd_t *pgd; @@ -860,8 +875,8 @@ struct page *follow_page(struct mm_struct *mm, unsigned long address, pmd_t *pmd; pte_t *ptep, pte; spinlock_t *ptl; - unsigned long pfn; struct page *page; + struct mm_struct *mm = vma->vm_mm; page = follow_huge_addr(mm, address, flags & FOLL_WRITE); if (!IS_ERR(page)) { @@ -897,11 +912,10 @@ struct page *follow_page(struct mm_struct *mm, unsigned long address, goto unlock; if ((flags & FOLL_WRITE) && !pte_write(pte)) goto unlock; - pfn = pte_pfn(pte); - if (!pfn_valid(pfn)) + page = vm_normal_page(vma, address, pte); + if (unlikely(!page)) goto unlock; - page = pfn_to_page(pfn); if (flags & FOLL_GET) get_page(page); if (flags & FOLL_TOUCH) { @@ -974,8 +988,10 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, return i ? 
: -EFAULT; } if (pages) { - pages[i] = pte_page(*pte); - get_page(pages[i]); + struct page *page = vm_normal_page(vma, start, *pte); + pages[i] = page; + if (page) + get_page(page); } pte_unmap(pte); if (vmas) @@ -1010,7 +1026,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, foll_flags |= FOLL_WRITE; cond_resched(); - while (!(page = follow_page(mm, start, foll_flags))) { + while (!(page = follow_page(vma, start, foll_flags))) { int ret; ret = __handle_mm_fault(mm, vma, start, foll_flags & FOLL_WRITE); @@ -1214,11 +1230,12 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, * in 2.6 the LRU scan won't even find its pages, so this * flag means no more than count its pages in reserved_vm, * and omit it from core dump, even when VM_IO turned off. - * VM_UNPAGED tells the core MM not to "manage" these pages - * (e.g. refcount, mapcount, try to swap them out): in - * particular, zap_pte_range does not try to free them. + * VM_PFNMAP tells the core MM that the base pages are just + * raw PFN mappings, and do not have a "struct page" associated + * with them. */ - vma->vm_flags |= VM_IO | VM_RESERVED | VM_UNPAGED; + vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; + vma->vm_pgoff = pfn; BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; @@ -1273,6 +1290,26 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) return pte; } +static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va) +{ + /* + * If the source page was a PFN mapping, we don't have + * a "struct page" for it. We do a best-effort copy by + * just copying from the original user address. If that + * fails, we just zero-fill it. Live with it. 
+ */ + if (unlikely(!src)) { + void *kaddr = kmap_atomic(dst, KM_USER0); + unsigned long left = __copy_from_user_inatomic(kaddr, (void __user *)va, PAGE_SIZE); + if (left) + memset(kaddr, 0, PAGE_SIZE); + kunmap_atomic(kaddr, KM_USER0); + return; + + } + copy_user_highpage(dst, src, va); +} + /* * This routine handles present pages, when users try to write * to a shared page. It is done by copying the page to a new address @@ -1296,28 +1333,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, spinlock_t *ptl, pte_t orig_pte) { struct page *old_page, *src_page, *new_page; - unsigned long pfn = pte_pfn(orig_pte); pte_t entry; int ret = VM_FAULT_MINOR; - if (unlikely(!pfn_valid(pfn))) { - /* - * Page table corrupted: show pte and kill process. - * Or it's an attempt to COW an out-of-map VM_UNPAGED - * entry, which copy_user_highpage does not support. - */ - print_bad_pte(vma, orig_pte, address); - ret = VM_FAULT_OOM; - goto unlock; - } - old_page = pfn_to_page(pfn); + old_page = vm_normal_page(vma, address, orig_pte); src_page = old_page; - - if (unlikely(vma->vm_flags & VM_UNPAGED)) - if (!page_is_anon(old_page, vma, address)) { - old_page = NULL; - goto gotten; - } + if (!old_page) + goto gotten; if (PageAnon(old_page) && !TestSetPageLocked(old_page)) { int reuse = can_share_swap_page(old_page); @@ -1351,7 +1373,7 @@ gotten: new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); if (!new_page) goto oom; - copy_user_highpage(new_page, src_page, address); + cow_user_page(new_page, src_page, address); } /* @@ -1812,16 +1834,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, spinlock_t *ptl; pte_t entry; - /* - * A VM_UNPAGED vma will normally be filled with present ptes - * by remap_pfn_range, and never arrive here; but it might have - * holes, or if !VM_DONTEXPAND, mremap might have expanded it. 
- * It's weird enough handling anon pages in unpaged vmas, we do - * not want to worry about ZERO_PAGEs too (it may or may not - * matter if their counts wrap): just give them anon pages. - */ - - if (write_access || (vma->vm_flags & VM_UNPAGED)) { + if (write_access) { /* Allocate our own private page. */ pte_unmap(page_table); @@ -1896,8 +1909,6 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, int anon = 0; pte_unmap(page_table); - BUG_ON(vma->vm_flags & VM_UNPAGED); - if (vma->vm_file) { mapping = vma->vm_file->f_mapping; sequence = mapping->truncate_count; @@ -1930,7 +1941,7 @@ retry: page = alloc_page_vma(GFP_HIGHUSER, vma, address); if (!page) goto oom; - copy_user_highpage(page, new_page, address); + cow_user_page(page, new_page, address); page_cache_release(new_page); new_page = page; anon = 1; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 5609a31bdf22..bec88c81244e 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -189,17 +189,15 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); do { - unsigned long pfn; + struct page *page; unsigned int nid; if (!pte_present(*pte)) continue; - pfn = pte_pfn(*pte); - if (!pfn_valid(pfn)) { - print_bad_pte(vma, *pte, addr); + page = vm_normal_page(vma, addr, *pte); + if (!page) continue; - } - nid = pfn_to_nid(pfn); + nid = page_to_nid(page); if (!node_isset(nid, *nodes)) break; } while (pte++, addr += PAGE_SIZE, addr != end); @@ -269,8 +267,6 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, first = find_vma(mm, start); if (!first) return ERR_PTR(-EFAULT); - if (first->vm_flags & VM_UNPAGED) - return ERR_PTR(-EACCES); prev = NULL; for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { if (!vma->vm_next && vma->vm_end < end) diff --git a/mm/msync.c b/mm/msync.c index b3f4caf3010b..1b5b6f662dcf 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -27,7 +27,6 @@ static void 
msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, again: pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); do { - unsigned long pfn; struct page *page; if (progress >= 64) { @@ -40,13 +39,9 @@ again: continue; if (!pte_maybe_dirty(*pte)) continue; - pfn = pte_pfn(*pte); - if (unlikely(!pfn_valid(pfn))) { - print_bad_pte(vma, *pte, addr); + page = vm_normal_page(vma, addr, *pte); + if (!page) continue; - } - page = pfn_to_page(pfn); - if (ptep_clear_flush_dirty(vma, addr, pte) || page_test_and_clear_dirty(page)) set_page_dirty(page); @@ -97,9 +92,8 @@ static void msync_page_range(struct vm_area_struct *vma, /* For hugepages we can't go walking the page table normally, * but that's ok, hugetlbfs is memory based, so we don't need * to do anything more on an msync(). - * Can't do anything with VM_UNPAGED regions either. */ - if (vma->vm_flags & (VM_HUGETLB|VM_UNPAGED)) + if (vma->vm_flags & VM_HUGETLB) return; BUG_ON(addr >= end); diff --git a/mm/nommu.c b/mm/nommu.c index 6deb6ab3d6ad..c1196812876b 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1045,7 +1045,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) EXPORT_SYMBOL(find_vma); -struct page *follow_page(struct mm_struct *mm, unsigned long address, +struct page *follow_page(struct vm_area_struct *vma, unsigned long address, unsigned int foll_flags) { return NULL; diff --git a/mm/rmap.c b/mm/rmap.c index 2e034a0b89ab..6389cda02a20 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -226,8 +226,6 @@ vma_address(struct page *page, struct vm_area_struct *vma) /* * At what user virtual address is page expected in vma? checking that the * page matches the vma: currently only used on anon pages, by unuse_vma; - * and by extraordinary checks on anon pages in VM_UNPAGED vmas, taking - * care that an mmap of /dev/mem might window free and foreign pages. 
*/ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) { @@ -614,7 +612,6 @@ static void try_to_unmap_cluster(unsigned long cursor, struct page *page; unsigned long address; unsigned long end; - unsigned long pfn; address = (vma->vm_start + cursor) & CLUSTER_MASK; end = address + CLUSTER_SIZE; @@ -643,15 +640,8 @@ static void try_to_unmap_cluster(unsigned long cursor, for (; address < end; pte++, address += PAGE_SIZE) { if (!pte_present(*pte)) continue; - - pfn = pte_pfn(*pte); - if (unlikely(!pfn_valid(pfn))) { - print_bad_pte(vma, *pte, address); - continue; - } - - page = pfn_to_page(pfn); - BUG_ON(PageAnon(page)); + page = vm_normal_page(vma, address, *pte); + BUG_ON(!page || PageAnon(page)); if (ptep_clear_flush_young(vma, address, pte)) continue; -- cgit v1.2.3 From 7729ac5efe156129d172784fedeaddb2167a1914 Mon Sep 17 00:00:00 2001 From: Oleg Drokin Date: Mon, 28 Nov 2005 13:43:53 -0800 Subject: [PATCH] reiserfs: fix 32-bit overflow in map_block_for_writepage() I now see another overflow in reiserfs that should lead to data corruptions with files that are bigger than 4G under certain circumstances when using mmap. 
Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 5f82352b97e1..0a044ad98885 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2194,7 +2194,7 @@ static int map_block_for_writepage(struct inode *inode, INITIALIZE_PATH(path); int pos_in_item; int jbegin_count = JOURNAL_PER_BALANCE_CNT; - loff_t byte_offset = (block << inode->i_sb->s_blocksize_bits) + 1; + loff_t byte_offset = ((loff_t)block << inode->i_sb->s_blocksize_bits)+1; int retval; int use_get_block = 0; int bytes_copied = 0; -- cgit v1.2.3 From aa877b3dc9f2a1fdffac4ea36bee97c21db11a69 Mon Sep 17 00:00:00 2001 From: Glauber de Oliveira Costa Date: Mon, 28 Nov 2005 13:44:02 -0800 Subject: [PATCH] ext3: Wrong return value for EXT3_IOC_GROUP_ADD This patch corrects the return value for the EXT3_IOC_GROUP_ADD in case it fails due to the presence of multiple resizers at the filesystem. The problem is a little bit more serious than a wrong return value in this case, since the clause err=0 in the exit_journal path will lead to a call to update_backups which in turns causes a NULL pointer dereference. Signed-off-by: Glauber de Oliveira Costa Cc: "Stephen C. 
Tweedie" Cc: Andreas Dilger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/resize.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index 1be78b4b4de9..6104ad310507 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -767,6 +767,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) if (input->group != EXT3_SB(sb)->s_groups_count) { ext3_warning(sb, __FUNCTION__, "multiple resizers run on filesystem!\n"); + err = -EBUSY; goto exit_journal; } -- cgit v1.2.3 From a93a117eaa0bec426d4671a49bfa96a6fdcd2ac9 Mon Sep 17 00:00:00 2001 From: Latchesar Ionkov Date: Mon, 28 Nov 2005 13:44:05 -0800 Subject: [PATCH] v9fs: fix memory leak in v9fs dentry code Assign the appropriate dentry operations to the dentry. Fixes memory leak. Signed-off-by: Latchesar Ionkov Cc: Eric Van Hensbergen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/vfs_inode.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index be7288184fa9..0ea965c3bb7d 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -427,6 +427,8 @@ v9fs_create(struct inode *dir, v9fs_mistat2inode(fcall->params.rstat.stat, file_inode, sb); kfree(fcall); + fcall = NULL; + file_dentry->d_op = &v9fs_dentry_operations; d_instantiate(file_dentry, file_inode); if (perm & V9FS_DMDIR) { -- cgit v1.2.3 From 154f484b92e5c25c400f6903512c511644a49322 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 28 Nov 2005 13:44:14 -0800 Subject: [PATCH] Fix oops in vfs_quotaon_mount() When quota file specified in mount options did not exist, we tried to dereference NULL pointer later. Fix it. 
Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dquot.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/dquot.c b/fs/dquot.c index 05b60283c9c2..2a62b3dc20ec 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -1513,10 +1513,16 @@ int vfs_quota_on_mount(struct super_block *sb, char *qf_name, if (IS_ERR(dentry)) return PTR_ERR(dentry); + if (!dentry->d_inode) { + error = -ENOENT; + goto out; + } + error = security_quota_on(dentry); if (!error) error = vfs_quota_on_inode(dentry->d_inode, type, format_id); +out: dput(dentry); return error; } -- cgit v1.2.3 From f007d5c961448170d0ec2998b1a80eef054b6235 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 28 Nov 2005 13:44:16 -0800 Subject: [PATCH] fuse: check directory aliasing in mkdir Check the created directory inode for aliases in the mkdir() method. Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dir.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index c045cc70c749..3a47247a889e 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -74,6 +74,19 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) return 1; } +static int dir_alias(struct inode *inode) +{ + if (S_ISDIR(inode->i_mode)) { + /* Don't allow creating an alias to a directory */ + struct dentry *alias = d_find_alias(inode); + if (alias) { + dput(alias); + return 1; + } + } + return 0; +} + static struct dentry_operations fuse_dentry_operations = { .d_revalidate = fuse_dentry_revalidate, }; @@ -263,7 +276,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req, fuse_put_request(fc, req); /* Don't allow userspace to do really stupid things... 
*/ - if ((inode->i_mode ^ mode) & S_IFMT) { + if (((inode->i_mode ^ mode) & S_IFMT) || dir_alias(inode)) { iput(inode); return -EIO; } @@ -874,14 +887,9 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, err = fuse_lookup_iget(dir, entry, &inode); if (err) return ERR_PTR(err); - if (inode && S_ISDIR(inode->i_mode)) { - /* Don't allow creating an alias to a directory */ - struct dentry *alias = d_find_alias(inode); - if (alias) { - dput(alias); - iput(inode); - return ERR_PTR(-EIO); - } + if (inode && dir_alias(inode)) { + iput(inode); + return ERR_PTR(-EIO); } d_add(entry, inode); return NULL; -- cgit v1.2.3 From 2827d0b23b7279d0a717eea4029efeef2e1b0183 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 28 Nov 2005 13:44:16 -0800 Subject: [PATCH] fuse: check for invalid node ID in fuse_create_open() Check for invalid node ID values in the new atomic create+open method. Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dir.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 3a47247a889e..51f5da652771 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -87,6 +87,11 @@ static int dir_alias(struct inode *inode) return 0; } +static inline int invalid_nodeid(u64 nodeid) +{ + return !nodeid || nodeid == FUSE_ROOT_ID; +} + static struct dentry_operations fuse_dentry_operations = { .d_revalidate = fuse_dentry_revalidate, }; @@ -110,7 +115,7 @@ static int fuse_lookup_iget(struct inode *dir, struct dentry *entry, fuse_lookup_init(req, dir, entry, &outarg); request_send(fc, req); err = req->out.h.error; - if (!err && (!outarg.nodeid || outarg.nodeid == FUSE_ROOT_ID)) + if (!err && invalid_nodeid(outarg.nodeid)) err = -EIO; if (!err) { inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, @@ -206,7 +211,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, } err = -EIO; - if 
(!S_ISREG(outentry.attr.mode)) + if (!S_ISREG(outentry.attr.mode) || invalid_nodeid(outentry.nodeid)) goto out_free_ff; inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation, @@ -263,7 +268,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req, fuse_put_request(fc, req); return err; } - if (!outarg.nodeid || outarg.nodeid == FUSE_ROOT_ID) { + if (invalid_nodeid(outarg.nodeid)) { fuse_put_request(fc, req); return -EIO; } -- cgit v1.2.3 From 21eeb7aa116b1f59fc23339521173cbb13e57f1a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 29 Nov 2005 16:57:17 +0100 Subject: [JFFS2] Fix the slab cache constructor of 'struct jffs2_inode_info' objects. JFFS2 initialize f->sem mutex as "locked" in the slab constructor which is a bug. Objects are freed with unlocked f->sem mutex. So, when they allocated again, f->sem is unlocked because the slab cache constructor is not called for them. The constructor is called only once when memory pages are allocated for objects (namely, when the slab layer allocates new slabs). So, sometimes 'struct jffs2_inode_info' are allocated with unlocked f->sem, sometimes with locked. This is a bug. Instead, initialize f->sem as unlocked in the constructor. I.e., in the "constructed" state f->sem must be unlocked. From: Keijiro Yano Acked-by: Artem B. 
Bityutskiy Signed-off-by: Thomas Gleixner --- fs/jffs2/fs.c | 2 ++ fs/jffs2/super.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 543420665c5b..d0fcc5f3497e 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -234,6 +234,7 @@ void jffs2_read_inode (struct inode *inode) c = JFFS2_SB_INFO(inode->i_sb); jffs2_init_inode_info(f); + down(&f->sem); ret = jffs2_do_read_inode(c, f, inode->i_ino, &latest_node); @@ -400,6 +401,7 @@ struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_i f = JFFS2_INODE_INFO(inode); jffs2_init_inode_info(f); + down(&f->sem); memset(ri, 0, sizeof(*ri)); /* Set OS-specific defaults for new inodes */ diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 9e0b5458d9c0..93883817cbd0 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -51,7 +51,7 @@ static void jffs2_i_init_once(void * foo, kmem_cache_t * cachep, unsigned long f if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == SLAB_CTOR_CONSTRUCTOR) { - init_MUTEX_LOCKED(&ei->sem); + init_MUTEX(&ei->sem); inode_init_once(&ei->vfs_inode); } } -- cgit v1.2.3 From c9cfcddfd65735437a4cb8563d6b66a6da8a5ed6 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 29 Nov 2005 14:03:14 -0800 Subject: VM: add common helper function to create the page tables This logic was duplicated four times, for no good reason. 
Signed-off-by: Linus Torvalds --- fs/exec.c | 12 +----------- include/linux/mm.h | 2 ++ mm/fremap.c | 24 ++---------------------- mm/memory.c | 26 ++++++++++++++------------ 4 files changed, 19 insertions(+), 45 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 1f8a9fd2c9ed..22533cce0611 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -306,9 +306,6 @@ void install_arg_page(struct vm_area_struct *vma, struct page *page, unsigned long address) { struct mm_struct *mm = vma->vm_mm; - pgd_t * pgd; - pud_t * pud; - pmd_t * pmd; pte_t * pte; spinlock_t *ptl; @@ -316,14 +313,7 @@ void install_arg_page(struct vm_area_struct *vma, goto out; flush_dcache_page(page); - pgd = pgd_offset(mm, address); - pud = pud_alloc(mm, pgd, address); - if (!pud) - goto out; - pmd = pmd_alloc(mm, pud, address); - if (!pmd) - goto out; - pte = pte_alloc_map_lock(mm, pmd, address, &ptl); + pte = get_locked_pte(mm, address, &ptl); if (!pte) goto out; if (!pte_none(*pte)) { diff --git a/include/linux/mm.h b/include/linux/mm.h index 74f90d7eb5ef..0e73f1539d08 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -742,6 +742,8 @@ struct shrinker; extern struct shrinker *set_shrinker(int, shrinker_t); extern void remove_shrinker(struct shrinker *shrinker); +extern pte_t *FASTCALL(get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl)); + int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address); int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address); diff --git a/mm/fremap.c b/mm/fremap.c index f851775e09c2..9f381e58bf44 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -55,20 +55,10 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma, pgoff_t size; int err = -ENOMEM; pte_t *pte; - pmd_t *pmd; - pud_t *pud; - pgd_t *pgd; pte_t pte_val; spinlock_t *ptl; - pgd = pgd_offset(mm, addr); - pud = pud_alloc(mm, pgd, addr); - if (!pud) - goto out; - 
pmd = pmd_alloc(mm, pud, addr); - if (!pmd) - goto out; - pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); + pte = get_locked_pte(mm, addr, &ptl); if (!pte) goto out; @@ -110,20 +100,10 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, { int err = -ENOMEM; pte_t *pte; - pmd_t *pmd; - pud_t *pud; - pgd_t *pgd; pte_t pte_val; spinlock_t *ptl; - pgd = pgd_offset(mm, addr); - pud = pud_alloc(mm, pgd, addr); - if (!pud) - goto out; - pmd = pmd_alloc(mm, pud, addr); - if (!pmd) - goto out; - pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); + pte = get_locked_pte(mm, addr, &ptl); if (!pte) goto out; diff --git a/mm/memory.c b/mm/memory.c index 990e7dc666f8..74f95ae0510b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1146,6 +1146,18 @@ int zeromap_page_range(struct vm_area_struct *vma, return err; } +pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) +{ + pgd_t * pgd = pgd_offset(mm, addr); + pud_t * pud = pud_alloc(mm, pgd, addr); + if (pud) { + pmd_t * pmd = pmd_alloc(mm, pud, addr); + if (pmd) + return pte_alloc_map_lock(mm, pmd, addr, ptl); + } + return NULL; +} + /* * This is the old fallback for page remapping. 
* @@ -1156,10 +1168,7 @@ int zeromap_page_range(struct vm_area_struct *vma, static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *page, pgprot_t prot) { int retval; - pgd_t * pgd; - pud_t * pud; - pmd_t * pmd; - pte_t * pte; + pte_t *pte; spinlock_t *ptl; retval = -EINVAL; @@ -1167,14 +1176,7 @@ static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *pa goto out; retval = -ENOMEM; flush_dcache_page(page); - pgd = pgd_offset(mm, addr); - pud = pud_alloc(mm, pgd, addr); - if (!pud) - goto out; - pmd = pmd_alloc(mm, pud, addr); - if (!pmd) - goto out; - pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); + pte = get_locked_pte(mm, addr, &ptl); if (!pte) goto out; retval = -EBUSY; -- cgit v1.2.3 From 576f6d79564d0d2c1f43088e6805674d2e122935 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Tue, 29 Nov 2005 19:34:39 -0800 Subject: [PATCH] reiserfs: handle cnode allocation failure gracefully If an external device is used for a journal, by default it will use the entire device. The reiserfs journal code allocates structures per journal block when it mounts the file system. If the journal device is too large, and memory cannot be allocated for the structures, it will continue and ultimately panic when it can't pull one off the free list. This patch handles the allocation failure gracefully and prints an error message at mount time. Changes: Updated error message to be more descriptive to the user. Discussed and approved on ReiserFS Mailing List, Nov 28. 
Signed-off-by: Jeff Mahoney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/journal.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 4b15761434bc..68b7b78638ff 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2757,6 +2757,15 @@ int journal_init(struct super_block *p_s_sb, const char *j_dev_name, journal->j_cnode_used = 0; journal->j_must_wait = 0; + if (journal->j_cnode_free == 0) { + reiserfs_warning(p_s_sb, "journal-2004: Journal cnode memory " + "allocation failed (%ld bytes). Journal is " + "too large for available memory. Usually " + "this is due to a journal that is too large.", + sizeof (struct reiserfs_journal_cnode) * num_cnodes); + goto free_and_return; + } + init_journal_hash(p_s_sb); jl = journal->j_current_jl; jl->j_list_bitmap = get_list_bitmap(p_s_sb, jl); -- cgit v1.2.3 From b0b623c3b22d57d6941b200321779d56c4e79e6b Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Tue, 29 Nov 2005 19:34:41 -0800 Subject: [PATCH] hfsplus: don't modify journaled volume Access to a journaled HFS+ volume is not officially supported under Linux, so mount such a volume read-only, but users can override this behaviour using the "force" mount option. The minimum requirement to relax this check is to at least check that the journal is empty and so nothing needs to be replayed to make sure the volume is consistent. 
Signed-off-by: Roman Zippel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfsplus/hfsplus_fs.h | 1 + fs/hfsplus/hfsplus_raw.h | 12 +++++++----- fs/hfsplus/options.c | 6 +++++- fs/hfsplus/super.c | 20 ++++++++++++++++++++ 4 files changed, 33 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index c60e5635498d..df16fcbff3fb 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -151,6 +151,7 @@ struct hfsplus_sb_info { #define HFSPLUS_SB_WRITEBACKUP 0x0001 #define HFSPLUS_SB_NODECOMPOSE 0x0002 +#define HFSPLUS_SB_FORCE 0x0004 struct hfsplus_inode_info { diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h index 5bad37cfdb29..b4fbed633219 100644 --- a/fs/hfsplus/hfsplus_raw.h +++ b/fs/hfsplus/hfsplus_raw.h @@ -123,11 +123,13 @@ struct hfsplus_vh { } __packed; /* HFS+ volume attributes */ -#define HFSPLUS_VOL_UNMNT (1 << 8) -#define HFSPLUS_VOL_SPARE_BLK (1 << 9) -#define HFSPLUS_VOL_NOCACHE (1 << 10) -#define HFSPLUS_VOL_INCNSTNT (1 << 11) -#define HFSPLUS_VOL_SOFTLOCK (1 << 15) +#define HFSPLUS_VOL_UNMNT (1 << 8) +#define HFSPLUS_VOL_SPARE_BLK (1 << 9) +#define HFSPLUS_VOL_NOCACHE (1 << 10) +#define HFSPLUS_VOL_INCNSTNT (1 << 11) +#define HFSPLUS_VOL_NODEID_REUSED (1 << 12) +#define HFSPLUS_VOL_JOURNALED (1 << 13) +#define HFSPLUS_VOL_SOFTLOCK (1 << 15) /* HFS+ BTree node descriptor */ struct hfs_bnode_desc { diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c index cca0818aa4ca..935dafba0078 100644 --- a/fs/hfsplus/options.c +++ b/fs/hfsplus/options.c @@ -22,7 +22,7 @@ enum { opt_umask, opt_uid, opt_gid, opt_part, opt_session, opt_nls, opt_nodecompose, opt_decompose, - opt_err + opt_force, opt_err }; static match_table_t tokens = { @@ -36,6 +36,7 @@ static match_table_t tokens = { { opt_nls, "nls=%s" }, { opt_decompose, "decompose" }, { opt_nodecompose, "nodecompose" }, + { opt_force, "force" }, { opt_err, NULL } }; @@ -145,6 +146,9 @@ int 
hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi) case opt_nodecompose: sbi->flags |= HFSPLUS_SB_NODECOMPOSE; break; + case opt_force: + sbi->flags |= HFSPLUS_SB_FORCE; + break; default: return 0; } diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 0ce1c455ae55..8093351bd7c3 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -251,16 +251,28 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data) return 0; if (!(*flags & MS_RDONLY)) { struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr; + struct hfsplus_sb_info sbi; + + memset(&sbi, 0, sizeof(struct hfsplus_sb_info)); + sbi.nls = HFSPLUS_SB(sb).nls; + if (!hfsplus_parse_options(data, &sbi)) + return -EINVAL; if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) { printk("HFS+-fs warning: Filesystem was not cleanly unmounted, " "running fsck.hfsplus is recommended. leaving read-only.\n"); sb->s_flags |= MS_RDONLY; *flags |= MS_RDONLY; + } else if (sbi.flags & HFSPLUS_SB_FORCE) { + /* nothing */ } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { printk("HFS+-fs: Filesystem is marked locked, leaving read-only.\n"); sb->s_flags |= MS_RDONLY; *flags |= MS_RDONLY; + } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) { + printk("HFS+-fs: Filesystem is marked journaled, leaving read-only.\n"); + sb->s_flags |= MS_RDONLY; + *flags |= MS_RDONLY; } } return 0; @@ -352,11 +364,19 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) printk("HFS+-fs warning: Filesystem was not cleanly unmounted, " "running fsck.hfsplus is recommended. 
mounting read-only.\n"); sb->s_flags |= MS_RDONLY; + } else if (sbi->flags & HFSPLUS_SB_FORCE) { + /* nothing */ } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { if (!silent) printk("HFS+-fs: Filesystem is marked locked, mounting read-only.\n"); sb->s_flags |= MS_RDONLY; + } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) { + if (!silent) + printk("HFS+-fs: write access to a jounaled filesystem is not supported, " + "use the force option at your own risk, mounting read-only.\n"); + sb->s_flags |= MS_RDONLY; } + sbi->flags &= ~HFSPLUS_SB_FORCE; /* Load metadata objects (B*Trees) */ HFSPLUS_SB(sb).ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID); -- cgit v1.2.3 From 6473a559c336d5c407f9df412ca2f55357767ff8 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 29 Nov 2005 20:20:10 -0800 Subject: [CIFS] Fix missing permission check on setattr when noperm mount option is disabled. Also set mode, uid, gid better on mkdir and create for the case when Unix Extensions is not enabled and setuids is enabled. This is necessary to fix the hole in which chown could be allowed for non-root users in some cases if root mounted, and also to display the mode and uid properly in some cases. Signed-off-by: Steve French --- fs/cifs/CHANGES | 2 +- fs/cifs/README | 30 ++++++++++++++++++++++++------ fs/cifs/TODO | 4 ++-- fs/cifs/dir.c | 9 ++++++++- fs/cifs/inode.c | 28 +++++++++++++++++++++++++--- 5 files changed, 60 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index c40bd0df80ad..943ef9b82244 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -6,7 +6,7 @@ stress on writes). Modify cifs Kconfig options to expose CONFIG_CIFS_STATS2 Fix SFU style symlinks and mknod needed for servers which do not support the CIFS Unix Extensions. Fix setfacl/getfacl on bigendian. Timeout negative dentries so files that the client sees as deleted but that later get created -on the server will be recognized. 
+on the server will be recognized. Add client side permission check on setattr. Version 1.38 ------------ diff --git a/fs/cifs/README b/fs/cifs/README index bb90941826ad..e5d09a2fc7a5 100644 --- a/fs/cifs/README +++ b/fs/cifs/README @@ -278,7 +278,9 @@ A partial list of the supported mount options follows: (such as Windows), permissions can also be checked at the client, and a crude form of client side permission checking can be enabled by specifying file_mode and dir_mode on - the client + the client. Note that the mount.cifs helper must be + at version 1.10 or higher to support specifying the uid + (or gid) in non-numeric form. gid If CIFS Unix extensions are not supported by the server this overrides the default gid for inodes. file_mode If CIFS Unix extensions are not supported by the server @@ -345,7 +347,10 @@ A partial list of the supported mount options follows: client system. It is typically only needed when the server supports the CIFS Unix Extensions but the UIDs/GIDs on the client and server system do not match closely enough to allow - access by the user doing the mount. + access by the user doing the mount, but it may be useful with + non CIFS Unix Extension mounts for cases in which the default + mode is specified on the mount but is not to be enforced on the + client (e.g. perhaps when MultiUserMount is enabled) Note that this does not affect the normal ACL check on the target machine done by the server software (of the server ACL against the user name provided at mount time). @@ -368,15 +373,21 @@ A partial list of the supported mount options follows: setuids If the CIFS Unix extensions are negotiated with the server the client will attempt to set the effective uid and gid of the local process on newly created files, directories, and - devices (create, mkdir, mknod). + devices (create, mkdir, mknod). 
If the CIFS Unix Extensions + are not negotiated, for newly created files and directories + instead of using the default uid and gid specified on the + mount, cache the new file's uid and gid locally which means + that the uid for the file can change when the inode is + reloaded (or the user remounts the share). nosetuids The client will not attempt to set the uid and gid on on newly created files, directories, and devices (create, mkdir, mknod) which will result in the server setting the uid and gid to the default (usually the server uid of the user who mounted the share). Letting the server (rather than - the client) set the uid and gid is the default. This - parameter has no effect if the CIFS Unix Extensions are not - negotiated. + the client) set the uid and gid is the default. If the CIFS + Unix Extensions are not negotiated then the uid and gid for + new files will appear to be the uid (gid) of the mounter or the + uid (gid) parameter specified on the mount. netbiosname When mounting to servers via port 139, specifies the RFC1001 source name to use to represent the client netbios machine name when doing the RFC1001 netbios session initialize. @@ -418,6 +429,13 @@ A partial list of the supported mount options follows: byte range locks). remount remount the share (often used to change from ro to rw mounts or vice versa) + sfu When the CIFS Unix Extensions are not negotiated, attempt to + create device files and fifos in a format compatible with + Services for Unix (SFU). In addition retrieve bits 10-12 + of the mode via the SETFILEBITS extended attribute (as + SFU does). In the future the bottom 9 bits of the mode + also will be emulated using queries of the security + descriptor (ACL). 
The mount.cifs mount helper also accepts a few mount options before -o including: diff --git a/fs/cifs/TODO b/fs/cifs/TODO index c909298d11ed..fc34c74ec4be 100644 --- a/fs/cifs/TODO +++ b/fs/cifs/TODO @@ -1,4 +1,4 @@ -version 1.37 October 9, 2005 +Version 1.39 November 30, 2005 A Partial List of Missing Features ================================== @@ -58,7 +58,7 @@ o) Improve performance of readpages by sending more than one read at a time when 8 pages or more are requested. In conjuntion add support for async_cifs_readpages. -p) Add support for storing symlink and fifo info to Windows servers +p) Add support for storing symlink info to Windows servers in the Extended Attribute format their SFU clients would recognize. q) Finish fcntl D_NOTIFY support so kde and gnome file list windows diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index aa4ea965b329..32cc96cafa3e 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -228,8 +228,15 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, else { rc = cifs_get_inode_info(&newinode, full_path, buf, inode->i_sb,xid); - if(newinode) + if(newinode) { newinode->i_mode = mode; + if((oplock & CIFS_CREATE_ACTION) && + (cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_SET_UID)) { + newinode->i_uid = current->fsuid; + newinode->i_gid = current->fsgid; + } + } } if (rc != 0) { diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index d34325c887c4..053c1cadf703 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -710,7 +710,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode) char *full_path = NULL; struct inode *newinode = NULL; - cFYI(1, ("In cifs_mkdir, mode = 0x%x inode = 0x%p ", mode, inode)); + cFYI(1, ("In cifs_mkdir, mode = 0x%x inode = 0x%p", mode, inode)); xid = GetXid(); @@ -768,7 +768,16 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode) /* BB to be implemented via Windows secrty descriptors eg CIFSSMBWinSetPerms(xid, pTcon, full_path, mode, -1, -1, local_nls); */ - } + 
if(direntry->d_inode) { + direntry->d_inode->i_mode = mode; + if(cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_SET_UID) { + direntry->d_inode->i_uid = + current->fsuid; + direntry->d_inode->i_gid = + current->fsgid; + } + } } kfree(full_path); FreeXid(xid); @@ -1111,9 +1120,20 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) cFYI(1, ("In cifs_setattr, name = %s attrs->iavalid 0x%x ", direntry->d_name.name, attrs->ia_valid)); + cifs_sb = CIFS_SB(direntry->d_inode->i_sb); pTcon = cifs_sb->tcon; + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM == 0) { + /* check if we have permission to change attrs */ + rc = inode_change_ok(direntry->d_inode, attrs); + if(rc < 0) { + FreeXid(xid); + return rc; + } else + rc = 0; + } + down(&direntry->d_sb->s_vfs_rename_sem); full_path = build_path_from_dentry(direntry); up(&direntry->d_sb->s_vfs_rename_sem); @@ -1153,7 +1173,9 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) 1 /* 45 seconds */); cFYI(1,("Wrt seteof rc %d", rc)); } - } + } else + rc = -EINVAL; + if (rc != 0) { /* Set file size by pathname rather than by handle either because no valid, writeable file handle for -- cgit v1.2.3 From 6ab16d249513a50bef3f1b275cea6aa8d3f51832 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 29 Nov 2005 20:55:11 -0800 Subject: [CIFS] Fix umount --force to wake up the pending response queue, not just the request queue. Also periodically wakeup response_q so threads can check if stuck requests have timed out. Workaround Windows server illegal smb length on transact2 findfirst response. 
Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 23 ++++++++++++++++++++++- fs/cifs/cifssmb.c | 25 +++++++++++++++++++++++++ fs/cifs/misc.c | 17 ++++++++++++----- fs/cifs/netmisc.c | 4 ++-- fs/cifs/transport.c | 1 + 5 files changed, 62 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 51548ed2e9cc..f4974b41e485 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "cifsfs.h" #include "cifspdu.h" #define DECLARE_GLOBALS_HERE @@ -429,6 +430,11 @@ static void cifs_umount_begin(struct super_block * sblock) { cFYI(1,("wake up tasks now - umount begin not complete")); wake_up_all(&tcon->ses->server->request_q); + wake_up_all(&tcon->ses->server->response_q); + msleep(1); /* yield */ + /* we have to kick the requests once more */ + wake_up_all(&tcon->ses->server->response_q); + msleep(1); } /* BB FIXME - finish add checks for tidStatus BB */ @@ -895,6 +901,9 @@ static int cifs_oplock_thread(void * dummyarg) static int cifs_dnotify_thread(void * dummyarg) { + struct list_head *tmp; + struct cifsSesInfo *ses; + daemonize("cifsdnotifyd"); allow_signal(SIGTERM); @@ -903,7 +912,19 @@ static int cifs_dnotify_thread(void * dummyarg) if(try_to_freeze()) continue; set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(39*HZ); + schedule_timeout(15*HZ); + read_lock(&GlobalSMBSeslock); + /* check if any stuck requests that need + to be woken up and wakeq so the + thread can wake up and error out */ + list_for_each(tmp, &GlobalSMBSessionList) { + ses = list_entry(tmp, struct cifsSesInfo, + cifsSessionList); + if(ses && ses->server && + atomic_read(&ses->server->inSend)) + wake_up_all(&ses->server->response_q); + } + read_unlock(&GlobalSMBSeslock); } while(!signal_pending(current)); complete_and_exit (&cifs_dnotify_exited, 0); } diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index d179b0c3eee4..6867e556d37e 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c 
@@ -90,6 +90,18 @@ small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, check for tcp and smb session status done differently for those three - in the calling routine */ if(tcon) { + if(tcon->tidStatus == CifsExiting) { + /* only tree disconnect, open, and write, + (and ulogoff which does not have tcon) + are allowed as we start force umount */ + if((smb_command != SMB_COM_WRITE_ANDX) && + (smb_command != SMB_COM_OPEN_ANDX) && + (smb_command != SMB_COM_TREE_DISCONNECT)) { + cFYI(1,("can not send cmd %d while umounting", + smb_command)); + return -ENODEV; + } + } if((tcon->ses) && (tcon->ses->status != CifsExiting) && (tcon->ses->server)){ struct nls_table *nls_codepage; @@ -187,6 +199,19 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, check for tcp and smb session status done differently for those three - in the calling routine */ if(tcon) { + if(tcon->tidStatus == CifsExiting) { + /* only tree disconnect, open, and write, + (and ulogoff which does not have tcon) + are allowed as we start force umount */ + if((smb_command != SMB_COM_WRITE_ANDX) && + (smb_command != SMB_COM_OPEN_ANDX) && + (smb_command != SMB_COM_TREE_DISCONNECT)) { + cFYI(1,("can not send cmd %d while umounting", + smb_command)); + return -ENODEV; + } + } + if((tcon->ses) && (tcon->ses->status != CifsExiting) && (tcon->ses->server)){ struct nls_table *nls_codepage; diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index ca27a82c54cd..94baf6c8ecbd 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -397,12 +397,12 @@ checkSMBhdr(struct smb_hdr *smb, __u16 mid) if(smb->Command == SMB_COM_LOCKING_ANDX) return 0; else - cERROR(1, ("Rcvd Request not response ")); + cERROR(1, ("Rcvd Request not response")); } } else { /* bad signature or mid */ if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff)) cERROR(1, - ("Bad protocol string signature header %x ", + ("Bad protocol string signature header %x", *(unsigned int *) smb->Protocol)); if (mid != smb->Mid) cERROR(1, ("Mids do 
not match")); @@ -417,7 +417,7 @@ checkSMB(struct smb_hdr *smb, __u16 mid, int length) __u32 len = smb->smb_buf_length; __u32 clc_len; /* calculated length */ cFYI(0, - ("Entering checkSMB with Length: %x, smb_buf_length: %x ", + ("Entering checkSMB with Length: %x, smb_buf_length: %x", length, len)); if (((unsigned int)length < 2 + sizeof (struct smb_hdr)) || (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4)) { @@ -451,9 +451,16 @@ checkSMB(struct smb_hdr *smb, __u16 mid, int length) cERROR(1, ("bad smb size detected for Mid=%d", smb->Mid)); /* Windows XP can return a few bytes too much, presumably an illegal pad, at the end of byte range lock responses - so we allow for up to eight byte pad, as long as actual + so we allow for that three byte pad, as long as actual received length is as long or longer than calculated length */ - if((4+len > clc_len) && (len <= clc_len + 3)) + /* We have now had to extend this more, since there is a + case in which it needs to be bigger still to handle a + malformed response to transact2 findfirst from WinXP when + access denied is returned and thus bcc and wct are zero + but server says length is 0x21 bytes too long as if the server + forget to reset the smb rfc1001 length when it reset the + wct and bcc to minimum size and drop the t2 parms and data */ + if((4+len > clc_len) && (len <= clc_len + 512)) return 0; else return 1; diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index f7814689844b..5de74d216fdd 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -330,7 +330,7 @@ static const struct { ERRHRD, ERRgeneral, NT_STATUS_ACCOUNT_RESTRICTION}, { ERRSRV, 2241, NT_STATUS_INVALID_LOGON_HOURS}, { ERRSRV, 2240, NT_STATUS_INVALID_WORKSTATION}, { - ERRSRV, 2242, NT_STATUS_PASSWORD_EXPIRED}, { + ERRSRV, ERRpasswordExpired, NT_STATUS_PASSWORD_EXPIRED}, { ERRSRV, 2239, NT_STATUS_ACCOUNT_DISABLED}, { ERRHRD, ERRgeneral, NT_STATUS_NONE_MAPPED}, { ERRHRD, ERRgeneral, NT_STATUS_TOO_MANY_LUIDS_REQUESTED}, { @@ -676,7 +676,7 @@ static 
const struct { ERRDOS, 193, NT_STATUS_IMAGE_CHECKSUM_MISMATCH}, { ERRHRD, ERRgeneral, NT_STATUS_LOST_WRITEBEHIND_DATA}, { ERRHRD, ERRgeneral, NT_STATUS_CLIENT_SERVER_PARAMETERS_INVALID}, { - ERRSRV, 2242, NT_STATUS_PASSWORD_MUST_CHANGE}, { + ERRSRV, ERRpasswordExpired, NT_STATUS_PASSWORD_MUST_CHANGE}, { ERRHRD, ERRgeneral, NT_STATUS_NOT_FOUND}, { ERRHRD, ERRgeneral, NT_STATUS_NOT_TINY_STREAM}, { ERRHRD, ERRgeneral, NT_STATUS_RECOVERY_FAILURE}, { diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 41a9659c16bc..f8871196098c 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -515,6 +515,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, *pbytes_returned = in_buf->smb_buf_length; /* BB special case reconnect tid and uid here? */ + /* BB special case Errbadpassword and pwdexpired here */ rc = map_smb_to_linux_error(in_buf); /* convert ByteCount if necessary */ -- cgit v1.2.3 From 2a138ebb012ac42c082ae8b40c87c1f265664391 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 29 Nov 2005 21:22:19 -0800 Subject: [CIFS] Missing parenthesis and typo in previous fix Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 2 +- fs/cifs/inode.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index f4974b41e485..2a13a2bac8f1 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -921,7 +921,7 @@ static int cifs_dnotify_thread(void * dummyarg) ses = list_entry(tmp, struct cifsSesInfo, cifsSessionList); if(ses && ses->server && - atomic_read(&ses->server->inSend)) + atomic_read(&ses->server->inFlight)) wake_up_all(&ses->server->response_q); } read_unlock(&GlobalSMBSeslock); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 053c1cadf703..e8773461c7f7 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -778,6 +778,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode) current->fsgid; } } + } } kfree(full_path); FreeXid(xid); @@ -1124,7 +1125,7 @@ int 
cifs_setattr(struct dentry *direntry, struct iattr *attrs) cifs_sb = CIFS_SB(direntry->d_inode->i_sb); pTcon = cifs_sb->tcon; - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM == 0) { + if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) == 0) { /* check if we have permission to change attrs */ rc = inode_change_ok(direntry->d_inode, attrs); if(rc < 0) { -- cgit v1.2.3 From 25741b3e43151bc207dd2b850b0bb157c442682b Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 29 Nov 2005 22:38:43 -0800 Subject: [CIFS] For previous fix, mode on mkdir needed S_IFDIR left out. Signed-off-by: Steve French --- fs/cifs/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index e8773461c7f7..411c1f7f84da 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -770,6 +770,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode) -1, -1, local_nls); */ if(direntry->d_inode) { direntry->d_inode->i_mode = mode; + direntry->d_inode->i_mode |= S_IFDIR; if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { direntry->d_inode->i_uid = -- cgit v1.2.3 From 223db122bfccd463751d8b0c09a638abee03681d Mon Sep 17 00:00:00 2001 From: Steve Dickson Date: Wed, 30 Nov 2005 09:25:33 -0500 Subject: NFS: Fix cache consistency regression Make sure cache_change_attribute is initialized to jiffies so when the mtime changes on directory, the directory will be refreshed. 
Signed-off-by: Steve Dickson Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index aaab1a5ac461..b551b19aa6e7 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -2066,6 +2066,7 @@ static struct inode *nfs_alloc_inode(struct super_block *sb) return NULL; nfsi->flags = 0UL; nfsi->cache_validity = 0UL; + nfsi->cache_change_attribute = jiffies; #ifdef CONFIG_NFS_V3_ACL nfsi->acl_access = ERR_PTR(-EAGAIN); nfsi->acl_default = ERR_PTR(-EAGAIN); -- cgit v1.2.3 From 24aa1fe6779eaddb3e0b1b802585dcf6faf9cc44 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 3 Dec 2005 15:20:07 -0500 Subject: NFS: Fix a few further cache consistency regressions Steve Dickson writes: Doing the following: 1. On server: $ mkdir ~/t $ echo Hello > ~/t/tmp 2. On client, wait for a string to appear in this file: $ until grep -q foo t/tmp ; do echo -n . ; sleep 1 ; done 3. On server, create a *new* file with the same name containing that string: $ mv ~/t/tmp ~/t/tmp.old; echo foo > ~/t/tmp will show how the client will never (and I mean never ;-) ) see the updated file. The problem is that we do not update nfsi->cache_change_attribute when the file changes on the server (we only update it when our client makes the changes). This again means that functions like nfs_check_verifier() will fail to register when the parent directory has changed and should trigger a dentry lookup revalidation. 
Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 54 ++++++++++++++++++++---------------------------------- 1 file changed, 20 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index b551b19aa6e7..afd75d0463fd 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -54,7 +54,7 @@ #define NFS_MAX_READAHEAD (RPC_DEF_SLOT_TABLE - 1) static void nfs_invalidate_inode(struct inode *); -static int nfs_update_inode(struct inode *, struct nfs_fattr *, unsigned long); +static int nfs_update_inode(struct inode *, struct nfs_fattr *); static struct inode *nfs_alloc_inode(struct super_block *sb); static void nfs_destroy_inode(struct inode *); @@ -1080,8 +1080,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) int status = -ESTALE; struct nfs_fattr fattr; struct nfs_inode *nfsi = NFS_I(inode); - unsigned long verifier; - unsigned long cache_validity; dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n", inode->i_sb->s_id, (long long)NFS_FILEID(inode)); @@ -1106,8 +1104,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) } } - /* Protect against RPC races by saving the change attribute */ - verifier = nfs_save_change_attribute(inode); status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr); if (status != 0) { dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n", @@ -1122,7 +1118,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) } spin_lock(&inode->i_lock); - status = nfs_update_inode(inode, &fattr, verifier); + status = nfs_update_inode(inode, &fattr); if (status) { spin_unlock(&inode->i_lock); dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n", @@ -1130,20 +1126,11 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) (long long)NFS_FILEID(inode), status); goto out; } - cache_validity = nfsi->cache_validity; - nfsi->cache_validity &= ~NFS_INO_REVAL_PAGECACHE; - - /* - * We may need 
to keep the attributes marked as invalid if - * we raced with nfs_end_attr_update(). - */ - if (time_after_eq(verifier, nfsi->cache_change_attribute)) - nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME); spin_unlock(&inode->i_lock); nfs_revalidate_mapping(inode, inode->i_mapping); - if (cache_validity & NFS_INO_INVALID_ACL) + if (nfsi->cache_validity & NFS_INO_INVALID_ACL) nfs_zap_acl_cache(inode); dfprintk(PAGECACHE, "NFS: (%s/%Ld) revalidation complete\n", @@ -1346,10 +1333,8 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) return 0; spin_lock(&inode->i_lock); nfsi->cache_validity &= ~NFS_INO_REVAL_PAGECACHE; - if (nfs_verify_change_attribute(inode, fattr->time_start)) - nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME); if (time_after(fattr->time_start, nfsi->last_updated)) - status = nfs_update_inode(inode, fattr, fattr->time_start); + status = nfs_update_inode(inode, fattr); else status = nfs_check_inode_attributes(inode, fattr); @@ -1375,10 +1360,7 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfsi->cache_validity |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS; goto out; } - status = nfs_update_inode(inode, fattr, fattr->time_start); - if (time_after_eq(fattr->time_start, nfsi->cache_change_attribute)) - nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME|NFS_INO_REVAL_PAGECACHE); - nfsi->cache_change_attribute = jiffies; + status = nfs_update_inode(inode, fattr); out: spin_unlock(&inode->i_lock); return status; @@ -1396,12 +1378,12 @@ out: * * A very similar scenario holds for the dir cache. 
*/ -static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr, unsigned long verifier) +static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) { struct nfs_inode *nfsi = NFS_I(inode); loff_t cur_isize, new_isize; unsigned int invalid = 0; - int data_unstable; + int data_stable; dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n", __FUNCTION__, inode->i_sb->s_id, inode->i_ino, @@ -1432,8 +1414,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr, unsign nfsi->last_updated = jiffies; /* Are we racing with known updates of the metadata on the server? */ - data_unstable = ! (nfs_verify_change_attribute(inode, verifier) || - (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)); + data_stable = nfs_verify_change_attribute(inode, fattr->time_start); + if (data_stable) + nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME); /* Check if our cached file size is stale */ new_isize = nfs_size_to_loff_t(fattr->size); @@ -1442,7 +1425,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr, unsign /* Do we perhaps have any outstanding writes? */ if (nfsi->npages == 0) { /* No, but did we race with nfs_end_data_update()? 
*/ - if (time_after_eq(verifier, nfsi->cache_change_attribute)) { + if (data_stable) { inode->i_size = new_isize; invalid |= NFS_INO_INVALID_DATA; } @@ -1451,6 +1434,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr, unsign inode->i_size = new_isize; invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; } + nfsi->cache_change_attribute = jiffies; dprintk("NFS: isize change on server for file %s/%ld\n", inode->i_sb->s_id, inode->i_ino); } @@ -1460,8 +1444,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr, unsign memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); dprintk("NFS: mtime change on server for file %s/%ld\n", inode->i_sb->s_id, inode->i_ino); - if (!data_unstable) - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; + nfsi->cache_change_attribute = jiffies; } if ((fattr->valid & NFS_ATTR_FATTR_V4) @@ -1469,15 +1453,15 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr, unsign dprintk("NFS: change_attr change on server for file %s/%ld\n", inode->i_sb->s_id, inode->i_ino); nfsi->change_attr = fattr->change_attr; - if (!data_unstable) - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + nfsi->cache_change_attribute = jiffies; } /* If ctime has changed we should definitely clear access+acl caches */ if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) { - if (!data_unstable) - invalid |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + invalid |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); + nfsi->cache_change_attribute = jiffies; } memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime)); @@ -1515,6 +1499,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr, unsign if 
(!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) invalid &= ~NFS_INO_INVALID_DATA; + if (data_stable) + invalid &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME|NFS_INO_REVAL_PAGECACHE); if (!nfs_have_delegation(inode, FMODE_READ)) nfsi->cache_validity |= invalid; -- cgit v1.2.3 From bb713d6d38f7be4f4e7d790cddb1b076e7da6699 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 3 Dec 2005 15:20:14 -0500 Subject: NFS: use set_page_writeback() in the appropriate places Ensure that we use set_page_writeback() in the appropriate places to help the VM in keeping its page radix_tree in sync. Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 8f71e766cc5d..3107908e5f3f 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -189,6 +189,7 @@ static int nfs_writepage_sync(struct nfs_open_context *ctx, struct inode *inode, (long long)NFS_FILEID(inode), count, (long long)(page_offset(page) + offset)); + set_page_writeback(page); nfs_begin_data_update(inode); do { if (count < wsize) @@ -221,6 +222,7 @@ static int nfs_writepage_sync(struct nfs_open_context *ctx, struct inode *inode, io_error: nfs_end_data_update(inode); + end_page_writeback(page); nfs_writedata_free(wdata); return written ? 
written : result; } @@ -929,7 +931,7 @@ static int nfs_flush_multi(struct list_head *head, struct inode *inode, int how) atomic_set(&req->wb_complete, requests); ClearPageError(page); - SetPageWriteback(page); + set_page_writeback(page); offset = 0; nbytes = req->wb_bytes; do { @@ -992,7 +994,7 @@ static int nfs_flush_one(struct list_head *head, struct inode *inode, int how) nfs_list_remove_request(req); nfs_list_add_request(req, &data->pages); ClearPageError(req->wb_page); - SetPageWriteback(req->wb_page); + set_page_writeback(req->wb_page); *pages++ = req->wb_page; count += req->wb_bytes; } -- cgit v1.2.3 From 5ba7cc4801ae0fe74b6e0160f008521ae71d9f5d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 3 Dec 2005 15:20:17 -0500 Subject: NFS: Fix post-op attribute revalidation... - Missing nfs_mark_for_revalidate in nfs_proc_link() - Missing nfs_mark_for_revalidate in nfs_rename() Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 3 +++ fs/nfs/proc.c | 1 + 2 files changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 7370583b61e5..c0d1a214572c 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1287,6 +1287,7 @@ dentry->d_parent->d_name.name, dentry->d_name.name); nfs_begin_data_update(dentry->d_inode); error = NFS_PROTO(dir)->rename(dir, &dentry->d_name, dir, &qsilly); + nfs_mark_for_revalidate(dentry->d_inode); nfs_end_data_update(dentry->d_inode); } else error = NFS_PROTO(dir)->rename(dir, &dentry->d_name, @@ -1334,6 +1335,7 @@ static int nfs_safe_remove(struct dentry *dentry) /* The VFS may want to delete this inode */ if (error == 0) inode->i_nlink--; + nfs_mark_for_revalidate(inode); nfs_end_data_update(inode); } else error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); @@ -1556,6 +1558,7 @@ go_ahead: nfs_begin_data_update(old_inode); error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name, new_dir, &new_dentry->d_name); + nfs_mark_for_revalidate(old_inode); nfs_end_data_update(old_inode); 
nfs_end_data_update(new_dir); nfs_end_data_update(old_dir); diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index a48a003242c0..e1e3ca5d746b 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -375,6 +375,7 @@ nfs_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) dprintk("NFS call link %s\n", name->name); status = rpc_call(NFS_CLIENT(inode), NFSPROC_LINK, &arg, NULL, 0); + nfs_mark_for_revalidate(inode); nfs_mark_for_revalidate(dir); dprintk("NFS reply link: %d\n", status); return status; -- cgit v1.2.3 From 3b6efee9231e12fce09c94930bfc59f66f18d662 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 3 Dec 2005 15:20:21 -0500 Subject: NFSv4: Fix an Oops in the synchronous write path - Missing initialisation of attribute bitmask in _nfs4_proc_write() - On success, _nfs4_proc_write() must return number of bytes written. - Missing post_op_update_inode() in _nfs4_proc_write() - Missing initialisation of attribute bitmask in _nfs4_proc_commit() - Missing post_op_update_inode() in _nfs4_proc_commit() Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 60e0dd800cc3..f988a9417b13 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1506,10 +1506,15 @@ static int _nfs4_proc_write(struct nfs_write_data *wdata) dprintk("NFS call write %d @ %Ld\n", wdata->args.count, (long long) wdata->args.offset); + wdata->args.bitmask = server->attr_bitmask; + wdata->res.server = server; nfs_fattr_init(fattr); status = rpc_call_sync(server->client, &msg, rpcflags); dprintk("NFS reply write: %d\n", status); - return status; + if (status < 0) + return status; + nfs_post_op_update_inode(inode, fattr); + return wdata->res.count; } static int nfs4_proc_write(struct nfs_write_data *wdata) @@ -1540,9 +1545,13 @@ static int _nfs4_proc_commit(struct nfs_write_data *cdata) dprintk("NFS call commit %d @ %Ld\n", cdata->args.count, (long 
long) cdata->args.offset); + cdata->args.bitmask = server->attr_bitmask; + cdata->res.server = server; nfs_fattr_init(fattr); status = rpc_call_sync(server->client, &msg, 0); dprintk("NFS reply commit: %d\n", status); + if (status >= 0) + nfs_post_op_update_inode(inode, fattr); return status; } -- cgit v1.2.3