diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-06-08 21:47:09 +0200 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-06-08 21:47:09 +0200 |
commit | ca687877e05ad1bf5b4cefd9cdd091044626deac (patch) | |
tree | 6b61bf62f2d87729fcbdd125f483792a71165016 | |
parent | Merge tag 's390-5.8-1' of git://git.kernel.org/pub/scm/linux/kernel/git/s390/... (diff) | |
parent | Merge branch 'gfs2-iopen' into for-next (diff) | |
download | linux-ca687877e05ad1bf5b4cefd9cdd091044626deac.tar.xz linux-ca687877e05ad1bf5b4cefd9cdd091044626deac.zip |
Merge tag 'gfs2-for-5.8' of git://git.kernel.org/pub/scm/linux/kernel/git/gfs2/linux-gfs2
Pull gfs2 updates from Andreas Gruenbacher:
- An iopen glock locking scheme rework that speeds up deletes of inodes
accessed from multiple nodes
- Various bug fixes and debugging improvements
- Convert gfs2-glocks.txt to ReST
* tag 'gfs2-for-5.8' of git://git.kernel.org/pub/scm/linux/kernel/git/gfs2/linux-gfs2:
gfs2: fix use-after-free on transaction ail lists
gfs2: new slab for transactions
gfs2: initialize transaction tr_ailX_lists earlier
gfs2: Smarter iopen glock waiting
gfs2: Wake up when setting GLF_DEMOTE
gfs2: Check inode generation number in delete_work_func
gfs2: Move inode generation number check into gfs2_inode_lookup
gfs2: Minor gfs2_lookup_by_inum cleanup
gfs2: Try harder to delete inodes locally
gfs2: Give up the iopen glock on contention
gfs2: Turn gl_delete into a delayed work
gfs2: Keep track of deleted inode generations in LVBs
gfs2: Allow ASPACE glocks to also have an lvb
gfs2: instrumentation wrt log_flush stuck
gfs2: introduce new gfs2_glock_assert_withdraw
gfs2: print mapping->nrpages in glock dump for address space glocks
gfs2: Only do glock put in gfs2_create_inode for free inodes
gfs2: Allow lock_nolock mount to specify jid=X
gfs2: Don't ignore inode write errors during inode_go_sync
docs: filesystems: convert gfs2-glocks.txt to ReST
-rw-r--r-- | Documentation/filesystems/gfs2-glocks.rst (renamed from Documentation/filesystems/gfs2-glocks.txt) | 149 | ||||
-rw-r--r-- | Documentation/filesystems/index.rst | 1 | ||||
-rw-r--r-- | MAINTAINERS | 2 | ||||
-rw-r--r-- | fs/gfs2/export.c | 4 | ||||
-rw-r--r-- | fs/gfs2/glock.c | 208 | ||||
-rw-r--r-- | fs/gfs2/glock.h | 16 | ||||
-rw-r--r-- | fs/gfs2/glops.c | 21 | ||||
-rw-r--r-- | fs/gfs2/incore.h | 9 | ||||
-rw-r--r-- | fs/gfs2/inode.c | 47 | ||||
-rw-r--r-- | fs/gfs2/inode.h | 2 | ||||
-rw-r--r-- | fs/gfs2/log.c | 56 | ||||
-rw-r--r-- | fs/gfs2/main.c | 9 | ||||
-rw-r--r-- | fs/gfs2/ops_fstype.c | 2 | ||||
-rw-r--r-- | fs/gfs2/rgrp.c | 2 | ||||
-rw-r--r-- | fs/gfs2/super.c | 72 | ||||
-rw-r--r-- | fs/gfs2/trans.c | 21 | ||||
-rw-r--r-- | fs/gfs2/trans.h | 1 | ||||
-rw-r--r-- | fs/gfs2/util.c | 1 | ||||
-rw-r--r-- | fs/gfs2/util.h | 1 | ||||
-rw-r--r-- | include/uapi/linux/gfs2_ondisk.h | 6 |
20 files changed, 489 insertions, 141 deletions
diff --git a/Documentation/filesystems/gfs2-glocks.txt b/Documentation/filesystems/gfs2-glocks.rst index 7059623635b2..d14f230f0b12 100644 --- a/Documentation/filesystems/gfs2-glocks.txt +++ b/Documentation/filesystems/gfs2-glocks.rst @@ -1,5 +1,8 @@ - Glock internal locking rules - ------------------------------ +.. SPDX-License-Identifier: GPL-2.0 + +============================ +Glock internal locking rules +============================ This documents the basic principles of the glock state machine internals. Each glock (struct gfs2_glock in fs/gfs2/incore.h) @@ -24,24 +27,28 @@ There are three lock states that users of the glock layer can request, namely shared (SH), deferred (DF) and exclusive (EX). Those translate to the following DLM lock modes: -Glock mode | DLM lock mode ------------------------------- - UN | IV/NL Unlocked (no DLM lock associated with glock) or NL - SH | PR (Protected read) - DF | CW (Concurrent write) - EX | EX (Exclusive) +========== ====== ===================================================== +Glock mode DLM lock mode +========== ====== ===================================================== + UN IV/NL Unlocked (no DLM lock associated with glock) or NL + SH PR (Protected read) + DF CW (Concurrent write) + EX EX (Exclusive) +========== ====== ===================================================== Thus DF is basically a shared mode which is incompatible with the "normal" shared lock mode, SH. In GFS2 the DF mode is used exclusively for direct I/O operations. The glocks are basically a lock plus some routines which deal with cache management. The following rules apply for the cache: -Glock mode | Cache data | Cache Metadata | Dirty Data | Dirty Metadata --------------------------------------------------------------------------- - UN | No | No | No | No - SH | Yes | Yes | No | No - DF | No | Yes | No | No - EX | Yes | Yes | Yes | Yes +========== ========== ============== ========== ============== +Glock mode Cache data Cache Metadata Dirty Data Dirty Metadata +========== ========== ============== ========== ============== + UN No No No No + SH Yes Yes No No + DF No Yes No No + EX Yes Yes Yes Yes +========== ========== ============== ========== ============== These rules are implemented using the various glock operations which are defined for each type of glock. Not all types of glocks use @@ -49,21 +56,23 @@ all the modes. Only inode glocks use the DF mode for example. Table of glock operations and per type constants: -Field | Purpose ----------------------------------------------------------------------------- -go_xmote_th | Called before remote state change (e.g. to sync dirty data) -go_xmote_bh | Called after remote state change (e.g. to refill cache) -go_inval | Called if remote state change requires invalidating the cache -go_demote_ok | Returns boolean value of whether its ok to demote a glock - | (e.g. checks timeout, and that there is no cached data) -go_lock | Called for the first local holder of a lock -go_unlock | Called on the final local unlock of a lock -go_dump | Called to print content of object for debugfs file, or on - | error to dump glock to the log. -go_type | The type of the glock, LM_TYPE_..... -go_callback | Called if the DLM sends a callback to drop this lock -go_flags | GLOF_ASPACE is set, if the glock has an address space - | associated with it +============= ============================================================= +Field Purpose +============= ============================================================= +go_xmote_th Called before remote state change (e.g. to sync dirty data) +go_xmote_bh Called after remote state change (e.g. to refill cache) +go_inval Called if remote state change requires invalidating the cache +go_demote_ok Returns boolean value of whether its ok to demote a glock + (e.g. checks timeout, and that there is no cached data) +go_lock Called for the first local holder of a lock +go_unlock Called on the final local unlock of a lock +go_dump Called to print content of object for debugfs file, or on + error to dump glock to the log. +go_type The type of the glock, ``LM_TYPE_*`` +go_callback Called if the DLM sends a callback to drop this lock +go_flags GLOF_ASPACE is set, if the glock has an address space + associated with it +============= ============================================================= The minimum hold time for each lock is the time after a remote lock grant for which we ignore remote demote requests. This is in order to @@ -82,21 +91,25 @@ rather than via the glock. Locking rules for glock operations: -Operation | GLF_LOCK bit lock held | gl_lockref.lock spinlock held -------------------------------------------------------------------------- -go_xmote_th | Yes | No -go_xmote_bh | Yes | No -go_inval | Yes | No -go_demote_ok | Sometimes | Yes -go_lock | Yes | No -go_unlock | Yes | No -go_dump | Sometimes | Yes -go_callback | Sometimes (N/A) | Yes - -N.B. Operations must not drop either the bit lock or the spinlock -if its held on entry. go_dump and do_demote_ok must never block. -Note that go_dump will only be called if the glock's state -indicates that it is caching uptodate data. +============= ====================== ============================= +Operation GLF_LOCK bit lock held gl_lockref.lock spinlock held +============= ====================== ============================= +go_xmote_th Yes No +go_xmote_bh Yes No +go_inval Yes No +go_demote_ok Sometimes Yes +go_lock Yes No +go_unlock Yes No +go_dump Sometimes Yes +go_callback Sometimes (N/A) Yes +============= ====================== ============================= + +.. Note:: + + Operations must not drop either the bit lock or the spinlock + if its held on entry. go_dump and do_demote_ok must never block. + Note that go_dump will only be called if the glock's state + indicates that it is caching uptodate data. Glock locking order within GFS2: @@ -104,7 +117,7 @@ Glock locking order within GFS2: 2. Rename glock (for rename only) 3. Inode glock(s) (Parents before children, inodes at "same level" with same parent in - lock number order) + lock number order) 4. Rgrp glock(s) (for (de)allocation operations) 5. Transaction glock (via gfs2_trans_begin) for non-read operations 6. i_rw_mutex (if required) @@ -117,8 +130,8 @@ determine the lifetime of the inode in question. Locking of inodes is on a per-inode basis. Locking of rgrps is on a per rgrp basis. In general we prefer to lock local locks prior to cluster locks. - Glock Statistics - ------------------ +Glock Statistics +---------------- The stats are divided into two sets: those relating to the super block and those relating to an individual glock. The @@ -173,8 +186,8 @@ we'd like to get a better idea of these timings: 1. To be able to better set the glock "min hold time" 2. To spot performance issues more easily 3. To improve the algorithm for selecting resource groups for -allocation (to base it on lock wait time, rather than blindly -using a "try lock") + allocation (to base it on lock wait time, rather than blindly + using a "try lock") Due to the smoothing action of the updates, a step change in some input quantity being sampled will only fully be taken @@ -195,10 +208,13 @@ as possible. There are always inaccuracies in any measuring system, but I hope this is as accurate as we can reasonably make it. -Per sb stats can be found here: -/sys/kernel/debug/gfs2/<fsname>/sbstats -Per glock stats can be found here: -/sys/kernel/debug/gfs2/<fsname>/glstats +Per sb stats can be found here:: + + /sys/kernel/debug/gfs2/<fsname>/sbstats + +Per glock stats can be found here:: + + /sys/kernel/debug/gfs2/<fsname>/glstats Assuming that debugfs is mounted on /sys/kernel/debug and also that <fsname> is replaced with the name of the gfs2 filesystem @@ -206,14 +222,16 @@ in question. The abbreviations used in the output as are follows: -srtt - Smoothed round trip time for non-blocking dlm requests -srttvar - Variance estimate for srtt -srttb - Smoothed round trip time for (potentially) blocking dlm requests -srttvarb - Variance estimate for srttb -sirt - Smoothed inter-request time (for dlm requests) -sirtvar - Variance estimate for sirt -dlm - Number of dlm requests made (dcnt in glstats file) -queue - Number of glock requests queued (qcnt in glstats file) +========= ================================================================ +srtt Smoothed round trip time for non blocking dlm requests +srttvar Variance estimate for srtt +srttb Smoothed round trip time for (potentially) blocking dlm requests +srttvarb Variance estimate for srttb +sirt Smoothed inter request time (for dlm requests) +sirtvar Variance estimate for sirt +dlm Number of dlm requests made (dcnt in glstats file) +queue Number of glock requests queued (qcnt in glstats file) +========= ================================================================ The sbstats file contains a set of these stats for each glock type (so 8 lines for each type) and for each cpu (one column per cpu). The glstats file contains @@ -224,9 +242,12 @@ The gfs2_glock_lock_time tracepoint prints out the current values of the stats for the glock in question, along with some addition information on each dlm reply that is received: -status - The status of the dlm request -flags - The dlm request flags -tdiff - The time taken by this specific request +====== ======================================= +status The status of the dlm request +flags The dlm request flags +tdiff The time taken by this specific request +====== ======================================= + (remaining fields as per above list) diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index 17795341e0a3..4c536e66dc4c 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -88,6 +88,7 @@ Documentation for filesystem implementations. f2fs gfs2 gfs2-uevents + gfs2-glocks hfs hfsplus hpfs diff --git a/MAINTAINERS b/MAINTAINERS index 3ad30018c3f1..ad076ddac82b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7251,7 +7251,7 @@ L: cluster-devel@redhat.com S: Supported W: http://sources.redhat.com/cluster/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/gfs2/linux-gfs2.git -F: Documentation/filesystems/gfs2*.txt +F: Documentation/filesystems/gfs2* F: fs/gfs2/ F: include/uapi/linux/gfs2_ondisk.h diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index 3f717285ee48..756d05779200 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -134,7 +134,9 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, struct gfs2_sbd *sdp = sb->s_fs_info; struct inode *inode; - inode = gfs2_lookup_by_inum(sdp, inum->no_addr, &inum->no_formal_ino, + if (!inum->no_formal_ino) + return ERR_PTR(-ESTALE); + inode = gfs2_lookup_by_inum(sdp, inum->no_addr, inum->no_formal_ino, GFS2_BLKST_DINODE); if (IS_ERR(inode)) return ERR_CAST(inode); diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index bf70e3b14938..2299dcc417ea 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -125,12 +125,11 @@ static void gfs2_glock_dealloc(struct rcu_head *rcu) { struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu); - if (gl->gl_ops->go_flags & GLOF_ASPACE) { + kfree(gl->gl_lksb.sb_lvbptr); + if (gl->gl_ops->go_flags & GLOF_ASPACE) kmem_cache_free(gfs2_glock_aspace_cachep, gl); - } else { - kfree(gl->gl_lksb.sb_lvbptr); + else kmem_cache_free(gfs2_glock_cachep, gl); - } } /** @@ -164,7 +163,7 @@ void gfs2_glock_free(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; - BUG_ON(atomic_read(&gl->gl_revokes)); + gfs2_glock_assert_withdraw(gl, atomic_read(&gl->gl_revokes) == 0); rhashtable_remove_fast(&gl_hash_table, &gl->gl_node, ht_parms); smp_mb(); wake_up_glock(gl); @@ -465,6 +464,15 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state) gl->gl_tchange = jiffies; } +static void gfs2_set_demote(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + + set_bit(GLF_DEMOTE, &gl->gl_flags); + smp_mb(); + wake_up(&sdp->sd_async_glock_wait); +} + static void gfs2_demote_wake(struct gfs2_glock *gl) { gl->gl_demote_state = LM_ST_EXCLUSIVE; @@ -626,7 +634,8 @@ __acquires(&gl->gl_lockref.lock) */ if ((atomic_read(&gl->gl_ail_count) != 0) && (!cmpxchg(&sdp->sd_log_error, 0, -EIO))) { - gfs2_assert_warn(sdp, !atomic_read(&gl->gl_ail_count)); + gfs2_glock_assert_warn(gl, + !atomic_read(&gl->gl_ail_count)); gfs2_dump_glock(NULL, gl, true); } glops->go_inval(gl, target == LM_ST_DEFERRED ? 0 : DIO_METADATA); @@ -756,20 +765,127 @@ out_unlock: return; } +void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation) +{ + struct gfs2_inode_lvb *ri = (void *)gl->gl_lksb.sb_lvbptr; + + if (ri->ri_magic == 0) + ri->ri_magic = cpu_to_be32(GFS2_MAGIC); + if (ri->ri_magic == cpu_to_be32(GFS2_MAGIC)) + ri->ri_generation_deleted = cpu_to_be64(generation); +} + +bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation) +{ + struct gfs2_inode_lvb *ri = (void *)gl->gl_lksb.sb_lvbptr; + + if (ri->ri_magic != cpu_to_be32(GFS2_MAGIC)) + return false; + return generation <= be64_to_cpu(ri->ri_generation_deleted); +} + +static void gfs2_glock_poke(struct gfs2_glock *gl) +{ + int flags = LM_FLAG_TRY_1CB | LM_FLAG_ANY | GL_SKIP; + struct gfs2_holder gh; + int error; + + error = gfs2_glock_nq_init(gl, LM_ST_SHARED, flags, &gh); + if (!error) + gfs2_glock_dq(&gh); +} + +static bool gfs2_try_evict(struct gfs2_glock *gl) +{ + struct gfs2_inode *ip; + bool evicted = false; + + /* + * If there is contention on the iopen glock and we have an inode, try + * to grab and release the inode so that it can be evicted. This will + * allow the remote node to go ahead and delete the inode without us + * having to do it, which will avoid rgrp glock thrashing. + * + * The remote node is likely still holding the corresponding inode + * glock, so it will run before we get to verify that the delete has + * happened below. + */ + spin_lock(&gl->gl_lockref.lock); + ip = gl->gl_object; + if (ip && !igrab(&ip->i_inode)) + ip = NULL; + spin_unlock(&gl->gl_lockref.lock); + if (ip) { + struct gfs2_glock *inode_gl = NULL; + + gl->gl_no_formal_ino = ip->i_no_formal_ino; + set_bit(GIF_DEFERRED_DELETE, &ip->i_flags); + d_prune_aliases(&ip->i_inode); + iput(&ip->i_inode); + + /* If the inode was evicted, gl->gl_object will now be NULL. */ + spin_lock(&gl->gl_lockref.lock); + ip = gl->gl_object; + if (ip) { + inode_gl = ip->i_gl; + lockref_get(&inode_gl->gl_lockref); + clear_bit(GIF_DEFERRED_DELETE, &ip->i_flags); + } + spin_unlock(&gl->gl_lockref.lock); + if (inode_gl) { + gfs2_glock_poke(inode_gl); + gfs2_glock_put(inode_gl); + } + evicted = !ip; + } + return evicted; +} + static void delete_work_func(struct work_struct *work) { - struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete); + struct delayed_work *dwork = to_delayed_work(work); + struct gfs2_glock *gl = container_of(dwork, struct gfs2_glock, gl_delete); struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct inode *inode; u64 no_addr = gl->gl_name.ln_number; + spin_lock(&gl->gl_lockref.lock); + clear_bit(GLF_PENDING_DELETE, &gl->gl_flags); + spin_unlock(&gl->gl_lockref.lock); + /* If someone's using this glock to create a new dinode, the block must have been freed by another node, then re-used, in which case our iopen callback is too late after the fact. Ignore it. */ if (test_bit(GLF_INODE_CREATING, &gl->gl_flags)) goto out; - inode = gfs2_lookup_by_inum(sdp, no_addr, NULL, GFS2_BLKST_UNLINKED); + if (test_bit(GLF_DEMOTE, &gl->gl_flags)) { + /* + * If we can evict the inode, give the remote node trying to + * delete the inode some time before verifying that the delete + * has happened. Otherwise, if we cause contention on the inode glock + * immediately, the remote node will think that we still have + * the inode in use, and so it will give up waiting. + * + * If we can't evict the inode, signal to the remote node that + * the inode is still in use. We'll later try to delete the + * inode locally in gfs2_evict_inode. + * + * FIXME: We only need to verify that the remote node has + * deleted the inode because nodes before this remote delete + * rework won't cooperate. At a later time, when we no longer + * care about compatibility with such nodes, we can skip this + * step entirely. + */ + if (gfs2_try_evict(gl)) { + if (gfs2_queue_delete_work(gl, 5 * HZ)) + return; + } + goto out; + } + + inode = gfs2_lookup_by_inum(sdp, no_addr, gl->gl_no_formal_ino, + GFS2_BLKST_UNLINKED); if (!IS_ERR_OR_NULL(inode)) { d_prune_aliases(inode); iput(inode); @@ -800,7 +916,7 @@ static void glock_work_func(struct work_struct *work) if (!delay) { clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags); - set_bit(GLF_DEMOTE, &gl->gl_flags); + gfs2_set_demote(gl); } } run_queue(gl, 0); @@ -931,7 +1047,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, gl->gl_object = NULL; gl->gl_hold_time = GL_GLOCK_DFT_HOLD; INIT_DELAYED_WORK(&gl->gl_work, glock_work_func); - INIT_WORK(&gl->gl_delete, delete_work_func); + INIT_DELAYED_WORK(&gl->gl_delete, delete_work_func); mapping = gfs2_glock2aspace(gl); if (mapping) { @@ -1145,9 +1261,10 @@ wait_for_dlm: static void handle_callback(struct gfs2_glock *gl, unsigned int state, unsigned long delay, bool remote) { - int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE; - - set_bit(bit, &gl->gl_flags); + if (delay) + set_bit(GLF_PENDING_DEMOTE, &gl->gl_flags); + else + gfs2_set_demote(gl); if (gl->gl_demote_state == LM_ST_EXCLUSIVE) { gl->gl_demote_state = state; gl->gl_demote_time = jiffies; @@ -1754,6 +1871,44 @@ static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp) rhashtable_walk_exit(&iter); } +bool gfs2_queue_delete_work(struct gfs2_glock *gl, unsigned long delay) +{ + bool queued; + + spin_lock(&gl->gl_lockref.lock); + queued = queue_delayed_work(gfs2_delete_workqueue, + &gl->gl_delete, delay); + if (queued) + set_bit(GLF_PENDING_DELETE, &gl->gl_flags); + spin_unlock(&gl->gl_lockref.lock); + return queued; +} + +void gfs2_cancel_delete_work(struct gfs2_glock *gl) +{ + if (cancel_delayed_work_sync(&gl->gl_delete)) { + clear_bit(GLF_PENDING_DELETE, &gl->gl_flags); + gfs2_glock_put(gl); + } +} + +bool gfs2_delete_work_queued(const struct gfs2_glock *gl) +{ + return test_bit(GLF_PENDING_DELETE, &gl->gl_flags); +} + +static void flush_delete_work(struct gfs2_glock *gl) +{ + flush_delayed_work(&gl->gl_delete); + gfs2_glock_queue_work(gl, 0); +} + +void gfs2_flush_delete_work(struct gfs2_sbd *sdp) +{ + glock_hash_walk(flush_delete_work, sdp); + flush_workqueue(gfs2_delete_workqueue); +} + /** * thaw_glock - thaw out a glock which has an unprocessed reply waiting * @gl: The glock to thaw @@ -1836,7 +1991,7 @@ void gfs2_glock_finish_truncate(struct gfs2_inode *ip) int ret; ret = gfs2_truncatei_resume(ip); - gfs2_assert_withdraw(gl->gl_name.ln_sbd, ret == 0); + gfs2_glock_assert_withdraw(gl, ret == 0); spin_lock(&gl->gl_lockref.lock); clear_bit(GLF_LOCK, &gl->gl_flags); @@ -1978,7 +2133,13 @@ void gfs2_dump_glock(struct seq_file *seq, struct gfs2_glock *gl, bool fsid) char gflags_buf[32]; struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; char fs_id_buf[sizeof(sdp->sd_fsname) + 7]; + unsigned long nrpages = 0; + + if (gl->gl_ops->go_flags & GLOF_ASPACE) { + struct address_space *mapping = gfs2_glock2aspace(gl); + nrpages = mapping->nrpages; + } memset(fs_id_buf, 0, sizeof(fs_id_buf)); if (fsid && sdp) /* safety precaution */ sprintf(fs_id_buf, "fsid=%s: ", sdp->sd_fsname); @@ -1987,15 +2148,16 @@ void gfs2_dump_glock(struct seq_file *seq, struct gfs2_glock *gl, bool fsid) if (!test_bit(GLF_DEMOTE, &gl->gl_flags)) dtime = 0; gfs2_print_dbg(seq, "%sG: s:%s n:%u/%llx f:%s t:%s d:%s/%llu a:%d " - "v:%d r:%d m:%ld\n", fs_id_buf, state2str(gl->gl_state), - gl->gl_name.ln_type, - (unsigned long long)gl->gl_name.ln_number, - gflags2str(gflags_buf, gl), - state2str(gl->gl_target), - state2str(gl->gl_demote_state), dtime, - atomic_read(&gl->gl_ail_count), - atomic_read(&gl->gl_revokes), - (int)gl->gl_lockref.count, gl->gl_hold_time); + "v:%d r:%d m:%ld p:%lu\n", + fs_id_buf, state2str(gl->gl_state), + gl->gl_name.ln_type, + (unsigned long long)gl->gl_name.ln_number, + gflags2str(gflags_buf, gl), + state2str(gl->gl_target), + state2str(gl->gl_demote_state), dtime, + atomic_read(&gl->gl_ail_count), + atomic_read(&gl->gl_revokes), + (int)gl->gl_lockref.count, gl->gl_hold_time, nrpages); list_for_each_entry(gh, &gl->gl_holders, gh_list) dump_holder(seq, gh, fs_id_buf); diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index b8adaf80e4c5..53813364517b 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -205,6 +205,15 @@ extern void gfs2_dump_glock(struct seq_file *seq, struct gfs2_glock *gl, #define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { \ gfs2_dump_glock(NULL, gl, true); \ BUG(); } } while(0) +#define gfs2_glock_assert_warn(gl, x) do { if (unlikely(!(x))) { \ + gfs2_dump_glock(NULL, gl, true); \ + gfs2_assert_warn((gl)->gl_name.ln_sbd, (x)); } } \ + while (0) +#define gfs2_glock_assert_withdraw(gl, x) do { if (unlikely(!(x))) { \ + gfs2_dump_glock(NULL, gl, true); \ + gfs2_assert_withdraw((gl)->gl_name.ln_sbd, (x)); } } \ + while (0) + extern __printf(2, 3) void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); @@ -235,6 +244,10 @@ static inline int gfs2_glock_nq_init(struct gfs2_glock *gl, extern void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state); extern void gfs2_glock_complete(struct gfs2_glock *gl, int ret); +extern bool gfs2_queue_delete_work(struct gfs2_glock *gl, unsigned long delay); +extern void gfs2_cancel_delete_work(struct gfs2_glock *gl); +extern bool gfs2_delete_work_queued(const struct gfs2_glock *gl); +extern void gfs2_flush_delete_work(struct gfs2_sbd *sdp); extern void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); extern void gfs2_glock_finish_truncate(struct gfs2_inode *ip); extern void gfs2_glock_thaw(struct gfs2_sbd *sdp); @@ -306,4 +319,7 @@ static inline void glock_clear_object(struct gfs2_glock *gl, void *object) spin_unlock(&gl->gl_lockref.lock); } +extern void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation); +extern bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation); + #endif /* __GLOCK_DOT_H__ */ diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 9e9c7a4b8c66..c84887769b5a 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -91,6 +91,8 @@ static int gfs2_ail_empty_gl(struct gfs2_glock *gl) memset(&tr, 0, sizeof(tr)); INIT_LIST_HEAD(&tr.tr_buf); INIT_LIST_HEAD(&tr.tr_databuf); + INIT_LIST_HEAD(&tr.tr_ail1_list); + INIT_LIST_HEAD(&tr.tr_ail2_list); tr.tr_revokes = atomic_read(&gl->gl_ail_count); if (!tr.tr_revokes) { @@ -268,7 +270,7 @@ static int inode_go_sync(struct gfs2_glock *gl) struct gfs2_inode *ip = gfs2_glock2inode(gl); int isreg = ip && S_ISREG(ip->i_inode.i_mode); struct address_space *metamapping = gfs2_glock2aspace(gl); - int error = 0; + int error = 0, ret; if (isreg) { if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags)) @@ -289,8 +291,10 @@ static int inode_go_sync(struct gfs2_glock *gl) error = filemap_fdatawait(mapping); mapping_set_error(mapping, error); } - error = filemap_fdatawait(metamapping); - mapping_set_error(metamapping, error); + ret = filemap_fdatawait(metamapping); + mapping_set_error(metamapping, ret); + if (!error) + error = ret; gfs2_ail_empty_gl(gl); /* * Writeback of the data mapping may cause the dirty flag to be set @@ -608,11 +612,17 @@ static void iopen_go_callback(struct gfs2_glock *gl, bool remote) if (gl->gl_demote_state == LM_ST_UNLOCKED && gl->gl_state == LM_ST_SHARED && ip) { gl->gl_lockref.count++; - if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0) + if (!queue_delayed_work(gfs2_delete_workqueue, + &gl->gl_delete, 0)) gl->gl_lockref.count--; } } +static int iopen_go_demote_ok(const struct gfs2_glock *gl) +{ + return !gfs2_delete_work_queued(gl); +} + /** * inode_go_free - wake up anyone waiting for dlm's unlock ast to free it * @gl: glock being freed @@ -692,7 +702,7 @@ const struct gfs2_glock_operations gfs2_inode_glops = { .go_lock = inode_go_lock, .go_dump = inode_go_dump, .go_type = LM_TYPE_INODE, - .go_flags = GLOF_ASPACE | GLOF_LRU, + .go_flags = GLOF_ASPACE | GLOF_LRU | GLOF_LVB, .go_free = inode_go_free, }; @@ -716,6 +726,7 @@ const struct gfs2_glock_operations gfs2_freeze_glops = { const struct gfs2_glock_operations gfs2_iopen_glops = { .go_type = LM_TYPE_IOPEN, .go_callback = iopen_go_callback, + .go_demote_ok = iopen_go_demote_ok, .go_flags = GLOF_LRU | GLOF_NONDISK, }; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 84a824293a78..03ab11fab962 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -345,6 +345,7 @@ enum { GLF_OBJECT = 14, /* Used only for tracing */ GLF_BLOCKING = 15, GLF_INODE_CREATING = 16, /* Inode creation occurring */ + GLF_PENDING_DELETE = 17, GLF_FREEING = 18, /* Wait for glock to be freed */ }; @@ -378,8 +379,11 @@ struct gfs2_glock { atomic_t gl_revokes; struct delayed_work gl_work; union { - /* For inode and iopen glocks only */ - struct work_struct gl_delete; + /* For iopen glocks only */ + struct { + struct delayed_work gl_delete; + u64 gl_no_formal_ino; + }; /* For rgrp glocks only */ struct { loff_t start; @@ -398,6 +402,7 @@ enum { GIF_ORDERED = 4, GIF_FREE_VFS_INODE = 5, GIF_GLOP_PENDING = 6, + GIF_DEFERRED_DELETE = 7, }; struct gfs2_inode { diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index a1337bf31e49..370c3a4b31ac 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -115,6 +115,10 @@ static void gfs2_set_iop(struct inode *inode) * placeholder because it doesn't otherwise make sense), the on-disk block type * is verified to be @blktype. * + * When @no_formal_ino is non-zero, this function will return ERR_PTR(-ESTALE) + * if it detects that @no_formal_ino doesn't match the actual inode generation + * number. However, it doesn't always know unless @type is DT_UNKNOWN. + * * Returns: A VFS inode, or an error */ @@ -158,6 +162,11 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, if (error) goto fail; + error = -ESTALE; + if (no_formal_ino && + gfs2_inode_already_deleted(ip->i_gl, no_formal_ino)) + goto fail; + if (blktype != GFS2_BLKST_FREE) { error = gfs2_check_blk_type(sdp, no_addr, blktype); @@ -171,6 +180,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); if (unlikely(error)) goto fail; + gfs2_cancel_delete_work(ip->i_iopen_gh.gh_gl); glock_set_object(ip->i_iopen_gh.gh_gl, ip); gfs2_glock_put(io_gl); io_gl = NULL; @@ -189,13 +199,23 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, inode->i_mode = DT2IF(type); } + if (gfs2_holder_initialized(&i_gh)) + gfs2_glock_dq_uninit(&i_gh); + gfs2_set_iop(inode); + } - unlock_new_inode(inode); + if (no_formal_ino && ip->i_no_formal_ino && + no_formal_ino != ip->i_no_formal_ino) { + if (inode->i_state & I_NEW) + goto fail; + iput(inode); + return ERR_PTR(-ESTALE); } - if (gfs2_holder_initialized(&i_gh)) - gfs2_glock_dq_uninit(&i_gh); + if (inode->i_state & I_NEW) + unlock_new_inode(inode); + return inode; fail: @@ -207,23 +227,26 @@ fail: return ERR_PTR(error); } +/** + * gfs2_lookup_by_inum - look up an inode by inode number + * @sdp: The super block + * @no_addr: The inode number + * @no_formal_ino: The inode generation number (0 for any) + * @blktype: Requested block type (see gfs2_inode_lookup) + */ struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, - u64 *no_formal_ino, unsigned int blktype) + u64 no_formal_ino, unsigned int blktype) { struct super_block *sb = sdp->sd_vfs; struct inode *inode; int error; - inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0, blktype); + inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, no_formal_ino, + blktype); if (IS_ERR(inode)) return inode; - /* Two extra checks for NFS only */ if (no_formal_ino) { - error = -ESTALE; - if (GFS2_I(inode)->i_no_formal_ino != *no_formal_ino) - goto fail_iput; - error = -EIO; if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM) goto fail_iput; @@ -725,6 +748,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (error) goto fail_gunlock2; + gfs2_cancel_delete_work(ip->i_iopen_gh.gh_gl); glock_set_object(ip->i_iopen_gh.gh_gl, ip); gfs2_set_iop(inode); insert_inode_hash(inode); @@ -781,7 +805,8 @@ fail_gunlock2: fail_free_inode: if (ip->i_gl) { glock_clear_object(ip->i_gl, ip); - gfs2_glock_put(ip->i_gl); + if (free_vfs_inode) /* else evict will do the put for us */ + gfs2_glock_put(ip->i_gl); } gfs2_rs_delete(ip, NULL); gfs2_qa_put(ip); diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 580adbf0b5e1..b52ecf4ffe63 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -92,7 +92,7 @@ extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, u64 no_addr, u64 no_formal_ino, unsigned int blktype); extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, - u64 *no_formal_ino, + u64 no_formal_ino, unsigned int blktype); extern int gfs2_inode_refresh(struct gfs2_inode *ip); diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 0644e58c6191..3e4734431783 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -30,6 +30,7 @@ #include "util.h" #include "dir.h" #include "trace_gfs2.h" +#include "trans.h" static void gfs2_log_shutdown(struct gfs2_sbd *sdp); @@ -145,9 +146,6 @@ static void dump_ail_list(struct gfs2_sbd *sdp) struct gfs2_bufdata *bd; struct buffer_head *bh; - fs_err(sdp, "Error: In gfs2_ail1_flush for ten minutes! t=%d\n", - current->journal_info ? 1 : 0); - list_for_each_entry_reverse(tr, &sdp->sd_ail1_list, tr_list) { list_for_each_entry_reverse(bd, &tr->tr_ail1_list, bd_ail_st_list) { @@ -197,6 +195,8 @@ void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc) restart: ret = 0; if (time_after(jiffies, flush_start + (HZ * 600))) { + fs_err(sdp, "Error: In %s for ten minutes! t=%d\n", + __func__, current->journal_info ? 1 : 0); dump_ail_list(sdp); goto out; } @@ -379,7 +379,7 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail) list_del(&tr->tr_list); gfs2_assert_warn(sdp, list_empty(&tr->tr_ail1_list)); gfs2_assert_warn(sdp, list_empty(&tr->tr_ail2_list)); - kfree(tr); + gfs2_trans_free(sdp, tr); } spin_unlock(&sdp->sd_ail_lock); @@ -864,19 +864,41 @@ static void ail_drain(struct gfs2_sbd *sdp) gfs2_ail_empty_tr(sdp, tr, &tr->tr_ail1_list); gfs2_ail_empty_tr(sdp, tr, &tr->tr_ail2_list); list_del(&tr->tr_list); - kfree(tr); + gfs2_trans_free(sdp, tr); } while (!list_empty(&sdp->sd_ail2_list)) { tr = list_first_entry(&sdp->sd_ail2_list, struct gfs2_trans, tr_list); gfs2_ail_empty_tr(sdp, tr, &tr->tr_ail2_list); list_del(&tr->tr_list); - kfree(tr); + gfs2_trans_free(sdp, tr); } spin_unlock(&sdp->sd_ail_lock); } /** + * empty_ail1_list - try to start IO and empty the ail1 list + * @sdp: Pointer to GFS2 superblock + */ +static void empty_ail1_list(struct gfs2_sbd *sdp) +{ + unsigned long start = jiffies; + + for (;;) { + if (time_after(jiffies, start + (HZ * 600))) { + fs_err(sdp, "Error: In %s for 10 minutes! t=%d\n", + __func__, current->journal_info ? 1 : 0); + dump_ail_list(sdp); + return; + } + gfs2_ail1_start(sdp); + gfs2_ail1_wait(sdp); + if (gfs2_ail1_empty(sdp, 0)) + return; + } +} + +/** * gfs2_log_flush - flush incore transaction(s) * @sdp: the filesystem * @gl: The glock structure to flush. If NULL, flush the whole incore log @@ -912,8 +934,6 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) tr = sdp->sd_log_tr; if (tr) { sdp->sd_log_tr = NULL; - INIT_LIST_HEAD(&tr->tr_ail1_list); - INIT_LIST_HEAD(&tr->tr_ail2_list); tr->tr_first = sdp->sd_log_flush_head; if (unlikely (state == SFS_FROZEN)) if (gfs2_assert_withdraw_delayed(sdp, @@ -965,12 +985,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) if (!(flags & GFS2_LOG_HEAD_FLUSH_NORMAL)) { if (!sdp->sd_log_idle) { - for (;;) { - gfs2_ail1_start(sdp); - gfs2_ail1_wait(sdp); - if (gfs2_ail1_empty(sdp, 0)) - break; - } + empty_ail1_list(sdp); if (gfs2_withdrawn(sdp)) goto out; atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */ @@ -994,7 +1009,7 @@ out: trace_gfs2_log_flush(sdp, 0, flags); up_write(&sdp->sd_log_flush_lock); - kfree(tr); + gfs2_trans_free(sdp, tr); } /** @@ -1003,8 +1018,10 @@ out: * @new: New transaction to be merged */ -static void gfs2_merge_trans(struct gfs2_trans *old, struct gfs2_trans *new) +static void gfs2_merge_trans(struct gfs2_sbd *sdp, struct gfs2_trans *new) { + struct gfs2_trans *old = sdp->sd_log_tr; + WARN_ON_ONCE(!test_bit(TR_ATTACHED, &old->tr_flags)); old->tr_num_buf_new += new->tr_num_buf_new; @@ -1016,6 +1033,11 @@ static void gfs2_merge_trans(struct gfs2_trans *old, struct gfs2_trans *new) list_splice_tail_init(&new->tr_databuf, &old->tr_databuf); list_splice_tail_init(&new->tr_buf, &old->tr_buf); + + spin_lock(&sdp->sd_ail_lock); + list_splice_tail_init(&new->tr_ail1_list, &old->tr_ail1_list); + list_splice_tail_init(&new->tr_ail2_list, &old->tr_ail2_list); + spin_unlock(&sdp->sd_ail_lock); } static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) @@ -1027,7 +1049,7 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) gfs2_log_lock(sdp); if (sdp->sd_log_tr) { - gfs2_merge_trans(sdp->sd_log_tr, tr); + gfs2_merge_trans(sdp, tr); } else if (tr->tr_num_buf_new || tr->tr_num_databuf_new) { gfs2_assert_withdraw(sdp, test_bit(TR_ALLOCED, &tr->tr_flags)); sdp->sd_log_tr = tr; diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index a1a295b739fb..733470ca6be9 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -143,6 +143,12 @@ static int __init init_gfs2_fs(void) if (!gfs2_qadata_cachep) goto fail_cachep7; + gfs2_trans_cachep = kmem_cache_create("gfs2_trans", + sizeof(struct gfs2_trans), + 0, 0, NULL); + if (!gfs2_trans_cachep) + goto fail_cachep8; + error = register_shrinker(&gfs2_qd_shrinker); if (error) goto fail_shrinker; @@ -194,6 +200,8 @@ fail_fs2: fail_fs1: unregister_shrinker(&gfs2_qd_shrinker); fail_shrinker: + kmem_cache_destroy(gfs2_trans_cachep); +fail_cachep8: kmem_cache_destroy(gfs2_qadata_cachep); fail_cachep7: kmem_cache_destroy(gfs2_quotad_cachep); @@ -236,6 +244,7 @@ static void __exit exit_gfs2_fs(void) rcu_barrier(); mempool_destroy(gfs2_page_pool); + kmem_cache_destroy(gfs2_trans_cachep); kmem_cache_destroy(gfs2_qadata_cachep); kmem_cache_destroy(gfs2_quotad_cachep); kmem_cache_destroy(gfs2_rgrpd_cachep); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index e2b69ffcc6a8..094f5fe7c009 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -880,7 +880,7 @@ fail: } static const match_table_t nolock_tokens = { - { Opt_jid, "jid=%d\n", }, + { Opt_jid, "jid=%d", }, { Opt_err, NULL }, }; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index a321c34e3d6e..074f228ea839 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1835,7 +1835,7 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip */ ip = gl->gl_object; - if (ip || queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0) + if (ip || !gfs2_queue_delete_work(gl, 0)) gfs2_glock_put(gl); else found++; diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 956fced0a8ec..32d8d26126a1 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -626,7 +626,7 @@ int gfs2_make_fs_ro(struct gfs2_sbd *sdp) } } - flush_workqueue(gfs2_delete_workqueue); + gfs2_flush_delete_work(sdp); if (!log_write_allowed && current == sdp->sd_quotad_process) fs_warn(sdp, "The quotad daemon is withdrawing.\n"); else if (sdp->sd_quotad_process) @@ -1054,7 +1054,7 @@ static int gfs2_drop_inode(struct inode *inode) struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; gfs2_glock_hold(gl); - if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0) + if (!gfs2_queue_delete_work(gl, 0)) gfs2_glock_queue_put(gl); return false; } @@ -1258,6 +1258,55 @@ static void gfs2_glock_put_eventually(struct gfs2_glock *gl) gfs2_glock_put(gl); } +static bool gfs2_upgrade_iopen_glock(struct inode *inode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_holder *gh = &ip->i_iopen_gh; + long timeout = 5 * HZ; + int error; + + gh->gh_flags |= GL_NOCACHE; + gfs2_glock_dq_wait(gh); + + /* + * If there are no other lock holders, we'll get the lock immediately. + * Otherwise, the other nodes holding the lock will be notified about + * our locking request. If they don't have the inode open, they'll + * evict the cached inode and release the lock. Otherwise, if they + * poke the inode glock, we'll take this as an indication that they + * still need the iopen glock and that they'll take care of deleting + * the inode when they're done. As a last resort, if another node + * keeps holding the iopen glock without showing any activity on the + * inode glock, we'll eventually time out. + * + * Note that we're passing the LM_FLAG_TRY_1CB flag to the first + * locking request as an optimization to notify lock holders as soon as + * possible. Without that flag, they'd be notified implicitly by the + * second locking request. + */ + + gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, gh); + error = gfs2_glock_nq(gh); + if (error != GLR_TRYFAILED) + return !error; + + gfs2_holder_reinit(LM_ST_EXCLUSIVE, GL_ASYNC | GL_NOCACHE, gh); + error = gfs2_glock_nq(gh); + if (error) + return false; + + timeout = wait_event_interruptible_timeout(sdp->sd_async_glock_wait, + !test_bit(HIF_WAIT, &gh->gh_iflags) || + test_bit(GLF_DEMOTE, &ip->i_gl->gl_flags), + timeout); + if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) { + gfs2_glock_dq(gh); + return false; + } + return true; +} + /** * gfs2_evict_inode - Remove an inode from cache * @inode: The inode to evict @@ -1299,9 +1348,12 @@ static void gfs2_evict_inode(struct inode *inode) if (test_bit(GIF_ALLOC_FAILED, &ip->i_flags)) { BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl)); gfs2_holder_mark_uninitialized(&gh); - goto alloc_failed; + goto out_delete; } + if (test_bit(GIF_DEFERRED_DELETE, &ip->i_flags)) + goto out; + /* Deletes should never happen under memory pressure anymore. */ if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) goto out; @@ -1315,6 +1367,8 @@ static void gfs2_evict_inode(struct inode *inode) goto out; } + if (gfs2_inode_already_deleted(ip->i_gl, ip->i_no_formal_ino)) + goto out_truncate; error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED); if (error) goto out_truncate; @@ -1331,16 +1385,13 @@ static void gfs2_evict_inode(struct inode *inode) if (inode->i_nlink) goto out_truncate; -alloc_failed: +out_delete: if (gfs2_holder_initialized(&ip->i_iopen_gh) && test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { - ip->i_iopen_gh.gh_flags |= GL_NOCACHE; - gfs2_glock_dq_wait(&ip->i_iopen_gh); - gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, - &ip->i_iopen_gh); - error = gfs2_glock_nq(&ip->i_iopen_gh); - if (error) + if (!gfs2_upgrade_iopen_glock(inode)) { + gfs2_holder_uninit(&ip->i_iopen_gh); goto out_truncate; + } } if (S_ISDIR(inode->i_mode) && @@ -1368,6 +1419,7 @@ alloc_failed: that subsequent inode creates don't see an old gl_object. */ glock_clear_object(ip->i_gl, ip); error = gfs2_dinode_dealloc(ip); + gfs2_inode_remember_delete(ip->i_gl, ip->i_no_formal_ino); goto out_unlock; out_truncate: diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index ffe840505082..a3dfa3aa87ad 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -37,7 +37,7 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) return -EROFS; - tr = kzalloc(sizeof(struct gfs2_trans), GFP_NOFS); + tr = kmem_cache_zalloc(gfs2_trans_cachep, GFP_NOFS); if (!tr) return -ENOMEM; @@ -52,6 +52,8 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, tr->tr_reserved += gfs2_struct2blk(sdp, revokes); INIT_LIST_HEAD(&tr->tr_databuf); INIT_LIST_HEAD(&tr->tr_buf); + INIT_LIST_HEAD(&tr->tr_ail1_list); + INIT_LIST_HEAD(&tr->tr_ail2_list); sb_start_intwrite(sdp->sd_vfs); @@ -65,7 +67,7 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, fail: sb_end_intwrite(sdp->sd_vfs); - kfree(tr); + kmem_cache_free(gfs2_trans_cachep, tr); return error; } @@ -93,7 +95,7 @@ void gfs2_trans_end(struct gfs2_sbd *sdp) if (!test_bit(TR_TOUCHED, &tr->tr_flags)) { gfs2_log_release(sdp, tr->tr_reserved); if (alloced) { - kfree(tr); + gfs2_trans_free(sdp, tr); sb_end_intwrite(sdp->sd_vfs); } return; @@ -109,7 +111,7 @@ void gfs2_trans_end(struct gfs2_sbd *sdp) gfs2_log_commit(sdp, tr); if (alloced && !test_bit(TR_ATTACHED, &tr->tr_flags)) - kfree(tr); + gfs2_trans_free(sdp, tr); up_read(&sdp->sd_log_flush_lock); if (sdp->sd_vfs->s_flags & SB_SYNCHRONOUS) @@ -276,3 +278,14 @@ void gfs2_trans_remove_revoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len) gfs2_log_unlock(sdp); } +void gfs2_trans_free(struct gfs2_sbd *sdp, struct gfs2_trans *tr) +{ + if (tr == NULL) + return; + + gfs2_assert_warn(sdp, list_empty(&tr->tr_ail1_list)); + gfs2_assert_warn(sdp, list_empty(&tr->tr_ail2_list)); + gfs2_assert_warn(sdp, list_empty(&tr->tr_databuf)); + gfs2_assert_warn(sdp, list_empty(&tr->tr_buf)); + kmem_cache_free(gfs2_trans_cachep, tr); +} diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h index 6071334de035..83199ce5a5c5 100644 --- a/fs/gfs2/trans.h +++ b/fs/gfs2/trans.h @@ -42,5 +42,6 @@ extern void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh); extern void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh); extern void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); extern void gfs2_trans_remove_revoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len); +extern void gfs2_trans_free(struct gfs2_sbd *sdp, struct gfs2_trans *tr); #endif /* __TRANS_DOT_H__ */ diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index aa087a5675af..1cd0328cae20 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -32,6 +32,7 @@ struct kmem_cache *gfs2_bufdata_cachep __read_mostly; struct kmem_cache *gfs2_rgrpd_cachep __read_mostly; struct kmem_cache *gfs2_quotad_cachep __read_mostly; struct kmem_cache *gfs2_qadata_cachep __read_mostly; +struct kmem_cache *gfs2_trans_cachep __read_mostly; mempool_t *gfs2_page_pool __read_mostly; void gfs2_assert_i(struct gfs2_sbd *sdp) diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index a3542560da6f..6d9157efe16c 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h @@ -172,6 +172,7 @@ extern struct kmem_cache *gfs2_bufdata_cachep; extern struct kmem_cache *gfs2_rgrpd_cachep; extern struct kmem_cache *gfs2_quotad_cachep; extern struct kmem_cache *gfs2_qadata_cachep; +extern struct kmem_cache *gfs2_trans_cachep; extern mempool_t *gfs2_page_pool; extern struct workqueue_struct *gfs2_control_wq; diff --git a/include/uapi/linux/gfs2_ondisk.h b/include/uapi/linux/gfs2_ondisk.h index 2dc10a034de1..07e508e6691b 100644 --- a/include/uapi/linux/gfs2_ondisk.h +++ b/include/uapi/linux/gfs2_ondisk.h @@ -171,6 +171,12 @@ struct gfs2_rindex { #define GFS2_RGF_NOALLOC 0x00000008 #define GFS2_RGF_TRIMMED 0x00000010 +struct gfs2_inode_lvb { + __be32 ri_magic; + __be32 __pad; + __be64 ri_generation_deleted; +}; + struct gfs2_rgrp_lvb { __be32 rl_magic; __be32 rl_flags; |