raid5: relieve lock contention in get_active_stripe()

get_active_stripe() is the last place we have lock contention. It has two paths: in the first, the stripe isn't found and a new stripe is allocated; in the second, the stripe is found. The first path basically calls __find_stripe and init_stripe. It accesses conf->generation, conf->previous_raid_disks, conf->raid_disks, conf->prev_chunk_sectors, conf->chunk_sectors, conf->max_degraded, conf->prev_algo, conf->algorithm, the stripe_hashtbl and inactive_list. Except for stripe_hashtbl and inactive_list, these fields are changed very rarely. With this patch, we split inactive_list and add new hash locks. Each free stripe belongs to a specific inactive list, which is determined by the stripe's lock_hash. Note that even a stripe that has no sector assigned has a lock_hash assigned. A stripe's inactive list is protected by a hash lock, which is also determined by its lock_hash. The lock_hash is derived from the current stripe_hashtbl hash, which guarantees that any stripe_hashtbl list will be assigned to a specific lock_hash, so we can use the new hash lock to protect the stripe_hashtbl list too. The goal of the new hash locks is that the first path of get_active_stripe() needs to take only the new locks. Since we have several hash locks, lock contention is relieved significantly. The first path of get_active_stripe() also accesses the other fields; since they are changed rarely, changing them now requires taking conf->device_lock and all hash locks. For a slow path, this isn't a problem. If we need to take both device_lock and a hash lock, we always take the hash lock first. The tricky part is release_stripe and friends, where we need to take device_lock first. Neil's suggestion is to put inactive stripes on a temporary list and re-add them to inactive_list after device_lock is released. In this way, we add stripes to the temporary list with device_lock held and remove stripes from the list with the hash lock held. 
So we don't allow concurrent access to the temporary list, which means we need to allocate a temporary list for each participant of release_stripe. One downside is that free stripes are maintained in their own inactive list and can't move between the lists. By default, we have a total of 256 stripes and 8 lists, so each list will have 32 stripes. It's possible that one list has free stripes while another list has none. This should be rare because stripe allocations are evenly distributed. And we can always allocate more stripes for the cache; several megabytes of memory isn't a big deal. This completely removes the lock contention of the first path of get_active_stripe(). It slows down the second code path a little bit, though, because we now need to take two locks, but since the hash lock isn't contended, the overhead should be quite small (several atomic instructions). The second path of get_active_stripe() (basically sequential write or big request size randwrite) still has lock contention. Signed-off-by: Shaohua Li <shli@fusionio.com> Signed-off-by: NeilBrown <neilb@suse.de>
author: Shaohua Li <shli@kernel.org> 2013-11-14 05:16:17 +0100
committer: NeilBrown <neilb@suse.de> 2013-11-14 05:20:58 +0100
commit: 566c09c53455d7c4f1130928ef8071da1a24ea65 (patch)
tree: 8cc5de15ad5aa9027ca028844c16de3dc9532e95 /drivers/md/raid5.h
parent: wait: add wait_event_cmd() (diff)
download: linux-566c09c53455d7c4f1130928ef8071da1a24ea65.tar.xz
linux-566c09c53455d7c4f1130928ef8071da1a24ea65.zip
1 files changed, 14 insertions, 1 deletions
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 2113ffa82c7a..a9e443a1116f 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -205,6 +205,7 @@ struct stripe_head {
 	short			pd_idx;		/* parity disk index */
 	short			qd_idx;		/* 'Q' disk index for raid6 */
 	short			ddf_layout;/* use DDF ordering to calculate Q */
+	short			hash_lock_index;
 	unsigned long		state;		/* state flags */
 	atomic_t		count;	      /* nr of active thread/requests */
 	int			bm_seq;	/* sequence number for bitmap flushes */
@@ -367,9 +368,18 @@ struct disk_info {
 	struct md_rdev	*rdev, *replacement;
 };
 
+/* NOTE NR_STRIPE_HASH_LOCKS must remain below 64.
+ * This is because we sometimes take all the spinlocks
+ * and creating that much locking depth can cause
+ * problems.
+ */
+#define NR_STRIPE_HASH_LOCKS 8
+#define STRIPE_HASH_LOCKS_MASK (NR_STRIPE_HASH_LOCKS - 1)
+
 struct r5worker {
 	struct work_struct work;
 	struct r5worker_group *group;
+	struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
 	bool working;
 };
 
@@ -382,6 +392,8 @@ struct r5worker_group {
 
 struct r5conf {
 	struct hlist_head	*stripe_hashtbl;
+	/* only protect corresponding hash list and inactive_list */
+	spinlock_t		hash_locks[NR_STRIPE_HASH_LOCKS];
 	struct mddev		*mddev;
 	int			chunk_sectors;
 	int			level, algorithm;
@@ -462,7 +474,7 @@ struct r5conf {
 	 * Free stripes pool
 	 */
 	atomic_t		active_stripes;
-	struct list_head	inactive_list;
+	struct list_head	inactive_list[NR_STRIPE_HASH_LOCKS];
 	struct llist_head	released_stripes;
 	wait_queue_head_t	wait_for_stripe;
 	wait_queue_head_t	wait_for_overlap;
@@ -477,6 +489,7 @@ struct r5conf {
 	 * the new thread here until we fully activate the array.
 	 */
 	struct md_thread	*thread;
+	struct list_head	temp_inactive_list[NR_STRIPE_HASH_LOCKS];
 	struct r5worker_group	*worker_groups;
 	int			group_cnt;
 	int			worker_cnt_per_group;
author	Shaohua Li <shli@kernel.org>	2013-11-14 05:16:17 +0100
committer	NeilBrown <neilb@suse.de>	2013-11-14 05:20:58 +0100
commit	566c09c53455d7c4f1130928ef8071da1a24ea65 (patch)
tree	8cc5de15ad5aa9027ca028844c16de3dc9532e95 /drivers/md/raid5.h
parent	wait: add wait_event_cmd() (diff)
download	linux-566c09c53455d7c4f1130928ef8071da1a24ea65.tar.xz linux-566c09c53455d7c4f1130928ef8071da1a24ea65.zip