diff options
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/Kconfig | 23 | ||||
-rw-r--r-- | drivers/md/Makefile | 7 | ||||
-rw-r--r-- | drivers/md/bitmap.c | 17 | ||||
-rw-r--r-- | drivers/md/dm-crypt.c | 1 | ||||
-rw-r--r-- | drivers/md/dm-emc.c | 345 | ||||
-rw-r--r-- | drivers/md/dm-hw-handler.c | 213 | ||||
-rw-r--r-- | drivers/md/dm-hw-handler.h | 63 | ||||
-rw-r--r-- | drivers/md/dm-mpath-hp-sw.c | 247 | ||||
-rw-r--r-- | drivers/md/dm-mpath-rdac.c | 700 | ||||
-rw-r--r-- | drivers/md/dm-mpath.c | 163 | ||||
-rw-r--r-- | drivers/md/dm-mpath.h | 1 | ||||
-rw-r--r-- | drivers/md/linear.c | 10 | ||||
-rw-r--r-- | drivers/md/md.c | 87 | ||||
-rw-r--r-- | drivers/md/multipath.c | 3 | ||||
-rw-r--r-- | drivers/md/raid0.c | 10 | ||||
-rw-r--r-- | drivers/md/raid1.c | 29 | ||||
-rw-r--r-- | drivers/md/raid10.c | 31 | ||||
-rw-r--r-- | drivers/md/raid5.c | 78 |
18 files changed, 281 insertions, 1747 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 610af916891e..07d92c11b5d8 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -252,27 +252,14 @@ config DM_ZERO config DM_MULTIPATH tristate "Multipath target" depends on BLK_DEV_DM + # nasty syntax but means make DM_MULTIPATH independent + # of SCSI_DH if the latter isn't defined but if + # it is, DM_MULTIPATH must depend on it. We get a build + # error if SCSI_DH=m and DM_MULTIPATH=y + depends on SCSI_DH || !SCSI_DH ---help--- Allow volume managers to support multipath hardware. -config DM_MULTIPATH_EMC - tristate "EMC CX/AX multipath support" - depends on DM_MULTIPATH && BLK_DEV_DM - ---help--- - Multipath support for EMC CX/AX series hardware. - -config DM_MULTIPATH_RDAC - tristate "LSI/Engenio RDAC multipath support (EXPERIMENTAL)" - depends on DM_MULTIPATH && BLK_DEV_DM && SCSI && EXPERIMENTAL - ---help--- - Multipath support for LSI/Engenio RDAC. - -config DM_MULTIPATH_HP - tristate "HP MSA multipath support (EXPERIMENTAL)" - depends on DM_MULTIPATH && BLK_DEV_DM && SCSI && EXPERIMENTAL - ---help--- - Multipath support for HP MSA (Active/Passive) series hardware. - config DM_DELAY tristate "I/O delaying target (EXPERIMENTAL)" depends on BLK_DEV_DM && EXPERIMENTAL diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 7be09eeea293..f1ef33dfd8cf 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -4,11 +4,9 @@ dm-mod-objs := dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \ dm-ioctl.o dm-io.o dm-kcopyd.o -dm-multipath-objs := dm-hw-handler.o dm-path-selector.o dm-mpath.o +dm-multipath-objs := dm-path-selector.o dm-mpath.o dm-snapshot-objs := dm-snap.o dm-exception-store.o dm-mirror-objs := dm-raid1.o -dm-rdac-objs := dm-mpath-rdac.o -dm-hp-sw-objs := dm-mpath-hp-sw.o md-mod-objs := md.o bitmap.o raid456-objs := raid5.o raid6algos.o raid6recov.o raid6tables.o \ raid6int1.o raid6int2.o raid6int4.o \ @@ -35,9 +33,6 @@ obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o obj-$(CONFIG_DM_CRYPT) += dm-crypt.o obj-$(CONFIG_DM_DELAY) += dm-delay.o obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o -obj-$(CONFIG_DM_MULTIPATH_EMC) += dm-emc.o -obj-$(CONFIG_DM_MULTIPATH_HP) += dm-hp-sw.o -obj-$(CONFIG_DM_MULTIPATH_RDAC) += dm-rdac.o obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o obj-$(CONFIG_DM_ZERO) += dm-zero.o diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index c14dacdacfac..b26927ce889c 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -203,17 +203,6 @@ static void bitmap_checkfree(struct bitmap *bitmap, unsigned long page) * bitmap file handling - read and write the bitmap file and its superblock */ -/* copy the pathname of a file to a buffer */ -char *file_path(struct file *file, char *buf, int count) -{ - if (!buf) - return NULL; - - buf = d_path(&file->f_path, buf, count); - - return IS_ERR(buf) ? NULL : buf; -} - /* * basic page I/O operations */ @@ -721,11 +710,13 @@ static void bitmap_file_kick(struct bitmap *bitmap) if (bitmap->file) { path = kmalloc(PAGE_SIZE, GFP_KERNEL); if (path) - ptr = file_path(bitmap->file, path, PAGE_SIZE); + ptr = d_path(&bitmap->file->f_path, path, + PAGE_SIZE); + printk(KERN_ALERT "%s: kicking failed bitmap file %s from array!\n", - bmname(bitmap), ptr ? ptr : ""); + bmname(bitmap), IS_ERR(ptr) ? "" : ptr); kfree(path); } else diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 835def11419d..ab6a61db63ce 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -432,6 +432,7 @@ static int crypt_convert(struct crypt_config *cc, case 0: atomic_dec(&ctx->pending); ctx->sector++; + cond_resched(); continue; /* error */ diff --git a/drivers/md/dm-emc.c b/drivers/md/dm-emc.c deleted file mode 100644 index 3ea5ad4b7805..000000000000 --- a/drivers/md/dm-emc.c +++ /dev/null @@ -1,345 +0,0 @@ -/* - * Copyright (C) 2004 SUSE LINUX Products GmbH. All rights reserved. - * Copyright (C) 2004 Red Hat, Inc. All rights reserved. - * - * This file is released under the GPL. - * - * Multipath support for EMC CLARiiON AX/CX-series hardware. - */ - -#include "dm.h" -#include "dm-hw-handler.h" -#include <scsi/scsi.h> -#include <scsi/scsi_cmnd.h> - -#define DM_MSG_PREFIX "multipath emc" - -struct emc_handler { - spinlock_t lock; - - /* Whether we should send the short trespass command (FC-series) - * or the long version (default for AX/CX CLARiiON arrays). */ - unsigned short_trespass; - /* Whether or not to honor SCSI reservations when initiating a - * switch-over. Default: Don't. */ - unsigned hr; - - unsigned char sense[SCSI_SENSE_BUFFERSIZE]; -}; - -#define TRESPASS_PAGE 0x22 -#define EMC_FAILOVER_TIMEOUT (60 * HZ) - -/* Code borrowed from dm-lsi-rdac by Mike Christie */ - -static inline void free_bio(struct bio *bio) -{ - __free_page(bio->bi_io_vec[0].bv_page); - bio_put(bio); -} - -static void emc_endio(struct bio *bio, int error) -{ - struct dm_path *path = bio->bi_private; - - /* We also need to look at the sense keys here whether or not to - * switch to the next PG etc. - * - * For now simple logic: either it works or it doesn't. - */ - if (error) - dm_pg_init_complete(path, MP_FAIL_PATH); - else - dm_pg_init_complete(path, 0); - - /* request is freed in block layer */ - free_bio(bio); -} - -static struct bio *get_failover_bio(struct dm_path *path, unsigned data_size) -{ - struct bio *bio; - struct page *page; - - bio = bio_alloc(GFP_ATOMIC, 1); - if (!bio) { - DMERR("get_failover_bio: bio_alloc() failed."); - return NULL; - } - - bio->bi_rw |= (1 << BIO_RW); - bio->bi_bdev = path->dev->bdev; - bio->bi_sector = 0; - bio->bi_private = path; - bio->bi_end_io = emc_endio; - - page = alloc_page(GFP_ATOMIC); - if (!page) { - DMERR("get_failover_bio: alloc_page() failed."); - bio_put(bio); - return NULL; - } - - if (bio_add_page(bio, page, data_size, 0) != data_size) { - DMERR("get_failover_bio: bio_add_page() failed."); - __free_page(page); - bio_put(bio); - return NULL; - } - - return bio; -} - -static struct request *get_failover_req(struct emc_handler *h, - struct bio *bio, struct dm_path *path) -{ - struct request *rq; - struct block_device *bdev = bio->bi_bdev; - struct request_queue *q = bdev_get_queue(bdev); - - /* FIXME: Figure out why it fails with GFP_ATOMIC. */ - rq = blk_get_request(q, WRITE, __GFP_WAIT); - if (!rq) { - DMERR("get_failover_req: blk_get_request failed"); - return NULL; - } - - blk_rq_append_bio(q, rq, bio); - - rq->sense = h->sense; - memset(rq->sense, 0, SCSI_SENSE_BUFFERSIZE); - rq->sense_len = 0; - - rq->timeout = EMC_FAILOVER_TIMEOUT; - rq->cmd_type = REQ_TYPE_BLOCK_PC; - rq->cmd_flags |= REQ_FAILFAST | REQ_NOMERGE; - - return rq; -} - -static struct request *emc_trespass_get(struct emc_handler *h, - struct dm_path *path) -{ - struct bio *bio; - struct request *rq; - unsigned char *page22; - unsigned char long_trespass_pg[] = { - 0, 0, 0, 0, - TRESPASS_PAGE, /* Page code */ - 0x09, /* Page length - 2 */ - h->hr ? 0x01 : 0x81, /* Trespass code + Honor reservation bit */ - 0xff, 0xff, /* Trespass target */ - 0, 0, 0, 0, 0, 0 /* Reserved bytes / unknown */ - }; - unsigned char short_trespass_pg[] = { - 0, 0, 0, 0, - TRESPASS_PAGE, /* Page code */ - 0x02, /* Page length - 2 */ - h->hr ? 0x01 : 0x81, /* Trespass code + Honor reservation bit */ - 0xff, /* Trespass target */ - }; - unsigned data_size = h->short_trespass ? sizeof(short_trespass_pg) : - sizeof(long_trespass_pg); - - /* get bio backing */ - if (data_size > PAGE_SIZE) - /* this should never happen */ - return NULL; - - bio = get_failover_bio(path, data_size); - if (!bio) { - DMERR("emc_trespass_get: no bio"); - return NULL; - } - - page22 = (unsigned char *)bio_data(bio); - memset(page22, 0, data_size); - - memcpy(page22, h->short_trespass ? - short_trespass_pg : long_trespass_pg, data_size); - - /* get request for block layer packet command */ - rq = get_failover_req(h, bio, path); - if (!rq) { - DMERR("emc_trespass_get: no rq"); - free_bio(bio); - return NULL; - } - - /* Prepare the command. */ - rq->cmd[0] = MODE_SELECT; - rq->cmd[1] = 0x10; - rq->cmd[4] = data_size; - rq->cmd_len = COMMAND_SIZE(rq->cmd[0]); - - return rq; -} - -static void emc_pg_init(struct hw_handler *hwh, unsigned bypassed, - struct dm_path *path) -{ - struct request *rq; - struct request_queue *q = bdev_get_queue(path->dev->bdev); - - /* - * We can either blindly init the pg (then look at the sense), - * or we can send some commands to get the state here (then - * possibly send the fo cmnd), or we can also have the - * initial state passed into us and then get an update here. - */ - if (!q) { - DMINFO("emc_pg_init: no queue"); - goto fail_path; - } - - /* FIXME: The request should be pre-allocated. */ - rq = emc_trespass_get(hwh->context, path); - if (!rq) { - DMERR("emc_pg_init: no rq"); - goto fail_path; - } - - DMINFO("emc_pg_init: sending switch-over command"); - elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 1); - return; - -fail_path: - dm_pg_init_complete(path, MP_FAIL_PATH); -} - -static struct emc_handler *alloc_emc_handler(void) -{ - struct emc_handler *h = kzalloc(sizeof(*h), GFP_KERNEL); - - if (h) - spin_lock_init(&h->lock); - - return h; -} - -static int emc_create(struct hw_handler *hwh, unsigned argc, char **argv) -{ - struct emc_handler *h; - unsigned hr, short_trespass; - - if (argc == 0) { - /* No arguments: use defaults */ - hr = 0; - short_trespass = 0; - } else if (argc != 2) { - DMWARN("incorrect number of arguments"); - return -EINVAL; - } else { - if ((sscanf(argv[0], "%u", &short_trespass) != 1) - || (short_trespass > 1)) { - DMWARN("invalid trespass mode selected"); - return -EINVAL; - } - - if ((sscanf(argv[1], "%u", &hr) != 1) - || (hr > 1)) { - DMWARN("invalid honor reservation flag selected"); - return -EINVAL; - } - } - - h = alloc_emc_handler(); - if (!h) - return -ENOMEM; - - hwh->context = h; - - if ((h->short_trespass = short_trespass)) - DMWARN("short trespass command will be send"); - else - DMWARN("long trespass command will be send"); - - if ((h->hr = hr)) - DMWARN("honor reservation bit will be set"); - else - DMWARN("honor reservation bit will not be set (default)"); - - return 0; -} - -static void emc_destroy(struct hw_handler *hwh) -{ - struct emc_handler *h = (struct emc_handler *) hwh->context; - - kfree(h); - hwh->context = NULL; -} - -static unsigned emc_error(struct hw_handler *hwh, struct bio *bio) -{ - /* FIXME: Patch from axboe still missing */ -#if 0 - int sense; - - if (bio->bi_error & BIO_SENSE) { - sense = bio->bi_error & 0xffffff; /* sense key / asc / ascq */ - - if (sense == 0x020403) { - /* LUN Not Ready - Manual Intervention Required - * indicates this is a passive path. - * - * FIXME: However, if this is seen and EVPD C0 - * indicates that this is due to a NDU in - * progress, we should set FAIL_PATH too. - * This indicates we might have to do a SCSI - * inquiry in the end_io path. Ugh. */ - return MP_BYPASS_PG | MP_RETRY_IO; - } else if (sense == 0x052501) { - /* An array based copy is in progress. Do not - * fail the path, do not bypass to another PG, - * do not retry. Fail the IO immediately. - * (Actually this is the same conclusion as in - * the default handler, but lets make sure.) */ - return 0; - } else if (sense == 0x062900) { - /* Unit Attention Code. This is the first IO - * to the new path, so just retry. */ - return MP_RETRY_IO; - } - } -#endif - - /* Try default handler */ - return dm_scsi_err_handler(hwh, bio); -} - -static struct hw_handler_type emc_hwh = { - .name = "emc", - .module = THIS_MODULE, - .create = emc_create, - .destroy = emc_destroy, - .pg_init = emc_pg_init, - .error = emc_error, -}; - -static int __init dm_emc_init(void) -{ - int r = dm_register_hw_handler(&emc_hwh); - - if (r < 0) - DMERR("register failed %d", r); - - DMINFO("version 0.0.3 loaded"); - - return r; -} - -static void __exit dm_emc_exit(void) -{ - int r = dm_unregister_hw_handler(&emc_hwh); - - if (r < 0) - DMERR("unregister failed %d", r); -} - -module_init(dm_emc_init); -module_exit(dm_emc_exit); - -MODULE_DESCRIPTION(DM_NAME " EMC CX/AX/FC-family multipath"); -MODULE_AUTHOR("Lars Marowsky-Bree <lmb@suse.de>"); -MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-hw-handler.c b/drivers/md/dm-hw-handler.c deleted file mode 100644 index 2ee84d8aa0bf..000000000000 --- a/drivers/md/dm-hw-handler.c +++ /dev/null @@ -1,213 +0,0 @@ -/* - * Copyright (C) 2004 Red Hat, Inc. All rights reserved. - * - * This file is released under the GPL. - * - * Multipath hardware handler registration. - */ - -#include "dm.h" -#include "dm-hw-handler.h" - -#include <linux/slab.h> - -struct hwh_internal { - struct hw_handler_type hwht; - - struct list_head list; - long use; -}; - -#define hwht_to_hwhi(__hwht) container_of((__hwht), struct hwh_internal, hwht) - -static LIST_HEAD(_hw_handlers); -static DECLARE_RWSEM(_hwh_lock); - -static struct hwh_internal *__find_hw_handler_type(const char *name) -{ - struct hwh_internal *hwhi; - - list_for_each_entry(hwhi, &_hw_handlers, list) { - if (!strcmp(name, hwhi->hwht.name)) - return hwhi; - } - - return NULL; -} - -static struct hwh_internal *get_hw_handler(const char *name) -{ - struct hwh_internal *hwhi; - - down_read(&_hwh_lock); - hwhi = __find_hw_handler_type(name); - if (hwhi) { - if ((hwhi->use == 0) && !try_module_get(hwhi->hwht.module)) - hwhi = NULL; - else - hwhi->use++; - } - up_read(&_hwh_lock); - - return hwhi; -} - -struct hw_handler_type *dm_get_hw_handler(const char *name) -{ - struct hwh_internal *hwhi; - - if (!name) - return NULL; - - hwhi = get_hw_handler(name); - if (!hwhi) { - request_module("dm-%s", name); - hwhi = get_hw_handler(name); - } - - return hwhi ? &hwhi->hwht : NULL; -} - -void dm_put_hw_handler(struct hw_handler_type *hwht) -{ - struct hwh_internal *hwhi; - - if (!hwht) - return; - - down_read(&_hwh_lock); - hwhi = __find_hw_handler_type(hwht->name); - if (!hwhi) - goto out; - - if (--hwhi->use == 0) - module_put(hwhi->hwht.module); - - BUG_ON(hwhi->use < 0); - - out: - up_read(&_hwh_lock); -} - -static struct hwh_internal *_alloc_hw_handler(struct hw_handler_type *hwht) -{ - struct hwh_internal *hwhi = kzalloc(sizeof(*hwhi), GFP_KERNEL); - - if (hwhi) - hwhi->hwht = *hwht; - - return hwhi; -} - -int dm_register_hw_handler(struct hw_handler_type *hwht) -{ - int r = 0; - struct hwh_internal *hwhi = _alloc_hw_handler(hwht); - - if (!hwhi) - return -ENOMEM; - - down_write(&_hwh_lock); - - if (__find_hw_handler_type(hwht->name)) { - kfree(hwhi); - r = -EEXIST; - } else - list_add(&hwhi->list, &_hw_handlers); - - up_write(&_hwh_lock); - - return r; -} - -int dm_unregister_hw_handler(struct hw_handler_type *hwht) -{ - struct hwh_internal *hwhi; - - down_write(&_hwh_lock); - - hwhi = __find_hw_handler_type(hwht->name); - if (!hwhi) { - up_write(&_hwh_lock); - return -EINVAL; - } - - if (hwhi->use) { - up_write(&_hwh_lock); - return -ETXTBSY; - } - - list_del(&hwhi->list); - - up_write(&_hwh_lock); - - kfree(hwhi); - - return 0; -} - -unsigned dm_scsi_err_handler(struct hw_handler *hwh, struct bio *bio) -{ -#if 0 - int sense_key, asc, ascq; - - if (bio->bi_error & BIO_SENSE) { - /* FIXME: This is just an initial guess. */ - /* key / asc / ascq */ - sense_key = (bio->bi_error >> 16) & 0xff; - asc = (bio->bi_error >> 8) & 0xff; - ascq = bio->bi_error & 0xff; - - switch (sense_key) { - /* This block as a whole comes from the device. - * So no point retrying on another path. */ - case 0x03: /* Medium error */ - case 0x05: /* Illegal request */ - case 0x07: /* Data protect */ - case 0x08: /* Blank check */ - case 0x0a: /* copy aborted */ - case 0x0c: /* obsolete - no clue ;-) */ - case 0x0d: /* volume overflow */ - case 0x0e: /* data miscompare */ - case 0x0f: /* reserved - no idea either. */ - return MP_ERROR_IO; - - /* For these errors it's unclear whether they - * come from the device or the controller. - * So just lets try a different path, and if - * it eventually succeeds, user-space will clear - * the paths again... */ - case 0x02: /* Not ready */ - case 0x04: /* Hardware error */ - case 0x09: /* vendor specific */ - case 0x0b: /* Aborted command */ - return MP_FAIL_PATH; - - case 0x06: /* Unit attention - might want to decode */ - if (asc == 0x04 && ascq == 0x01) - /* "Unit in the process of - * becoming ready" */ - return 0; - return MP_FAIL_PATH; - - /* FIXME: For Unit Not Ready we may want - * to have a generic pg activation - * feature (START_UNIT). */ - - /* Should these two ever end up in the - * error path? I don't think so. */ - case 0x00: /* No sense */ - case 0x01: /* Recovered error */ - return 0; - } - } -#endif - - /* We got no idea how to decode the other kinds of errors -> - * assume generic error condition. */ - return MP_FAIL_PATH; -} - -EXPORT_SYMBOL_GPL(dm_register_hw_handler); -EXPORT_SYMBOL_GPL(dm_unregister_hw_handler); -EXPORT_SYMBOL_GPL(dm_scsi_err_handler); diff --git a/drivers/md/dm-hw-handler.h b/drivers/md/dm-hw-handler.h deleted file mode 100644 index 46809dcb121a..000000000000 --- a/drivers/md/dm-hw-handler.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (C) 2004 Red Hat, Inc. All rights reserved. - * - * This file is released under the GPL. - * - * Multipath hardware handler registration. - */ - -#ifndef DM_HW_HANDLER_H -#define DM_HW_HANDLER_H - -#include <linux/device-mapper.h> - -#include "dm-mpath.h" - -struct hw_handler_type; -struct hw_handler { - struct hw_handler_type *type; - struct mapped_device *md; - void *context; -}; - -/* - * Constructs a hardware handler object, takes custom arguments - */ -/* Information about a hardware handler type */ -struct hw_handler_type { - char *name; - struct module *module; - - int (*create) (struct hw_handler *handler, unsigned int argc, - char **argv); - void (*destroy) (struct hw_handler *hwh); - - void (*pg_init) (struct hw_handler *hwh, unsigned bypassed, - struct dm_path *path); - unsigned (*error) (struct hw_handler *hwh, struct bio *bio); - int (*status) (struct hw_handler *hwh, status_type_t type, - char *result, unsigned int maxlen); -}; - -/* Register a hardware handler */ -int dm_register_hw_handler(struct hw_handler_type *type); - -/* Unregister a hardware handler */ -int dm_unregister_hw_handler(struct hw_handler_type *type); - -/* Returns a registered hardware handler type */ -struct hw_handler_type *dm_get_hw_handler(const char *name); - -/* Releases a hardware handler */ -void dm_put_hw_handler(struct hw_handler_type *hwht); - -/* Default err function */ -unsigned dm_scsi_err_handler(struct hw_handler *hwh, struct bio *bio); - -/* Error flags for err and dm_pg_init_complete */ -#define MP_FAIL_PATH 1 -#define MP_BYPASS_PG 2 -#define MP_ERROR_IO 4 /* Don't retry this I/O */ -#define MP_RETRY 8 - -#endif diff --git a/drivers/md/dm-mpath-hp-sw.c b/drivers/md/dm-mpath-hp-sw.c deleted file mode 100644 index b63a0ab37c53..000000000000 --- a/drivers/md/dm-mpath-hp-sw.c +++ /dev/null @@ -1,247 +0,0 @@ -/* - * Copyright (C) 2005 Mike Christie, All rights reserved. - * Copyright (C) 2007 Red Hat, Inc. All rights reserved. - * Authors: Mike Christie - * Dave Wysochanski - * - * This file is released under the GPL. - * - * This module implements the specific path activation code for - * HP StorageWorks and FSC FibreCat Asymmetric (Active/Passive) - * storage arrays. - * These storage arrays have controller-based failover, not - * LUN-based failover. However, LUN-based failover is the design - * of dm-multipath. Thus, this module is written for LUN-based failover. - */ -#include <linux/blkdev.h> -#include <linux/list.h> -#include <linux/types.h> -#include <scsi/scsi.h> -#include <scsi/scsi_cmnd.h> -#include <scsi/scsi_dbg.h> - -#include "dm.h" -#include "dm-hw-handler.h" - -#define DM_MSG_PREFIX "multipath hp-sw" -#define DM_HP_HWH_NAME "hp-sw" -#define DM_HP_HWH_VER "1.0.0" - -struct hp_sw_context { - unsigned char sense[SCSI_SENSE_BUFFERSIZE]; -}; - -/* - * hp_sw_error_is_retryable - Is an HP-specific check condition retryable? - * @req: path activation request - * - * Examine error codes of request and determine whether the error is retryable. - * Some error codes are already retried by scsi-ml (see - * scsi_decide_disposition), but some HP specific codes are not. - * The intent of this routine is to supply the logic for the HP specific - * check conditions. - * - * Returns: - * 1 - command completed with retryable error - * 0 - command completed with non-retryable error - * - * Possible optimizations - * 1. More hardware-specific error codes - */ -static int hp_sw_error_is_retryable(struct request *req) -{ - /* - * NOT_READY is known to be retryable - * For now we just dump out the sense data and call it retryable - */ - if (status_byte(req->errors) == CHECK_CONDITION) - __scsi_print_sense(DM_HP_HWH_NAME, req->sense, req->sense_len); - - /* - * At this point we don't have complete information about all the error - * codes from this hardware, so we are just conservative and retry - * when in doubt. - */ - return 1; -} - -/* - * hp_sw_end_io - Completion handler for HP path activation. - * @req: path activation request - * @error: scsi-ml error - * - * Check sense data, free request structure, and notify dm that - * pg initialization has completed. - * - * Context: scsi-ml softirq - * - */ -static void hp_sw_end_io(struct request *req, int error) -{ - struct dm_path *path = req->end_io_data; - unsigned err_flags = 0; - - if (!error) { - DMDEBUG("%s path activation command - success", - path->dev->name); - goto out; - } - - if (hp_sw_error_is_retryable(req)) { - DMDEBUG("%s path activation command - retry", - path->dev->name); - err_flags = MP_RETRY; - goto out; - } - - DMWARN("%s path activation fail - error=0x%x", - path->dev->name, error); - err_flags = MP_FAIL_PATH; - -out: - req->end_io_data = NULL; - __blk_put_request(req->q, req); - dm_pg_init_complete(path, err_flags); -} - -/* - * hp_sw_get_request - Allocate an HP specific path activation request - * @path: path on which request will be sent (needed for request queue) - * - * The START command is used for path activation request. - * These arrays are controller-based failover, not LUN based. - * One START command issued to a single path will fail over all - * LUNs for the same controller. - * - * Possible optimizations - * 1. Make timeout configurable - * 2. Preallocate request - */ -static struct request *hp_sw_get_request(struct dm_path *path) -{ - struct request *req; - struct block_device *bdev = path->dev->bdev; - struct request_queue *q = bdev_get_queue(bdev); - struct hp_sw_context *h = path->hwhcontext; - - req = blk_get_request(q, WRITE, GFP_NOIO); - if (!req) - goto out; - - req->timeout = 60 * HZ; - - req->errors = 0; - req->cmd_type = REQ_TYPE_BLOCK_PC; - req->cmd_flags |= REQ_FAILFAST | REQ_NOMERGE; - req->end_io_data = path; - req->sense = h->sense; - memset(req->sense, 0, SCSI_SENSE_BUFFERSIZE); - - req->cmd[0] = START_STOP; - req->cmd[4] = 1; - req->cmd_len = COMMAND_SIZE(req->cmd[0]); - -out: - return req; -} - -/* - * hp_sw_pg_init - HP path activation implementation. - * @hwh: hardware handler specific data - * @bypassed: unused; is the path group bypassed? (see dm-mpath.c) - * @path: path to send initialization command - * - * Send an HP-specific path activation command on 'path'. - * Do not try to optimize in any way, just send the activation command. - * More than one path activation command may be sent to the same controller. - * This seems to work fine for basic failover support. - * - * Possible optimizations - * 1. Detect an in-progress activation request and avoid submitting another one - * 2. Model the controller and only send a single activation request at a time - * 3. Determine the state of a path before sending an activation request - * - * Context: kmpathd (see process_queued_ios() in dm-mpath.c) - */ -static void hp_sw_pg_init(struct hw_handler *hwh, unsigned bypassed, - struct dm_path *path) -{ - struct request *req; - struct hp_sw_context *h; - - path->hwhcontext = hwh->context; - h = hwh->context; - - req = hp_sw_get_request(path); - if (!req) { - DMERR("%s path activation command - allocation fail", - path->dev->name); - goto retry; - } - - DMDEBUG("%s path activation command - sent", path->dev->name); - - blk_execute_rq_nowait(req->q, NULL, req, 1, hp_sw_end_io); - return; - -retry: - dm_pg_init_complete(path, MP_RETRY); -} - -static int hp_sw_create(struct hw_handler *hwh, unsigned argc, char **argv) -{ - struct hp_sw_context *h; - - h = kmalloc(sizeof(*h), GFP_KERNEL); - if (!h) - return -ENOMEM; - - hwh->context = h; - - return 0; -} - -static void hp_sw_destroy(struct hw_handler *hwh) -{ - struct hp_sw_context *h = hwh->context; - - kfree(h); -} - -static struct hw_handler_type hp_sw_hwh = { - .name = DM_HP_HWH_NAME, - .module = THIS_MODULE, - .create = hp_sw_create, - .destroy = hp_sw_destroy, - .pg_init = hp_sw_pg_init, -}; - -static int __init hp_sw_init(void) -{ - int r; - - r = dm_register_hw_handler(&hp_sw_hwh); - if (r < 0) - DMERR("register failed %d", r); - else - DMINFO("version " DM_HP_HWH_VER " loaded"); - - return r; -} - -static void __exit hp_sw_exit(void) -{ - int r; - - r = dm_unregister_hw_handler(&hp_sw_hwh); - if (r < 0) - DMERR("unregister failed %d", r); -} - -module_init(hp_sw_init); -module_exit(hp_sw_exit); - -MODULE_DESCRIPTION("DM Multipath HP StorageWorks / FSC FibreCat (A/P) support"); -MODULE_AUTHOR("Mike Christie, Dave Wysochanski <dm-devel@redhat.com>"); -MODULE_LICENSE("GPL"); -MODULE_VERSION(DM_HP_HWH_VER); diff --git a/drivers/md/dm-mpath-rdac.c b/drivers/md/dm-mpath-rdac.c deleted file mode 100644 index 95e77734880a..000000000000 --- a/drivers/md/dm-mpath-rdac.c +++ /dev/null @@ -1,700 +0,0 @@ -/* - * Engenio/LSI RDAC DM HW handler - * - * Copyright (C) 2005 Mike Christie. All rights reserved. - * Copyright (C) Chandra Seetharaman, IBM Corp. 2007 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - */ -#include <scsi/scsi.h> -#include <scsi/scsi_cmnd.h> -#include <scsi/scsi_eh.h> - -#define DM_MSG_PREFIX "multipath rdac" - -#include "dm.h" -#include "dm-hw-handler.h" - -#define RDAC_DM_HWH_NAME "rdac" -#define RDAC_DM_HWH_VER "0.4" - -/* - * LSI mode page stuff - * - * These struct definitions and the forming of the - * mode page were taken from the LSI RDAC 2.4 GPL'd - * driver, and then converted to Linux conventions. - */ -#define RDAC_QUIESCENCE_TIME 20; -/* - * Page Codes - */ -#define RDAC_PAGE_CODE_REDUNDANT_CONTROLLER 0x2c - -/* - * Controller modes definitions - */ -#define RDAC_MODE_TRANSFER_ALL_LUNS 0x01 -#define RDAC_MODE_TRANSFER_SPECIFIED_LUNS 0x02 - -/* - * RDAC Options field - */ -#define RDAC_FORCED_QUIESENCE 0x02 - -#define RDAC_FAILOVER_TIMEOUT (60 * HZ) - -struct rdac_mode_6_hdr { - u8 data_len; - u8 medium_type; - u8 device_params; - u8 block_desc_len; -}; - -struct rdac_mode_10_hdr { - u16 data_len; - u8 medium_type; - u8 device_params; - u16 reserved; - u16 block_desc_len; -}; - -struct rdac_mode_common { - u8 controller_serial[16]; - u8 alt_controller_serial[16]; - u8 rdac_mode[2]; - u8 alt_rdac_mode[2]; - u8 quiescence_timeout; - u8 rdac_options; -}; - -struct rdac_pg_legacy { - struct rdac_mode_6_hdr hdr; - u8 page_code; - u8 page_len; - struct rdac_mode_common common; -#define MODE6_MAX_LUN 32 - u8 lun_table[MODE6_MAX_LUN]; - u8 reserved2[32]; - u8 reserved3; - u8 reserved4; -}; - -struct rdac_pg_expanded { - struct rdac_mode_10_hdr hdr; - u8 page_code; - u8 subpage_code; - u8 page_len[2]; - struct rdac_mode_common common; - u8 lun_table[256]; - u8 reserved3; - u8 reserved4; -}; - -struct c9_inquiry { - u8 peripheral_info; - u8 page_code; /* 0xC9 */ - u8 reserved1; - u8 page_len; - u8 page_id[4]; /* "vace" */ - u8 avte_cvp; - u8 path_prio; - u8 reserved2[38]; -}; - -#define SUBSYS_ID_LEN 16 -#define SLOT_ID_LEN 2 - -struct c4_inquiry { - u8 peripheral_info; - u8 page_code; /* 0xC4 */ - u8 reserved1; - u8 page_len; - u8 page_id[4]; /* "subs" */ - u8 subsys_id[SUBSYS_ID_LEN]; - u8 revision[4]; - u8 slot_id[SLOT_ID_LEN]; - u8 reserved[2]; -}; - -struct rdac_controller { - u8 subsys_id[SUBSYS_ID_LEN]; - u8 slot_id[SLOT_ID_LEN]; - int use_10_ms; - struct kref kref; - struct list_head node; /* list of all controllers */ - spinlock_t lock; - int submitted; - struct list_head cmd_list; /* list of commands to be submitted */ - union { - struct rdac_pg_legacy legacy; - struct rdac_pg_expanded expanded; - } mode_select; -}; -struct c8_inquiry { - u8 peripheral_info; - u8 page_code; /* 0xC8 */ - u8 reserved1; - u8 page_len; - u8 page_id[4]; /* "edid" */ - u8 reserved2[3]; - u8 vol_uniq_id_len; - u8 vol_uniq_id[16]; - u8 vol_user_label_len; - u8 vol_user_label[60]; - u8 array_uniq_id_len; - u8 array_unique_id[16]; - u8 array_user_label_len; - u8 array_user_label[60]; - u8 lun[8]; -}; - -struct c2_inquiry { - u8 peripheral_info; - u8 page_code; /* 0xC2 */ - u8 reserved1; - u8 page_len; - u8 page_id[4]; /* "swr4" */ - u8 sw_version[3]; - u8 sw_date[3]; - u8 features_enabled; - u8 max_lun_supported; - u8 partitions[239]; /* Total allocation length should be 0xFF */ -}; - -struct rdac_handler { - struct list_head entry; /* list waiting to submit MODE SELECT */ - unsigned timeout; - struct rdac_controller *ctlr; -#define UNINITIALIZED_LUN (1 << 8) - unsigned lun; - unsigned char sense[SCSI_SENSE_BUFFERSIZE]; - struct dm_path *path; - struct work_struct work; -#define SEND_C2_INQUIRY 1 -#define SEND_C4_INQUIRY 2 -#define SEND_C8_INQUIRY 3 -#define SEND_C9_INQUIRY 4 -#define SEND_MODE_SELECT 5 - int cmd_to_send; - union { - struct c2_inquiry c2; - struct c4_inquiry c4; - struct c8_inquiry c8; - struct c9_inquiry c9; - } inq; -}; - -static LIST_HEAD(ctlr_list); -static DEFINE_SPINLOCK(list_lock); -static struct workqueue_struct *rdac_wkqd; - -static inline int had_failures(struct request *req, int error) -{ - return (error || host_byte(req->errors) != DID_OK || - msg_byte(req->errors) != COMMAND_COMPLETE); -} - -static void rdac_resubmit_all(struct rdac_handler *h) -{ - struct rdac_controller *ctlr = h->ctlr; - struct rdac_handler *tmp, *h1; - - spin_lock(&ctlr->lock); - list_for_each_entry_safe(h1, tmp, &ctlr->cmd_list, entry) { - h1->cmd_to_send = SEND_C9_INQUIRY; - queue_work(rdac_wkqd, &h1->work); - list_del(&h1->entry); - } - ctlr->submitted = 0; - spin_unlock(&ctlr->lock); -} - -static void mode_select_endio(struct request *req, int error) -{ - struct rdac_handler *h = req->end_io_data; - struct scsi_sense_hdr sense_hdr; - int sense = 0, fail = 0; - - if (had_failures(req, error)) { - fail = 1; - goto failed; - } - - if (status_byte(req->errors) == CHECK_CONDITION) { - scsi_normalize_sense(req->sense, SCSI_SENSE_BUFFERSIZE, - &sense_hdr); - sense = (sense_hdr.sense_key << 16) | (sense_hdr.asc << 8) | - sense_hdr.ascq; - /* If it is retryable failure, submit the c9 inquiry again */ - if (sense == 0x59136 || sense == 0x68b02 || sense == 0xb8b02 || - sense == 0x62900) { - /* 0x59136 - Command lock contention - * 0x[6b]8b02 - Quiesense in progress or achieved - * 0x62900 - Power On, Reset, or Bus Device Reset - */ - h->cmd_to_send = SEND_C9_INQUIRY; - queue_work(rdac_wkqd, &h->work); - goto done; - } - if (sense) - DMINFO("MODE_SELECT failed on %s with sense 0x%x", - h->path->dev->name, sense); - } -failed: - if (fail || sense) - dm_pg_init_complete(h->path, MP_FAIL_PATH); - else - dm_pg_init_complete(h->path, 0); - -done: - rdac_resubmit_all(h); - __blk_put_request(req->q, req); -} - -static struct request *get_rdac_req(struct rdac_handler *h, - void *buffer, unsigned buflen, int rw) -{ - struct request *rq; - struct request_queue *q = bdev_get_queue(h->path->dev->bdev); - - rq = blk_get_request(q, rw, GFP_KERNEL); - - if (!rq) { - DMINFO("get_rdac_req: blk_get_request failed"); - return NULL; - } - - if (buflen && blk_rq_map_kern(q, rq, buffer, buflen, GFP_KERNEL)) { - blk_put_request(rq); - DMINFO("get_rdac_req: blk_rq_map_kern failed"); - return NULL; - } - - rq->sense = h->sense; - memset(rq->sense, 0, SCSI_SENSE_BUFFERSIZE); - rq->sense_len = 0; - - rq->end_io_data = h; - rq->timeout = h->timeout; - rq->cmd_type = REQ_TYPE_BLOCK_PC; - rq->cmd_flags |= REQ_FAILFAST | REQ_NOMERGE; - return rq; -} - -static struct request *rdac_failover_get(struct rdac_handler *h) -{ - struct request *rq; - struct rdac_mode_common *common; - unsigned data_size; - - if (h->ctlr->use_10_ms) { - struct rdac_pg_expanded *rdac_pg; - - data_size = sizeof(struct rdac_pg_expanded); - rdac_pg = &h->ctlr->mode_select.expanded; - memset(rdac_pg, 0, data_size); - common = &rdac_pg->common; - rdac_pg->page_code = RDAC_PAGE_CODE_REDUNDANT_CONTROLLER + 0x40; - rdac_pg->subpage_code = 0x1; - rdac_pg->page_len[0] = 0x01; - rdac_pg->page_len[1] = 0x28; - rdac_pg->lun_table[h->lun] = 0x81; - } else { - struct rdac_pg_legacy *rdac_pg; - - data_size = sizeof(struct rdac_pg_legacy); - rdac_pg = &h->ctlr->mode_select.legacy; - memset(rdac_pg, 0, data_size); - common = &rdac_pg->common; - rdac_pg->page_code = RDAC_PAGE_CODE_REDUNDANT_CONTROLLER; - rdac_pg->page_len = 0x68; - rdac_pg->lun_table[h->lun] = 0x81; - } - common->rdac_mode[1] = RDAC_MODE_TRANSFER_SPECIFIED_LUNS; - common->quiescence_timeout = RDAC_QUIESCENCE_TIME; - common->rdac_options = RDAC_FORCED_QUIESENCE; - - /* get request for block layer packet command */ - rq = get_rdac_req(h, &h->ctlr->mode_select, data_size, WRITE); - if (!rq) { - DMERR("rdac_failover_get: no rq"); - return NULL; - } - - /* Prepare the command. */ - if (h->ctlr->use_10_ms) { - rq->cmd[0] = MODE_SELECT_10; - rq->cmd[7] = data_size >> 8; - rq->cmd[8] = data_size & 0xff; - } else { - rq->cmd[0] = MODE_SELECT; - rq->cmd[4] = data_size; - } - rq->cmd_len = COMMAND_SIZE(rq->cmd[0]); - - return rq; -} - -/* Acquires h->ctlr->lock */ -static void submit_mode_select(struct rdac_handler *h) -{ - struct request *rq; - struct request_queue *q = bdev_get_queue(h->path->dev->bdev); - - spin_lock(&h->ctlr->lock); - if (h->ctlr->submitted) { - list_add(&h->entry, &h->ctlr->cmd_list); - goto drop_lock; - } - - if (!q) { - DMINFO("submit_mode_select: no queue"); - goto fail_path; - } - - rq = rdac_failover_get(h); - if (!rq) { - DMERR("submit_mode_select: no rq"); - goto fail_path; - } - - DMINFO("queueing MODE_SELECT command on %s", h->path->dev->name); - - blk_execute_rq_nowait(q, NULL, rq, 1, mode_select_endio); - h->ctlr->submitted = 1; - goto drop_lock; -fail_path: - dm_pg_init_complete(h->path, MP_FAIL_PATH); -drop_lock: - spin_unlock(&h->ctlr->lock); -} - -static void release_ctlr(struct kref *kref) -{ - struct rdac_controller *ctlr; - ctlr = container_of(kref, struct rdac_controller, kref); - - spin_lock(&list_lock); - list_del(&ctlr->node); - spin_unlock(&list_lock); - kfree(ctlr); -} - -static struct rdac_controller *get_controller(u8 *subsys_id, u8 *slot_id) -{ - struct rdac_controller *ctlr, *tmp; - - spin_lock(&list_lock); - - list_for_each_entry(tmp, &ctlr_list, node) { - if ((memcmp(tmp->subsys_id, subsys_id, SUBSYS_ID_LEN) == 0) && - (memcmp(tmp->slot_id, slot_id, SLOT_ID_LEN) == 0)) { - kref_get(&tmp->kref); - spin_unlock(&list_lock); - return tmp; - } - } - ctlr = kmalloc(sizeof(*ctlr), GFP_ATOMIC); - if (!ctlr) - goto done; - - /* initialize fields of controller */ - memcpy(ctlr->subsys_id, subsys_id, SUBSYS_ID_LEN); - memcpy(ctlr->slot_id, slot_id, SLOT_ID_LEN); - kref_init(&ctlr->kref); - spin_lock_init(&ctlr->lock); - ctlr->submitted = 0; - ctlr->use_10_ms = -1; - INIT_LIST_HEAD(&ctlr->cmd_list); - list_add(&ctlr->node, &ctlr_list); -done: - spin_unlock(&list_lock); - return ctlr; -} - -static void c4_endio(struct request *req, int error) -{ - struct rdac_handler *h = req->end_io_data; - struct c4_inquiry *sp; - - if (had_failures(req, error)) { - dm_pg_init_complete(h->path, MP_FAIL_PATH); - goto done; - } - - sp = &h->inq.c4; - - h->ctlr = get_controller(sp->subsys_id, sp->slot_id); - - if (h->ctlr) { - h->cmd_to_send = SEND_C9_INQUIRY; - queue_work(rdac_wkqd, &h->work); - } else - dm_pg_init_complete(h->path, MP_FAIL_PATH); -done: - __blk_put_request(req->q, req); -} - -static void c2_endio(struct request *req, int error) -{ - struct rdac_handler *h = req->end_io_data; - struct c2_inquiry *sp; - - if (had_failures(req, error)) { - dm_pg_init_complete(h->path, MP_FAIL_PATH); - goto done; - } - - sp = &h->inq.c2; - - /* If more than MODE6_MAX_LUN luns are supported, use mode select 10 */ - if (sp->max_lun_supported >= MODE6_MAX_LUN) - h->ctlr->use_10_ms = 1; - else - h->ctlr->use_10_ms = 0; - - h->cmd_to_send = SEND_MODE_SELECT; - queue_work(rdac_wkqd, &h->work); -done: - __blk_put_request(req->q, req); -} - -static void c9_endio(struct request *req, int error) -{ - struct rdac_handler *h = req->end_io_data; - struct c9_inquiry *sp; - - if (had_failures(req, error)) { - dm_pg_init_complete(h->path, MP_FAIL_PATH); - goto done; - } - - /* We need to look at the sense keys here to take clear action. - * For now simple logic: If the host is in AVT mode or if controller - * owns the lun, return dm_pg_init_complete(), otherwise submit - * MODE SELECT. - */ - sp = &h->inq.c9; - - /* If in AVT mode, return success */ - if ((sp->avte_cvp >> 7) == 0x1) { - dm_pg_init_complete(h->path, 0); - goto done; - } - - /* If the controller on this path owns the LUN, return success */ - if (sp->avte_cvp & 0x1) { - dm_pg_init_complete(h->path, 0); - goto done; - } - - if (h->ctlr) { - if (h->ctlr->use_10_ms == -1) - h->cmd_to_send = SEND_C2_INQUIRY; - else - h->cmd_to_send = SEND_MODE_SELECT; - } else - h->cmd_to_send = SEND_C4_INQUIRY; - queue_work(rdac_wkqd, &h->work); -done: - __blk_put_request(req->q, req); -} - -static void c8_endio(struct request *req, int error) -{ - struct rdac_handler *h = req->end_io_data; - struct c8_inquiry *sp; - - if (had_failures(req, error)) { - dm_pg_init_complete(h->path, MP_FAIL_PATH); - goto done; - } - - /* We need to look at the sense keys here to take clear action. - * For now simple logic: Get the lun from the inquiry page. - */ - sp = &h->inq.c8; - h->lun = sp->lun[7]; /* currently it uses only one byte */ - h->cmd_to_send = SEND_C9_INQUIRY; - queue_work(rdac_wkqd, &h->work); -done: - __blk_put_request(req->q, req); -} - -static void submit_inquiry(struct rdac_handler *h, int page_code, - unsigned int len, rq_end_io_fn endio) -{ - struct request *rq; - struct request_queue *q = bdev_get_queue(h->path->dev->bdev); - - if (!q) - goto fail_path; - - rq = get_rdac_req(h, &h->inq, len, READ); - if (!rq) - goto fail_path; - - /* Prepare the command. */ - rq->cmd[0] = INQUIRY; - rq->cmd[1] = 1; - rq->cmd[2] = page_code; - rq->cmd[4] = len; - rq->cmd_len = COMMAND_SIZE(INQUIRY); - blk_execute_rq_nowait(q, NULL, rq, 1, endio); - return; - -fail_path: - dm_pg_init_complete(h->path, MP_FAIL_PATH); -} - -static void service_wkq(struct work_struct *work) -{ - struct rdac_handler *h = container_of(work, struct rdac_handler, work); - - switch (h->cmd_to_send) { - case SEND_C2_INQUIRY: - submit_inquiry(h, 0xC2, sizeof(struct c2_inquiry), c2_endio); - break; - case SEND_C4_INQUIRY: - submit_inquiry(h, 0xC4, sizeof(struct c4_inquiry), c4_endio); - break; - case SEND_C8_INQUIRY: - submit_inquiry(h, 0xC8, sizeof(struct c8_inquiry), c8_endio); - break; - case SEND_C9_INQUIRY: - submit_inquiry(h, 0xC9, sizeof(struct c9_inquiry), c9_endio); - break; - case SEND_MODE_SELECT: - submit_mode_select(h); - break; - default: - BUG(); - } -} -/* - * only support subpage2c until we confirm that this is just a matter of - * of updating firmware or not, and RDAC (basic AVT works already) for now - * but we can add these in in when we get time and testers - */ -static int rdac_create(struct hw_handler *hwh, unsigned argc, char **argv) -{ - struct rdac_handler *h; - unsigned timeout; - - if (argc == 0) { - /* No arguments: use defaults */ - timeout = RDAC_FAILOVER_TIMEOUT; - } else if (argc != 1) { - DMWARN("incorrect number of arguments"); - return -EINVAL; - } else { - if (sscanf(argv[1], "%u", &timeout) != 1) { - DMWARN("invalid timeout value"); - return -EINVAL; - } - } - - h = kzalloc(sizeof(*h), GFP_KERNEL); - if (!h) - return -ENOMEM; - - hwh->context = h; - h->timeout = timeout; - h->lun = UNINITIALIZED_LUN; - INIT_WORK(&h->work, service_wkq); - DMWARN("using RDAC command with timeout %u", h->timeout); - - return 0; -} - -static void rdac_destroy(struct hw_handler *hwh) -{ - struct rdac_handler *h = hwh->context; - - if (h->ctlr) - kref_put(&h->ctlr->kref, release_ctlr); - kfree(h); - hwh->context = NULL; -} - -static unsigned rdac_error(struct hw_handler *hwh, struct bio *bio) -{ - /* Try default handler */ - return dm_scsi_err_handler(hwh, bio); -} - -static void rdac_pg_init(struct hw_handler *hwh, unsigned bypassed, - struct dm_path *path) -{ - struct rdac_handler *h = hwh->context; - - h->path = path; - switch (h->lun) { - case UNINITIALIZED_LUN: - submit_inquiry(h, 0xC8, sizeof(struct c8_inquiry), c8_endio); - break; - default: - submit_inquiry(h, 0xC9, sizeof(struct c9_inquiry), c9_endio); - } -} - -static struct hw_handler_type rdac_handler = { - .name = RDAC_DM_HWH_NAME, - .module = THIS_MODULE, - .create = rdac_create, - .destroy = rdac_destroy, - .pg_init = rdac_pg_init, - .error = rdac_error, -}; - -static int __init rdac_init(void) -{ - int r; - - rdac_wkqd = create_singlethread_workqueue("rdac_wkqd"); - if (!rdac_wkqd) { - DMERR("Failed to create workqueue rdac_wkqd."); - return -ENOMEM; - } - - r = dm_register_hw_handler(&rdac_handler); - if (r < 0) { - DMERR("%s: register failed %d", RDAC_DM_HWH_NAME, r); - destroy_workqueue(rdac_wkqd); - return r; - } - - DMINFO("%s: version %s loaded", RDAC_DM_HWH_NAME, RDAC_DM_HWH_VER); - return 0; -} - -static void __exit rdac_exit(void) -{ - int r = dm_unregister_hw_handler(&rdac_handler); - - destroy_workqueue(rdac_wkqd); - if (r < 0) - DMERR("%s: unregister failed %d", RDAC_DM_HWH_NAME, r); -} - -module_init(rdac_init); -module_exit(rdac_exit); - -MODULE_DESCRIPTION("DM Multipath LSI/Engenio RDAC support"); -MODULE_AUTHOR("Mike Christie, Chandra Seetharaman"); -MODULE_LICENSE("GPL"); -MODULE_VERSION(RDAC_DM_HWH_VER); diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index e7ee59e655d5..9f7302d4878d 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -7,7 +7,6 @@ #include "dm.h" #include "dm-path-selector.h" -#include "dm-hw-handler.h" #include "dm-bio-list.h" #include "dm-bio-record.h" #include "dm-uevent.h" @@ -20,6 +19,7 @@ #include <linux/slab.h> #include <linux/time.h> #include <linux/workqueue.h> +#include <scsi/scsi_dh.h> #include <asm/atomic.h> #define DM_MSG_PREFIX "multipath" @@ -61,7 +61,8 @@ struct multipath { spinlock_t lock; - struct hw_handler hw_handler; + const char *hw_handler_name; + struct work_struct activate_path; unsigned nr_priority_groups; struct list_head priority_groups; unsigned pg_init_required; /* pg_init needs calling? */ @@ -106,9 +107,10 @@ typedef int (*action_fn) (struct pgpath *pgpath); static struct kmem_cache *_mpio_cache; -static struct workqueue_struct *kmultipathd; +static struct workqueue_struct *kmultipathd, *kmpath_handlerd; static void process_queued_ios(struct work_struct *work); static void trigger_event(struct work_struct *work); +static void activate_path(struct work_struct *work); /*----------------------------------------------- @@ -178,6 +180,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti) m->queue_io = 1; INIT_WORK(&m->process_queued_ios, process_queued_ios); INIT_WORK(&m->trigger_event, trigger_event); + INIT_WORK(&m->activate_path, activate_path); m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache); if (!m->mpio_pool) { kfree(m); @@ -193,18 +196,13 @@ static struct multipath *alloc_multipath(struct dm_target *ti) static void free_multipath(struct multipath *m) { struct priority_group *pg, *tmp; - struct hw_handler *hwh = &m->hw_handler; list_for_each_entry_safe(pg, tmp, &m->priority_groups, list) { list_del(&pg->list); free_priority_group(pg, m->ti); } - if (hwh->type) { - hwh->type->destroy(hwh); - dm_put_hw_handler(hwh->type); - } - + kfree(m->hw_handler_name); mempool_destroy(m->mpio_pool); kfree(m); } @@ -216,12 +214,10 @@ static void free_multipath(struct multipath *m) static void __switch_pg(struct multipath *m, struct pgpath *pgpath) { - struct hw_handler *hwh = &m->hw_handler; - m->current_pg = pgpath->pg; /* Must we initialise the PG first, and queue I/O till it's ready? */ - if (hwh->type && hwh->type->pg_init) { + if (m->hw_handler_name) { m->pg_init_required = 1; m->queue_io = 1; } else { @@ -409,7 +405,6 @@ static void process_queued_ios(struct work_struct *work) { struct multipath *m = container_of(work, struct multipath, process_queued_ios); - struct hw_handler *hwh = &m->hw_handler; struct pgpath *pgpath = NULL; unsigned init_required = 0, must_queue = 1; unsigned long flags; @@ -439,7 +434,7 @@ out: spin_unlock_irqrestore(&m->lock, flags); if (init_required) - hwh->type->pg_init(hwh, pgpath->pg->bypassed, &pgpath->path); + queue_work(kmpath_handlerd, &m->activate_path); if (!must_queue) dispatch_queued_ios(m); @@ -652,8 +647,6 @@ static struct priority_group *parse_priority_group(struct arg_set *as, static int parse_hw_handler(struct arg_set *as, struct multipath *m) { - int r; - struct hw_handler_type *hwht; unsigned hw_argc; struct dm_target *ti = m->ti; @@ -661,30 +654,20 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m) {0, 1024, "invalid number of hardware handler args"}, }; - r = read_param(_params, shift(as), &hw_argc, &ti->error); - if (r) + if (read_param(_params, shift(as), &hw_argc, &ti->error)) return -EINVAL; if (!hw_argc) return 0; - hwht = dm_get_hw_handler(shift(as)); - if (!hwht) { + m->hw_handler_name = kstrdup(shift(as), GFP_KERNEL); + request_module("scsi_dh_%s", m->hw_handler_name); + if (scsi_dh_handler_exist(m->hw_handler_name) == 0) { ti->error = "unknown hardware handler type"; + kfree(m->hw_handler_name); + m->hw_handler_name = NULL; return -EINVAL; } - - m->hw_handler.md = dm_table_get_md(ti->table); - dm_put(m->hw_handler.md); - - r = hwht->create(&m->hw_handler, hw_argc - 1, as->argv); - if (r) { - dm_put_hw_handler(hwht); - ti->error = "hardware handler constructor failed"; - return r; - } - - m->hw_handler.type = hwht; consume(as, hw_argc - 1); return 0; @@ -808,6 +791,7 @@ static void multipath_dtr(struct dm_target *ti) { struct multipath *m = (struct multipath *) ti->private; + flush_workqueue(kmpath_handlerd); flush_workqueue(kmultipathd); free_multipath(m); } @@ -1025,52 +1009,85 @@ static int pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath) return limit_reached; } -/* - * pg_init must call this when it has completed its initialisation - */ -void dm_pg_init_complete(struct dm_path *path, unsigned err_flags) +static void pg_init_done(struct dm_path *path, int errors) { struct pgpath *pgpath = path_to_pgpath(path); struct priority_group *pg = pgpath->pg; struct multipath *m = pg->m; unsigned long flags; - /* - * If requested, retry pg_init until maximum number of retries exceeded. - * If retry not requested and PG already bypassed, always fail the path. - */ - if (err_flags & MP_RETRY) { - if (pg_init_limit_reached(m, pgpath)) - err_flags |= MP_FAIL_PATH; - } else if (err_flags && pg->bypassed) - err_flags |= MP_FAIL_PATH; - - if (err_flags & MP_FAIL_PATH) + /* device or driver problems */ + switch (errors) { + case SCSI_DH_OK: + break; + case SCSI_DH_NOSYS: + if (!m->hw_handler_name) { + errors = 0; + break; + } + DMERR("Cannot failover device because scsi_dh_%s was not " + "loaded.", m->hw_handler_name); + /* + * Fail path for now, so we do not ping pong + */ fail_path(pgpath); - - if (err_flags & MP_BYPASS_PG) + break; + case SCSI_DH_DEV_TEMP_BUSY: + /* + * Probably doing something like FW upgrade on the + * controller so try the other pg. + */ bypass_pg(m, pg, 1); + break; + /* TODO: For SCSI_DH_RETRY we should wait a couple seconds */ + case SCSI_DH_RETRY: + case SCSI_DH_IMM_RETRY: + case SCSI_DH_RES_TEMP_UNAVAIL: + if (pg_init_limit_reached(m, pgpath)) + fail_path(pgpath); + errors = 0; + break; + default: + /* + * We probably do not want to fail the path for a device + * error, but this is what the old dm did. In future + * patches we can do more advanced handling. + */ + fail_path(pgpath); + } spin_lock_irqsave(&m->lock, flags); - if (err_flags & ~MP_RETRY) { + if (errors) { + DMERR("Could not failover device. Error %d.", errors); m->current_pgpath = NULL; m->current_pg = NULL; - } else if (!m->pg_init_required) + } else if (!m->pg_init_required) { m->queue_io = 0; + pg->bypassed = 0; + } m->pg_init_in_progress = 0; queue_work(kmultipathd, &m->process_queued_ios); spin_unlock_irqrestore(&m->lock, flags); } +static void activate_path(struct work_struct *work) +{ + int ret; + struct multipath *m = + container_of(work, struct multipath, activate_path); + struct dm_path *path = &m->current_pgpath->path; + + ret = scsi_dh_activate(bdev_get_queue(path->dev->bdev)); + pg_init_done(path, ret); +} + /* * end_io handling */ static int do_end_io(struct multipath *m, struct bio *bio, int error, struct dm_mpath_io *mpio) { - struct hw_handler *hwh = &m->hw_handler; - unsigned err_flags = MP_FAIL_PATH; /* Default behavior */ unsigned long flags; if (!error) @@ -1097,19 +1114,8 @@ static int do_end_io(struct multipath *m, struct bio *bio, } spin_unlock_irqrestore(&m->lock, flags); - if (hwh->type && hwh->type->error) - err_flags = hwh->type->error(hwh, bio); - - if (mpio->pgpath) { - if (err_flags & MP_FAIL_PATH) - fail_path(mpio->pgpath); - - if (err_flags & MP_BYPASS_PG) - bypass_pg(m, mpio->pgpath->pg, 1); - } - - if (err_flags & MP_ERROR_IO) - return -EIO; + if (mpio->pgpath) + fail_path(mpio->pgpath); requeue: dm_bio_restore(&mpio->details, bio); @@ -1194,7 +1200,6 @@ static int multipath_status(struct dm_target *ti, status_type_t type, int sz = 0; unsigned long flags; struct multipath *m = (struct multipath *) ti->private; - struct hw_handler *hwh = &m->hw_handler; struct priority_group *pg; struct pgpath *p; unsigned pg_num; @@ -1214,12 +1219,10 @@ static int multipath_status(struct dm_target *ti, status_type_t type, DMEMIT("pg_init_retries %u ", m->pg_init_retries); } - if (hwh->type && hwh->type->status) - sz += hwh->type->status(hwh, type, result + sz, maxlen - sz); - else if (!hwh->type || type == STATUSTYPE_INFO) + if (!m->hw_handler_name || type == STATUSTYPE_INFO) DMEMIT("0 "); else - DMEMIT("1 %s ", hwh->type->name); + DMEMIT("1 %s ", m->hw_handler_name); DMEMIT("%u ", m->nr_priority_groups); @@ -1422,6 +1425,21 @@ static int __init dm_multipath_init(void) return -ENOMEM; } + /* + * A separate workqueue is used to handle the device handlers + * to avoid overloading existing workqueue. Overloading the + * old workqueue would also create a bottleneck in the + * path of the storage hardware device activation. + */ + kmpath_handlerd = create_singlethread_workqueue("kmpath_handlerd"); + if (!kmpath_handlerd) { + DMERR("failed to create workqueue kmpath_handlerd"); + destroy_workqueue(kmultipathd); + dm_unregister_target(&multipath_target); + kmem_cache_destroy(_mpio_cache); + return -ENOMEM; + } + DMINFO("version %u.%u.%u loaded", multipath_target.version[0], multipath_target.version[1], multipath_target.version[2]); @@ -1433,6 +1451,7 @@ static void __exit dm_multipath_exit(void) { int r; + destroy_workqueue(kmpath_handlerd); destroy_workqueue(kmultipathd); r = dm_unregister_target(&multipath_target); @@ -1441,8 +1460,6 @@ static void __exit dm_multipath_exit(void) kmem_cache_destroy(_mpio_cache); } -EXPORT_SYMBOL_GPL(dm_pg_init_complete); - module_init(dm_multipath_init); module_exit(dm_multipath_exit); diff --git a/drivers/md/dm-mpath.h b/drivers/md/dm-mpath.h index b9cdcbb3ed59..c198b856a452 100644 --- a/drivers/md/dm-mpath.h +++ b/drivers/md/dm-mpath.h @@ -16,7 +16,6 @@ struct dm_path { unsigned is_active; /* Read-only */ void *pscontext; /* For path-selector use */ - void *hwhcontext; /* For hw-handler use */ }; /* Callback for hwh_pg_init_fn to use when complete */ diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 10748240cb2f..6a866d7c8ae5 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -50,17 +50,19 @@ static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector) /** * linear_mergeable_bvec -- tell bio layer if two requests can be merged * @q: request queue - * @bio: the buffer head that's been built up so far + * @bvm: properties of new bio * @biovec: the request that could be merged to it. * * Return amount of bytes we can take at this offset */ -static int linear_mergeable_bvec(struct request_queue *q, struct bio *bio, struct bio_vec *biovec) +static int linear_mergeable_bvec(struct request_queue *q, + struct bvec_merge_data *bvm, + struct bio_vec *biovec) { mddev_t *mddev = q->queuedata; dev_info_t *dev0; - unsigned long maxsectors, bio_sectors = bio->bi_size >> 9; - sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); + unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9; + sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); dev0 = which_dev(mddev, sector); maxsectors = (dev0->size << 1) - (sector - (dev0->offset<<1)); diff --git a/drivers/md/md.c b/drivers/md/md.c index 83eb78b00137..2580ac1b9b0f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -74,6 +74,8 @@ static DEFINE_SPINLOCK(pers_lock); static void md_print_devices(void); +static DECLARE_WAIT_QUEUE_HEAD(resync_wait); + #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } /* @@ -274,6 +276,7 @@ static mddev_t * mddev_find(dev_t unit) atomic_set(&new->active, 1); spin_lock_init(&new->write_lock); init_waitqueue_head(&new->sb_wait); + init_waitqueue_head(&new->recovery_wait); new->reshape_position = MaxSector; new->resync_max = MaxSector; new->level = LEVEL_NONE; @@ -3013,6 +3016,36 @@ degraded_show(mddev_t *mddev, char *page) static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); static ssize_t +sync_force_parallel_show(mddev_t *mddev, char *page) +{ + return sprintf(page, "%d\n", mddev->parallel_resync); +} + +static ssize_t +sync_force_parallel_store(mddev_t *mddev, const char *buf, size_t len) +{ + long n; + + if (strict_strtol(buf, 10, &n)) + return -EINVAL; + + if (n != 0 && n != 1) + return -EINVAL; + + mddev->parallel_resync = n; + + if (mddev->sync_thread) + wake_up(&resync_wait); + + return len; +} + +/* force parallel resync, even with shared block devices */ +static struct md_sysfs_entry md_sync_force_parallel = +__ATTR(sync_force_parallel, S_IRUGO|S_IWUSR, + sync_force_parallel_show, sync_force_parallel_store); + +static ssize_t sync_speed_show(mddev_t *mddev, char *page) { unsigned long resync, dt, db; @@ -3187,6 +3220,7 @@ static struct attribute *md_redundancy_attrs[] = { &md_sync_min.attr, &md_sync_max.attr, &md_sync_speed.attr, + &md_sync_force_parallel.attr, &md_sync_completed.attr, &md_max_sync.attr, &md_suspend_lo.attr, @@ -3691,6 +3725,8 @@ static int do_md_stop(mddev_t * mddev, int mode) module_put(mddev->pers->owner); mddev->pers = NULL; + /* tell userspace to handle 'inactive' */ + sysfs_notify(&mddev->kobj, NULL, "array_state"); set_capacity(disk, 0); mddev->changed = 1; @@ -3861,8 +3897,10 @@ static void autorun_devices(int part) md_probe(dev, NULL, NULL); mddev = mddev_find(dev); - if (!mddev) { - printk(KERN_ERR + if (!mddev || !mddev->gendisk) { + if (mddev) + mddev_put(mddev); + printk(KERN_ERR "md: cannot allocate memory for md drive.\n"); break; } @@ -3987,8 +4025,8 @@ static int get_bitmap_file(mddev_t * mddev, void __user * arg) if (!buf) goto out; - ptr = file_path(mddev->bitmap->file, buf, sizeof(file->pathname)); - if (!ptr) + ptr = d_path(&mddev->bitmap->file->f_path, buf, sizeof(file->pathname)); + if (IS_ERR(ptr)) goto out; strcpy(file->pathname, ptr); @@ -5399,7 +5437,7 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok) atomic_sub(blocks, &mddev->recovery_active); wake_up(&mddev->recovery_wait); if (!ok) { - set_bit(MD_RECOVERY_ERR, &mddev->recovery); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); md_wakeup_thread(mddev->thread); // stop recovery, signal do_sync .... } @@ -5435,8 +5473,11 @@ void md_write_start(mddev_t *mddev, struct bio *bi) md_wakeup_thread(mddev->thread); } spin_unlock_irq(&mddev->write_lock); + sysfs_notify(&mddev->kobj, NULL, "array_state"); } - wait_event(mddev->sb_wait, mddev->flags==0); + wait_event(mddev->sb_wait, + !test_bit(MD_CHANGE_CLEAN, &mddev->flags) && + !test_bit(MD_CHANGE_PENDING, &mddev->flags)); } void md_write_end(mddev_t *mddev) @@ -5471,13 +5512,17 @@ void md_allow_write(mddev_t *mddev) mddev->safemode = 1; spin_unlock_irq(&mddev->write_lock); md_update_sb(mddev, 0); + + sysfs_notify(&mddev->kobj, NULL, "array_state"); + /* wait for the dirty state to be recorded in the metadata */ + wait_event(mddev->sb_wait, + !test_bit(MD_CHANGE_CLEAN, &mddev->flags) && + !test_bit(MD_CHANGE_PENDING, &mddev->flags)); } else spin_unlock_irq(&mddev->write_lock); } EXPORT_SYMBOL_GPL(md_allow_write); -static DECLARE_WAIT_QUEUE_HEAD(resync_wait); - #define SYNC_MARKS 10 #define SYNC_MARK_STEP (3*HZ) void md_do_sync(mddev_t *mddev) @@ -5541,8 +5586,9 @@ void md_do_sync(mddev_t *mddev) for_each_mddev(mddev2, tmp) { if (mddev2 == mddev) continue; - if (mddev2->curr_resync && - match_mddev_units(mddev,mddev2)) { + if (!mddev->parallel_resync + && mddev2->curr_resync + && match_mddev_units(mddev, mddev2)) { DEFINE_WAIT(wq); if (mddev < mddev2 && mddev->curr_resync == 2) { /* arbitrarily yield */ @@ -5622,7 +5668,6 @@ void md_do_sync(mddev_t *mddev) window/2,(unsigned long long) max_sectors/2); atomic_set(&mddev->recovery_active, 0); - init_waitqueue_head(&mddev->recovery_wait); last_check = 0; if (j>2) { @@ -5647,7 +5692,7 @@ void md_do_sync(mddev_t *mddev) sectors = mddev->pers->sync_request(mddev, j, &skipped, currspeed < speed_min(mddev)); if (sectors == 0) { - set_bit(MD_RECOVERY_ERR, &mddev->recovery); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); goto out; } @@ -5670,8 +5715,7 @@ void md_do_sync(mddev_t *mddev) last_check = io_sectors; - if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) || - test_bit(MD_RECOVERY_ERR, &mddev->recovery)) + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) break; repeat: @@ -5725,8 +5769,7 @@ void md_do_sync(mddev_t *mddev) /* tell personality that we are finished */ mddev->pers->sync_request(mddev, max_sectors, &skipped, 1); - if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && - !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && + if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && mddev->curr_resync > 2) { if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { @@ -5795,7 +5838,10 @@ static int remove_and_add_spares(mddev_t *mddev) } if (mddev->degraded) { - rdev_for_each(rdev, rtmp, mddev) + rdev_for_each(rdev, rtmp, mddev) { + if (rdev->raid_disk >= 0 && + !test_bit(In_sync, &rdev->flags)) + spares++; if (rdev->raid_disk < 0 && !test_bit(Faulty, &rdev->flags)) { rdev->recovery_offset = 0; @@ -5813,6 +5859,7 @@ static int remove_and_add_spares(mddev_t *mddev) } else break; } + } } return spares; } @@ -5826,7 +5873,7 @@ static int remove_and_add_spares(mddev_t *mddev) * to do that as needed. * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in * "->recovery" and create a thread at ->sync_thread. - * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR) + * When the thread finishes it sets MD_RECOVERY_DONE * and wakeups up this thread which will reap the thread and finish up. * This thread also removes any faulty devices (with nr_pending == 0). * @@ -5901,8 +5948,7 @@ void md_check_recovery(mddev_t *mddev) /* resync has finished, collect result */ md_unregister_thread(mddev->sync_thread); mddev->sync_thread = NULL; - if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && - !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { /* success...*/ /* activate any spares */ mddev->pers->spare_active(mddev); @@ -5926,7 +5972,6 @@ void md_check_recovery(mddev_t *mddev) * might be left set */ clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - clear_bit(MD_RECOVERY_ERR, &mddev->recovery); clear_bit(MD_RECOVERY_INTR, &mddev->recovery); clear_bit(MD_RECOVERY_DONE, &mddev->recovery); diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 4f4d1f383842..e968116e0de9 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -327,7 +327,8 @@ static int multipath_remove_disk(mddev_t *mddev, int number) if (rdev) { if (test_bit(In_sync, &rdev->flags) || atomic_read(&rdev->nr_pending)) { - printk(KERN_ERR "hot-remove-disk, slot %d is identified" " but is still operational!\n", number); + printk(KERN_ERR "hot-remove-disk, slot %d is identified" + " but is still operational!\n", number); err = -EBUSY; goto abort; } diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 914c04ddec7c..bcbb82594a19 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -241,18 +241,20 @@ static int create_strip_zones (mddev_t *mddev) /** * raid0_mergeable_bvec -- tell bio layer if a two requests can be merged * @q: request queue - * @bio: the buffer head that's been built up so far + * @bvm: properties of new bio * @biovec: the request that could be merged to it. * * Return amount of bytes we can accept at this offset */ -static int raid0_mergeable_bvec(struct request_queue *q, struct bio *bio, struct bio_vec *biovec) +static int raid0_mergeable_bvec(struct request_queue *q, + struct bvec_merge_data *bvm, + struct bio_vec *biovec) { mddev_t *mddev = q->queuedata; - sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); + sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max; unsigned int chunk_sectors = mddev->chunk_size >> 9; - unsigned int bio_sectors = bio->bi_size >> 9; + unsigned int bio_sectors = bvm->bi_size >> 9; max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; if (max < 0) max = 0; /* bio_add cannot handle a negative return */ diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index ac409b7d83f5..c610b947218a 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -773,7 +773,7 @@ static int make_request(struct request_queue *q, struct bio * bio) r1bio_t *r1_bio; struct bio *read_bio; int i, targets = 0, disks; - struct bitmap *bitmap = mddev->bitmap; + struct bitmap *bitmap; unsigned long flags; struct bio_list bl; struct page **behind_pages = NULL; @@ -802,6 +802,8 @@ static int make_request(struct request_queue *q, struct bio * bio) wait_barrier(conf); + bitmap = mddev->bitmap; + disk_stat_inc(mddev->gendisk, ios[rw]); disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio)); @@ -1025,7 +1027,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) /* * if recovery is running, make sure it aborts. */ - set_bit(MD_RECOVERY_ERR, &mddev->recovery); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); } else set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); @@ -1146,6 +1148,14 @@ static int raid1_remove_disk(mddev_t *mddev, int number) err = -EBUSY; goto abort; } + /* Only remove non-faulty devices is recovery + * is not possible. + */ + if (!test_bit(Faulty, &rdev->flags) && + mddev->degraded < conf->raid_disks) { + err = -EBUSY; + goto abort; + } p->rdev = NULL; synchronize_rcu(); if (atomic_read(&rdev->nr_pending)) { @@ -1282,6 +1292,7 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) rdev_dec_pending(conf->mirrors[i].rdev, mddev); } else { /* fixup the bio for reuse */ + int size; sbio->bi_vcnt = vcnt; sbio->bi_size = r1_bio->sectors << 9; sbio->bi_idx = 0; @@ -1295,10 +1306,20 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) sbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; sbio->bi_bdev = conf->mirrors[i].rdev->bdev; - for (j = 0; j < vcnt ; j++) - memcpy(page_address(sbio->bi_io_vec[j].bv_page), + size = sbio->bi_size; + for (j = 0; j < vcnt ; j++) { + struct bio_vec *bi; + bi = &sbio->bi_io_vec[j]; + bi->bv_offset = 0; + if (size > PAGE_SIZE) + bi->bv_len = PAGE_SIZE; + else + bi->bv_len = size; + size -= PAGE_SIZE; + memcpy(page_address(bi->bv_page), page_address(pbio->bi_io_vec[j].bv_page), PAGE_SIZE); + } } } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 8536ede1e712..22bb2b1b886d 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -439,26 +439,27 @@ static sector_t raid10_find_virt(conf_t *conf, sector_t sector, int dev) /** * raid10_mergeable_bvec -- tell bio layer if a two requests can be merged * @q: request queue - * @bio: the buffer head that's been built up so far + * @bvm: properties of new bio * @biovec: the request that could be merged to it. * * Return amount of bytes we can accept at this offset * If near_copies == raid_disk, there are no striping issues, * but in that case, the function isn't called at all. */ -static int raid10_mergeable_bvec(struct request_queue *q, struct bio *bio, - struct bio_vec *bio_vec) +static int raid10_mergeable_bvec(struct request_queue *q, + struct bvec_merge_data *bvm, + struct bio_vec *biovec) { mddev_t *mddev = q->queuedata; - sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); + sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max; unsigned int chunk_sectors = mddev->chunk_size >> 9; - unsigned int bio_sectors = bio->bi_size >> 9; + unsigned int bio_sectors = bvm->bi_size >> 9; max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; if (max < 0) max = 0; /* bio_add cannot handle a negative return */ - if (max <= bio_vec->bv_len && bio_sectors == 0) - return bio_vec->bv_len; + if (max <= biovec->bv_len && bio_sectors == 0) + return biovec->bv_len; else return max; } @@ -1020,7 +1021,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) /* * if recovery is running, make sure it aborts. */ - set_bit(MD_RECOVERY_ERR, &mddev->recovery); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); } set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); @@ -1171,6 +1172,14 @@ static int raid10_remove_disk(mddev_t *mddev, int number) err = -EBUSY; goto abort; } + /* Only remove faulty devices in recovery + * is not possible. + */ + if (!test_bit(Faulty, &rdev->flags) && + enough(conf)) { + err = -EBUSY; + goto abort; + } p->rdev = NULL; synchronize_rcu(); if (atomic_read(&rdev->nr_pending)) { @@ -1237,6 +1246,7 @@ static void end_sync_write(struct bio *bio, int error) if (!uptodate) md_error(mddev, conf->mirrors[d].rdev); + update_head_pos(i, r10_bio); while (atomic_dec_and_test(&r10_bio->remaining)) { @@ -1844,7 +1854,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i if (rb2) atomic_dec(&rb2->remaining); r10_bio = rb2; - if (!test_and_set_bit(MD_RECOVERY_ERR, &mddev->recovery)) + if (!test_and_set_bit(MD_RECOVERY_INTR, + &mddev->recovery)) printk(KERN_INFO "raid10: %s: insufficient working devices for recovery.\n", mdname(mddev)); break; @@ -2127,6 +2138,8 @@ static int run(mddev_t *mddev) !test_bit(In_sync, &disk->rdev->flags)) { disk->head_position = 0; mddev->degraded++; + if (disk->rdev) + conf->fullsync = 1; } } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 93fde48c0f42..9ce7154845c6 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -94,6 +94,8 @@ #define __inline__ #endif +#define printk_rl(args...) ((void) (printk_ratelimit() && printk(args))) + #if !RAID6_USE_EMPTY_ZERO_PAGE /* In .bss so it's zeroed */ const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); @@ -1143,10 +1145,12 @@ static void raid5_end_read_request(struct bio * bi, int error) set_bit(R5_UPTODATE, &sh->dev[i].flags); if (test_bit(R5_ReadError, &sh->dev[i].flags)) { rdev = conf->disks[i].rdev; - printk(KERN_INFO "raid5:%s: read error corrected (%lu sectors at %llu on %s)\n", - mdname(conf->mddev), STRIPE_SECTORS, - (unsigned long long)(sh->sector + rdev->data_offset), - bdevname(rdev->bdev, b)); + printk_rl(KERN_INFO "raid5:%s: read error corrected" + " (%lu sectors at %llu on %s)\n", + mdname(conf->mddev), STRIPE_SECTORS, + (unsigned long long)(sh->sector + + rdev->data_offset), + bdevname(rdev->bdev, b)); clear_bit(R5_ReadError, &sh->dev[i].flags); clear_bit(R5_ReWrite, &sh->dev[i].flags); } @@ -1160,16 +1164,22 @@ static void raid5_end_read_request(struct bio * bi, int error) clear_bit(R5_UPTODATE, &sh->dev[i].flags); atomic_inc(&rdev->read_errors); if (conf->mddev->degraded) - printk(KERN_WARNING "raid5:%s: read error not correctable (sector %llu on %s).\n", - mdname(conf->mddev), - (unsigned long long)(sh->sector + rdev->data_offset), - bdn); + printk_rl(KERN_WARNING + "raid5:%s: read error not correctable " + "(sector %llu on %s).\n", + mdname(conf->mddev), + (unsigned long long)(sh->sector + + rdev->data_offset), + bdn); else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) /* Oh, no!!! */ - printk(KERN_WARNING "raid5:%s: read error NOT corrected!! (sector %llu on %s).\n", - mdname(conf->mddev), - (unsigned long long)(sh->sector + rdev->data_offset), - bdn); + printk_rl(KERN_WARNING + "raid5:%s: read error NOT corrected!! " + "(sector %llu on %s).\n", + mdname(conf->mddev), + (unsigned long long)(sh->sector + + rdev->data_offset), + bdn); else if (atomic_read(&rdev->read_errors) > conf->max_nr_stripes) printk(KERN_WARNING @@ -1258,7 +1268,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) /* * if recovery was running, make sure it aborts. */ - set_bit(MD_RECOVERY_ERR, &mddev->recovery); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); } set_bit(Faulty, &rdev->flags); printk (KERN_ALERT @@ -1992,6 +2002,7 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh, * have quiesced. */ if ((s->uptodate == disks - 1) && + (s->failed && disk_idx == s->failed_num) && !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); set_bit(R5_Wantcompute, &dev->flags); @@ -2006,12 +2017,7 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh, */ s->uptodate++; return 0; /* uptodate + compute == disks */ - } else if ((s->uptodate < disks - 1) && - test_bit(R5_Insync, &dev->flags)) { - /* Note: we hold off compute operations while checks are - * in flight, but we still prefer 'compute' over 'read' - * hence we only read if (uptodate < * disks-1) - */ + } else if (test_bit(R5_Insync, &dev->flags)) { set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) @@ -2077,7 +2083,9 @@ static void handle_issuing_new_read_requests6(struct stripe_head *sh, /* we would like to get this block, possibly * by computing it, but we might not be able to */ - if (s->uptodate == disks-1) { + if ((s->uptodate == disks - 1) && + (s->failed && (i == r6s->failed_num[0] || + i == r6s->failed_num[1]))) { pr_debug("Computing stripe %llu block %d\n", (unsigned long long)sh->sector, i); compute_block_1(sh, i, 0); @@ -2635,6 +2643,7 @@ static void handle_stripe5(struct stripe_head *sh) struct r5dev *dev; unsigned long pending = 0; mdk_rdev_t *blocked_rdev = NULL; + int prexor; memset(&s, 0, sizeof(s)); pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d " @@ -2764,9 +2773,11 @@ static void handle_stripe5(struct stripe_head *sh) /* leave prexor set until postxor is done, allows us to distinguish * a rmw from a rcw during biodrain */ + prexor = 0; if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete) && test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) { + prexor = 1; clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete); clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack); clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending); @@ -2800,6 +2811,8 @@ static void handle_stripe5(struct stripe_head *sh) if (!test_and_set_bit( STRIPE_OP_IO, &sh->ops.pending)) sh->ops.count++; + if (prexor) + continue; if (!test_bit(R5_Insync, &dev->flags) || (i == sh->pd_idx && s.failed == 0)) set_bit(STRIPE_INSYNC, &sh->state); @@ -2880,6 +2893,8 @@ static void handle_stripe5(struct stripe_head *sh) for (i = conf->raid_disks; i--; ) { set_bit(R5_Wantwrite, &sh->dev[i].flags); + set_bit(R5_LOCKED, &dev->flags); + s.locked++; if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) sh->ops.count++; } @@ -2893,6 +2908,7 @@ static void handle_stripe5(struct stripe_head *sh) conf->raid_disks); s.locked += handle_write_operations5(sh, 1, 1); } else if (s.expanded && + s.locked == 0 && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { clear_bit(STRIPE_EXPAND_READY, &sh->state); atomic_dec(&conf->reshape_stripes); @@ -3298,15 +3314,17 @@ static int raid5_congested(void *data, int bits) /* We want read requests to align with chunks where possible, * but write requests don't need to. */ -static int raid5_mergeable_bvec(struct request_queue *q, struct bio *bio, struct bio_vec *biovec) +static int raid5_mergeable_bvec(struct request_queue *q, + struct bvec_merge_data *bvm, + struct bio_vec *biovec) { mddev_t *mddev = q->queuedata; - sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); + sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max; unsigned int chunk_sectors = mddev->chunk_size >> 9; - unsigned int bio_sectors = bio->bi_size >> 9; + unsigned int bio_sectors = bvm->bi_size >> 9; - if (bio_data_dir(bio) == WRITE) + if ((bvm->bi_rw & 1) == WRITE) return biovec->bv_len; /* always allow writes to be mergeable */ max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; @@ -4287,7 +4305,9 @@ static int run(mddev_t *mddev) " disk %d\n", bdevname(rdev->bdev,b), raid_disk); working_disks++; - } + } else + /* Cannot rely on bitmap to complete recovery */ + conf->fullsync = 1; } /* @@ -4564,6 +4584,14 @@ static int raid5_remove_disk(mddev_t *mddev, int number) err = -EBUSY; goto abort; } + /* Only remove non-faulty devices if recovery + * isn't possible. + */ + if (!test_bit(Faulty, &rdev->flags) && + mddev->degraded <= conf->max_degraded) { + err = -EBUSY; + goto abort; + } p->rdev = NULL; synchronize_rcu(); if (atomic_read(&rdev->nr_pending)) { |