Commit 6140333d authored by Linus Torvalds

Merge branch 'for-linus' of git://neil.brown.name/md

* 'for-linus' of git://neil.brown.name/md: (75 commits)
  md/raid10: handle further errors during fix_read_error better.
  md/raid10: Handle read errors during recovery better.
  md/raid10: simplify read error handling during recovery.
  md/raid10: record bad blocks due to write errors during resync/recovery.
  md/raid10:  attempt to fix read errors during resync/check
  md/raid10:  Handle write errors by updating badblock log.
  md/raid10: clear bad-block record when write succeeds.
  md/raid10: avoid writing to known bad blocks on known bad drives.
  md/raid10 record bad blocks as needed during recovery.
  md/raid10: avoid reading known bad blocks during resync/recovery.
  md/raid10 - avoid reading from known bad blocks - part 3
  md/raid10: avoid reading from known bad blocks - part 2
  md/raid10: avoid reading from known bad blocks - part 1
  md/raid10: Split handle_read_error out from raid10d.
  md/raid10: simplify/reindent some loops.
  md/raid5: Clear bad blocks on successful write.
  md/raid5.  Don't write to known bad block on doubtful devices.
  md/raid5: write errors should be recorded as bad blocks if possible.
  md/raid5: use bad-block log to improve handling of uncorrectable read errors.
  md/raid5: avoid reading from known bad blocks.
  ...
parents 6f56c218 58c54fcc
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -360,18 +360,20 @@ Each directory contains:
       A file recording the current state of the device in the array
       which can be a comma separated list of
 	      faulty   - device has been kicked from active use due to
-			 a detected fault
+			 a detected fault or it has unacknowledged bad
+			 blocks
 	      in_sync  - device is a fully in-sync member of the array
 	      writemostly - device will only be subject to read
			 requests if there are no other options.
			 This applies only to raid1 arrays.
-	      blocked  - device has failed, metadata is "external",
-			 and the failure hasn't been acknowledged yet.
+	      blocked  - device has failed, and the failure hasn't been
+			 acknowledged yet by the metadata handler.
			 Writes that would write to this device if
			 it were not faulty are blocked.
	      spare    - device is working, but not a full member.
			 This includes spares that are in the process
			 of being recovered to
+	      write_error - device has ever seen a write error.
       This list may grow in future.
       This can be written to.
       Writing "faulty" simulates a failure on the device.
@@ -379,9 +381,11 @@ Each directory contains:
       Writing "writemostly" sets the writemostly flag.
       Writing "-writemostly" clears the writemostly flag.
       Writing "blocked" sets the "blocked" flag.
-      Writing "-blocked" clears the "blocked" flag and allows writes
-      to complete.
+      Writing "-blocked" clears the "blocked" flags and allows writes
+      to complete and possibly simulates an error.
       Writing "in_sync" sets the in_sync flag.
+      Writing "write_error" sets writeerrorseen flag.
+      Writing "-write_error" clears writeerrorseen flag.

       This file responds to select/poll. Any change to 'faulty'
       or 'blocked' causes an event.
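The select/poll behaviour noted above can be exercised from user space: sysfs requires an initial read, after which poll() reports a priority event when 'faulty' or 'blocked' changes, and the file must be rewound and re-read. A minimal sketch, assuming a hypothetical array md0 with member directory dev-sda1 (substitute real names):

```c
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Hypothetical path; substitute a real array and member device. */
	const char *path = "/sys/block/md0/md/dev-sda1/state";
	char buf[128];
	ssize_t n;
	int fd = open(path, O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* sysfs poll semantics: consume the current contents first. */
	n = read(fd, buf, sizeof(buf) - 1);
	(void)n;

	struct pollfd pfd = { .fd = fd, .events = POLLPRI | POLLERR };
	if (poll(&pfd, 1, -1) > 0) {
		lseek(fd, 0, SEEK_SET);	/* rewind and re-read the new state */
		n = read(fd, buf, sizeof(buf) - 1);
		if (n > 0) {
			buf[n] = '\0';
			printf("state changed: %s", buf);
		}
	}
	close(fd);
	return 0;
}
```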
@@ -419,7 +423,6 @@ Each directory contains:
       written, it will be rejected.

     recovery_start
       When the device is not 'in_sync', this records the number of
       sectors from the start of the device which are known to be
       correct. This is normally zero, but during a recovery
@@ -435,6 +438,20 @@ Each directory contains:
       Setting this to 'none' is equivalent to setting 'in_sync'.
       Setting to any other value also clears the 'in_sync' flag.

+     bad_blocks
+       This gives the list of all known bad blocks in the form of
+       start address and length (in sectors respectively). If output
+       is too big to fit in a page, it will be truncated. Writing
+       "sector length" to this file adds new acknowledged (i.e.
+       recorded to disk safely) bad blocks.
+
+     unacknowledged_bad_blocks
+       This gives the list of known-but-not-yet-saved-to-disk bad
+       blocks in the same form of 'bad_blocks'. If output is too big
+       to fit in a page, it will be truncated. Writing to this file
+       adds bad blocks without acknowledging them. This is largely
+       for testing.
 An active md device will also contain an entry for each active device
...
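The bad_blocks format described above is one "start length" pair per line, both in sectors. A minimal user-space sketch that reads the list, again using a hypothetical device path:

```c
#include <stdio.h>

int main(void)
{
	/* Hypothetical path; substitute a real array and member device. */
	FILE *f = fopen("/sys/block/md0/md/dev-sda1/bad_blocks", "r");
	unsigned long long start, len;

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* Each record is "<start sector> <length in sectors>". */
	while (fscanf(f, "%llu %llu", &start, &len) == 2)
		printf("bad range: sectors %llu-%llu\n",
		       start, start + len - 1);
	fclose(f);
	return 0;
}
```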
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -29,7 +29,6 @@
 #include "md.h"
 #include "bitmap.h"
-#include <linux/dm-dirty-log.h>

 /* debug macros */

 #define DEBUG 0
@@ -775,10 +774,8 @@ static inline unsigned long file_page_offset(struct bitmap *bitmap, unsigned lon
  * 0 or page 1
  */
 static inline struct page *filemap_get_page(struct bitmap *bitmap,
					    unsigned long chunk)
 {
-	if (bitmap->filemap == NULL)
-		return NULL;
	if (file_page_index(bitmap, chunk) >= bitmap->file_pages)
		return NULL;
	return bitmap->filemap[file_page_index(bitmap, chunk)
@@ -878,28 +875,19 @@ enum bitmap_page_attr {
 static inline void set_page_attr(struct bitmap *bitmap, struct page *page,
				enum bitmap_page_attr attr)
 {
-	if (page)
-		__set_bit((page->index<<2) + attr, bitmap->filemap_attr);
-	else
-		__set_bit(attr, &bitmap->logattrs);
+	__set_bit((page->index<<2) + attr, bitmap->filemap_attr);
 }

 static inline void clear_page_attr(struct bitmap *bitmap, struct page *page,
				enum bitmap_page_attr attr)
 {
-	if (page)
-		__clear_bit((page->index<<2) + attr, bitmap->filemap_attr);
-	else
-		__clear_bit(attr, &bitmap->logattrs);
+	__clear_bit((page->index<<2) + attr, bitmap->filemap_attr);
 }

 static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *page,
				enum bitmap_page_attr attr)
 {
-	if (page)
-		return test_bit((page->index<<2) + attr, bitmap->filemap_attr);
-	else
-		return test_bit(attr, &bitmap->logattrs);
+	return test_bit((page->index<<2) + attr, bitmap->filemap_attr);
 }

 /*
@@ -912,30 +900,26 @@ static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *p
 static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
 {
	unsigned long bit;
-	struct page *page = NULL;
+	struct page *page;
	void *kaddr;
	unsigned long chunk = block >> CHUNK_BLOCK_SHIFT(bitmap);

-	if (!bitmap->filemap) {
-		struct dm_dirty_log *log = bitmap->mddev->bitmap_info.log;
-		if (log)
-			log->type->mark_region(log, chunk);
-	} else {
+	if (!bitmap->filemap)
+		return;

-		page = filemap_get_page(bitmap, chunk);
-		if (!page)
-			return;
-		bit = file_page_offset(bitmap, chunk);
+	page = filemap_get_page(bitmap, chunk);
+	if (!page)
+		return;
+	bit = file_page_offset(bitmap, chunk);

-		/* set the bit */
-		kaddr = kmap_atomic(page, KM_USER0);
-		if (bitmap->flags & BITMAP_HOSTENDIAN)
-			set_bit(bit, kaddr);
-		else
-			__test_and_set_bit_le(bit, kaddr);
-		kunmap_atomic(kaddr, KM_USER0);
-		PRINTK("set file bit %lu page %lu\n", bit, page->index);
-	}
+	/* set the bit */
+	kaddr = kmap_atomic(page, KM_USER0);
+	if (bitmap->flags & BITMAP_HOSTENDIAN)
+		set_bit(bit, kaddr);
+	else
+		__set_bit_le(bit, kaddr);
+	kunmap_atomic(kaddr, KM_USER0);
+	PRINTK("set file bit %lu page %lu\n", bit, page->index);
+
	/* record page number so it gets flushed to disk when unplug occurs */
	set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY);
 }
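For reference, the switch from __test_and_set_bit_le() to __set_bit_le() keeps the same on-disk layout: little-endian bit numbering places bit n in byte n/8 at position n%8, independent of host word size, which is what makes the file-backed bitmap portable (the BITMAP_HOSTENDIAN case uses native long-word numbering instead). A user-space sketch of that little-endian mapping:

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Little-endian bit numbering: independent of host word size. */
static void set_bit_le(unsigned int nr, uint8_t *addr)
{
	addr[nr / 8] |= 1u << (nr % 8);
}

int main(void)
{
	uint8_t bitmap[4];

	memset(bitmap, 0, sizeof(bitmap));
	set_bit_le(0, bitmap);	/* byte 0, least significant bit */
	set_bit_le(9, bitmap);	/* byte 1, bit 1 */
	for (int i = 0; i < 4; i++)
		printf("byte %d: 0x%02x\n", i, bitmap[i]);
	return 0;	/* prints 0x01, 0x02, 0x00, 0x00 */
}
```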
@@ -952,16 +936,6 @@ void bitmap_unplug(struct bitmap *bitmap)
	if (!bitmap)
		return;
-	if (!bitmap->filemap) {
-		/* Must be using a dirty_log */
-		struct dm_dirty_log *log = bitmap->mddev->bitmap_info.log;
-		dirty = test_and_clear_bit(BITMAP_PAGE_DIRTY, &bitmap->logattrs);
-		need_write = test_and_clear_bit(BITMAP_PAGE_NEEDWRITE, &bitmap->logattrs);
-		if (dirty || need_write)
-			if (log->type->flush(log))
-				bitmap->flags |= BITMAP_WRITE_ERROR;
-		goto out;
-	}

	/* look at each page to see if there are any set bits that need to be
	 * flushed out to disk */
@@ -990,7 +964,6 @@ void bitmap_unplug(struct bitmap *bitmap)
		else
			md_super_wait(bitmap->mddev);
	}
-out:
	if (bitmap->flags & BITMAP_WRITE_ERROR)
		bitmap_file_kick(bitmap);
 }
@@ -1199,7 +1172,6 @@ void bitmap_daemon_work(mddev_t *mddev)
	struct page *page = NULL, *lastpage = NULL;
	sector_t blocks;
	void *paddr;
-	struct dm_dirty_log *log = mddev->bitmap_info.log;

	/* Use a mutex to guard daemon_work against
	 * bitmap_destroy.
@@ -1224,12 +1196,11 @@ void bitmap_daemon_work(mddev_t *mddev)
	spin_lock_irqsave(&bitmap->lock, flags);
	for (j = 0; j < bitmap->chunks; j++) {
		bitmap_counter_t *bmc;
-		if (!bitmap->filemap) {
-			if (!log)
-				/* error or shutdown */
-				break;
-		} else
-			page = filemap_get_page(bitmap, j);
+		if (!bitmap->filemap)
+			/* error or shutdown */
+			break;
+
+		page = filemap_get_page(bitmap, j);

		if (page != lastpage) {
			/* skip this page unless it's marked as needing cleaning */
@@ -1298,17 +1269,16 @@ void bitmap_daemon_work(mddev_t *mddev)
					  -1);

				/* clear the bit */
-				if (page) {
-					paddr = kmap_atomic(page, KM_USER0);
-					if (bitmap->flags & BITMAP_HOSTENDIAN)
-						clear_bit(file_page_offset(bitmap, j),
-							  paddr);
-					else
-						__test_and_clear_bit_le(file_page_offset(bitmap, j),
-									paddr);
-					kunmap_atomic(paddr, KM_USER0);
-				} else
-					log->type->clear_region(log, j);
+				paddr = kmap_atomic(page, KM_USER0);
+				if (bitmap->flags & BITMAP_HOSTENDIAN)
+					clear_bit(file_page_offset(bitmap, j),
+						  paddr);
+				else
+					__clear_bit_le(
+						file_page_offset(bitmap,
								 j),
						paddr);
+				kunmap_atomic(paddr, KM_USER0);
			}
		} else
			j |= PAGE_COUNTER_MASK;
@@ -1316,16 +1286,12 @@ void bitmap_daemon_work(mddev_t *mddev)
	spin_unlock_irqrestore(&bitmap->lock, flags);

	/* now sync the final page */
-	if (lastpage != NULL || log != NULL) {
+	if (lastpage != NULL) {
		spin_lock_irqsave(&bitmap->lock, flags);
		if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) {
			clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
			spin_unlock_irqrestore(&bitmap->lock, flags);
-			if (lastpage)
-				write_page(bitmap, lastpage, 0);
-			else
-				if (log->type->flush(log))
-					bitmap->flags |= BITMAP_WRITE_ERROR;
+			write_page(bitmap, lastpage, 0);
		} else {
			set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
			spin_unlock_irqrestore(&bitmap->lock, flags);
@@ -1767,12 +1733,10 @@ int bitmap_create(mddev_t *mddev)
	BUILD_BUG_ON(sizeof(bitmap_super_t) != 256);

	if (!file
-	    && !mddev->bitmap_info.offset
-	    && !mddev->bitmap_info.log) /* bitmap disabled, nothing to do */
+	    && !mddev->bitmap_info.offset) /* bitmap disabled, nothing to do */
		return 0;

	BUG_ON(file && mddev->bitmap_info.offset);
-	BUG_ON(mddev->bitmap_info.offset && mddev->bitmap_info.log);

	bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL);
	if (!bitmap)
@@ -1863,6 +1827,7 @@ int bitmap_create(mddev_t *mddev)
 int bitmap_load(mddev_t *mddev)
 {
	int err = 0;
+	sector_t start = 0;
	sector_t sector = 0;
	struct bitmap *bitmap = mddev->bitmap;
@@ -1881,24 +1846,14 @@ int bitmap_load(mddev_t *mddev)
	}
	bitmap_close_sync(bitmap);

-	if (mddev->bitmap_info.log) {
-		unsigned long i;
-		struct dm_dirty_log *log = mddev->bitmap_info.log;
-		for (i = 0; i < bitmap->chunks; i++)
-			if (!log->type->in_sync(log, i, 1))
-				bitmap_set_memory_bits(bitmap,
-						       (sector_t)i << CHUNK_BLOCK_SHIFT(bitmap),
-						       1);
-	} else {
-		sector_t start = 0;
-		if (mddev->degraded == 0
-		    || bitmap->events_cleared == mddev->events)
-			/* no need to keep dirty bits to optimise a
-			 * re-add of a missing device */
-			start = mddev->recovery_cp;
-		err = bitmap_init_from_disk(bitmap, start);
-	}
+	if (mddev->degraded == 0
+	    || bitmap->events_cleared == mddev->events)
+		/* no need to keep dirty bits to optimise a
+		 * re-add of a missing device */
+		start = mddev->recovery_cp;
+
+	err = bitmap_init_from_disk(bitmap, start);
	if (err)
		goto out;
...
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -212,10 +212,6 @@ struct bitmap {
	unsigned long file_pages; /* number of pages in the file */
	int last_page_size; /* bytes in the last page */

-	unsigned long logattrs; /* used when filemap_attr doesn't exist
-				 * because we are working with a dirty_log
-				 */
-
	unsigned long flags;

	int allclean;
@@ -237,7 +233,6 @@ struct bitmap {
	wait_queue_head_t behind_wait;

	struct sysfs_dirent *sysfs_can_clear;
 };

 /* the bitmap API */
...
This diff is collapsed.
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -29,6 +29,13 @@
 typedef struct mddev_s mddev_t;
 typedef struct mdk_rdev_s mdk_rdev_t;

+/* Bad block numbers are stored sorted in a single page.
+ * 64bits is used for each block or extent.
+ * 54 bits are sector number, 9 bits are extent size,
+ * 1 bit is an 'acknowledged' flag.
+ */
+#define MD_MAX_BADBLOCKS	(PAGE_SIZE/8)
+
 /*
  * MD's 'extended' device
  */
@@ -48,7 +55,7 @@ struct mdk_rdev_s
	struct block_device *meta_bdev;
	struct block_device *bdev;	/* block device handle */

-	struct page *sb_page;
+	struct page *sb_page, *bb_page;
	int sb_loaded;
	__u64 sb_events;
	sector_t data_offset;		/* start of data in array */
@@ -74,9 +81,29 @@ struct mdk_rdev_s
 #define	In_sync		2	/* device is in_sync with rest of array */
 #define	WriteMostly	4	/* Avoid reading if at all possible */
 #define	AutoDetected	7	/* added by auto-detect */
-#define Blocked		8	/* An error occurred on an externally
-				 * managed array, don't allow writes
+#define Blocked		8	/* An error occurred but has not yet
+				 * been acknowledged by the metadata
+				 * handler, so don't allow writes
				 * until it is cleared */
+#define WriteErrorSeen	9	/* A write error has been seen on this
+				 * device
+				 */
+#define FaultRecorded	10	/* Intermediate state for clearing
+				 * Blocked.  The Fault is/will-be
+				 * recorded in the metadata, but that
+				 * metadata hasn't been stored safely
+				 * on disk yet.
+				 */
+#define BlockedBadBlocks 11	/* A writer is blocked because they
+				 * found an unacknowledged bad-block.
+				 * This can safely be cleared at any
+				 * time, and the writer will re-check.
+				 * It may be set at any time, and at
+				 * worst the writer will timeout and
+				 * re-check.  So setting it as
+				 * accurately as possible is good, but
+				 * not absolutely critical.
+				 */
	wait_queue_head_t blocked_wait;

	int desc_nr;		/* descriptor index in the superblock */
@@ -111,8 +138,54 @@ struct mdk_rdev_s
	struct sysfs_dirent *sysfs_state;	/* handle for 'state'
						 * sysfs entry */
+
+	struct badblocks {
+		int	count;		/* count of bad blocks */
+		int	unacked_exist;	/* there probably are unacknowledged
+					 * bad blocks.  This is only cleared
+					 * when a read discovers none
+					 */
+		int	shift;		/* shift from sectors to block size
+					 * a -ve shift means badblocks are
+					 * disabled.*/
+		u64	*page;		/* badblock list */
+		int	changed;
+		seqlock_t lock;
+
+		sector_t sector;
+		sector_t size;		/* in sectors */
+	} badblocks;
 };

+#define BB_LEN_MASK	(0x00000000000001FFULL)
+#define BB_OFFSET_MASK	(0x7FFFFFFFFFFFFE00ULL)
+#define BB_ACK_MASK	(0x8000000000000000ULL)
+#define BB_MAX_LEN	512
+#define BB_OFFSET(x)	(((x) & BB_OFFSET_MASK) >> 9)
+#define BB_LEN(x)	(((x) & BB_LEN_MASK) + 1)
+#define BB_ACK(x)	(!!((x) & BB_ACK_MASK))
+#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63))
+
+extern int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
+			  sector_t *first_bad, int *bad_sectors);
+static inline int is_badblock(mdk_rdev_t *rdev, sector_t s, int sectors,
+			      sector_t *first_bad, int *bad_sectors)
+{
+	if (unlikely(rdev->badblocks.count)) {
+		int rv = md_is_badblock(&rdev->badblocks, rdev->data_offset + s,
+					sectors,
+					first_bad, bad_sectors);
+		if (rv)
+			*first_bad -= rdev->data_offset;
+		return rv;
+	}
+	return 0;
+}
+extern int rdev_set_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors,
+			      int acknowledged);
+extern int rdev_clear_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors);
+extern void md_ack_all_badblocks(struct badblocks *bb);
+
 struct mddev_s
 {
	void				*private;
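The bad-block list packs each extent into a single 64-bit word exactly as the comment near MD_MAX_BADBLOCKS describes: 54 bits of start sector, 9 bits of length minus one (so up to BB_MAX_LEN sectors), and an acknowledged bit. A self-contained round-trip with the BB_* macros from this hunk (u64 swapped for uint64_t so it builds in user space):

```c
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Packing macros copied from the md.h hunk above. */
#define BB_LEN_MASK	(0x00000000000001FFULL)
#define BB_OFFSET_MASK	(0x7FFFFFFFFFFFFE00ULL)
#define BB_ACK_MASK	(0x8000000000000000ULL)
#define BB_OFFSET(x)	(((x) & BB_OFFSET_MASK) >> 9)
#define BB_LEN(x)	(((x) & BB_LEN_MASK) + 1)
#define BB_ACK(x)	(!!((x) & BB_ACK_MASK))
#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((uint64_t)(!!(ack)) << 63))

int main(void)
{
	/* A 16-sector acknowledged extent starting at sector 123456. */
	uint64_t bb = BB_MAKE(123456ULL, 16, 1);

	printf("offset=%" PRIu64 " len=%" PRIu64 " ack=%d\n",
	       (uint64_t)BB_OFFSET(bb), (uint64_t)BB_LEN(bb), BB_ACK(bb));
	/* prints: offset=123456 len=16 ack=1 */
	return 0;
}
```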
@@ -239,9 +312,12 @@ struct mddev_s
 #define	MD_RECOVERY_FROZEN	9

	unsigned long			recovery;
-	int				recovery_disabled; /* if we detect that recovery
-							    * will always fail, set this
-							    * so we don't loop trying */
+	/* If a RAID personality determines that recovery (of a particular
+	 * device) will fail due to a read error on the source device, it
+	 * takes a copy of this number and does not attempt recovery again
+	 * until this number changes.
+	 */
+	int				recovery_disabled;

	int				in_sync;	/* know to not need resync */
	/* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so
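The new comment documents a generation-number handshake rather than a simple flag: a personality copies recovery_disabled when recovery fails and skips further attempts until the core changes the number. A hedged sketch of the pattern, with hypothetical stand-in types:

```c
#include <stdio.h>

/* Hypothetical stand-ins for mddev_s and a personality's private conf. */
struct core { int recovery_disabled; };
struct personality { int recovery_disabled; };

static int should_attempt_recovery(const struct core *c,
				   const struct personality *p)
{
	/* Equal numbers mean "recovery already failed at this generation". */
	return p->recovery_disabled != c->recovery_disabled;
}

int main(void)
{
	struct core c = { .recovery_disabled = 1 };
	struct personality p = { .recovery_disabled = 0 };

	printf("attempt? %d\n", should_attempt_recovery(&c, &p)); /* 1 */
	p.recovery_disabled = c.recovery_disabled; /* recovery failed: snapshot */
	printf("attempt? %d\n", should_attempt_recovery(&c, &p)); /* 0 */
	c.recovery_disabled++;	/* core bumps the number when retry makes sense */
	printf("attempt? %d\n", should_attempt_recovery(&c, &p)); /* 1 */
	return 0;
}
```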
@@ -304,11 +380,6 @@ struct mddev_s
						 * hot-adding a bitmap.  It should
						 * eventually be settable by sysfs.
						 */
-		/* When md is serving under dm, it might use a
-		 * dirty_log to store the bits.
-		 */
-		struct dm_dirty_log *log;
-
		struct mutex		mutex;
		unsigned long		chunksize;
		unsigned long		daemon_sleep; /* how many jiffies between updates? */
@@ -413,6 +484,20 @@ static inline char * mdname (mddev_t * mddev)
	return mddev->gendisk ? mddev->gendisk->disk_name : "mdX";
 }

+static inline int sysfs_link_rdev(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	char nm[20];
+	sprintf(nm, "rd%d", rdev->raid_disk);
+	return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
+}
+
+static inline void sysfs_unlink_rdev(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	char nm[20];
+	sprintf(nm, "rd%d", rdev->raid_disk);
+	sysfs_remove_link(&mddev->kobj, nm);
+}
+
 /*
  * iterates through some rdev ringlist.  It's safe to remove the
  * current 'rdev'. Dont touch 'tmp' though.
@@ -505,7 +590,7 @@ extern void mddev_init(mddev_t *mddev);
 extern int md_run(mddev_t *mddev);
 extern void md_stop(mddev_t *mddev);
 extern void md_stop_writes(mddev_t *mddev);
-extern void md_rdev_init(mdk_rdev_t *rdev);
+extern int md_rdev_init(mdk_rdev_t *rdev);

 extern void mddev_suspend(mddev_t *mddev);
 extern void mddev_resume(mddev_t *mddev);
@@ -514,4 +599,5 @@ extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
 extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
				   mddev_t *mddev);
 extern int mddev_check_plugged(mddev_t *mddev);
+extern void md_trim_bio(struct bio *bio, int offset, int size);
 #endif /* _MD_MD_H */
This diff is collapsed.
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -48,6 +48,12 @@ struct r1_private_data_s {
						 * (fresh device added).
						 * Cleared when a sync completes.
						 */
+	int			recovery_disabled; /* when the same as
+						    * mddev->recovery_disabled
+						    * we don't allow recovery
+						    * to be attempted as we
+						    * expect a read error
+						    */

	wait_queue_head_t	wait_barrier;
@@ -95,7 +101,7 @@ struct r1bio_s {
	struct list_head	retry_list;
	/* Next two are only valid when R1BIO_BehindIO is set */
-	struct page		**behind_pages;
+	struct bio_vec		*behind_bvecs;
	int			behind_page_count;
	/*
	 * if the IO is in WRITE direction, then multiple bios are used.
@@ -110,13 +116,24 @@ struct r1bio_s {
  * correct the read error.  To keep track of bad blocks on a per-bio
  * level, we store IO_BLOCKED in the appropriate 'bios' pointer
  */
-#define IO_BLOCKED ((struct bio*)1)
+#define IO_BLOCKED ((struct bio *)1)
+/* When we successfully write to a known bad-block, we need to remove the
+ * bad-block marking which must be done from process context.  So we record
+ * the success by setting bios[n] to IO_MADE_GOOD
+ */
+#define IO_MADE_GOOD ((struct bio *)2)
+
+#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)

 /* bits for r1bio.state */
 #define	R1BIO_Uptodate	0
 #define	R1BIO_IsSync	1
 #define	R1BIO_Degraded	2
 #define	R1BIO_BehindIO	3
+/* Set ReadError on bios that experience a readerror so that
+ * raid1d knows what to do with them.
+ */
+#define R1BIO_ReadError 4
 /* For write-behind requests, we call bi_end_io when
  * the last non-write-behind device completes, providing
  * any write was successful.  Otherwise we call when
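IO_BLOCKED and IO_MADE_GOOD work because the addresses 1 and 2 can never be valid struct bio pointers, so the bios[] slots double as per-device markers and BIO_SPECIAL() separates markers (including NULL) from live bios. A user-space sketch of the sentinel-pointer idiom, with a stand-in type:

```c
#include <stdio.h>

struct bio;	/* opaque stand-in for the kernel type */

/* Small non-pointer values used as in-band markers, as in the hunk above. */
#define IO_BLOCKED	((struct bio *)1)
#define IO_MADE_GOOD	((struct bio *)2)
#define BIO_SPECIAL(bio) ((unsigned long)(bio) <= 2)

int main(void)
{
	struct bio *slots[3] = { NULL, IO_BLOCKED, IO_MADE_GOOD };

	for (int i = 0; i < 3; i++) {
		if (slots[i] == NULL)
			printf("slot %d: empty\n", i);
		else if (BIO_SPECIAL(slots[i]))
			printf("slot %d: marker %lu\n", i,
			       (unsigned long)slots[i]);
	}
	return 0;
}
```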
@@ -125,6 +142,11 @@ struct r1bio_s {
  * Record that bi_end_io was called with this flag...
  */
 #define	R1BIO_Returned 6
+/* If a write for this request means we can clear some
+ * known-bad-block records, we set this flag
+ */
+#define	R1BIO_MadeGood 7
+#define	R1BIO_WriteError 8

 extern int md_raid1_congested(mddev_t *mddev, int bits);
...
This diff is collapsed.
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -6,6 +6,11 @@ typedef struct mirror_info mirror_info_t;
 struct mirror_info {
	mdk_rdev_t	*rdev;
	sector_t	head_position;
+	int		recovery_disabled;	/* matches
+						 * mddev->recovery_disabled
+						 * when we shouldn't try
+						 * recovering this device.
+						 */
 };

 typedef struct r10bio_s r10bio_t;
@@ -113,10 +118,26 @@ struct r10bio_s {
  * level, we store IO_BLOCKED in the appropriate 'bios' pointer
  */
 #define IO_BLOCKED ((struct bio*)1)
+/* When we successfully write to a known bad-block, we need to remove the
+ * bad-block marking which must be done from process context.  So we record
+ * the success by setting devs[n].bio to IO_MADE_GOOD
+ */
+#define IO_MADE_GOOD ((struct bio *)2)
+
+#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)

 /* bits for r10bio.state */
 #define	R10BIO_Uptodate	0
 #define	R10BIO_IsSync	1
 #define	R10BIO_IsRecover 2
 #define	R10BIO_Degraded 3
+/* Set ReadError on bios that experience a read error
+ * so that raid10d knows what to do with them.
+ */
+#define	R10BIO_ReadError 4
+/* If a write for this request means we can clear some
+ * known-bad-block records, we set this flag.
+ */
+#define	R10BIO_MadeGood 5
+#define	R10BIO_WriteError 6
 #endif
This diff is collapsed.
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -6,11 +6,11 @@
 /*
  *
- * Each stripe contains one buffer per disc.  Each buffer can be in
+ * Each stripe contains one buffer per device.  Each buffer can be in
  * one of a number of states stored in "flags".  Changes between
- * these states happen *almost* exclusively under a per-stripe
- * spinlock.  Some very specific changes can happen in bi_end_io, and
- * these are not protected by the spin lock.
+ * these states happen *almost* exclusively under the protection of the
+ * STRIPE_ACTIVE flag.  Some very specific changes can happen in bi_end_io, and
+ * these are not protected by STRIPE_ACTIVE.
  *
  * The flag bits that are used to represent these states are:
  *	R5_UPTODATE and R5_LOCKED
@@ -76,12 +76,10 @@
  * block and the cached buffer are successfully written, any buffer on
  * a written list can be returned with b_end_io.
  *
- * The write list and read list both act as fifos.  The read list is
- * protected by the device_lock.  The write and written lists are
- * protected by the stripe lock.  The device_lock, which can be
- * claimed while the stipe lock is held, is only for list
- * manipulations and will only be held for a very short time.  It can
- * be claimed from interrupts.
+ * The write list and read list both act as fifos.  The read list,
+ * write list and written list are protected by the device_lock.
+ * The device_lock is only for list manipulations and will only be
+ * held for a very short time.  It can be claimed from interrupts.
  *
  *
  * Stripes in the stripe cache can be on one of two lists (or on
@@ -96,7 +94,6 @@
  *
  * The inactive_list, handle_list and hash bucket lists are all protected by the
  * device_lock.
- *  - stripes on the inactive_list never have their stripe_lock held.
  *  - stripes have a reference counter. If count==0, they are on a list.
  *  - If a stripe might need handling, STRIPE_HANDLE is set.
  *  - When refcount reaches zero, then if STRIPE_HANDLE it is put on
@@ -116,10 +113,10 @@
  *  attach a request to an active stripe (add_stripe_bh())
  *     lockdev attach-buffer unlockdev
  *  handle a stripe (handle_stripe())
- *     lockstripe clrSTRIPE_HANDLE ...
+ *     setSTRIPE_ACTIVE, clrSTRIPE_HANDLE ...
  *		(lockdev check-buffers unlockdev) ..
  *		change-state ..
- *		record io/ops needed unlockstripe schedule io/ops
+ *		record io/ops needed clearSTRIPE_ACTIVE schedule io/ops
  *  release an active stripe (release_stripe())
  *     lockdev if (!--cnt) { if  STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev
@@ -128,8 +125,7 @@
  * on a cached buffer, and plus one if the stripe is undergoing stripe
  * operations.
  *
- * Stripe operations are performed outside the stripe lock,
- * the stripe operations are:
+ * The stripe operations are:
  *  -copying data between the stripe cache and user application buffers
  *  -computing blocks to save a disk access, or to recover a missing block
  *  -updating the parity on a write operation (reconstruct write and
@@ -159,7 +155,8 @@
  */

 /*
- * Operations state - intermediate states that are visible outside of sh->lock
+ * Operations state - intermediate states that are visible outside of
+ *   STRIPE_ACTIVE.
  * In general _idle indicates nothing is running, _run indicates a data
  * processing operation is active, and _result means the data processing result
  * is stable and can be acted upon.  For simple operations like biofill and
@@ -209,7 +206,6 @@ struct stripe_head {
	short			ddf_layout;/* use DDF ordering to calculate Q */
	unsigned long		state;		/* state flags */
	atomic_t		count;	      /* nr of active thread/requests */
-	spinlock_t		lock;
	int			bm_seq;	/* sequence number for bitmap flushes */
	int			disks;		/* disks in stripe */
	enum check_states	check_state;
@@ -240,19 +236,20 @@ struct stripe_head {
 };

 /* stripe_head_state - collects and tracks the dynamic state of a stripe_head
- * for handle_stripe.  It is only valid under spin_lock(sh->lock);
+ * for handle_stripe.
  */
 struct stripe_head_state {
	int syncing, expanding, expanded;
	int locked, uptodate, to_read, to_write, failed, written;
	int to_fill, compute, req_compute, non_overwrite;
-	int failed_num;
+	int failed_num[2];
+	int p_failed, q_failed;
+	int dec_preread_active;
	unsigned long ops_request;
-};

-/* r6_state - extra state data only relevant to r6 */
-struct r6_state {
-	int p_failed, q_failed, failed_num[2];
+	struct bio *return_bi;
+	mdk_rdev_t *blocked_rdev;
+	int handle_bad_blocks;
 };

 /* Flags */
@@ -268,14 +265,16 @@ struct r6_state {
 #define	R5_ReWrite	9	/* have tried to over-write the readerror */

 #define	R5_Expanded	10	/* This block now has post-expand data */
 #define	R5_Wantcompute	11	/* compute_block in progress treat as
				 * uptodate
				 */
 #define	R5_Wantfill	12	/* dev->toread contains a bio that needs
				 * filling
				 */
 #define	R5_Wantdrain	13	/* dev->towrite needs to be drained */
 #define	R5_WantFUA	14	/* Write should be FUA */
+#define	R5_WriteError	15	/* got a write error - need to record it */
+#define	R5_MadeGood	16	/* A bad block has been fixed by writing to it*/
 /*
  * Write method
  */
@@ -289,21 +288,25 @@ struct r6_state {
 /*
  * Stripe state
  */
-#define STRIPE_HANDLE		2
-#define	STRIPE_SYNCING		3
-#define	STRIPE_INSYNC		4
-#define	STRIPE_PREREAD_ACTIVE	5
-#define	STRIPE_DELAYED		6
-#define	STRIPE_DEGRADED		7
-#define	STRIPE_BIT_DELAY	8
-#define	STRIPE_EXPANDING	9
-#define	STRIPE_EXPAND_SOURCE	10
-#define	STRIPE_EXPAND_READY	11
-#define	STRIPE_IO_STARTED	12 /* do not count towards 'bypass_count' */
-#define	STRIPE_FULL_WRITE	13 /* all blocks are set to be overwritten */
-#define	STRIPE_BIOFILL_RUN	14
-#define	STRIPE_COMPUTE_RUN	15
-#define	STRIPE_OPS_REQ_PENDING	16
+enum {
+	STRIPE_ACTIVE,
+	STRIPE_HANDLE,
+	STRIPE_SYNC_REQUESTED,
+	STRIPE_SYNCING,
+	STRIPE_INSYNC,
+	STRIPE_PREREAD_ACTIVE,
+	STRIPE_DELAYED,
+	STRIPE_DEGRADED,
+	STRIPE_BIT_DELAY,
+	STRIPE_EXPANDING,
+	STRIPE_EXPAND_SOURCE,
+	STRIPE_EXPAND_READY,
+	STRIPE_IO_STARTED,	/* do not count towards 'bypass_count' */
+	STRIPE_FULL_WRITE,	/* all blocks are set to be overwritten */
+	STRIPE_BIOFILL_RUN,
+	STRIPE_COMPUTE_RUN,
+	STRIPE_OPS_REQ_PENDING,
+};

 /*
  * Operation request flags
@@ -336,7 +339,7 @@ struct r6_state {
  * PREREAD_ACTIVE.
  * In stripe_handle, if we find pre-reading is necessary, we do it if
  * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue.
- * HANDLE gets cleared if stripe_handle leave nothing locked.
+ * HANDLE gets cleared if stripe_handle leaves nothing locked.
  */
@@ -399,7 +402,7 @@ struct raid5_private_data {
					    * (fresh device added).
					    * Cleared when a sync completes.
					    */
-
+	int			recovery_disabled;
	/* per cpu variables */
	struct raid5_percpu {
		struct page	*spare_page; /* Used when checking P/Q in raid6 */
...
This diff is collapsed.