Commit 10f6d992, authored by Lars Ellenberg, committed by Philipp Reisner

drbd: don't BUG_ON, if bio_add_page of a single page to an empty bio fails

Just deal with it more gracefully if we fail to add even a single page
to an empty bio. We used to BUG_ON() there, but this failure has been
observed in some Xen deployments, so we need to handle that case more
robustly now.
Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
parent 039312b6
...@@ -1073,6 +1073,16 @@ void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo) ...@@ -1073,6 +1073,16 @@ void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo)
* @mdev: DRBD device. * @mdev: DRBD device.
* @e: epoch entry * @e: epoch entry
* @rw: flag field, see bio->bi_rw * @rw: flag field, see bio->bi_rw
*
* May spread the pages to multiple bios,
* depending on bio_add_page restrictions.
*
* Returns 0 if all bios have been submitted,
* -ENOMEM if we could not allocate enough bios,
* -ENOSPC (any better suggestion?) if we have not been able to bio_add_page a
* single page to an empty bio (which should never happen and likely indicates
* that the lower level IO stack is in some way broken). This has been observed
* on certain Xen deployments.
*/ */
/* TODO allocate from our own bio_set. */ /* TODO allocate from our own bio_set. */
int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
...@@ -1085,6 +1095,7 @@ int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, ...@@ -1085,6 +1095,7 @@ int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
unsigned ds = e->size; unsigned ds = e->size;
unsigned n_bios = 0; unsigned n_bios = 0;
unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT; unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT;
int err = -ENOMEM;
/* In most cases, we will only need one bio. But in case the lower /* In most cases, we will only need one bio. But in case the lower
* level restrictions happen to be different at this offset on this * level restrictions happen to be different at this offset on this
...@@ -1110,8 +1121,17 @@ int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, ...@@ -1110,8 +1121,17 @@ int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
page_chain_for_each(page) { page_chain_for_each(page) {
unsigned len = min_t(unsigned, ds, PAGE_SIZE); unsigned len = min_t(unsigned, ds, PAGE_SIZE);
if (!bio_add_page(bio, page, len, 0)) { if (!bio_add_page(bio, page, len, 0)) {
/* a single page must always be possible! */ /* A single page must always be possible!
BUG_ON(bio->bi_vcnt == 0); * But in case it fails anyways,
* we deal with it, and complain (below). */
if (bio->bi_vcnt == 0) {
dev_err(DEV,
"bio_add_page failed for len=%u, "
"bi_vcnt=0 (bi_sector=%llu)\n",
len, (unsigned long long)bio->bi_sector);
err = -ENOSPC;
goto fail;
}
goto next_bio; goto next_bio;
} }
ds -= len; ds -= len;
...@@ -1137,7 +1157,7 @@ int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, ...@@ -1137,7 +1157,7 @@ int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
bios = bios->bi_next; bios = bios->bi_next;
bio_put(bio); bio_put(bio);
} }
return -ENOMEM; return err;
} }
static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
...@@ -1436,9 +1456,8 @@ static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_si ...@@ -1436,9 +1456,8 @@ static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_si
if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0) if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0)
return true; return true;
/* drbd_submit_ee currently fails for one reason only: /* don't care for the reason here */
* not being able to allocate enough bios. dev_err(DEV, "submit failed, triggering re-connect\n");
* Is dropping the connection going to help? */
spin_lock_irq(&mdev->req_lock); spin_lock_irq(&mdev->req_lock);
list_del(&e->w.list); list_del(&e->w.list);
spin_unlock_irq(&mdev->req_lock); spin_unlock_irq(&mdev->req_lock);
...@@ -1837,9 +1856,8 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned ...@@ -1837,9 +1856,8 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
if (drbd_submit_ee(mdev, e, rw, DRBD_FAULT_DT_WR) == 0) if (drbd_submit_ee(mdev, e, rw, DRBD_FAULT_DT_WR) == 0)
return true; return true;
/* drbd_submit_ee currently fails for one reason only: /* don't care for the reason here */
* not being able to allocate enough bios. dev_err(DEV, "submit failed, triggering re-connect\n");
* Is dropping the connection going to help? */
spin_lock_irq(&mdev->req_lock); spin_lock_irq(&mdev->req_lock);
list_del(&e->w.list); list_del(&e->w.list);
hlist_del_init(&e->colision); hlist_del_init(&e->colision);
...@@ -1848,9 +1866,7 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned ...@@ -1848,9 +1866,7 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
drbd_al_complete_io(mdev, e->sector); drbd_al_complete_io(mdev, e->sector);
out_interrupted: out_interrupted:
/* yes, the epoch_size now is imbalanced. drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + EV_CLEANUP);
* but we drop the connection anyways, so we don't have a chance to
* receive a barrier... atomic_inc(&mdev->epoch_size); */
put_ldev(mdev); put_ldev(mdev);
drbd_free_ee(mdev, e); drbd_free_ee(mdev, e);
return false; return false;
...@@ -2096,9 +2112,8 @@ static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, un ...@@ -2096,9 +2112,8 @@ static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, un
if (drbd_submit_ee(mdev, e, READ, fault_type) == 0) if (drbd_submit_ee(mdev, e, READ, fault_type) == 0)
return true; return true;
/* drbd_submit_ee currently fails for one reason only: /* don't care for the reason here */
* not being able to allocate enough bios. dev_err(DEV, "submit failed, triggering re-connect\n");
* Is dropping the connection going to help? */
spin_lock_irq(&mdev->req_lock); spin_lock_irq(&mdev->req_lock);
list_del(&e->w.list); list_del(&e->w.list);
spin_unlock_irq(&mdev->req_lock); spin_unlock_irq(&mdev->req_lock);
......
...@@ -369,9 +369,10 @@ static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size) ...@@ -369,9 +369,10 @@ static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0) if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0)
return 0; return 0;
/* drbd_submit_ee currently fails for one reason only: /* If it failed because of ENOMEM, retry should help. If it failed
* not being able to allocate enough bios. * because bio_add_page failed (probably broken lower level driver),
* Is dropping the connection going to help? */ * retry may or may not help.
* If it does not, you may need to force disconnect. */
spin_lock_irq(&mdev->req_lock); spin_lock_irq(&mdev->req_lock);
list_del(&e->w.list); list_del(&e->w.list);
spin_unlock_irq(&mdev->req_lock); spin_unlock_irq(&mdev->req_lock);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment