Commit e6e28432 authored by Ilya Dryomov

libceph: fix potential hang in ceph_osdc_notify()

If the cluster becomes unavailable, ceph_osdc_notify() may hang even
with osd_request_timeout option set because linger_notify_finish_wait()
waits for MWatchNotify NOTIFY_COMPLETE message with no associated OSD
request in flight -- it's completely asynchronous.

Introduce an additional timeout, derived from the specified notify
timeout.  While at it, switch both waits to killable which is more
correct.

Cc: stable@vger.kernel.org
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Dongsheng Yang <dongsheng.yang@easystack.cn>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
parent 9d01e07f
...@@ -3334,17 +3334,24 @@ static int linger_reg_commit_wait(struct ceph_osd_linger_request *lreq) ...@@ -3334,17 +3334,24 @@ static int linger_reg_commit_wait(struct ceph_osd_linger_request *lreq)
int ret; int ret;
dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id); dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
ret = wait_for_completion_interruptible(&lreq->reg_commit_wait); ret = wait_for_completion_killable(&lreq->reg_commit_wait);
return ret ?: lreq->reg_commit_error; return ret ?: lreq->reg_commit_error;
} }
static int linger_notify_finish_wait(struct ceph_osd_linger_request *lreq) static int linger_notify_finish_wait(struct ceph_osd_linger_request *lreq,
unsigned long timeout)
{ {
int ret; long left;
dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id); dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
ret = wait_for_completion_interruptible(&lreq->notify_finish_wait); left = wait_for_completion_killable_timeout(&lreq->notify_finish_wait,
return ret ?: lreq->notify_finish_error; ceph_timeout_jiffies(timeout));
if (left <= 0)
left = left ?: -ETIMEDOUT;
else
left = lreq->notify_finish_error; /* completed */
return left;
} }
/* /*
...@@ -4896,7 +4903,8 @@ int ceph_osdc_notify(struct ceph_osd_client *osdc, ...@@ -4896,7 +4903,8 @@ int ceph_osdc_notify(struct ceph_osd_client *osdc,
linger_submit(lreq); linger_submit(lreq);
ret = linger_reg_commit_wait(lreq); ret = linger_reg_commit_wait(lreq);
if (!ret) if (!ret)
ret = linger_notify_finish_wait(lreq); ret = linger_notify_finish_wait(lreq,
msecs_to_jiffies(2 * timeout * MSEC_PER_SEC));
else else
dout("lreq %p failed to initiate notify %d\n", lreq, ret); dout("lreq %p failed to initiate notify %d\n", lreq, ret);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment