Commit 700ca8c0 authored by Philipp Reisner, committed by Jens Axboe

drbd: Implement handling of thinly provisioned storage on resync target nodes

If during resync we read only zeroes for a range of sectors, assume
that these sectors can be discarded on the sync target node.
Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
parent c5c23854
...@@ -471,6 +471,9 @@ enum { ...@@ -471,6 +471,9 @@ enum {
/* this originates from application on peer /* this originates from application on peer
* (not some resync or verify or other DRBD internal request) */ * (not some resync or verify or other DRBD internal request) */
__EE_APPLICATION, __EE_APPLICATION,
/* If it contains only 0 bytes, send back P_RS_DEALLOCATED */
__EE_RS_THIN_REQ,
}; };
#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) #define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC)
...@@ -485,6 +488,7 @@ enum { ...@@ -485,6 +488,7 @@ enum {
#define EE_SUBMITTED (1<<__EE_SUBMITTED) #define EE_SUBMITTED (1<<__EE_SUBMITTED)
#define EE_WRITE (1<<__EE_WRITE) #define EE_WRITE (1<<__EE_WRITE)
#define EE_APPLICATION (1<<__EE_APPLICATION) #define EE_APPLICATION (1<<__EE_APPLICATION)
#define EE_RS_THIN_REQ (1<<__EE_RS_THIN_REQ)
/* flag bits per device */ /* flag bits per device */
enum { enum {
...@@ -1123,6 +1127,7 @@ extern int drbd_send_ov_request(struct drbd_peer_device *, sector_t sector, int ...@@ -1123,6 +1127,7 @@ extern int drbd_send_ov_request(struct drbd_peer_device *, sector_t sector, int
extern int drbd_send_bitmap(struct drbd_device *device); extern int drbd_send_bitmap(struct drbd_device *device);
extern void drbd_send_sr_reply(struct drbd_peer_device *, enum drbd_state_rv retcode); extern void drbd_send_sr_reply(struct drbd_peer_device *, enum drbd_state_rv retcode);
extern void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode); extern void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode);
extern int drbd_send_rs_deallocated(struct drbd_peer_device *, struct drbd_peer_request *);
extern void drbd_backing_dev_free(struct drbd_device *device, struct drbd_backing_dev *ldev); extern void drbd_backing_dev_free(struct drbd_device *device, struct drbd_backing_dev *ldev);
extern void drbd_device_cleanup(struct drbd_device *device); extern void drbd_device_cleanup(struct drbd_device *device);
void drbd_print_uuids(struct drbd_device *device, const char *text); void drbd_print_uuids(struct drbd_device *device, const char *text);
......
...@@ -1377,6 +1377,22 @@ int drbd_send_ack_ex(struct drbd_peer_device *peer_device, enum drbd_packet cmd, ...@@ -1377,6 +1377,22 @@ int drbd_send_ack_ex(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
cpu_to_be64(block_id)); cpu_to_be64(block_id));
} }
/**
 * drbd_send_rs_deallocated() - Send a P_RS_DEALLOCATED packet describing a peer request
 * @peer_device: peer device; the packet goes out on its connection's data socket.
 * @peer_req: peer request whose interval sector and size are put into the packet.
 *
 * Return: -EIO if drbd_prepare_command() provides no packet buffer,
 * otherwise whatever drbd_send_command() returns.
 */
int drbd_send_rs_deallocated(struct drbd_peer_device *peer_device,
			     struct drbd_peer_request *peer_req)
{
	struct drbd_socket *data_sock = &peer_device->connection->data;
	struct p_block_desc *desc = drbd_prepare_command(peer_device, data_sock);

	if (!desc)
		return -EIO;

	/* Multi-byte fields are stored big-endian in the packet. */
	desc->pad = 0;
	desc->blksize = cpu_to_be32(peer_req->i.size);
	desc->sector = cpu_to_be64(peer_req->i.sector);

	/* Header-only packet: no additional payload is attached. */
	return drbd_send_command(peer_device, data_sock, P_RS_DEALLOCATED,
				 sizeof(*desc), NULL, 0);
}
int drbd_send_drequest(struct drbd_peer_device *peer_device, int cmd, int drbd_send_drequest(struct drbd_peer_device *peer_device, int cmd,
sector_t sector, int size, u64 block_id) sector_t sector, int size, u64 block_id)
{ {
...@@ -3683,6 +3699,8 @@ const char *cmdname(enum drbd_packet cmd) ...@@ -3683,6 +3699,8 @@ const char *cmdname(enum drbd_packet cmd)
[P_CONN_ST_CHG_REPLY] = "conn_st_chg_reply", [P_CONN_ST_CHG_REPLY] = "conn_st_chg_reply",
[P_RETRY_WRITE] = "retry_write", [P_RETRY_WRITE] = "retry_write",
[P_PROTOCOL_UPDATE] = "protocol_update", [P_PROTOCOL_UPDATE] = "protocol_update",
[P_RS_THIN_REQ] = "rs_thin_req",
[P_RS_DEALLOCATED] = "rs_deallocated",
/* enum drbd_packet, but not commands - obsoleted flags: /* enum drbd_packet, but not commands - obsoleted flags:
* P_MAY_IGNORE * P_MAY_IGNORE
......
...@@ -60,6 +60,10 @@ enum drbd_packet { ...@@ -60,6 +60,10 @@ enum drbd_packet {
* which is why I chose TRIM here, to disambiguate. */ * which is why I chose TRIM here, to disambiguate. */
P_TRIM = 0x31, P_TRIM = 0x31,
/* Only use these two if both support FF_THIN_RESYNC */
P_RS_THIN_REQ = 0x32, /* Request a block for resync or reply P_RS_DEALLOCATED */
P_RS_DEALLOCATED = 0x33, /* Contains only zeros on sync source node */
P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */ P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */
P_MAX_OPT_CMD = 0x101, P_MAX_OPT_CMD = 0x101,
......
...@@ -1418,9 +1418,15 @@ int drbd_submit_peer_request(struct drbd_device *device, ...@@ -1418,9 +1418,15 @@ int drbd_submit_peer_request(struct drbd_device *device,
* so we can find it to present it in debugfs */ * so we can find it to present it in debugfs */
peer_req->submit_jif = jiffies; peer_req->submit_jif = jiffies;
peer_req->flags |= EE_SUBMITTED; peer_req->flags |= EE_SUBMITTED;
spin_lock_irq(&device->resource->req_lock);
list_add_tail(&peer_req->w.list, &device->active_ee); /* If this was a resync request from receive_rs_deallocated(),
spin_unlock_irq(&device->resource->req_lock); * it is already on the sync_ee list */
if (list_empty(&peer_req->w.list)) {
spin_lock_irq(&device->resource->req_lock);
list_add_tail(&peer_req->w.list, &device->active_ee);
spin_unlock_irq(&device->resource->req_lock);
}
if (blkdev_issue_zeroout(device->ldev->backing_bdev, if (blkdev_issue_zeroout(device->ldev->backing_bdev,
sector, data_size >> 9, GFP_NOIO, false)) sector, data_size >> 9, GFP_NOIO, false))
peer_req->flags |= EE_WAS_ERROR; peer_req->flags |= EE_WAS_ERROR;
...@@ -2585,6 +2591,7 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet ...@@ -2585,6 +2591,7 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
case P_DATA_REQUEST: case P_DATA_REQUEST:
drbd_send_ack_rp(peer_device, P_NEG_DREPLY, p); drbd_send_ack_rp(peer_device, P_NEG_DREPLY, p);
break; break;
case P_RS_THIN_REQ:
case P_RS_DATA_REQUEST: case P_RS_DATA_REQUEST:
case P_CSUM_RS_REQUEST: case P_CSUM_RS_REQUEST:
case P_OV_REQUEST: case P_OV_REQUEST:
...@@ -2624,6 +2631,12 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet ...@@ -2624,6 +2631,12 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
peer_req->flags |= EE_APPLICATION; peer_req->flags |= EE_APPLICATION;
goto submit; goto submit;
case P_RS_THIN_REQ:
/* If at some point in the future we have a smart way to
find out if this data block is completely deallocated,
then we would do something smarter here than reading
the block... */
peer_req->flags |= EE_RS_THIN_REQ;
case P_RS_DATA_REQUEST: case P_RS_DATA_REQUEST:
peer_req->w.cb = w_e_end_rsdata_req; peer_req->w.cb = w_e_end_rsdata_req;
fault_type = DRBD_FAULT_RS_RD; fault_type = DRBD_FAULT_RS_RD;
...@@ -4599,6 +4612,72 @@ static int receive_out_of_sync(struct drbd_connection *connection, struct packet ...@@ -4599,6 +4612,72 @@ static int receive_out_of_sync(struct drbd_connection *connection, struct packet
return 0; return 0;
} }
static int receive_rs_deallocated(struct drbd_connection *connection, struct packet_info *pi)
{
struct drbd_peer_device *peer_device;
struct p_block_desc *p = pi->data;
struct drbd_device *device;
sector_t sector;
int size, err = 0;
peer_device = conn_peer_device(connection, pi->vnr);
if (!peer_device)
return -EIO;
device = peer_device->device;
sector = be64_to_cpu(p->sector);
size = be32_to_cpu(p->blksize);
dec_rs_pending(device);
if (get_ldev(device)) {
struct drbd_peer_request *peer_req;
const int op = REQ_OP_DISCARD;
peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER, sector,
size, false, GFP_NOIO);
if (!peer_req) {
put_ldev(device);
return -ENOMEM;
}
peer_req->w.cb = e_end_resync_block;
peer_req->submit_jif = jiffies;
peer_req->flags |= EE_IS_TRIM;
spin_lock_irq(&device->resource->req_lock);
list_add_tail(&peer_req->w.list, &device->sync_ee);
spin_unlock_irq(&device->resource->req_lock);
atomic_add(pi->size >> 9, &device->rs_sect_ev);
err = drbd_submit_peer_request(device, peer_req, op, 0, DRBD_FAULT_RS_WR);
if (err) {
spin_lock_irq(&device->resource->req_lock);
list_del(&peer_req->w.list);
spin_unlock_irq(&device->resource->req_lock);
drbd_free_peer_req(device, peer_req);
put_ldev(device);
err = 0;
goto fail;
}
inc_unacked(device);
/* No put_ldev() here. Gets called in drbd_endio_write_sec_final(),
as well as drbd_rs_complete_io() */
} else {
fail:
drbd_rs_complete_io(device, sector);
drbd_send_ack_ex(peer_device, P_NEG_ACK, sector, size, ID_SYNCER);
}
atomic_add(size >> 9, &device->rs_sect_in);
return err;
}
struct data_cmd { struct data_cmd {
int expect_payload; int expect_payload;
size_t pkt_size; size_t pkt_size;
...@@ -4626,11 +4705,14 @@ static struct data_cmd drbd_cmd_handler[] = { ...@@ -4626,11 +4705,14 @@ static struct data_cmd drbd_cmd_handler[] = {
[P_OV_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest }, [P_OV_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
[P_OV_REPLY] = { 1, sizeof(struct p_block_req), receive_DataRequest }, [P_OV_REPLY] = { 1, sizeof(struct p_block_req), receive_DataRequest },
[P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest }, [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest },
[P_RS_THIN_REQ] = { 0, sizeof(struct p_block_req), receive_DataRequest },
[P_DELAY_PROBE] = { 0, sizeof(struct p_delay_probe93), receive_skip }, [P_DELAY_PROBE] = { 0, sizeof(struct p_delay_probe93), receive_skip },
[P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync }, [P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync },
[P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state }, [P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state },
[P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol }, [P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol },
[P_TRIM] = { 0, sizeof(struct p_trim), receive_Data }, [P_TRIM] = { 0, sizeof(struct p_trim), receive_Data },
[P_RS_DEALLOCATED] = { 0, sizeof(struct p_block_desc), receive_rs_deallocated },
}; };
static void drbdd(struct drbd_connection *connection) static void drbdd(struct drbd_connection *connection)
......
...@@ -1036,6 +1036,30 @@ int w_e_end_data_req(struct drbd_work *w, int cancel) ...@@ -1036,6 +1036,30 @@ int w_e_end_data_req(struct drbd_work *w, int cancel)
return err; return err;
} }
/**
 * all_zero() - Check whether the data pages of a peer request hold only zeroes
 * @peer_req: peer request; its page chain (peer_req->pages) is scanned for
 *            up to peer_req->i.size bytes.
 *
 * Each page is mapped and compared one unsigned long at a time. Within a
 * page only whole long-sized words are examined, so any trailing bytes of a
 * length that is not a multiple of sizeof(long) are not looked at.
 *
 * Return: false as soon as a non-zero word is found, true otherwise.
 */
static bool all_zero(struct drbd_peer_request *peer_req)
{
	struct page *page = peer_req->pages;
	unsigned int remaining = peer_req->i.size;

	page_chain_for_each(page) {
		unsigned int chunk = min_t(unsigned int, remaining, PAGE_SIZE);
		unsigned int nr_words = chunk / sizeof(long);
		unsigned int idx = 0;
		unsigned long *map = kmap_atomic(page);

		/* Stop at the first non-zero word, or at the end of this chunk. */
		while (idx < nr_words && !map[idx])
			idx++;
		kunmap_atomic(map);

		if (idx < nr_words)
			return false;

		remaining -= chunk;
	}

	return true;
}
/** /**
* w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST
* @w: work object. * @w: work object.
...@@ -1064,7 +1088,10 @@ int w_e_end_rsdata_req(struct drbd_work *w, int cancel) ...@@ -1064,7 +1088,10 @@ int w_e_end_rsdata_req(struct drbd_work *w, int cancel)
} else if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { } else if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
if (likely(device->state.pdsk >= D_INCONSISTENT)) { if (likely(device->state.pdsk >= D_INCONSISTENT)) {
inc_rs_pending(device); inc_rs_pending(device);
err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req); if (peer_req->flags & EE_RS_THIN_REQ && all_zero(peer_req))
err = drbd_send_rs_deallocated(peer_device, peer_req);
else
err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req);
} else { } else {
if (__ratelimit(&drbd_ratelimit_state)) if (__ratelimit(&drbd_ratelimit_state))
drbd_err(device, "Not sending RSDataReply, " drbd_err(device, "Not sending RSDataReply, "
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment