Commit 22d2cfdf authored by Ilya Dryomov

libceph: move away from global osd_req_flags

osd_req_flags is overly general and doesn't suit its only user
(read_from_replica option) well:

- applying osd_req_flags in account_request() affects all OSD
  requests, including linger (i.e. watch and notify).  However,
  linger requests should always go to the primary even though
  some of them are reads (e.g. notify has side effects but it
  is a read because it doesn't result in mutation on the OSDs).

- calls to class methods that are reads are allowed to go to
  the replica, but most such calls issued for "rbd map" and/or
  exclusive lock transitions are requested to be resent to the
  primary via EAGAIN, doubling the latency.

Get rid of global osd_req_flags and set read_from_replica flag
only on specific OSD requests instead.

Fixes: 8ad44d5e ("libceph: read_from_replica option")
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
parent b3a9e3b9
...@@ -1451,8 +1451,10 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req) ...@@ -1451,8 +1451,10 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
static void rbd_osd_format_read(struct ceph_osd_request *osd_req) static void rbd_osd_format_read(struct ceph_osd_request *osd_req)
{ {
struct rbd_obj_request *obj_request = osd_req->r_priv; struct rbd_obj_request *obj_request = osd_req->r_priv;
struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
struct ceph_options *opt = rbd_dev->rbd_client->client->options;
osd_req->r_flags = CEPH_OSD_FLAG_READ; osd_req->r_flags = CEPH_OSD_FLAG_READ | opt->read_from_replica;
osd_req->r_snapid = obj_request->img_request->snap_id; osd_req->r_snapid = obj_request->img_request->snap_id;
} }
......
...@@ -52,8 +52,7 @@ struct ceph_options { ...@@ -52,8 +52,7 @@ struct ceph_options {
unsigned long osd_idle_ttl; /* jiffies */ unsigned long osd_idle_ttl; /* jiffies */
unsigned long osd_keepalive_timeout; /* jiffies */ unsigned long osd_keepalive_timeout; /* jiffies */
unsigned long osd_request_timeout; /* jiffies */ unsigned long osd_request_timeout; /* jiffies */
u32 read_from_replica; /* CEPH_OSD_FLAG_BALANCE/LOCALIZE_READS */
u32 osd_req_flags; /* CEPH_OSD_FLAG_*, applied to each OSD request */
/* /*
* any type that can't be simply compared or doesn't need * any type that can't be simply compared or doesn't need
...@@ -76,6 +75,7 @@ struct ceph_options { ...@@ -76,6 +75,7 @@ struct ceph_options {
#define CEPH_OSD_KEEPALIVE_DEFAULT msecs_to_jiffies(5 * 1000) #define CEPH_OSD_KEEPALIVE_DEFAULT msecs_to_jiffies(5 * 1000)
#define CEPH_OSD_IDLE_TTL_DEFAULT msecs_to_jiffies(60 * 1000) #define CEPH_OSD_IDLE_TTL_DEFAULT msecs_to_jiffies(60 * 1000)
#define CEPH_OSD_REQUEST_TIMEOUT_DEFAULT 0 /* no timeout */ #define CEPH_OSD_REQUEST_TIMEOUT_DEFAULT 0 /* no timeout */
#define CEPH_READ_FROM_REPLICA_DEFAULT 0 /* read from primary */
#define CEPH_MONC_HUNT_INTERVAL msecs_to_jiffies(3 * 1000) #define CEPH_MONC_HUNT_INTERVAL msecs_to_jiffies(3 * 1000)
#define CEPH_MONC_PING_INTERVAL msecs_to_jiffies(10 * 1000) #define CEPH_MONC_PING_INTERVAL msecs_to_jiffies(10 * 1000)
......
...@@ -332,6 +332,7 @@ struct ceph_options *ceph_alloc_options(void) ...@@ -332,6 +332,7 @@ struct ceph_options *ceph_alloc_options(void)
opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT;
opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;
opt->osd_request_timeout = CEPH_OSD_REQUEST_TIMEOUT_DEFAULT; opt->osd_request_timeout = CEPH_OSD_REQUEST_TIMEOUT_DEFAULT;
opt->read_from_replica = CEPH_READ_FROM_REPLICA_DEFAULT;
return opt; return opt;
} }
EXPORT_SYMBOL(ceph_alloc_options); EXPORT_SYMBOL(ceph_alloc_options);
...@@ -490,16 +491,13 @@ int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt, ...@@ -490,16 +491,13 @@ int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt,
case Opt_read_from_replica: case Opt_read_from_replica:
switch (result.uint_32) { switch (result.uint_32) {
case Opt_read_from_replica_no: case Opt_read_from_replica_no:
opt->osd_req_flags &= ~(CEPH_OSD_FLAG_BALANCE_READS | opt->read_from_replica = 0;
CEPH_OSD_FLAG_LOCALIZE_READS);
break; break;
case Opt_read_from_replica_balance: case Opt_read_from_replica_balance:
opt->osd_req_flags |= CEPH_OSD_FLAG_BALANCE_READS; opt->read_from_replica = CEPH_OSD_FLAG_BALANCE_READS;
opt->osd_req_flags &= ~CEPH_OSD_FLAG_LOCALIZE_READS;
break; break;
case Opt_read_from_replica_localize: case Opt_read_from_replica_localize:
opt->osd_req_flags |= CEPH_OSD_FLAG_LOCALIZE_READS; opt->read_from_replica = CEPH_OSD_FLAG_LOCALIZE_READS;
opt->osd_req_flags &= ~CEPH_OSD_FLAG_BALANCE_READS;
break; break;
default: default:
BUG(); BUG();
...@@ -613,9 +611,9 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client, ...@@ -613,9 +611,9 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client,
} }
seq_putc(m, ','); seq_putc(m, ',');
} }
if (opt->osd_req_flags & CEPH_OSD_FLAG_BALANCE_READS) { if (opt->read_from_replica == CEPH_OSD_FLAG_BALANCE_READS) {
seq_puts(m, "read_from_replica=balance,"); seq_puts(m, "read_from_replica=balance,");
} else if (opt->osd_req_flags & CEPH_OSD_FLAG_LOCALIZE_READS) { } else if (opt->read_from_replica == CEPH_OSD_FLAG_LOCALIZE_READS) {
seq_puts(m, "read_from_replica=localize,"); seq_puts(m, "read_from_replica=localize,");
} }
......
...@@ -1117,10 +1117,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, ...@@ -1117,10 +1117,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
truncate_size, truncate_seq); truncate_size, truncate_seq);
} }
req->r_flags = flags;
req->r_base_oloc.pool = layout->pool_id; req->r_base_oloc.pool = layout->pool_id;
req->r_base_oloc.pool_ns = ceph_try_get_string(layout->pool_ns); req->r_base_oloc.pool_ns = ceph_try_get_string(layout->pool_ns);
ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum); ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum);
req->r_flags = flags | osdc->client->options->read_from_replica;
req->r_snapid = vino.snap; req->r_snapid = vino.snap;
if (flags & CEPH_OSD_FLAG_WRITE) if (flags & CEPH_OSD_FLAG_WRITE)
...@@ -2431,14 +2431,11 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked) ...@@ -2431,14 +2431,11 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
static void account_request(struct ceph_osd_request *req) static void account_request(struct ceph_osd_request *req)
{ {
struct ceph_osd_client *osdc = req->r_osdc;
WARN_ON(req->r_flags & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK)); WARN_ON(req->r_flags & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK));
WARN_ON(!(req->r_flags & (CEPH_OSD_FLAG_READ | CEPH_OSD_FLAG_WRITE))); WARN_ON(!(req->r_flags & (CEPH_OSD_FLAG_READ | CEPH_OSD_FLAG_WRITE)));
req->r_flags |= CEPH_OSD_FLAG_ONDISK; req->r_flags |= CEPH_OSD_FLAG_ONDISK;
req->r_flags |= osdc->client->options->osd_req_flags; atomic_inc(&req->r_osdc->num_requests);
atomic_inc(&osdc->num_requests);
req->r_start_stamp = jiffies; req->r_start_stamp = jiffies;
req->r_start_latency = ktime_get(); req->r_start_latency = ktime_get();
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment