Commit c96aec34, authored by Mikulas Patocka and committed by Mike Snitzer

dm stats: support precise timestamps

Make it possible to use precise timestamps with nanosecond granularity
in dm statistics.
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
parent dd4c1b7d
......@@ -13,9 +13,13 @@ the range specified.
The I/O statistics counters for each step-sized area of a region are
in the same format as /sys/block/*/stat or /proc/diskstats (see:
Documentation/iostats.txt). But two extra counters (12 and 13) are
provided: total time spent reading and writing in milliseconds. All
these counters may be accessed by sending the @stats_print message to
the appropriate DM device via dmsetup.
provided: total time spent reading and writing. All these counters may
be accessed by sending the @stats_print message to the appropriate DM
device via dmsetup.
By default, the reported times are in milliseconds and their granularity
depends on the kernel ticks. When the precise_timestamps option is used,
the reported times are in nanoseconds.
Each region has a corresponding unique identifier, which we call a
region_id, that is assigned when the region is created. The region_id
......@@ -33,7 +37,9 @@ memory is used by reading
Messages
========
@stats_create <range> <step> [<program_id> [<aux_data>]]
@stats_create <range> <step>
[<number_of_optional_arguments> <optional_arguments>...]
[<program_id> [<aux_data>]]
Create a new region and return the region_id.
......@@ -48,6 +54,17 @@ Messages
"/<number_of_areas>" - the range is subdivided into the specified
number of areas.
<number_of_optional_arguments>
The number of optional arguments
<optional_arguments>
The following optional arguments are supported:
precise_timestamps - use precise timer with nanosecond resolution
instead of the "jiffies" variable. When this argument is
used, the resulting times are in nanoseconds instead of
milliseconds. Precise timestamps are a little bit slower
to obtain than jiffies-based timestamps.
<program_id>
An optional parameter. A name that uniquely identifies
the userspace owner of the range. This groups ranges together
......@@ -55,6 +72,9 @@ Messages
created and ignore those created by others.
The kernel returns this string back in the output of
@stats_list message, but it doesn't use it for anything else.
If the number of optional arguments is omitted, program_id must
not be a number; otherwise it would be interpreted as the number
of optional arguments.
<aux_data>
An optional parameter. A word that provides auxiliary data
......
......@@ -33,13 +33,14 @@ struct dm_stat_percpu {
struct dm_stat_shared {
atomic_t in_flight[2];
unsigned long stamp;
unsigned long long stamp;
struct dm_stat_percpu tmp;
};
struct dm_stat {
struct list_head list_entry;
int id;
unsigned stat_flags;
size_t n_entries;
sector_t start;
sector_t end;
......@@ -53,6 +54,8 @@ struct dm_stat {
struct dm_stat_shared stat_shared[0];
};
#define STAT_PRECISE_TIMESTAMPS 1
struct dm_stats_last_position {
sector_t last_sector;
unsigned last_rw;
......@@ -224,7 +227,8 @@ void dm_stats_cleanup(struct dm_stats *stats)
}
static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
sector_t step, const char *program_id, const char *aux_data,
sector_t step, unsigned stat_flags,
const char *program_id, const char *aux_data,
void (*suspend_callback)(struct mapped_device *),
void (*resume_callback)(struct mapped_device *),
struct mapped_device *md)
......@@ -265,6 +269,7 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
if (!s)
return -ENOMEM;
s->stat_flags = stat_flags;
s->n_entries = n_entries;
s->start = start;
s->end = end;
......@@ -414,18 +419,24 @@ static int dm_stats_list(struct dm_stats *stats, const char *program,
return 1;
}
static void dm_stat_round(struct dm_stat_shared *shared, struct dm_stat_percpu *p)
static void dm_stat_round(struct dm_stat *s, struct dm_stat_shared *shared,
struct dm_stat_percpu *p)
{
/*
* This is racy, but so is part_round_stats_single.
*/
unsigned long now = jiffies;
unsigned in_flight_read;
unsigned in_flight_write;
unsigned long difference = now - shared->stamp;
unsigned long long now, difference;
unsigned in_flight_read, in_flight_write;
if (likely(!(s->stat_flags & STAT_PRECISE_TIMESTAMPS)))
now = jiffies;
else
now = ktime_to_ns(ktime_get());
difference = now - shared->stamp;
if (!difference)
return;
in_flight_read = (unsigned)atomic_read(&shared->in_flight[READ]);
in_flight_write = (unsigned)atomic_read(&shared->in_flight[WRITE]);
if (in_flight_read)
......@@ -440,8 +451,9 @@ static void dm_stat_round(struct dm_stat_shared *shared, struct dm_stat_percpu *
}
static void dm_stat_for_entry(struct dm_stat *s, size_t entry,
unsigned long bi_rw, sector_t len, bool merged,
bool end, unsigned long duration)
unsigned long bi_rw, sector_t len,
struct dm_stats_aux *stats_aux, bool end,
unsigned long duration_jiffies)
{
unsigned long idx = bi_rw & REQ_WRITE;
struct dm_stat_shared *shared = &s->stat_shared[entry];
......@@ -471,15 +483,18 @@ static void dm_stat_for_entry(struct dm_stat *s, size_t entry,
p = &s->stat_percpu[smp_processor_id()][entry];
if (!end) {
dm_stat_round(shared, p);
dm_stat_round(s, shared, p);
atomic_inc(&shared->in_flight[idx]);
} else {
dm_stat_round(shared, p);
dm_stat_round(s, shared, p);
atomic_dec(&shared->in_flight[idx]);
p->sectors[idx] += len;
p->ios[idx] += 1;
p->merges[idx] += merged;
p->ticks[idx] += duration;
p->merges[idx] += stats_aux->merged;
if (!(s->stat_flags & STAT_PRECISE_TIMESTAMPS))
p->ticks[idx] += duration_jiffies;
else
p->ticks[idx] += stats_aux->duration_ns;
}
#if BITS_PER_LONG == 32
......@@ -491,7 +506,7 @@ static void dm_stat_for_entry(struct dm_stat *s, size_t entry,
static void __dm_stat_bio(struct dm_stat *s, unsigned long bi_rw,
sector_t bi_sector, sector_t end_sector,
bool end, unsigned long duration,
bool end, unsigned long duration_jiffies,
struct dm_stats_aux *stats_aux)
{
sector_t rel_sector, offset, todo, fragment_len;
......@@ -520,7 +535,7 @@ static void __dm_stat_bio(struct dm_stat *s, unsigned long bi_rw,
if (fragment_len > s->step - offset)
fragment_len = s->step - offset;
dm_stat_for_entry(s, entry, bi_rw, fragment_len,
stats_aux->merged, end, duration);
stats_aux, end, duration_jiffies);
todo -= fragment_len;
entry++;
offset = 0;
......@@ -529,11 +544,13 @@ static void __dm_stat_bio(struct dm_stat *s, unsigned long bi_rw,
void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw,
sector_t bi_sector, unsigned bi_sectors, bool end,
unsigned long duration, struct dm_stats_aux *stats_aux)
unsigned long duration_jiffies,
struct dm_stats_aux *stats_aux)
{
struct dm_stat *s;
sector_t end_sector;
struct dm_stats_last_position *last;
bool got_precise_time;
if (unlikely(!bi_sectors))
return;
......@@ -557,8 +574,17 @@ void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw,
rcu_read_lock();
list_for_each_entry_rcu(s, &stats->list, list_entry)
__dm_stat_bio(s, bi_rw, bi_sector, end_sector, end, duration, stats_aux);
got_precise_time = false;
list_for_each_entry_rcu(s, &stats->list, list_entry) {
if (s->stat_flags & STAT_PRECISE_TIMESTAMPS && !got_precise_time) {
if (!end)
stats_aux->duration_ns = ktime_to_ns(ktime_get());
else
stats_aux->duration_ns = ktime_to_ns(ktime_get()) - stats_aux->duration_ns;
got_precise_time = true;
}
__dm_stat_bio(s, bi_rw, bi_sector, end_sector, end, duration_jiffies, stats_aux);
}
rcu_read_unlock();
}
......@@ -571,7 +597,7 @@ static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared
local_irq_disable();
p = &s->stat_percpu[smp_processor_id()][x];
dm_stat_round(shared, p);
dm_stat_round(s, shared, p);
local_irq_enable();
memset(&shared->tmp, 0, sizeof(shared->tmp));
......@@ -643,11 +669,15 @@ static int dm_stats_clear(struct dm_stats *stats, int id)
/*
* This is like jiffies_to_msecs, but works for 64-bit values.
*/
static unsigned long long dm_jiffies_to_msec64(unsigned long long j)
static unsigned long long dm_jiffies_to_msec64(struct dm_stat *s, unsigned long long j)
{
unsigned long long result = 0;
unsigned long long result;
unsigned mult;
if (s->stat_flags & STAT_PRECISE_TIMESTAMPS)
return j;
result = 0;
if (j)
result = jiffies_to_msecs(j & 0x3fffff);
if (j >= 1 << 22) {
......@@ -709,16 +739,16 @@ static int dm_stats_print(struct dm_stats *stats, int id,
shared->tmp.ios[READ],
shared->tmp.merges[READ],
shared->tmp.sectors[READ],
dm_jiffies_to_msec64(shared->tmp.ticks[READ]),
dm_jiffies_to_msec64(s, shared->tmp.ticks[READ]),
shared->tmp.ios[WRITE],
shared->tmp.merges[WRITE],
shared->tmp.sectors[WRITE],
dm_jiffies_to_msec64(shared->tmp.ticks[WRITE]),
dm_jiffies_to_msec64(s, shared->tmp.ticks[WRITE]),
dm_stat_in_flight(shared),
dm_jiffies_to_msec64(shared->tmp.io_ticks_total),
dm_jiffies_to_msec64(shared->tmp.time_in_queue),
dm_jiffies_to_msec64(shared->tmp.io_ticks[READ]),
dm_jiffies_to_msec64(shared->tmp.io_ticks[WRITE]));
dm_jiffies_to_msec64(s, shared->tmp.io_ticks_total),
dm_jiffies_to_msec64(s, shared->tmp.time_in_queue),
dm_jiffies_to_msec64(s, shared->tmp.io_ticks[READ]),
dm_jiffies_to_msec64(s, shared->tmp.io_ticks[WRITE]));
if (unlikely(sz + 1 >= maxlen))
goto buffer_overflow;
......@@ -769,21 +799,31 @@ static int message_stats_create(struct mapped_device *md,
unsigned long long start, end, len, step;
unsigned divisor;
const char *program_id, *aux_data;
unsigned stat_flags = 0;
struct dm_arg_set as, as_backup;
const char *a;
unsigned feature_args;
/*
* Input format:
* <range> <step> [<program_id> [<aux_data>]]
* <range> <step> [<extra_parameters> <parameters>] [<program_id> [<aux_data>]]
*/
if (argc < 3 || argc > 5)
if (argc < 3)
return -EINVAL;
if (!strcmp(argv[1], "-")) {
as.argc = argc;
as.argv = argv;
dm_consume_args(&as, 1);
a = dm_shift_arg(&as);
if (!strcmp(a, "-")) {
start = 0;
len = dm_get_size(md);
if (!len)
len = 1;
} else if (sscanf(argv[1], "%llu+%llu%c", &start, &len, &dummy) != 2 ||
} else if (sscanf(a, "%llu+%llu%c", &start, &len, &dummy) != 2 ||
start != (sector_t)start || len != (sector_t)len)
return -EINVAL;
......@@ -791,7 +831,8 @@ static int message_stats_create(struct mapped_device *md,
if (start >= end)
return -EINVAL;
if (sscanf(argv[2], "/%u%c", &divisor, &dummy) == 1) {
a = dm_shift_arg(&as);
if (sscanf(a, "/%u%c", &divisor, &dummy) == 1) {
if (!divisor)
return -EINVAL;
step = end - start;
......@@ -799,18 +840,39 @@ static int message_stats_create(struct mapped_device *md,
step++;
if (!step)
step = 1;
} else if (sscanf(argv[2], "%llu%c", &step, &dummy) != 1 ||
} else if (sscanf(a, "%llu%c", &step, &dummy) != 1 ||
step != (sector_t)step || !step)
return -EINVAL;
as_backup = as;
a = dm_shift_arg(&as);
if (a && sscanf(a, "%u%c", &feature_args, &dummy) == 1) {
while (feature_args--) {
a = dm_shift_arg(&as);
if (!a)
return -EINVAL;
if (!strcasecmp(a, "precise_timestamps"))
stat_flags |= STAT_PRECISE_TIMESTAMPS;
else
return -EINVAL;
}
} else {
as = as_backup;
}
program_id = "-";
aux_data = "-";
if (argc > 3)
program_id = argv[3];
a = dm_shift_arg(&as);
if (a)
program_id = a;
a = dm_shift_arg(&as);
if (a)
aux_data = a;
if (argc > 4)
aux_data = argv[4];
if (as.argc)
return -EINVAL;
/*
* If a buffer overflow happens after we created the region,
......@@ -822,7 +884,7 @@ static int message_stats_create(struct mapped_device *md,
if (dm_message_test_buffer_overflow(result, maxlen))
return 1;
id = dm_stats_create(dm_get_stats(md), start, end, step, program_id, aux_data,
id = dm_stats_create(dm_get_stats(md), start, end, step, stat_flags, program_id, aux_data,
dm_internal_suspend_fast, dm_internal_resume_fast, md);
if (id < 0)
return id;
......
......@@ -18,6 +18,7 @@ struct dm_stats {
/*
 * Per-I/O auxiliary state passed by pointer to dm_stats_account_io().
 */
struct dm_stats_aux {
/* Added to the per-CPU merges counter when the end of the I/O is accounted. */
bool merged;
/*
 * Written by dm_stats_account_io() only when at least one region has
 * precise timestamps enabled: set to the ktime_get() value in nanoseconds
 * when the start of the I/O is accounted, then replaced by the elapsed
 * nanoseconds when its end is accounted.
 */
unsigned long long duration_ns;
};
void dm_stats_init(struct dm_stats *st);
......@@ -30,7 +31,8 @@ int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv,
void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw,
sector_t bi_sector, unsigned bi_sectors, bool end,
unsigned long duration, struct dm_stats_aux *aux);
unsigned long duration_jiffies,
struct dm_stats_aux *aux);
static inline bool dm_stats_used(struct dm_stats *st)
{
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment