Commit 5798e4dd, authored by Yishai Hadas and committed by Alex Williamson

vfio/mlx5: Pre-allocate chunks for the STOP_COPY phase

This patch is another preparation step towards working in chunk mode.

It pre-allocates chunks for the STOP_COPY phase so that the driver can
use them immediately and avoid an extra allocation during that phase.

Before this patch we had a single large buffer dedicated to the
STOP_COPY phase, as there was a single SAVE on the source for the
last image.

Once we move to chunk mode, the idea is to have a few small buffers
that will be used during the STOP_COPY phase.

The driver will read ahead the full state from the firmware in
small/optimized chunks, while letting QEMU/user space read the
available data in parallel.

Each buffer holds its chunk number to let it be recognized down the road
in the coming patches.

The chunk buffer size is chosen based on the minimum size that the
firmware requires, the total full size, and a maximum value in the
driver code, which was set to 8MB to achieve an optimized downtime in
the general case.

As chunk mode is applicable even if we move directly to STOP_COPY,
the buffer preparation and some other related work are done
unconditionally, regardless of STOP_COPY/PRE_COPY.

Note:
At this point in the series chunk mode is not yet activated, and
the first buffer is used in all places.
Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20230911093856.81910-7-yishaih@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
parent 9114100d
...@@ -632,9 +632,9 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, ...@@ -632,9 +632,9 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
} }
if (MLX5VF_PRE_COPY_SUPP(mvdev)) { if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
if (async_data->stop_copy_chunk && migf->buf_header) { if (async_data->stop_copy_chunk && migf->buf_header[0]) {
header_buf = migf->buf_header; header_buf = migf->buf_header[0];
migf->buf_header = NULL; migf->buf_header[0] = NULL;
} else { } else {
header_buf = mlx5vf_get_data_buffer(migf, header_buf = mlx5vf_get_data_buffer(migf,
sizeof(struct mlx5_vf_migration_header), DMA_NONE); sizeof(struct mlx5_vf_migration_header), DMA_NONE);
...@@ -721,18 +721,21 @@ void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf) ...@@ -721,18 +721,21 @@ void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf)
void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf) void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf)
{ {
struct mlx5_vhca_data_buffer *entry; struct mlx5_vhca_data_buffer *entry;
int i;
lockdep_assert_held(&migf->mvdev->state_mutex); lockdep_assert_held(&migf->mvdev->state_mutex);
WARN_ON(migf->mvdev->mdev_detach); WARN_ON(migf->mvdev->mdev_detach);
if (migf->buf) { for (i = 0; i < MAX_NUM_CHUNKS; i++) {
mlx5vf_free_data_buffer(migf->buf); if (migf->buf[i]) {
migf->buf = NULL; mlx5vf_free_data_buffer(migf->buf[i]);
} migf->buf[i] = NULL;
}
if (migf->buf_header) { if (migf->buf_header[i]) {
mlx5vf_free_data_buffer(migf->buf_header); mlx5vf_free_data_buffer(migf->buf_header[i]);
migf->buf_header = NULL; migf->buf_header[i] = NULL;
}
} }
list_splice(&migf->avail_list, &migf->buf_list); list_splice(&migf->avail_list, &migf->buf_list);
......
...@@ -64,6 +64,7 @@ struct mlx5_vhca_data_buffer { ...@@ -64,6 +64,7 @@ struct mlx5_vhca_data_buffer {
u32 mkey; u32 mkey;
enum dma_data_direction dma_dir; enum dma_data_direction dma_dir;
u8 dmaed:1; u8 dmaed:1;
u8 stop_copy_chunk_num;
struct list_head buf_elm; struct list_head buf_elm;
struct mlx5_vf_migration_file *migf; struct mlx5_vf_migration_file *migf;
/* Optimize mlx5vf_get_migration_page() for sequential access */ /* Optimize mlx5vf_get_migration_page() for sequential access */
...@@ -82,6 +83,8 @@ struct mlx5vf_async_data { ...@@ -82,6 +83,8 @@ struct mlx5vf_async_data {
void *out; void *out;
}; };
#define MAX_NUM_CHUNKS 2
struct mlx5_vf_migration_file { struct mlx5_vf_migration_file {
struct file *filp; struct file *filp;
struct mutex lock; struct mutex lock;
...@@ -94,8 +97,9 @@ struct mlx5_vf_migration_file { ...@@ -94,8 +97,9 @@ struct mlx5_vf_migration_file {
u32 record_tag; u32 record_tag;
u64 stop_copy_prep_size; u64 stop_copy_prep_size;
u64 pre_copy_initial_bytes; u64 pre_copy_initial_bytes;
struct mlx5_vhca_data_buffer *buf; /* Upon chunk mode preserve another set of buffers for stop_copy phase */
struct mlx5_vhca_data_buffer *buf_header; struct mlx5_vhca_data_buffer *buf[MAX_NUM_CHUNKS];
struct mlx5_vhca_data_buffer *buf_header[MAX_NUM_CHUNKS];
spinlock_t list_lock; spinlock_t list_lock;
struct list_head buf_list; struct list_head buf_list;
struct list_head avail_list; struct list_head avail_list;
......
...@@ -24,6 +24,8 @@ ...@@ -24,6 +24,8 @@
/* Device specification max LOAD size */ /* Device specification max LOAD size */
#define MAX_LOAD_SIZE (BIT_ULL(__mlx5_bit_sz(load_vhca_state_in, size)) - 1) #define MAX_LOAD_SIZE (BIT_ULL(__mlx5_bit_sz(load_vhca_state_in, size)) - 1)
#define MAX_CHUNK_SIZE SZ_8M
static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev) static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
{ {
struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev); struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);
...@@ -304,7 +306,8 @@ static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf) ...@@ -304,7 +306,8 @@ static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
wake_up_interruptible(&migf->poll_wait); wake_up_interruptible(&migf->poll_wait);
} }
static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf) static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf,
bool track)
{ {
size_t size = sizeof(struct mlx5_vf_migration_header) + size_t size = sizeof(struct mlx5_vf_migration_header) +
sizeof(struct mlx5_vf_migration_tag_stop_copy_data); sizeof(struct mlx5_vf_migration_tag_stop_copy_data);
...@@ -331,7 +334,7 @@ static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf) ...@@ -331,7 +334,7 @@ static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf)
to_buff = kmap_local_page(page); to_buff = kmap_local_page(page);
memcpy(to_buff, &header, sizeof(header)); memcpy(to_buff, &header, sizeof(header));
header_buf->length = sizeof(header); header_buf->length = sizeof(header);
data.stop_copy_size = cpu_to_le64(migf->buf->allocated_length); data.stop_copy_size = cpu_to_le64(migf->buf[0]->allocated_length);
memcpy(to_buff + sizeof(header), &data, sizeof(data)); memcpy(to_buff + sizeof(header), &data, sizeof(data));
header_buf->length += sizeof(data); header_buf->length += sizeof(data);
kunmap_local(to_buff); kunmap_local(to_buff);
...@@ -340,48 +343,83 @@ static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf) ...@@ -340,48 +343,83 @@ static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf)
spin_lock_irqsave(&migf->list_lock, flags); spin_lock_irqsave(&migf->list_lock, flags);
list_add_tail(&header_buf->buf_elm, &migf->buf_list); list_add_tail(&header_buf->buf_elm, &migf->buf_list);
spin_unlock_irqrestore(&migf->list_lock, flags); spin_unlock_irqrestore(&migf->list_lock, flags);
migf->pre_copy_initial_bytes = size; if (track)
migf->pre_copy_initial_bytes = size;
return 0; return 0;
err: err:
mlx5vf_put_data_buffer(header_buf); mlx5vf_put_data_buffer(header_buf);
return ret; return ret;
} }
static int mlx5vf_prep_stop_copy(struct mlx5_vf_migration_file *migf, static int mlx5vf_prep_stop_copy(struct mlx5vf_pci_core_device *mvdev,
size_t state_size) struct mlx5_vf_migration_file *migf,
size_t state_size, u64 full_size,
bool track)
{ {
struct mlx5_vhca_data_buffer *buf; struct mlx5_vhca_data_buffer *buf;
size_t inc_state_size; size_t inc_state_size;
int num_chunks;
int ret; int ret;
int i;
/* let's be ready for stop_copy size that might grow by 10 percents */ if (mvdev->chunk_mode) {
if (check_add_overflow(state_size, state_size / 10, &inc_state_size)) size_t chunk_size = min_t(size_t, MAX_CHUNK_SIZE, full_size);
inc_state_size = state_size;
buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE); /* from firmware perspective at least 'state_size' buffer should be set */
if (IS_ERR(buf)) inc_state_size = max(state_size, chunk_size);
return PTR_ERR(buf); } else {
if (track) {
/* let's be ready for stop_copy size that might grow by 10 percents */
if (check_add_overflow(state_size, state_size / 10, &inc_state_size))
inc_state_size = state_size;
} else {
inc_state_size = state_size;
}
}
migf->buf = buf; /* let's not overflow the device specification max SAVE size */
buf = mlx5vf_get_data_buffer(migf, inc_state_size = min_t(size_t, inc_state_size,
sizeof(struct mlx5_vf_migration_header), DMA_NONE); (BIT_ULL(__mlx5_bit_sz(save_vhca_state_in, size)) - PAGE_SIZE));
if (IS_ERR(buf)) {
ret = PTR_ERR(buf); num_chunks = mvdev->chunk_mode ? MAX_NUM_CHUNKS : 1;
goto err; for (i = 0; i < num_chunks; i++) {
buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE);
if (IS_ERR(buf)) {
ret = PTR_ERR(buf);
goto err;
}
migf->buf[i] = buf;
buf = mlx5vf_get_data_buffer(migf,
sizeof(struct mlx5_vf_migration_header), DMA_NONE);
if (IS_ERR(buf)) {
ret = PTR_ERR(buf);
goto err;
}
migf->buf_header[i] = buf;
if (mvdev->chunk_mode) {
migf->buf[i]->stop_copy_chunk_num = i + 1;
migf->buf_header[i]->stop_copy_chunk_num = i + 1;
}
} }
migf->buf_header = buf; ret = mlx5vf_add_stop_copy_header(migf, track);
ret = mlx5vf_add_stop_copy_header(migf);
if (ret) if (ret)
goto err_header; goto err;
return 0; return 0;
err_header:
mlx5vf_put_data_buffer(migf->buf_header);
migf->buf_header = NULL;
err: err:
mlx5vf_put_data_buffer(migf->buf); for (i = 0; i < num_chunks; i++) {
migf->buf = NULL; if (migf->buf[i]) {
mlx5vf_put_data_buffer(migf->buf[i]);
migf->buf[i] = NULL;
}
if (migf->buf_header[i]) {
mlx5vf_put_data_buffer(migf->buf_header[i]);
migf->buf_header[i] = NULL;
}
}
return ret; return ret;
} }
...@@ -511,9 +549,9 @@ static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev) ...@@ -511,9 +549,9 @@ static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
goto err; goto err;
/* Checking whether we have a matching pre-allocated buffer that can fit */ /* Checking whether we have a matching pre-allocated buffer that can fit */
if (migf->buf && migf->buf->allocated_length >= length) { if (migf->buf[0]->allocated_length >= length) {
buf = migf->buf; buf = migf->buf[0];
migf->buf = NULL; migf->buf[0] = NULL;
} else { } else {
buf = mlx5vf_get_data_buffer(migf, length, DMA_FROM_DEVICE); buf = mlx5vf_get_data_buffer(migf, length, DMA_FROM_DEVICE);
if (IS_ERR(buf)) { if (IS_ERR(buf)) {
...@@ -541,6 +579,7 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track) ...@@ -541,6 +579,7 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
struct mlx5_vf_migration_file *migf; struct mlx5_vf_migration_file *migf;
struct mlx5_vhca_data_buffer *buf; struct mlx5_vhca_data_buffer *buf;
size_t length; size_t length;
u64 full_size;
int ret; int ret;
migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT); migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
...@@ -574,20 +613,25 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track) ...@@ -574,20 +613,25 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
INIT_LIST_HEAD(&migf->buf_list); INIT_LIST_HEAD(&migf->buf_list);
INIT_LIST_HEAD(&migf->avail_list); INIT_LIST_HEAD(&migf->avail_list);
spin_lock_init(&migf->list_lock); spin_lock_init(&migf->list_lock);
ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, NULL, 0); ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, &full_size, 0);
if (ret)
goto out_pd;
ret = mlx5vf_prep_stop_copy(mvdev, migf, length, full_size, track);
if (ret) if (ret)
goto out_pd; goto out_pd;
if (track) { if (track) {
ret = mlx5vf_prep_stop_copy(migf, length); /* leave the allocated buffer ready for the stop-copy phase */
if (ret) buf = mlx5vf_alloc_data_buffer(migf,
migf->buf[0]->allocated_length, DMA_FROM_DEVICE);
if (IS_ERR(buf)) {
ret = PTR_ERR(buf);
goto out_pd; goto out_pd;
} }
} else {
buf = mlx5vf_alloc_data_buffer(migf, length, DMA_FROM_DEVICE); buf = migf->buf[0];
if (IS_ERR(buf)) { migf->buf[0] = NULL;
ret = PTR_ERR(buf);
goto out_pd;
} }
ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track); ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track);
...@@ -820,8 +864,8 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, ...@@ -820,8 +864,8 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
size_t len, loff_t *pos) size_t len, loff_t *pos)
{ {
struct mlx5_vf_migration_file *migf = filp->private_data; struct mlx5_vf_migration_file *migf = filp->private_data;
struct mlx5_vhca_data_buffer *vhca_buf = migf->buf; struct mlx5_vhca_data_buffer *vhca_buf = migf->buf[0];
struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header; struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header[0];
loff_t requested_length; loff_t requested_length;
bool has_work = false; bool has_work = false;
ssize_t done = 0; ssize_t done = 0;
...@@ -856,15 +900,15 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, ...@@ -856,15 +900,15 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
if (vhca_buf_header->allocated_length < migf->record_size) { if (vhca_buf_header->allocated_length < migf->record_size) {
mlx5vf_free_data_buffer(vhca_buf_header); mlx5vf_free_data_buffer(vhca_buf_header);
migf->buf_header = mlx5vf_alloc_data_buffer(migf, migf->buf_header[0] = mlx5vf_alloc_data_buffer(migf,
migf->record_size, DMA_NONE); migf->record_size, DMA_NONE);
if (IS_ERR(migf->buf_header)) { if (IS_ERR(migf->buf_header[0])) {
ret = PTR_ERR(migf->buf_header); ret = PTR_ERR(migf->buf_header[0]);
migf->buf_header = NULL; migf->buf_header[0] = NULL;
goto out_unlock; goto out_unlock;
} }
vhca_buf_header = migf->buf_header; vhca_buf_header = migf->buf_header[0];
} }
vhca_buf_header->start_pos = migf->max_pos; vhca_buf_header->start_pos = migf->max_pos;
...@@ -884,15 +928,15 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, ...@@ -884,15 +928,15 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
if (vhca_buf->allocated_length < size) { if (vhca_buf->allocated_length < size) {
mlx5vf_free_data_buffer(vhca_buf); mlx5vf_free_data_buffer(vhca_buf);
migf->buf = mlx5vf_alloc_data_buffer(migf, migf->buf[0] = mlx5vf_alloc_data_buffer(migf,
size, DMA_TO_DEVICE); size, DMA_TO_DEVICE);
if (IS_ERR(migf->buf)) { if (IS_ERR(migf->buf[0])) {
ret = PTR_ERR(migf->buf); ret = PTR_ERR(migf->buf[0]);
migf->buf = NULL; migf->buf[0] = NULL;
goto out_unlock; goto out_unlock;
} }
vhca_buf = migf->buf; vhca_buf = migf->buf[0];
} }
vhca_buf->start_pos = migf->max_pos; vhca_buf->start_pos = migf->max_pos;
...@@ -974,7 +1018,7 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) ...@@ -974,7 +1018,7 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
goto out_pd; goto out_pd;
} }
migf->buf = buf; migf->buf[0] = buf;
if (MLX5VF_PRE_COPY_SUPP(mvdev)) { if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
buf = mlx5vf_alloc_data_buffer(migf, buf = mlx5vf_alloc_data_buffer(migf,
sizeof(struct mlx5_vf_migration_header), DMA_NONE); sizeof(struct mlx5_vf_migration_header), DMA_NONE);
...@@ -983,7 +1027,7 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) ...@@ -983,7 +1027,7 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
goto out_buf; goto out_buf;
} }
migf->buf_header = buf; migf->buf_header[0] = buf;
migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER; migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
} else { } else {
/* Initial state will be to read the image */ /* Initial state will be to read the image */
...@@ -997,7 +1041,7 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) ...@@ -997,7 +1041,7 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
spin_lock_init(&migf->list_lock); spin_lock_init(&migf->list_lock);
return migf; return migf;
out_buf: out_buf:
mlx5vf_free_data_buffer(migf->buf); mlx5vf_free_data_buffer(migf->buf[0]);
out_pd: out_pd:
mlx5vf_cmd_dealloc_pd(migf); mlx5vf_cmd_dealloc_pd(migf);
out_free: out_free:
...@@ -1101,7 +1145,7 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, ...@@ -1101,7 +1145,7 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
if (!MLX5VF_PRE_COPY_SUPP(mvdev)) { if (!MLX5VF_PRE_COPY_SUPP(mvdev)) {
ret = mlx5vf_cmd_load_vhca_state(mvdev, ret = mlx5vf_cmd_load_vhca_state(mvdev,
mvdev->resuming_migf, mvdev->resuming_migf,
mvdev->resuming_migf->buf); mvdev->resuming_migf->buf[0]);
if (ret) if (ret)
return ERR_PTR(ret); return ERR_PTR(ret);
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment