Commit 64145482 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost

Pull virtio updates from Michael Tsirkin:

 - vdpa sim refactoring

 - virtio mem: Big Block Mode support

 - misc cleanups, fixes

* tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost: (61 commits)
  vdpa: Use simpler version of ida allocation
  vdpa: Add missing comment for virtqueue count
  uapi: virtio_ids: add missing device type IDs from OASIS spec
  uapi: virtio_ids.h: consistent indentions
  vhost scsi: fix error return code in vhost_scsi_set_endpoint()
  virtio_ring: Fix two use after free bugs
  virtio_net: Fix error code in probe()
  virtio_ring: Cut and paste bugs in vring_create_virtqueue_packed()
  tools/virtio: add barrier for aarch64
  tools/virtio: add krealloc_array
  tools/virtio: include asm/bug.h
  vdpa/mlx5: Use write memory barrier after updating CQ index
  vdpa: split vdpasim to core and net modules
  vdpa_sim: split vdpasim_virtqueue's iov field in out_iov and in_iov
  vdpa_sim: make vdpasim->buffer size configurable
  vdpa_sim: use kvmalloc to allocate vdpasim->buffer
  vdpa_sim: set vringh notify callback
  vdpa_sim: add set_config callback in vdpasim_dev_attr
  vdpa_sim: add get_config callback in vdpasim_dev_attr
  vdpa_sim: make 'config' generic and usable for any device type
  ...
parents 58cf05f5 418eddef
......@@ -3072,6 +3072,7 @@ static int virtnet_probe(struct virtio_device *vdev)
dev_err(&vdev->dev,
"device MTU appears to have changed it is now %d < %d",
mtu, dev->min_mtu);
err = -EINVAL;
goto free;
}
......
......@@ -9,21 +9,24 @@ menuconfig VDPA
if VDPA
config VDPA_SIM
tristate "vDPA device simulator"
tristate "vDPA device simulator core"
depends on RUNTIME_TESTING_MENU && HAS_DMA
select DMA_OPS
select VHOST_RING
help
Enable this module to support vDPA device simulators. These devices
are used for testing, prototyping and development of vDPA.
config VDPA_SIM_NET
tristate "vDPA simulator for networking device"
depends on VDPA_SIM
select GENERIC_NET_UTILS
default n
help
vDPA networking device simulator which loop TX traffic back
to RX. This device is used for testing, prototyping and
development of vDPA.
vDPA networking device simulator which loops TX traffic back to RX.
config IFCVF
tristate "Intel IFC VF vDPA driver"
depends on PCI_MSI
default n
help
This kernel module can drive Intel IFC VF NIC to offload
virtio dataplane traffic to hardware.
......@@ -42,7 +45,6 @@ config MLX5_VDPA_NET
tristate "vDPA driver for ConnectX devices"
select MLX5_VDPA
depends on MLX5_CORE
default n
help
VDPA network driver for ConnectX6 and newer. Provides offloading
of virtio net datapath such that descriptors put on the ring will
......
......@@ -417,16 +417,9 @@ static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id)
return ret;
}
ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64));
if (ret) {
IFCVF_ERR(pdev, "No usable DMA confiugration\n");
return ret;
}
ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
if (ret) {
IFCVF_ERR(pdev,
"No usable coherent DMA confiugration\n");
IFCVF_ERR(pdev, "No usable DMA configuration\n");
return ret;
}
......
......@@ -479,6 +479,11 @@ static int mlx5_vdpa_poll_one(struct mlx5_vdpa_cq *vcq)
static void mlx5_vdpa_handle_completions(struct mlx5_vdpa_virtqueue *mvq, int num)
{
mlx5_cq_set_ci(&mvq->cq.mcq);
/* make sure CQ consumer update is visible to the hardware before updating
* RX doorbell record.
*/
dma_wmb();
rx_post(&mvq->vqqp, num);
if (mvq->event_cb.callback)
mvq->event_cb.callback(mvq->event_cb.private);
......
......@@ -89,7 +89,7 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent,
if (!vdev)
goto err;
err = ida_simple_get(&vdpa_index_ida, 0, 0, GFP_KERNEL);
err = ida_alloc(&vdpa_index_ida, GFP_KERNEL);
if (err < 0)
goto err_ida;
......
# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_VDPA_SIM) += vdpa_sim.o
obj-$(CONFIG_VDPA_SIM_NET) += vdpa_sim_net.o
// SPDX-License-Identifier: GPL-2.0-only
/*
* VDPA networking device simulator.
* VDPA device simulator core.
*
* Copyright (c) 2020, Red Hat Inc. All rights reserved.
* Author: Jason Wang <jasowang@redhat.com>
......@@ -11,97 +11,32 @@
#include <linux/module.h>
#include <linux/device.h>
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/uuid.h>
#include <linux/iommu.h>
#include <linux/dma-map-ops.h>
#include <linux/sysfs.h>
#include <linux/file.h>
#include <linux/etherdevice.h>
#include <linux/vringh.h>
#include <linux/vdpa.h>
#include <linux/virtio_byteorder.h>
#include <linux/vhost_iotlb.h>
#include <uapi/linux/virtio_config.h>
#include <uapi/linux/virtio_net.h>
#include "vdpa_sim.h"
#define DRV_VERSION "0.1"
#define DRV_AUTHOR "Jason Wang <jasowang@redhat.com>"
#define DRV_DESC "vDPA Device Simulator"
#define DRV_DESC "vDPA Device Simulator core"
#define DRV_LICENSE "GPL v2"
static int batch_mapping = 1;
module_param(batch_mapping, int, 0444);
MODULE_PARM_DESC(batch_mapping, "Batched mapping 1 -Enable; 0 - Disable");
static char *macaddr;
module_param(macaddr, charp, 0);
MODULE_PARM_DESC(macaddr, "Ethernet MAC address");
struct vdpasim_virtqueue {
struct vringh vring;
struct vringh_kiov iov;
unsigned short head;
bool ready;
u64 desc_addr;
u64 device_addr;
u64 driver_addr;
u32 num;
void *private;
irqreturn_t (*cb)(void *data);
};
static int max_iotlb_entries = 2048;
module_param(max_iotlb_entries, int, 0444);
MODULE_PARM_DESC(max_iotlb_entries,
"Maximum number of iotlb entries. 0 means unlimited. (default: 2048)");
#define VDPASIM_QUEUE_ALIGN PAGE_SIZE
#define VDPASIM_QUEUE_MAX 256
#define VDPASIM_DEVICE_ID 0x1
#define VDPASIM_VENDOR_ID 0
#define VDPASIM_VQ_NUM 0x2
#define VDPASIM_NAME "vdpasim-netdev"
static u64 vdpasim_features = (1ULL << VIRTIO_F_ANY_LAYOUT) |
(1ULL << VIRTIO_F_VERSION_1) |
(1ULL << VIRTIO_F_ACCESS_PLATFORM) |
(1ULL << VIRTIO_NET_F_MAC);
/* State of each vdpasim device */
struct vdpasim {
struct vdpa_device vdpa;
struct vdpasim_virtqueue vqs[VDPASIM_VQ_NUM];
struct work_struct work;
/* spinlock to synchronize virtqueue state */
spinlock_t lock;
struct virtio_net_config config;
struct vhost_iotlb *iommu;
void *buffer;
u32 status;
u32 generation;
u64 features;
/* spinlock to synchronize iommu table */
spinlock_t iommu_lock;
};
/* TODO: cross-endian support */
static inline bool vdpasim_is_little_endian(struct vdpasim *vdpasim)
{
return virtio_legacy_is_little_endian() ||
(vdpasim->features & (1ULL << VIRTIO_F_VERSION_1));
}
static inline u16 vdpasim16_to_cpu(struct vdpasim *vdpasim, __virtio16 val)
{
return __virtio16_to_cpu(vdpasim_is_little_endian(vdpasim), val);
}
static inline __virtio16 cpu_to_vdpasim16(struct vdpasim *vdpasim, u16 val)
{
return __cpu_to_virtio16(vdpasim_is_little_endian(vdpasim), val);
}
static struct vdpasim *vdpasim_dev;
static struct vdpasim *vdpa_to_sim(struct vdpa_device *vdpa)
{
......@@ -115,20 +50,34 @@ static struct vdpasim *dev_to_sim(struct device *dev)
return vdpa_to_sim(vdpa);
}
/*
 * vringh notify hook: forward the notification to the callback registered
 * on the virtqueue that embeds @vring. Does nothing when no callback is set.
 */
static void vdpasim_vq_notify(struct vringh *vring)
{
	struct vdpasim_virtqueue *vq;

	vq = container_of(vring, struct vdpasim_virtqueue, vring);
	if (vq->cb)
		vq->cb(vq->private);
}
static void vdpasim_queue_ready(struct vdpasim *vdpasim, unsigned int idx)
{
struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx];
vringh_init_iotlb(&vq->vring, vdpasim_features,
vringh_init_iotlb(&vq->vring, vdpasim->dev_attr.supported_features,
VDPASIM_QUEUE_MAX, false,
(struct vring_desc *)(uintptr_t)vq->desc_addr,
(struct vring_avail *)
(uintptr_t)vq->driver_addr,
(struct vring_used *)
(uintptr_t)vq->device_addr);
vq->vring.notify = vdpasim_vq_notify;
}
static void vdpasim_vq_reset(struct vdpasim_virtqueue *vq)
static void vdpasim_vq_reset(struct vdpasim *vdpasim,
struct vdpasim_virtqueue *vq)
{
vq->ready = false;
vq->desc_addr = 0;
......@@ -136,16 +85,18 @@ static void vdpasim_vq_reset(struct vdpasim_virtqueue *vq)
vq->device_addr = 0;
vq->cb = NULL;
vq->private = NULL;
vringh_init_iotlb(&vq->vring, vdpasim_features, VDPASIM_QUEUE_MAX,
false, NULL, NULL, NULL);
vringh_init_iotlb(&vq->vring, vdpasim->dev_attr.supported_features,
VDPASIM_QUEUE_MAX, false, NULL, NULL, NULL);
vq->vring.notify = NULL;
}
static void vdpasim_reset(struct vdpasim *vdpasim)
{
int i;
for (i = 0; i < VDPASIM_VQ_NUM; i++)
vdpasim_vq_reset(&vdpasim->vqs[i]);
for (i = 0; i < vdpasim->dev_attr.nvqs; i++)
vdpasim_vq_reset(vdpasim, &vdpasim->vqs[i]);
spin_lock(&vdpasim->iommu_lock);
vhost_iotlb_reset(vdpasim->iommu);
......@@ -156,80 +107,6 @@ static void vdpasim_reset(struct vdpasim *vdpasim)
++vdpasim->generation;
}
static void vdpasim_work(struct work_struct *work)
{
struct vdpasim *vdpasim = container_of(work, struct
vdpasim, work);
struct vdpasim_virtqueue *txq = &vdpasim->vqs[1];
struct vdpasim_virtqueue *rxq = &vdpasim->vqs[0];
ssize_t read, write;
size_t total_write;
int pkts = 0;
int err;
spin_lock(&vdpasim->lock);
if (!(vdpasim->status & VIRTIO_CONFIG_S_DRIVER_OK))
goto out;
if (!txq->ready || !rxq->ready)
goto out;
while (true) {
total_write = 0;
err = vringh_getdesc_iotlb(&txq->vring, &txq->iov, NULL,
&txq->head, GFP_ATOMIC);
if (err <= 0)
break;
err = vringh_getdesc_iotlb(&rxq->vring, NULL, &rxq->iov,
&rxq->head, GFP_ATOMIC);
if (err <= 0) {
vringh_complete_iotlb(&txq->vring, txq->head, 0);
break;
}
while (true) {
read = vringh_iov_pull_iotlb(&txq->vring, &txq->iov,
vdpasim->buffer,
PAGE_SIZE);
if (read <= 0)
break;
write = vringh_iov_push_iotlb(&rxq->vring, &rxq->iov,
vdpasim->buffer, read);
if (write <= 0)
break;
total_write += write;
}
/* Make sure data is wrote before advancing index */
smp_wmb();
vringh_complete_iotlb(&txq->vring, txq->head, 0);
vringh_complete_iotlb(&rxq->vring, rxq->head, total_write);
/* Make sure used is visible before rasing the interrupt. */
smp_wmb();
local_bh_disable();
if (txq->cb)
txq->cb(txq->private);
if (rxq->cb)
rxq->cb(rxq->private);
local_bh_enable();
if (++pkts > 4) {
schedule_work(&vdpasim->work);
goto out;
}
}
out:
spin_unlock(&vdpasim->lock);
}
static int dir_to_perm(enum dma_data_direction dir)
{
int perm = -EFAULT;
......@@ -342,26 +219,28 @@ static const struct dma_map_ops vdpasim_dma_ops = {
.free = vdpasim_free_coherent,
};
static const struct vdpa_config_ops vdpasim_net_config_ops;
static const struct vdpa_config_ops vdpasim_net_batch_config_ops;
static const struct vdpa_config_ops vdpasim_config_ops;
static const struct vdpa_config_ops vdpasim_batch_config_ops;
static struct vdpasim *vdpasim_create(void)
struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr)
{
const struct vdpa_config_ops *ops;
struct vdpasim *vdpasim;
struct device *dev;
int ret = -ENOMEM;
int i, ret = -ENOMEM;
if (batch_mapping)
ops = &vdpasim_net_batch_config_ops;
ops = &vdpasim_batch_config_ops;
else
ops = &vdpasim_net_config_ops;
ops = &vdpasim_config_ops;
vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops, VDPASIM_VQ_NUM);
vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops,
dev_attr->nvqs);
if (!vdpasim)
goto err_alloc;
INIT_WORK(&vdpasim->work, vdpasim_work);
vdpasim->dev_attr = *dev_attr;
INIT_WORK(&vdpasim->work, dev_attr->work_fn);
spin_lock_init(&vdpasim->lock);
spin_lock_init(&vdpasim->iommu_lock);
......@@ -371,31 +250,27 @@ static struct vdpasim *vdpasim_create(void)
goto err_iommu;
set_dma_ops(dev, &vdpasim_dma_ops);
vdpasim->iommu = vhost_iotlb_alloc(2048, 0);
if (!vdpasim->iommu)
vdpasim->config = kzalloc(dev_attr->config_size, GFP_KERNEL);
if (!vdpasim->config)
goto err_iommu;
vdpasim->buffer = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!vdpasim->buffer)
vdpasim->vqs = kcalloc(dev_attr->nvqs, sizeof(struct vdpasim_virtqueue),
GFP_KERNEL);
if (!vdpasim->vqs)
goto err_iommu;
if (macaddr) {
mac_pton(macaddr, vdpasim->config.mac);
if (!is_valid_ether_addr(vdpasim->config.mac)) {
ret = -EADDRNOTAVAIL;
vdpasim->iommu = vhost_iotlb_alloc(max_iotlb_entries, 0);
if (!vdpasim->iommu)
goto err_iommu;
vdpasim->buffer = kvmalloc(dev_attr->buffer_size, GFP_KERNEL);
if (!vdpasim->buffer)
goto err_iommu;
}
} else {
eth_random_addr(vdpasim->config.mac);
}
vringh_set_iotlb(&vdpasim->vqs[0].vring, vdpasim->iommu);
vringh_set_iotlb(&vdpasim->vqs[1].vring, vdpasim->iommu);
for (i = 0; i < dev_attr->nvqs; i++)
vringh_set_iotlb(&vdpasim->vqs[i].vring, vdpasim->iommu);
vdpasim->vdpa.dma_dev = dev;
ret = vdpa_register_device(&vdpasim->vdpa);
if (ret)
goto err_iommu;
return vdpasim;
......@@ -404,6 +279,7 @@ static struct vdpasim *vdpasim_create(void)
err_alloc:
return ERR_PTR(ret);
}
EXPORT_SYMBOL_GPL(vdpasim_create);
static int vdpasim_set_vq_address(struct vdpa_device *vdpa, u16 idx,
u64 desc_area, u64 driver_area,
......@@ -498,28 +374,21 @@ static u32 vdpasim_get_vq_align(struct vdpa_device *vdpa)
static u64 vdpasim_get_features(struct vdpa_device *vdpa)
{
return vdpasim_features;
struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
return vdpasim->dev_attr.supported_features;
}
static int vdpasim_set_features(struct vdpa_device *vdpa, u64 features)
{
struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
struct virtio_net_config *config = &vdpasim->config;
/* DMA mapping must be done by driver */
if (!(features & (1ULL << VIRTIO_F_ACCESS_PLATFORM)))
return -EINVAL;
vdpasim->features = features & vdpasim_features;
vdpasim->features = features & vdpasim->dev_attr.supported_features;
/* We generally only know whether guest is using the legacy interface
* here, so generally that's the earliest we can set config fields.
* Note: We actually require VIRTIO_F_ACCESS_PLATFORM above which
* implies VIRTIO_F_VERSION_1, but let's not try to be clever here.
*/
config->mtu = cpu_to_vdpasim16(vdpasim, 1500);
config->status = cpu_to_vdpasim16(vdpasim, VIRTIO_NET_S_LINK_UP);
return 0;
}
......@@ -536,7 +405,9 @@ static u16 vdpasim_get_vq_num_max(struct vdpa_device *vdpa)
static u32 vdpasim_get_device_id(struct vdpa_device *vdpa)
{
return VDPASIM_DEVICE_ID;
struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
return vdpasim->dev_attr.id;
}
static u32 vdpasim_get_vendor_id(struct vdpa_device *vdpa)
......@@ -572,14 +443,27 @@ static void vdpasim_get_config(struct vdpa_device *vdpa, unsigned int offset,
{
struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
if (offset + len < sizeof(struct virtio_net_config))
memcpy(buf, (u8 *)&vdpasim->config + offset, len);
if (offset + len > vdpasim->dev_attr.config_size)
return;
if (vdpasim->dev_attr.get_config)
vdpasim->dev_attr.get_config(vdpasim, vdpasim->config);
memcpy(buf, vdpasim->config + offset, len);
}
static void vdpasim_set_config(struct vdpa_device *vdpa, unsigned int offset,
const void *buf, unsigned int len)
{
/* No writable config supportted by vdpasim */
struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
if (offset + len > vdpasim->dev_attr.config_size)
return;
memcpy(vdpasim->config + offset, buf, len);
if (vdpasim->dev_attr.set_config)
vdpasim->dev_attr.set_config(vdpasim, vdpasim->config);
}
static u32 vdpasim_get_generation(struct vdpa_device *vdpa)
......@@ -656,12 +540,14 @@ static void vdpasim_free(struct vdpa_device *vdpa)
struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
cancel_work_sync(&vdpasim->work);
kfree(vdpasim->buffer);
kvfree(vdpasim->buffer);
if (vdpasim->iommu)
vhost_iotlb_free(vdpasim->iommu);
kfree(vdpasim->vqs);
kfree(vdpasim->config);
}
static const struct vdpa_config_ops vdpasim_net_config_ops = {
static const struct vdpa_config_ops vdpasim_config_ops = {
.set_vq_address = vdpasim_set_vq_address,
.set_vq_num = vdpasim_set_vq_num,
.kick_vq = vdpasim_kick_vq,
......@@ -688,7 +574,7 @@ static const struct vdpa_config_ops vdpasim_net_config_ops = {
.free = vdpasim_free,
};
static const struct vdpa_config_ops vdpasim_net_batch_config_ops = {
static const struct vdpa_config_ops vdpasim_batch_config_ops = {
.set_vq_address = vdpasim_set_vq_address,
.set_vq_num = vdpasim_set_vq_num,
.kick_vq = vdpasim_kick_vq,
......@@ -714,26 +600,6 @@ static const struct vdpa_config_ops vdpasim_net_batch_config_ops = {
.free = vdpasim_free,
};
static int __init vdpasim_dev_init(void)
{
vdpasim_dev = vdpasim_create();
if (!IS_ERR(vdpasim_dev))
return 0;
return PTR_ERR(vdpasim_dev);
}
static void __exit vdpasim_dev_exit(void)
{
struct vdpa_device *vdpa = &vdpasim_dev->vdpa;
vdpa_unregister_device(vdpa);
}
module_init(vdpasim_dev_init)
module_exit(vdpasim_dev_exit)
MODULE_VERSION(DRV_VERSION);
MODULE_LICENSE(DRV_LICENSE);
MODULE_AUTHOR(DRV_AUTHOR);
......
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2020, Red Hat Inc. All rights reserved.
*/
#ifndef _VDPA_SIM_H
#define _VDPA_SIM_H
#include <linux/vringh.h>
#include <linux/vdpa.h>
#include <linux/virtio_byteorder.h>
#include <linux/vhost_iotlb.h>
#include <uapi/linux/virtio_config.h>
#define VDPASIM_FEATURES ((1ULL << VIRTIO_F_ANY_LAYOUT) | \
(1ULL << VIRTIO_F_VERSION_1) | \
(1ULL << VIRTIO_F_ACCESS_PLATFORM))
struct vdpasim;
/*
 * Per-virtqueue state of a simulated vDPA device.
 */
struct vdpasim_virtqueue {
/* vringh instance used to access the ring through the IOTLB */
struct vringh vring;
/* iov the device writes into (pushed with vringh_iov_push_iotlb()) */
struct vringh_kiov in_iov;
/* iov the device reads from (pulled with vringh_iov_pull_iotlb()) */
struct vringh_kiov out_iov;
/* descriptor head from vringh_getdesc_iotlb(), handed back on completion */
unsigned short head;
/* cleared on reset; the work function only runs on ready queues */
bool ready;
/* ring addresses; cast to desc/used/avail ring pointers when the queue is readied */
u64 desc_addr;
u64 device_addr;
u64 driver_addr;
/* queue size - presumably set through set_vq_num(); TODO confirm */
u32 num;
/* argument passed to @cb */
void *private;
/* callback invoked by vdpasim_vq_notify(); may be NULL */
irqreturn_t (*cb)(void *data);
};
/*
 * Device-type specific attributes handed to vdpasim_create(). The core
 * keeps its own copy in struct vdpasim.
 */
struct vdpasim_dev_attr {
/* feature bits offered by the device; negotiated features are masked by this */
u64 supported_features;
/* size in bytes of the device config space allocated by the core */
size_t config_size;
/* size in bytes of the data buffer allocated by the core */
size_t buffer_size;
/* number of virtqueues to allocate */
int nvqs;
/* virtio device id reported by get_device_id() */
u32 id;

/* work function installed on vdpasim->work */
work_func_t work_fn;
/* optional: called to refresh @config before the core copies it out */
void (*get_config)(struct vdpasim *vdpasim, void *config);
/* optional: called after the core copied new data into @config */
void (*set_config)(struct vdpasim *vdpasim, const void *config);
};
/*
 * State of each vdpasim device.
 *
 * Allocated and initialized by vdpasim_create() from a struct
 * vdpasim_dev_attr supplied by the device-type module.
 */
struct vdpasim {
struct vdpa_device vdpa;
/* array of dev_attr.nvqs virtqueues */
struct vdpasim_virtqueue *vqs;
/* runs dev_attr.work_fn */
struct work_struct work;
/* copy of the attributes passed to vdpasim_create() */
struct vdpasim_dev_attr dev_attr;
/* spinlock to synchronize virtqueue state */
spinlock_t lock;
/* virtio config according to device type */
void *config;
struct vhost_iotlb *iommu;
/* data buffer of dev_attr.buffer_size bytes, used by the work function */
void *buffer;
/* virtio device status bits (e.g. VIRTIO_CONFIG_S_DRIVER_OK) */
u32 status;
/* incremented on device reset */
u32 generation;
/* negotiated features: driver features masked by dev_attr.supported_features */
u64 features;
/* spinlock to synchronize iommu table */
spinlock_t iommu_lock;
};
struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *attr);
/* TODO: cross-endian support */
static inline bool vdpasim_is_little_endian(struct vdpasim *vdpasim)
{
return virtio_legacy_is_little_endian() ||
(vdpasim->features & (1ULL << VIRTIO_F_VERSION_1));
}
/*
 * Byte-order conversion helpers for 16/32/64-bit virtio fields. Each one
 * converts between CPU order and the device's virtio order, where the
 * device endianness is taken from vdpasim_is_little_endian().
 */
static inline u16 vdpasim16_to_cpu(struct vdpasim *vdpasim, __virtio16 val)
{
return __virtio16_to_cpu(vdpasim_is_little_endian(vdpasim), val);
}
static inline __virtio16 cpu_to_vdpasim16(struct vdpasim *vdpasim, u16 val)
{
return __cpu_to_virtio16(vdpasim_is_little_endian(vdpasim), val);
}
static inline u32 vdpasim32_to_cpu(struct vdpasim *vdpasim, __virtio32 val)
{
return __virtio32_to_cpu(vdpasim_is_little_endian(vdpasim), val);
}
static inline __virtio32 cpu_to_vdpasim32(struct vdpasim *vdpasim, u32 val)
{
return __cpu_to_virtio32(vdpasim_is_little_endian(vdpasim), val);
}
static inline u64 vdpasim64_to_cpu(struct vdpasim *vdpasim, __virtio64 val)
{
return __virtio64_to_cpu(vdpasim_is_little_endian(vdpasim), val);
}
static inline __virtio64 cpu_to_vdpasim64(struct vdpasim *vdpasim, u64 val)
{
return __cpu_to_virtio64(vdpasim_is_little_endian(vdpasim), val);
}
#endif
// SPDX-License-Identifier: GPL-2.0-only
/*
* VDPA simulator for networking device.
*
* Copyright (c) 2020, Red Hat Inc. All rights reserved.
* Author: Jason Wang <jasowang@redhat.com>
*
*/
#include <linux/init.h>
#include <linux/module.h>
#include <linux/device.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/etherdevice.h>
#include <linux/vringh.h>
#include <linux/vdpa.h>
#include <uapi/linux/virtio_net.h>
#include "vdpa_sim.h"
#define DRV_VERSION "0.1"
#define DRV_AUTHOR "Jason Wang <jasowang@redhat.com>"
#define DRV_DESC "vDPA Device Simulator for networking device"
#define DRV_LICENSE "GPL v2"
#define VDPASIM_NET_FEATURES (VDPASIM_FEATURES | \
(1ULL << VIRTIO_NET_F_MAC))
#define VDPASIM_NET_VQ_NUM 2
/* Optional MAC address given on the module command line. */
static char *macaddr;
module_param(macaddr, charp, 0);
MODULE_PARM_DESC(macaddr, "Ethernet MAC address");

/*
 * Binary MAC address reported in the device config space. Only used inside
 * this file, so keep it out of the global kernel symbol namespace.
 */
static u8 macaddr_buf[ETH_ALEN];

/* The single simulated networking device created at module init. */
static struct vdpasim *vdpasim_net_dev;
/*
 * Work function of the networking simulator: loop TX traffic back to RX.
 *
 * With vdpasim->lock held, and only once the driver set DRIVER_OK and both
 * queues are ready, each TX descriptor (vqs[1]) is copied through
 * vdpasim->buffer into an RX descriptor (vqs[0]). Both descriptors are then
 * completed and the queues notified if needed. After more than four packets
 * the work reschedules itself instead of continuing to loop.
 */
static void vdpasim_net_work(struct work_struct *work)
{
	struct vdpasim *vdpasim = container_of(work, struct vdpasim, work);
	struct vdpasim_virtqueue *txq = &vdpasim->vqs[1];
	struct vdpasim_virtqueue *rxq = &vdpasim->vqs[0];
	ssize_t read, write;
	size_t total_write;
	int pkts = 0;
	int err;

	spin_lock(&vdpasim->lock);

	if (!(vdpasim->status & VIRTIO_CONFIG_S_DRIVER_OK))
		goto out;

	if (!txq->ready || !rxq->ready)
		goto out;

	while (true) {
		total_write = 0;
		err = vringh_getdesc_iotlb(&txq->vring, &txq->out_iov, NULL,
					   &txq->head, GFP_ATOMIC);
		if (err <= 0)
			break;

		err = vringh_getdesc_iotlb(&rxq->vring, NULL, &rxq->in_iov,
					   &rxq->head, GFP_ATOMIC);
		if (err <= 0) {
			/* No RX descriptor: complete TX without copying. */
			vringh_complete_iotlb(&txq->vring, txq->head, 0);
			break;
		}

		while (true) {
			/*
			 * Never pull more than the bounce buffer can hold;
			 * its size is set by the device attributes, not
			 * necessarily PAGE_SIZE.
			 */
			read = vringh_iov_pull_iotlb(&txq->vring, &txq->out_iov,
						     vdpasim->buffer,
						     vdpasim->dev_attr.buffer_size);
			if (read <= 0)
				break;

			write = vringh_iov_push_iotlb(&rxq->vring, &rxq->in_iov,
						      vdpasim->buffer, read);
			if (write <= 0)
				break;

			total_write += write;
		}

		/* Make sure data is written before advancing index */
		smp_wmb();

		vringh_complete_iotlb(&txq->vring, txq->head, 0);
		vringh_complete_iotlb(&rxq->vring, rxq->head, total_write);

		/* Make sure used is visible before raising the interrupt. */
		smp_wmb();

		local_bh_disable();
		if (vringh_need_notify_iotlb(&txq->vring) > 0)
			vringh_notify(&txq->vring);
		if (vringh_need_notify_iotlb(&rxq->vring) > 0)
			vringh_notify(&rxq->vring);
		local_bh_enable();

		/* Yield after a small batch and continue in a new work run. */
		if (++pkts > 4) {
			schedule_work(&vdpasim->work);
			goto out;
		}
	}

out:
	spin_unlock(&vdpasim->lock);
}
/*
 * Fill the virtio-net config space pointed to by @config: MTU 1500, link
 * up, and the MAC address stored in macaddr_buf. The 16-bit fields are
 * written in the device's byte order.
 */
static void vdpasim_net_get_config(struct vdpasim *vdpasim, void *config)
{
	struct virtio_net_config *cfg = config;

	cfg->mtu = cpu_to_vdpasim16(vdpasim, 1500);
	cfg->status = cpu_to_vdpasim16(vdpasim, VIRTIO_NET_S_LINK_UP);
	memcpy(cfg->mac, macaddr_buf, ETH_ALEN);
}
/*
 * Module init: pick the MAC address, create the simulated networking
 * device and register it on the vDPA bus.
 *
 * A MAC given through the "macaddr" parameter is parsed and then validated
 * with is_valid_ether_addr(); an invalid address fails with -EADDRNOTAVAIL.
 * Without the parameter a random address is used.
 *
 * Returns 0 on success or a negative errno.
 */
static int __init vdpasim_net_init(void)
{
	struct vdpasim_dev_attr dev_attr = {
		.id = VIRTIO_ID_NET,
		.supported_features = VDPASIM_NET_FEATURES,
		.nvqs = VDPASIM_NET_VQ_NUM,
		.config_size = sizeof(struct virtio_net_config),
		.get_config = vdpasim_net_get_config,
		.work_fn = vdpasim_net_work,
		.buffer_size = PAGE_SIZE,
	};
	int ret;

	if (!macaddr) {
		eth_random_addr(macaddr_buf);
	} else {
		mac_pton(macaddr, macaddr_buf);
		if (!is_valid_ether_addr(macaddr_buf))
			return -EADDRNOTAVAIL;
	}

	vdpasim_net_dev = vdpasim_create(&dev_attr);
	if (IS_ERR(vdpasim_net_dev))
		return PTR_ERR(vdpasim_net_dev);

	ret = vdpa_register_device(&vdpasim_net_dev->vdpa);
	if (ret) {
		/* Registration failed: drop the device reference. */
		put_device(&vdpasim_net_dev->vdpa.dev);
		return ret;
	}

	return 0;
}
/* Module exit: unregister the simulated networking device. */
static void __exit vdpasim_net_exit(void)
{
	vdpa_unregister_device(&vdpasim_net_dev->vdpa);
}
module_init(vdpasim_net_init);
module_exit(vdpasim_net_exit);
MODULE_VERSION(DRV_VERSION);
MODULE_LICENSE(DRV_LICENSE);
MODULE_AUTHOR(DRV_AUTHOR);
MODULE_DESCRIPTION(DRV_DESC);
......@@ -1643,7 +1643,8 @@ vhost_scsi_set_endpoint(struct vhost_scsi *vs,
if (!vhost_vq_is_setup(vq))
continue;
if (vhost_scsi_setup_vq_cmds(vq, vq->num))
ret = vhost_scsi_setup_vq_cmds(vq, vq->num);
if (ret)
goto destroy_vq_cmds;
}
......
......@@ -245,14 +245,10 @@ static long vhost_vdpa_set_config(struct vhost_vdpa *v,
return -EFAULT;
if (vhost_vdpa_config_validate(v, &config))
return -EINVAL;
buf = kvzalloc(config.len, GFP_KERNEL);
if (!buf)
return -ENOMEM;
if (copy_from_user(buf, c->buf, config.len)) {
kvfree(buf);
return -EFAULT;
}
buf = vmemdup_user(c->buf, config.len);
if (IS_ERR(buf))
return PTR_ERR(buf);
ops->set_config(vdpa, config.off, buf, config.len);
......
......@@ -27,20 +27,74 @@ static bool unplug_online = true;
module_param(unplug_online, bool, 0644);
MODULE_PARM_DESC(unplug_online, "Try to unplug online memory");
enum virtio_mem_mb_state {
static bool force_bbm;
module_param(force_bbm, bool, 0444);
MODULE_PARM_DESC(force_bbm,
"Force Big Block Mode. Default is 0 (auto-selection)");
static unsigned long bbm_block_size;
module_param(bbm_block_size, ulong, 0444);
MODULE_PARM_DESC(bbm_block_size,
"Big Block size in bytes. Default is 0 (auto-detection).");
static bool bbm_safe_unplug = true;
module_param(bbm_safe_unplug, bool, 0444);
MODULE_PARM_DESC(bbm_safe_unplug,
"Use a safe unplug mechanism in BBM, avoiding long/endless loops");
/*
* virtio-mem currently supports the following modes of operation:
*
* * Sub Block Mode (SBM): A Linux memory block spans 2..X subblocks (SB). The
* size of a Sub Block (SB) is determined based on the device block size, the
* pageblock size, and the maximum allocation granularity of the buddy.
* Subblocks within a Linux memory block might either be plugged or unplugged.
* Memory is added/removed to Linux MM in Linux memory block granularity.
*
* * Big Block Mode (BBM): A Big Block (BB) spans 1..X Linux memory blocks.
* Memory is added/removed to Linux MM in Big Block granularity.
*
* The mode is determined automatically based on the Linux memory block size
* and the device block size.
*
* User space / core MM (auto onlining) is responsible for onlining added
* Linux memory blocks - and for selecting a zone. Linux Memory Blocks are
* always onlined separately, and all memory within a Linux memory block is
* onlined to the same zone - virtio-mem relies on this behavior.
*/
/*
* State of a Linux memory block in SBM.
*/
enum virtio_mem_sbm_mb_state {
/* Unplugged, not added to Linux. Can be reused later. */
VIRTIO_MEM_MB_STATE_UNUSED = 0,
VIRTIO_MEM_SBM_MB_UNUSED = 0,
/* (Partially) plugged, not added to Linux. Error on add_memory(). */
VIRTIO_MEM_MB_STATE_PLUGGED,
VIRTIO_MEM_SBM_MB_PLUGGED,
/* Fully plugged, fully added to Linux, offline. */
VIRTIO_MEM_MB_STATE_OFFLINE,
VIRTIO_MEM_SBM_MB_OFFLINE,
/* Partially plugged, fully added to Linux, offline. */
VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL,
VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL,
/* Fully plugged, fully added to Linux, online. */
VIRTIO_MEM_MB_STATE_ONLINE,
VIRTIO_MEM_SBM_MB_ONLINE,
/* Partially plugged, fully added to Linux, online. */
VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL,
VIRTIO_MEM_MB_STATE_COUNT
VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL,
VIRTIO_MEM_SBM_MB_COUNT
};
/*
* State of a Big Block (BB) in BBM, covering 1..X Linux memory blocks.
*/
enum virtio_mem_bbm_bb_state {
/* Unplugged, not added to Linux. Can be reused later. */
VIRTIO_MEM_BBM_BB_UNUSED = 0,
/* Plugged, not added to Linux. Error on add_memory(). */
VIRTIO_MEM_BBM_BB_PLUGGED,
/* Plugged and added to Linux. */
VIRTIO_MEM_BBM_BB_ADDED,
/* All online parts are fake-offline, ready to remove. */
VIRTIO_MEM_BBM_BB_FAKE_OFFLINE,
VIRTIO_MEM_BBM_BB_COUNT
};
struct virtio_mem {
......@@ -51,6 +105,7 @@ struct virtio_mem {
/* Workqueue that processes the plug/unplug requests. */
struct work_struct wq;
atomic_t wq_active;
atomic_t config_changed;
/* Virtqueue for guest->host requests. */
......@@ -70,60 +125,94 @@ struct virtio_mem {
/* The device block size (for communicating with the device). */
uint64_t device_block_size;
/* The translated node id. NUMA_NO_NODE in case not specified. */
/* The determined node id for all memory of the device. */
int nid;
/* Physical start address of the memory region. */
uint64_t addr;
/* Maximum region size in bytes. */
uint64_t region_size;
/* The subblock size. */
uint64_t subblock_size;
/* The number of subblocks per memory block. */
uint32_t nb_sb_per_mb;
/* The parent resource for all memory added via this device. */
struct resource *parent_resource;
/*
* Copy of "System RAM (virtio_mem)" to be used for
* add_memory_driver_managed().
*/
const char *resource_name;
/*
* We don't want to add too much memory if it's not getting onlined,
* to avoid running OOM. Besides this threshold, we allow to have at
* least two offline blocks at a time (whatever is bigger).
*/
#define VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD (1024 * 1024 * 1024)
atomic64_t offline_size;
uint64_t offline_threshold;
/* If set, the driver is in SBM, otherwise in BBM. */
bool in_sbm;
union {
struct {
/* Id of the first memory block of this device. */
unsigned long first_mb_id;
/* Id of the last memory block of this device. */
unsigned long last_mb_id;
/* Id of the last usable memory block of this device. */
unsigned long last_usable_mb_id;
/* Id of the next memory block to prepare when needed. */
unsigned long next_mb_id;
/* The parent resource for all memory added via this device. */
struct resource *parent_resource;
/*
* Copy of "System RAM (virtio_mem)" to be used for
* add_memory_driver_managed().
*/
const char *resource_name;
/* The subblock size. */
uint64_t sb_size;
/* The number of subblocks per Linux memory block. */
uint32_t sbs_per_mb;
/* Summary of all memory block states. */
unsigned long nb_mb_state[VIRTIO_MEM_MB_STATE_COUNT];
#define VIRTIO_MEM_NB_OFFLINE_THRESHOLD 10
unsigned long mb_count[VIRTIO_MEM_SBM_MB_COUNT];
/*
* One byte state per memory block.
*
* Allocated via vmalloc(). When preparing new blocks, resized
* (alloc+copy+free) when needed (crossing pages with the next mb).
* (when crossing pages).
* One byte state per memory block. Allocated via
* vmalloc(). Resized (alloc+copy+free) on demand.
*
* With 128MB memory blocks, we have states for 512GB of memory in one
* page.
* With 128 MiB memory blocks, we have states for 512
* GiB of memory in one 4 KiB page.
*/
uint8_t *mb_state;
uint8_t *mb_states;
/*
* $nb_sb_per_mb bit per memory block. Handled similar to mb_state.
* Bitmap: one bit per subblock. Allocated similar to
* sbm.mb_states.
*
* A set bit means the corresponding subblock is
* plugged, otherwise it's unplugged.
*
* With 4MB subblocks, we manage 128GB of memory in one page.
* With 4 MiB subblocks, we manage 128 GiB of memory
* in one 4 KiB page.
*/
unsigned long *sb_bitmap;
unsigned long *sb_states;
} sbm;
struct {
/* Id of the first big block of this device. */
unsigned long first_bb_id;
/* Id of the last usable big block of this device. */
unsigned long last_usable_bb_id;
/* Id of the next big block to prepare when needed. */
unsigned long next_bb_id;
/* Summary of all big block states. */
unsigned long bb_count[VIRTIO_MEM_BBM_BB_COUNT];
/* One byte state per big block. See sbm.mb_states. */
uint8_t *bb_states;
/* The block size used for plugging/adding/removing. */
uint64_t bb_size;
} bbm;
};
/*
* Mutex that protects the nb_mb_state, mb_state, and sb_bitmap.
* Mutex that protects the sbm.mb_count, sbm.mb_states,
* sbm.sb_states, bbm.bb_count, and bbm.bb_states
*
* When this lock is held the pointers can't change, ONLINE and
* OFFLINE blocks can't change the state and no subblocks will get
......@@ -160,6 +249,11 @@ static DEFINE_MUTEX(virtio_mem_mutex);
static LIST_HEAD(virtio_mem_devices);
static void virtio_mem_online_page_cb(struct page *page, unsigned int order);
static void virtio_mem_fake_offline_going_offline(unsigned long pfn,
unsigned long nr_pages);
static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn,
unsigned long nr_pages);
static void virtio_mem_retry(struct virtio_mem *vm);
/*
* Register a virtio-mem device so it will be considered for the online_page
......@@ -212,6 +306,24 @@ static unsigned long virtio_mem_mb_id_to_phys(unsigned long mb_id)
return mb_id * memory_block_size_bytes();
}
/*
 * Map a physical address to the id of the big block it falls into.
 */
static unsigned long virtio_mem_phys_to_bb_id(struct virtio_mem *vm,
					      uint64_t addr)
{
	const uint64_t bb_size = vm->bbm.bb_size;

	return addr / bb_size;
}
/*
 * Map a big block id to the physical address at which that block starts.
 */
static uint64_t virtio_mem_bb_id_to_phys(struct virtio_mem *vm,
					 unsigned long bb_id)
{
	const uint64_t bb_size = vm->bbm.bb_size;

	return bb_id * bb_size;
}
/*
* Calculate the subblock id of a given address.
*/
......@@ -221,89 +333,164 @@ static unsigned long virtio_mem_phys_to_sb_id(struct virtio_mem *vm,
const unsigned long mb_id = virtio_mem_phys_to_mb_id(addr);
const unsigned long mb_addr = virtio_mem_mb_id_to_phys(mb_id);
return (addr - mb_addr) / vm->subblock_size;
return (addr - mb_addr) / vm->sbm.sb_size;
}
/*
* Set the state of a big block, taking care of the state counter.
*/
static void virtio_mem_bbm_set_bb_state(struct virtio_mem *vm,
unsigned long bb_id,
enum virtio_mem_bbm_bb_state state)
{
const unsigned long idx = bb_id - vm->bbm.first_bb_id;
enum virtio_mem_bbm_bb_state old_state;
old_state = vm->bbm.bb_states[idx];
vm->bbm.bb_states[idx] = state;
BUG_ON(vm->bbm.bb_count[old_state] == 0);
vm->bbm.bb_count[old_state]--;
vm->bbm.bb_count[state]++;
}
/*
 * Look up the currently recorded state of a big block.
 */
static enum virtio_mem_bbm_bb_state virtio_mem_bbm_get_bb_state(struct virtio_mem *vm,
								unsigned long bb_id)
{
	/* The state array is indexed relative to the first big block id. */
	const unsigned long idx = bb_id - vm->bbm.first_bb_id;

	return vm->bbm.bb_states[idx];
}
/*
 * Make sure the big block state array has room for one more big block.
 *
 * The array holds one byte per big block and is allocated in whole pages.
 * Returns 0 on success (including when nothing had to be done) and -ENOMEM
 * if a larger array could not be allocated.
 */
static int virtio_mem_bbm_bb_states_prepare_next_bb(struct virtio_mem *vm)
{
	const unsigned long cur_bytes = vm->bbm.next_bb_id - vm->bbm.first_bb_id;
	const int cur_pages = PFN_UP(cur_bytes);
	const int want_pages = PFN_UP(cur_bytes + 1);
	uint8_t *grown;

	/* The existing allocation already covers one more state byte. */
	if (vm->bbm.bb_states && cur_pages == want_pages)
		return 0;

	grown = vzalloc(want_pages * PAGE_SIZE);
	if (!grown)
		return -ENOMEM;

	/* Copy the old contents and swap the pointer under the hotplug mutex. */
	mutex_lock(&vm->hotplug_mutex);
	if (vm->bbm.bb_states)
		memcpy(grown, vm->bbm.bb_states, cur_pages * PAGE_SIZE);
	vfree(vm->bbm.bb_states);
	vm->bbm.bb_states = grown;
	mutex_unlock(&vm->hotplug_mutex);

	return 0;
}
/*
 * Iterate, in ascending id order, over all big blocks of _vm that are
 * currently in state _state. _bb_id is the caller-provided iteration
 * variable. The loop stops early once the counter for _state is zero.
 *
 * The macro expands to "for (...) if (...)", so it must be followed by a
 * single statement or a braced body.
 */
#define virtio_mem_bbm_for_each_bb(_vm, _bb_id, _state) \
	for (_bb_id = (_vm)->bbm.first_bb_id; \
	     _bb_id < (_vm)->bbm.next_bb_id && (_vm)->bbm.bb_count[_state]; \
	     _bb_id++) \
		if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state)
/*
 * Iterate, in descending id order, over all big blocks of _vm that are
 * currently in state _state. _bb_id is the caller-provided iteration
 * variable. The loop stops early once the counter for _state is zero.
 *
 * The macro expands to "for (...) if (...)", so it must be followed by a
 * single statement or a braced body.
 *
 * NOTE(review): if _bb_id is unsigned and first_bb_id can be 0, the
 * "_bb_id >= first_bb_id" test alone never terminates the loop - confirm
 * that first_bb_id is always > 0.
 */
#define virtio_mem_bbm_for_each_bb_rev(_vm, _bb_id, _state) \
	for (_bb_id = (_vm)->bbm.next_bb_id - 1; \
	     _bb_id >= (_vm)->bbm.first_bb_id && (_vm)->bbm.bb_count[_state]; \
	     _bb_id--) \
		if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state)
/*
* Set the state of a memory block, taking care of the state counter.
*/
static void virtio_mem_mb_set_state(struct virtio_mem *vm, unsigned long mb_id,
enum virtio_mem_mb_state state)
static void virtio_mem_sbm_set_mb_state(struct virtio_mem *vm,
unsigned long mb_id, uint8_t state)
{
const unsigned long idx = mb_id - vm->first_mb_id;
enum virtio_mem_mb_state old_state;
const unsigned long idx = mb_id - vm->sbm.first_mb_id;
uint8_t old_state;
old_state = vm->mb_state[idx];
vm->mb_state[idx] = state;
old_state = vm->sbm.mb_states[idx];
vm->sbm.mb_states[idx] = state;
BUG_ON(vm->nb_mb_state[old_state] == 0);
vm->nb_mb_state[old_state]--;
vm->nb_mb_state[state]++;
BUG_ON(vm->sbm.mb_count[old_state] == 0);
vm->sbm.mb_count[old_state]--;
vm->sbm.mb_count[state]++;
}
/*
* Get the state of a memory block.
*/
static enum virtio_mem_mb_state virtio_mem_mb_get_state(struct virtio_mem *vm,
static uint8_t virtio_mem_sbm_get_mb_state(struct virtio_mem *vm,
unsigned long mb_id)
{
const unsigned long idx = mb_id - vm->first_mb_id;
const unsigned long idx = mb_id - vm->sbm.first_mb_id;
return vm->mb_state[idx];
return vm->sbm.mb_states[idx];
}
/*
* Prepare the state array for the next memory block.
*/
static int virtio_mem_mb_state_prepare_next_mb(struct virtio_mem *vm)
static int virtio_mem_sbm_mb_states_prepare_next_mb(struct virtio_mem *vm)
{
unsigned long old_bytes = vm->next_mb_id - vm->first_mb_id + 1;
unsigned long new_bytes = vm->next_mb_id - vm->first_mb_id + 2;
int old_pages = PFN_UP(old_bytes);
int new_pages = PFN_UP(new_bytes);
uint8_t *new_mb_state;
int old_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id);
int new_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id + 1);
uint8_t *new_array;
if (vm->mb_state && old_pages == new_pages)
if (vm->sbm.mb_states && old_pages == new_pages)
return 0;
new_mb_state = vzalloc(new_pages * PAGE_SIZE);
if (!new_mb_state)
new_array = vzalloc(new_pages * PAGE_SIZE);
if (!new_array)
return -ENOMEM;
mutex_lock(&vm->hotplug_mutex);
if (vm->mb_state)
memcpy(new_mb_state, vm->mb_state, old_pages * PAGE_SIZE);
vfree(vm->mb_state);
vm->mb_state = new_mb_state;
if (vm->sbm.mb_states)
memcpy(new_array, vm->sbm.mb_states, old_pages * PAGE_SIZE);
vfree(vm->sbm.mb_states);
vm->sbm.mb_states = new_array;
mutex_unlock(&vm->hotplug_mutex);
return 0;
}
#define virtio_mem_for_each_mb_state(_vm, _mb_id, _state) \
for (_mb_id = _vm->first_mb_id; \
_mb_id < _vm->next_mb_id && _vm->nb_mb_state[_state]; \
#define virtio_mem_sbm_for_each_mb(_vm, _mb_id, _state) \
for (_mb_id = _vm->sbm.first_mb_id; \
_mb_id < _vm->sbm.next_mb_id && _vm->sbm.mb_count[_state]; \
_mb_id++) \
if (virtio_mem_mb_get_state(_vm, _mb_id) == _state)
if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state)
#define virtio_mem_for_each_mb_state_rev(_vm, _mb_id, _state) \
for (_mb_id = _vm->next_mb_id - 1; \
_mb_id >= _vm->first_mb_id && _vm->nb_mb_state[_state]; \
#define virtio_mem_sbm_for_each_mb_rev(_vm, _mb_id, _state) \
for (_mb_id = _vm->sbm.next_mb_id - 1; \
_mb_id >= _vm->sbm.first_mb_id && _vm->sbm.mb_count[_state]; \
_mb_id--) \
if (virtio_mem_mb_get_state(_vm, _mb_id) == _state)
if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state)
/*
 * Compute the position of a subblock's bit inside the subblock bitmap:
 * each memory block owns sbs_per_mb consecutive bits, ordered by memory
 * block id relative to the first memory block.
 */
static int virtio_mem_sbm_sb_state_bit_nr(struct virtio_mem *vm,
					  unsigned long mb_id, int sb_id)
{
	const unsigned long mb_idx = mb_id - vm->sbm.first_mb_id;

	return mb_idx * vm->sbm.sbs_per_mb + sb_id;
}
/*
* Mark all selected subblocks plugged.
*
* Will not modify the state of the memory block.
*/
static void virtio_mem_mb_set_sb_plugged(struct virtio_mem *vm,
static void virtio_mem_sbm_set_sb_plugged(struct virtio_mem *vm,
unsigned long mb_id, int sb_id,
int count)
{
const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id;
const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
__bitmap_set(vm->sb_bitmap, bit, count);
__bitmap_set(vm->sbm.sb_states, bit, count);
}
/*
......@@ -311,105 +498,114 @@ static void virtio_mem_mb_set_sb_plugged(struct virtio_mem *vm,
*
* Will not modify the state of the memory block.
*/
static void virtio_mem_mb_set_sb_unplugged(struct virtio_mem *vm,
static void virtio_mem_sbm_set_sb_unplugged(struct virtio_mem *vm,
unsigned long mb_id, int sb_id,
int count)
{
const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id;
const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
__bitmap_clear(vm->sb_bitmap, bit, count);
__bitmap_clear(vm->sbm.sb_states, bit, count);
}
/*
* Test if all selected subblocks are plugged.
*/
static bool virtio_mem_mb_test_sb_plugged(struct virtio_mem *vm,
static bool virtio_mem_sbm_test_sb_plugged(struct virtio_mem *vm,
unsigned long mb_id, int sb_id,
int count)
{
const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id;
const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
if (count == 1)
return test_bit(bit, vm->sb_bitmap);
return test_bit(bit, vm->sbm.sb_states);
/* TODO: Helper similar to bitmap_set() */
return find_next_zero_bit(vm->sb_bitmap, bit + count, bit) >=
return find_next_zero_bit(vm->sbm.sb_states, bit + count, bit) >=
bit + count;
}
/*
* Test if all selected subblocks are unplugged.
*/
static bool virtio_mem_mb_test_sb_unplugged(struct virtio_mem *vm,
static bool virtio_mem_sbm_test_sb_unplugged(struct virtio_mem *vm,
unsigned long mb_id, int sb_id,
int count)
{
const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id;
const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
/* TODO: Helper similar to bitmap_set() */
return find_next_bit(vm->sb_bitmap, bit + count, bit) >= bit + count;
return find_next_bit(vm->sbm.sb_states, bit + count, bit) >=
bit + count;
}
/*
* Find the first unplugged subblock. Returns vm->nb_sb_per_mb in case there is
* Find the first unplugged subblock. Returns vm->sbm.sbs_per_mb in case there is
* none.
*/
static int virtio_mem_mb_first_unplugged_sb(struct virtio_mem *vm,
static int virtio_mem_sbm_first_unplugged_sb(struct virtio_mem *vm,
unsigned long mb_id)
{
const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb;
const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, 0);
return find_next_zero_bit(vm->sb_bitmap, bit + vm->nb_sb_per_mb, bit) -
bit;
return find_next_zero_bit(vm->sbm.sb_states,
bit + vm->sbm.sbs_per_mb, bit) - bit;
}
/*
* Prepare the subblock bitmap for the next memory block.
*/
static int virtio_mem_sb_bitmap_prepare_next_mb(struct virtio_mem *vm)
static int virtio_mem_sbm_sb_states_prepare_next_mb(struct virtio_mem *vm)
{
const unsigned long old_nb_mb = vm->next_mb_id - vm->first_mb_id;
const unsigned long old_nb_bits = old_nb_mb * vm->nb_sb_per_mb;
const unsigned long new_nb_bits = (old_nb_mb + 1) * vm->nb_sb_per_mb;
const unsigned long old_nb_mb = vm->sbm.next_mb_id - vm->sbm.first_mb_id;
const unsigned long old_nb_bits = old_nb_mb * vm->sbm.sbs_per_mb;
const unsigned long new_nb_bits = (old_nb_mb + 1) * vm->sbm.sbs_per_mb;
int old_pages = PFN_UP(BITS_TO_LONGS(old_nb_bits) * sizeof(long));
int new_pages = PFN_UP(BITS_TO_LONGS(new_nb_bits) * sizeof(long));
unsigned long *new_sb_bitmap, *old_sb_bitmap;
unsigned long *new_bitmap, *old_bitmap;
if (vm->sb_bitmap && old_pages == new_pages)
if (vm->sbm.sb_states && old_pages == new_pages)
return 0;
new_sb_bitmap = vzalloc(new_pages * PAGE_SIZE);
if (!new_sb_bitmap)
new_bitmap = vzalloc(new_pages * PAGE_SIZE);
if (!new_bitmap)
return -ENOMEM;
mutex_lock(&vm->hotplug_mutex);
if (new_sb_bitmap)
memcpy(new_sb_bitmap, vm->sb_bitmap, old_pages * PAGE_SIZE);
if (new_bitmap)
memcpy(new_bitmap, vm->sbm.sb_states, old_pages * PAGE_SIZE);
old_sb_bitmap = vm->sb_bitmap;
vm->sb_bitmap = new_sb_bitmap;
old_bitmap = vm->sbm.sb_states;
vm->sbm.sb_states = new_bitmap;
mutex_unlock(&vm->hotplug_mutex);
vfree(old_sb_bitmap);
vfree(old_bitmap);
return 0;
}
/*
* Try to add a memory block to Linux. This will usually only fail
* if out of memory.
* Test if we could add memory without creating too much offline memory -
* to avoid running OOM if memory is getting onlined deferred.
*/
static bool virtio_mem_could_add_memory(struct virtio_mem *vm, uint64_t size)
{
	bool fits;

	/* A single request above the threshold is unexpected: warn once. */
	if (WARN_ON_ONCE(size > vm->offline_threshold))
		return false;

	/* Adding @size must not push the offline total past the threshold. */
	fits = atomic64_read(&vm->offline_size) + size <= vm->offline_threshold;
	return fits;
}
/*
* Try adding memory to Linux. Will usually only fail if out of memory.
*
* Must not be called with the vm->hotplug_mutex held (possible deadlock with
* onlining code).
*
* Will not modify the state of the memory block.
* Will not modify the state of memory blocks in virtio-mem.
*/
static int virtio_mem_mb_add(struct virtio_mem *vm, unsigned long mb_id)
static int virtio_mem_add_memory(struct virtio_mem *vm, uint64_t addr,
uint64_t size)
{
const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
int nid = vm->nid;
if (nid == NUMA_NO_NODE)
nid = memory_add_physaddr_to_nid(addr);
int rc;
/*
* When force-unloading the driver and we still have memory added to
......@@ -422,53 +618,155 @@ static int virtio_mem_mb_add(struct virtio_mem *vm, unsigned long mb_id)
return -ENOMEM;
}
dev_dbg(&vm->vdev->dev, "adding memory block: %lu\n", mb_id);
return add_memory_driver_managed(nid, addr, memory_block_size_bytes(),
vm->resource_name,
dev_dbg(&vm->vdev->dev, "adding memory: 0x%llx - 0x%llx\n", addr,
addr + size - 1);
/* Memory might get onlined immediately. */
atomic64_add(size, &vm->offline_size);
rc = add_memory_driver_managed(vm->nid, addr, size, vm->resource_name,
MEMHP_MERGE_RESOURCE);
if (rc) {
atomic64_sub(size, &vm->offline_size);
dev_warn(&vm->vdev->dev, "adding memory failed: %d\n", rc);
/*
* TODO: Linux MM does not properly clean up yet in all cases
* where adding of memory failed - especially on -ENOMEM.
*/
}
return rc;
}
/*
 * Add exactly one Linux memory block, identified by its memory block id,
 * via virtio_mem_add_memory(); that function's rules and return value apply.
 */
static int virtio_mem_sbm_add_mb(struct virtio_mem *vm, unsigned long mb_id)
{
	return virtio_mem_add_memory(vm, virtio_mem_mb_id_to_phys(mb_id),
				     memory_block_size_bytes());
}
/*
* Try to remove a memory block from Linux. Will only fail if the memory block
* is not offline.
* See virtio_mem_add_memory(): Try adding a big block.
*/
static int virtio_mem_bbm_add_bb(struct virtio_mem *vm, unsigned long bb_id)
{
	/* One big block spans bb_size bytes starting at its physical address. */
	return virtio_mem_add_memory(vm, virtio_mem_bb_id_to_phys(vm, bb_id),
				     vm->bbm.bb_size);
}
/*
* Try removing memory from Linux. Will only fail if memory blocks aren't
* offline.
*
* Must not be called with the vm->hotplug_mutex held (possible deadlock with
* onlining code).
*
* Will not modify the state of the memory block.
* Will not modify the state of memory blocks in virtio-mem.
*/
static int virtio_mem_remove_memory(struct virtio_mem *vm, uint64_t addr,
				    uint64_t size)
{
	int rc;

	dev_dbg(&vm->vdev->dev, "removing memory: 0x%llx - 0x%llx\n", addr,
		addr + size - 1);

	rc = remove_memory(vm->nid, addr, size);
	if (rc) {
		dev_dbg(&vm->vdev->dev, "removing memory failed: %d\n", rc);
		return rc;
	}

	/* The removed range no longer counts as offline memory. */
	atomic64_sub(size, &vm->offline_size);
	/*
	 * Removal might have freed up memory that can now be unplugged;
	 * retry right away rather than waiting.
	 */
	virtio_mem_retry(vm);
	return 0;
}
/*
* See virtio_mem_remove_memory(): Try removing a single Linux memory block.
*/
static int virtio_mem_mb_remove(struct virtio_mem *vm, unsigned long mb_id)
static int virtio_mem_sbm_remove_mb(struct virtio_mem *vm, unsigned long mb_id)
{
const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
int nid = vm->nid;
const uint64_t size = memory_block_size_bytes();
if (nid == NUMA_NO_NODE)
nid = memory_add_physaddr_to_nid(addr);
return virtio_mem_remove_memory(vm, addr, size);
}
/*
* See virtio_mem_remove_memory(): Try to remove all Linux memory blocks covered
* by the big block.
*/
static int virtio_mem_bbm_remove_bb(struct virtio_mem *vm, unsigned long bb_id)
{
const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
const uint64_t size = vm->bbm.bb_size;
dev_dbg(&vm->vdev->dev, "removing memory block: %lu\n", mb_id);
return remove_memory(nid, addr, memory_block_size_bytes());
return virtio_mem_remove_memory(vm, addr, size);
}
/*
* Try to offline and remove a memory block from Linux.
* Try offlining and removing memory from Linux.
*
* Must not be called with the vm->hotplug_mutex held (possible deadlock with
* onlining code).
*
* Will not modify the state of the memory block.
* Will not modify the state of memory blocks in virtio-mem.
*/
static int virtio_mem_offline_and_remove_memory(struct virtio_mem *vm,
uint64_t addr,
uint64_t size)
{
int rc;
dev_dbg(&vm->vdev->dev,
"offlining and removing memory: 0x%llx - 0x%llx\n", addr,
addr + size - 1);
rc = offline_and_remove_memory(vm->nid, addr, size);
if (!rc) {
atomic64_sub(size, &vm->offline_size);
/*
* We might have freed up memory we can now unplug, retry
* immediately instead of waiting.
*/
static int virtio_mem_mb_offline_and_remove(struct virtio_mem *vm,
virtio_mem_retry(vm);
} else {
dev_dbg(&vm->vdev->dev,
"offlining and removing memory failed: %d\n", rc);
}
return rc;
}
/*
* See virtio_mem_offline_and_remove_memory(): Try offlining and removing
* a single Linux memory block.
*/
static int virtio_mem_sbm_offline_and_remove_mb(struct virtio_mem *vm,
unsigned long mb_id)
{
const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
int nid = vm->nid;
const uint64_t size = memory_block_size_bytes();
return virtio_mem_offline_and_remove_memory(vm, addr, size);
}
if (nid == NUMA_NO_NODE)
nid = memory_add_physaddr_to_nid(addr);
/*
* See virtio_mem_offline_and_remove_memory(): Try to offline and remove
* all Linux memory blocks covered by the big block.
*/
static int virtio_mem_bbm_offline_and_remove_bb(struct virtio_mem *vm,
unsigned long bb_id)
{
const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
const uint64_t size = vm->bbm.bb_size;
dev_dbg(&vm->vdev->dev, "offlining and removing memory block: %lu\n",
mb_id);
return offline_and_remove_memory(nid, addr, memory_block_size_bytes());
return virtio_mem_offline_and_remove_memory(vm, addr, size);
}
/*
......@@ -499,31 +797,28 @@ static int virtio_mem_translate_node_id(struct virtio_mem *vm, uint16_t node_id)
* Test if a virtio-mem device overlaps with the given range. Can be called
* from (notifier) callbacks lockless.
*/
static bool virtio_mem_overlaps_range(struct virtio_mem *vm,
unsigned long start, unsigned long size)
static bool virtio_mem_overlaps_range(struct virtio_mem *vm, uint64_t start,
uint64_t size)
{
unsigned long dev_start = virtio_mem_mb_id_to_phys(vm->first_mb_id);
unsigned long dev_end = virtio_mem_mb_id_to_phys(vm->last_mb_id) +
memory_block_size_bytes();
return start < dev_end && dev_start < start + size;
return start < vm->addr + vm->region_size && vm->addr < start + size;
}
/*
* Test if a virtio-mem device owns a memory block. Can be called from
* Test if a virtio-mem device contains a given range. Can be called from
* (notifier) callbacks lockless.
*/
static bool virtio_mem_owned_mb(struct virtio_mem *vm, unsigned long mb_id)
static bool virtio_mem_contains_range(struct virtio_mem *vm, uint64_t start,
uint64_t size)
{
return mb_id >= vm->first_mb_id && mb_id <= vm->last_mb_id;
return start >= vm->addr && start + size <= vm->addr + vm->region_size;
}
static int virtio_mem_notify_going_online(struct virtio_mem *vm,
static int virtio_mem_sbm_notify_going_online(struct virtio_mem *vm,
unsigned long mb_id)
{
switch (virtio_mem_mb_get_state(vm, mb_id)) {
case VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL:
case VIRTIO_MEM_MB_STATE_OFFLINE:
switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL:
case VIRTIO_MEM_SBM_MB_OFFLINE:
return NOTIFY_OK;
default:
break;
......@@ -533,108 +828,100 @@ static int virtio_mem_notify_going_online(struct virtio_mem *vm,
return NOTIFY_BAD;
}
static void virtio_mem_notify_offline(struct virtio_mem *vm,
static void virtio_mem_sbm_notify_offline(struct virtio_mem *vm,
unsigned long mb_id)
{
switch (virtio_mem_mb_get_state(vm, mb_id)) {
case VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL:
virtio_mem_mb_set_state(vm, mb_id,
VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL);
switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
case VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL:
virtio_mem_sbm_set_mb_state(vm, mb_id,
VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL);
break;
case VIRTIO_MEM_MB_STATE_ONLINE:
virtio_mem_mb_set_state(vm, mb_id,
VIRTIO_MEM_MB_STATE_OFFLINE);
case VIRTIO_MEM_SBM_MB_ONLINE:
virtio_mem_sbm_set_mb_state(vm, mb_id,
VIRTIO_MEM_SBM_MB_OFFLINE);
break;
default:
BUG();
break;
}
/*
* Trigger the workqueue, maybe we can now unplug memory. Also,
* when we offline and remove a memory block, this will re-trigger
* us immediately - which is often nice because the removal of
* the memory block (e.g., memmap) might have freed up memory
* on other memory blocks we manage.
*/
virtio_mem_retry(vm);
}
static void virtio_mem_notify_online(struct virtio_mem *vm, unsigned long mb_id)
static void virtio_mem_sbm_notify_online(struct virtio_mem *vm,
unsigned long mb_id)
{
unsigned long nb_offline;
switch (virtio_mem_mb_get_state(vm, mb_id)) {
case VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL:
virtio_mem_mb_set_state(vm, mb_id,
VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL);
switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL:
virtio_mem_sbm_set_mb_state(vm, mb_id,
VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL);
break;
case VIRTIO_MEM_MB_STATE_OFFLINE:
virtio_mem_mb_set_state(vm, mb_id, VIRTIO_MEM_MB_STATE_ONLINE);
case VIRTIO_MEM_SBM_MB_OFFLINE:
virtio_mem_sbm_set_mb_state(vm, mb_id,
VIRTIO_MEM_SBM_MB_ONLINE);
break;
default:
BUG();
break;
}
nb_offline = vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE] +
vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL];
/* see if we can add new blocks now that we onlined one block */
if (nb_offline == VIRTIO_MEM_NB_OFFLINE_THRESHOLD - 1)
virtio_mem_retry(vm);
}
static void virtio_mem_notify_going_offline(struct virtio_mem *vm,
static void virtio_mem_sbm_notify_going_offline(struct virtio_mem *vm,
unsigned long mb_id)
{
const unsigned long nr_pages = PFN_DOWN(vm->subblock_size);
struct page *page;
const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size);
unsigned long pfn;
int sb_id, i;
int sb_id;
for (sb_id = 0; sb_id < vm->nb_sb_per_mb; sb_id++) {
if (virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id, 1))
for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) {
if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
continue;
/*
* Drop our reference to the pages so the memory can get
* offlined and add the unplugged pages to the managed
* page counters (so offlining code can correctly subtract
* them again).
*/
pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
sb_id * vm->subblock_size);
adjust_managed_page_count(pfn_to_page(pfn), nr_pages);
for (i = 0; i < nr_pages; i++) {
page = pfn_to_page(pfn + i);
if (WARN_ON(!page_ref_dec_and_test(page)))
dump_page(page, "unplugged page referenced");
}
sb_id * vm->sbm.sb_size);
virtio_mem_fake_offline_going_offline(pfn, nr_pages);
}
}
static void virtio_mem_notify_cancel_offline(struct virtio_mem *vm,
static void virtio_mem_sbm_notify_cancel_offline(struct virtio_mem *vm,
unsigned long mb_id)
{
const unsigned long nr_pages = PFN_DOWN(vm->subblock_size);
const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size);
unsigned long pfn;
int sb_id, i;
int sb_id;
for (sb_id = 0; sb_id < vm->nb_sb_per_mb; sb_id++) {
if (virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id, 1))
for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) {
if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
continue;
/*
* Get the reference we dropped when going offline and
* subtract the unplugged pages from the managed page
* counters.
*/
pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
sb_id * vm->subblock_size);
adjust_managed_page_count(pfn_to_page(pfn), -nr_pages);
for (i = 0; i < nr_pages; i++)
page_ref_inc(pfn_to_page(pfn + i));
sb_id * vm->sbm.sb_size);
virtio_mem_fake_offline_cancel_offline(pfn, nr_pages);
}
}
static void virtio_mem_bbm_notify_going_offline(struct virtio_mem *vm,
						unsigned long bb_id,
						unsigned long pfn,
						unsigned long nr_pages)
{
	/*
	 * Only big blocks marked "fake-offline" have their online memory
	 * allocated by us; for any other state we hold no memory and there
	 * is nothing to prepare.
	 */
	if (virtio_mem_bbm_get_bb_state(vm, bb_id) ==
	    VIRTIO_MEM_BBM_BB_FAKE_OFFLINE)
		virtio_mem_fake_offline_going_offline(pfn, nr_pages);
}
static void virtio_mem_bbm_notify_cancel_offline(struct virtio_mem *vm,
						 unsigned long bb_id,
						 unsigned long pfn,
						 unsigned long nr_pages)
{
	/* Only fake-offline big blocks were touched when going offline. */
	if (virtio_mem_bbm_get_bb_state(vm, bb_id) ==
	    VIRTIO_MEM_BBM_BB_FAKE_OFFLINE)
		virtio_mem_fake_offline_cancel_offline(pfn, nr_pages);
}
/*
* This callback will either be called synchronously from add_memory() or
* asynchronously (e.g., triggered via user space). We have to be careful
......@@ -648,20 +935,33 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb,
struct memory_notify *mhp = arg;
const unsigned long start = PFN_PHYS(mhp->start_pfn);
const unsigned long size = PFN_PHYS(mhp->nr_pages);
const unsigned long mb_id = virtio_mem_phys_to_mb_id(start);
int rc = NOTIFY_OK;
unsigned long id;
if (!virtio_mem_overlaps_range(vm, start, size))
return NOTIFY_DONE;
if (vm->in_sbm) {
id = virtio_mem_phys_to_mb_id(start);
/*
* Memory is onlined/offlined in memory block granularity. We cannot
* cross virtio-mem device boundaries and memory block boundaries. Bail
* out if this ever changes.
* In SBM, we add memory in separate memory blocks - we expect
* it to be onlined/offlined in the same granularity. Bail out
* if this ever changes.
*/
if (WARN_ON_ONCE(size != memory_block_size_bytes() ||
!IS_ALIGNED(start, memory_block_size_bytes())))
return NOTIFY_BAD;
} else {
id = virtio_mem_phys_to_bb_id(vm, start);
/*
* In BBM, we only care about onlining/offlining happening
* within a single big block, we don't care about the
* actual granularity as we don't track individual Linux
* memory blocks.
*/
if (WARN_ON_ONCE(id != virtio_mem_phys_to_bb_id(vm, start + size - 1)))
return NOTIFY_BAD;
}
/*
* Avoid circular locking lockdep warnings. We lock the mutex
......@@ -680,7 +980,12 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb,
break;
}
vm->hotplug_active = true;
virtio_mem_notify_going_offline(vm, mb_id);
if (vm->in_sbm)
virtio_mem_sbm_notify_going_offline(vm, id);
else
virtio_mem_bbm_notify_going_offline(vm, id,
mhp->start_pfn,
mhp->nr_pages);
break;
case MEM_GOING_ONLINE:
mutex_lock(&vm->hotplug_mutex);
......@@ -690,22 +995,51 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb,
break;
}
vm->hotplug_active = true;
rc = virtio_mem_notify_going_online(vm, mb_id);
if (vm->in_sbm)
rc = virtio_mem_sbm_notify_going_online(vm, id);
break;
case MEM_OFFLINE:
virtio_mem_notify_offline(vm, mb_id);
if (vm->in_sbm)
virtio_mem_sbm_notify_offline(vm, id);
atomic64_add(size, &vm->offline_size);
/*
* Trigger the workqueue. Now that we have some offline memory,
* maybe we can handle pending unplug requests.
*/
if (!unplug_online)
virtio_mem_retry(vm);
vm->hotplug_active = false;
mutex_unlock(&vm->hotplug_mutex);
break;
case MEM_ONLINE:
virtio_mem_notify_online(vm, mb_id);
if (vm->in_sbm)
virtio_mem_sbm_notify_online(vm, id);
atomic64_sub(size, &vm->offline_size);
/*
* Start adding more memory once we onlined half of our
* threshold. Don't trigger if it's possibly due to our action
* (e.g., us adding memory which gets onlined immediately from
* the core).
*/
if (!atomic_read(&vm->wq_active) &&
virtio_mem_could_add_memory(vm, vm->offline_threshold / 2))
virtio_mem_retry(vm);
vm->hotplug_active = false;
mutex_unlock(&vm->hotplug_mutex);
break;
case MEM_CANCEL_OFFLINE:
if (!vm->hotplug_active)
break;
virtio_mem_notify_cancel_offline(vm, mb_id);
if (vm->in_sbm)
virtio_mem_sbm_notify_cancel_offline(vm, id);
else
virtio_mem_bbm_notify_cancel_offline(vm, id,
mhp->start_pfn,
mhp->nr_pages);
vm->hotplug_active = false;
mutex_unlock(&vm->hotplug_mutex);
break;
......@@ -729,7 +1063,7 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb,
* (via generic_online_page()) using PageDirty().
*/
static void virtio_mem_set_fake_offline(unsigned long pfn,
unsigned int nr_pages, bool onlined)
unsigned long nr_pages, bool onlined)
{
for (; nr_pages--; pfn++) {
struct page *page = pfn_to_page(pfn);
......@@ -748,7 +1082,7 @@ static void virtio_mem_set_fake_offline(unsigned long pfn,
* (via generic_online_page()), clear PageDirty().
*/
static void virtio_mem_clear_fake_offline(unsigned long pfn,
unsigned int nr_pages, bool onlined)
unsigned long nr_pages, bool onlined)
{
for (; nr_pages--; pfn++) {
struct page *page = pfn_to_page(pfn);
......@@ -763,16 +1097,17 @@ static void virtio_mem_clear_fake_offline(unsigned long pfn,
* Release a range of fake-offline pages to the buddy, effectively
* fake-onlining them.
*/
static void virtio_mem_fake_online(unsigned long pfn, unsigned int nr_pages)
static void virtio_mem_fake_online(unsigned long pfn, unsigned long nr_pages)
{
const int order = MAX_ORDER - 1;
int i;
const unsigned long max_nr_pages = MAX_ORDER_NR_PAGES;
unsigned long i;
/*
* We are always called with subblock granularity, which is at least
* aligned to MAX_ORDER - 1.
* We are always called at least with MAX_ORDER_NR_PAGES
* granularity/alignment (e.g., the way subblocks work). All pages
* inside such a block are alike.
*/
for (i = 0; i < nr_pages; i += 1 << order) {
for (i = 0; i < nr_pages; i += max_nr_pages) {
struct page *page = pfn_to_page(pfn + i);
/*
......@@ -782,42 +1117,128 @@ static void virtio_mem_fake_online(unsigned long pfn, unsigned int nr_pages)
* alike.
*/
if (PageDirty(page)) {
virtio_mem_clear_fake_offline(pfn + i, 1 << order,
virtio_mem_clear_fake_offline(pfn + i, max_nr_pages,
false);
generic_online_page(page, order);
generic_online_page(page, MAX_ORDER - 1);
} else {
virtio_mem_clear_fake_offline(pfn + i, 1 << order,
virtio_mem_clear_fake_offline(pfn + i, max_nr_pages,
true);
free_contig_range(pfn + i, 1 << order);
adjust_managed_page_count(page, 1 << order);
free_contig_range(pfn + i, max_nr_pages);
adjust_managed_page_count(page, max_nr_pages);
}
}
}
/*
 * Fake-offline a page range: allocate it as one contiguous range and mark
 * the pages fake-offline.
 *
 * Returns 0 on success, -ENOMEM if the allocation ran out of memory and
 * -EBUSY if the range could not be allocated.
 */
static int virtio_mem_fake_offline(unsigned long pfn, unsigned long nr_pages)
{
	const bool is_movable = zone_idx(page_zone(pfn_to_page(pfn))) ==
				ZONE_MOVABLE;
	int rc, attempt;

	/*
	 * TODO: We want an alloc_contig_range() mode that tries to allocate
	 * harder (e.g., dealing with temporarily pinned pages, PCP), especially
	 * with ZONE_MOVABLE. Until then, give ZONE_MOVABLE a couple of
	 * attempts before giving up - that zone is supposed to give some
	 * guarantees. Other zones get a single attempt.
	 */
	for (attempt = 0; attempt < 5; attempt++) {
		rc = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE,
					GFP_KERNEL);
		if (!rc) {
			virtio_mem_set_fake_offline(pfn, nr_pages, true);
			adjust_managed_page_count(pfn_to_page(pfn), -nr_pages);
			return 0;
		}

		/* whoops, out of memory */
		if (rc == -ENOMEM)
			return rc;

		/* Retrying is only worthwhile for ZONE_MOVABLE. */
		if (!is_movable)
			break;
	}

	return -EBUSY;
}
/*
 * Prepare fake-offline pages for memory offlining, so that mm-core can
 * skip them while offlining the range.
 */
static void virtio_mem_fake_offline_going_offline(unsigned long pfn,
						  unsigned long nr_pages)
{
	unsigned long off;

	/*
	 * Add the unplugged pages back to the managed page counters so the
	 * offlining code can correctly subtract them again.
	 */
	adjust_managed_page_count(pfn_to_page(pfn), nr_pages);

	/* Drop our reference to each page so the memory can get offlined. */
	for (off = 0; off < nr_pages; off++) {
		struct page *page = pfn_to_page(pfn + off);

		if (WARN_ON(!page_ref_dec_and_test(page)))
			dump_page(page, "fake-offline page referenced");
	}
}
/*
 * Undo virtio_mem_fake_offline_going_offline() after memory offlining was
 * canceled.
 */
static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn,
						   unsigned long nr_pages)
{
	unsigned long off = 0;

	/* Subtract the unplugged pages from the managed page counters again. */
	adjust_managed_page_count(pfn_to_page(pfn), -nr_pages);

	/* Re-take the per-page reference dropped when going offline. */
	while (off < nr_pages) {
		page_ref_inc(pfn_to_page(pfn + off));
		off++;
	}
}
static void virtio_mem_online_page_cb(struct page *page, unsigned int order)
{
const unsigned long addr = page_to_phys(page);
const unsigned long mb_id = virtio_mem_phys_to_mb_id(addr);
unsigned long id, sb_id;
struct virtio_mem *vm;
int sb_id;
bool do_online;
/*
* We exploit here that subblocks have at least MAX_ORDER - 1
* size/alignment and that this callback is called with such a
* size/alignment. So we cannot cross subblocks and therefore
* also not memory blocks.
*/
rcu_read_lock();
list_for_each_entry_rcu(vm, &virtio_mem_devices, next) {
if (!virtio_mem_owned_mb(vm, mb_id))
if (!virtio_mem_contains_range(vm, addr, PFN_PHYS(1 << order)))
continue;
if (vm->in_sbm) {
/*
* We exploit here that subblocks have at least
* MAX_ORDER_NR_PAGES size/alignment - so we cannot
* cross subblocks within one call.
*/
id = virtio_mem_phys_to_mb_id(addr);
sb_id = virtio_mem_phys_to_sb_id(vm, addr);
do_online = virtio_mem_sbm_test_sb_plugged(vm, id,
sb_id, 1);
} else {
/*
* If plugged, online the pages, otherwise, set them fake
* offline (PageOffline).
* If the whole block is marked fake offline, keep
* everything that way.
*/
if (virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id, 1))
id = virtio_mem_phys_to_bb_id(vm, addr);
do_online = virtio_mem_bbm_get_bb_state(vm, id) !=
VIRTIO_MEM_BBM_BB_FAKE_OFFLINE;
}
if (do_online)
generic_online_page(page, order);
else
virtio_mem_set_fake_offline(PFN_DOWN(addr), 1 << order,
......@@ -870,23 +1291,33 @@ static int virtio_mem_send_plug_request(struct virtio_mem *vm, uint64_t addr,
.u.plug.addr = cpu_to_virtio64(vm->vdev, addr),
.u.plug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
};
int rc = -ENOMEM;
if (atomic_read(&vm->config_changed))
return -EAGAIN;
dev_dbg(&vm->vdev->dev, "plugging memory: 0x%llx - 0x%llx\n", addr,
addr + size - 1);
switch (virtio_mem_send_request(vm, &req)) {
case VIRTIO_MEM_RESP_ACK:
vm->plugged_size += size;
return 0;
case VIRTIO_MEM_RESP_NACK:
return -EAGAIN;
rc = -EAGAIN;
break;
case VIRTIO_MEM_RESP_BUSY:
return -ETXTBSY;
rc = -ETXTBSY;
break;
case VIRTIO_MEM_RESP_ERROR:
return -EINVAL;
rc = -EINVAL;
break;
default:
return -ENOMEM;
break;
}
dev_dbg(&vm->vdev->dev, "plugging memory failed: %d\n", rc);
return rc;
}
static int virtio_mem_send_unplug_request(struct virtio_mem *vm, uint64_t addr,
......@@ -898,21 +1329,30 @@ static int virtio_mem_send_unplug_request(struct virtio_mem *vm, uint64_t addr,
.u.unplug.addr = cpu_to_virtio64(vm->vdev, addr),
.u.unplug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
};
int rc = -ENOMEM;
if (atomic_read(&vm->config_changed))
return -EAGAIN;
dev_dbg(&vm->vdev->dev, "unplugging memory: 0x%llx - 0x%llx\n", addr,
addr + size - 1);
switch (virtio_mem_send_request(vm, &req)) {
case VIRTIO_MEM_RESP_ACK:
vm->plugged_size -= size;
return 0;
case VIRTIO_MEM_RESP_BUSY:
return -ETXTBSY;
rc = -ETXTBSY;
break;
case VIRTIO_MEM_RESP_ERROR:
return -EINVAL;
rc = -EINVAL;
break;
default:
return -ENOMEM;
break;
}
dev_dbg(&vm->vdev->dev, "unplugging memory failed: %d\n", rc);
return rc;
}
static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm)
......@@ -920,6 +1360,9 @@ static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm)
const struct virtio_mem_req req = {
.type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG_ALL),
};
int rc = -ENOMEM;
dev_dbg(&vm->vdev->dev, "unplugging all memory");
switch (virtio_mem_send_request(vm, &req)) {
case VIRTIO_MEM_RESP_ACK:
......@@ -929,30 +1372,31 @@ static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm)
atomic_set(&vm->config_changed, 1);
return 0;
case VIRTIO_MEM_RESP_BUSY:
return -ETXTBSY;
rc = -ETXTBSY;
break;
default:
return -ENOMEM;
break;
}
dev_dbg(&vm->vdev->dev, "unplugging all memory failed: %d\n", rc);
return rc;
}
/*
* Plug selected subblocks. Updates the plugged state, but not the state
* of the memory block.
*/
static int virtio_mem_mb_plug_sb(struct virtio_mem *vm, unsigned long mb_id,
static int virtio_mem_sbm_plug_sb(struct virtio_mem *vm, unsigned long mb_id,
int sb_id, int count)
{
const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) +
sb_id * vm->subblock_size;
const uint64_t size = count * vm->subblock_size;
sb_id * vm->sbm.sb_size;
const uint64_t size = count * vm->sbm.sb_size;
int rc;
dev_dbg(&vm->vdev->dev, "plugging memory block: %lu : %i - %i\n", mb_id,
sb_id, sb_id + count - 1);
rc = virtio_mem_send_plug_request(vm, addr, size);
if (!rc)
virtio_mem_mb_set_sb_plugged(vm, mb_id, sb_id, count);
virtio_mem_sbm_set_sb_plugged(vm, mb_id, sb_id, count);
return rc;
}
......@@ -960,23 +1404,46 @@ static int virtio_mem_mb_plug_sb(struct virtio_mem *vm, unsigned long mb_id,
* Unplug selected subblocks. Updates the plugged state, but not the state
* of the memory block.
*/
static int virtio_mem_mb_unplug_sb(struct virtio_mem *vm, unsigned long mb_id,
static int virtio_mem_sbm_unplug_sb(struct virtio_mem *vm, unsigned long mb_id,
int sb_id, int count)
{
const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) +
sb_id * vm->subblock_size;
const uint64_t size = count * vm->subblock_size;
sb_id * vm->sbm.sb_size;
const uint64_t size = count * vm->sbm.sb_size;
int rc;
dev_dbg(&vm->vdev->dev, "unplugging memory block: %lu : %i - %i\n",
mb_id, sb_id, sb_id + count - 1);
rc = virtio_mem_send_unplug_request(vm, addr, size);
if (!rc)
virtio_mem_mb_set_sb_unplugged(vm, mb_id, sb_id, count);
virtio_mem_sbm_set_sb_unplugged(vm, mb_id, sb_id, count);
return rc;
}
/*
 * Send an unplug request for one big block.
 *
 * The request covers exactly vm->bbm.bb_size bytes starting at the physical
 * address of the big block. The tracked state of the big block is left
 * untouched; the result of the unplug request is returned as is.
 */
static int virtio_mem_bbm_unplug_bb(struct virtio_mem *vm, unsigned long bb_id)
{
	const uint64_t bb_start = virtio_mem_bb_id_to_phys(vm, bb_id);
	const uint64_t bb_bytes = vm->bbm.bb_size;

	return virtio_mem_send_unplug_request(vm, bb_start, bb_bytes);
}
/*
 * Send a plug request for one big block.
 *
 * The request covers exactly vm->bbm.bb_size bytes starting at the physical
 * address of the big block. The tracked state of the big block is left
 * untouched; the result of the plug request is returned as is.
 */
static int virtio_mem_bbm_plug_bb(struct virtio_mem *vm, unsigned long bb_id)
{
	const uint64_t bb_start = virtio_mem_bb_id_to_phys(vm, bb_id);
	const uint64_t bb_bytes = vm->bbm.bb_size;

	return virtio_mem_send_plug_request(vm, bb_start, bb_bytes);
}
/*
* Unplug the desired number of plugged subblocks of an offline or not-added
* memory block. Will fail if any subblock cannot get unplugged (instead of
......@@ -986,29 +1453,29 @@ static int virtio_mem_mb_unplug_sb(struct virtio_mem *vm, unsigned long mb_id,
*
* Note: can fail after some subblocks were unplugged.
*/
static int virtio_mem_mb_unplug_any_sb(struct virtio_mem *vm,
static int virtio_mem_sbm_unplug_any_sb(struct virtio_mem *vm,
unsigned long mb_id, uint64_t *nb_sb)
{
int sb_id, count;
int rc;
sb_id = vm->nb_sb_per_mb - 1;
sb_id = vm->sbm.sbs_per_mb - 1;
while (*nb_sb) {
/* Find the next candidate subblock */
while (sb_id >= 0 &&
virtio_mem_mb_test_sb_unplugged(vm, mb_id, sb_id, 1))
virtio_mem_sbm_test_sb_unplugged(vm, mb_id, sb_id, 1))
sb_id--;
if (sb_id < 0)
break;
/* Try to unplug multiple subblocks at a time */
count = 1;
while (count < *nb_sb && sb_id > 0 &&
virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id - 1, 1)) {
virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id - 1, 1)) {
count++;
sb_id--;
}
rc = virtio_mem_mb_unplug_sb(vm, mb_id, sb_id, count);
rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count);
if (rc)
return rc;
*nb_sb -= count;
......@@ -1025,63 +1492,50 @@ static int virtio_mem_mb_unplug_any_sb(struct virtio_mem *vm,
*
* Note: can fail after some subblocks were unplugged.
*/
static int virtio_mem_mb_unplug(struct virtio_mem *vm, unsigned long mb_id)
static int virtio_mem_sbm_unplug_mb(struct virtio_mem *vm, unsigned long mb_id)
{
uint64_t nb_sb = vm->nb_sb_per_mb;
uint64_t nb_sb = vm->sbm.sbs_per_mb;
return virtio_mem_mb_unplug_any_sb(vm, mb_id, &nb_sb);
return virtio_mem_sbm_unplug_any_sb(vm, mb_id, &nb_sb);
}
/*
* Prepare tracking data for the next memory block.
*/
static int virtio_mem_prepare_next_mb(struct virtio_mem *vm,
static int virtio_mem_sbm_prepare_next_mb(struct virtio_mem *vm,
unsigned long *mb_id)
{
int rc;
if (vm->next_mb_id > vm->last_usable_mb_id)
if (vm->sbm.next_mb_id > vm->sbm.last_usable_mb_id)
return -ENOSPC;
/* Resize the state array if required. */
rc = virtio_mem_mb_state_prepare_next_mb(vm);
rc = virtio_mem_sbm_mb_states_prepare_next_mb(vm);
if (rc)
return rc;
/* Resize the subblock bitmap if required. */
rc = virtio_mem_sb_bitmap_prepare_next_mb(vm);
rc = virtio_mem_sbm_sb_states_prepare_next_mb(vm);
if (rc)
return rc;
vm->nb_mb_state[VIRTIO_MEM_MB_STATE_UNUSED]++;
*mb_id = vm->next_mb_id++;
vm->sbm.mb_count[VIRTIO_MEM_SBM_MB_UNUSED]++;
*mb_id = vm->sbm.next_mb_id++;
return 0;
}
/*
 * Tell whether the number of offline memory blocks (fully or partially
 * plugged) has reached VIRTIO_MEM_NB_OFFLINE_THRESHOLD. Adding even more
 * blocks that are not onlined yet could make us run OOM.
 */
static bool virtio_mem_too_many_mb_offline(struct virtio_mem *vm)
{
	const unsigned long nr_offline =
		vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL] +
		vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE];

	return !(nr_offline < VIRTIO_MEM_NB_OFFLINE_THRESHOLD);
}
/*
* Try to plug the desired number of subblocks and add the memory block
* to Linux.
*
* Will modify the state of the memory block.
*/
static int virtio_mem_mb_plug_and_add(struct virtio_mem *vm,
unsigned long mb_id,
uint64_t *nb_sb)
static int virtio_mem_sbm_plug_and_add_mb(struct virtio_mem *vm,
unsigned long mb_id, uint64_t *nb_sb)
{
const int count = min_t(int, *nb_sb, vm->nb_sb_per_mb);
int rc, rc2;
const int count = min_t(int, *nb_sb, vm->sbm.sbs_per_mb);
int rc;
if (WARN_ON_ONCE(!count))
return -EINVAL;
......@@ -1090,7 +1544,7 @@ static int virtio_mem_mb_plug_and_add(struct virtio_mem *vm,
* Plug the requested number of subblocks before adding it to linux,
* so that onlining will directly online all plugged subblocks.
*/
rc = virtio_mem_mb_plug_sb(vm, mb_id, 0, count);
rc = virtio_mem_sbm_plug_sb(vm, mb_id, 0, count);
if (rc)
return rc;
......@@ -1098,29 +1552,21 @@ static int virtio_mem_mb_plug_and_add(struct virtio_mem *vm,
* Mark the block properly offline before adding it to Linux,
* so the memory notifiers will find the block in the right state.
*/
if (count == vm->nb_sb_per_mb)
virtio_mem_mb_set_state(vm, mb_id,
VIRTIO_MEM_MB_STATE_OFFLINE);
if (count == vm->sbm.sbs_per_mb)
virtio_mem_sbm_set_mb_state(vm, mb_id,
VIRTIO_MEM_SBM_MB_OFFLINE);
else
virtio_mem_mb_set_state(vm, mb_id,
VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL);
virtio_mem_sbm_set_mb_state(vm, mb_id,
VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL);
/* Add the memory block to linux - if that fails, try to unplug. */
rc = virtio_mem_mb_add(vm, mb_id);
rc = virtio_mem_sbm_add_mb(vm, mb_id);
if (rc) {
enum virtio_mem_mb_state new_state = VIRTIO_MEM_MB_STATE_UNUSED;
int new_state = VIRTIO_MEM_SBM_MB_UNUSED;
dev_err(&vm->vdev->dev,
"adding memory block %lu failed with %d\n", mb_id, rc);
rc2 = virtio_mem_mb_unplug_sb(vm, mb_id, 0, count);
/*
* TODO: Linux MM does not properly clean up yet in all cases
* where adding of memory failed - especially on -ENOMEM.
*/
if (rc2)
new_state = VIRTIO_MEM_MB_STATE_PLUGGED;
virtio_mem_mb_set_state(vm, mb_id, new_state);
if (virtio_mem_sbm_unplug_sb(vm, mb_id, 0, count))
new_state = VIRTIO_MEM_SBM_MB_PLUGGED;
virtio_mem_sbm_set_mb_state(vm, mb_id, new_state);
return rc;
}
......@@ -1136,8 +1582,9 @@ static int virtio_mem_mb_plug_and_add(struct virtio_mem *vm,
*
* Note: Can fail after some subblocks were successfully plugged.
*/
static int virtio_mem_mb_plug_any_sb(struct virtio_mem *vm, unsigned long mb_id,
uint64_t *nb_sb, bool online)
static int virtio_mem_sbm_plug_any_sb(struct virtio_mem *vm,
unsigned long mb_id, uint64_t *nb_sb,
bool online)
{
unsigned long pfn, nr_pages;
int sb_id, count;
......@@ -1147,17 +1594,16 @@ static int virtio_mem_mb_plug_any_sb(struct virtio_mem *vm, unsigned long mb_id,
return -EINVAL;
while (*nb_sb) {
sb_id = virtio_mem_mb_first_unplugged_sb(vm, mb_id);
if (sb_id >= vm->nb_sb_per_mb)
sb_id = virtio_mem_sbm_first_unplugged_sb(vm, mb_id);
if (sb_id >= vm->sbm.sbs_per_mb)
break;
count = 1;
while (count < *nb_sb &&
sb_id + count < vm->nb_sb_per_mb &&
!virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id + count,
1))
sb_id + count < vm->sbm.sbs_per_mb &&
!virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id + count, 1))
count++;
rc = virtio_mem_mb_plug_sb(vm, mb_id, sb_id, count);
rc = virtio_mem_sbm_plug_sb(vm, mb_id, sb_id, count);
if (rc)
return rc;
*nb_sb -= count;
......@@ -1166,29 +1612,26 @@ static int virtio_mem_mb_plug_any_sb(struct virtio_mem *vm, unsigned long mb_id,
/* fake-online the pages if the memory block is online */
pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
sb_id * vm->subblock_size);
nr_pages = PFN_DOWN(count * vm->subblock_size);
sb_id * vm->sbm.sb_size);
nr_pages = PFN_DOWN(count * vm->sbm.sb_size);
virtio_mem_fake_online(pfn, nr_pages);
}
if (virtio_mem_mb_test_sb_plugged(vm, mb_id, 0, vm->nb_sb_per_mb)) {
if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) {
if (online)
virtio_mem_mb_set_state(vm, mb_id,
VIRTIO_MEM_MB_STATE_ONLINE);
virtio_mem_sbm_set_mb_state(vm, mb_id,
VIRTIO_MEM_SBM_MB_ONLINE);
else
virtio_mem_mb_set_state(vm, mb_id,
VIRTIO_MEM_MB_STATE_OFFLINE);
virtio_mem_sbm_set_mb_state(vm, mb_id,
VIRTIO_MEM_SBM_MB_OFFLINE);
}
return 0;
}
/*
* Try to plug the requested amount of memory.
*/
static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff)
static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff)
{
uint64_t nb_sb = diff / vm->subblock_size;
uint64_t nb_sb = diff / vm->sbm.sb_size;
unsigned long mb_id;
int rc;
......@@ -1199,18 +1642,18 @@ static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff)
mutex_lock(&vm->hotplug_mutex);
/* Try to plug subblocks of partially plugged online blocks. */
virtio_mem_for_each_mb_state(vm, mb_id,
VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL) {
rc = virtio_mem_mb_plug_any_sb(vm, mb_id, &nb_sb, true);
virtio_mem_sbm_for_each_mb(vm, mb_id,
VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL) {
rc = virtio_mem_sbm_plug_any_sb(vm, mb_id, &nb_sb, true);
if (rc || !nb_sb)
goto out_unlock;
cond_resched();
}
/* Try to plug subblocks of partially plugged offline blocks. */
virtio_mem_for_each_mb_state(vm, mb_id,
VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL) {
rc = virtio_mem_mb_plug_any_sb(vm, mb_id, &nb_sb, false);
virtio_mem_sbm_for_each_mb(vm, mb_id,
VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) {
rc = virtio_mem_sbm_plug_any_sb(vm, mb_id, &nb_sb, false);
if (rc || !nb_sb)
goto out_unlock;
cond_resched();
......@@ -1223,11 +1666,11 @@ static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff)
mutex_unlock(&vm->hotplug_mutex);
/* Try to plug and add unused blocks */
virtio_mem_for_each_mb_state(vm, mb_id, VIRTIO_MEM_MB_STATE_UNUSED) {
if (virtio_mem_too_many_mb_offline(vm))
virtio_mem_sbm_for_each_mb(vm, mb_id, VIRTIO_MEM_SBM_MB_UNUSED) {
if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes()))
return -ENOSPC;
rc = virtio_mem_mb_plug_and_add(vm, mb_id, &nb_sb);
rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb);
if (rc || !nb_sb)
return rc;
cond_resched();
......@@ -1235,13 +1678,13 @@ static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff)
/* Try to prepare, plug and add new blocks */
while (nb_sb) {
if (virtio_mem_too_many_mb_offline(vm))
if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes()))
return -ENOSPC;
rc = virtio_mem_prepare_next_mb(vm, &mb_id);
rc = virtio_mem_sbm_prepare_next_mb(vm, &mb_id);
if (rc)
return rc;
rc = virtio_mem_mb_plug_and_add(vm, mb_id, &nb_sb);
rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb);
if (rc)
return rc;
cond_resched();
......@@ -1253,6 +1696,112 @@ static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff)
return rc;
}
/*
 * Plug one big block and add it to Linux.
 *
 * The big block has to be in state VIRTIO_MEM_BBM_BB_UNUSED; otherwise a
 * one-time warning is triggered and -EINVAL is returned without touching
 * anything.
 *
 * On success the big block is left in state VIRTIO_MEM_BBM_BB_ADDED and 0 is
 * returned. If plugging fails, the state is not modified. If adding fails,
 * the error of the add operation is returned and the big block ends up
 * either UNUSED (unplug worked) or PLUGGED (unplug failed as well).
 */
static int virtio_mem_bbm_plug_and_add_bb(struct virtio_mem *vm,
					  unsigned long bb_id)
{
	int rc;

	if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) !=
			 VIRTIO_MEM_BBM_BB_UNUSED))
		return -EINVAL;

	rc = virtio_mem_bbm_plug_bb(vm, bb_id);
	if (rc)
		return rc;

	/* The state is switched to ADDED before the block is actually added. */
	virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED);
	rc = virtio_mem_bbm_add_bb(vm, bb_id);
	if (!rc)
		return 0;

	/* Adding failed: try to give the plugged memory back to the device. */
	if (virtio_mem_bbm_unplug_bb(vm, bb_id)) {
		/* Retry from the main loop. */
		virtio_mem_bbm_set_bb_state(vm, bb_id,
					    VIRTIO_MEM_BBM_BB_PLUGGED);
	} else {
		virtio_mem_bbm_set_bb_state(vm, bb_id,
					    VIRTIO_MEM_BBM_BB_UNUSED);
	}
	return rc;
}
/*
 * Set up tracking data for the next big block and hand out its id.
 *
 * Returns -ENOSPC if the next big block id would lie beyond the last usable
 * one, or the error of preparing the big block state array. On success, the
 * new big block is accounted as VIRTIO_MEM_BBM_BB_UNUSED, its id is stored in
 * *bb_id, the next id is advanced and 0 is returned.
 */
static int virtio_mem_bbm_prepare_next_bb(struct virtio_mem *vm,
					  unsigned long *bb_id)
{
	int rc;

	if (vm->bbm.last_usable_bb_id < vm->bbm.next_bb_id)
		return -ENOSPC;

	/* Resize the big block state array if required. */
	rc = virtio_mem_bbm_bb_states_prepare_next_bb(vm);
	if (rc)
		return rc;

	*bb_id = vm->bbm.next_bb_id++;
	vm->bbm.bb_count[VIRTIO_MEM_BBM_BB_UNUSED]++;
	return 0;
}
/*
 * Try to plug the requested amount of memory in big block granularity.
 *
 * @diff is rounded down to whole big blocks. Already tracked, unused big
 * blocks are consumed first; afterwards new big blocks are prepared until the
 * request is satisfied. Returns 0 once enough big blocks were plugged and
 * added (or if @diff is smaller than one big block), -ENOSPC when no further
 * memory may be added right now, or the first error hit while preparing,
 * plugging or adding a big block.
 */
static int virtio_mem_bbm_plug_request(struct virtio_mem *vm, uint64_t diff)
{
	uint64_t remaining = diff / vm->bbm.bb_size;
	unsigned long bb_id;
	int rc;

	if (remaining == 0)
		return 0;

	/* Try to plug and add unused big blocks */
	virtio_mem_bbm_for_each_bb(vm, bb_id, VIRTIO_MEM_BBM_BB_UNUSED) {
		if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size))
			return -ENOSPC;

		rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id);
		if (rc)
			return rc;
		if (--remaining == 0)
			return 0;
		cond_resched();
	}

	/* Try to prepare, plug and add new big blocks */
	while (remaining) {
		if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size))
			return -ENOSPC;

		rc = virtio_mem_bbm_prepare_next_bb(vm, &bb_id);
		if (rc)
			return rc;

		rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id);
		if (rc)
			return rc;

		remaining--;
		cond_resched();
	}

	return 0;
}
/*
 * Try to plug the requested amount of memory, dispatching to the handler
 * matching the mode the device operates in: the subblock mode handler when
 * vm->in_sbm is set, the big block mode handler otherwise.
 */
static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff)
{
	return vm->in_sbm ? virtio_mem_sbm_plug_request(vm, diff) :
			    virtio_mem_bbm_plug_request(vm, diff);
}
/*
* Unplug the desired number of plugged subblocks of an offline memory block.
* Will fail if any subblock cannot get unplugged (instead of skipping it).
......@@ -1262,33 +1811,33 @@ static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff)
*
* Note: Can fail after some subblocks were successfully unplugged.
*/
static int virtio_mem_mb_unplug_any_sb_offline(struct virtio_mem *vm,
static int virtio_mem_sbm_unplug_any_sb_offline(struct virtio_mem *vm,
unsigned long mb_id,
uint64_t *nb_sb)
{
int rc;
rc = virtio_mem_mb_unplug_any_sb(vm, mb_id, nb_sb);
rc = virtio_mem_sbm_unplug_any_sb(vm, mb_id, nb_sb);
/* some subblocks might have been unplugged even on failure */
if (!virtio_mem_mb_test_sb_plugged(vm, mb_id, 0, vm->nb_sb_per_mb))
virtio_mem_mb_set_state(vm, mb_id,
VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL);
if (!virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb))
virtio_mem_sbm_set_mb_state(vm, mb_id,
VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL);
if (rc)
return rc;
if (virtio_mem_mb_test_sb_unplugged(vm, mb_id, 0, vm->nb_sb_per_mb)) {
if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) {
/*
* Remove the block from Linux - this should never fail.
* Hinder the block from getting onlined by marking it
* unplugged. Temporarily drop the mutex, so
* any pending GOING_ONLINE requests can be serviced/rejected.
*/
virtio_mem_mb_set_state(vm, mb_id,
VIRTIO_MEM_MB_STATE_UNUSED);
virtio_mem_sbm_set_mb_state(vm, mb_id,
VIRTIO_MEM_SBM_MB_UNUSED);
mutex_unlock(&vm->hotplug_mutex);
rc = virtio_mem_mb_remove(vm, mb_id);
rc = virtio_mem_sbm_remove_mb(vm, mb_id);
BUG_ON(rc);
mutex_lock(&vm->hotplug_mutex);
}
......@@ -1300,38 +1849,31 @@ static int virtio_mem_mb_unplug_any_sb_offline(struct virtio_mem *vm,
*
* Will modify the state of the memory block.
*/
static int virtio_mem_mb_unplug_sb_online(struct virtio_mem *vm,
static int virtio_mem_sbm_unplug_sb_online(struct virtio_mem *vm,
unsigned long mb_id, int sb_id,
int count)
{
const unsigned long nr_pages = PFN_DOWN(vm->subblock_size) * count;
const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size) * count;
unsigned long start_pfn;
int rc;
start_pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
sb_id * vm->subblock_size);
rc = alloc_contig_range(start_pfn, start_pfn + nr_pages,
MIGRATE_MOVABLE, GFP_KERNEL);
if (rc == -ENOMEM)
/* whoops, out of memory */
return rc;
if (rc)
return -EBUSY;
sb_id * vm->sbm.sb_size);
/* Mark it as fake-offline before unplugging it */
virtio_mem_set_fake_offline(start_pfn, nr_pages, true);
adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
rc = virtio_mem_fake_offline(start_pfn, nr_pages);
if (rc)
return rc;
/* Try to unplug the allocated memory */
rc = virtio_mem_mb_unplug_sb(vm, mb_id, sb_id, count);
rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count);
if (rc) {
/* Return the memory to the buddy. */
virtio_mem_fake_online(start_pfn, nr_pages);
return rc;
}
virtio_mem_mb_set_state(vm, mb_id,
VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL);
virtio_mem_sbm_set_mb_state(vm, mb_id,
VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL);
return 0;
}
......@@ -1345,34 +1887,34 @@ static int virtio_mem_mb_unplug_sb_online(struct virtio_mem *vm,
* Note: Can fail after some subblocks were successfully unplugged. Can
* return 0 even if subblocks were busy and could not get unplugged.
*/
static int virtio_mem_mb_unplug_any_sb_online(struct virtio_mem *vm,
static int virtio_mem_sbm_unplug_any_sb_online(struct virtio_mem *vm,
unsigned long mb_id,
uint64_t *nb_sb)
{
int rc, sb_id;
/* If possible, try to unplug the complete block in one shot. */
if (*nb_sb >= vm->nb_sb_per_mb &&
virtio_mem_mb_test_sb_plugged(vm, mb_id, 0, vm->nb_sb_per_mb)) {
rc = virtio_mem_mb_unplug_sb_online(vm, mb_id, 0,
vm->nb_sb_per_mb);
if (*nb_sb >= vm->sbm.sbs_per_mb &&
virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) {
rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, 0,
vm->sbm.sbs_per_mb);
if (!rc) {
*nb_sb -= vm->nb_sb_per_mb;
*nb_sb -= vm->sbm.sbs_per_mb;
goto unplugged;
} else if (rc != -EBUSY)
return rc;
}
/* Fallback to single subblocks. */
for (sb_id = vm->nb_sb_per_mb - 1; sb_id >= 0 && *nb_sb; sb_id--) {
for (sb_id = vm->sbm.sbs_per_mb - 1; sb_id >= 0 && *nb_sb; sb_id--) {
/* Find the next candidate subblock */
while (sb_id >= 0 &&
!virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id, 1))
!virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
sb_id--;
if (sb_id < 0)
break;
rc = virtio_mem_mb_unplug_sb_online(vm, mb_id, sb_id, 1);
rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, sb_id, 1);
if (rc == -EBUSY)
continue;
else if (rc)
......@@ -1386,24 +1928,21 @@ static int virtio_mem_mb_unplug_any_sb_online(struct virtio_mem *vm,
* remove it. This will usually not fail, as no memory is in use
* anymore - however some other notifiers might NACK the request.
*/
if (virtio_mem_mb_test_sb_unplugged(vm, mb_id, 0, vm->nb_sb_per_mb)) {
if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) {
mutex_unlock(&vm->hotplug_mutex);
rc = virtio_mem_mb_offline_and_remove(vm, mb_id);
rc = virtio_mem_sbm_offline_and_remove_mb(vm, mb_id);
mutex_lock(&vm->hotplug_mutex);
if (!rc)
virtio_mem_mb_set_state(vm, mb_id,
VIRTIO_MEM_MB_STATE_UNUSED);
virtio_mem_sbm_set_mb_state(vm, mb_id,
VIRTIO_MEM_SBM_MB_UNUSED);
}
return 0;
}
/*
* Try to unplug the requested amount of memory.
*/
static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff)
static int virtio_mem_sbm_unplug_request(struct virtio_mem *vm, uint64_t diff)
{
uint64_t nb_sb = diff / vm->subblock_size;
uint64_t nb_sb = diff / vm->sbm.sb_size;
unsigned long mb_id;
int rc;
......@@ -1418,20 +1957,17 @@ static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff)
mutex_lock(&vm->hotplug_mutex);
/* Try to unplug subblocks of partially plugged offline blocks. */
virtio_mem_for_each_mb_state_rev(vm, mb_id,
VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL) {
rc = virtio_mem_mb_unplug_any_sb_offline(vm, mb_id,
&nb_sb);
virtio_mem_sbm_for_each_mb_rev(vm, mb_id,
VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) {
rc = virtio_mem_sbm_unplug_any_sb_offline(vm, mb_id, &nb_sb);
if (rc || !nb_sb)
goto out_unlock;
cond_resched();
}
/* Try to unplug subblocks of plugged offline blocks. */
virtio_mem_for_each_mb_state_rev(vm, mb_id,
VIRTIO_MEM_MB_STATE_OFFLINE) {
rc = virtio_mem_mb_unplug_any_sb_offline(vm, mb_id,
&nb_sb);
virtio_mem_sbm_for_each_mb_rev(vm, mb_id, VIRTIO_MEM_SBM_MB_OFFLINE) {
rc = virtio_mem_sbm_unplug_any_sb_offline(vm, mb_id, &nb_sb);
if (rc || !nb_sb)
goto out_unlock;
cond_resched();
......@@ -1443,10 +1979,9 @@ static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff)
}
/* Try to unplug subblocks of partially plugged online blocks. */
virtio_mem_for_each_mb_state_rev(vm, mb_id,
VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL) {
rc = virtio_mem_mb_unplug_any_sb_online(vm, mb_id,
&nb_sb);
virtio_mem_sbm_for_each_mb_rev(vm, mb_id,
VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL) {
rc = virtio_mem_sbm_unplug_any_sb_online(vm, mb_id, &nb_sb);
if (rc || !nb_sb)
goto out_unlock;
mutex_unlock(&vm->hotplug_mutex);
......@@ -1455,10 +1990,8 @@ static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff)
}
/* Try to unplug subblocks of plugged online blocks. */
virtio_mem_for_each_mb_state_rev(vm, mb_id,
VIRTIO_MEM_MB_STATE_ONLINE) {
rc = virtio_mem_mb_unplug_any_sb_online(vm, mb_id,
&nb_sb);
virtio_mem_sbm_for_each_mb_rev(vm, mb_id, VIRTIO_MEM_SBM_MB_ONLINE) {
rc = virtio_mem_sbm_unplug_any_sb_online(vm, mb_id, &nb_sb);
if (rc || !nb_sb)
goto out_unlock;
mutex_unlock(&vm->hotplug_mutex);
......@@ -1473,20 +2006,212 @@ static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff)
return rc;
}
/*
 * Try to offline and remove a big block from Linux and unplug it. Will fail
 * with -EBUSY if some memory is busy and cannot get unplugged.
 *
 * The big block has to be in state VIRTIO_MEM_BBM_BB_ADDED; otherwise a
 * one-time warning is triggered and -EINVAL is returned.
 *
 * Will modify the state of the big block: UNUSED after a successful unplug,
 * PLUGGED if the block was removed from Linux but the unplug request failed,
 * and back to ADDED if the safe-unplug path had to be rolled back.
 *
 * Takes and releases the hotplug_mutex itself when bbm_safe_unplug is set;
 * the mutex is not held while offlining and removing the big block.
 */
static int virtio_mem_bbm_offline_remove_and_unplug_bb(struct virtio_mem *vm,
unsigned long bb_id)
{
const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size);
/* Not const: shrunk on partial failure so the rollback stops early. */
unsigned long end_pfn = start_pfn + nr_pages;
unsigned long pfn;
struct page *page;
int rc;
if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) !=
VIRTIO_MEM_BBM_BB_ADDED))
return -EINVAL;
if (bbm_safe_unplug) {
/*
 * Start by fake-offlining all memory. Once we marked the device
 * block as fake-offline, all newly onlined memory will
 * automatically be kept fake-offline. Protect from concurrent
 * onlining/offlining until we have a consistent state.
 */
mutex_lock(&vm->hotplug_mutex);
virtio_mem_bbm_set_bb_state(vm, bb_id,
VIRTIO_MEM_BBM_BB_FAKE_OFFLINE);
/* Walk the big block section by section; skip sections not online. */
for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
page = pfn_to_online_page(pfn);
if (!page)
continue;
rc = virtio_mem_fake_offline(pfn, PAGES_PER_SECTION);
if (rc) {
/*
 * Only the sections before this one were fake-offlined;
 * restrict the rollback loop to them. The mutex is still
 * held here and gets released by the rollback path.
 */
end_pfn = pfn;
goto rollback_safe_unplug;
}
}
mutex_unlock(&vm->hotplug_mutex);
}
rc = virtio_mem_bbm_offline_and_remove_bb(vm, bb_id);
if (rc) {
if (bbm_safe_unplug) {
/* The rollback path expects the mutex held and unlocks it. */
mutex_lock(&vm->hotplug_mutex);
goto rollback_safe_unplug;
}
/* Without safe unplug nothing was changed above - just fail. */
return rc;
}
/* The block is gone from Linux; record whether the device took it back. */
rc = virtio_mem_bbm_unplug_bb(vm, bb_id);
if (rc)
virtio_mem_bbm_set_bb_state(vm, bb_id,
VIRTIO_MEM_BBM_BB_PLUGGED);
else
virtio_mem_bbm_set_bb_state(vm, bb_id,
VIRTIO_MEM_BBM_BB_UNUSED);
return rc;
rollback_safe_unplug:
/*
 * Undo the fake-offlining for [start_pfn, end_pfn) - again only for
 * sections that are online - and restore the ADDED state before
 * dropping the mutex. rc still holds the error that got us here.
 */
for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
page = pfn_to_online_page(pfn);
if (!page)
continue;
virtio_mem_fake_online(pfn, PAGES_PER_SECTION);
}
virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED);
mutex_unlock(&vm->hotplug_mutex);
return rc;
}
/*
 * Try to remove a big block from Linux and unplug it.
 *
 * The big block has to be in state VIRTIO_MEM_BBM_BB_ADDED; otherwise a
 * one-time warning is triggered and -EINVAL is returned. Any failure to
 * remove the block from Linux (e.g., because some memory is online) is
 * reported as -EBUSY and leaves the state untouched.
 *
 * Once removed, the state of the big block is modified: UNUSED if the unplug
 * request succeeded (returns 0), PLUGGED if it failed (returns that error).
 */
static int virtio_mem_bbm_remove_and_unplug_bb(struct virtio_mem *vm,
					       unsigned long bb_id)
{
	int rc;

	if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) !=
			 VIRTIO_MEM_BBM_BB_ADDED))
		return -EINVAL;

	if (virtio_mem_bbm_remove_bb(vm, bb_id))
		return -EBUSY;

	rc = virtio_mem_bbm_unplug_bb(vm, bb_id);
	if (!rc) {
		virtio_mem_bbm_set_bb_state(vm, bb_id,
					    VIRTIO_MEM_BBM_BB_UNUSED);
		return 0;
	}

	virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_PLUGGED);
	return rc;
}
/*
 * Check whether a big block is completely offline.
 *
 * The big block is probed in steps of PAGES_PER_SECTION; as soon as one
 * probed pfn has an online page, the block is not considered offline.
 */
static bool virtio_mem_bbm_bb_is_offline(struct virtio_mem *vm,
					 unsigned long bb_id)
{
	unsigned long pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
	const unsigned long end_pfn = pfn + PFN_DOWN(vm->bbm.bb_size);

	while (pfn < end_pfn) {
		if (pfn_to_online_page(pfn))
			return false;
		pfn += PAGES_PER_SECTION;
	}

	return true;
}
/*
 * Try to unplug the requested amount of memory in big block granularity.
 *
 * @diff is rounded down to whole big blocks. Big blocks that are completely
 * offline are handled first. Only if unplug_online is set, online big blocks
 * are offlined, removed and unplugged afterwards. Big blocks reporting -EBUSY
 * are skipped; any other error aborts the request and is returned.
 *
 * Returns 0 if enough big blocks were unplugged, if @diff is smaller than one
 * big block, or if unplug_online is not set and the first pass ran to
 * completion. Returns -EBUSY if big blocks remain to be unplugged after the
 * second pass.
 */
static int virtio_mem_bbm_unplug_request(struct virtio_mem *vm, uint64_t diff)
{
	uint64_t remaining = diff / vm->bbm.bb_size;
	uint64_t bb_id;
	int rc;

	if (remaining == 0)
		return 0;

	/* Try to unplug completely offline big blocks first. */
	virtio_mem_bbm_for_each_bb_rev(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED) {
		cond_resched();

		/*
		 * As we're holding no locks, this check is racy as memory
		 * can get onlined in the meantime - but we'll fail gracefully.
		 */
		if (!virtio_mem_bbm_bb_is_offline(vm, bb_id))
			continue;

		rc = virtio_mem_bbm_remove_and_unplug_bb(vm, bb_id);
		if (rc == -EBUSY)
			continue;
		if (rc)
			return rc;
		if (--remaining == 0)
			return 0;
	}

	if (!unplug_online)
		return 0;

	/* Try to unplug any big blocks. */
	virtio_mem_bbm_for_each_bb_rev(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED) {
		cond_resched();

		rc = virtio_mem_bbm_offline_remove_and_unplug_bb(vm, bb_id);
		if (rc == -EBUSY)
			continue;
		if (rc)
			return rc;
		if (--remaining == 0)
			return 0;
	}

	if (remaining)
		return -EBUSY;
	return 0;
}
/*
 * Try to unplug the requested amount of memory, dispatching to the handler
 * matching the mode the device operates in: the subblock mode handler when
 * vm->in_sbm is set, the big block mode handler otherwise.
 */
static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff)
{
	return vm->in_sbm ? virtio_mem_sbm_unplug_request(vm, diff) :
			    virtio_mem_bbm_unplug_request(vm, diff);
}
/*
* Try to unplug all blocks that couldn't be unplugged before, for example,
* because the hypervisor was busy.
*/
static int virtio_mem_unplug_pending_mb(struct virtio_mem *vm)
{
unsigned long mb_id;
unsigned long id;
int rc;
virtio_mem_for_each_mb_state(vm, mb_id, VIRTIO_MEM_MB_STATE_PLUGGED) {
rc = virtio_mem_mb_unplug(vm, mb_id);
if (!vm->in_sbm) {
virtio_mem_bbm_for_each_bb(vm, id,
VIRTIO_MEM_BBM_BB_PLUGGED) {
rc = virtio_mem_bbm_unplug_bb(vm, id);
if (rc)
return rc;
virtio_mem_bbm_set_bb_state(vm, id,
VIRTIO_MEM_BBM_BB_UNUSED);
}
return 0;
}
virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_PLUGGED) {
rc = virtio_mem_sbm_unplug_mb(vm, id);
if (rc)
return rc;
virtio_mem_mb_set_state(vm, mb_id, VIRTIO_MEM_MB_STATE_UNUSED);
virtio_mem_sbm_set_mb_state(vm, id,
VIRTIO_MEM_SBM_MB_UNUSED);
}
return 0;
......@@ -1511,7 +2236,13 @@ static void virtio_mem_refresh_config(struct virtio_mem *vm)
usable_region_size, &usable_region_size);
end_addr = vm->addr + usable_region_size;
end_addr = min(end_addr, phys_limit);
vm->last_usable_mb_id = virtio_mem_phys_to_mb_id(end_addr) - 1;
if (vm->in_sbm)
vm->sbm.last_usable_mb_id =
virtio_mem_phys_to_mb_id(end_addr) - 1;
else
vm->bbm.last_usable_bb_id =
virtio_mem_phys_to_bb_id(vm, end_addr) - 1;
/* see if there is a request to change the size */
virtio_cread_le(vm->vdev, struct virtio_mem_config, requested_size,
......@@ -1535,6 +2266,7 @@ static void virtio_mem_run_wq(struct work_struct *work)
if (vm->broken)
return;
atomic_set(&vm->wq_active, 1);
retry:
rc = 0;
......@@ -1595,6 +2327,8 @@ static void virtio_mem_run_wq(struct work_struct *work)
"unknown error, marking device broken: %d\n", rc);
vm->broken = true;
}
atomic_set(&vm->wq_active, 0);
}
static enum hrtimer_restart virtio_mem_timer_expired(struct hrtimer *timer)
......@@ -1631,6 +2365,7 @@ static int virtio_mem_init_vq(struct virtio_mem *vm)
static int virtio_mem_init(struct virtio_mem *vm)
{
const uint64_t phys_limit = 1UL << MAX_PHYSMEM_BITS;
uint64_t sb_size, addr;
uint16_t node_id;
if (!vm->vdev->config->get) {
......@@ -1659,15 +2394,9 @@ static int virtio_mem_init(struct virtio_mem *vm)
virtio_cread_le(vm->vdev, struct virtio_mem_config, region_size,
&vm->region_size);
/*
* We always hotplug memory in memory block granularity. This way,
* we have to wait for exactly one memory block to online.
*/
if (vm->device_block_size > memory_block_size_bytes()) {
dev_err(&vm->vdev->dev,
"The block size is not supported (too big).\n");
return -EINVAL;
}
/* Determine the nid for the device based on the lowest address. */
if (vm->nid == NUMA_NO_NODE)
vm->nid = memory_add_physaddr_to_nid(vm->addr);
/* bad device setup - warn only */
if (!IS_ALIGNED(vm->addr, memory_block_size_bytes()))
......@@ -1681,23 +2410,57 @@ static int virtio_mem_init(struct virtio_mem *vm)
"Some memory is not addressable. This can make some memory unusable.\n");
/*
* Calculate the subblock size:
* - At least MAX_ORDER - 1 / pageblock_order.
* - At least the device block size.
* In the worst case, a single subblock per memory block.
*/
vm->subblock_size = PAGE_SIZE * 1ul << max_t(uint32_t, MAX_ORDER - 1,
pageblock_order);
vm->subblock_size = max_t(uint64_t, vm->device_block_size,
vm->subblock_size);
vm->nb_sb_per_mb = memory_block_size_bytes() / vm->subblock_size;
* We want subblocks to span at least MAX_ORDER_NR_PAGES and
* pageblock_nr_pages pages. This:
* - Simplifies our page onlining code (virtio_mem_online_page_cb)
* and fake page onlining code (virtio_mem_fake_online).
* - Is required for now for alloc_contig_range() to work reliably -
* it doesn't properly handle smaller granularity on ZONE_NORMAL.
*/
sb_size = max_t(uint64_t, MAX_ORDER_NR_PAGES,
pageblock_nr_pages) * PAGE_SIZE;
sb_size = max_t(uint64_t, vm->device_block_size, sb_size);
if (sb_size < memory_block_size_bytes() && !force_bbm) {
/* SBM: At least two subblocks per Linux memory block. */
vm->in_sbm = true;
vm->sbm.sb_size = sb_size;
vm->sbm.sbs_per_mb = memory_block_size_bytes() /
vm->sbm.sb_size;
/* Round up to the next full memory block */
vm->first_mb_id = virtio_mem_phys_to_mb_id(vm->addr - 1 +
addr = vm->addr + memory_block_size_bytes() - 1;
vm->sbm.first_mb_id = virtio_mem_phys_to_mb_id(addr);
vm->sbm.next_mb_id = vm->sbm.first_mb_id;
} else {
/* BBM: At least one Linux memory block. */
vm->bbm.bb_size = max_t(uint64_t, vm->device_block_size,
memory_block_size_bytes());
vm->next_mb_id = vm->first_mb_id;
vm->last_mb_id = virtio_mem_phys_to_mb_id(vm->addr +
vm->region_size) - 1;
if (bbm_block_size) {
if (!is_power_of_2(bbm_block_size)) {
dev_warn(&vm->vdev->dev,
"bbm_block_size is not a power of 2");
} else if (bbm_block_size < vm->bbm.bb_size) {
dev_warn(&vm->vdev->dev,
"bbm_block_size is too small");
} else {
vm->bbm.bb_size = bbm_block_size;
}
}
/* Round up to the next aligned big block */
addr = vm->addr + vm->bbm.bb_size - 1;
vm->bbm.first_bb_id = virtio_mem_phys_to_bb_id(vm, addr);
vm->bbm.next_bb_id = vm->bbm.first_bb_id;
}
/* Prepare the offline threshold - make sure we can add two blocks. */
vm->offline_threshold = max_t(uint64_t, 2 * memory_block_size_bytes(),
VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD);
/* In BBM, we also want at least two big blocks. */
vm->offline_threshold = max_t(uint64_t, 2 * vm->bbm.bb_size,
vm->offline_threshold);
dev_info(&vm->vdev->dev, "start address: 0x%llx", vm->addr);
dev_info(&vm->vdev->dev, "region size: 0x%llx", vm->region_size);
......@@ -1705,9 +2468,13 @@ static int virtio_mem_init(struct virtio_mem *vm)
(unsigned long long)vm->device_block_size);
dev_info(&vm->vdev->dev, "memory block size: 0x%lx",
memory_block_size_bytes());
if (vm->in_sbm)
dev_info(&vm->vdev->dev, "subblock size: 0x%llx",
(unsigned long long)vm->subblock_size);
if (vm->nid != NUMA_NO_NODE)
(unsigned long long)vm->sbm.sb_size);
else
dev_info(&vm->vdev->dev, "big block size: 0x%llx",
(unsigned long long)vm->bbm.bb_size);
if (vm->nid != NUMA_NO_NODE && IS_ENABLED(CONFIG_NUMA))
dev_info(&vm->vdev->dev, "nid: %d", vm->nid);
return 0;
......@@ -1753,6 +2520,20 @@ static void virtio_mem_delete_resource(struct virtio_mem *vm)
vm->parent_resource = NULL;
}
/*
 * Callback for walk_iomem_res_desc(): unconditionally report a hit.
 *
 * It is only invoked for resources that already matched the walker's filter,
 * so there is nothing left to inspect. The caller relies on the value 1 being
 * handed back by the walk to detect "at least one match".
 */
static int virtio_mem_range_has_system_ram(struct resource *res, void *arg)
{
	return 1;
}

/*
 * Check whether any busy System RAM resource still overlaps the device
 * region [vm->addr, vm->addr + vm->region_size).
 *
 * Returns true if the resource walk reported a match, false otherwise
 * (including when the walk fails or finds nothing).
 */
static bool virtio_mem_has_memory_added(struct virtio_mem *vm)
{
	const unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
	/*
	 * walk_iomem_res_desc() treats the end address as inclusive. Pass the
	 * last byte of the region, not the first byte behind it; otherwise
	 * System RAM that merely starts right after our region (e.g., another
	 * memory device) would be reported as ours.
	 */
	const uint64_t last_addr = vm->addr + vm->region_size - 1;

	return walk_iomem_res_desc(IORES_DESC_NONE, flags, vm->addr, last_addr,
				   NULL, virtio_mem_range_has_system_ram) == 1;
}
static int virtio_mem_probe(struct virtio_device *vdev)
{
struct virtio_mem *vm;
......@@ -1849,21 +2630,24 @@ static void virtio_mem_remove(struct virtio_device *vdev)
cancel_work_sync(&vm->wq);
hrtimer_cancel(&vm->retry_timer);
if (vm->in_sbm) {
/*
* After we unregistered our callbacks, user space can online partially
* plugged offline blocks. Make sure to remove them.
* After we unregistered our callbacks, user space can online
* partially plugged offline blocks. Make sure to remove them.
*/
virtio_mem_for_each_mb_state(vm, mb_id,
VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL) {
rc = virtio_mem_mb_remove(vm, mb_id);
virtio_mem_sbm_for_each_mb(vm, mb_id,
VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) {
rc = virtio_mem_sbm_remove_mb(vm, mb_id);
BUG_ON(rc);
virtio_mem_mb_set_state(vm, mb_id, VIRTIO_MEM_MB_STATE_UNUSED);
virtio_mem_sbm_set_mb_state(vm, mb_id,
VIRTIO_MEM_SBM_MB_UNUSED);
}
/*
* After we unregistered our callbacks, user space can no longer
* offline partially plugged online memory blocks. No need to worry
* about them.
* offline partially plugged online memory blocks. No need to
* worry about them.
*/
}
/* unregister callbacks */
unregister_virtio_mem_device(vm);
......@@ -1874,10 +2658,7 @@ static void virtio_mem_remove(struct virtio_device *vdev)
* the system. And there is no way to stop the driver/device from going
* away. Warn at least.
*/
if (vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE] ||
vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL] ||
vm->nb_mb_state[VIRTIO_MEM_MB_STATE_ONLINE] ||
vm->nb_mb_state[VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL]) {
if (virtio_mem_has_memory_added(vm)) {
dev_warn(&vdev->dev, "device still has system memory added\n");
} else {
virtio_mem_delete_resource(vm);
......@@ -1885,8 +2666,12 @@ static void virtio_mem_remove(struct virtio_device *vdev)
}
/* remove all tracking data - no locking needed */
vfree(vm->mb_state);
vfree(vm->sb_bitmap);
if (vm->in_sbm) {
vfree(vm->sbm.mb_states);
vfree(vm->sbm.sb_states);
} else {
vfree(vm->bbm.bb_states);
}
/* reset the device and cleanup the queues */
vdev->config->reset(vdev);
......
......@@ -1608,7 +1608,6 @@ static struct virtqueue *vring_create_virtqueue_packed(
vq->num_added = 0;
vq->packed_ring = true;
vq->use_dma_api = vring_use_dma_api(vdev);
list_add_tail(&vq->vq.list, &vdev->vqs);
#ifdef DEBUG
vq->in_use = false;
vq->last_add_time_valid = false;
......@@ -1669,6 +1668,7 @@ static struct virtqueue *vring_create_virtqueue_packed(
cpu_to_le16(vq->packed.event_flags_shadow);
}
list_add_tail(&vq->vq.list, &vdev->vqs);
return &vq->vq;
err_desc_extra:
......@@ -1676,9 +1676,9 @@ static struct virtqueue *vring_create_virtqueue_packed(
err_desc_state:
kfree(vq);
err_vq:
vring_free_queue(vdev, event_size_in_bytes, device, ring_dma_addr);
vring_free_queue(vdev, event_size_in_bytes, device, device_event_dma_addr);
err_device:
vring_free_queue(vdev, event_size_in_bytes, driver, ring_dma_addr);
vring_free_queue(vdev, event_size_in_bytes, driver, driver_event_dma_addr);
err_driver:
vring_free_queue(vdev, ring_size_in_bytes, ring, ring_dma_addr);
err_ring:
......@@ -2085,7 +2085,6 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
vq->last_used_idx = 0;
vq->num_added = 0;
vq->use_dma_api = vring_use_dma_api(vdev);
list_add_tail(&vq->vq.list, &vdev->vqs);
#ifdef DEBUG
vq->in_use = false;
vq->last_add_time_valid = false;
......@@ -2127,6 +2126,7 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
memset(vq->split.desc_state, 0, vring.num *
sizeof(struct vring_desc_state_split));
list_add_tail(&vq->vq.list, &vdev->vqs);
return &vq->vq;
}
EXPORT_SYMBOL_GPL(__vring_new_virtqueue);
......
......@@ -42,6 +42,7 @@ struct vdpa_vq_state {
* @config: the configuration ops for this device.
* @index: device index
* @features_valid: were features initialized? for legacy guests
* @nvqs: maximum number of supported virtqueues
*/
struct vdpa_device {
struct device dev;
......
......@@ -34,15 +34,21 @@
#define VIRTIO_ID_CONSOLE 3 /* virtio console */
#define VIRTIO_ID_RNG 4 /* virtio rng */
#define VIRTIO_ID_BALLOON 5 /* virtio balloon */
#define VIRTIO_ID_IOMEM 6 /* virtio ioMemory */
#define VIRTIO_ID_RPMSG 7 /* virtio remote processor messaging */
#define VIRTIO_ID_SCSI 8 /* virtio scsi */
#define VIRTIO_ID_9P 9 /* 9p virtio console */
#define VIRTIO_ID_MAC80211_WLAN 10 /* virtio WLAN MAC */
#define VIRTIO_ID_RPROC_SERIAL 11 /* virtio remoteproc serial link */
#define VIRTIO_ID_CAIF 12 /* Virtio caif */
#define VIRTIO_ID_MEMORY_BALLOON 13 /* virtio memory balloon */
#define VIRTIO_ID_GPU 16 /* virtio GPU */
#define VIRTIO_ID_CLOCK 17 /* virtio clock/timer */
#define VIRTIO_ID_INPUT 18 /* virtio input */
#define VIRTIO_ID_VSOCK 19 /* virtio vsock transport */
#define VIRTIO_ID_CRYPTO 20 /* virtio crypto */
#define VIRTIO_ID_SIGNAL_DIST 21 /* virtio signal distribution device */
#define VIRTIO_ID_PSTORE 22 /* virtio pstore device */
#define VIRTIO_ID_IOMMU 23 /* virtio IOMMU */
#define VIRTIO_ID_MEM 24 /* virtio mem */
#define VIRTIO_ID_FS 26 /* virtio filesystem */
......
......@@ -1784,39 +1784,112 @@ int remove_memory(int nid, u64 start, u64 size)
}
EXPORT_SYMBOL_GPL(remove_memory);
/*
 * walk_memory_blocks() callback: try to offline one memory block and record
 * how it was onlined before.
 *
 * @mem: the memory block to offline
 * @arg: uint8_t ** cursor into the caller's array of online types; the slot
 *       it points at belongs to @mem and the cursor is advanced by one entry
 *       on every call
 *
 * Returns the negative error from device_offline(), or 0 if the block was
 * offlined or had already been offline.
 */
static int try_offline_memory_block(struct memory_block *mem, void *arg)
{
	uint8_t **cursor = arg;
	uint8_t *slot = *cursor;
	uint8_t prev_type;
	struct page *first_page;
	int ret;

	/*
	 * Derive the previous online type from the zone of the block's first
	 * page. Blocks spanning multiple zones are rejected by the offlining
	 * code anyway, so looking at a single page is good enough.
	 */
	first_page = pfn_to_online_page(section_nr_to_pfn(mem->start_section_nr));
	if (first_page && zone_idx(page_zone(first_page)) == ZONE_MOVABLE)
		prev_type = MMOP_ONLINE_MOVABLE;
	else
		prev_type = MMOP_ONLINE_KERNEL;

	ret = device_offline(&mem->dev);

	/*
	 * The slot keeps its MMOP_OFFLINE default unless we really offlined
	 * the block, so try_reonline_memory_block() only touches blocks that
	 * were changed here.
	 */
	if (ret == 0)
		*slot = prev_type;

	/* Move on to the slot of the next memory block. */
	*cursor = slot + 1;

	/* A positive value means "already offline" - not an error. */
	if (ret < 0)
		return ret;
	return 0;
}
/*
 * walk_memory_blocks() callback: undo try_offline_memory_block() for one
 * memory block.
 *
 * @mem: the memory block to bring back online
 * @arg: uint8_t ** cursor into the array of recorded online types; advanced
 *       by one entry on every call
 *
 * Blocks whose recorded type is MMOP_OFFLINE are left alone. A failure to
 * re-online is only logged. Always returns 0 so the walk visits every
 * remaining block.
 */
static int try_reonline_memory_block(struct memory_block *mem, void *arg)
{
	uint8_t **cursor = arg;
	const uint8_t recorded_type = **cursor;
	int ret;

	/* Consume this block's slot up front; every path below returns 0. */
	(*cursor)++;

	if (recorded_type == MMOP_OFFLINE)
		return 0;

	mem->online_type = recorded_type;
	ret = device_online(&mem->dev);
	if (ret < 0)
		pr_warn("%s: Failed to re-online memory: %d",
			__func__, ret);

	return 0;
}
/*
* Try to offline and remove a memory block. Might take a long time to
* finish in case memory is still in use. Primarily useful for memory devices
* that logically unplugged all memory (so it's no longer in use) and want to
* offline + remove the memory block.
* Try to offline and remove memory. Might take a long time to finish in case
* memory is still in use. Primarily useful for memory devices that logically
* unplugged all memory (so it's no longer in use) and want to offline + remove
* that memory.
*/
int offline_and_remove_memory(int nid, u64 start, u64 size)
{
struct memory_block *mem;
int rc = -EINVAL;
const unsigned long mb_count = size / memory_block_size_bytes();
uint8_t *online_types, *tmp;
int rc;
if (!IS_ALIGNED(start, memory_block_size_bytes()) ||
size != memory_block_size_bytes())
return rc;
!IS_ALIGNED(size, memory_block_size_bytes()) || !size)
return -EINVAL;
/*
* We'll remember the old online type of each memory block, so we can
* try to revert whatever we did when offlining one memory block fails
* after offlining some others succeeded.
*/
online_types = kmalloc_array(mb_count, sizeof(*online_types),
GFP_KERNEL);
if (!online_types)
return -ENOMEM;
/*
* Initialize all states to MMOP_OFFLINE, so when we abort processing in
* try_offline_memory_block(), we'll skip all unprocessed blocks in
* try_reonline_memory_block().
*/
memset(online_types, MMOP_OFFLINE, mb_count);
lock_device_hotplug();
mem = find_memory_block(__pfn_to_section(PFN_DOWN(start)));
if (mem)
rc = device_offline(&mem->dev);
/* Ignore if the device is already offline. */
if (rc > 0)
rc = 0;
tmp = online_types;
rc = walk_memory_blocks(start, size, &tmp, try_offline_memory_block);
/*
* In case we succeeded to offline the memory block, remove it.
* In case we succeeded to offline all memory, remove it.
* This cannot fail as it cannot get onlined in the meantime.
*/
if (!rc) {
rc = try_remove_memory(nid, start, size);
WARN_ON_ONCE(rc);
if (rc)
pr_err("%s: Failed to remove memory: %d", __func__, rc);
}
/*
* Rollback what we did. While memory onlining might theoretically fail
* (nacked by a notifier), it barely ever happens.
*/
if (rc) {
tmp = online_types;
walk_memory_blocks(start, size, &tmp,
try_reonline_memory_block);
}
unlock_device_hotplug();
kfree(online_types);
return rc;
}
EXPORT_SYMBOL_GPL(offline_and_remove_memory);
......
......@@ -16,6 +16,16 @@
# define mb() abort()
# define dma_rmb() abort()
# define dma_wmb() abort()
#elif defined(__aarch64__)
#define dmb(opt) asm volatile("dmb " #opt : : : "memory")
#define virt_mb() __sync_synchronize()
#define virt_rmb() dmb(ishld)
#define virt_wmb() dmb(ishst)
#define virt_store_mb(var, value) do { WRITE_ONCE(var, value); dmb(ish); } while (0)
/* Weak barriers should be used. If not - it's a bug */
# define mb() abort()
# define dma_rmb() abort()
# define dma_wmb() abort()
#else
#error Please fill in barrier macros
#endif
......
......@@ -2,6 +2,8 @@
#ifndef BUG_H
#define BUG_H
#include <asm/bug.h>
#define BUG_ON(__BUG_ON_cond) assert(!(__BUG_ON_cond))
#define BUILD_BUG_ON(x)
......
......@@ -11,6 +11,7 @@
#include <linux/compiler.h>
#include <linux/types.h>
#include <linux/overflow.h>
#include <linux/list.h>
#include <linux/printk.h>
#include <linux/bug.h>
......@@ -117,6 +118,16 @@ static inline void free_page(unsigned long addr)
# define unlikely(x) (__builtin_expect(!!(x), 0))
# endif
/*
 * Resize an array allocation to @new_n elements of @new_size bytes each.
 *
 * Returns NULL without calling krealloc() if new_n * new_size overflows
 * size_t; otherwise returns whatever krealloc() returns for the byte count.
 */
static inline void *krealloc_array(void *p, size_t new_n, size_t new_size, gfp_t gfp)
{
	size_t total;

	return unlikely(check_mul_overflow(new_n, new_size, &total)) ?
		NULL : krealloc(p, total, gfp);
}
#define pr_err(format, ...) fprintf (stderr, format, ## __VA_ARGS__)
#ifdef DEBUG
#define pr_debug(format, ...) fprintf (stderr, format, ## __VA_ARGS__)
......@@ -126,8 +137,6 @@ static inline void free_page(unsigned long addr)
#define dev_err(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__)
#define dev_warn(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__)
#define WARN_ON_ONCE(cond) (unlikely(cond) ? fprintf (stderr, "WARNING\n") : 0)
#define min(x, y) ({ \
typeof(x) _min1 = (x); \
typeof(y) _min2 = (y); \
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment