Commit 02df55d2 authored by Arnd Bergmann's avatar Arnd Bergmann Committed by David S. Miller

macvtap: rework object lifetime rules

This reworks the change done by the previous patch
in a more complete way.

The original macvtap code has a number of problems
resulting from the use of RCU for protecting the
access to struct macvtap_queue from open files.

This includes
- need for GFP_ATOMIC allocations for skbs
- potential deadlocks when copy_*_user sleeps
- inability to work with vhost-net

Changing the lifetime of macvtap_queue to always
depend on the open file solves all these. The
RCU reference simply moves one step down to
the reference on the macvlan_dev, which we
only need for nonblocking operations.
Signed-off-by: default avatarArnd Bergmann <arnd@arndb.de>
Acked-by: default avatarSridhar Samudrala <sri@us.ibm.com>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parent 37ee3d5b
...@@ -60,30 +60,19 @@ static struct cdev macvtap_cdev; ...@@ -60,30 +60,19 @@ static struct cdev macvtap_cdev;
/* /*
* RCU usage: * RCU usage:
* The macvtap_queue is referenced both from the chardev struct file * The macvtap_queue and the macvlan_dev are loosely coupled, the
* and from the struct macvlan_dev using rcu_read_lock. * pointers from one to the other can only be read while rcu_read_lock
* or macvtap_lock is held.
* *
* We never actually update the contents of a macvtap_queue atomically * Both the file and the macvlan_dev hold a reference on the macvtap_queue
* with RCU but it is used for race-free destruction of a queue when * through sock_hold(&q->sk). When the macvlan_dev goes away first,
* either the file or the macvlan_dev goes away. Pointers back to * q->vlan becomes inaccessible. When the files gets closed,
* the dev and the file are implicitly valid as long as the queue * macvtap_get_queue() fails.
* exists.
* *
* The callbacks from macvlan are always done with rcu_read_lock held * There may still be references to the struct sock inside of the
* already. For calls from file_operations, we use the rcu_read_lock_bh * queue from outbound SKBs, but these never reference back to the
* to get a reference count on the socket and the device. * file or the dev. The data structure is freed through __sk_free
* * when both our references and any pending SKBs are gone.
* When destroying a queue, we remove the pointers from the file and
* from the dev and then synchronize_rcu to make sure no thread is
* still using the queue. There may still be references to the struct
* sock inside of the queue from outbound SKBs, but these never
* reference back to the file or the dev. The data structure is freed
* through __sk_free when both our references and any pending SKBs
* are gone.
*
* macvtap_lock is only used to prevent multiple concurrent open()
* calls to assign a new vlan->tap pointer. It could be moved into
* the macvlan_dev itself but is extremely rarely used.
*/ */
static DEFINE_SPINLOCK(macvtap_lock); static DEFINE_SPINLOCK(macvtap_lock);
...@@ -101,11 +90,12 @@ static int macvtap_set_queue(struct net_device *dev, struct file *file, ...@@ -101,11 +90,12 @@ static int macvtap_set_queue(struct net_device *dev, struct file *file,
goto out; goto out;
err = 0; err = 0;
q->vlan = vlan; rcu_assign_pointer(q->vlan, vlan);
rcu_assign_pointer(vlan->tap, q); rcu_assign_pointer(vlan->tap, q);
sock_hold(&q->sk);
q->file = file; q->file = file;
rcu_assign_pointer(file->private_data, q); file->private_data = q;
out: out:
spin_unlock(&macvtap_lock); spin_unlock(&macvtap_lock);
...@@ -113,28 +103,25 @@ static int macvtap_set_queue(struct net_device *dev, struct file *file, ...@@ -113,28 +103,25 @@ static int macvtap_set_queue(struct net_device *dev, struct file *file,
} }
/* /*
* We must destroy each queue exactly once, when either * The file owning the queue got closed, give up both
* the netdev or the file go away. * the reference that the files holds as well as the
* one from the macvlan_dev if that still exists.
* *
* Using the spinlock makes sure that we don't get * Using the spinlock makes sure that we don't get
* to the queue again after destroying it. * to the queue again after destroying it.
*
* synchronize_rcu serializes with the packet flow
* that uses rcu_read_lock.
*/ */
static void macvtap_del_queue(struct macvtap_queue **qp) static void macvtap_put_queue(struct macvtap_queue *q)
{ {
struct macvtap_queue *q; struct macvlan_dev *vlan;
spin_lock(&macvtap_lock); spin_lock(&macvtap_lock);
q = rcu_dereference(*qp); vlan = rcu_dereference(q->vlan);
if (!q) { if (vlan) {
spin_unlock(&macvtap_lock); rcu_assign_pointer(vlan->tap, NULL);
return; rcu_assign_pointer(q->vlan, NULL);
sock_put(&q->sk);
} }
rcu_assign_pointer(q->vlan->tap, NULL);
rcu_assign_pointer(q->file->private_data, NULL);
spin_unlock(&macvtap_lock); spin_unlock(&macvtap_lock);
synchronize_rcu(); synchronize_rcu();
...@@ -152,29 +139,29 @@ static struct macvtap_queue *macvtap_get_queue(struct net_device *dev, ...@@ -152,29 +139,29 @@ static struct macvtap_queue *macvtap_get_queue(struct net_device *dev,
return rcu_dereference(vlan->tap); return rcu_dereference(vlan->tap);
} }
/*
* The net_device is going away, give up the reference
* that it holds on the queue (all the queues one day)
* and safely set the pointer from the queues to NULL.
*/
static void macvtap_del_queues(struct net_device *dev) static void macvtap_del_queues(struct net_device *dev)
{ {
struct macvlan_dev *vlan = netdev_priv(dev); struct macvlan_dev *vlan = netdev_priv(dev);
macvtap_del_queue(&vlan->tap);
}
static inline struct macvtap_queue *macvtap_file_get_queue(struct file *file)
{
struct macvtap_queue *q; struct macvtap_queue *q;
rcu_read_lock_bh();
q = rcu_dereference(file->private_data); spin_lock(&macvtap_lock);
if (q) { q = rcu_dereference(vlan->tap);
sock_hold(&q->sk); if (!q) {
dev_hold(q->vlan->dev); spin_unlock(&macvtap_lock);
return;
} }
rcu_read_unlock_bh();
return q;
}
static inline void macvtap_file_put_queue(struct macvtap_queue *q) rcu_assign_pointer(vlan->tap, NULL);
{ rcu_assign_pointer(q->vlan, NULL);
spin_unlock(&macvtap_lock);
synchronize_rcu();
sock_put(&q->sk); sock_put(&q->sk);
dev_put(q->vlan->dev);
} }
/* /*
...@@ -284,7 +271,6 @@ static int macvtap_open(struct inode *inode, struct file *file) ...@@ -284,7 +271,6 @@ static int macvtap_open(struct inode *inode, struct file *file)
q->sock.type = SOCK_RAW; q->sock.type = SOCK_RAW;
q->sock.state = SS_CONNECTED; q->sock.state = SS_CONNECTED;
sock_init_data(&q->sock, &q->sk); sock_init_data(&q->sock, &q->sk);
q->sk.sk_allocation = GFP_ATOMIC; /* for now */
q->sk.sk_write_space = macvtap_sock_write_space; q->sk.sk_write_space = macvtap_sock_write_space;
err = macvtap_set_queue(dev, file, q); err = macvtap_set_queue(dev, file, q);
...@@ -300,13 +286,14 @@ static int macvtap_open(struct inode *inode, struct file *file) ...@@ -300,13 +286,14 @@ static int macvtap_open(struct inode *inode, struct file *file)
static int macvtap_release(struct inode *inode, struct file *file) static int macvtap_release(struct inode *inode, struct file *file)
{ {
macvtap_del_queue((struct macvtap_queue **)&file->private_data); struct macvtap_queue *q = file->private_data;
macvtap_put_queue(q);
return 0; return 0;
} }
static unsigned int macvtap_poll(struct file *file, poll_table * wait) static unsigned int macvtap_poll(struct file *file, poll_table * wait)
{ {
struct macvtap_queue *q = macvtap_file_get_queue(file); struct macvtap_queue *q = file->private_data;
unsigned int mask = POLLERR; unsigned int mask = POLLERR;
if (!q) if (!q)
...@@ -323,7 +310,6 @@ static unsigned int macvtap_poll(struct file *file, poll_table * wait) ...@@ -323,7 +310,6 @@ static unsigned int macvtap_poll(struct file *file, poll_table * wait)
sock_writeable(&q->sk))) sock_writeable(&q->sk)))
mask |= POLLOUT | POLLWRNORM; mask |= POLLOUT | POLLWRNORM;
macvtap_file_put_queue(q);
out: out:
return mask; return mask;
} }
...@@ -334,6 +320,7 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, ...@@ -334,6 +320,7 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q,
int noblock) int noblock)
{ {
struct sk_buff *skb; struct sk_buff *skb;
struct macvlan_dev *vlan;
size_t len = count; size_t len = count;
int err; int err;
...@@ -341,26 +328,37 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, ...@@ -341,26 +328,37 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q,
return -EINVAL; return -EINVAL;
skb = sock_alloc_send_skb(&q->sk, NET_IP_ALIGN + len, noblock, &err); skb = sock_alloc_send_skb(&q->sk, NET_IP_ALIGN + len, noblock, &err);
if (!skb)
if (!skb) { goto err;
macvlan_count_rx(q->vlan, 0, false, false);
return err;
}
skb_reserve(skb, NET_IP_ALIGN); skb_reserve(skb, NET_IP_ALIGN);
skb_put(skb, count); skb_put(skb, count);
if (skb_copy_datagram_from_iovec(skb, 0, iv, 0, len)) { err = skb_copy_datagram_from_iovec(skb, 0, iv, 0, len);
macvlan_count_rx(q->vlan, 0, false, false); if (err)
kfree_skb(skb); goto err;
return -EFAULT;
}
skb_set_network_header(skb, ETH_HLEN); skb_set_network_header(skb, ETH_HLEN);
rcu_read_lock_bh();
macvlan_start_xmit(skb, q->vlan->dev); vlan = rcu_dereference(q->vlan);
if (vlan)
macvlan_start_xmit(skb, vlan->dev);
else
kfree_skb(skb);
rcu_read_unlock_bh();
return count; return count;
err:
rcu_read_lock_bh();
vlan = rcu_dereference(q->vlan);
if (vlan)
macvlan_count_rx(q->vlan, 0, false, false);
rcu_read_unlock_bh();
kfree_skb(skb);
return err;
} }
static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv, static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv,
...@@ -368,15 +366,10 @@ static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv, ...@@ -368,15 +366,10 @@ static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv,
{ {
struct file *file = iocb->ki_filp; struct file *file = iocb->ki_filp;
ssize_t result = -ENOLINK; ssize_t result = -ENOLINK;
struct macvtap_queue *q = macvtap_file_get_queue(file); struct macvtap_queue *q = file->private_data;
if (!q)
goto out;
result = macvtap_get_user(q, iv, iov_length(iv, count), result = macvtap_get_user(q, iv, iov_length(iv, count),
file->f_flags & O_NONBLOCK); file->f_flags & O_NONBLOCK);
macvtap_file_put_queue(q);
out:
return result; return result;
} }
...@@ -385,14 +378,17 @@ static ssize_t macvtap_put_user(struct macvtap_queue *q, ...@@ -385,14 +378,17 @@ static ssize_t macvtap_put_user(struct macvtap_queue *q,
const struct sk_buff *skb, const struct sk_buff *skb,
const struct iovec *iv, int len) const struct iovec *iv, int len)
{ {
struct macvlan_dev *vlan = q->vlan; struct macvlan_dev *vlan;
int ret; int ret;
len = min_t(int, skb->len, len); len = min_t(int, skb->len, len);
ret = skb_copy_datagram_const_iovec(skb, 0, iv, 0, len); ret = skb_copy_datagram_const_iovec(skb, 0, iv, 0, len);
rcu_read_lock_bh();
vlan = rcu_dereference(q->vlan);
macvlan_count_rx(vlan, len, ret == 0, 0); macvlan_count_rx(vlan, len, ret == 0, 0);
rcu_read_unlock_bh();
return ret ? ret : len; return ret ? ret : len;
} }
...@@ -401,14 +397,16 @@ static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv, ...@@ -401,14 +397,16 @@ static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv,
unsigned long count, loff_t pos) unsigned long count, loff_t pos)
{ {
struct file *file = iocb->ki_filp; struct file *file = iocb->ki_filp;
struct macvtap_queue *q = macvtap_file_get_queue(file); struct macvtap_queue *q = file->private_data;
DECLARE_WAITQUEUE(wait, current); DECLARE_WAITQUEUE(wait, current);
struct sk_buff *skb; struct sk_buff *skb;
ssize_t len, ret = 0; ssize_t len, ret = 0;
if (!q) if (!q) {
return -ENOLINK; ret = -ENOLINK;
goto out;
}
len = iov_length(iv, count); len = iov_length(iv, count);
if (len < 0) { if (len < 0) {
...@@ -444,7 +442,6 @@ static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv, ...@@ -444,7 +442,6 @@ static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv,
remove_wait_queue(q->sk.sk_sleep, &wait); remove_wait_queue(q->sk.sk_sleep, &wait);
out: out:
macvtap_file_put_queue(q);
return ret; return ret;
} }
...@@ -454,12 +451,13 @@ static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv, ...@@ -454,12 +451,13 @@ static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv,
static long macvtap_ioctl(struct file *file, unsigned int cmd, static long macvtap_ioctl(struct file *file, unsigned int cmd,
unsigned long arg) unsigned long arg)
{ {
struct macvtap_queue *q; struct macvtap_queue *q = file->private_data;
struct macvlan_dev *vlan;
void __user *argp = (void __user *)arg; void __user *argp = (void __user *)arg;
struct ifreq __user *ifr = argp; struct ifreq __user *ifr = argp;
unsigned int __user *up = argp; unsigned int __user *up = argp;
unsigned int u; unsigned int u;
char devname[IFNAMSIZ]; int ret;
switch (cmd) { switch (cmd) {
case TUNSETIFF: case TUNSETIFF:
...@@ -471,16 +469,21 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd, ...@@ -471,16 +469,21 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd,
return 0; return 0;
case TUNGETIFF: case TUNGETIFF:
q = macvtap_file_get_queue(file); rcu_read_lock_bh();
if (!q) vlan = rcu_dereference(q->vlan);
if (vlan)
dev_hold(vlan->dev);
rcu_read_unlock_bh();
if (!vlan)
return -ENOLINK; return -ENOLINK;
memcpy(devname, q->vlan->dev->name, sizeof(devname));
macvtap_file_put_queue(q);
ret = 0;
if (copy_to_user(&ifr->ifr_name, q->vlan->dev->name, IFNAMSIZ) || if (copy_to_user(&ifr->ifr_name, q->vlan->dev->name, IFNAMSIZ) ||
put_user((TUN_TAP_DEV | TUN_NO_PI), &ifr->ifr_flags)) put_user((TUN_TAP_DEV | TUN_NO_PI), &ifr->ifr_flags))
return -EFAULT; ret = -EFAULT;
return 0; dev_put(vlan->dev);
return ret;
case TUNGETFEATURES: case TUNGETFEATURES:
if (put_user((IFF_TAP | IFF_NO_PI), up)) if (put_user((IFF_TAP | IFF_NO_PI), up))
...@@ -491,11 +494,7 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd, ...@@ -491,11 +494,7 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd,
if (get_user(u, up)) if (get_user(u, up))
return -EFAULT; return -EFAULT;
q = macvtap_file_get_queue(file);
if (!q)
return -ENOLINK;
q->sk.sk_sndbuf = u; q->sk.sk_sndbuf = u;
macvtap_file_put_queue(q);
return 0; return 0;
case TUNSETOFFLOAD: case TUNSETOFFLOAD:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment