// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
 */

/* A devmap's primary use is as a backend map for the XDP BPF helper call
 * bpf_redirect_map(). Because XDP is mostly concerned with performance we
 * spent some effort to ensure the datapath with redirect maps does not use
 * any locking. This is a quick note on the details.
 *
 * We have three possible paths to get into the devmap control plane: bpf
 * syscalls, bpf programs, and driver side xmit/flush operations. A bpf syscall
 * will invoke an update, delete, or lookup operation. To ensure updates and
 * deletes appear atomic from the datapath side, xchg() is used to modify the
 * netdev_map array. Then, because the datapath does a lookup into the netdev_map
 * array (read-only) from an RCU critical section, we use call_rcu() to wait for
 * an rcu grace period before free'ing the old data structures. This ensures the
 * datapath always has a valid copy. However, the datapath does a "flush"
 * operation that pushes any pending packets in the driver outside the RCU
 * critical section. Each bpf_dtab_netdev tracks these pending operations using
 * a per-cpu flush list. The bpf_dtab_netdev object will not be destroyed until
 * this list is empty, indicating outstanding flush operations have completed.
 *
 * BPF syscalls may race with BPF program calls on any of the update, delete
 * or lookup operations. As noted above, the xchg() operation also keeps the
 * netdev_map consistent in this case. From the devmap side, BPF programs
 * calling into these operations are the same as multiple user space threads
 * making system calls.
 *
 * Finally, any of the above may race with a netdev_unregister notifier. The
 * unregister notifier must search for net devices in the map structure that
 * contain a reference to the net device and remove them. This is a two step
 * process: (a) dereference the bpf_dtab_netdev object in netdev_map and (b)
 * check to see if the ifindex is the same as the net_device being removed.
 * When removing the dev a cmpxchg() is used to ensure the correct dev is
 * removed; in the case of a concurrent update or delete operation it is
 * possible that the initially referenced dev is no longer in the map. As the
 * notifier hook walks the map we know that new dev references cannot be
 * added by the user because core infrastructure ensures dev_get_by_index()
 * calls will fail at this point.
 *
 * The devmap_hash type is a map type which interprets keys as ifindexes and
 * indexes these using a hashmap. This allows maps that use ifindex as key to be
 * densely packed instead of having holes in the lookup array for unused
 * ifindexes. The setup and packet enqueue/send code is shared between the two
 * types of devmap; only the lookup and insertion are different.
 */
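
/* For orientation, a minimal sketch of how this map type is used from the BPF
 * program side (not part of this file; the map name "tx_ports" and the fixed
 * redirect to slot 0 are illustrative only, and the program needs linux/bpf.h
 * plus bpf/bpf_helpers.h to build):
 *
 *	struct {
 *		__uint(type, BPF_MAP_TYPE_DEVMAP);
 *		__uint(key_size, sizeof(__u32));
 *		__uint(value_size, sizeof(__u32));
 *		__uint(max_entries, 64);
 *	} tx_ports SEC(".maps");
 *
 *	SEC("xdp")
 *	int xdp_redirect_example(struct xdp_md *ctx)
 *	{
 *		// Slot 0 must have been populated with a target ifindex from
 *		// user space before this returns XDP_REDIRECT.
 *		return bpf_redirect_map(&tx_ports, 0, 0);
 *	}
 */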
#include <linux/bpf.h>
#include <net/xdp.h>
#include <linux/filter.h>
#include <trace/events/xdp.h>

#define DEV_CREATE_FLAG_MASK \
	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)

#define DEV_MAP_BULK_SIZE 16
struct xdp_dev_bulk_queue {
	struct xdp_frame *q[DEV_MAP_BULK_SIZE];
	struct list_head flush_node;
	struct net_device *dev;
	struct net_device *dev_rx;
	unsigned int count;
};

struct bpf_dtab_netdev {
	struct net_device *dev; /* must be first member, due to tracepoint */
	struct hlist_node index_hlist;
	struct bpf_dtab *dtab;
	struct rcu_head rcu;
	unsigned int idx;
};

struct bpf_dtab {
	struct bpf_map map;
	struct bpf_dtab_netdev **netdev_map; /* DEVMAP type only */
	struct list_head list;

	/* these are only used for DEVMAP_HASH type maps */
	struct hlist_head *dev_index_head;
	spinlock_t index_lock;
	unsigned int items;
	u32 n_buckets;
};

static DEFINE_PER_CPU(struct list_head, dev_flush_list);
static DEFINE_SPINLOCK(dev_map_lock);
static LIST_HEAD(dev_map_list);

static struct hlist_head *dev_map_create_hash(unsigned int entries)
{
	int i;
	struct hlist_head *hash;

	hash = kmalloc_array(entries, sizeof(*hash), GFP_KERNEL);
	if (hash != NULL)
		for (i = 0; i < entries; i++)
			INIT_HLIST_HEAD(&hash[i]);

	return hash;
}

static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,
						    int idx)
{
	return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)];
}

static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
{
	u64 cost = 0;
	int err;

	/* check sanity of attributes */
	if (attr->max_entries == 0 || attr->key_size != 4 ||
	    attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
		return -EINVAL;

	/* Lookup returns a pointer straight to dev->ifindex, so make sure the
	 * verifier prevents writes from the BPF side
	 */
	attr->map_flags |= BPF_F_RDONLY_PROG;

	bpf_map_init_from_attr(&dtab->map, attr);

	if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
		dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries);

		if (!dtab->n_buckets) /* Overflow check */
			return -EINVAL;
		cost += (u64) sizeof(struct hlist_head) * dtab->n_buckets;
	} else {
		cost += (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
	}

	/* if map size is larger than memlock limit, reject it */
	err = bpf_map_charge_init(&dtab->map.memory, cost);
	if (err)
		return -EINVAL;

	if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
		dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets);
		if (!dtab->dev_index_head)
			goto free_charge;

		spin_lock_init(&dtab->index_lock);
	} else {
		dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
						      sizeof(struct bpf_dtab_netdev *),
						      dtab->map.numa_node);
		if (!dtab->netdev_map)
			goto free_charge;
	}

	return 0;

free_charge:
	bpf_map_charge_finish(&dtab->map.memory);
	return -ENOMEM;
}

static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
{
	struct bpf_dtab *dtab;
	int err;

	if (!capable(CAP_NET_ADMIN))
		return ERR_PTR(-EPERM);

	dtab = kzalloc(sizeof(*dtab), GFP_USER);
	if (!dtab)
		return ERR_PTR(-ENOMEM);

	err = dev_map_init_map(dtab, attr);
	if (err) {
		kfree(dtab);
		return ERR_PTR(err);
	}

	spin_lock(&dev_map_lock);
	list_add_tail_rcu(&dtab->list, &dev_map_list);
	spin_unlock(&dev_map_lock);

	return &dtab->map;
}
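
/* From user space, creating one of these maps goes through the plain bpf(2)
 * map-create path; a hedged sketch using libbpf's bpf_create_map() wrapper
 * (bpf_map_create() in newer libbpf). The sizes mirror the
 * key_size/value_size == 4 checks in dev_map_init_map():
 *
 *	#include <bpf/bpf.h>
 *
 *	int map_fd = bpf_create_map(BPF_MAP_TYPE_DEVMAP, sizeof(__u32),
 *				    sizeof(__u32), 64, 0);
 *	if (map_fd < 0)
 *		perror("bpf_create_map");	// EPERM without CAP_NET_ADMIN,
 *						// EINVAL on bad size/flags
 */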

static void dev_map_free(struct bpf_map *map)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	int i;

	/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
	 * so the programs (there can be more than one that used this map) were
	 * disconnected from events. The following synchronize_rcu() guarantees
	 * that both rcu read critical sections complete and waits for
	 * preempt-disable regions (NAPI being the relevant context here) so we
	 * are certain there will be no further reads against the netdev_map and
	 * all flush operations are complete. Flush operations can only be done
	 * from NAPI context for this reason.
	 */

	spin_lock(&dev_map_lock);
	list_del_rcu(&dtab->list);
	spin_unlock(&dev_map_lock);

	bpf_clear_redirect_map(map);
	synchronize_rcu();

	/* Make sure prior __dev_map_entry_free() calls have completed. */
	rcu_barrier();

	if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
		for (i = 0; i < dtab->n_buckets; i++) {
			struct bpf_dtab_netdev *dev;
			struct hlist_head *head;
			struct hlist_node *next;

			head = dev_map_index_hash(dtab, i);

			hlist_for_each_entry_safe(dev, next, head, index_hlist) {
				hlist_del_rcu(&dev->index_hlist);
				dev_put(dev->dev);
				kfree(dev);
			}
		}

		kfree(dtab->dev_index_head);
	} else {
		for (i = 0; i < dtab->map.max_entries; i++) {
			struct bpf_dtab_netdev *dev;

			dev = dtab->netdev_map[i];
			if (!dev)
				continue;

			dev_put(dev->dev);
			kfree(dev);
		}

		bpf_map_area_free(dtab->netdev_map);
	}

	kfree(dtab);
}

static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	u32 index = key ? *(u32 *)key : U32_MAX;
	u32 *next = next_key;

	if (index >= dtab->map.max_entries) {
		*next = 0;
		return 0;
	}

	if (index == dtab->map.max_entries - 1)
		return -ENOENT;
	*next = index + 1;
	return 0;
}
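
/* The semantics above mean a NULL or out-of-range start key wraps to slot 0,
 * and -ENOENT ends the walk after the last slot. A hedged user space sketch
 * (map_fd is assumed to be an existing DEVMAP fd); note that every slot index
 * is returned, whether or not a netdev is currently installed in it:
 *
 *	__u32 key = (__u32)-1, next;
 *
 *	while (!bpf_map_get_next_key(map_fd, &key, &next)) {
 *		// next runs 0, 1, ..., max_entries - 1
 *		key = next;
 *	}
 */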

struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	struct hlist_head *head = dev_map_index_hash(dtab, key);
	struct bpf_dtab_netdev *dev;

	hlist_for_each_entry_rcu(dev, head, index_hlist,
				 lockdep_is_held(&dtab->index_lock))
		if (dev->idx == key)
			return dev;

	return NULL;
}

static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
				    void *next_key)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	u32 idx, *next = next_key;
	struct bpf_dtab_netdev *dev, *next_dev;
	struct hlist_head *head;
	int i = 0;

	if (!key)
		goto find_first;

	idx = *(u32 *)key;

	dev = __dev_map_hash_lookup_elem(map, idx);
	if (!dev)
		goto find_first;

	next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&dev->index_hlist)),
				    struct bpf_dtab_netdev, index_hlist);

	if (next_dev) {
		*next = next_dev->idx;
		return 0;
	}

	i = idx & (dtab->n_buckets - 1);
	i++;

 find_first:
	for (; i < dtab->n_buckets; i++) {
		head = dev_map_index_hash(dtab, i);

		next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
					    struct bpf_dtab_netdev,
					    index_hlist);
		if (next_dev) {
			*next = next_dev->idx;
			return 0;
		}
	}

	return -ENOENT;
}

static int bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
{
	struct net_device *dev = bq->dev;
	int sent = 0, drops = 0, err = 0;
	int i;

	if (unlikely(!bq->count))
		return 0;

	for (i = 0; i < bq->count; i++) {
		struct xdp_frame *xdpf = bq->q[i];

		prefetch(xdpf);
	}

	sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags);
	if (sent < 0) {
		err = sent;
		sent = 0;
		goto error;
	}
	drops = bq->count - sent;
out:
	bq->count = 0;

	trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, drops, err);
	bq->dev_rx = NULL;
	__list_del_clearprev(&bq->flush_node);
	return 0;
error:
	/* If ndo_xdp_xmit fails with an errno, no frames have been
	 * xmit'ed and it's our responsibility to free them all.
	 */
	for (i = 0; i < bq->count; i++) {
		struct xdp_frame *xdpf = bq->q[i];

		xdp_return_frame_rx_napi(xdpf);
		drops++;
	}
	goto out;
}

/* __dev_flush is called from xdp_do_flush() which _must_ be signaled
 * from the driver before returning from its napi->poll() routine. The poll()
 * routine is called either from busy_poll context or net_rx_action signaled
 * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the
 * net device can be torn down. On devmap tear down we ensure the flush list
 * is empty before completing to ensure all flush operations have completed.
 * When drivers update the bpf program they may need to ensure any flush ops
 * are also complete. Using synchronize_rcu or call_rcu will suffice for this
 * because both wait for napi context to exit.
 */
void __dev_flush(void)
{
	struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
	struct xdp_dev_bulk_queue *bq, *tmp;

	list_for_each_entry_safe(bq, tmp, flush_list, flush_node)
		bq_xmit_all(bq, XDP_XMIT_FLUSH);
}
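
/* A hedged sketch of the driver-side contract described above; the mydrv_*
 * names are hypothetical, only the placement of xdp_do_flush() at the end of
 * the poll() routine matters:
 *
 *	static int mydrv_napi_poll(struct napi_struct *napi, int budget)
 *	{
 *		int done;
 *
 *		done = mydrv_clean_rx(napi, budget);	// may enqueue redirected frames
 *		xdp_do_flush();				// drains dev_flush_list via __dev_flush()
 *		if (done < budget)
 *			napi_complete_done(napi, done);
 *		return done;
 *	}
 */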

/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
 * update happens in parallel here a dev_put won't happen until after reading
 * the ifindex.
 */
struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	struct bpf_dtab_netdev *obj;

	if (key >= map->max_entries)
		return NULL;

	obj = READ_ONCE(dtab->netdev_map[key]);
	return obj;
}

/* Runs under RCU-read-side, plus in softirq under NAPI protection.
 * Thus, safe percpu variable access.
 */
static int bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
		      struct net_device *dev_rx)
{
	struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
	struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);

	if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
		bq_xmit_all(bq, 0);

	/* Ingress dev_rx will be the same for all xdp_frame's in
	 * bulk_queue, because bq is stored per-CPU and must be flushed
	 * at the end of the net_device driver's NAPI func.
	 */
	if (!bq->dev_rx)
		bq->dev_rx = dev_rx;

	bq->q[bq->count++] = xdpf;

	if (!bq->flush_node.prev)
		list_add(&bq->flush_node, flush_list);

	return 0;
}

static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
			       struct net_device *dev_rx)
{
	struct xdp_frame *xdpf;
	int err;

	if (!dev->netdev_ops->ndo_xdp_xmit)
		return -EOPNOTSUPP;

	err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data);
	if (unlikely(err))
		return err;

	xdpf = convert_to_xdp_frame(xdp);
	if (unlikely(!xdpf))
		return -EOVERFLOW;

	return bq_enqueue(dev, xdpf, dev_rx);
}

int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
		    struct net_device *dev_rx)
{
	return __xdp_enqueue(dev, xdp, dev_rx);
}

int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
		    struct net_device *dev_rx)
{
	struct net_device *dev = dst->dev;

	return __xdp_enqueue(dev, xdp, dev_rx);
}

int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
			     struct bpf_prog *xdp_prog)
{
	int err;

	err = xdp_ok_fwd_dev(dst->dev, skb->len);
	if (unlikely(err))
		return err;
	skb->dev = dst->dev;
	generic_xdp_tx(skb, xdp_prog);

	return 0;
}

static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key);
	struct net_device *dev = obj ? obj->dev : NULL;

	return dev ? &dev->ifindex : NULL;
}
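
/* Because the kernel-side lookup above hands back &dev->ifindex, a user space
 * bpf_map_lookup_elem() on a DEVMAP reads out the stored ifindex; a hedged
 * sketch (map_fd assumed to be a valid DEVMAP fd):
 *
 *	__u32 key = 0, ifindex;
 *
 *	if (!bpf_map_lookup_elem(map_fd, &key, &ifindex))
 *		printf("slot %u -> ifindex %u\n", key, ifindex);
 */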

static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map,
								*(u32 *)key);
	struct net_device *dev = obj ? obj->dev : NULL;

	return dev ? &dev->ifindex : NULL;
}

static void __dev_map_entry_free(struct rcu_head *rcu)
{
	struct bpf_dtab_netdev *dev;

	dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
	dev_put(dev->dev);
	kfree(dev);
}

static int dev_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	struct bpf_dtab_netdev *old_dev;
	int k = *(u32 *)key;

	if (k >= map->max_entries)
		return -EINVAL;

	/* Use call_rcu() here to ensure any rcu critical sections have
	 * completed as well as any flush operations, because call_rcu
	 * will wait for the preempt-disable region to complete (NAPI in
	 * this context). Additionally, the driver tear down ensures all
	 * soft irqs are complete before removing the net device in the
	 * case where dev_put() drops the reference count to zero.
	 */
	old_dev = xchg(&dtab->netdev_map[k], NULL);
	if (old_dev)
		call_rcu(&old_dev->rcu, __dev_map_entry_free);
	return 0;
}

static int dev_map_hash_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	struct bpf_dtab_netdev *old_dev;
	int k = *(u32 *)key;
	unsigned long flags;
	int ret = -ENOENT;

	spin_lock_irqsave(&dtab->index_lock, flags);

	old_dev = __dev_map_hash_lookup_elem(map, k);
	if (old_dev) {
		dtab->items--;
		hlist_del_init_rcu(&old_dev->index_hlist);
		call_rcu(&old_dev->rcu, __dev_map_entry_free);
		ret = 0;
	}
	spin_unlock_irqrestore(&dtab->index_lock, flags);

	return ret;
}

static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
						    struct bpf_dtab *dtab,
						    u32 ifindex,
						    unsigned int idx)
{
	struct bpf_dtab_netdev *dev;

	dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
			   dtab->map.numa_node);
	if (!dev)
		return ERR_PTR(-ENOMEM);

	dev->dev = dev_get_by_index(net, ifindex);
	if (!dev->dev) {
		kfree(dev);
		return ERR_PTR(-EINVAL);
	}

	dev->idx = idx;
	dev->dtab = dtab;

	return dev;
}

static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
				 void *key, void *value, u64 map_flags)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	struct bpf_dtab_netdev *dev, *old_dev;
	u32 ifindex = *(u32 *)value;
	u32 i = *(u32 *)key;

	if (unlikely(map_flags > BPF_EXIST))
		return -EINVAL;
	if (unlikely(i >= dtab->map.max_entries))
		return -E2BIG;
	if (unlikely(map_flags == BPF_NOEXIST))
		return -EEXIST;

	if (!ifindex) {
		dev = NULL;
	} else {
		dev = __dev_map_alloc_node(net, dtab, ifindex, i);
		if (IS_ERR(dev))
			return PTR_ERR(dev);
	}

	/* Use call_rcu() here to ensure rcu critical sections have completed.
	 * Remember that the driver side flush operation will happen before the
	 * net device is removed.
	 */
	old_dev = xchg(&dtab->netdev_map[i], dev);
	if (old_dev)
		call_rcu(&old_dev->rcu, __dev_map_entry_free);

	return 0;
}

static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
			       u64 map_flags)
{
	return __dev_map_update_elem(current->nsproxy->net_ns,
				     map, key, value, map_flags);
}

static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
				     void *key, void *value, u64 map_flags)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	struct bpf_dtab_netdev *dev, *old_dev;
	u32 ifindex = *(u32 *)value;
	u32 idx = *(u32 *)key;
	unsigned long flags;
	int err = -EEXIST;

	if (unlikely(map_flags > BPF_EXIST || !ifindex))
		return -EINVAL;

	spin_lock_irqsave(&dtab->index_lock, flags);

	old_dev = __dev_map_hash_lookup_elem(map, idx);
	if (old_dev && (map_flags & BPF_NOEXIST))
		goto out_err;

	dev = __dev_map_alloc_node(net, dtab, ifindex, idx);
	if (IS_ERR(dev)) {
		err = PTR_ERR(dev);
		goto out_err;
	}

	if (old_dev) {
		hlist_del_rcu(&old_dev->index_hlist);
	} else {
		if (dtab->items >= dtab->map.max_entries) {
			spin_unlock_irqrestore(&dtab->index_lock, flags);
			call_rcu(&dev->rcu, __dev_map_entry_free);
			return -E2BIG;
		}
		dtab->items++;
	}

	hlist_add_head_rcu(&dev->index_hlist,
			   dev_map_index_hash(dtab, idx));
	spin_unlock_irqrestore(&dtab->index_lock, flags);

	if (old_dev)
		call_rcu(&old_dev->rcu, __dev_map_entry_free);

	return 0;

out_err:
	spin_unlock_irqrestore(&dtab->index_lock, flags);
	return err;
}

static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
				   u64 map_flags)
{
	return __dev_map_hash_update_elem(current->nsproxy->net_ns,
					 map, key, value, map_flags);
}
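
/* For the hash variant the keys are interpreted as ifindexes (see the header
 * comment), so the XDP side can redirect with bpf_redirect_map(&map, ifindex, 0).
 * A hedged user space sketch ("eth0" and map_fd are illustrative only):
 *
 *	__u32 ifindex = if_nametoindex("eth0");
 *	__u32 key = ifindex, value = ifindex;
 *
 *	if (bpf_map_update_elem(map_fd, &key, &value, BPF_ANY))
 *		perror("bpf_map_update_elem");	// E2BIG once items == max_entries
 */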

const struct bpf_map_ops dev_map_ops = {
	.map_alloc = dev_map_alloc,
	.map_free = dev_map_free,
	.map_get_next_key = dev_map_get_next_key,
	.map_lookup_elem = dev_map_lookup_elem,
	.map_update_elem = dev_map_update_elem,
	.map_delete_elem = dev_map_delete_elem,
	.map_check_btf = map_check_no_btf,
};

const struct bpf_map_ops dev_map_hash_ops = {
	.map_alloc = dev_map_alloc,
	.map_free = dev_map_free,
	.map_get_next_key = dev_map_hash_get_next_key,
	.map_lookup_elem = dev_map_hash_lookup_elem,
	.map_update_elem = dev_map_hash_update_elem,
	.map_delete_elem = dev_map_hash_delete_elem,
	.map_check_btf = map_check_no_btf,
};

static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab,
				       struct net_device *netdev)
{
	unsigned long flags;
	u32 i;

	spin_lock_irqsave(&dtab->index_lock, flags);
	for (i = 0; i < dtab->n_buckets; i++) {
		struct bpf_dtab_netdev *dev;
		struct hlist_head *head;
		struct hlist_node *next;

		head = dev_map_index_hash(dtab, i);

		hlist_for_each_entry_safe(dev, next, head, index_hlist) {
			if (netdev != dev->dev)
				continue;

			dtab->items--;
			hlist_del_rcu(&dev->index_hlist);
			call_rcu(&dev->rcu, __dev_map_entry_free);
		}
	}
	spin_unlock_irqrestore(&dtab->index_lock, flags);
}

static int dev_map_notification(struct notifier_block *notifier,
				ulong event, void *ptr)
{
	struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
	struct bpf_dtab *dtab;
	int i, cpu;

	switch (event) {
	case NETDEV_REGISTER:
		if (!netdev->netdev_ops->ndo_xdp_xmit || netdev->xdp_bulkq)
			break;

		/* will be freed in free_netdev() */
		netdev->xdp_bulkq =
			__alloc_percpu_gfp(sizeof(struct xdp_dev_bulk_queue),
					   sizeof(void *), GFP_ATOMIC);
		if (!netdev->xdp_bulkq)
			return NOTIFY_BAD;

		for_each_possible_cpu(cpu)
			per_cpu_ptr(netdev->xdp_bulkq, cpu)->dev = netdev;
		break;
	case NETDEV_UNREGISTER:
		/* This rcu_read_lock/unlock pair is needed because
		 * dev_map_list is an RCU list AND to ensure a delete
		 * operation does not free a netdev_map entry while we
		 * are comparing it against the netdev being unregistered.
		 */
		rcu_read_lock();
		list_for_each_entry_rcu(dtab, &dev_map_list, list) {
			if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
				dev_map_hash_remove_netdev(dtab, netdev);
				continue;
			}

			for (i = 0; i < dtab->map.max_entries; i++) {
				struct bpf_dtab_netdev *dev, *odev;

				dev = READ_ONCE(dtab->netdev_map[i]);
				if (!dev || netdev != dev->dev)
					continue;
				odev = cmpxchg(&dtab->netdev_map[i], dev, NULL);
				if (dev == odev)
					call_rcu(&dev->rcu,
						 __dev_map_entry_free);
			}
		}
		rcu_read_unlock();
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block dev_map_notifier = {
	.notifier_call = dev_map_notification,
};

static int __init dev_map_init(void)
{
	int cpu;

	/* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */
	BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) !=
		     offsetof(struct _bpf_dtab_netdev, dev));

	register_netdevice_notifier(&dev_map_notifier);

	for_each_possible_cpu(cpu)
		INIT_LIST_HEAD(&per_cpu(dev_flush_list, cpu));
	return 0;
}

subsys_initcall(dev_map_init);