sch_api.c 51.7 KB
Newer Older
Linus Torvalds's avatar
Linus Torvalds committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28
/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
29
#include <linux/hrtimer.h>
30
#include <linux/lockdep.h>
31
#include <linux/slab.h>
32
#include <linux/hashtable.h>
Linus Torvalds's avatar
Linus Torvalds committed
33

34
#include <net/net_namespace.h>
35
#include <net/sock.h>
36
#include <net/netlink.h>
Linus Torvalds's avatar
Linus Torvalds committed
37
#include <net/pkt_sched.h>
38
#include <net/pkt_cls.h>
Linus Torvalds's avatar
Linus Torvalds committed
39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95

/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. queueing disciplines manager frontend.
   2. traffic classes manager frontend.

   Generally, queueing discipline ("qdisc") is a black box,
   which is able to enqueue packets and to dequeue them (when
   device is ready to send something) in order and at times
   determined by algorithm hidden in it.

   qdisc's are divided to two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all the packets to "traffic classes",
     using "packet classifiers" (look at cls_api.c)

   In turn, classes may have child qdiscs (as rule, queues)
   attached to them etc. etc. etc.

   The goal of the routines in this file is to translate
   information supplied by user in the form of handles
   to more intelligible for kernel form, to make some sanity
   checks and part of work, which is common to all qdiscs
   and to provide rtnetlink notifications.

   All real intelligent work is done inside qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns a skb to send. It is allowed to return NULL,
   but it does not mean that queue is empty, it just means that
   discipline does not want to send anything this time.
   Queue is really empty if q->q.qlen == 0.
   For complicated disciplines with multiple queues q->q is not
   real packet queue, but however q->q.qlen must be valid.

   ---enqueue

   enqueue returns 0, if packet was enqueued successfully.
   If packet (this one or another one) was dropped, it returns
   a non-zero error code.
   NET_XMIT_DROP 	- this packet dropped
     Expected action: do not backoff, but wait until queue will clear.
   NET_XMIT_CN	 	- probably this packet enqueued, but another one dropped.
     Expected action: backoff or ignore

   Auxiliary routines:

   ---peek

   like dequeue but without removing a packet from the queue

   ---reset

   returns qdisc to initial state: purge all buffers, clear all
   timers, counters (except for statistics) etc.

   ---init

   initializes newly created qdisc.

   ---destroy

   destroys resources allocated by init and during lifetime of qdisc.

   ---change

   changes qdisc parameters.
 */

/* Protects the list of registered qdisc ops (qdisc_base) and is also held
 * while default_qdisc_ops is read or replaced. It is a pure SMP lock.
 */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* Head of the singly linked list (chained through ->next) of all
 * registered queueing discipline operation tables.
 */

static struct Qdisc_ops *qdisc_base;

131
/* Register/unregister queueing discipline */
Linus Torvalds's avatar
Linus Torvalds committed
132 133 134 135 136 137 138 139 140 141 142 143 144

int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
145
	if (qops->peek == NULL) {
146
		if (qops->dequeue == NULL)
147
			qops->peek = noop_qdisc_ops.peek;
148 149
		else
			goto out_einval;
150
	}
Linus Torvalds's avatar
Linus Torvalds committed
151 152 153
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

154 155 156
	if (qops->cl_ops) {
		const struct Qdisc_class_ops *cops = qops->cl_ops;

157
		if (!(cops->find && cops->walk && cops->leaf))
158 159
			goto out_einval;

160
		if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
161 162 163
			goto out_einval;
	}

Linus Torvalds's avatar
Linus Torvalds committed
164 165 166 167 168 169
	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;
170 171 172 173

out_einval:
	rc = -EINVAL;
	goto out;
Linus Torvalds's avatar
Linus Torvalds committed
174
}
175
EXPORT_SYMBOL(register_qdisc);
Linus Torvalds's avatar
Linus Torvalds committed
176 177 178 179 180 181 182

int unregister_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int err = -ENOENT;

	write_lock(&qdisc_mod_lock);
Eric Dumazet's avatar
Eric Dumazet committed
183
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
Linus Torvalds's avatar
Linus Torvalds committed
184 185 186 187 188 189 190 191 192 193
		if (q == qops)
			break;
	if (q) {
		*qp = q->next;
		q->next = NULL;
		err = 0;
	}
	write_unlock(&qdisc_mod_lock);
	return err;
}
194
EXPORT_SYMBOL(unregister_qdisc);
Linus Torvalds's avatar
Linus Torvalds committed
195

196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247
/* Copy the id of the current default qdisc into @name (at most @len bytes,
 * as bounded by strlcpy()).  The default is sampled under qdisc_mod_lock.
 */
void qdisc_get_default(char *name, size_t len)
{
	const struct Qdisc_ops *dflt;

	read_lock(&qdisc_mod_lock);
	dflt = default_qdisc_ops;
	strlcpy(name, dflt->id, len);
	read_unlock(&qdisc_mod_lock);
}

/* Find the registered qdisc ops whose id equals @name and take a reference
 * on its owning module.  Returns NULL when no entry matches or when the
 * module reference cannot be taken.  Takes no lock itself; the caller in
 * this file, qdisc_set_default(), holds qdisc_mod_lock around the call.
 */
static struct Qdisc_ops *qdisc_lookup_default(const char *name)
{
	struct Qdisc_ops *ops = qdisc_base;

	while (ops && strcmp(name, ops->id))
		ops = ops->next;

	if (ops && !try_module_get(ops->owner))
		return NULL;

	return ops;
}

/* Set new default qdisc to use */
int qdisc_set_default(const char *name)
{
	const struct Qdisc_ops *ops;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	write_lock(&qdisc_mod_lock);
	ops = qdisc_lookup_default(name);
	if (!ops) {
		/* Not found, drop lock and try to load module */
		write_unlock(&qdisc_mod_lock);
		request_module("sch_%s", name);
		write_lock(&qdisc_mod_lock);

		ops = qdisc_lookup_default(name);
	}

	if (ops) {
		/* Set new default */
		module_put(default_qdisc_ops->owner);
		default_qdisc_ops = ops;
	}
	write_unlock(&qdisc_mod_lock);

	return ops ? 0 : -ENOENT;
}

248 249 250 251 252 253 254 255 256
#ifdef CONFIG_NET_SCH_DEFAULT
/* Late initcall: install the default qdisc named by the kernel
 * configuration (CONFIG_DEFAULT_NET_SCH).
 */
static int __init sch_default_qdisc(void)
{
	const char *name = CONFIG_DEFAULT_NET_SCH;

	return qdisc_set_default(name);
}
late_initcall(sch_default_qdisc);
#endif

Linus Torvalds's avatar
Linus Torvalds committed
257
/* We know the handle. Find the qdisc among all qdiscs attached to the device
 * (root qdisc, all its children, children of children etc.)
 * Note: caller either uses rtnl or rcu_read_lock()
 */

Hannes Eder's avatar
Hannes Eder committed
262
/* Search for the qdisc with @handle starting at @root.
 *
 * If @root has no device, only @root itself is compared.  Otherwise @root
 * matches unless it is a builtin qdisc, and after that the device's
 * qdisc_hash is searched.  Returns the matching qdisc or NULL.
 */
static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
{
	struct Qdisc *q;

	if (!qdisc_dev(root))
		return (root->handle == handle ? root : NULL);

	if (!(root->flags & TCQ_F_BUILTIN) &&
	    root->handle == handle)
		return root;

	hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) {
		if (q->handle == handle)
			return q;
	}
	return NULL;
}

280
/**
 * qdisc_hash_add - add a qdisc to its device's handle hash
 * @q: qdisc to insert
 * @invisible: if true, also mark @q with TCQ_F_INVISIBLE
 *
 * Root qdiscs (parent == TC_H_ROOT) and ingress qdiscs are not hashed, and
 * for those the @invisible flag is not applied either.  RTNL must be held
 * when an insertion takes place.
 */
void qdisc_hash_add(struct Qdisc *q, bool invisible)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
		ASSERT_RTNL();
		hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
		if (invisible)
			q->flags |= TCQ_F_INVISIBLE;
	}
}
EXPORT_SYMBOL(qdisc_hash_add);
290

291
void qdisc_hash_del(struct Qdisc *q)
292
{
293 294
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
		ASSERT_RTNL();
295
		hash_del_rcu(&q->hash);
296
	}
297
}
298
EXPORT_SYMBOL(qdisc_hash_del);
299

300
/**
 * qdisc_lookup - find a qdisc on a device by handle
 * @dev: device to search
 * @handle: qdisc handle; 0 never matches
 *
 * The tree rooted at dev->qdisc is searched first, then, if the device has
 * an ingress queue, the tree rooted at that queue's qdisc_sleeping.
 *
 * Return: the matching qdisc, or NULL.
 */
struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	struct Qdisc *q;

	if (!handle)
		return NULL;
	q = qdisc_match_from_root(dev->qdisc, handle);
	if (q)
		goto out;

	if (dev_ingress_queue(dev))
		q = qdisc_match_from_root(
			dev_ingress_queue(dev)->qdisc_sleeping,
			handle);
out:
	return q;
}

/* Return the qdisc attached to class @classid of @p, as reported by the
 * class ops' ->leaf() callback.  Returns NULL when @p has no class ops or
 * when ->find() does not know the class.
 */
static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
	unsigned long cl;
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = p->ops->cl_ops;

	if (cops == NULL)
		return NULL;
	cl = cops->find(p, classid);

	if (cl == 0)
		return NULL;
	leaf = cops->leaf(p, cl);
	return leaf;
}

/* Find queueing discipline by name */

336
static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
Linus Torvalds's avatar
Linus Torvalds committed
337 338 339 340 341 342
{
	struct Qdisc_ops *q = NULL;

	if (kind) {
		read_lock(&qdisc_mod_lock);
		for (q = qdisc_base; q; q = q->next) {
343
			if (nla_strcmp(kind, q->id) == 0) {
Linus Torvalds's avatar
Linus Torvalds committed
344 345 346 347 348 349 350 351 352 353
				if (!try_module_get(q->owner))
					q = NULL;
				break;
			}
		}
		read_unlock(&qdisc_mod_lock);
	}
	return q;
}

354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392
/* The linklayer setting was not transferred from iproute2 in older
 * versions, and the rate table lookup systems have been dropped in
 * the kernel. To stay backward compatible with older iproute2 tc
 * utils, we detect the linklayer setting by detecting whether the
 * rate table was modified.
 *
 * For linklayer ATM table entries, the rate table will be aligned to
 * 48 bytes, thus some table entries will contain the same value.  The
 * mpu (min packet unit) is also encoded into the old rate table, thus
 * starting from the mpu, we find low and high table entries for
 * mapping this cell.  If these entries contain the same value, then
 * the rate tables have been modified for linklayer ATM.
 *
 * This is done by rounding mpu up to the next 48 byte cell/entry
 * boundary, then rounding up to the following cell, calculating the
 * table entry one below it, and comparing the two entries.
 */
static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
{
	int low, high, cell_low, cell_high;

	/* rtab is too inaccurate at rates > 100Mbit/s */
	if (r->rate > (100000000/8) || rtab[0] == 0) {
		pr_debug("TC linklayer: Giving up ATM detection\n");
		return TC_LINKLAYER_ETHERNET;
	}

	/* Table slots holding the cell at the mpu and the end of the next one. */
	low       = roundup(r->mpu, 48);
	high      = roundup(low+1, 48);
	cell_low  = low >> r->cell_log;
	cell_high = (high >> r->cell_log) - 1;

	/* Need two distinct slots that both lie inside the 256-entry table. */
	if (cell_high <= cell_low || cell_high >= 256)
		return TC_LINKLAYER_ETHERNET;

	if (rtab[cell_low] != rtab[cell_high])
		return TC_LINKLAYER_ETHERNET;

	pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
		 cell_low, cell_high, rtab[cell_high]);
	return TC_LINKLAYER_ATM;
}

Linus Torvalds's avatar
Linus Torvalds committed
393 394
/* Singly linked list (through ->next) of reference-counted rate tables,
 * shared via qdisc_get_rtab() / qdisc_put_rtab().
 */
static struct qdisc_rate_table *qdisc_rtab_list;

395
struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
396 397
					struct nlattr *tab,
					struct netlink_ext_ack *extack)
Linus Torvalds's avatar
Linus Torvalds committed
398 399 400
{
	struct qdisc_rate_table *rtab;

401
	if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
402 403
	    nla_len(tab) != TC_RTAB_SIZE) {
		NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
404
		return NULL;
405
	}
406

Linus Torvalds's avatar
Linus Torvalds committed
407
	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
408 409
		if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
		    !memcmp(&rtab->data, nla_data(tab), 1024)) {
Linus Torvalds's avatar
Linus Torvalds committed
410 411 412 413 414 415 416 417 418
			rtab->refcnt++;
			return rtab;
		}
	}

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
419
		memcpy(rtab->data, nla_data(tab), 1024);
420 421
		if (r->linklayer == TC_LINKLAYER_UNAWARE)
			r->linklayer = __detect_linklayer(r, rtab->data);
Linus Torvalds's avatar
Linus Torvalds committed
422 423
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
424 425
	} else {
		NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
Linus Torvalds's avatar
Linus Torvalds committed
426 427 428
	}
	return rtab;
}
429
EXPORT_SYMBOL(qdisc_get_rtab);
Linus Torvalds's avatar
Linus Torvalds committed
430 431 432 433 434 435 436 437

void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
	struct qdisc_rate_table *rtab, **rtabp;

	if (!tab || --tab->refcnt)
		return;

Eric Dumazet's avatar
Eric Dumazet committed
438 439 440
	for (rtabp = &qdisc_rtab_list;
	     (rtab = *rtabp) != NULL;
	     rtabp = &rtab->next) {
Linus Torvalds's avatar
Linus Torvalds committed
441 442 443 444 445 446 447
		if (rtab == tab) {
			*rtabp = rtab->next;
			kfree(rtab);
			return;
		}
	}
}
448
EXPORT_SYMBOL(qdisc_put_rtab);
Linus Torvalds's avatar
Linus Torvalds committed
449

450 451 452 453 454 455 456
/* All size tables currently in use; entries are shared and
 * reference-counted by qdisc_get_stab() / qdisc_put_stab().
 */
static LIST_HEAD(qdisc_stab_list);

/* Netlink policy for the attributes nested inside TCA_STAB. */
static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	/* Fixed-size header: struct tc_sizespec */
	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
	/* Raw table data; read as an array of u16 by qdisc_get_stab() */
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};

457 458
static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
					       struct netlink_ext_ack *extack)
459 460 461 462 463 464 465 466
{
	struct nlattr *tb[TCA_STAB_MAX + 1];
	struct qdisc_size_table *stab;
	struct tc_sizespec *s;
	unsigned int tsize = 0;
	u16 *tab = NULL;
	int err;

467
	err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy, extack);
468 469
	if (err < 0)
		return ERR_PTR(err);
470 471
	if (!tb[TCA_STAB_BASE]) {
		NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
472
		return ERR_PTR(-EINVAL);
473
	}
474 475 476 477

	s = nla_data(tb[TCA_STAB_BASE]);

	if (s->tsize > 0) {
478 479
		if (!tb[TCA_STAB_DATA]) {
			NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
480
			return ERR_PTR(-EINVAL);
481
		}
482 483 484 485
		tab = nla_data(tb[TCA_STAB_DATA]);
		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
	}

486 487
	if (tsize != s->tsize || (!tab && tsize > 0)) {
		NL_SET_ERR_MSG(extack, "Invalid size of size table");
488
		return ERR_PTR(-EINVAL);
489
	}
490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513

	list_for_each_entry(stab, &qdisc_stab_list, list) {
		if (memcmp(&stab->szopts, s, sizeof(*s)))
			continue;
		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
			continue;
		stab->refcnt++;
		return stab;
	}

	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	stab->refcnt = 1;
	stab->szopts = *s;
	if (tsize > 0)
		memcpy(stab->data, tab, tsize * sizeof(u16));

	list_add_tail(&stab->list, &qdisc_stab_list);

	return stab;
}

514 515 516 517 518
/* RCU callback: free the size table that embeds @head. */
static void stab_kfree_rcu(struct rcu_head *head)
{
	struct qdisc_size_table *stab;

	stab = container_of(head, struct qdisc_size_table, rcu);
	kfree(stab);
}

519 520 521 522 523 524 525
void qdisc_put_stab(struct qdisc_size_table *tab)
{
	if (!tab)
		return;

	if (--tab->refcnt == 0) {
		list_del(&tab->list);
526
		call_rcu_bh(&tab->rcu, stab_kfree_rcu);
527 528 529 530 531 532 533 534 535
	}
}
EXPORT_SYMBOL(qdisc_put_stab);

static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_STAB);
536 537
	if (nest == NULL)
		goto nla_put_failure;
538 539
	if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
		goto nla_put_failure;
540 541 542 543 544 545 546 547
	nla_nest_end(skb, nest);

	return skb->len;

nla_put_failure:
	return -1;
}

548 549
void __qdisc_calculate_pkt_len(struct sk_buff *skb,
			       const struct qdisc_size_table *stab)
550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574
{
	int pkt_len, slot;

	pkt_len = skb->len + stab->szopts.overhead;
	if (unlikely(!stab->szopts.tsize))
		goto out;

	slot = pkt_len + stab->szopts.cell_align;
	if (unlikely(slot < 0))
		slot = 0;

	slot >>= stab->szopts.cell_log;
	if (likely(slot < stab->szopts.tsize))
		pkt_len = stab->data[slot];
	else
		pkt_len = stab->data[stab->szopts.tsize - 1] *
				(slot / stab->szopts.tsize) +
				stab->data[slot % stab->szopts.tsize];

	pkt_len <<= stab->szopts.size_log;
out:
	if (unlikely(pkt_len < 1))
		pkt_len = 1;
	qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
575
EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
576

577
void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
578 579
{
	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
Eric Dumazet's avatar
Eric Dumazet committed
580 581
		pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
			txt, qdisc->ops->id, qdisc->handle >> 16);
582 583 584 585 586
		qdisc->flags |= TCQ_F_WARN_NONWC;
	}
}
EXPORT_SYMBOL(qdisc_warn_nonwc);

587 588 589
static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
590
						 timer);
591

592
	rcu_read_lock();
593
	__netif_schedule(qdisc_root(wd->qdisc));
594
	rcu_read_unlock();
595

596 597 598
	return HRTIMER_NORESTART;
}

599 600
/**
 * qdisc_watchdog_init_clockid - initialise a qdisc watchdog timer
 * @wd: watchdog to initialise
 * @qdisc: qdisc the watchdog belongs to
 * @clockid: clock the hrtimer runs on
 *
 * The timer is set up in absolute, pinned mode with qdisc_watchdog() as its
 * handler.
 */
void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
				 clockid_t clockid)
{
	hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
	wd->timer.function = qdisc_watchdog;
	wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init_clockid);

void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
	qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
}
612 613
EXPORT_SYMBOL(qdisc_watchdog_init);

614
/**
 * qdisc_watchdog_schedule_ns - arm a qdisc watchdog
 * @wd: watchdog to arm
 * @expires: absolute expiry time in nanoseconds
 *
 * Nothing is done when the sleeping root qdisc is deactivated, or when
 * @expires equals the expiry recorded by the previous call.
 */
void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
{
	if (test_bit(__QDISC_STATE_DEACTIVATED,
		     &qdisc_root_sleeping(wd->qdisc)->state))
		return;

	if (wd->last_expires == expires)
		return;

	wd->last_expires = expires;
	hrtimer_start(&wd->timer,
		      ns_to_ktime(expires),
		      HRTIMER_MODE_ABS_PINNED);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
629 630 631

void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
632
	hrtimer_cancel(&wd->timer);
633 634
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
Linus Torvalds's avatar
Linus Torvalds committed
635

636
static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
637 638
{
	struct hlist_head *h;
639
	unsigned int i;
640

641
	h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
642 643 644 645 646 647 648 649 650 651 652

	if (h != NULL) {
		for (i = 0; i < n; i++)
			INIT_HLIST_HEAD(&h[i]);
	}
	return h;
}

void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
	struct Qdisc_class_common *cl;
653
	struct hlist_node *next;
654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;
	unsigned int i, h;

	/* Rehash when load factor exceeds 0.75 */
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
		return;
	nsize = clhash->hashsize * 2;
	nmask = nsize - 1;
	nhash = qdisc_class_hash_alloc(nsize);
	if (nhash == NULL)
		return;

	ohash = clhash->hash;
	osize = clhash->hashsize;

	sch_tree_lock(sch);
	for (i = 0; i < osize; i++) {
672
		hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
673 674 675 676 677 678 679 680 681
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);
		}
	}
	clhash->hash     = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

682
	kvfree(ohash);
683 684 685 686 687 688 689 690
}
EXPORT_SYMBOL(qdisc_class_hash_grow);

int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
{
	unsigned int size = 4;

	clhash->hash = qdisc_class_hash_alloc(size);
691
	if (!clhash->hash)
692 693 694 695 696 697 698 699 700 701
		return -ENOMEM;
	clhash->hashsize  = size;
	clhash->hashmask  = size - 1;
	clhash->hashelems = 0;
	return 0;
}
EXPORT_SYMBOL(qdisc_class_hash_init);

void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
702
	kvfree(clhash->hash);
703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);

/* Add class @cl to @clhash, in the bucket selected by its classid, and
 * bump the element count.  The table is not resized here.
 */
void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	struct hlist_head *bucket;

	bucket = &clhash->hash[qdisc_class_hash(cl->classid, clhash->hashmask)];

	INIT_HLIST_NODE(&cl->hnode);
	hlist_add_head(&cl->hnode, bucket);
	clhash->hashelems++;
}
EXPORT_SYMBOL(qdisc_class_hash_insert);

/* Unlink class @cl from @clhash and decrement the element count. */
void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	clhash->hashelems--;
	hlist_del(&cl->hnode);
}
EXPORT_SYMBOL(qdisc_class_hash_remove);

726 727 728
/* Allocate a unique handle from the space managed by the kernel.
 * Possible range is [8000-FFFF]:0000 (0x8000 values)
 */
Linus Torvalds's avatar
Linus Torvalds committed
729 730
static u32 qdisc_alloc_handle(struct net_device *dev)
{
731
	int i = 0x8000;
Linus Torvalds's avatar
Linus Torvalds committed
732 733 734 735 736 737
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
738 739 740 741
		if (!qdisc_lookup(dev, autohandle))
			return autohandle;
		cond_resched();
	} while	(--i > 0);
Linus Torvalds's avatar
Linus Torvalds committed
742

743
	return 0;
Linus Torvalds's avatar
Linus Torvalds committed
744 745
}

746 747
void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n,
			       unsigned int len)
748
{
749
	bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
750
	const struct Qdisc_class_ops *cops;
751 752
	unsigned long cl;
	u32 parentid;
753
	bool notify;
754
	int drops;
755

756
	if (n == 0 && len == 0)
757
		return;
758
	drops = max_t(int, n, 0);
759
	rcu_read_lock();
760
	while ((parentid = sch->parent)) {
761
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
762
			break;
763

764 765
		if (sch->flags & TCQ_F_NOPARENT)
			break;
766 767 768 769 770
		/* Notify parent qdisc only if child qdisc becomes empty.
		 *
		 * If child was empty even before update then backlog
		 * counter is screwed and we skip notification because
		 * parent class is already passive.
771 772 773
		 *
		 * If the original child was offloaded then it is allowed
		 * to be seem as empty, so the parent is notified anyway.
774
		 */
775 776
		notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
						       !qdisc_is_offloaded);
777
		/* TODO: perform the search on a per txq basis */
778
		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
779
		if (sch == NULL) {
780 781
			WARN_ON_ONCE(parentid != TC_H_ROOT);
			break;
782
		}
783
		cops = sch->ops->cl_ops;
784
		if (notify && cops->qlen_notify) {
785
			cl = cops->find(sch, parentid);
786 787 788
			cops->qlen_notify(sch, cl);
		}
		sch->q.qlen -= n;
789
		sch->qstats.backlog -= len;
790
		__qdisc_qstats_drop(sch, drops);
791
	}
792
	rcu_read_unlock();
793
}
794
EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
Linus Torvalds's avatar
Linus Torvalds committed
795

796 797 798 799 800 801 802 803 804 805
/* Append one netlink message describing qdisc @q to @skb.
 *
 * @clid:   value reported as tcm_parent
 * @portid: netlink port id for the message header
 * @seq:    netlink sequence number
 * @flags:  netlink message flags
 * @event:  message type (RTM_NEWQDISC or RTM_DELQDISC at the call sites
 *          in this file)
 *
 * The message carries the tcmsg header, TCA_KIND, optional ingress/egress
 * block indexes, the qdisc's own dump, TCA_HW_OFFLOAD, the size table (if
 * any) and statistics.
 *
 * Returns skb->len on success.  On failure @skb is trimmed back to where it
 * was on entry and -1 is returned.
 */
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 portid, u32 seq, u16 flags, int event)
{
	struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
	struct gnet_stats_queue __percpu *cpu_qstats = NULL;
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	struct qdisc_size_table *stab;
	u32 block_index;
	__u32 qlen;

	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = refcount_read(&q->refcnt);
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	/* A block index of 0 is not reported. */
	if (q->ops->ingress_block_get) {
		block_index = q->ops->ingress_block_get(q);
		if (block_index &&
		    nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
			goto nla_put_failure;
	}
	if (q->ops->egress_block_get) {
		block_index = q->ops->egress_block_get(q);
		if (block_index &&
		    nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
			goto nla_put_failure;
	}
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
		goto nla_put_failure;
	qlen = qdisc_qlen_sum(q);

	stab = rtnl_dereference(q->stab);
	if (stab && qdisc_dump_stab(skb, stab) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 NULL, &d, TCA_PAD) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	if (qdisc_is_percpu_stats(q)) {
		cpu_bstats = q->cpu_bstats;
		cpu_qstats = q->cpu_qstats;
	}

	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
				  &d, cpu_bstats, &q->bstats) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	/* Fix up the header length now that all attributes are in place. */
	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

/* Decide whether @q is left out of dumps and notifications: builtin qdiscs
 * always are, invisible ones unless @dump_invisible is set.
 */
static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
{
	const bool hidden = (q->flags & TCQ_F_INVISIBLE) && !dump_invisible;

	return (q->flags & TCQ_F_BUILTIN) || hidden;
}

/* Send an rtnetlink notification about a qdisc change to RTNLGRP_TC.
 *
 * An RTM_DELQDISC message is built for @old and an RTM_NEWQDISC message for
 * @new (with NLM_F_REPLACE when @old is also given); qdiscs that
 * tc_qdisc_dump_ignore() hides are skipped.  @oskb, when not NULL, supplies
 * the requester's port id.
 *
 * Returns the rtnetlink_send() result, -ENOBUFS if no skb could be
 * allocated, or -EINVAL if filling failed or nothing was written.
 */
static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new)
{
	const u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
	struct sk_buff *skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);

	if (skb == NULL)
		return -ENOBUFS;

	if (old && !tc_qdisc_dump_ignore(old, false) &&
	    tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
			  0, RTM_DELQDISC) < 0)
		goto err_out;

	if (new && !tc_qdisc_dump_ignore(new, false) &&
	    tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
			  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
		goto err_out;

	/* Both qdiscs may have been skipped, leaving nothing to send. */
	if (skb->len)
		return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
				      n->nlmsg_flags & NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}

916 917
/* Notify listeners about the replacement of @old by @new (either may be
 * NULL), then destroy @old.  The result of qdisc_notify() is ignored.
 */
static void notify_and_destroy(struct net *net, struct sk_buff *skb,
			       struct nlmsghdr *n, u32 clid,
			       struct Qdisc *old, struct Qdisc *new)
{
	if (new || old)
		qdisc_notify(net, skb, n, clid, old, new);

	if (old)
		qdisc_destroy(old);
}

/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 *
 * When appropriate send a netlink notification using 'skb'
 * and "n".
 *
 * On success, destroy old qdisc.
 */

/* qdisc_graft() has two modes.
 *
 * @parent == NULL: graft onto the device itself.  The device is deactivated
 * while the swap is made (and reactivated afterwards) if it is up.  For an
 * ingress qdisc only the single ingress queue is touched; otherwise @new is
 * attached to every tx queue, unless it has an ->attach() callback, which
 * is then called instead after dev->qdisc has been updated.
 *
 * @parent != NULL: graft onto class @classid of @parent through the
 * parent's ->graft() class op.
 *
 * Returns 0 on success, -ENOENT if the device has no ingress queue or the
 * class is not found, -EOPNOTSUPP if @parent cannot graft, or the error
 * from ->graft().
 */
static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old,
		       struct netlink_ext_ack *extack)
{
	struct Qdisc *q = old;
	struct net *net = dev_net(dev);
	int err = 0;

	if (parent == NULL) {
		unsigned int i, num_q, ingress;

		ingress = 0;
		num_q = dev->num_tx_queues;
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			num_q = 1;
			ingress = 1;
			if (!dev_ingress_queue(dev)) {
				NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
				return -ENOENT;
			}
		}

		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		/* A qdisc with ->attach() wires up the queues itself. */
		if (new && new->ops->attach)
			goto skip;

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = dev_ingress_queue(dev);

			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);

			old = dev_graft_qdisc(dev_queue, new);
			/* One extra reference per additional queue sharing new. */
			if (new && i > 0)
				qdisc_refcount_inc(new);

			if (!ingress)
				qdisc_destroy(old);
		}

skip:
		if (!ingress) {
			notify_and_destroy(net, skb, n, classid,
					   dev->qdisc, new);
			if (new && !new->ops->attach)
				qdisc_refcount_inc(new);
			dev->qdisc = new ? : &noop_qdisc;

			if (new && new->ops->attach)
				new->ops->attach(new);
		} else {
			notify_and_destroy(net, skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		/* Only support running class lockless if parent is lockless */
		if (new && (new->flags & TCQ_F_NOLOCK) &&
		    parent && !(parent->flags & TCQ_F_NOLOCK))
			new->flags &= ~TCQ_F_NOLOCK;

		err = -EOPNOTSUPP;
		if (cops && cops->graft) {
			unsigned long cl = cops->find(parent, classid);

			if (cl) {
				err = cops->graft(parent, cl, new, &old,
						  extack);
			} else {
				NL_SET_ERR_MSG(extack, "Specified class not found");
				err = -ENOENT;
			}
		}
		if (!err)
			notify_and_destroy(net, skb, n, classid, old, new);
	}
	return err;
}

1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055
static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
				   struct netlink_ext_ack *extack)
{
	u32 block_index;

	if (tca[TCA_INGRESS_BLOCK]) {
		block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);

		if (!block_index) {
			NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
			return -EINVAL;
		}
		if (!sch->ops->ingress_block_set) {
			NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
			return -EOPNOTSUPP;
		}
		sch->ops->ingress_block_set(sch, block_index);
	}
	if (tca[TCA_EGRESS_BLOCK]) {
		block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);

		if (!block_index) {
			NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
			return -EINVAL;
		}
		if (!sch->ops->egress_block_set) {
			NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
			return -EOPNOTSUPP;
		}
		sch->ops->egress_block_set(sch, block_index);
	}
	return 0;
}

1056 1057 1058 1059
/*
 * Lockdep classes that qdisc_create() assigns to a new qdisc's lock:
 * qdisc_rx_lock for ingress qdiscs, qdisc_tx_lock for all others.
 * The lockdep annotation is needed for ingress; egress gets it only
 * for name.
 */
static struct lock_class_key qdisc_tx_lock;
static struct lock_class_key qdisc_rx_lock;

Linus Torvalds's avatar
Linus Torvalds committed
1060 1061 1062 1063 1064 1065
/*
   Allocate and initialize new qdisc.

   Parameters are passed via opt.
 */

1066 1067 1068
static struct Qdisc *qdisc_create(struct net_device *dev,
				  struct netdev_queue *dev_queue,
				  struct Qdisc *p, u32 parent, u32 handle,
1069 1070
				  struct nlattr **tca, int *errp,
				  struct netlink_ext_ack *extack)
Linus Torvalds's avatar
Linus Torvalds committed
1071 1072
{
	int err;
1073
	struct nlattr *kind = tca[TCA_KIND];
Linus Torvalds's avatar
Linus Torvalds committed
1074 1075
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
1076
	struct qdisc_size_table *stab;
Linus Torvalds's avatar
Linus Torvalds committed
1077 1078

	ops = qdisc_lookup_ops(kind);
1079
#ifdef CONFIG_MODULES
Linus Torvalds's avatar
Linus Torvalds committed
1080 1081
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
1082
		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
Linus Torvalds's avatar
Linus Torvalds committed
1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106
			/* We dropped the RTNL semaphore in order to
			 * perform the module load.  So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request.  We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the mean time.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try again qdisc_lookup_ops,
				 * so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

1107
	err = -ENOENT;
1108 1109
	if (!ops) {
		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
Linus Torvalds's avatar
Linus Torvalds committed
1110
		goto err_out;
1111
	}
Linus Torvalds's avatar
Linus Torvalds committed
1112

1113
	sch = qdisc_alloc(dev_queue, ops, extack);
1114 1115
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
Linus Torvalds's avatar
Linus Torvalds committed
1116
		goto err_out2;
1117
	}
Linus Torvalds's avatar
Linus Torvalds committed
1118

1119 1120
	sch->parent = parent;

1121
	if (handle == TC_H_INGRESS) {
Linus Torvalds's avatar
Linus Torvalds committed
1122
		sch->flags |= TCQ_F_INGRESS;
1123
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
1124
		lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
1125 1126 1127 1128 1129 1130 1131
	} else {
		if (handle == 0) {
			handle = qdisc_alloc_handle(dev);
			err = -ENOMEM;
			if (handle == 0)
				goto err_out3;
		}
1132
		lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
1133
		if (!netif_is_multiqueue(dev))
1134
			sch->flags |= TCQ_F_ONETXQUEUE;
Linus Torvalds's avatar
Linus Torvalds committed
1135 1136
	}

1137
	sch->handle = handle;
Linus Torvalds's avatar
Linus Torvalds committed
1138

1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149
	/* This exist to keep backward compatible with a userspace
	 * loophole, what allowed userspace to get IFF_NO_QUEUE
	 * facility on older kernels by setting tx_queue_len=0 (prior
	 * to qdisc init), and then forgot to reinit tx_queue_len
	 * before again attaching a qdisc.
	 */
	if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
		netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
	}

1150 1151 1152 1153
	err = qdisc_block_indexes_set(sch, tca, extack);
	if (err)
		goto err_out3;

1154
	if (ops->init) {
1155
		err = ops->init(sch, tca[TCA_OPTIONS], extack);
1156 1157 1158
		if (err != 0)
			goto err_out5;
	}
1159

1160
	if (tca[TCA_STAB]) {
1161
		stab = qdisc_get_stab(tca[TCA_STAB], extack);
1162 1163 1164
		if (IS_ERR(stab)) {
			err = PTR_ERR(stab);
			goto err_out4;
1165
		}
1166 1167 1168 1169
		rcu_assign_pointer(sch->stab, stab);
	}
	if (tca[TCA_RATE]) {
		seqcount_t *running;
1170

1171
		err = -EOPNOTSUPP;
1172 1173
		if (sch->flags & TCQ_F_MQROOT) {
			NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
1174
			goto err_out4;
1175
		}
Linus Torvalds's avatar
Linus Torvalds committed
1176

1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189
		if (sch->parent != TC_H_ROOT &&
		    !(sch->flags & TCQ_F_INGRESS) &&
		    (!p || !(p->flags & TCQ_F_MQROOT)))
			running = qdisc_root_sleeping_running(sch);
		else
			running = &sch->running;

		err = gen_new_estimator(&sch->bstats,
					sch->cpu_bstats,
					&sch->rate_est,
					NULL,
					running,
					tca[TCA_RATE]);
1190 1191
		if (err) {
			NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
1192
			goto err_out4;
1193
		}
Linus Torvalds's avatar
Linus Torvalds committed
1194
	}
1195 1196 1197 1198 1199 1200

	qdisc_hash_add(sch, false);

	return sch;

err_out5:
1201
	/* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
1202 1203
	if (ops->destroy)
		ops->destroy(sch);
Linus Torvalds's avatar
Linus Torvalds committed
1204 1205
err_out3:
	dev_put(dev);
1206
	qdisc_free(sch);
Linus Torvalds's avatar
Linus Torvalds committed
1207 1208 1209 1210 1211
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;
1212 1213 1214 1215 1216 1217

err_out4:
	/*
	 * Any broken qdiscs that would require a ops->reset() here?
	 * The qdisc was never in action so it shouldn't be necessary.
	 */
1218
	qdisc_put_stab(rtnl_dereference(sch->stab));
1219 1220 1221
	if (ops->destroy)
		ops->destroy(sch);
	goto err_out3;
Linus Torvalds's avatar
Linus Torvalds committed
1222 1223
}

1224 1225
static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
			struct netlink_ext_ack *extack)
Linus Torvalds's avatar
Linus Torvalds committed
1226
{
1227
	struct qdisc_size_table *ostab, *stab = NULL;
1228
	int err = 0;
Linus Torvalds's avatar
Linus Torvalds committed
1229

1230
	if (tca[TCA_OPTIONS]) {
1231 1232
		if (!sch->ops->change) {
			NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
Linus Torvalds's avatar
Linus Torvalds committed
1233
			return -EINVAL;
1234
		}
1235 1236 1237 1238
		if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
			NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
			return -EOPNOTSUPP;
		}
1239
		err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
Linus Torvalds's avatar
Linus Torvalds committed
1240 1241 1242
		if (err)
			return err;
	}
1243 1244

	if (tca[TCA_STAB]) {
1245
		stab = qdisc_get_stab(tca[TCA_STAB], extack);
1246 1247 1248 1249
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

1250 1251 1252
	ostab = rtnl_dereference(sch->stab);
	rcu_assign_pointer(sch->stab, stab);
	qdisc_put_stab(ostab);
1253

1254
	if (tca[TCA_RATE]) {
1255 1256
		/* NB: ignores errors from replace_estimator
		   because change can't be undone. */
1257 1258
		if (sch->flags & TCQ_F_MQROOT)
			goto out;
1259 1260 1261
		gen_replace_estimator(&sch->bstats,
				      sch->cpu_bstats,
				      &sch->rate_est,
1262 1263
				      NULL,
				      qdisc_root_sleeping_running(sch),
1264
				      tca[TCA_RATE]);
1265 1266
	}
out:
Linus Torvalds's avatar
Linus Torvalds committed
1267 1268 1269
	return 0;
}

Eric Dumazet's avatar
Eric Dumazet committed
1270 1271
/*
 * Walker state for check_loop(). The embedded qdisc_walker is handed to
 * cl_ops->walk(); check_loop_fn() converts the walker pointer it receives
 * back into this structure.
 */
struct check_loop_arg {
	struct qdisc_walker	w;
	struct Qdisc		*p;	/* qdisc that must not be found as a leaf */
	int			depth;	/* current nesting level of the search */
};

/* Forward declaration: check_loop() and check_loop_fn() call each other. */
static int check_loop_fn(struct Qdisc *q, unsigned long cl,
			 struct qdisc_walker *w);
Linus Torvalds's avatar
Linus Torvalds committed
1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg	arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
1298
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
Linus Torvalds's avatar
Linus Torvalds committed
1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}

1310
const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
1311 1312
	[TCA_KIND]		= { .type = NLA_NUL_STRING,
				    .len = IFNAMSIZ - 1 },
1313 1314 1315 1316 1317 1318 1319 1320 1321
	[TCA_RATE]		= { .type = NLA_BINARY,
				    .len = sizeof(struct tc_estimator) },
	[TCA_STAB]		= { .type = NLA_NESTED },
	[TCA_DUMP_INVISIBLE]	= { .type = NLA_FLAG },
	[TCA_CHAIN]		= { .type = NLA_U32 },
	[TCA_INGRESS_BLOCK]	= { .type = NLA_U32 },
	[TCA_EGRESS_BLOCK]	= { .type = NLA_U32 },
};

1322 1323 1324 1325
/*
 * Delete/get qdisc.
 */

1326 1327
static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
			struct netlink_ext_ack *extack)
Linus Torvalds's avatar
Linus Torvalds committed
1328
{
1329
	struct net *net = sock_net(skb->sk);
1330
	struct tcmsg *tcm = nlmsg_data(n);
1331
	struct nlattr *tca[TCA_MAX + 1];
Linus Torvalds's avatar
Linus Torvalds committed
1332
	struct net_device *dev;
1333
	u32 clid;
Linus Torvalds's avatar
Linus Torvalds committed
1334 1335 1336 1337
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

1338
	if ((n->nlmsg_type != RTM_GETQDISC) &&
1339
	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1340 1341
		return -EPERM;

1342 1343
	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
			  extack);
1344 1345 1346
	if (err < 0)
		return err;

1347 1348 1349 1350 1351
	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	clid = tcm->tcm_parent;
Linus Torvalds's avatar
Linus Torvalds committed
1352 1353 1354
	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
Eric Dumazet's avatar
Eric Dumazet committed
1355
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
1356 1357
				if (!p) {
					NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
Linus Torvalds's avatar
Linus Torvalds committed
1358
					return -ENOENT;
1359
				}
Linus Torvalds's avatar
Linus Torvalds committed
1360
				q = qdisc_leaf(p, clid);
Eric Dumazet's avatar
Eric Dumazet committed
1361 1362
			} else if (dev_ingress_queue(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
1363
			}
Linus Torvalds's avatar
Linus Torvalds committed
1364
		} else {
1365
			q = dev->qdisc;
Linus Torvalds's avatar
Linus Torvalds committed
1366
		}
1367 1368
		if (!q) {
			NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
Linus Torvalds's avatar
Linus Torvalds committed
1369
			return -ENOENT;
1370
		}
Linus Torvalds's avatar
Linus Torvalds committed
1371

1372 1373
		if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
			NL_SET_ERR_MSG(extack, "Invalid handle");
Linus Torvalds's avatar
Linus Torvalds committed
1374
			return -EINVAL;
1375
		}
Linus Torvalds's avatar
Linus Torvalds committed
1376
	} else {
Eric Dumazet's avatar
Eric Dumazet committed
1377
		q = qdisc_lookup(dev, tcm->tcm_handle);
1378 1379
		if (!q) {
			NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
Linus Torvalds's avatar
Linus Torvalds committed
1380
			return -ENOENT;
1381
		}
Linus Torvalds's avatar
Linus Torvalds committed
1382 1383
	}

1384 1385
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
Linus Torvalds's avatar
Linus Torvalds committed
1386
		return -EINVAL;
1387
	}
Linus Torvalds's avatar
Linus Torvalds committed
1388 1389

	if (n->nlmsg_type == RTM_DELQDISC) {
1390 1391
		if (!clid) {
			NL_SET_ERR_MSG(extack, "Classid cannot be zero");
Linus Torvalds's avatar
Linus Torvalds committed
1392
			return -EINVAL;
1393 1394 1395
		}
		if (q->handle == 0) {
			NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
Linus Torvalds's avatar
Linus Torvalds committed
1396
			return -ENOENT;
1397 1398
		}
		err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
Eric Dumazet's avatar
Eric Dumazet committed
1399
		if (err != 0)
Linus Torvalds's avatar
Linus Torvalds committed
1400 1401
			return err;
	} else {
1402
		qdisc_notify(net, skb, n, clid, NULL, q);
Linus Torvalds's avatar
Linus Torvalds committed
1403 1404 1405 1406 1407
	}
	return 0;
}

/*
Eric Dumazet's avatar
Eric Dumazet committed
1408
 * Create/change qdisc.
Linus Torvalds's avatar
Linus Torvalds committed
1409 1410
 */

1411 1412
static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
			   struct netlink_ext_ack *extack)
Linus Torvalds's avatar
Linus Torvalds committed
1413
{
1414
	struct net *net = sock_net(skb->sk);
Linus Torvalds's avatar
Linus Torvalds committed
1415
	struct tcmsg *tcm;
1416
	struct nlattr *tca[TCA_MAX + 1];
Linus Torvalds's avatar
Linus Torvalds committed
1417 1418 1419 1420 1421
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

1422
	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1423 1424
		return -EPERM;

Linus Torvalds's avatar
Linus Torvalds committed
1425 1426
replay:
	/* Reinit, just in case something touches this. */
1427 1428
	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
			  extack);
1429 1430 1431
	if (err < 0)
		return err;

1432
	tcm = nlmsg_data(n);
Linus Torvalds's avatar
Linus Torvalds committed
1433 1434 1435
	clid = tcm->tcm_parent;
	q = p = NULL;

Eric Dumazet's avatar
Eric Dumazet committed
1436 1437
	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
Linus Torvalds's avatar
Linus Torvalds committed
1438 1439
		return -ENODEV;

1440

Linus Torvalds's avatar
Linus Torvalds committed
1441 1442 1443
	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
Eric Dumazet's avatar
Eric Dumazet committed
1444
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
1445 1446
				if (!p) {
					NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
Linus Torvalds's avatar
Linus Torvalds committed
1447
					return -ENOENT;
1448
				}
Linus Torvalds's avatar
Linus Torvalds committed
1449
				q = qdisc_leaf(p, clid);
Eric Dumazet's avatar
Eric Dumazet committed
1450 1451
			} else if (dev_ingress_queue_create(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
Linus Torvalds's avatar
Linus Torvalds committed
1452 1453
			}
		} else {
1454
			q = dev->qdisc;
Linus Torvalds's avatar
Linus Torvalds committed
1455 1456 1457 1458 1459 1460 1461 1462
		}

		/* It may be default qdisc, ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
1463 1464
				if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
					NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
Linus Torvalds's avatar
Linus Torvalds committed
1465
					return -EEXIST;
1466 1467 1468
				}
				if (TC_H_MIN(tcm->tcm_handle)) {
					NL_SET_ERR_MSG(extack, "Invalid minor handle");
Linus Torvalds's avatar
Linus Torvalds committed
1469
					return -EINVAL;
1470
				}
Eric Dumazet's avatar
Eric Dumazet committed
1471
				q = qdisc_lookup(dev, tcm->tcm_handle);
1472
				if (!q)
Linus Torvalds's avatar
Linus Torvalds committed
1473
					goto create_n_graft;
1474 1475
				if (n->nlmsg_flags & NLM_F_EXCL) {
					NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
Linus Torvalds's avatar
Linus Torvalds committed
1476
					return -EEXIST;
1477
				}
1478
				if (tca[TCA_KIND] &&
1479 1480
				    nla_strcmp(tca[TCA_KIND], q->ops->id)) {
					NL_SET_ERR_MSG(extack, "Invalid qdisc name");
Linus Torvalds's avatar
Linus Torvalds committed
1481
					return -EINVAL;
1482
				}
Linus Torvalds's avatar
Linus Torvalds committed
1483
				if (q == p ||
1484 1485
				    (p && check_loop(q, p, 0))) {
					NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
Linus Torvalds's avatar
Linus Torvalds committed
1486
					return -ELOOP;
1487
				}
1488
				qdisc_refcount_inc(q);
Linus Torvalds's avatar
Linus Torvalds committed
1489 1490
				goto graft;
			} else {
Eric Dumazet's avatar
Eric Dumazet committed
1491
				if (!q)
Linus Torvalds's avatar
Linus Torvalds committed
1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know, that some child q is already
				 *   attached to this parent and have choice:
				 *   either to change it or to create/graft new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if CREATE and REPLACE flags are set.
				 *
				 *   2. If EXCL is set, requestor wanted to say,
				 *   that qdisc tcm_handle is not expected
				 *   to exist, so that we choose create/graft too.
				 *
				 *   3. The last case is when no flags are set.
				 *   Alas, it is sort of hole in API, we
				 *   cannot decide what to do unambiguously.
				 *   For now we select create/graft, if
				 *   user gave KIND, which does not match existing.
				 */
Eric Dumazet's avatar
Eric Dumazet committed
1513 1514 1515
				if ((n->nlmsg_flags & NLM_F_CREATE) &&
				    (n->nlmsg_flags & NLM_F_REPLACE) &&
				    ((n->nlmsg_flags & NLM_F_EXCL) ||
1516 1517
				     (tca[TCA_KIND] &&
				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
Linus Torvalds's avatar
Linus Torvalds committed
1518 1519 1520 1521
					goto create_n_graft;
			}
		}
	} else {
1522 1523
		if (!tcm->tcm_handle) {
			NL_SET_ERR_MSG(extack, "Handle cannot be zero");
Linus Torvalds's avatar
Linus Torvalds committed
1524
			return -EINVAL;
1525
		}
Linus Torvalds's avatar
Linus Torvalds committed
1526 1527 1528 1529
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
1530 1531
	if (!q) {
		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
Linus Torvalds's avatar
Linus Torvalds committed
1532
		return -ENOENT;
1533 1534 1535
	}
	if (n->nlmsg_flags & NLM_F_EXCL) {
		NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
Linus Torvalds's avatar
Linus Torvalds committed
1536
		return -EEXIST;
1537 1538 1539
	}
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
Linus Torvalds's avatar
Linus Torvalds committed
1540
		return -EINVAL;
1541 1542
	}
	err = qdisc_change(q, tca, extack);
Linus Torvalds's avatar
Linus Torvalds committed
1543
	if (err == 0)
1544
		qdisc_notify(net, skb, n, clid, NULL, q);
Linus Torvalds's avatar
Linus Torvalds committed
1545 1546 1547
	return err;

create_n_graft:
1548 1549
	if (!(n->nlmsg_flags & NLM_F_CREATE)) {
		NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
Linus Torvalds's avatar
Linus Torvalds committed
1550
		return -ENOENT;
1551
	}
1552
	if (clid == TC_H_INGRESS) {
1553
		if (dev_ingress_queue(dev)) {
1554 1555
			q = qdisc_create(dev, dev_ingress_queue(dev), p,
					 tcm->tcm_parent, tcm->tcm_parent,
1556 1557 1558
					 tca, &err, extack);
		} else {
			NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
1559
			err = -ENOENT;
1560
		}
1561
	} else {
1562
		struct netdev_queue *dev_queue;
1563 1564

		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1565 1566 1567 1568 1569
			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
		else if (p)
			dev_queue = p->dev_queue;
		else
			dev_queue = netdev_get_tx_queue(dev, 0);
1570

1571
		q = qdisc_create(dev, dev_queue, p,
1572
				 tcm->tcm_parent, tcm->tcm_handle,
1573
				 tca, &err, extack);
1574
	}
Linus Torvalds's avatar
Linus Torvalds committed
1575 1576 1577 1578 1579 1580 1581
	if (q == NULL) {
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
1582
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
1583 1584 1585 1586
	if (err) {
		if (q)
			qdisc_destroy(q);
		return err;
Linus Torvalds's avatar
Linus Torvalds committed
1587
	}
1588

Linus Torvalds's avatar
Linus Torvalds committed
1589 1590 1591
	return 0;
}

1592 1593
/*
 * Dump @root and, when @recur is set and @root has a device, every qdisc
 * in that device's qdisc_hash into @skb.
 *
 * *@q_idx_p is the running qdisc index for the current device; entries
 * with an index below @s_q_idx are counted but not emitted. The updated
 * index is written back through @q_idx_p on every exit except the early
 * return for a NULL @root.
 *
 * Returns 0 when everything fit, -1 as soon as tc_fill_qdisc() returns
 * a value <= 0 for a qdisc that is not filtered out by
 * tc_qdisc_dump_ignore().
 */
static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
			      struct netlink_callback *cb,
			      int *q_idx_p, int s_q_idx, bool recur,
			      bool dump_invisible)
{
	int ret = 0, q_idx = *q_idx_p;
	struct Qdisc *q;
	int b;

	if (!root)
		return 0;

	q = root;
	if (q_idx < s_q_idx) {
		q_idx++;
	} else {
		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
				  RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

	/* If dumping singletons, there is no qdisc_dev(root) and the singleton
	 * itself has already been dumped.
	 *
	 * If we've already dumped the top-level (ingress) qdisc above and the global
	 * qdisc hashtable, we don't want to hit it again
	 */
	if (!qdisc_dev(root) || !recur)
		goto out;

	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
		if (q_idx < s_q_idx) {
			q_idx++;
			continue;
		}
		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
				  RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

out:
	*q_idx_p = q_idx;
	return ret;
done:
	ret = -1;
	goto out;
}

Linus Torvalds's avatar
Linus Torvalds committed
1646 1647
static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
1648
	struct net *net = sock_net(skb->sk);
Linus Torvalds's avatar
Linus Torvalds committed
1649 1650 1651
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;
1652 1653 1654
	const struct nlmsghdr *nlh = cb->nlh;
	struct nlattr *tca[TCA_MAX + 1];
	int err;
Linus Torvalds's avatar
Linus Torvalds committed
1655 1656 1657

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];
1658

1659
	idx = 0;
1660
	ASSERT_RTNL();
1661

1662 1663
	err = nlmsg_parse(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
			  rtm_tca_policy, NULL);
1664 1665 1666
	if (err < 0)
		return err;

1667
	for_each_netdev(net, dev) {
1668 1669
		struct netdev_queue *dev_queue;

Linus Torvalds's avatar
Linus Torvalds committed
1670
		if (idx < s_idx)
1671
			goto cont;
Linus Torvalds's avatar
Linus Torvalds committed
1672 1673 1674
		if (idx > s_idx)
			s_q_idx = 0;
		q_idx = 0;
1675

1676
		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx,
1677
				       true, tca[TCA_DUMP_INVISIBLE]) < 0)
1678 1679
			goto done;

1680 1681 1682
		dev_queue = dev_ingress_queue(dev);
		if (dev_queue &&
		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1683 1684
				       &q_idx, s_q_idx, false,
				       tca[TCA_DUMP_INVISIBLE]) < 0)
1685 1686
			goto done;

1687 1688
cont:
		idx++;
Linus Torvalds's avatar
Linus Torvalds committed
1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703
	}

done:
	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}



/************************************************
 *	Traffic classes manipulation.		*
 ************************************************/

1704 1705 1706 1707 1708 1709 1710 1711 1712
/*
 * Append one netlink message of type @event describing class @cl of
 * qdisc @q to @skb.
 *
 * The message carries a tcmsg header, TCA_KIND, whatever the class ops'
 * ->dump() adds, and a statistics section filled by ->dump_stats().
 * Both tcm_parent and tcm_handle are initialised to q->handle before
 * ->dump() is called with the tcmsg.
 *
 * Returns skb->len on success. On any failure the skb is trimmed back to
 * its length on entry and -1 is returned.
 */
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 portid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 NULL, &d, TCA_PAD) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	/* Final message length: everything appended since entry. */
	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int tclass_notify(struct net *net, struct sk_buff *oskb,
			 struct nlmsghdr *n, struct Qdisc *q,
			 unsigned long cl, int event)
{
	struct sk_buff *skb;
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
			      n->nlmsg_flags & NLM_F_ECHO);
}
Linus Torvalds's avatar
Linus Torvalds committed
1769

1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801
static int tclass_del_notify(struct net *net,
			     const struct Qdisc_class_ops *cops,
			     struct sk_buff *oskb, struct nlmsghdr *n,
			     struct Qdisc *q, unsigned long cl)
{
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
	struct sk_buff *skb;
	int err = 0;

	if (!cops->delete)
		return -EOPNOTSUPP;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
			   RTM_DELTCLASS) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	err = cops->delete(q, cl);
	if (err) {
		kfree_skb(skb);
		return err;
	}

	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
			      n->nlmsg_flags & NLM_F_ECHO);
}

1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814
#ifdef CONFIG_NET_CLS

/*
 * Walker argument used by tc_bind_tclass(): the embedded tcf_walker is
 * passed to tp->ops->walk(), and tcf_node_bind() casts the walker pointer
 * it receives back to this structure.
 */
struct tcf_bind_args {
	struct tcf_walker w;
	/* passed as the second argument of ->bind_class() */
	u32 classid;
	/* passed as the third argument of ->bind_class() */
	unsigned long cl;
};

static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
{
	struct tcf_bind_args *a = (void *)arg;

	if (tp->ops->bind_class) {
1815 1816 1817
		struct Qdisc *q = tcf_block_q(tp->chain->block);

		sch_tree_lock(q);
1818
		tp->ops->bind_class(n, a->classid, a->cl);
1819
		sch_tree_unlock(q);
1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834
	}
	return 0;
}

/*
 * Re-bind filters to class @clid using the class value @new_cl.
 *
 * Despite its name, @portid is a class id here: it is looked up with
 * cops->find(), and the filter block of that class is the one walked.
 * Every filter of every chain in that block is visited with
 * tcf_node_bind(), which passes (@clid, @new_cl) to ->bind_class().
 * Nothing is done if the class is not found, the qdisc has no
 * ->tcf_block(), or the class has no block.
 */
static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
			   unsigned long new_cl)
{
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct tcf_block *block;
	struct tcf_chain *chain;
	unsigned long cl;

	cl = cops->find(q, portid);
	if (!cl)
		return;
	if (!cops->tcf_block)
		return;
	block = cops->tcf_block(q, cl, NULL);
	if (!block)
		return;
	list_for_each_entry(chain, &block->chain_list, list) {
		struct tcf_proto *tp;

		for (tp = rtnl_dereference(chain->filter_chain);
		     tp; tp = rtnl_dereference(tp->next)) {
			struct tcf_bind_args arg = {};

			arg.w.fn = tcf_node_bind;
			arg.classid = clid;
			arg.cl = new_cl;
			tp->ops->walk(tp, &arg.w);
		}
	}
}

#else

/* No-op variant of tc_bind_tclass(), built when CONFIG_NET_CLS is not defined. */
static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
			   unsigned long new_cl)
{
}

#endif

1864 1865
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
			 struct netlink_ext_ack *extack)
Linus Torvalds's avatar
Linus Torvalds committed
1866
{
1867
	struct net *net = sock_net(skb->sk);
1868
	struct tcmsg *tcm = nlmsg_data(n);
1869
	struct nlattr *tca[TCA_MAX + 1];
Linus Torvalds's avatar
Linus Torvalds committed
1870 1871
	struct net_device *dev;
	struct Qdisc *q = NULL;
1872
	const struct Qdisc_class_ops *cops;
Linus Torvalds's avatar
Linus Torvalds committed
1873 1874
	unsigned long cl = 0;
	unsigned long new_cl;
1875 1876 1877
	u32 portid;
	u32 clid;
	u32 qid;
Linus Torvalds's avatar
Linus Torvalds committed
1878 1879
	int err;

1880
	if ((n->nlmsg_type != RTM_GETTCLASS) &&
1881
	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1882 1883
		return -EPERM;

1884 1885
	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
			  extack);
1886 1887 1888
	if (err < 0)
		return err;

1889 1890 1891 1892
	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

Linus Torvalds's avatar
Linus Torvalds committed
1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907
	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - clear.
	   handle == X:0	 - root class.
	 */

	/* Step 1. Determine qdisc handle X:0 */

1908 1909 1910 1911
	portid = tcm->tcm_parent;
	clid = tcm->tcm_handle;
	qid = TC_H_MAJ(clid);

1912 1913
	if (portid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(portid);
Linus Torvalds's avatar
Linus Torvalds committed
1914 1915 1916 1917 1918 1919 1920 1921

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
1922
			qid = dev->qdisc->handle;
Linus Torvalds's avatar
Linus Torvalds committed
1923 1924

		/* Now qid is genuine qdisc handle consistent
Eric Dumazet's avatar
Eric Dumazet committed
1925 1926
		 * both with parent and child.
		 *
1927
		 * TC_H_MAJ(portid) still may be unspecified, complete it now.
Linus Torvalds's avatar
Linus Torvalds committed
1928
		 */
1929 1930
		if (portid)
			portid = TC_H_MAKE(qid, portid);
Linus Torvalds's avatar
Linus Torvalds committed
1931 1932
	} else {
		if (qid == 0)
1933
			qid = dev->qdisc->handle;
Linus Torvalds's avatar
Linus Torvalds committed
1934 1935 1936
	}

	/* OK. Locate qdisc */
Eric Dumazet's avatar
Eric Dumazet committed
1937 1938
	q = qdisc_lookup(dev, qid);
	if (!q)
Linus Torvalds's avatar
Linus Torvalds committed
1939 1940 1941 1942 1943 1944 1945 1946 1947
		return -ENOENT;

	/* An check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
1948
		if (portid == TC_H_ROOT)
Linus Torvalds's avatar
Linus Torvalds committed
1949 1950 1951 1952 1953
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
1954
		cl = cops->find(q, clid);
Linus Torvalds's avatar
Linus Torvalds committed
1955 1956 1957

	if (cl == 0) {
		err = -ENOENT;
Eric Dumazet's avatar
Eric Dumazet committed
1958 1959
		if (n->nlmsg_type != RTM_NEWTCLASS ||
		    !(n->nlmsg_flags & NLM_F_CREATE))
Linus Torvalds's avatar
Linus Torvalds committed
1960 1961 1962
			goto out;
	} else {
		switch (n->nlmsg_type) {
1963
		case RTM_NEWTCLASS:
Linus Torvalds's avatar
Linus Torvalds committed
1964
			err = -EEXIST;
Eric Dumazet's avatar
Eric Dumazet committed
1965
			if (n->nlmsg_flags & NLM_F_EXCL)
Linus Torvalds's avatar
Linus Torvalds committed
1966 1967 1968
				goto out;
			break;
		case RTM_DELTCLASS:
1969
			err = tclass_del_notify(net, cops, skb, n, q, cl);
1970 1971
			/* Unbind the class with flilters with 0 */
			tc_bind_tclass(q, portid, clid, 0);
Linus Torvalds's avatar
Linus Torvalds committed
1972 1973
			goto out;
		case RTM_GETTCLASS:
1974
			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
Linus Torvalds's avatar
Linus Torvalds committed
1975 1976 1977 1978 1979 1980 1981
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

1982 1983 1984 1985 1986
	if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
		NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
		return -EOPNOTSUPP;
	}

Linus Torvalds's avatar
Linus Torvalds committed
1987
	new_cl = cl;
1988 1989
	err = -EOPNOTSUPP;
	if (cops->change)
1990
		err = cops->change(q, clid, portid, tca, &new_cl, extack);
1991
	if (err == 0) {
1992
		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
1993 1994 1995 1996
		/* We just create a new class, need to do reverse binding. */
		if (cl != new_cl)
			tc_bind_tclass(q, portid, clid, new_cl);
	}
Linus Torvalds's avatar
Linus Torvalds committed
1997 1998 1999 2000
out:
	return err;
}

Eric Dumazet's avatar
Eric Dumazet committed
2001 2002 2003 2004
/*
 * Context handed to the class walker while dumping the classes of a qdisc.
 *
 * The embedded walker has to stay the first member: qdisc_class_dump()
 * receives a pointer to it and casts that pointer back to this structure.
 */
struct qdisc_dump_args {
	struct qdisc_walker	w;	/* walker passed to cl_ops->walk() */
	struct sk_buff		*skb;	/* buffer the class records go into */
	struct netlink_callback	*cb;	/* netlink dump callback state */
};

2007 2008
static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
			    struct qdisc_walker *arg)
Linus Torvalds's avatar
Linus Torvalds committed
2009 2010 2011
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

2012
	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2013 2014
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
			      RTM_NEWTCLASS);
Linus Torvalds's avatar
Linus Torvalds committed
2015 2016
}

2017 2018 2019 2020 2021 2022
static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

2023
	if (tc_qdisc_dump_ignore(q, false) ||
2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop  = 0;
	arg.w.skip = cb->args[1];
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	cb->args[1] = arg.w.count;
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}

static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
			       struct tcmsg *tcm, struct netlink_callback *cb,
			       int *t_p, int s_t)
{
	struct Qdisc *q;
2051
	int b;
2052 2053 2054 2055 2056 2057 2058

	if (!root)
		return 0;

	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
		return -1;

2059 2060 2061
	if (!qdisc_dev(root))
		return 0;

2062 2063
	if (tcm->tcm_parent) {
		q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
2064 2065
		if (q && q != root &&
		    tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2066 2067 2068
			return -1;
		return 0;
	}
2069
	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
2070 2071 2072 2073 2074 2075 2076
		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
	}

	return 0;
}

Linus Torvalds's avatar
Linus Torvalds committed
2077 2078
static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
2079
	struct tcmsg *tcm = nlmsg_data(cb->nlh);
2080
	struct net *net = sock_net(skb->sk);
2081
	struct netdev_queue *dev_queue;
Linus Torvalds's avatar
Linus Torvalds committed
2082
	struct net_device *dev;
2083
	int t, s_t;
Linus Torvalds's avatar
Linus Torvalds committed
2084

2085
	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
Linus Torvalds's avatar
Linus Torvalds committed
2086
		return 0;
Eric Dumazet's avatar
Eric Dumazet committed
2087 2088
	dev = dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
Linus Torvalds's avatar
Linus Torvalds committed
2089 2090 2091 2092 2093
		return 0;

	s_t = cb->args[0];
	t = 0;

2094
	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
2095 2096
		goto done;

2097 2098 2099 2100
	dev_queue = dev_ingress_queue(dev);
	if (dev_queue &&
	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
				&t, s_t) < 0)
2101
		goto done;
Linus Torvalds's avatar
Linus Torvalds committed
2102

2103
done:
Linus Torvalds's avatar
Linus Torvalds committed
2104 2105 2106 2107 2108 2109 2110 2111 2112 2113
	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}

#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "%08x %08x %08x %08x\n",
2114
		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
Patrick McHardy's avatar
Patrick McHardy committed
2115
		   1000000,
2116
		   (u32)NSEC_PER_SEC / hrtimer_resolution);
Linus Torvalds's avatar
Linus Torvalds committed
2117 2118 2119 2120

	return 0;
}

2121 2122 2123 2124
/*
 * Per-netns init hook: create the "psched" proc entry under net->proc_net,
 * served by psched_show().
 *
 * Returns 0 on success, -ENOMEM if the entry could not be created.
 */
static int __net_init psched_net_init(struct net *net)
{
	if (!proc_create_single("psched", 0, net->proc_net, psched_show))
		return -ENOMEM;

	return 0;
}

/* Per-netns exit hook: remove the "psched" entry from net->proc_net. */
static void __net_exit psched_net_exit(struct net *net)
{
	remove_proc_entry("psched", net->proc_net);
}
#else
/* Without CONFIG_PROC_FS there is no proc entry to create; always succeeds. */
static int __net_init psched_net_init(struct net *net)
{
	return 0;
}

/* Without CONFIG_PROC_FS there is no proc entry to remove; nothing to do. */
static void __net_exit psched_net_exit(struct net *net)
{
}
Linus Torvalds's avatar
Linus Torvalds committed
2145 2146
#endif

2147 2148 2149 2150 2151
/*
 * Per-network-namespace init/exit hooks for the packet scheduler;
 * registered from pktsched_init() via register_pernet_subsys().
 */
static struct pernet_operations psched_net_ops = {
	.init = psched_net_init,
	.exit = psched_net_exit,
};

Linus Torvalds's avatar
Linus Torvalds committed
2152 2153
/*
 * Packet scheduler initialisation.
 *
 * Registers the per-netns operations first and bails out with their error
 * code if that fails.  Afterwards registers the built-in qdisc ops and the
 * rtnetlink handlers for the qdisc and traffic-class message types; the
 * results of those registrations are not checked.
 *
 * Returns 0 on success or the error from register_pernet_subsys().
 */
static int __init pktsched_init(void)
{
	int err = register_pernet_subsys(&psched_net_ops);

	if (err) {
		pr_err("pktsched_init: cannot initialize per netns operations\n");
		return err;
	}

	/* Built-in queueing disciplines. */
	register_qdisc(&pfifo_fast_ops);
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	register_qdisc(&pfifo_head_drop_qdisc_ops);
	register_qdisc(&mq_qdisc_ops);
	register_qdisc(&noqueue_qdisc_ops);

	/* Qdisc messages: only RTM_GETQDISC has a dump handler. */
	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC,
		      tc_get_qdisc, tc_dump_qdisc, 0);

	/* Class messages: all share tc_ctl_tclass as the doit handler. */
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS,
		      tc_ctl_tclass, tc_dump_tclass, 0);

	return 0;
}

/* Hook pktsched_init() into the subsys initcall stage. */
subsys_initcall(pktsched_init);