Commit 44306f15 authored by Jianxin Xiong, committed by Doug Ledford

IB/hfi1: Reduce kernel context pio buffer allocation

The pio buffers were pooled evenly among all kernel contexts and
user contexts. However, the demand from kernel contexts is much
lower than that from user contexts. This patch reduces the
allocation for kernel contexts and thus makes more credits
available to PSM, helping performance. This is especially useful
on high core-count systems, where large numbers of contexts are used.

A new context type, SC_VL15, is added to distinguish the context used
for VL15 from the other kernel contexts. The reason is that VL15 needs
to support 2KB-sized packets, while the other kernel contexts need only
support packets up to the size determined by "piothreshold", which
has a default value of 256.

The new allocation method allows triple buffering of the largest pio
packets configured for these contexts. This is sufficient to maintain
verbs performance. The largest pio packet size is 2048B for VL15
and "piothreshold" for the other kernel contexts. A cap is applied to
"piothreshold" to avoid excessive buffer allocation.

The special case where SDMA is disabled is handled differently. In
that case, the original pooling allocation is used to better
support the much higher pio traffic.

Note that if adaptive pio is disabled (piothreshold == 0) while SDMA
is enabled, the pio buffer size doesn't matter for non-VL15 kernel
send contexts, because pio is not used on those contexts at all, so
the new allocation is still valid. If SDMA is disabled, the pooling
allocation is used as mentioned in the previous paragraph.

An adjustment is also made to the calculation of the credit return
threshold for kernel contexts. Instead of basing it purely on the
MTU size, a percentage-based threshold is also considered and the
smaller of the two is chosen. This is necessary to ensure that, with
the reduced buffer allocation, credits are returned in time to avoid
unnecessary stalls in the send path.
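
For reference, a minimal sketch of the combined threshold selection.
The percent formula matches sc_percent_to_threshold() as visible in
the pio.c diff below; the MTU-based value is taken as an input, since
sc_mtu_to_threshold() is not shown in full here, and
kernel_cr_threshold() is a hypothetical helper name.

	typedef unsigned int u32;

	/* same formula as sc_percent_to_threshold() in the diff below */
	static u32 percent_to_threshold(u32 credits, u32 percent)
	{
		return (credits * percent) / 100;
	}

	/*
	 * Kernel contexts: cap the MTU-based threshold (the result of
	 * sc_mtu_to_threshold()) at half the context's credits, so the
	 * reduced allocation still returns credits promptly.
	 */
	static u32 kernel_cr_threshold(u32 credits, u32 mtu_threshold)
	{
		u32 half = percent_to_threshold(credits, 50);

		return half < mtu_threshold ? half : mtu_threshold;
	}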
Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Reviewed-by: Dean Luick <dean.luick@intel.com>
Reviewed-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Reviewed-by: Mark Debbage <mark.debbage@intel.com>
Reviewed-by: Jubin John <jubin.john@intel.com>
Signed-off-by: Jianxin Xiong <jianxin.xiong@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
parent 0852d241
...
@@ -5661,7 +5661,7 @@ static int sc_to_vl(struct hfi1_devdata *dd, int sw_index)
 	sci = &dd->send_contexts[sw_index];
 	/* there is no information for user (PSM) and ack contexts */
-	if (sci->type != SC_KERNEL)
+	if ((sci->type != SC_KERNEL) && (sci->type != SC_VL15))
 		return -1;
 	sc = sci->sc;
...
@@ -9627,6 +9627,7 @@ static void set_send_length(struct hfi1_pportdata *ppd)
 		& SEND_LEN_CHECK1_LEN_VL15_MASK) <<
 		SEND_LEN_CHECK1_LEN_VL15_SHIFT;
 	int i;
+	u32 thres;

 	for (i = 0; i < ppd->vls_supported; i++) {
 		if (dd->vld[i].mtu > maxvlmtu)
...
@@ -9645,16 +9646,17 @@ static void set_send_length(struct hfi1_pportdata *ppd)
 	/* adjust kernel credit return thresholds based on new MTUs */
 	/* all kernel receive contexts have the same hdrqentsize */
 	for (i = 0; i < ppd->vls_supported; i++) {
-		sc_set_cr_threshold(dd->vld[i].sc,
-				    sc_mtu_to_threshold(dd->vld[i].sc,
-							dd->vld[i].mtu,
-							dd->rcd[0]->
-							rcvhdrqentsize));
+		thres = min(sc_percent_to_threshold(dd->vld[i].sc, 50),
+			    sc_mtu_to_threshold(dd->vld[i].sc,
+						dd->vld[i].mtu,
+						dd->rcd[0]->rcvhdrqentsize));
+		sc_set_cr_threshold(dd->vld[i].sc, thres);
 	}
-	sc_set_cr_threshold(dd->vld[15].sc,
-			    sc_mtu_to_threshold(dd->vld[15].sc,
-						dd->vld[15].mtu,
-						dd->rcd[0]->rcvhdrqentsize));
+	thres = min(sc_percent_to_threshold(dd->vld[15].sc, 50),
+		    sc_mtu_to_threshold(dd->vld[15].sc,
+					dd->vld[15].mtu,
+					dd->rcd[0]->rcvhdrqentsize));
+	sc_set_cr_threshold(dd->vld[15].sc, thres);

 	/* Adjust maximum MTU for the port in DC */
 	dcmtu = maxvlmtu == 10240 ? DCC_CFG_PORT_MTU_CAP_10240 :
...
@@ -12728,12 +12730,13 @@ static int set_up_context_variables(struct hfi1_devdata *dd)
 		dd->num_send_contexts = ret;
 		dd_dev_info(
 			dd,
-			"send contexts: chip %d, used %d (kernel %d, ack %d, user %d)\n",
+			"send contexts: chip %d, used %d (kernel %d, ack %d, user %d, vl15 %d)\n",
 			dd->chip_send_contexts,
 			dd->num_send_contexts,
 			dd->sc_sizes[SC_KERNEL].count,
 			dd->sc_sizes[SC_ACK].count,
-			dd->sc_sizes[SC_USER].count);
+			dd->sc_sizes[SC_USER].count,
+			dd->sc_sizes[SC_VL15].count);
 		ret = 0;	/* success */
 	}
...
...
@@ -413,7 +413,8 @@ static ssize_t diagpkt_send(struct diag_pkt *dp)
 		goto bail;
 	}
 	/* can only use kernel contexts */
-	if (dd->send_contexts[dp->sw_index].type != SC_KERNEL) {
+	if (dd->send_contexts[dp->sw_index].type != SC_KERNEL &&
+	    dd->send_contexts[dp->sw_index].type != SC_VL15) {
 		ret = -EINVAL;
 		goto bail;
 	}
...
...
@@ -139,23 +139,30 @@ void pio_send_control(struct hfi1_devdata *dd, int op)
 /* Send Context Size (SCS) wildcards */
 #define SCS_POOL_0 -1
 #define SCS_POOL_1 -2

 /* Send Context Count (SCC) wildcards */
 #define SCC_PER_VL -1
 #define SCC_PER_CPU -2
 #define SCC_PER_KRCVQ -3
-#define SCC_ACK_CREDITS 32
+
+/* Send Context Size (SCS) constants */
+#define SCS_ACK_CREDITS 32
+#define SCS_VL15_CREDITS 102	/* 3 pkts of 2048B data + 128B header */
+
+#define PIO_THRESHOLD_CEILING 4096

 #define PIO_WAIT_BATCH_SIZE 5

 /* default send context sizes */
 static struct sc_config_sizes sc_config_sizes[SC_MAX] = {
 	[SC_KERNEL] = { .size = SCS_POOL_0,	/* even divide, pool 0 */
-			.count = SCC_PER_VL },/* one per NUMA */
-	[SC_ACK] = { .size = SCC_ACK_CREDITS,
+			.count = SCC_PER_VL },	/* one per NUMA */
+	[SC_ACK] = { .size = SCS_ACK_CREDITS,
 			.count = SCC_PER_KRCVQ },
 	[SC_USER] = { .size = SCS_POOL_0,	/* even divide, pool 0 */
 			.count = SCC_PER_CPU }, /* one per CPU */
+	[SC_VL15] = { .size = SCS_VL15_CREDITS,
+			.count = 1 },
 };
...
@@ -202,7 +209,8 @@ static int wildcard_to_pool(int wc)
 static const char *sc_type_names[SC_MAX] = {
 	"kernel",
 	"ack",
-	"user"
+	"user",
+	"vl15"
 };

 static const char *sc_type_name(int index)
...
@@ -230,6 +238,22 @@ int init_sc_pools_and_sizes(struct hfi1_devdata *dd)
 	int extra;
 	int i;

+	/*
+	 * When SDMA is enabled, kernel context pio packet size is capped by
+	 * "piothreshold". Reduce pio buffer allocation for kernel context by
+	 * setting it to a fixed size. The allocation allows 3-deep buffering
+	 * of the largest pio packets plus up to 128 bytes header, sufficient
+	 * to maintain verbs performance.
+	 *
+	 * When SDMA is disabled, keep the default pooling allocation.
+	 */
+	if (HFI1_CAP_IS_KSET(SDMA)) {
+		u16 max_pkt_size = (piothreshold < PIO_THRESHOLD_CEILING) ?
+				   piothreshold : PIO_THRESHOLD_CEILING;
+		sc_config_sizes[SC_KERNEL].size =
+			3 * (max_pkt_size + 128) / PIO_BLOCK_SIZE;
+	}
+
 	/*
	 * Step 0:
	 *	- copy the centipercents/absolute sizes from the pool config
...
@@ -311,7 +335,7 @@ int init_sc_pools_and_sizes(struct hfi1_devdata *dd)
 		if (i == SC_ACK) {
 			count = dd->n_krcv_queues;
 		} else if (i == SC_KERNEL) {
-			count = (INIT_SC_PER_VL * num_vls) + 1 /* VL15 */;
+			count = INIT_SC_PER_VL * num_vls;
 		} else if (count == SCC_PER_CPU) {
 			count = dd->num_rcv_contexts - dd->n_krcv_queues;
 		} else if (count < 0) {
...
@@ -596,7 +620,7 @@ u32 sc_mtu_to_threshold(struct send_context *sc, u32 mtu, u32 hdrqentsize)
  * Return value is what to write into the CSR: trigger return when
  * unreturned credits pass this count.
  */
-static u32 sc_percent_to_threshold(struct send_context *sc, u32 percent)
+u32 sc_percent_to_threshold(struct send_context *sc, u32 percent)
 {
 	return (sc->credits * percent) / 100;
 }
...
@@ -790,7 +814,10 @@ struct send_context *sc_alloc(struct hfi1_devdata *dd, int type,
 		 * For Ack contexts, set a threshold for half the credits.
 		 * For User contexts use the given percentage. This has been
 		 * sanitized on driver start-up.
-		 * For Kernel contexts, use the default MTU plus a header.
+		 * For Kernel contexts, use the default MTU plus a header
+		 * or half the credits, whichever is smaller. This should
+		 * work for both the 3-deep buffering allocation and the
+		 * pooling allocation.
 		 */
 		if (type == SC_ACK) {
 			thresh = sc_percent_to_threshold(sc, 50);
...
@@ -798,7 +825,9 @@ struct send_context *sc_alloc(struct hfi1_devdata *dd, int type,
 			thresh = sc_percent_to_threshold(sc,
 						user_credit_return_threshold);
 		} else { /* kernel */
-			thresh = sc_mtu_to_threshold(sc, hfi1_max_mtu, hdrqentsize);
+			thresh = min(sc_percent_to_threshold(sc, 50),
+				     sc_mtu_to_threshold(sc, hfi1_max_mtu,
+							 hdrqentsize));
 		}
 		reg = thresh << SC(CREDIT_CTRL_THRESHOLD_SHIFT);
 		/* add in early return */
...
@@ -1531,7 +1560,8 @@ static void sc_piobufavail(struct send_context *sc)
 	unsigned long flags;
 	unsigned i, n = 0;

-	if (dd->send_contexts[sc->sw_index].type != SC_KERNEL)
+	if (dd->send_contexts[sc->sw_index].type != SC_KERNEL &&
+	    dd->send_contexts[sc->sw_index].type != SC_VL15)
 		return;
 	list = &sc->piowait;
 	/*
...
@@ -1900,7 +1930,7 @@ int init_pervl_scs(struct hfi1_devdata *dd)
 	u32 ctxt;
 	struct hfi1_pportdata *ppd = dd->pport;

-	dd->vld[15].sc = sc_alloc(dd, SC_KERNEL,
+	dd->vld[15].sc = sc_alloc(dd, SC_VL15,
 				  dd->rcd[0]->rcvhdrqentsize, dd->node);
 	if (!dd->vld[15].sc)
 		goto nomem;
...
...
@@ -51,7 +51,8 @@
 #define SC_KERNEL 0
 #define SC_ACK 1
 #define SC_USER 2
-#define SC_MAX 3
+#define SC_VL15 3
+#define SC_MAX 4

 /* invalid send context index */
 #define INVALID_SCI 0xff
...
@@ -293,6 +294,7 @@ void sc_group_release_update(struct hfi1_devdata *dd, u32 hw_context);
 void sc_add_credit_return_intr(struct send_context *sc);
 void sc_del_credit_return_intr(struct send_context *sc);
 void sc_set_cr_threshold(struct send_context *sc, u32 new_threshold);
+u32 sc_percent_to_threshold(struct send_context *sc, u32 percent);
 u32 sc_mtu_to_threshold(struct send_context *sc, u32 mtu, u32 hdrqentsize);
 void hfi1_sc_wantpiobuf_intr(struct send_context *sc, u32 needint);
 void sc_wait(struct hfi1_devdata *dd);
...