cxl: Calculate region bandwidth of targets with shared upstream link

The current bandwidth calculation aggregates all the targets. This simple method does not take into account where multiple targets sharing under a switch or a root port where the aggregated bandwidth can be greater than the upstream link of the switch. To accurately account for the shared upstream uplink cases, a new update function is introduced by walking from the leaves to the root of the hierarchy and clamp the bandwidth in the process as needed. This process is done when all the targets for a region are present but before the final values are send to the HMAT handling code cached access_coordinate targets. The original perf calculation path was kept to calculate the latency performance data that does not require the shared link consideration. The shared upstream link calculation is done as a second pass when all the endpoints have arrived. Testing is done via qemu with CXL hierarchy. run_qemu[1] is modified to support several CXL hierarchy layouts. The following layouts are tested: HB: Host Bridge RP: Root Port SW: Switch EP: End Point 2 HB 2 RP 2 EP: resulting bandwidth: 624 1 HB 2 RP 2 EP: resulting bandwidth: 624 2 HB 2 RP 2 SW 4 EP: resulting bandwidth: 624 Current testing, perf number from SRAT/HMAT is hacked into the kernel code. However with new QEMU support of Generic Target Port that's incoming, the perf data injection is no longer needed. [1]: https://github.com/pmem/run_qemuSuggested-by: Jonathan Cameron <jonathan.cameron@huawei.com> Link: https://lore.kernel.org/linux-cxl/20240501152503.00002e60@Huawei.com/Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> Reviewed-by: Alison Schofield <alison.schofield@intel.com> Acked-by: Dan Williams <dan.j.williams@intel.com> Link: https://patch.msgid.link/20240904001316.1688225-3-dave.jiang@intel.comSigned-off-by: Dave Jiang <dave.jiang@intel.com>

cxl: Calculate region bandwidth of targets with shared upstream link
The current bandwidth calculation aggregates all the targets. This simple method does not take into account where multiple targets sharing under a switch or a root port where the aggregated bandwidth can be greater than the upstream link of the switch. To accurately account for the shared upstream uplink cases, a new update function is introduced by walking from the leaves to the root of the hierarchy and clamp the bandwidth in the process as needed. This process is done when all the targets for a region are present but before the final values are send to the HMAT handling code cached access_coordinate targets. The original perf calculation path was kept to calculate the latency performance data that does not require the shared link consideration. The shared upstream link calculation is done as a second pass when all the endpoints have arrived. Testing is done via qemu with CXL hierarchy. run_qemu[1] is modified to support several CXL hierarchy layouts. The following layouts are tested: HB: Host Bridge RP: Root Port SW: Switch EP: End Point 2 HB 2 RP 2 EP: resulting bandwidth: 624 1 HB 2 RP 2 EP: resulting bandwidth: 624 2 HB 2 RP 2 SW 4 EP: resulting bandwidth: 624 Current testing, perf number from SRAT/HMAT is hacked into the kernel code. However with new QEMU support of Generic Target Port that's incoming, the perf data injection is no longer needed. [1]: https://github.com/pmem/run_qemuSuggested-by: Jonathan Cameron <jonathan.cameron@huawei.com> Link: https://lore.kernel.org/linux-cxl/20240501152503.00002e60@Huawei.com/Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> Reviewed-by: Alison Schofield <alison.schofield@intel.com> Acked-by: Dan Williams <dan.j.williams@intel.com> Link: https://patch.msgid.link/20240904001316.1688225-3-dave.jiang@intel.comSigned-off-by: Dave Jiang <dave.jiang@intel.com>
a5ab0de0 · Dave Jiang · e91be3ed · a5ab0de0 · a5ab0de0 · a5ab0de0
Commit a5ab0de0 authored Sep 03, 2024 by Dave Jiang
6 changed files
--- a/drivers/cxl/core/cdat.c
+++ b/drivers/cxl/core/cdat.c
--- a/drivers/cxl/core/core.h
+++ b/drivers/cxl/core/core.h
@@ -103,9 +103,11 @@ enum cxl_poison_trace_type {
 };

 long cxl_pci_get_latency(struct pci_dev *pdev);
-
+int cxl_pci_get_bandwidth(struct pci_dev *pdev, struct access_coordinate *c);
 int cxl_update_hmat_access_coordinates(int nid, struct cxl_region *cxlr,
 				       enum access_coordinate_class access);
 bool cxl_need_node_perf_attrs_update(int nid);
+int cxl_port_get_switch_dport_bandwidth(struct cxl_port *port,
+					struct access_coordinate *c);

 #endif /* __CXL_CORE_H__ */
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -1031,3 +1031,26 @@ bool cxl_endpoint_decoder_reset_detected(struct cxl_port *port)
 				     __cxl_endpoint_decoder_reset_detected);
 }
 EXPORT_SYMBOL_NS_GPL(cxl_endpoint_decoder_reset_detected, CXL);
+
+int cxl_pci_get_bandwidth(struct pci_dev *pdev, struct access_coordinate *c)
+{
+	int speed, bw;
+	u16 lnksta;
+	u32 width;
+
+	speed = pcie_link_speed_mbps(pdev);
+	if (speed < 0)
+		return speed;
+	speed /= BITS_PER_BYTE;
+
+	pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &lnksta);
+	width = FIELD_GET(PCI_EXP_LNKSTA_NLW, lnksta);
+	bw = speed * width;
+
+	for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) {
+		c[i].read_bandwidth = bw;
+		c[i].write_bandwidth = bw;
+	}
+
+	return 0;
+}
--- a/drivers/cxl/core/port.c
+++ b/drivers/cxl/core/port.c
@@ -2237,6 +2237,26 @@ int cxl_endpoint_get_perf_coordinates(struct cxl_port *port,
 }
 EXPORT_SYMBOL_NS_GPL(cxl_endpoint_get_perf_coordinates, CXL);

+int cxl_port_get_switch_dport_bandwidth(struct cxl_port *port,
+					struct access_coordinate *c)
+{
+	struct cxl_dport *dport = port->parent_dport;
+
+	/* Check this port is connected to a switch DSP and not an RP */
+	if (parent_port_is_cxl_root(to_cxl_port(port->dev.parent)))
+		return -ENODEV;
+
+	if (!coordinates_valid(dport->coord))
+		return -EINVAL;
+
+	for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) {
+		c[i].read_bandwidth = dport->coord[i].read_bandwidth;
+		c[i].write_bandwidth = dport->coord[i].write_bandwidth;
+	}
+
+	return 0;
+}
+
 /* for user tooling to ensure port disable work has completed */
 static ssize_t flush_store(const struct bus_type *bus, const char *buf, size_t count)
 {

--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -1983,6 +1983,7 @@ static int cxl_region_attach(struct cxl_region *cxlr,
 		 * then the region is already committed.
 		 */
 		p->state = CXL_CONFIG_COMMIT;
+		cxl_region_shared_upstream_bandwidth_update(cxlr);

 		return 0;
 	}
@@ -2004,6 +2005,7 @@ static int cxl_region_attach(struct cxl_region *cxlr,
 		if (rc)
 			return rc;
 		p->state = CXL_CONFIG_ACTIVE;
+		cxl_region_shared_upstream_bandwidth_update(cxlr);
 	}

 	cxled->cxld.interleave_ways = p->interleave_ways;

--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -891,6 +891,7 @@ int cxl_endpoint_get_perf_coordinates(struct cxl_port *port,
 				      struct access_coordinate *coord);
 void cxl_region_perf_data_calculate(struct cxl_region *cxlr,
 				    struct cxl_endpoint_decoder *cxled);
+void cxl_region_shared_upstream_bandwidth_update(struct cxl_region *cxlr);

 void cxl_memdev_update_perf(struct cxl_memdev *cxlmd);