drm/nouveau: implement explicitly coherent BOs

Allow nouveau_bo_new() to recognize the TTM_PL_FLAG_UNCACHED flag, which means that we want the allocated BO to be perfectly coherent between the CPU and GPU. This is useful on non-coherent architectures for which we do not want to manually sync some rarely-accessed buffers: typically, fences and pushbuffers. A TTM BO allocated with the TTM_PL_FLAG_UNCACHED on a non-coherent architecture will be populated using the DMA API, and accesses to it performed using the coherent mapping performed by dma_alloc_coherent(). Signed-off-by: Alexandre Courbot <acourbot@nvidia.com> Signed-off-by: Ben Skeggs <bskeggs@redhat.com>

drm/nouveau: implement explicitly coherent BOs
Allow nouveau_bo_new() to recognize the TTM_PL_FLAG_UNCACHED flag, which means that we want the allocated BO to be perfectly coherent between the CPU and GPU. This is useful on non-coherent architectures for which we do not want to manually sync some rarely-accessed buffers: typically, fences and pushbuffers. A TTM BO allocated with the TTM_PL_FLAG_UNCACHED on a non-coherent architecture will be populated using the DMA API, and accesses to it performed using the coherent mapping performed by dma_alloc_coherent(). Signed-off-by: Alexandre Courbot <acourbot@nvidia.com> Signed-off-by: Ben Skeggs <bskeggs@redhat.com>
c3a0c771 · Alexandre Courbot · Ben Skeggs · c5d7ddf7 · c3a0c771 · c3a0c771
Commit c3a0c771 authored Oct 27, 2014 by Alexandre Courbot Committed by Ben Skeggs Dec 02, 2014
Show whitespace changes
Inline Side-by-side

Showing with 73 additions and 8 deletions

drivers/gpu/drm/nouveau/nouveau_bo.c drivers/gpu/drm/nouveau/nouveau_bo.c +72 -8

drivers/gpu/drm/nouveau/nouveau_bo.h drivers/gpu/drm/nouveau/nouveau_bo.h +1 -0

No files found.
--- a/drivers/gpu/drm/nouveau/nouveau_bo.c
+++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
@@ -214,6 +214,9 @@ nouveau_bo_new(struct drm_device *dev, int size, int align,
 	nvbo->tile_flags = tile_flags;
 	nvbo->bo.bdev = &drm->ttm.bdev;

+	if (!nv_device_is_cpu_coherent(nvkm_device(&drm->device)))
+		nvbo->force_coherent = flags & TTM_PL_FLAG_UNCACHED;
+
 	nvbo->page_shift = 12;
 	if (drm->client.vm) {
 		if (!(flags & TTM_PL_FLAG_TT) && size > 256 * 1024)
@@ -291,7 +294,8 @@ void
 nouveau_bo_placement_set(struct nouveau_bo *nvbo, uint32_t type, uint32_t busy)
 {
 	struct ttm_placement *pl = &nvbo->placement;
-	uint32_t flags = TTM_PL_MASK_CACHING |
+	uint32_t flags = (nvbo->force_coherent ? TTM_PL_FLAG_UNCACHED :
+						 TTM_PL_MASK_CACHING) |
 			 (nvbo->pin_refcnt ? TTM_PL_FLAG_NO_EVICT : 0);

 	pl->placement = nvbo->placements;
@@ -396,7 +400,14 @@ nouveau_bo_map(struct nouveau_bo *nvbo)
 	if (ret)
 		return ret;

-	ret = ttm_bo_kmap(&nvbo->bo, 0, nvbo->bo.mem.num_pages, &nvbo->kmap);
+	/*
+	 * TTM buffers allocated using the DMA API already have a mapping, let's
+	 * use it instead.
+	 */
+	if (!nvbo->force_coherent)
+		ret = ttm_bo_kmap(&nvbo->bo, 0, nvbo->bo.mem.num_pages,
+				  &nvbo->kmap);
+
 	ttm_bo_unreserve(&nvbo->bo);
 	return ret;
 }
@@ -404,7 +415,14 @@ nouveau_bo_map(struct nouveau_bo *nvbo)
 void
 nouveau_bo_unmap(struct nouveau_bo *nvbo)
 {
-	if (nvbo)
+	if (!nvbo)
+		return;
+
+	/*
+	 * TTM buffers allocated using the DMA API already had a coherent
+	 * mapping which we used, no need to unmap.
+	 */
+	if (!nvbo->force_coherent)
 		ttm_bo_kunmap(&nvbo->kmap);
 }

@@ -422,12 +440,36 @@ nouveau_bo_validate(struct nouveau_bo *nvbo, bool interruptible,
 	return 0;
 }

+static inline void *
+_nouveau_bo_mem_index(struct nouveau_bo *nvbo, unsigned index, void *mem, u8 sz)
+{
+	struct ttm_dma_tt *dma_tt;
+	u8 *m = mem;
+
+	index *= sz;
+
+	if (m) {
+		/* kmap'd address, return the corresponding offset */
+		m += index;
+	} else {
+		/* DMA-API mapping, lookup the right address */
+		dma_tt = (struct ttm_dma_tt *)nvbo->bo.ttm;
+		m = dma_tt->cpu_address[index / PAGE_SIZE];
+		m += index % PAGE_SIZE;
+	}
+
+	return m;
+}
+#define nouveau_bo_mem_index(o, i, m) _nouveau_bo_mem_index(o, i, m, sizeof(*m))
+
 u16
 nouveau_bo_rd16(struct nouveau_bo *nvbo, unsigned index)
 {
 	bool is_iomem;
 	u16 *mem = ttm_kmap_obj_virtual(&nvbo->kmap, &is_iomem);
-	mem = &mem[index];
+
+	mem = nouveau_bo_mem_index(nvbo, index, mem);
+
 	if (is_iomem)
 		return ioread16_native((void __force __iomem *)mem);
 	else
@@ -439,7 +481,9 @@ nouveau_bo_wr16(struct nouveau_bo *nvbo, unsigned index, u16 val)
 {
 	bool is_iomem;
 	u16 *mem = ttm_kmap_obj_virtual(&nvbo->kmap, &is_iomem);
-	mem = &mem[index];
+
+	mem = nouveau_bo_mem_index(nvbo, index, mem);
+
 	if (is_iomem)
 		iowrite16_native(val, (void __force __iomem *)mem);
 	else
@@ -451,7 +495,9 @@ nouveau_bo_rd32(struct nouveau_bo *nvbo, unsigned index)
 {
 	bool is_iomem;
 	u32 *mem = ttm_kmap_obj_virtual(&nvbo->kmap, &is_iomem);
-	mem = &mem[index];
+
+	mem = nouveau_bo_mem_index(nvbo, index, mem);
+
 	if (is_iomem)
 		return ioread32_native((void __force __iomem *)mem);
 	else
@@ -463,7 +509,9 @@ nouveau_bo_wr32(struct nouveau_bo *nvbo, unsigned index, u32 val)
 {
 	bool is_iomem;
 	u32 *mem = ttm_kmap_obj_virtual(&nvbo->kmap, &is_iomem);
-	mem = &mem[index];
+
+	mem = nouveau_bo_mem_index(nvbo, index, mem);
+
 	if (is_iomem)
 		iowrite32_native(val, (void __force __iomem *)mem);
 	else
@@ -1383,6 +1431,14 @@ nouveau_ttm_tt_populate(struct ttm_tt *ttm)
 	dev = drm->dev;
 	pdev = nv_device_base(device);

+	/*
+	 * Objects matching this condition have been marked as force_coherent,
+	 * so use the DMA API for them.
+	 */
+	if (!nv_device_is_cpu_coherent(device) &&
+	    ttm->caching_state == tt_uncached)
+		return ttm_dma_populate(ttm_dma, dev->dev);
+
 #if __OS_HAS_AGP
 	if (drm->agp.stat == ENABLED) {
 		return ttm_agp_tt_populate(ttm);
@@ -1440,6 +1496,14 @@ nouveau_ttm_tt_unpopulate(struct ttm_tt *ttm)
 	dev = drm->dev;
 	pdev = nv_device_base(device);

+	/*
+	 * Objects matching this condition have been marked as force_coherent,
+	 * so use the DMA API for them.
+	 */
+	if (!nv_device_is_cpu_coherent(device) &&
+	    ttm->caching_state == tt_uncached)
+		ttm_dma_unpopulate(ttm_dma, dev->dev);
+
 #if __OS_HAS_AGP
 	if (drm->agp.stat == ENABLED) {
 		ttm_agp_tt_unpopulate(ttm);

--- a/drivers/gpu/drm/nouveau/nouveau_bo.h
+++ b/drivers/gpu/drm/nouveau/nouveau_bo.h
@@ -13,6 +13,7 @@ struct nouveau_bo {
 	u32 valid_domains;
 	struct ttm_place placements[3];
 	struct ttm_place busy_placements[3];
+	bool force_coherent;
 	struct ttm_bo_kmap_obj kmap;
 	struct list_head head;