Commit cf9caf19 authored by Michael Zoran's avatar Michael Zoran Committed by Greg Kroah-Hartman

staging: vc04_services: Replace dmac_map_area with dmac_map_sg

The original arm implementation uses dmac_map_area which is not
portable.  Replace it with an architecture neutral version
which uses dma_map_sg.

As you can see that for larger page sizes, the dma_map_sg
implementation is faster then the original unportable dma_map_area
implementation.

Test                       dmac_map_area   dma_map_page dma_map_sg
vchiq_test -b 4 10000      51us/iter       76us/iter    76us
vchiq_test -b 8 10000      70us/iter       82us/iter    91us
vchiq_test -b 16 10000     94us/iter       118us/iter   121us
vchiq_test -b 32 10000     146us/iter      173us/iter   187us
vchiq_test -b 64 10000     263us/iter      328us/iter   299us
vchiq_test -b 128 10000    529us/iter      631us/iter   595us
vchiq_test -b 256 10000    2285us/iter     2275us/iter  2001us
vchiq_test -b 512 10000    4372us/iter     4616us/iter  4123us

For message sizes >= 64KB, dma_map_sg is faster then dma_map_page.

For message size >= 256KB, the dma_map_sg is the fastest
implementation.

"Normal" messages sizes should be about 1MB which is beyond
the length that this change shows a speed increase.

This is v2 of the patch which includes extra WARN_ONs and
incorporates feedback from Eric Anholt <eric@anholt.net>.
Signed-off-by: default avatarMichael Zoran <mzoran@crowfest.net>
Reviewed-by: default avatarEric Anholt <eric@anholt.net>
Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
parent c4b370ef
...@@ -45,13 +45,8 @@ ...@@ -45,13 +45,8 @@
#include <asm/pgtable.h> #include <asm/pgtable.h>
#include <soc/bcm2835/raspberrypi-firmware.h> #include <soc/bcm2835/raspberrypi-firmware.h>
#define dmac_map_area __glue(_CACHE,_dma_map_area)
#define dmac_unmap_area __glue(_CACHE,_dma_unmap_area)
#define TOTAL_SLOTS (VCHIQ_SLOT_ZERO_SLOTS + 2 * 32) #define TOTAL_SLOTS (VCHIQ_SLOT_ZERO_SLOTS + 2 * 32)
#define VCHIQ_ARM_ADDRESS(x) ((void *)((char *)x + g_virt_to_bus_offset))
#include "vchiq_arm.h" #include "vchiq_arm.h"
#include "vchiq_2835.h" #include "vchiq_2835.h"
#include "vchiq_connected.h" #include "vchiq_connected.h"
...@@ -73,7 +68,7 @@ static unsigned int g_fragments_size; ...@@ -73,7 +68,7 @@ static unsigned int g_fragments_size;
static char *g_fragments_base; static char *g_fragments_base;
static char *g_free_fragments; static char *g_free_fragments;
static struct semaphore g_free_fragments_sema; static struct semaphore g_free_fragments_sema;
static unsigned long g_virt_to_bus_offset; static struct device *g_dev;
extern int vchiq_arm_log_level; extern int vchiq_arm_log_level;
...@@ -84,10 +79,11 @@ vchiq_doorbell_irq(int irq, void *dev_id); ...@@ -84,10 +79,11 @@ vchiq_doorbell_irq(int irq, void *dev_id);
static int static int
create_pagelist(char __user *buf, size_t count, unsigned short type, create_pagelist(char __user *buf, size_t count, unsigned short type,
struct task_struct *task, PAGELIST_T ** ppagelist); struct task_struct *task, PAGELIST_T **ppagelist,
dma_addr_t *dma_addr);
static void static void
free_pagelist(PAGELIST_T *pagelist, int actual); free_pagelist(dma_addr_t dma_addr, PAGELIST_T *pagelist, int actual);
int vchiq_platform_init(struct platform_device *pdev, VCHIQ_STATE_T *state) int vchiq_platform_init(struct platform_device *pdev, VCHIQ_STATE_T *state)
{ {
...@@ -101,8 +97,6 @@ int vchiq_platform_init(struct platform_device *pdev, VCHIQ_STATE_T *state) ...@@ -101,8 +97,6 @@ int vchiq_platform_init(struct platform_device *pdev, VCHIQ_STATE_T *state)
int slot_mem_size, frag_mem_size; int slot_mem_size, frag_mem_size;
int err, irq, i; int err, irq, i;
g_virt_to_bus_offset = virt_to_dma(dev, (void *)0);
(void)of_property_read_u32(dev->of_node, "cache-line-size", (void)of_property_read_u32(dev->of_node, "cache-line-size",
&g_cache_line_size); &g_cache_line_size);
g_fragments_size = 2 * g_cache_line_size; g_fragments_size = 2 * g_cache_line_size;
...@@ -118,7 +112,7 @@ int vchiq_platform_init(struct platform_device *pdev, VCHIQ_STATE_T *state) ...@@ -118,7 +112,7 @@ int vchiq_platform_init(struct platform_device *pdev, VCHIQ_STATE_T *state)
return -ENOMEM; return -ENOMEM;
} }
WARN_ON(((int)slot_mem & (PAGE_SIZE - 1)) != 0); WARN_ON(((unsigned long)slot_mem & (PAGE_SIZE - 1)) != 0);
vchiq_slot_zero = vchiq_init_slots(slot_mem, slot_mem_size); vchiq_slot_zero = vchiq_init_slots(slot_mem, slot_mem_size);
if (!vchiq_slot_zero) if (!vchiq_slot_zero)
...@@ -170,6 +164,7 @@ int vchiq_platform_init(struct platform_device *pdev, VCHIQ_STATE_T *state) ...@@ -170,6 +164,7 @@ int vchiq_platform_init(struct platform_device *pdev, VCHIQ_STATE_T *state)
return err ? : -ENXIO; return err ? : -ENXIO;
} }
g_dev = dev;
vchiq_log_info(vchiq_arm_log_level, vchiq_log_info(vchiq_arm_log_level,
"vchiq_init - done (slots %pK, phys %pad)", "vchiq_init - done (slots %pK, phys %pad)",
vchiq_slot_zero, &slot_phys); vchiq_slot_zero, &slot_phys);
...@@ -233,6 +228,7 @@ vchiq_prepare_bulk_data(VCHIQ_BULK_T *bulk, VCHI_MEM_HANDLE_T memhandle, ...@@ -233,6 +228,7 @@ vchiq_prepare_bulk_data(VCHIQ_BULK_T *bulk, VCHI_MEM_HANDLE_T memhandle,
{ {
PAGELIST_T *pagelist; PAGELIST_T *pagelist;
int ret; int ret;
dma_addr_t dma_addr;
WARN_ON(memhandle != VCHI_MEM_HANDLE_INVALID); WARN_ON(memhandle != VCHI_MEM_HANDLE_INVALID);
...@@ -241,12 +237,14 @@ vchiq_prepare_bulk_data(VCHIQ_BULK_T *bulk, VCHI_MEM_HANDLE_T memhandle, ...@@ -241,12 +237,14 @@ vchiq_prepare_bulk_data(VCHIQ_BULK_T *bulk, VCHI_MEM_HANDLE_T memhandle,
? PAGELIST_READ ? PAGELIST_READ
: PAGELIST_WRITE, : PAGELIST_WRITE,
current, current,
&pagelist); &pagelist,
&dma_addr);
if (ret != 0) if (ret != 0)
return VCHIQ_ERROR; return VCHIQ_ERROR;
bulk->handle = memhandle; bulk->handle = memhandle;
bulk->data = VCHIQ_ARM_ADDRESS(pagelist); bulk->data = (void *)(unsigned long)dma_addr;
/* Store the pagelist address in remote_data, which isn't used by the /* Store the pagelist address in remote_data, which isn't used by the
slave. */ slave. */
...@@ -259,7 +257,8 @@ void ...@@ -259,7 +257,8 @@ void
vchiq_complete_bulk(VCHIQ_BULK_T *bulk) vchiq_complete_bulk(VCHIQ_BULK_T *bulk)
{ {
if (bulk && bulk->remote_data && bulk->actual) if (bulk && bulk->remote_data && bulk->actual)
free_pagelist((PAGELIST_T *)bulk->remote_data, bulk->actual); free_pagelist((dma_addr_t)(unsigned long)bulk->data,
(PAGELIST_T *)bulk->remote_data, bulk->actual);
} }
void void
...@@ -353,38 +352,44 @@ vchiq_doorbell_irq(int irq, void *dev_id) ...@@ -353,38 +352,44 @@ vchiq_doorbell_irq(int irq, void *dev_id)
** obscuring the new data underneath. We can solve this by transferring the ** obscuring the new data underneath. We can solve this by transferring the
** partial cache lines separately, and allowing the ARM to copy into the ** partial cache lines separately, and allowing the ARM to copy into the
** cached area. ** cached area.
** N.B. This implementation plays slightly fast and loose with the Linux
** driver programming rules, e.g. its use of dmac_map_area instead of
** dma_map_single, but it isn't a multi-platform driver and it benefits
** from increased speed as a result.
*/ */
static int static int
create_pagelist(char __user *buf, size_t count, unsigned short type, create_pagelist(char __user *buf, size_t count, unsigned short type,
struct task_struct *task, PAGELIST_T ** ppagelist) struct task_struct *task, PAGELIST_T **ppagelist,
dma_addr_t *dma_addr)
{ {
PAGELIST_T *pagelist; PAGELIST_T *pagelist;
struct page **pages; struct page **pages;
unsigned long *addrs; u32 *addrs;
unsigned int num_pages, offset, i; unsigned int num_pages, offset, i, k;
char *addr, *base_addr, *next_addr; int actual_pages;
int run, addridx, actual_pages;
unsigned long *need_release; unsigned long *need_release;
size_t pagelist_size;
struct scatterlist *scatterlist, *sg;
int dma_buffers;
int dir;
offset = (unsigned int)buf & (PAGE_SIZE - 1); offset = ((unsigned int)(unsigned long)buf & (PAGE_SIZE - 1));
num_pages = (count + offset + PAGE_SIZE - 1) / PAGE_SIZE; num_pages = (count + offset + PAGE_SIZE - 1) / PAGE_SIZE;
pagelist_size = sizeof(PAGELIST_T) +
(num_pages * sizeof(unsigned long)) +
sizeof(unsigned long) +
(num_pages * sizeof(pages[0]) +
(num_pages * sizeof(struct scatterlist)));
*ppagelist = NULL; *ppagelist = NULL;
dir = (type == PAGELIST_WRITE) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
/* Allocate enough storage to hold the page pointers and the page /* Allocate enough storage to hold the page pointers and the page
** list ** list
*/ */
pagelist = kmalloc(sizeof(PAGELIST_T) + pagelist = dma_zalloc_coherent(g_dev,
(num_pages * sizeof(unsigned long)) + pagelist_size,
sizeof(unsigned long) + dma_addr,
(num_pages * sizeof(pages[0])), GFP_KERNEL);
GFP_KERNEL);
vchiq_log_trace(vchiq_arm_log_level, "create_pagelist - %pK", vchiq_log_trace(vchiq_arm_log_level, "create_pagelist - %pK",
pagelist); pagelist);
...@@ -394,10 +399,9 @@ create_pagelist(char __user *buf, size_t count, unsigned short type, ...@@ -394,10 +399,9 @@ create_pagelist(char __user *buf, size_t count, unsigned short type,
addrs = pagelist->addrs; addrs = pagelist->addrs;
need_release = (unsigned long *)(addrs + num_pages); need_release = (unsigned long *)(addrs + num_pages);
pages = (struct page **)(addrs + num_pages + 1); pages = (struct page **)(addrs + num_pages + 1);
scatterlist = (struct scatterlist *)(pages + num_pages);
if (is_vmalloc_addr(buf)) { if (is_vmalloc_addr(buf)) {
int dir = (type == PAGELIST_WRITE) ?
DMA_TO_DEVICE : DMA_FROM_DEVICE;
unsigned long length = count; unsigned long length = count;
unsigned int off = offset; unsigned int off = offset;
...@@ -410,7 +414,6 @@ create_pagelist(char __user *buf, size_t count, unsigned short type, ...@@ -410,7 +414,6 @@ create_pagelist(char __user *buf, size_t count, unsigned short type,
if (bytes > length) if (bytes > length)
bytes = length; bytes = length;
pages[actual_pages] = pg; pages[actual_pages] = pg;
dmac_map_area(page_address(pg) + off, bytes, dir);
length -= bytes; length -= bytes;
off = 0; off = 0;
} }
...@@ -438,7 +441,8 @@ create_pagelist(char __user *buf, size_t count, unsigned short type, ...@@ -438,7 +441,8 @@ create_pagelist(char __user *buf, size_t count, unsigned short type,
actual_pages--; actual_pages--;
put_page(pages[actual_pages]); put_page(pages[actual_pages]);
} }
kfree(pagelist); dma_free_coherent(g_dev, pagelist_size,
pagelist, *dma_addr);
if (actual_pages == 0) if (actual_pages == 0)
actual_pages = -ENOMEM; actual_pages = -ENOMEM;
return actual_pages; return actual_pages;
...@@ -450,30 +454,44 @@ create_pagelist(char __user *buf, size_t count, unsigned short type, ...@@ -450,30 +454,44 @@ create_pagelist(char __user *buf, size_t count, unsigned short type,
pagelist->type = type; pagelist->type = type;
pagelist->offset = offset; pagelist->offset = offset;
/* Group the pages into runs of contiguous pages */ for (i = 0; i < num_pages; i++)
sg_set_page(scatterlist + i, pages[i], PAGE_SIZE, 0);
dma_buffers = dma_map_sg(g_dev,
scatterlist,
num_pages,
dir);
base_addr = VCHIQ_ARM_ADDRESS(page_address(pages[0])); if (dma_buffers == 0) {
next_addr = base_addr + PAGE_SIZE; dma_free_coherent(g_dev, pagelist_size,
addridx = 0; pagelist, *dma_addr);
run = 0;
for (i = 1; i < num_pages; i++) { return -EINTR;
addr = VCHIQ_ARM_ADDRESS(page_address(pages[i])); }
if ((addr == next_addr) && (run < (PAGE_SIZE - 1))) {
next_addr += PAGE_SIZE; /* Combine adjacent blocks for performance */
run++; k = 0;
for_each_sg(scatterlist, sg, dma_buffers, i) {
u32 len = sg_dma_len(sg);
u32 addr = sg_dma_address(sg);
/* Note: addrs is the address + page_count - 1
* The firmware expects the block to be page
* aligned and a multiple of the page size
*/
WARN_ON(len == 0);
WARN_ON(len & ~PAGE_MASK);
WARN_ON(addr & ~PAGE_MASK);
if (k > 0 &&
((addrs[k - 1] & PAGE_MASK) |
((addrs[k - 1] & ~PAGE_MASK) + 1) << PAGE_SHIFT)
== addr) {
addrs[k - 1] += (len >> PAGE_SHIFT);
} else { } else {
addrs[addridx] = (unsigned long)base_addr + run; addrs[k++] = addr | ((len >> PAGE_SHIFT) - 1);
addridx++;
base_addr = addr;
next_addr = addr + PAGE_SIZE;
run = 0;
} }
} }
addrs[addridx] = (unsigned long)base_addr + run;
addridx++;
/* Partial cache lines (fragments) require special measures */ /* Partial cache lines (fragments) require special measures */
if ((type == PAGELIST_READ) && if ((type == PAGELIST_READ) &&
((pagelist->offset & (g_cache_line_size - 1)) || ((pagelist->offset & (g_cache_line_size - 1)) ||
...@@ -482,7 +500,10 @@ create_pagelist(char __user *buf, size_t count, unsigned short type, ...@@ -482,7 +500,10 @@ create_pagelist(char __user *buf, size_t count, unsigned short type,
char *fragments; char *fragments;
if (down_interruptible(&g_free_fragments_sema) != 0) { if (down_interruptible(&g_free_fragments_sema) != 0) {
kfree(pagelist); dma_unmap_sg(g_dev, scatterlist, num_pages,
DMA_BIDIRECTIONAL);
dma_free_coherent(g_dev, pagelist_size,
pagelist, *dma_addr);
return -EINTR; return -EINTR;
} }
...@@ -497,29 +518,42 @@ create_pagelist(char __user *buf, size_t count, unsigned short type, ...@@ -497,29 +518,42 @@ create_pagelist(char __user *buf, size_t count, unsigned short type,
(fragments - g_fragments_base) / g_fragments_size; (fragments - g_fragments_base) / g_fragments_size;
} }
dmac_flush_range(pagelist, addrs + num_pages);
*ppagelist = pagelist; *ppagelist = pagelist;
return 0; return 0;
} }
static void static void
free_pagelist(PAGELIST_T *pagelist, int actual) free_pagelist(dma_addr_t dma_addr, PAGELIST_T *pagelist, int actual)
{ {
unsigned long *need_release; unsigned long *need_release;
struct page **pages; struct page **pages;
unsigned int num_pages, i; unsigned int num_pages, i;
size_t pagelist_size;
struct scatterlist *scatterlist;
int dir;
vchiq_log_trace(vchiq_arm_log_level, "free_pagelist - %pK, %d", vchiq_log_trace(vchiq_arm_log_level, "free_pagelist - %pK, %d",
pagelist, actual); pagelist, actual);
dir = (pagelist->type == PAGELIST_WRITE) ? DMA_TO_DEVICE :
DMA_FROM_DEVICE;
num_pages = num_pages =
(pagelist->length + pagelist->offset + PAGE_SIZE - 1) / (pagelist->length + pagelist->offset + PAGE_SIZE - 1) /
PAGE_SIZE; PAGE_SIZE;
pagelist_size = sizeof(PAGELIST_T) +
(num_pages * sizeof(unsigned long)) +
sizeof(unsigned long) +
(num_pages * sizeof(pages[0]) +
(num_pages * sizeof(struct scatterlist)));
need_release = (unsigned long *)(pagelist->addrs + num_pages); need_release = (unsigned long *)(pagelist->addrs + num_pages);
pages = (struct page **)(pagelist->addrs + num_pages + 1); pages = (struct page **)(pagelist->addrs + num_pages + 1);
scatterlist = (struct scatterlist *)(pages + num_pages);
dma_unmap_sg(g_dev, scatterlist, num_pages, dir);
/* Deal with any partial cache lines (fragments) */ /* Deal with any partial cache lines (fragments) */
if (pagelist->type >= PAGELIST_READ_WITH_FRAGMENTS) { if (pagelist->type >= PAGELIST_READ_WITH_FRAGMENTS) {
...@@ -569,8 +603,7 @@ free_pagelist(PAGELIST_T *pagelist, int actual) ...@@ -569,8 +603,7 @@ free_pagelist(PAGELIST_T *pagelist, int actual)
if (bytes > length) if (bytes > length)
bytes = length; bytes = length;
dmac_unmap_area(page_address(pg) + offset,
bytes, DMA_FROM_DEVICE);
length -= bytes; length -= bytes;
offset = 0; offset = 0;
set_page_dirty(pg); set_page_dirty(pg);
...@@ -579,5 +612,6 @@ free_pagelist(PAGELIST_T *pagelist, int actual) ...@@ -579,5 +612,6 @@ free_pagelist(PAGELIST_T *pagelist, int actual)
} }
} }
kfree(pagelist); dma_free_coherent(g_dev, pagelist_size,
pagelist, dma_addr);
} }
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment