Commit d10bcf94 authored by Shiraz Saleem, committed by Jason Gunthorpe

RDMA/umem: Combine contiguous PAGE_SIZE regions in SGEs

Combine contiguous regions of PAGE_SIZE pages into a single scatter list
entry while building the scatter table for a umem. This minimizes the
number of entries in the scatter list and reduces the DMA mapping
overhead, particularly when an IOMMU is in use.

Set default max_seg_size in core for IB devices to 2G and do not combine
if we exceed this limit.

Also, purge npages in struct ib_umem as we now DMA map the umem SGL with
sg_nents and the npages computation is no longer needed. Drivers should now
be using ib_umem_num_pages(), so fix the last stragglers.

Move npages tracking to ib_umem_odp as ODP drivers still need it.
Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Acked-by: Adit Ranadive <aditr@vmware.com>
Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
Tested-by: Gal Pressman <galpress@amazon.com>
Tested-by: Selvin Xavier <selvin.xavier@broadcom.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
parent c7252a65
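
The core of the patch is run-length coalescing: in the new ib_umem_add_sg_table() helper, pfn-contiguous pages are folded into a single scatterlist entry, and an existing entry is only extended while it stays under the DMA layer's maximum segment size. The standalone C sketch below models that idea in userspace; it is a simplified illustration rather than the kernel code, and the names coalesce_pfns and struct seg are invented for the example (plain pfn arrays stand in for struct page pointers, and a seg stands in for a scatterlist entry).

```c
#include <stdio.h>
#include <stddef.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* Illustrative stand-in for one scatterlist entry. */
struct seg {
	unsigned long start_pfn;	/* first page frame in the run */
	unsigned long length;		/* run length in bytes */
};

/*
 * Fold runs of contiguous pfns into segments no larger than max_seg_sz,
 * mirroring the idea behind ib_umem_add_sg_table(). Returns the number
 * of segments written to segs[].
 */
static size_t coalesce_pfns(const unsigned long *pfns, size_t npages,
			    unsigned long max_seg_sz, struct seg *segs)
{
	size_t nents = 0;
	size_t i = 0;

	while (i < npages) {
		unsigned long first = pfns[i];
		unsigned long len = 0;

		/* Count how many pfns are contiguous starting at i. */
		while (i < npages && pfns[i] == first + len) {
			len++;
			i++;
		}

		/* Extend the previous segment if this run continues it
		 * and the capped segment size is not exceeded.
		 */
		if (nents &&
		    segs[nents - 1].start_pfn +
			    (segs[nents - 1].length >> PAGE_SHIFT) == first &&
		    segs[nents - 1].length + (len << PAGE_SHIFT) <= max_seg_sz) {
			segs[nents - 1].length += len << PAGE_SHIFT;
			continue;
		}

		/* Otherwise start a new segment for this run. */
		segs[nents].start_pfn = first;
		segs[nents].length = len << PAGE_SHIFT;
		nents++;
	}
	return nents;
}

int main(void)
{
	/* Two contiguous runs: {100,101,102} and {200,201}. */
	unsigned long pfns[] = { 100, 101, 102, 200, 201 };
	struct seg segs[5];
	size_t n = coalesce_pfns(pfns, 5, 1UL << 31 /* 2G cap */, segs);

	for (size_t i = 0; i < n; i++)
		printf("seg %zu: pfn %lu, %lu bytes\n",
		       i, segs[i].start_pfn, segs[i].length);
	return 0;	/* prints two segments: 12288 and 8192 bytes */
}
```

As in the kernel helper shown in the diff below, the size cap is only consulted when extending the current entry; a new run of pages always starts a fresh entry.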
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -1089,6 +1089,9 @@ static void setup_dma_device(struct ib_device *device)
 		WARN_ON_ONCE(!parent);
 		device->dma_device = parent;
 	}
+
+	/* Setup default max segment size for all IB devices */
+	dma_set_max_seg_size(device->dma_device, SZ_2G);
 }
 
 /*
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -39,25 +39,22 @@
 #include <linux/export.h>
 #include <linux/hugetlb.h>
 #include <linux/slab.h>
+#include <linux/pagemap.h>
 #include <rdma/ib_umem_odp.h>
 
 #include "uverbs.h"
 
 static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
 {
-	struct scatterlist *sg;
+	struct sg_page_iter sg_iter;
 	struct page *page;
-	int i;
 
 	if (umem->nmap > 0)
-		ib_dma_unmap_sg(dev, umem->sg_head.sgl,
-				umem->npages,
+		ib_dma_unmap_sg(dev, umem->sg_head.sgl, umem->sg_nents,
 				DMA_BIDIRECTIONAL);
 
-	for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) {
-
-		page = sg_page(sg);
+	for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) {
+		page = sg_page_iter_page(&sg_iter);
 		if (!PageDirty(page) && umem->writable && dirty)
 			set_page_dirty_lock(page);
-
 		put_page(page);
@@ -66,6 +63,69 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d
 	sg_free_table(&umem->sg_head);
 }
 
+/* ib_umem_add_sg_table - Add N contiguous pages to scatter table
+ *
+ * sg: current scatterlist entry
+ * page_list: array of npage struct page pointers
+ * npages: number of pages in page_list
+ * max_seg_sz: maximum segment size in bytes
+ * nents: [out] number of entries in the scatterlist
+ *
+ * Return new end of scatterlist
+ */
+static struct scatterlist *ib_umem_add_sg_table(struct scatterlist *sg,
+						struct page **page_list,
+						unsigned long npages,
+						unsigned int max_seg_sz,
+						int *nents)
+{
+	unsigned long first_pfn;
+	unsigned long i = 0;
+	bool update_cur_sg = false;
+	bool first = !sg_page(sg);
+
+	/* Check if new page_list is contiguous with end of previous page_list.
+	 * sg->length here is a multiple of PAGE_SIZE and sg->offset is 0.
+	 */
+	if (!first && (page_to_pfn(sg_page(sg)) + (sg->length >> PAGE_SHIFT) ==
+		       page_to_pfn(page_list[0])))
+		update_cur_sg = true;
+
+	while (i != npages) {
+		unsigned long len;
+		struct page *first_page = page_list[i];
+
+		first_pfn = page_to_pfn(first_page);
+
+		/* Compute the number of contiguous pages we have starting
+		 * at i
+		 */
+		for (len = 0; i != npages &&
+		     first_pfn + len == page_to_pfn(page_list[i]);
+		     len++)
+			i++;
+
+		/* Squash N contiguous pages from page_list into current sge */
+		if (update_cur_sg &&
+		    ((max_seg_sz - sg->length) >= (len << PAGE_SHIFT))) {
+			sg_set_page(sg, sg_page(sg),
+				    sg->length + (len << PAGE_SHIFT), 0);
+			update_cur_sg = false;
+			continue;
+		}
+
+		/* Squash N contiguous pages into next sge or first sge */
+		if (!first)
+			sg = sg_next(sg);
+
+		(*nents)++;
+		sg_set_page(sg, first_page, len << PAGE_SHIFT, 0);
+		first = false;
+	}
+
+	return sg;
+}
+
 /**
  * ib_umem_get - Pin and DMA map userspace memory.
  *
@@ -93,7 +153,7 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
 	int ret;
 	int i;
 	unsigned long dma_attrs = 0;
-	struct scatterlist *sg, *sg_list_start;
+	struct scatterlist *sg;
 	unsigned int gup_flags = FOLL_WRITE;
 
 	if (!udata)
@@ -190,7 +250,7 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
 	if (!umem->writable)
 		gup_flags |= FOLL_FORCE;
 
-	sg_list_start = umem->sg_head.sgl;
+	sg = umem->sg_head.sgl;
 
 	while (npages) {
 		down_read(&mm->mmap_sem);
@@ -203,28 +263,29 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
 			goto umem_release;
 		}
 
-		umem->npages += ret;
 		cur_base += ret * PAGE_SIZE;
 		npages   -= ret;
 
+		sg = ib_umem_add_sg_table(sg, page_list, ret,
+			dma_get_max_seg_size(context->device->dma_device),
+			&umem->sg_nents);
+
 		/* Continue to hold the mmap_sem as vma_list access
 		 * needs to be protected.
 		 */
-		for_each_sg(sg_list_start, sg, ret, i) {
+		for (i = 0; i < ret && umem->hugetlb; i++) {
 			if (vma_list && !is_vm_hugetlb_page(vma_list[i]))
 				umem->hugetlb = 0;
-
-			sg_set_page(sg, page_list[i], PAGE_SIZE, 0);
 		}
-		up_read(&mm->mmap_sem);
 
-		/* preparing for next loop */
-		sg_list_start = sg;
+		up_read(&mm->mmap_sem);
 	}
 
+	sg_mark_end(sg);
+
 	umem->nmap = ib_dma_map_sg_attrs(context->device,
 					 umem->sg_head.sgl,
-					 umem->npages,
+					 umem->sg_nents,
 					 DMA_BIDIRECTIONAL,
 					 dma_attrs);
@@ -320,8 +381,8 @@ int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
 		return -EINVAL;
 	}
 
-	ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->npages, dst, length,
-				 offset + ib_umem_offset(umem));
+	ret = sg_pcopy_to_buffer(umem->sg_head.sgl, ib_umem_num_pages(umem),
+				 dst, length, offset + ib_umem_offset(umem));
 
 	if (ret < 0)
 		return ret;
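
Because entries are now coalesced, umem->sg_nents (the number of scatterlist entries) no longer matches the page count, and a single DMA-mapped entry may cover many pages. Consumers walking the mapped SGL therefore have to honor the per-entry DMA length rather than assuming PAGE_SIZE per entry. The snippet below is a hypothetical driver-side walk, not an existing helper (dump_umem_dma is an invented name); it relies only on standard scatterlist accessors and the ib_umem fields touched by this patch.

```c
#include <linux/printk.h>
#include <linux/scatterlist.h>
#include <rdma/ib_umem.h>

/* Hypothetical helper: walk a DMA-mapped umem SGL entry by entry.
 * umem->nmap holds the number of mapped entries returned by
 * ib_dma_map_sg_attrs(); each entry may now span many pages, so the
 * per-entry sg_dma_len() must be used instead of assuming PAGE_SIZE.
 */
static void dump_umem_dma(struct ib_umem *umem)
{
	struct scatterlist *sg;
	int i;

	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i)
		pr_debug("entry %d: dma 0x%llx, %u bytes\n", i,
			 (unsigned long long)sg_dma_address(sg),
			 sg_dma_len(sg));
}
```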
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -526,7 +526,7 @@ static int ib_umem_odp_map_dma_single_page(
 		}
 		umem_odp->dma_list[page_index] = dma_addr | access_mask;
 		umem_odp->page_list[page_index] = page;
-		umem->npages++;
+		umem_odp->npages++;
 	} else if (umem_odp->page_list[page_index] == page) {
 		umem_odp->dma_list[page_index] |= access_mask;
 	} else {
@@ -752,7 +752,7 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
 			}
 			umem_odp->page_list[idx] = NULL;
 			umem_odp->dma_list[idx] = 0;
-			umem->npages--;
+			umem_odp->npages--;
 		}
 	}
 	mutex_unlock(&umem_odp->umem_mutex);
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -288,7 +288,7 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
 
 	ib_umem_odp_unmap_dma_pages(umem_odp, start, end);
 
-	if (unlikely(!umem->npages && mr->parent &&
+	if (unlikely(!umem_odp->npages && mr->parent &&
 		     !umem_odp->dying)) {
 		WRITE_ONCE(umem_odp->dying, 1);
 		atomic_inc(&mr->parent->num_leaf_free);
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c
@@ -119,7 +119,7 @@ struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 	union pvrdma_cmd_resp rsp;
 	struct pvrdma_cmd_create_mr *cmd = &req.create_mr;
 	struct pvrdma_cmd_create_mr_resp *resp = &rsp.create_mr_resp;
-	int ret;
+	int ret, npages;
 
 	if (length == 0 || length > dev->dsr->caps.max_mr_size) {
 		dev_warn(&dev->pdev->dev, "invalid mem region length\n");
@@ -133,9 +133,10 @@ struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 		return ERR_CAST(umem);
 	}
 
-	if (umem->npages < 0 || umem->npages > PVRDMA_PAGE_DIR_MAX_PAGES) {
+	npages = ib_umem_num_pages(umem);
+	if (npages < 0 || npages > PVRDMA_PAGE_DIR_MAX_PAGES) {
 		dev_warn(&dev->pdev->dev, "overflow %d pages in mem region\n",
-			 umem->npages);
+			 npages);
 		ret = -EINVAL;
 		goto err_umem;
 	}
@@ -150,7 +151,7 @@ struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 	mr->mmr.size = length;
 	mr->umem = umem;
 
-	ret = pvrdma_page_dir_init(dev, &mr->pdir, umem->npages, false);
+	ret = pvrdma_page_dir_init(dev, &mr->pdir, npages, false);
 	if (ret) {
 		dev_warn(&dev->pdev->dev,
 			 "could not allocate page directory\n");
@@ -167,7 +168,7 @@ struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 	cmd->length = length;
 	cmd->pd_handle = to_vpd(pd)->pd_handle;
 	cmd->access_flags = access_flags;
-	cmd->nchunks = umem->npages;
+	cmd->nchunks = npages;
 	cmd->pdir_dma = mr->pdir.dir_dma;
 
 	ret = pvrdma_cmd_post(dev, &req, &rsp, PVRDMA_CMD_CREATE_MR_RESP);
--- a/include/rdma/ib_umem.h
+++ b/include/rdma/ib_umem.h
@@ -53,7 +53,7 @@ struct ib_umem {
 	struct work_struct	work;
 	struct sg_table	sg_head;
 	int		nmap;
-	int		npages;
+	unsigned int	sg_nents;
 };
 
 /* Returns the offset of the umem start relative to the first page. */
--- a/include/rdma/ib_umem_odp.h
+++ b/include/rdma/ib_umem_odp.h
@@ -69,6 +69,7 @@ struct ib_umem_odp {
 	int notifiers_seq;
 	int notifiers_count;
+	int npages;
 
 	/* Tree tracking */
 	struct umem_odp_node	interval_tree;
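
Dropping npages from struct ib_umem works because the page count a driver needs is derivable: ib_umem_num_pages() computes it from the umem's start address and length, independent of how the SGL was coalesced. ODP keeps an explicit counter in ib_umem_odp because its pages are mapped and unmapped individually, so the number currently present cannot be derived from the bounds. A rough userspace model of the address/length computation follows; the num_pages name and the 4K PAGE_SIZE are assumptions for illustration, not the kernel implementation.

```c
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

/* Approximation of the count ib_umem_num_pages() derives from the umem
 * bounds: pages spanned from the page containing addr through the page
 * containing the last byte of the region.
 */
static unsigned long num_pages(unsigned long addr, unsigned long length)
{
	unsigned long first = addr & PAGE_MASK;
	unsigned long last  = (addr + length + PAGE_SIZE - 1) & PAGE_MASK;

	return (last - first) >> PAGE_SHIFT;
}

int main(void)
{
	/* Three bytes that straddle a page boundary span two pages. */
	printf("%lu\n", num_pages(0x1ffe, 3));	/* prints 2 */
	return 0;
}
```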