Commit f25cbb7a authored by Alex Sierra, committed by akpm

mm: add zone device coherent type memory support

Device memory that is cache coherent from both the device's and the CPU's
point of view.  This is used on platforms that have an advanced system bus
(like CAPI or CXL).  Any page of a process can be migrated to such memory.
However, no one should be allowed to pin such memory, so that it can always
be evicted.
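
For orientation, here is a minimal sketch of how a driver could register such
memory once this type exists. The dev_pagemap fields and the memremap_pages()
call are the real ZONE_DEVICE API; my_register_coherent_memory(),
my_page_free(), my_pagemap_ops, and my_driver_owner are illustrative
assumptions, not part of this patch:

```c
#include <linux/err.h>
#include <linux/ioport.h>
#include <linux/memremap.h>
#include <linux/slab.h>

static void my_page_free(struct page *page)
{
	/* hypothetical: return the page to the driver's allocator */
}

static const struct dev_pagemap_ops my_pagemap_ops = {
	.page_free = my_page_free,	/* required for MEMORY_DEVICE_COHERENT */
};

static int my_driver_owner;	/* any non-NULL cookie identifying the driver */

static int my_register_coherent_memory(struct resource *res, int numa_node)
{
	struct dev_pagemap *pgmap;
	void *addr;

	pgmap = kzalloc(sizeof(*pgmap), GFP_KERNEL);
	if (!pgmap)
		return -ENOMEM;

	pgmap->type = MEMORY_DEVICE_COHERENT;	/* the new memory type */
	pgmap->range.start = res->start;
	pgmap->range.end = res->end;
	pgmap->nr_range = 1;
	pgmap->ops = &my_pagemap_ops;		/* must provide ->page_free */
	pgmap->owner = &my_driver_owner;	/* must be non-NULL */

	/* Hotplugs the device memory as ZONE_DEVICE struct pages. */
	addr = memremap_pages(pgmap, numa_node);
	if (IS_ERR(addr)) {
		kfree(pgmap);
		return PTR_ERR(addr);
	}
	return 0;
}
```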

[hch@lst.de: rebased on top of the refcount changes, remove is_dev_private_or_coherent_page]
Link: https://lkml.kernel.org/r/20220715150521.18165-4-alex.sierra@amd.com
Signed-off-by: Alex Sierra <alex.sierra@amd.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Felix Kuehling <Felix.Kuehling@amd.com>
Reviewed-by: Alistair Popple <apopple@nvidia.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
parent 5bb88dc5
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -41,6 +41,13 @@ struct vmem_altmap {
  * A more complete discussion of unaddressable memory may be found in
  * include/linux/hmm.h and Documentation/mm/hmm.rst.
  *
+ * MEMORY_DEVICE_COHERENT:
+ * Device memory that is cache coherent from device and CPU point of view. This
+ * is used on platforms that have an advanced system bus (like CAPI or CXL). A
+ * driver can hotplug the device memory using ZONE_DEVICE and with that memory
+ * type. Any page of a process can be migrated to such memory. However no one
+ * should be allowed to pin such memory so that it can always be evicted.
+ *
  * MEMORY_DEVICE_FS_DAX:
  * Host memory that has similar access semantics as System RAM i.e. DMA
  * coherent and supports page pinning. In support of coordinating page
@@ -61,6 +68,7 @@ struct vmem_altmap {
 enum memory_type {
 	/* 0 is reserved to catch uninitialized type fields */
 	MEMORY_DEVICE_PRIVATE = 1,
+	MEMORY_DEVICE_COHERENT,
 	MEMORY_DEVICE_FS_DAX,
 	MEMORY_DEVICE_GENERIC,
 	MEMORY_DEVICE_PCI_P2PDMA,
@@ -150,6 +158,17 @@ static inline bool is_pci_p2pdma_page(const struct page *page)
 		page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA;
 }
 
+static inline bool is_device_coherent_page(const struct page *page)
+{
+	return is_zone_device_page(page) &&
+		page->pgmap->type == MEMORY_DEVICE_COHERENT;
+}
+
+static inline bool folio_is_device_coherent(const struct folio *folio)
+{
+	return is_device_coherent_page(&folio->page);
+}
+
 #ifdef CONFIG_ZONE_DEVICE
 void *memremap_pages(struct dev_pagemap *pgmap, int nid);
 void memunmap_pages(struct dev_pagemap *pgmap);
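The two helpers above mirror the existing is_device_private_page()/folio
pattern. As a hedged illustration of why the distinction matters to callers
(the my_* handlers are hypothetical, not kernel functions):

```c
/* Sketch only: dispatch on the ZONE_DEVICE page type. Device-private pages
 * are not CPU-addressable and must be migrated back to system RAM before the
 * CPU can touch them; device-coherent pages can be accessed in place. */
static void my_handle_zone_device_page(struct page *page)
{
	if (is_device_private_page(page))
		my_migrate_back_to_ram(page);	/* hypothetical */
	else if (is_device_coherent_page(page))
		my_access_in_place(page);	/* hypothetical */
}
```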
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -28,6 +28,7 @@
 #include <linux/sched.h>
 #include <linux/pgtable.h>
 #include <linux/kasan.h>
+#include <linux/memremap.h>
 
 struct mempolicy;
 struct anon_vma;
@@ -1537,7 +1538,9 @@ static inline bool is_longterm_pinnable_page(struct page *page)
 	if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE)
 		return false;
 #endif
-	return !is_zone_movable_page(page) || is_zero_pfn(page_to_pfn(page));
+	return !(is_device_coherent_page(page) ||
+		 is_zone_movable_page(page) ||
+		 is_zero_pfn(page_to_pfn(page)));
 }
 #else
 static inline bool is_longterm_pinnable_page(struct page *page)
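Because device-coherent pages now fail is_longterm_pinnable_page(), any
long-term pin path has to move the data into pinnable memory first. A hedged
sketch of that pattern, not the actual GUP code; my_migrate_to_system_ram()
is hypothetical:

```c
/* Sketch: before taking a FOLL_LONGTERM-style pin, replace every page that
 * must stay evictable (device-coherent, ZONE_MOVABLE, ...) with a copy in
 * ordinary system RAM, then pin the replacement. */
static int my_prepare_longterm_pin(struct page **pages, unsigned long npages)
{
	unsigned long i;

	for (i = 0; i < npages; i++) {
		if (is_longterm_pinnable_page(pages[i]))
			continue;
		/* hypothetical helper: migrates the page and updates the
		 * array slot; returns nonzero on failure */
		if (my_migrate_to_system_ram(&pages[i]))
			return -ENOMEM;
	}
	return 0;
}
```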
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5716,8 +5716,8 @@ static int mem_cgroup_move_account(struct page *page,
  *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
  *     target for charge migration. if @target is not NULL, the entry is stored
  *     in target->ent.
- *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE
- *     (so ZONE_DEVICE page and thus not on the lru).
+ *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is device memory and
+ *     thus not on the lru.
  *   For now we such page is charge like a regular page would be as for all
  *   intent and purposes it is just special memory taking the place of a
  *   regular page.
@@ -5755,7 +5755,8 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
 		 */
 		if (page_memcg(page) == mc.from) {
 			ret = MC_TARGET_PAGE;
-			if (is_device_private_page(page))
+			if (is_device_private_page(page) ||
+			    is_device_coherent_page(page))
 				ret = MC_TARGET_DEVICE;
 			if (target)
 				target->page = page;
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1686,12 +1686,16 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
 		goto unlock;
 	}
 
-	if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
+	switch (pgmap->type) {
+	case MEMORY_DEVICE_PRIVATE:
+	case MEMORY_DEVICE_COHERENT:
 		/*
-		 * TODO: Handle HMM pages which may need coordination
+		 * TODO: Handle device pages which may need coordination
 		 * with device-side memory.
 		 */
 		goto unlock;
+	default:
+		break;
 	}
 
 	/*
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -315,6 +315,16 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
 			return ERR_PTR(-EINVAL);
 		}
 		break;
+	case MEMORY_DEVICE_COHERENT:
+		if (!pgmap->ops->page_free) {
+			WARN(1, "Missing page_free method\n");
+			return ERR_PTR(-EINVAL);
+		}
+		if (!pgmap->owner) {
+			WARN(1, "Missing owner\n");
+			return ERR_PTR(-EINVAL);
+		}
+		break;
 	case MEMORY_DEVICE_FS_DAX:
 		if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) {
 			WARN(1, "File system DAX not supported\n");
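These two checks are the whole registration contract for the new type. A
slightly fuller sketch than the one under the commit message, showing a
page_free callback that recycles pages onto a driver free list; my_lock and
my_free_list are hypothetical driver state:

```c
/* Sketch: the core mm invokes ->page_free when the last reference to a
 * device page is dropped; link freed pages through the driver-private
 * zone_device_data field so they can be handed out again. */
static DEFINE_SPINLOCK(my_lock);
static struct page *my_free_list;

static void my_page_free(struct page *page)
{
	spin_lock(&my_lock);
	page->zone_device_data = my_free_list;
	my_free_list = page;
	spin_unlock(&my_lock);
}

static const struct dev_pagemap_ops my_pagemap_ops = {
	.page_free = my_page_free,
};
```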
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -518,7 +518,7 @@ EXPORT_SYMBOL(migrate_vma_setup);
  *   handle_pte_fault()
  *     do_anonymous_page()
  * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE
- * private page.
+ * private or coherent page.
  */
 static void migrate_vma_insert_page(struct migrate_vma *migrate,
 				    unsigned long addr,
@@ -594,11 +594,8 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
 					page_to_pfn(page));
 		entry = swp_entry_to_pte(swp_entry);
 	} else {
-		/*
-		 * For now we only support migrating to un-addressable device
-		 * memory.
-		 */
-		if (is_zone_device_page(page)) {
+		if (is_zone_device_page(page) &&
+		    !is_device_coherent_page(page)) {
 			pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
 			goto abort;
 		}
@@ -701,10 +698,11 @@ void migrate_vma_pages(struct migrate_vma *migrate)
 
 		mapping = page_mapping(page);
 
-		if (is_device_private_page(newpage)) {
+		if (is_device_private_page(newpage) ||
+		    is_device_coherent_page(newpage)) {
 			/*
-			 * For now only support private anonymous when migrating
-			 * to un-addressable device memory.
+			 * For now only support anonymous memory migrating to
+			 * device private or coherent memory.
 			 */
 			if (mapping) {
 				migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
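With these changes the migrate_vma API can target coherent memory directly. A
condensed, hedged sketch of the driver side, loosely modeled on
lib/test_hmm.c; my_alloc_device_page(), my_driver_owner, and the fixed-size
pfn arrays are illustrative assumptions:

```c
/* Sketch: migrate a small range of anonymous memory to device-coherent
 * pages. Error handling and the data copy are omitted; unlike with
 * device-private memory, the copy could even be done by the CPU, since
 * coherent pages are CPU-addressable. */
static int my_migrate_range_to_device(struct vm_area_struct *vma,
				      unsigned long start, unsigned long end)
{
	unsigned long src_pfns[64] = { 0 };	/* assumes <= 64 pages */
	unsigned long dst_pfns[64] = { 0 };
	struct migrate_vma args = {
		.vma		= vma,
		.start		= start,
		.end		= end,
		.src		= src_pfns,
		.dst		= dst_pfns,
		.pgmap_owner	= &my_driver_owner,	/* hypothetical */
		.flags		= MIGRATE_VMA_SELECT_SYSTEM,
	};
	unsigned long i;
	int ret;

	ret = migrate_vma_setup(&args);
	if (ret)
		return ret;

	for (i = 0; i < args.npages; i++) {
		struct page *dpage;

		if (!(args.src[i] & MIGRATE_PFN_MIGRATE))
			continue;
		dpage = my_alloc_device_page();	/* MEMORY_DEVICE_COHERENT page */
		if (!dpage)
			continue;	/* leave dst 0: page stays put */
		lock_page(dpage);	/* destination pages must be locked */
		args.dst[i] = migrate_pfn(page_to_pfn(dpage));
	}

	migrate_vma_pages(&args);
	migrate_vma_finalize(&args);
	return 0;
}
```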
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1953,7 +1953,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
 		/* Update high watermark before we lower rss */
 		update_hiwater_rss(mm);
 
-		if (folio_is_zone_device(folio)) {
+		if (folio_is_device_private(folio)) {
 			unsigned long pfn = folio_pfn(folio);
 			swp_entry_t entry;
 			pte_t swp_pte;
@@ -2124,7 +2124,8 @@ void try_to_migrate(struct folio *folio, enum ttu_flags flags)
 					TTU_SYNC)))
 		return;
 
-	if (folio_is_zone_device(folio) && !folio_is_device_private(folio))
+	if (folio_is_zone_device(folio) &&
+	    (!folio_is_device_private(folio) && !folio_is_device_coherent(folio)))
 		return;
 
 	/*