Commit b15cd800 authored by Matthew Wilcox

dax: Convert page fault handlers to XArray

This is the last part of DAX to be converted to the XArray so
remove all the old helper functions.
Signed-off-by: Matthew Wilcox <willy@infradead.org>
parent 9f32d221
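
For orientation before the diff: the conversion replaces the old per-index radix-tree locking helpers with an xa_state that each fault handler sets up once and threads through the helpers. Below is a minimal sketch of that pattern, not taken from the patch; it assumes a kernel with <linux/xarray.h>, EXAMPLE_LOCKED stands in for the DAX_LOCKED flag defined in fs/dax.c, and the function name is made up for illustration.

#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/xarray.h>

#define EXAMPLE_LOCKED	(1UL << 0)	/* stand-in for DAX_LOCKED in fs/dax.c */

/*
 * Illustrative only: set up an xa_state once, then walk and update the
 * mapping's i_pages under xas_lock_irq() instead of open-coding
 * radix-tree lookups and slot replacement.
 */
static void *example_lock_entry(struct address_space *mapping, pgoff_t index)
{
	XA_STATE(xas, &mapping->i_pages, index);
	void *entry;

	xas_lock_irq(&xas);
	entry = xas_load(&xas);			/* walk to the slot */
	if (entry && xa_is_value(entry))	/* DAX entries are value entries */
		xas_store(&xas, xa_mk_value(xa_to_value(entry) | EXAMPLE_LOCKED));
	xas_unlock_irq(&xas);
	return entry;
}
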
@@ -93,12 +93,6 @@ static unsigned long dax_to_pfn(void *entry)
 	return xa_to_value(entry) >> DAX_SHIFT;
 }
 
-static void *dax_make_locked(unsigned long pfn, unsigned long flags)
-{
-	return xa_mk_value(flags | ((unsigned long)pfn << DAX_SHIFT) |
-			DAX_LOCKED);
-}
-
 static void *dax_make_entry(pfn_t pfn, unsigned long flags)
 {
 	return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT));
@@ -155,10 +149,11 @@ struct wait_exceptional_entry_queue {
 	struct exceptional_entry_key key;
 };
 
-static wait_queue_head_t *dax_entry_waitqueue(struct xarray *xa,
-		pgoff_t index, void *entry, struct exceptional_entry_key *key)
+static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas,
+		void *entry, struct exceptional_entry_key *key)
 {
 	unsigned long hash;
+	unsigned long index = xas->xa_index;
 
 	/*
 	 * If 'entry' is a PMD, align the 'index' that we use for the wait
@@ -167,11 +162,10 @@ static wait_queue_head_t *dax_entry_waitqueue(struct xarray *xa,
 	 */
 	if (dax_is_pmd_entry(entry))
 		index &= ~PG_PMD_COLOUR;
-
-	key->xa = xa;
+	key->xa = xas->xa;
 	key->entry_start = index;
 
-	hash = hash_long((unsigned long)xa ^ index, DAX_WAIT_TABLE_BITS);
+	hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS);
 	return wait_table + hash;
 }
@@ -193,13 +187,12 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait,
  * The important information it's conveying is whether the entry at
  * this index used to be a PMD entry.
  */
-static void dax_wake_mapping_entry_waiter(struct xarray *xa,
-		pgoff_t index, void *entry, bool wake_all)
+static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all)
 {
 	struct exceptional_entry_key key;
 	wait_queue_head_t *wq;
 
-	wq = dax_entry_waitqueue(xa, index, entry, &key);
+	wq = dax_entry_waitqueue(xas, entry, &key);
 
 	/*
 	 * Checking for locked entry and prepare_to_wait_exclusive() happens
@@ -211,12 +204,6 @@ static void dax_wake_mapping_entry_waiter(struct xarray *xa,
 	__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
 }
 
-static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all)
-{
-	return dax_wake_mapping_entry_waiter(xas->xa, xas->xa_index, entry,
-			wake_all);
-}
-
 /*
  * Look up entry in page cache, wait for it to become unlocked if it
  * is a DAX entry and return it. The caller must subsequently call
@@ -241,8 +228,7 @@ static void *get_unlocked_entry(struct xa_state *xas)
 				!dax_is_locked(entry))
 			return entry;
 
-		wq = dax_entry_waitqueue(xas->xa, xas->xa_index, entry,
-				&ewait.key);
+		wq = dax_entry_waitqueue(xas, entry, &ewait.key);
 		prepare_to_wait_exclusive(wq, &ewait.wait,
 					  TASK_UNINTERRUPTIBLE);
 		xas_unlock_irq(xas);
@@ -286,138 +272,6 @@ static void *dax_lock_entry(struct xa_state *xas, void *entry)
 	return xas_store(xas, xa_mk_value(v | DAX_LOCKED));
 }
 
-/*
- * Check whether the given slot is locked.  Must be called with the i_pages
- * lock held.
- */
-static inline int slot_locked(struct address_space *mapping, void **slot)
-{
-	unsigned long entry = xa_to_value(
-		radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock));
-	return entry & DAX_LOCKED;
-}
-
-/*
- * Mark the given slot as locked.  Must be called with the i_pages lock held.
- */
-static inline void *lock_slot(struct address_space *mapping, void **slot)
-{
-	unsigned long v = xa_to_value(
-		radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock));
-	void *entry = xa_mk_value(v | DAX_LOCKED);
-	radix_tree_replace_slot(&mapping->i_pages, slot, entry);
-	return entry;
-}
-
-/*
- * Mark the given slot as unlocked.  Must be called with the i_pages lock held.
- */
-static inline void *unlock_slot(struct address_space *mapping, void **slot)
-{
-	unsigned long v = xa_to_value(
-		radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock));
-	void *entry = xa_mk_value(v & ~DAX_LOCKED);
-	radix_tree_replace_slot(&mapping->i_pages, slot, entry);
-	return entry;
-}
-
-/*
- * Lookup entry in page cache, wait for it to become unlocked if it is
- * a DAX entry and return it.  The caller must call
- * put_unlocked_mapping_entry() when he decided not to lock the entry or
- * put_locked_mapping_entry() when he locked the entry and now wants to
- * unlock it.
- *
- * Must be called with the i_pages lock held.
- */
-static void *__get_unlocked_mapping_entry(struct address_space *mapping,
-		pgoff_t index, void ***slotp, bool (*wait_fn)(void))
-{
-	void *entry, **slot;
-	struct wait_exceptional_entry_queue ewait;
-	wait_queue_head_t *wq;
-
-	init_wait(&ewait.wait);
-	ewait.wait.func = wake_exceptional_entry_func;
-
-	for (;;) {
-		bool revalidate;
-
-		entry = __radix_tree_lookup(&mapping->i_pages, index, NULL,
-					  &slot);
-		if (!entry ||
-		    WARN_ON_ONCE(!xa_is_value(entry)) ||
-		    !slot_locked(mapping, slot)) {
-			if (slotp)
-				*slotp = slot;
-			return entry;
-		}
-
-		wq = dax_entry_waitqueue(&mapping->i_pages, index, entry,
-				&ewait.key);
-		prepare_to_wait_exclusive(wq, &ewait.wait,
-					  TASK_UNINTERRUPTIBLE);
-		xa_unlock_irq(&mapping->i_pages);
-		revalidate = wait_fn();
-		finish_wait(wq, &ewait.wait);
-		xa_lock_irq(&mapping->i_pages);
-		if (revalidate)
-			return ERR_PTR(-EAGAIN);
-	}
-}
-
-static bool entry_wait(void)
-{
-	schedule();
-	/*
-	 * Never return an ERR_PTR() from
-	 * __get_unlocked_mapping_entry(), just keep looping.
-	 */
-	return false;
-}
-
-static void *get_unlocked_mapping_entry(struct address_space *mapping,
-		pgoff_t index, void ***slotp)
-{
-	return __get_unlocked_mapping_entry(mapping, index, slotp, entry_wait);
-}
-
-static void unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
-{
-	void *entry, **slot;
-
-	xa_lock_irq(&mapping->i_pages);
-	entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, &slot);
-	if (WARN_ON_ONCE(!entry || !xa_is_value(entry) ||
-			 !slot_locked(mapping, slot))) {
-		xa_unlock_irq(&mapping->i_pages);
-		return;
-	}
-	unlock_slot(mapping, slot);
-	xa_unlock_irq(&mapping->i_pages);
-	dax_wake_mapping_entry_waiter(&mapping->i_pages, index, entry, false);
-}
-
-static void put_locked_mapping_entry(struct address_space *mapping,
-		pgoff_t index)
-{
-	unlock_mapping_entry(mapping, index);
-}
-
-/*
- * Called when we are done with page cache entry we looked up via
- * get_unlocked_mapping_entry() and which we didn't lock in the end.
- */
-static void put_unlocked_mapping_entry(struct address_space *mapping,
-		pgoff_t index, void *entry)
-{
-	if (!entry)
-		return;
-
-	/* We have to wake up next waiter for the page cache entry lock */
-	dax_wake_mapping_entry_waiter(&mapping->i_pages, index, entry, false);
-}
-
 static unsigned long dax_entry_size(void *entry)
 {
 	if (dax_is_zero_entry(entry))
@@ -558,47 +412,52 @@ void dax_unlock_mapping_entry(struct page *page)
  * that index, add a locked empty entry.
  *
  * When requesting an entry with size DAX_PMD, grab_mapping_entry() will
- * either return that locked entry or will return an error.  This error will
- * happen if there are any 4k entries within the 2MiB range that we are
- * requesting.
+ * either return that locked entry or will return VM_FAULT_FALLBACK.
+ * This will happen if there are any PTE entries within the PMD range
+ * that we are requesting.
  *
- * We always favor 4k entries over 2MiB entries. There isn't a flow where we
- * evict 4k entries in order to 'upgrade' them to a 2MiB entry.  A 2MiB
- * insertion will fail if it finds any 4k entries already in the tree, and a
- * 4k insertion will cause an existing 2MiB entry to be unmapped and
- * downgraded to 4k entries.  This happens for both 2MiB huge zero pages as
- * well as 2MiB empty entries.
+ * We always favor PTE entries over PMD entries. There isn't a flow where we
+ * evict PTE entries in order to 'upgrade' them to a PMD entry.  A PMD
+ * insertion will fail if it finds any PTE entries already in the tree, and a
+ * PTE insertion will cause an existing PMD entry to be unmapped and
+ * downgraded to PTE entries.  This happens for both PMD zero pages as
+ * well as PMD empty entries.
  *
- * The exception to this downgrade path is for 2MiB DAX PMD entries that have
- * real storage backing them.  We will leave these real 2MiB DAX entries in
- * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry.
+ * The exception to this downgrade path is for PMD entries that have
+ * real storage backing them.  We will leave these real PMD entries in
+ * the tree, and PTE writes will simply dirty the entire PMD entry.
  *
  * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
  * persistent memory the benefit is doubtful. We can add that later if we can
  * show it helps.
+ *
+ * On error, this function does not return an ERR_PTR.  Instead it returns
+ * a VM_FAULT code, encoded as an xarray internal entry.  The ERR_PTR values
+ * overlap with xarray value entries.
  */
-static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
-		unsigned long size_flag)
+static void *grab_mapping_entry(struct xa_state *xas,
+		struct address_space *mapping, unsigned long size_flag)
 {
-	bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */
-	void *entry, **slot;
+	unsigned long index = xas->xa_index;
+	bool pmd_downgrade = false; /* splitting PMD entry into PTE entries? */
+	void *entry;
 
-restart:
-	xa_lock_irq(&mapping->i_pages);
-	entry = get_unlocked_mapping_entry(mapping, index, &slot);
-
-	if (WARN_ON_ONCE(entry && !xa_is_value(entry))) {
-		entry = ERR_PTR(-EIO);
-		goto out_unlock;
-	}
+retry:
+	xas_lock_irq(xas);
+	entry = get_unlocked_entry(xas);
+	if (xa_is_internal(entry))
+		goto fallback;
 
 	if (entry) {
+		if (WARN_ON_ONCE(!xa_is_value(entry))) {
+			xas_set_err(xas, EIO);
+			goto out_unlock;
+		}
+
 		if (size_flag & DAX_PMD) {
 			if (dax_is_pte_entry(entry)) {
-				put_unlocked_mapping_entry(mapping, index,
-						entry);
-				entry = ERR_PTR(-EEXIST);
-				goto out_unlock;
+				put_unlocked_entry(xas, entry);
+				goto fallback;
 			}
 		} else { /* trying to grab a PTE entry */
 			if (dax_is_pmd_entry(entry) &&
@@ -609,87 +468,57 @@ static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
 		}
 	}
 
-	/* No entry for given index? Make sure radix tree is big enough. */
-	if (!entry || pmd_downgrade) {
-		int err;
-
-		if (pmd_downgrade) {
-			/*
-			 * Make sure 'entry' remains valid while we drop
-			 * the i_pages lock.
-			 */
-			entry = lock_slot(mapping, slot);
-		}
-
-		xa_unlock_irq(&mapping->i_pages);
+	if (pmd_downgrade) {
+		/*
+		 * Make sure 'entry' remains valid while we drop
+		 * the i_pages lock.
+		 */
+		dax_lock_entry(xas, entry);
+
 		/*
 		 * Besides huge zero pages the only other thing that gets
 		 * downgraded are empty entries which don't need to be
 		 * unmapped.
 		 */
-		if (pmd_downgrade && dax_is_zero_entry(entry))
-			unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
-							PG_PMD_NR, false);
-
-		err = radix_tree_preload(
-				mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
-		if (err) {
-			if (pmd_downgrade)
-				put_locked_mapping_entry(mapping, index);
-			return ERR_PTR(err);
-		}
-		xa_lock_irq(&mapping->i_pages);
-
-		if (!entry) {
-			/*
-			 * We needed to drop the i_pages lock while calling
-			 * radix_tree_preload() and we didn't have an entry to
-			 * lock.  See if another thread inserted an entry at
-			 * our index during this time.
-			 */
-			entry = __radix_tree_lookup(&mapping->i_pages, index,
-					NULL, &slot);
-			if (entry) {
-				radix_tree_preload_end();
-				xa_unlock_irq(&mapping->i_pages);
-				goto restart;
-			}
-		}
-
-		if (pmd_downgrade) {
-			dax_disassociate_entry(entry, mapping, false);
-			radix_tree_delete(&mapping->i_pages, index);
-			mapping->nrexceptional--;
-			dax_wake_mapping_entry_waiter(&mapping->i_pages,
-					index, entry, true);
-		}
-
-		entry = dax_make_locked(0, size_flag | DAX_EMPTY);
-
-		err = __radix_tree_insert(&mapping->i_pages, index,
-				dax_entry_order(entry), entry);
-		radix_tree_preload_end();
-		if (err) {
-			xa_unlock_irq(&mapping->i_pages);
-			/*
-			 * Our insertion of a DAX entry failed, most likely
-			 * because we were inserting a PMD entry and it
-			 * collided with a PTE sized entry at a different
-			 * index in the PMD range.  We haven't inserted
-			 * anything into the radix tree and have no waiters to
-			 * wake.
-			 */
-			return ERR_PTR(err);
-		}
-		/* Good, we have inserted empty locked entry into the tree. */
-		mapping->nrexceptional++;
-		xa_unlock_irq(&mapping->i_pages);
-		return entry;
+		if (dax_is_zero_entry(entry)) {
+			xas_unlock_irq(xas);
+			unmap_mapping_pages(mapping,
+					xas->xa_index & ~PG_PMD_COLOUR,
+					PG_PMD_NR, false);
+			xas_reset(xas);
+			xas_lock_irq(xas);
+		}
+
+		dax_disassociate_entry(entry, mapping, false);
+		xas_store(xas, NULL);	/* undo the PMD join */
+		dax_wake_entry(xas, entry, true);
+		mapping->nrexceptional--;
+		entry = NULL;
+		xas_set(xas, index);
 	}
-	entry = lock_slot(mapping, slot);
-out_unlock:
-	xa_unlock_irq(&mapping->i_pages);
+
+	if (entry) {
+		dax_lock_entry(xas, entry);
+	} else {
+		entry = dax_make_entry(pfn_to_pfn_t(0), size_flag | DAX_EMPTY);
+		dax_lock_entry(xas, entry);
+		if (xas_error(xas))
+			goto out_unlock;
+		mapping->nrexceptional++;
+	}
+
+out_unlock:
+	xas_unlock_irq(xas);
+	if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM))
+		goto retry;
+	if (xas->xa_node == XA_ERROR(-ENOMEM))
+		return xa_mk_internal(VM_FAULT_OOM);
+	if (xas_error(xas))
+		return xa_mk_internal(VM_FAULT_SIGBUS);
 	return entry;
+fallback:
+	xas_unlock_irq(xas);
+	return xa_mk_internal(VM_FAULT_FALLBACK);
 }
 
 /**
@@ -847,29 +676,27 @@ static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
  * already in the tree, we will skip the insertion and just dirty the PMD as
  * appropriate.
  */
-static void *dax_insert_entry(struct address_space *mapping,
-		struct vm_fault *vmf,
-		void *entry, pfn_t pfn_t, unsigned long flags, bool dirty)
+static void *dax_insert_entry(struct xa_state *xas,
+		struct address_space *mapping, struct vm_fault *vmf,
+		void *entry, pfn_t pfn, unsigned long flags, bool dirty)
 {
-	struct radix_tree_root *pages = &mapping->i_pages;
-	unsigned long pfn = pfn_t_to_pfn(pfn_t);
-	pgoff_t index = vmf->pgoff;
-	void *new_entry;
+	void *new_entry = dax_make_entry(pfn, flags);
 
 	if (dirty)
 		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 
 	if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) {
+		unsigned long index = xas->xa_index;
 		/* we are replacing a zero page with block mapping */
 		if (dax_is_pmd_entry(entry))
 			unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
 					PG_PMD_NR, false);
 		else /* pte entry */
-			unmap_mapping_pages(mapping, vmf->pgoff, 1, false);
+			unmap_mapping_pages(mapping, index, 1, false);
 	}
 
-	xa_lock_irq(pages);
-	new_entry = dax_make_locked(pfn, flags);
+	xas_reset(xas);
+	xas_lock_irq(xas);
 	if (dax_entry_size(entry) != dax_entry_size(new_entry)) {
 		dax_disassociate_entry(entry, mapping, false);
 		dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address);
@@ -884,21 +711,18 @@ static void *dax_insert_entry(struct address_space *mapping,
 		 * existing entry is a PMD, we will just leave the PMD in the
 		 * tree and dirty it if necessary.
 		 */
-		struct radix_tree_node *node;
-		void **slot;
-		void *ret;
-
-		ret = __radix_tree_lookup(pages, index, &node, &slot);
-		WARN_ON_ONCE(ret != entry);
-		__radix_tree_replace(pages, node, slot,
-				     new_entry, NULL);
+		void *old = dax_lock_entry(xas, new_entry);
+		WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) |
+					DAX_LOCKED));
 		entry = new_entry;
+	} else {
+		xas_load(xas);	/* Walk the xa_state */
 	}
 
 	if (dirty)
-		radix_tree_tag_set(pages, index, PAGECACHE_TAG_DIRTY);
+		xas_set_mark(xas, PAGECACHE_TAG_DIRTY);
 
-	xa_unlock_irq(pages);
+	xas_unlock_irq(xas);
 	return entry;
 }
@@ -1166,15 +990,16 @@ static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size,
  * If this page is ever written to we will re-fault and change the mapping to
  * point to real DAX storage instead.
  */
-static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry,
-		struct vm_fault *vmf)
+static vm_fault_t dax_load_hole(struct xa_state *xas,
+		struct address_space *mapping, void **entry,
+		struct vm_fault *vmf)
 {
 	struct inode *inode = mapping->host;
 	unsigned long vaddr = vmf->address;
 	pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
 	vm_fault_t ret;
 
-	dax_insert_entry(mapping, vmf, entry, pfn,
+	*entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
 			DAX_ZERO_PAGE, false);
 
 	ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
@@ -1384,6 +1209,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 {
 	struct vm_area_struct *vma = vmf->vma;
 	struct address_space *mapping = vma->vm_file->f_mapping;
+	XA_STATE(xas, &mapping->i_pages, vmf->pgoff);
 	struct inode *inode = mapping->host;
 	unsigned long vaddr = vmf->address;
 	loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
@@ -1410,9 +1236,9 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 	if (write && !vmf->cow_page)
 		flags |= IOMAP_WRITE;
 
-	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
-	if (IS_ERR(entry)) {
-		ret = dax_fault_return(PTR_ERR(entry));
+	entry = grab_mapping_entry(&xas, mapping, 0);
+	if (xa_is_internal(entry)) {
+		ret = xa_to_internal(entry);
 		goto out;
 	}
 
@@ -1485,7 +1311,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 		if (error < 0)
 			goto error_finish_iomap;
 
-		entry = dax_insert_entry(mapping, vmf, entry, pfn,
+		entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
 						 0, write && !sync);
 
 		/*
@@ -1513,7 +1339,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
 		if (!write) {
-			ret = dax_load_hole(mapping, entry, vmf);
+			ret = dax_load_hole(&xas, mapping, &entry, vmf);
 			goto finish_iomap;
 		}
 		/*FALLTHRU*/
@@ -1540,21 +1366,20 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 		ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
 	}
 unlock_entry:
-	put_locked_mapping_entry(mapping, vmf->pgoff);
+	dax_unlock_entry(&xas, entry);
 out:
 	trace_dax_pte_fault_done(inode, vmf, ret);
 	return ret | major;
 }
 
 #ifdef CONFIG_FS_DAX_PMD
-static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
-		void *entry)
+static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
+		struct iomap *iomap, void **entry)
 {
 	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
 	unsigned long pmd_addr = vmf->address & PMD_MASK;
 	struct inode *inode = mapping->host;
 	struct page *zero_page;
-	void *ret = NULL;
 	spinlock_t *ptl;
 	pmd_t pmd_entry;
 	pfn_t pfn;
@@ -1565,7 +1390,7 @@ static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
 		goto fallback;
 
 	pfn = page_to_pfn_t(zero_page);
-	ret = dax_insert_entry(mapping, vmf, entry, pfn,
+	*entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
 			DAX_PMD | DAX_ZERO_PAGE, false);
 
 	ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
@@ -1578,11 +1403,11 @@ static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
 	pmd_entry = pmd_mkhuge(pmd_entry);
 	set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
 	spin_unlock(ptl);
-	trace_dax_pmd_load_hole(inode, vmf, zero_page, ret);
+	trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry);
 	return VM_FAULT_NOPAGE;
 
 fallback:
-	trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, ret);
+	trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry);
 	return VM_FAULT_FALLBACK;
 }
@@ -1591,6 +1416,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 {
 	struct vm_area_struct *vma = vmf->vma;
 	struct address_space *mapping = vma->vm_file->f_mapping;
+	XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER);
 	unsigned long pmd_addr = vmf->address & PMD_MASK;
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
 	bool sync;
@@ -1598,7 +1424,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 	struct inode *inode = mapping->host;
 	vm_fault_t result = VM_FAULT_FALLBACK;
 	struct iomap iomap = { 0 };
-	pgoff_t max_pgoff, pgoff;
+	pgoff_t max_pgoff;
 	void *entry;
 	loff_t pos;
 	int error;
@@ -1609,7 +1435,6 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 	 * supposed to hold locks serializing us with truncate / punch hole so
 	 * this is a reliable test.
 	 */
-	pgoff = linear_page_index(vma, pmd_addr);
 	max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
 
 	trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);
@@ -1634,24 +1459,26 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 	if ((pmd_addr + PMD_SIZE) > vma->vm_end)
 		goto fallback;
 
-	if (pgoff >= max_pgoff) {
+	if (xas.xa_index >= max_pgoff) {
 		result = VM_FAULT_SIGBUS;
 		goto out;
 	}
 
 	/* If the PMD would extend beyond the file size */
-	if ((pgoff | PG_PMD_COLOUR) >= max_pgoff)
+	if ((xas.xa_index | PG_PMD_COLOUR) >= max_pgoff)
 		goto fallback;
 
 	/*
-	 * grab_mapping_entry() will make sure we get a 2MiB empty entry, a
-	 * 2MiB zero page entry or a DAX PMD.  If it can't (because a 4k page
-	 * is already in the tree, for instance), it will return -EEXIST and
-	 * we just fall back to 4k entries.
+	 * grab_mapping_entry() will make sure we get an empty PMD entry,
+	 * a zero PMD entry or a DAX PMD.  If it can't (because a PTE
+	 * entry is already in the array, for instance), it will return
+	 * VM_FAULT_FALLBACK.
 	 */
-	entry = grab_mapping_entry(mapping, pgoff, DAX_PMD);
-	if (IS_ERR(entry))
+	entry = grab_mapping_entry(&xas, mapping, DAX_PMD);
+	if (xa_is_internal(entry)) {
+		result = xa_to_internal(entry);
 		goto fallback;
+	}
 
 	/*
 	 * It is possible, particularly with mixed reads & writes to private
@@ -1670,7 +1497,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 	 * setting up a mapping, so really we're using iomap_begin() as a way
 	 * to look up our filesystem block.
 	 */
-	pos = (loff_t)pgoff << PAGE_SHIFT;
+	pos = (loff_t)xas.xa_index << PAGE_SHIFT;
 	error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
 	if (error)
 		goto unlock_entry;
@@ -1686,7 +1513,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 		if (error < 0)
 			goto finish_iomap;
 
-		entry = dax_insert_entry(mapping, vmf, entry, pfn,
+		entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
 						DAX_PMD, write && !sync);
 
 		/*
@@ -1711,7 +1538,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 	case IOMAP_HOLE:
 		if (WARN_ON_ONCE(write))
 			break;
-		result = dax_pmd_load_hole(vmf, &iomap, entry);
+		result = dax_pmd_load_hole(&xas, vmf, &iomap, &entry);
 		break;
 	default:
 		WARN_ON_ONCE(1);
@@ -1734,7 +1561,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 				&iomap);
 	}
 unlock_entry:
-	put_locked_mapping_entry(mapping, pgoff);
+	dax_unlock_entry(&xas, entry);
 fallback:
 	if (result == VM_FAULT_FALLBACK) {
 		split_huge_pmd(vma, vmf->pmd, vmf->address);
......
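
The grab_mapping_entry() rework above also changes the error convention: instead of ERR_PTRs, which overlap with XArray value entries, failures come back as VM_FAULT codes wrapped in XArray internal entries. A minimal sketch of that convention follows, assuming only <linux/xarray.h> and <linux/mm.h>; the function names are illustrative, not from the patch.

#include <linux/mm.h>
#include <linux/xarray.h>

/*
 * Illustrative only: return a vm_fault_t through a void * by wrapping it
 * as an XArray internal entry, and unwrap it in the caller, mirroring the
 * grab_mapping_entry() convention in the diff above.
 */
static void *example_grab(bool fail)
{
	if (fail)
		return xa_mk_internal(VM_FAULT_FALLBACK);
	return NULL;			/* "no entry yet" in the real code */
}

static vm_fault_t example_fault(bool fail)
{
	void *entry = example_grab(fail);

	if (xa_is_internal(entry))
		return xa_to_internal(entry);	/* VM_FAULT_FALLBACK etc. */
	/* ... handle the (possibly NULL) entry ... */
	return VM_FAULT_NOPAGE;
}
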