Commit ac401cc7 authored by Jan Kara's avatar Jan Kara Committed by Ross Zwisler

dax: New fault locking

Currently DAX page fault locking is racy.

CPU0 (write fault)		CPU1 (read fault)

__dax_fault()			__dax_fault()
  get_block(inode, block, &bh, 0) -> not mapped
				  get_block(inode, block, &bh, 0)
				    -> not mapped
  if (!buffer_mapped(&bh))
    if (vmf->flags & FAULT_FLAG_WRITE)
      get_block(inode, block, &bh, 1) -> allocates blocks
  if (page) -> no
				  if (!buffer_mapped(&bh))
				    if (vmf->flags & FAULT_FLAG_WRITE) {
				    } else {
				      dax_load_hole();
				    }
  dax_insert_mapping()

And we are in a situation where we fail in dax_radix_entry() with -EIO.

Another problem with the current DAX page fault locking is that there is
no race-free way to clear dirty tag in the radix tree. We can always
end up with clean radix tree and dirty data in CPU cache.

We fix the first problem by introducing locking of exceptional radix
tree entries in DAX mappings acting very similarly to page lock and thus
synchronizing properly faults against the same mapping index. The same
lock can later be used to avoid races when clearing radix tree dirty
tag.
Reviewed-by: default avatarNeilBrown <neilb@suse.com>
Reviewed-by: default avatarRoss Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: default avatarJan Kara <jack@suse.cz>
Signed-off-by: default avatarRoss Zwisler <ross.zwisler@linux.intel.com>
parent 4f622938
This diff is collapsed.
...@@ -15,6 +15,9 @@ int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t); ...@@ -15,6 +15,9 @@ int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
int dax_truncate_page(struct inode *, loff_t from, get_block_t); int dax_truncate_page(struct inode *, loff_t from, get_block_t);
int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
void dax_wake_mapping_entry_waiter(struct address_space *mapping,
pgoff_t index, bool wake_all);
#ifdef CONFIG_FS_DAX #ifdef CONFIG_FS_DAX
struct page *read_dax_sector(struct block_device *bdev, sector_t n); struct page *read_dax_sector(struct block_device *bdev, sector_t n);
......
...@@ -160,13 +160,15 @@ static void page_cache_tree_delete(struct address_space *mapping, ...@@ -160,13 +160,15 @@ static void page_cache_tree_delete(struct address_space *mapping,
return; return;
/* /*
* Track node that only contains shadow entries. * Track node that only contains shadow entries. DAX mappings contain
* no shadow entries and may contain other exceptional entries so skip
* those.
* *
* Avoid acquiring the list_lru lock if already tracked. The * Avoid acquiring the list_lru lock if already tracked. The
* list_empty() test is safe as node->private_list is * list_empty() test is safe as node->private_list is
* protected by mapping->tree_lock. * protected by mapping->tree_lock.
*/ */
if (!workingset_node_pages(node) && if (!dax_mapping(mapping) && !workingset_node_pages(node) &&
list_empty(&node->private_list)) { list_empty(&node->private_list)) {
node->private_data = mapping; node->private_data = mapping;
list_lru_add(&workingset_shadow_nodes, &node->private_list); list_lru_add(&workingset_shadow_nodes, &node->private_list);
...@@ -611,6 +613,9 @@ static int page_cache_tree_insert(struct address_space *mapping, ...@@ -611,6 +613,9 @@ static int page_cache_tree_insert(struct address_space *mapping,
/* DAX accounts exceptional entries as normal pages */ /* DAX accounts exceptional entries as normal pages */
if (node) if (node)
workingset_node_pages_dec(node); workingset_node_pages_dec(node);
/* Wakeup waiters for exceptional entry lock */
dax_wake_mapping_entry_waiter(mapping, page->index,
false);
} }
} }
radix_tree_replace_slot(slot, page); radix_tree_replace_slot(slot, page);
......
...@@ -34,40 +34,38 @@ static void clear_exceptional_entry(struct address_space *mapping, ...@@ -34,40 +34,38 @@ static void clear_exceptional_entry(struct address_space *mapping,
if (shmem_mapping(mapping)) if (shmem_mapping(mapping))
return; return;
spin_lock_irq(&mapping->tree_lock);
if (dax_mapping(mapping)) { if (dax_mapping(mapping)) {
if (radix_tree_delete_item(&mapping->page_tree, index, entry)) dax_delete_mapping_entry(mapping, index);
mapping->nrexceptional--; return;
} else {
/*
* Regular page slots are stabilized by the page lock even
* without the tree itself locked. These unlocked entries
* need verification under the tree lock.
*/
if (!__radix_tree_lookup(&mapping->page_tree, index, &node,
&slot))
goto unlock;
if (*slot != entry)
goto unlock;
radix_tree_replace_slot(slot, NULL);
mapping->nrexceptional--;
if (!node)
goto unlock;
workingset_node_shadows_dec(node);
/*
* Don't track node without shadow entries.
*
* Avoid acquiring the list_lru lock if already untracked.
* The list_empty() test is safe as node->private_list is
* protected by mapping->tree_lock.
*/
if (!workingset_node_shadows(node) &&
!list_empty(&node->private_list))
list_lru_del(&workingset_shadow_nodes,
&node->private_list);
__radix_tree_delete_node(&mapping->page_tree, node);
} }
spin_lock_irq(&mapping->tree_lock);
/*
* Regular page slots are stabilized by the page lock even
* without the tree itself locked. These unlocked entries
* need verification under the tree lock.
*/
if (!__radix_tree_lookup(&mapping->page_tree, index, &node,
&slot))
goto unlock;
if (*slot != entry)
goto unlock;
radix_tree_replace_slot(slot, NULL);
mapping->nrexceptional--;
if (!node)
goto unlock;
workingset_node_shadows_dec(node);
/*
* Don't track node without shadow entries.
*
* Avoid acquiring the list_lru lock if already untracked.
* The list_empty() test is safe as node->private_list is
* protected by mapping->tree_lock.
*/
if (!workingset_node_shadows(node) &&
!list_empty(&node->private_list))
list_lru_del(&workingset_shadow_nodes,
&node->private_list);
__radix_tree_delete_node(&mapping->page_tree, node);
unlock: unlock:
spin_unlock_irq(&mapping->tree_lock); spin_unlock_irq(&mapping->tree_lock);
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment