Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
L
linux
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
nexedi
linux
Commits
a2f6d9c4
Commit
a2f6d9c4
authored
Nov 13, 2016
by
Theodore Ts'o
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'dax-4.10-iomap-pmd' into origin
parents
bc33b0ca
9484ab1b
Changes
11
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
542 additions
and
433 deletions
+542
-433
fs/Kconfig
fs/Kconfig
+0
-1
fs/dax.c
fs/dax.c
+464
-364
fs/ext2/file.c
fs/ext2/file.c
+9
-26
fs/ext4/inode.c
fs/ext4/inode.c
+3
-0
fs/iomap.c
fs/iomap.c
+3
-2
fs/xfs/xfs_aops.c
fs/xfs/xfs_aops.c
+5
-21
fs/xfs/xfs_aops.h
fs/xfs/xfs_aops.h
+0
-3
fs/xfs/xfs_file.c
fs/xfs/xfs_file.c
+5
-5
include/linux/dax.h
include/linux/dax.h
+50
-8
include/linux/iomap.h
include/linux/iomap.h
+1
-0
mm/filemap.c
mm/filemap.c
+2
-3
No files found.
fs/Kconfig
View file @
a2f6d9c4
...
...
@@ -55,7 +55,6 @@ config FS_DAX_PMD
depends on FS_DAX
depends on ZONE_DEVICE
depends on TRANSPARENT_HUGEPAGE
depends on BROKEN
endif # BLOCK
...
...
fs/dax.c
View file @
a2f6d9c4
...
...
@@ -34,25 +34,11 @@
#include <linux/iomap.h>
#include "internal.h"
/*
 * We use the lowest available bit in an exceptional entry for locking and
 * the next two bits to determine the entry type: 3 special bits in total.
 * The bits above RADIX_DAX_SHIFT hold the sector number.
 *
 * Every macro argument is parenthesized so that compound expressions such
 * as "a | b" or "cond ? x : y" expand with the intended precedence.
 */
#define RADIX_DAX_SHIFT	(RADIX_TREE_EXCEPTIONAL_SHIFT + 3)
#define RADIX_DAX_PTE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
#define RADIX_DAX_TYPE_MASK (RADIX_DAX_PTE | RADIX_DAX_PMD)
/* Extract the type bits (RADIX_DAX_PTE or RADIX_DAX_PMD) of an entry. */
#define RADIX_DAX_TYPE(entry) ((unsigned long)(entry) & RADIX_DAX_TYPE_MASK)
/* Extract the sector number stored above the special bits of an entry. */
#define RADIX_DAX_SECTOR(entry) ((unsigned long)(entry) >> RADIX_DAX_SHIFT)
/* Build an exceptional entry for 'sector'; 'pmd' selects the PMD type bit. */
#define RADIX_DAX_ENTRY(sector, pmd) \
	((void *)(((unsigned long)(sector) << RADIX_DAX_SHIFT) | \
		  ((pmd) ? RADIX_DAX_PMD : RADIX_DAX_PTE) | \
		  RADIX_TREE_EXCEPTIONAL_ENTRY))
/* We choose 4096 entries - same as per-zone page wait tables */
#define DAX_WAIT_TABLE_BITS 12
#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
wait_queue_head_t
wait_table
[
DAX_WAIT_TABLE_ENTRIES
];
static
wait_queue_head_t
wait_table
[
DAX_WAIT_TABLE_ENTRIES
];
static
int
__init
init_dax_wait_table
(
void
)
{
...
...
@@ -64,14 +50,6 @@ static int __init init_dax_wait_table(void)
}
fs_initcall
(
init_dax_wait_table
);
/*
 * Pick the wait queue for the radix tree entry at (mapping, index): the
 * mapping pointer XORed with the index is hashed down to
 * DAX_WAIT_TABLE_BITS bits and used as a slot number in wait_table[].
 */
static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
					      pgoff_t index)
{
	unsigned long seed = (unsigned long)mapping ^ index;
	unsigned long slot = hash_long(seed, DAX_WAIT_TABLE_BITS);

	return &wait_table[slot];
}
static
long
dax_map_atomic
(
struct
block_device
*
bdev
,
struct
blk_dax_ctl
*
dax
)
{
struct
request_queue
*
q
=
bdev
->
bd_queue
;
...
...
@@ -98,6 +76,26 @@ static void dax_unmap_atomic(struct block_device *bdev,
blk_queue_exit
(
bdev
->
bd_queue
);
}
/* Returns non-zero when the RADIX_DAX_PMD bit is set in @entry. */
static int dax_is_pmd_entry(void *entry)
{
	unsigned long bits = (unsigned long)entry;

	return bits & RADIX_DAX_PMD;
}
/* Returns 1 when the RADIX_DAX_PMD bit is clear in @entry, 0 otherwise. */
static int dax_is_pte_entry(void *entry)
{
	unsigned long bits = (unsigned long)entry;

	return (bits & RADIX_DAX_PMD) == 0;
}
/* Returns non-zero when the RADIX_DAX_HZP bit is set in @entry. */
static int dax_is_zero_entry(void *entry)
{
	unsigned long bits = (unsigned long)entry;

	return bits & RADIX_DAX_HZP;
}
/* Returns non-zero when the RADIX_DAX_EMPTY bit is set in @entry. */
static int dax_is_empty_entry(void *entry)
{
	unsigned long bits = (unsigned long)entry;

	return bits & RADIX_DAX_EMPTY;
}
struct
page
*
read_dax_sector
(
struct
block_device
*
bdev
,
sector_t
n
)
{
struct
page
*
page
=
alloc_pages
(
GFP_KERNEL
,
0
);
...
...
@@ -123,19 +121,6 @@ static bool buffer_written(struct buffer_head *bh)
return
buffer_mapped
(
bh
)
&&
!
buffer_unwritten
(
bh
);
}
/*
* When ext4 encounters a hole, it returns without modifying the buffer_head
* which means that we can't trust b_size. To cope with this, we set b_state
* to 0 before calling get_block and, if any bit is set, we know we can trust
* b_size. Unfortunate, really, since ext4 knows precisely how long a hole is
* and would save us time calling get_block repeatedly.
*/
static
bool
buffer_size_valid
(
struct
buffer_head
*
bh
)
{
return
bh
->
b_state
!=
0
;
}
static
sector_t
to_sector
(
const
struct
buffer_head
*
bh
,
const
struct
inode
*
inode
)
{
...
...
@@ -177,8 +162,6 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
rc
=
get_block
(
inode
,
block
,
bh
,
rw
==
WRITE
);
if
(
rc
)
break
;
if
(
!
buffer_size_valid
(
bh
))
bh
->
b_size
=
1
<<
blkbits
;
bh_max
=
pos
-
first
+
bh
->
b_size
;
bdev
=
bh
->
b_bdev
;
/*
...
...
@@ -300,7 +283,7 @@ EXPORT_SYMBOL_GPL(dax_do_io);
*/
struct
exceptional_entry_key
{
struct
address_space
*
mapping
;
unsigned
long
index
;
pgoff_t
entry_start
;
};
struct
wait_exceptional_entry_queue
{
...
...
@@ -308,6 +291,26 @@ struct wait_exceptional_entry_queue {
struct
exceptional_entry_key
key
;
};
/*
 * Look up the wait queue used for the radix tree entry covering @index and
 * fill in @key (mapping and entry_start) so that waiters can be matched.
 */
static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
		pgoff_t index, void *entry, struct exceptional_entry_key *key)
{
	const unsigned long pmd_colour = (1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1;
	pgoff_t start = index;
	unsigned long slot;

	/*
	 * For a PMD entry the wait queue index is aligned down to the start
	 * of that PMD, so every offset in the range covered by the PMD maps
	 * to the same bit lock.
	 */
	if (dax_is_pmd_entry(entry))
		start &= ~pmd_colour;

	key->mapping = mapping;
	key->entry_start = start;

	slot = hash_long((unsigned long)mapping ^ start, DAX_WAIT_TABLE_BITS);
	return &wait_table[slot];
}
static
int
wake_exceptional_entry_func
(
wait_queue_t
*
wait
,
unsigned
int
mode
,
int
sync
,
void
*
keyp
)
{
...
...
@@ -316,7 +319,7 @@ static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode,
container_of
(
wait
,
struct
wait_exceptional_entry_queue
,
wait
);
if
(
key
->
mapping
!=
ewait
->
key
.
mapping
||
key
->
index
!=
ewait
->
key
.
index
)
key
->
entry_start
!=
ewait
->
key
.
entry_start
)
return
0
;
return
autoremove_wake_function
(
wait
,
mode
,
sync
,
NULL
);
}
...
...
@@ -372,24 +375,24 @@ static inline void *unlock_slot(struct address_space *mapping, void **slot)
static
void
*
get_unlocked_mapping_entry
(
struct
address_space
*
mapping
,
pgoff_t
index
,
void
***
slotp
)
{
void
*
ret
,
**
slot
;
void
*
entry
,
**
slot
;
struct
wait_exceptional_entry_queue
ewait
;
wait_queue_head_t
*
wq
=
dax_entry_waitqueue
(
mapping
,
index
)
;
wait_queue_head_t
*
wq
;
init_wait
(
&
ewait
.
wait
);
ewait
.
wait
.
func
=
wake_exceptional_entry_func
;
ewait
.
key
.
mapping
=
mapping
;
ewait
.
key
.
index
=
index
;
for
(;;)
{
ret
=
__radix_tree_lookup
(
&
mapping
->
page_tree
,
index
,
NULL
,
entry
=
__radix_tree_lookup
(
&
mapping
->
page_tree
,
index
,
NULL
,
&
slot
);
if
(
!
ret
||
!
radix_tree_exceptional_entry
(
ret
)
||
if
(
!
entry
||
!
radix_tree_exceptional_entry
(
entry
)
||
!
slot_locked
(
mapping
,
slot
))
{
if
(
slotp
)
*
slotp
=
slot
;
return
ret
;
return
entry
;
}
wq
=
dax_entry_waitqueue
(
mapping
,
index
,
entry
,
&
ewait
.
key
);
prepare_to_wait_exclusive
(
wq
,
&
ewait
.
wait
,
TASK_UNINTERRUPTIBLE
);
spin_unlock_irq
(
&
mapping
->
tree_lock
);
...
...
@@ -399,52 +402,157 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping,
}
}
/*
 * Release a locked mapping entry: an exceptional entry is unlocked through
 * dax_unlock_mapping_entry(); anything else is treated as a page, which is
 * unlocked and has its reference dropped.
 */
static void put_locked_mapping_entry(struct address_space *mapping,
				     pgoff_t index, void *entry)
{
	struct page *page;

	if (radix_tree_exceptional_entry(entry)) {
		dax_unlock_mapping_entry(mapping, index);
		return;
	}

	page = entry;
	unlock_page(page);
	put_page(page);
}
/*
 * Called once we are finished with a radix tree entry that was looked up via
 * get_unlocked_mapping_entry() but never locked.  Entries that are not
 * exceptional need no further action.
 */
static void put_unlocked_mapping_entry(struct address_space *mapping,
				       pgoff_t index, void *entry)
{
	/* The next waiter for the radix tree entry lock must be woken up */
	if (radix_tree_exceptional_entry(entry))
		dax_wake_mapping_entry_waiter(mapping, index, entry, false);
}
/*
* Find radix tree entry at given index. If it points to a page, return with
* the page locked. If it points to the exceptional entry, return with the
* radix tree entry locked. If the radix tree doesn't contain given index,
* create empty exceptional entry for the index and return with it locked.
*
* When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will
* either return that locked entry or will return an error. This error will
* happen if there are any 4k entries (either zero pages or DAX entries)
* within the 2MiB range that we are requesting.
*
* We always favor 4k entries over 2MiB entries. There isn't a flow where we
* evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB
* insertion will fail if it finds any 4k entries already in the tree, and a
* 4k insertion will cause an existing 2MiB entry to be unmapped and
* downgraded to 4k entries. This happens for both 2MiB huge zero pages as
* well as 2MiB empty entries.
*
* The exception to this downgrade path is for 2MiB DAX PMD entries that have
* real storage backing them. We will leave these real 2MiB DAX entries in
* the tree, and PTE writes will simply dirty the entire 2MiB DAX entry.
*
* Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
* persistent memory the benefit is doubtful. We can add that later if we can
* show it helps.
*/
static
void
*
grab_mapping_entry
(
struct
address_space
*
mapping
,
pgoff_t
index
)
static
void
*
grab_mapping_entry
(
struct
address_space
*
mapping
,
pgoff_t
index
,
unsigned
long
size_flag
)
{
void
*
ret
,
**
slot
;
bool
pmd_downgrade
=
false
;
/* splitting 2MiB entry into 4k entries? */
void
*
entry
,
**
slot
;
restart:
spin_lock_irq
(
&
mapping
->
tree_lock
);
ret
=
get_unlocked_mapping_entry
(
mapping
,
index
,
&
slot
);
entry
=
get_unlocked_mapping_entry
(
mapping
,
index
,
&
slot
);
if
(
entry
)
{
if
(
size_flag
&
RADIX_DAX_PMD
)
{
if
(
!
radix_tree_exceptional_entry
(
entry
)
||
dax_is_pte_entry
(
entry
))
{
put_unlocked_mapping_entry
(
mapping
,
index
,
entry
);
entry
=
ERR_PTR
(
-
EEXIST
);
goto
out_unlock
;
}
}
else
{
/* trying to grab a PTE entry */
if
(
radix_tree_exceptional_entry
(
entry
)
&&
dax_is_pmd_entry
(
entry
)
&&
(
dax_is_zero_entry
(
entry
)
||
dax_is_empty_entry
(
entry
)))
{
pmd_downgrade
=
true
;
}
}
}
/* No entry for given index? Make sure radix tree is big enough. */
if
(
!
ret
)
{
if
(
!
entry
||
pmd_downgrade
)
{
int
err
;
if
(
pmd_downgrade
)
{
/*
* Make sure 'entry' remains valid while we drop
* mapping->tree_lock.
*/
entry
=
lock_slot
(
mapping
,
slot
);
}
spin_unlock_irq
(
&
mapping
->
tree_lock
);
err
=
radix_tree_preload
(
mapping_gfp_mask
(
mapping
)
&
~
__GFP_HIGHMEM
);
if
(
err
)
if
(
err
)
{
if
(
pmd_downgrade
)
put_locked_mapping_entry
(
mapping
,
index
,
entry
);
return
ERR_PTR
(
err
);
ret
=
(
void
*
)(
RADIX_TREE_EXCEPTIONAL_ENTRY
|
RADIX_DAX_ENTRY_LOCK
);
}
/*
* Besides huge zero pages the only other thing that gets
* downgraded are empty entries which don't need to be
* unmapped.
*/
if
(
pmd_downgrade
&&
dax_is_zero_entry
(
entry
))
unmap_mapping_range
(
mapping
,
(
index
<<
PAGE_SHIFT
)
&
PMD_MASK
,
PMD_SIZE
,
0
);
spin_lock_irq
(
&
mapping
->
tree_lock
);
err
=
radix_tree_insert
(
&
mapping
->
page_tree
,
index
,
ret
);
if
(
pmd_downgrade
)
{
radix_tree_delete
(
&
mapping
->
page_tree
,
index
);
mapping
->
nrexceptional
--
;
dax_wake_mapping_entry_waiter
(
mapping
,
index
,
entry
,
true
);
}
entry
=
dax_radix_locked_entry
(
0
,
size_flag
|
RADIX_DAX_EMPTY
);
err
=
__radix_tree_insert
(
&
mapping
->
page_tree
,
index
,
dax_radix_order
(
entry
),
entry
);
radix_tree_preload_end
();
if
(
err
)
{
spin_unlock_irq
(
&
mapping
->
tree_lock
);
/* Someone already created the entry? */
if
(
err
==
-
EEXIST
)
/*
* Someone already created the entry? This is a
* normal failure when inserting PMDs in a range
* that already contains PTEs. In that case we want
* to return -EEXIST immediately.
*/
if
(
err
==
-
EEXIST
&&
!
(
size_flag
&
RADIX_DAX_PMD
))
goto
restart
;
/*
* Our insertion of a DAX PMD entry failed, most
* likely because it collided with a PTE sized entry
* at a different index in the PMD range. We haven't
* inserted anything into the radix tree and have no
* waiters to wake.
*/
return
ERR_PTR
(
err
);
}
/* Good, we have inserted empty locked entry into the tree. */
mapping
->
nrexceptional
++
;
spin_unlock_irq
(
&
mapping
->
tree_lock
);
return
ret
;
return
entry
;
}
/* Normal page in radix tree? */
if
(
!
radix_tree_exceptional_entry
(
ret
))
{
struct
page
*
page
=
ret
;
if
(
!
radix_tree_exceptional_entry
(
entry
))
{
struct
page
*
page
=
entry
;
get_page
(
page
);
spin_unlock_irq
(
&
mapping
->
tree_lock
);
...
...
@@ -457,15 +565,26 @@ static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index)
}
return
page
;
}
ret
=
lock_slot
(
mapping
,
slot
);
entry
=
lock_slot
(
mapping
,
slot
);
out_unlock:
spin_unlock_irq
(
&
mapping
->
tree_lock
);
return
ret
;
return
entry
;
}
/*
* We do not necessarily hold the mapping->tree_lock when we call this
* function so it is possible that 'entry' is no longer a valid item in the
* radix tree. This is okay because all we really need to do is to find the
* correct waitqueue where tasks might be waiting for that old 'entry' and
* wake them.
*/
void
dax_wake_mapping_entry_waiter
(
struct
address_space
*
mapping
,
pgoff_t
index
,
bool
wake_all
)
pgoff_t
index
,
void
*
entry
,
bool
wake_all
)
{
wait_queue_head_t
*
wq
=
dax_entry_waitqueue
(
mapping
,
index
);
struct
exceptional_entry_key
key
;
wait_queue_head_t
*
wq
;
wq
=
dax_entry_waitqueue
(
mapping
,
index
,
entry
,
&
key
);
/*
* Checking for locked entry and prepare_to_wait_exclusive() happens
...
...
@@ -473,54 +592,24 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping,
* So at this point all tasks that could have seen our entry locked
* must be in the waitqueue and the following check will see them.
*/
if
(
waitqueue_active
(
wq
))
{
struct
exceptional_entry_key
key
;
key
.
mapping
=
mapping
;
key
.
index
=
index
;
if
(
waitqueue_active
(
wq
))
__wake_up
(
wq
,
TASK_NORMAL
,
wake_all
?
0
:
1
,
&
key
);
}
}
void
dax_unlock_mapping_entry
(
struct
address_space
*
mapping
,
pgoff_t
index
)
{
void
*
ret
,
**
slot
;
void
*
entry
,
**
slot
;
spin_lock_irq
(
&
mapping
->
tree_lock
);
ret
=
__radix_tree_lookup
(
&
mapping
->
page_tree
,
index
,
NULL
,
&
slot
);
if
(
WARN_ON_ONCE
(
!
ret
||
!
radix_tree_exceptional_entry
(
ret
)
||
entry
=
__radix_tree_lookup
(
&
mapping
->
page_tree
,
index
,
NULL
,
&
slot
);
if
(
WARN_ON_ONCE
(
!
entry
||
!
radix_tree_exceptional_entry
(
entry
)
||
!
slot_locked
(
mapping
,
slot
)))
{
spin_unlock_irq
(
&
mapping
->
tree_lock
);
return
;
}
unlock_slot
(
mapping
,
slot
);
spin_unlock_irq
(
&
mapping
->
tree_lock
);
dax_wake_mapping_entry_waiter
(
mapping
,
index
,
false
);
}
/*
 * Release a locked mapping entry: an exceptional entry is unlocked through
 * dax_unlock_mapping_entry(); anything else is treated as a page, which is
 * unlocked and has its reference dropped.
 */
static void put_locked_mapping_entry(struct address_space *mapping,
				     pgoff_t index, void *entry)
{
	struct page *page;

	if (radix_tree_exceptional_entry(entry)) {
		dax_unlock_mapping_entry(mapping, index);
		return;
	}

	page = entry;
	unlock_page(page);
	put_page(page);
}
/*
* Called when we are done with radix tree entry we looked up via
* get_unlocked_mapping_entry() and which we didn't lock in the end.
*/
static
void
put_unlocked_mapping_entry
(
struct
address_space
*
mapping
,
pgoff_t
index
,
void
*
entry
)
{
if
(
!
radix_tree_exceptional_entry
(
entry
))
return
;
/* We have to wake up next waiter for the radix tree entry lock */
dax_wake_mapping_entry_waiter
(
mapping
,
index
,
false
);
dax_wake_mapping_entry_waiter
(
mapping
,
index
,
entry
,
false
);
}
/*
...
...
@@ -547,7 +636,7 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
radix_tree_delete
(
&
mapping
->
page_tree
,
index
);
mapping
->
nrexceptional
--
;
spin_unlock_irq
(
&
mapping
->
tree_lock
);
dax_wake_mapping_entry_waiter
(
mapping
,
index
,
true
);
dax_wake_mapping_entry_waiter
(
mapping
,
index
,
entry
,
true
);
return
1
;
}
...
...
@@ -600,11 +689,17 @@ static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size
return
0
;
}
#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT))
/*
* By this point grab_mapping_entry() has ensured that we have a locked entry
* of the appropriate size so we don't have to worry about downgrading PMDs to
* PTEs. If we happen to be trying to insert a PTE and there is a PMD
* already in the tree, we will skip the insertion and just dirty the PMD as
* appropriate.
*/
static
void
*
dax_insert_mapping_entry
(
struct
address_space
*
mapping
,
struct
vm_fault
*
vmf
,
void
*
entry
,
sector_t
sector
)
void
*
entry
,
sector_t
sector
,
unsigned
long
flags
)
{
struct
radix_tree_root
*
page_tree
=
&
mapping
->
page_tree
;
int
error
=
0
;
...
...
@@ -627,22 +722,35 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
error
=
radix_tree_preload
(
vmf
->
gfp_mask
&
~
__GFP_HIGHMEM
);
if
(
error
)
return
ERR_PTR
(
error
);
}
else
if
(
dax_is_zero_entry
(
entry
)
&&
!
(
flags
&
RADIX_DAX_HZP
))
{
/* replacing huge zero page with PMD block mapping */
unmap_mapping_range
(
mapping
,
(
vmf
->
pgoff
<<
PAGE_SHIFT
)
&
PMD_MASK
,
PMD_SIZE
,
0
);
}
spin_lock_irq
(
&
mapping
->
tree_lock
);
new_entry
=
(
void
*
)((
unsigned
long
)
RADIX_DAX_ENTRY
(
sector
,
false
)
|
RADIX_DAX_ENTRY_LOCK
);
new_entry
=
dax_radix_locked_entry
(
sector
,
flags
);
if
(
hole_fill
)
{
__delete_from_page_cache
(
entry
,
NULL
);
/* Drop pagecache reference */
put_page
(
entry
);
error
=
radix_tree_insert
(
page_tree
,
index
,
new_entry
);
error
=
__radix_tree_insert
(
page_tree
,
index
,
dax_radix_order
(
new_entry
),
new_entry
);
if
(
error
)
{
new_entry
=
ERR_PTR
(
error
);
goto
unlock
;
}
mapping
->
nrexceptional
++
;
}
else
{
}
else
if
(
dax_is_zero_entry
(
entry
)
||
dax_is_empty_entry
(
entry
))
{
/*
* Only swap our new entry into the radix tree if the current
* entry is a zero page or an empty entry. If a normal PTE or
* PMD entry is already in the tree, we leave it alone. This
* means that if we are trying to insert a PTE and the
* existing entry is a PMD, we will just leave the PMD in the
* tree and dirty it if necessary.
*/
void
**
slot
;
void
*
ret
;
...
...
@@ -672,7 +780,6 @@ static int dax_writeback_one(struct block_device *bdev,
struct
address_space
*
mapping
,
pgoff_t
index
,
void
*
entry
)
{
struct
radix_tree_root
*
page_tree
=
&
mapping
->
page_tree
;
int
type
=
RADIX_DAX_TYPE
(
entry
);
struct
radix_tree_node
*
node
;
struct
blk_dax_ctl
dax
;
void
**
slot
;
...
...
@@ -693,13 +800,21 @@ static int dax_writeback_one(struct block_device *bdev,
if
(
!
radix_tree_tag_get
(
page_tree
,
index
,
PAGECACHE_TAG_TOWRITE
))
goto
unlock
;
if
(
WARN_ON_ONCE
(
type
!=
RADIX_DAX_PTE
&&
type
!=
RADIX_DAX_PMD
))
{
if
(
WARN_ON_ONCE
(
dax_is_empty_entry
(
entry
)
||
dax_is_zero_entry
(
entry
)))
{
ret
=
-
EIO
;
goto
unlock
;
}
dax
.
sector
=
RADIX_DAX_SECTOR
(
entry
);
dax
.
size
=
(
type
==
RADIX_DAX_PMD
?
PMD_SIZE
:
PAGE_SIZE
);
/*
* Even if dax_writeback_mapping_range() was given a wbc->range_start
* in the middle of a PMD, the 'index' we are given will be aligned to
* the start index of the PMD, as will the sector we pull from
* 'entry'. This allows us to flush for PMD_SIZE and not have to
* worry about partial PMD writebacks.
*/
dax
.
sector
=
dax_radix_sector
(
entry
);
dax
.
size
=
PAGE_SIZE
<<
dax_radix_order
(
entry
);
spin_unlock_irq
(
&
mapping
->
tree_lock
);
/*
...
...
@@ -738,12 +853,11 @@ int dax_writeback_mapping_range(struct address_space *mapping,
struct
block_device
*
bdev
,
struct
writeback_control
*
wbc
)
{
struct
inode
*
inode
=
mapping
->
host
;
pgoff_t
start_index
,
end_index
,
pmd_index
;
pgoff_t
start_index
,
end_index
;
pgoff_t
indices
[
PAGEVEC_SIZE
];
struct
pagevec
pvec
;
bool
done
=
false
;
int
i
,
ret
=
0
;
void
*
entry
;
if
(
WARN_ON_ONCE
(
inode
->
i_blkbits
!=
PAGE_SHIFT
))
return
-
EIO
;
...
...
@@ -753,15 +867,6 @@ int dax_writeback_mapping_range(struct address_space *mapping,
start_index
=
wbc
->
range_start
>>
PAGE_SHIFT
;
end_index
=
wbc
->
range_end
>>
PAGE_SHIFT
;
pmd_index
=
DAX_PMD_INDEX
(
start_index
);
rcu_read_lock
();
entry
=
radix_tree_lookup
(
&
mapping
->
page_tree
,
pmd_index
);
rcu_read_unlock
();
/* see if the start of our range is covered by a PMD entry */
if
(
entry
&&
RADIX_DAX_TYPE
(
entry
)
==
RADIX_DAX_PMD
)
start_index
=
pmd_index
;
tag_pages_for_writeback
(
mapping
,
start_index
,
end_index
);
...
...
@@ -806,7 +911,7 @@ static int dax_insert_mapping(struct address_space *mapping,
return
PTR_ERR
(
dax
.
addr
);
dax_unmap_atomic
(
bdev
,
&
dax
);
ret
=
dax_insert_mapping_entry
(
mapping
,
vmf
,
entry
,
dax
.
sector
);
ret
=
dax_insert_mapping_entry
(
mapping
,
vmf
,
entry
,
dax
.
sector
,
0
);
if
(
IS_ERR
(
ret
))
return
PTR_ERR
(
ret
);
*
entryp
=
ret
;
...
...
@@ -853,7 +958,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
bh
.
b_bdev
=
inode
->
i_sb
->
s_bdev
;
bh
.
b_size
=
PAGE_SIZE
;
entry
=
grab_mapping_entry
(
mapping
,
vmf
->
pgoff
);
entry
=
grab_mapping_entry
(
mapping
,
vmf
->
pgoff
,
0
);
if
(
IS_ERR
(
entry
))
{
error
=
PTR_ERR
(
entry
);
goto
out
;
...
...
@@ -913,224 +1018,6 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
}
EXPORT_SYMBOL_GPL
(
dax_fault
);
#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
/*
* The 'colour' (ie low bits) within a PMD of a page offset. This comes up
* more often than one might expect in the below function.
*/
#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
/*
 * Emit a pr_debug() line explaining why a DAX fault fell back.
 *
 * @bh:      buffer_head describing the mapping, or NULL when none is available
 * @address: faulting address, printed in hex
 * @reason:  short text describing the fallback cause
 * @fn:      label printed as the message prefix
 *
 * When @bh is given, the device name, b_state, b_blocknr and b_size are
 * included in the message as well.
 */
static void __dax_dbg(struct buffer_head *bh, unsigned long address,
		const char *reason, const char *fn)
{
	if (bh) {
		char bname[BDEVNAME_SIZE];

		bdevname(bh->b_bdev, bname);
		/*
		 * The block number is passed through an unsigned 64-bit cast
		 * and the length uses the 'z' (size_t) modifier, so print both
		 * with unsigned conversions; signed ones would show large
		 * values as negative.
		 */
		pr_debug("%s: %s addr: %lx dev %s state %lx start %llu "
			"length %zu fallback: %s\n", fn, current->comm,
			address, bname, bh->b_state,
			(unsigned long long)bh->b_blocknr, bh->b_size, reason);
	} else {
		pr_debug("%s: %s addr: %lx fallback: %s\n", fn,
			current->comm, address, reason);
	}
}
#define dax_pmd_dbg(bh, address, reason) __dax_dbg(bh, address, reason, "dax_pmd")
/**
 * dax_pmd_fault - handle a PMD fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @address: The faulting virtual address
 * @pmd: The page table PMD entry to fill in
 * @flags: Fault flags; FAULT_FLAG_WRITE marks a write fault
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * When a page fault occurs, filesystems may call this helper in their
 * pmd_fault handler for DAX files.
 *
 * Returns VM_FAULT_FALLBACK when a PMD mapping cannot be used,
 * VM_FAULT_SIGBUS when the offset is beyond the file size or get_block
 * fails, VM_FAULT_NOPAGE after installing the huge zero page, or the result
 * of vmf_insert_pfn_pmd() otherwise.
 */
int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
		pmd_t *pmd, unsigned int flags, get_block_t get_block)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct buffer_head bh;
	unsigned blkbits = inode->i_blkbits;
	unsigned long pmd_addr = address & PMD_MASK;
	bool write = flags & FAULT_FLAG_WRITE;
	struct block_device *bdev;
	pgoff_t size, pgoff;
	sector_t block;
	int result = 0;
	bool alloc = false;

	/* dax pmd mappings require pfn_t_devmap() */
	if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
		return VM_FAULT_FALLBACK;

	/* Fall back to PTEs if we're going to COW */
	if (write && !(vma->vm_flags & VM_SHARED)) {
		split_huge_pmd(vma, pmd, address);
		dax_pmd_dbg(NULL, address, "cow write");
		return VM_FAULT_FALLBACK;
	}
	/* If the PMD would extend outside the VMA */
	if (pmd_addr < vma->vm_start) {
		dax_pmd_dbg(NULL, address, "vma start unaligned");
		return VM_FAULT_FALLBACK;
	}
	if ((pmd_addr + PMD_SIZE) > vma->vm_end) {
		dax_pmd_dbg(NULL, address, "vma end unaligned");
		return VM_FAULT_FALLBACK;
	}

	pgoff = linear_page_index(vma, pmd_addr);
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (pgoff >= size)
		return VM_FAULT_SIGBUS;
	/* If the PMD would cover blocks out of the file */
	if ((pgoff | PG_PMD_COLOUR) >= size) {
		dax_pmd_dbg(NULL, address,
				"offset + huge page size > file size");
		return VM_FAULT_FALLBACK;
	}

	memset(&bh, 0, sizeof(bh));
	bh.b_bdev = inode->i_sb->s_bdev;
	block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);

	bh.b_size = PMD_SIZE;

	/* First look up the mapping without allocating. */
	if (get_block(inode, block, &bh, 0) != 0)
		return VM_FAULT_SIGBUS;

	/* Only a write fault on an unmapped range asks for allocation. */
	if (!buffer_mapped(&bh) && write) {
		if (get_block(inode, block, &bh, 1) != 0)
			return VM_FAULT_SIGBUS;
		alloc = true;
		WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
	}

	bdev = bh.b_bdev;

	/*
	 * If the filesystem isn't willing to tell us the length of a hole,
	 * just fall back to PTEs.  Calling get_block 512 times in a loop
	 * would be silly.
	 */
	if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) {
		dax_pmd_dbg(&bh, address, "allocated block too small");
		return VM_FAULT_FALLBACK;
	}

	/*
	 * If we allocated new storage, make sure no process has any
	 * zero pages covering this hole
	 */
	if (alloc) {
		/*
		 * Widen pgoff before shifting so the byte offset is computed
		 * in loff_t width rather than pgoff_t width, matching the
		 * (sector_t) cast used for 'block' above.
		 */
		loff_t lstart = (loff_t)pgoff << PAGE_SHIFT;
		loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */

		truncate_pagecache_range(inode, lstart, lend);
	}

	if (!write && !buffer_mapped(&bh)) {
		/* Read fault over a hole: map the huge zero page. */
		spinlock_t *ptl;
		pmd_t entry;
		struct page *zero_page = mm_get_huge_zero_page(vma->vm_mm);

		if (unlikely(!zero_page)) {
			dax_pmd_dbg(&bh, address, "no zero page");
			goto fallback;
		}

		ptl = pmd_lock(vma->vm_mm, pmd);
		if (!pmd_none(*pmd)) {
			spin_unlock(ptl);
			dax_pmd_dbg(&bh, address, "pmd already present");
			goto fallback;
		}

		dev_dbg(part_to_dev(bdev->bd_part),
				"%s: %s addr: %lx pfn: <zero> sect: %llx\n",
				__func__, current->comm, address,
				(unsigned long long) to_sector(&bh, inode));

		entry = mk_pmd(zero_page, vma->vm_page_prot);
		entry = pmd_mkhuge(entry);
		set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
		result = VM_FAULT_NOPAGE;
		spin_unlock(ptl);
	} else {
		struct blk_dax_ctl dax = {
			.sector = to_sector(&bh, inode),
			.size = PMD_SIZE,
		};
		long length = dax_map_atomic(bdev, &dax);

		if (length < 0) {
			dax_pmd_dbg(&bh, address, "dax-error fallback");
			goto fallback;
		}
		if (length < PMD_SIZE) {
			dax_pmd_dbg(&bh, address, "dax-length too small");
			dax_unmap_atomic(bdev, &dax);
			goto fallback;
		}
		if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) {
			dax_pmd_dbg(&bh, address, "pfn unaligned");
			dax_unmap_atomic(bdev, &dax);
			goto fallback;
		}

		if (!pfn_t_devmap(dax.pfn)) {
			dax_unmap_atomic(bdev, &dax);
			dax_pmd_dbg(&bh, address, "pfn not in memmap");
			goto fallback;
		}
		dax_unmap_atomic(bdev, &dax);

		/*
		 * For PTE faults we insert a radix tree entry for reads, and
		 * leave it clean.  Then on the first write we dirty the radix
		 * tree entry via the dax_pfn_mkwrite() path.  This sequence
		 * allows the dax_pfn_mkwrite() call to be simpler and avoid a
		 * call into get_block() to translate the pgoff to a sector in
		 * order to be able to create a new radix tree entry.
		 *
		 * The PMD path doesn't have an equivalent to
		 * dax_pfn_mkwrite(), though, so for a read followed by a
		 * write we traverse all the way through dax_pmd_fault()
		 * twice.  This means we can just skip inserting a radix tree
		 * entry completely on the initial read and just wait until
		 * the write to insert a dirty entry.
		 */
		if (write) {
			/*
			 * We should insert radix-tree entry and dirty it here.
			 * For now this is broken...
			 */
		}

		dev_dbg(part_to_dev(bdev->bd_part),
				"%s: %s addr: %lx pfn: %lx sect: %llx\n",
				__func__, current->comm, address,
				pfn_t_to_pfn(dax.pfn),
				(unsigned long long) dax.sector);
		result |= vmf_insert_pfn_pmd(vma, address, pmd,
				dax.pfn, write);
	}

 out:
	return result;

 fallback:
	count_vm_event(THP_FAULT_FALLBACK);
	result = VM_FAULT_FALLBACK;
	goto out;
}
EXPORT_SYMBOL_GPL
(
dax_pmd_fault
);
#endif
/* CONFIG_TRANSPARENT_HUGEPAGE */
/**
* dax_pfn_mkwrite - handle first write to DAX page
* @vma: The virtual memory area where the fault occurred
...
...
@@ -1214,7 +1101,8 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
/* Block boundary? Nothing to do */
if
(
!
length
)
return
0
;
BUG_ON
((
offset
+
length
)
>
PAGE_SIZE
);
if
(
WARN_ON_ONCE
((
offset
+
length
)
>
PAGE_SIZE
))
return
-
EINVAL
;
memset
(
&
bh
,
0
,
sizeof
(
bh
));
bh
.
b_bdev
=
inode
->
i_sb
->
s_bdev
;
...
...
@@ -1245,8 +1133,13 @@ int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
EXPORT_SYMBOL_GPL
(
dax_truncate_page
);
#ifdef CONFIG_FS_IOMAP
/*
 * Compute the sector that corresponds to file position @pos within @iomap:
 * @pos is rounded down to its page boundary (pos & PAGE_MASK), the byte
 * distance from iomap->offset is converted to 512-byte units (>> 9), and the
 * result is added to iomap->blkno.
 *
 * NOTE(review): presumably iomap->blkno is expressed in 512-byte sectors and
 * iomap->offset never exceeds (pos & PAGE_MASK) -- confirm against callers.
 */
static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
{
	return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9);
}
static
loff_t
iomap_dax
_actor
(
struct
inode
*
inode
,
loff_t
pos
,
loff_t
length
,
void
*
data
,
dax_iomap
_actor
(
struct
inode
*
inode
,
loff_t
pos
,
loff_t
length
,
void
*
data
,
struct
iomap
*
iomap
)
{
struct
iov_iter
*
iter
=
data
;
...
...
@@ -1270,8 +1163,7 @@ iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
struct
blk_dax_ctl
dax
=
{
0
};
ssize_t
map_len
;
dax
.
sector
=
iomap
->
blkno
+
(((
pos
&
PAGE_MASK
)
-
iomap
->
offset
)
>>
9
);
dax
.
sector
=
dax_iomap_sector
(
iomap
,
pos
);
dax
.
size
=
(
length
+
offset
+
PAGE_SIZE
-
1
)
&
PAGE_MASK
;
map_len
=
dax_map_atomic
(
iomap
->
bdev
,
&
dax
);
if
(
map_len
<
0
)
{
...
...
@@ -1303,7 +1195,7 @@ iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
}
/**
*
iomap_dax
_rw - Perform I/O to a DAX file
*
dax_iomap
_rw - Perform I/O to a DAX file
* @iocb: The control block for this I/O
* @iter: The addresses to do I/O from or to
* @ops: iomap ops passed from the file system
...
...
@@ -1313,7 +1205,7 @@ iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
* and evicting any page cache pages in the region under I/O.
*/
ssize_t
iomap_dax
_rw
(
struct
kiocb
*
iocb
,
struct
iov_iter
*
iter
,
dax_iomap
_rw
(
struct
kiocb
*
iocb
,
struct
iov_iter
*
iter
,
struct
iomap_ops
*
ops
)
{
struct
address_space
*
mapping
=
iocb
->
ki_filp
->
f_mapping
;
...
...
@@ -1343,7 +1235,7 @@ iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter,
while
(
iov_iter_count
(
iter
))
{
ret
=
iomap_apply
(
inode
,
pos
,
iov_iter_count
(
iter
),
flags
,
ops
,
iter
,
iomap_dax
_actor
);
iter
,
dax_iomap
_actor
);
if
(
ret
<=
0
)
break
;
pos
+=
ret
;
...
...
@@ -1353,10 +1245,10 @@ iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter,
iocb
->
ki_pos
+=
done
;
return
done
?
done
:
ret
;
}
EXPORT_SYMBOL_GPL
(
iomap_dax
_rw
);
EXPORT_SYMBOL_GPL
(
dax_iomap
_rw
);
/**
*
iomap_dax
_fault - handle a page fault on a DAX file
*
dax_iomap
_fault - handle a page fault on a DAX file
* @vma: The virtual memory area where the fault occurred
* @vmf: The description of the fault
* @ops: iomap ops passed from the file system
...
...
@@ -1365,7 +1257,7 @@ EXPORT_SYMBOL_GPL(iomap_dax_rw);
* or mkwrite handler for DAX files. Assumes the caller has done all the
* necessary locking for the page fault to proceed successfully.
*/
int
iomap_dax
_fault
(
struct
vm_area_struct
*
vma
,
struct
vm_fault
*
vmf
,
int
dax_iomap
_fault
(
struct
vm_area_struct
*
vma
,
struct
vm_fault
*
vmf
,
struct
iomap_ops
*
ops
)
{
struct
address_space
*
mapping
=
vma
->
vm_file
->
f_mapping
;
...
...
@@ -1374,8 +1266,9 @@ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
loff_t
pos
=
(
loff_t
)
vmf
->
pgoff
<<
PAGE_SHIFT
;
sector_t
sector
;
struct
iomap
iomap
=
{
0
};
unsigned
flags
=
0
;
unsigned
flags
=
IOMAP_FAULT
;
int
error
,
major
=
0
;
int
locked_status
=
0
;
void
*
entry
;
/*
...
...
@@ -1386,7 +1279,7 @@ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
if
(
pos
>=
i_size_read
(
inode
))
return
VM_FAULT_SIGBUS
;
entry
=
grab_mapping_entry
(
mapping
,
vmf
->
pgoff
);
entry
=
grab_mapping_entry
(
mapping
,
vmf
->
pgoff
,
0
);
if
(
IS_ERR
(
entry
))
{
error
=
PTR_ERR
(
entry
);
goto
out
;
...
...
@@ -1405,10 +1298,10 @@ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
goto
unlock_entry
;
if
(
WARN_ON_ONCE
(
iomap
.
offset
+
iomap
.
length
<
pos
+
PAGE_SIZE
))
{
error
=
-
EIO
;
/* fs corruption? */
goto
unlock_entry
;
goto
finish_iomap
;
}
sector
=
iomap
.
blkno
+
(((
pos
&
PAGE_MASK
)
-
iomap
.
offset
)
>>
9
);
sector
=
dax_iomap_sector
(
&
iomap
,
pos
);
if
(
vmf
->
cow_page
)
{
switch
(
iomap
.
type
)
{
...
...
@@ -1427,13 +1320,15 @@ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
}
if
(
error
)
goto
unlock_entry
;
goto
finish_iomap
;
if
(
!
radix_tree_exceptional_entry
(
entry
))
{
vmf
->
page
=
entry
;
return
VM_FAULT_LOCKED
;
locked_status
=
VM_FAULT_LOCKED
;
}
else
{
vmf
->
entry
=
entry
;
locked_status
=
VM_FAULT_DAX_LOCKED
;
}
vmf
->
entry
=
entry
;
return
VM_FAULT_DAX_LOCKED
;
goto
finish_iomap
;
}
switch
(
iomap
.
type
)
{
...
...
@@ -1448,8 +1343,10 @@ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
break
;
case
IOMAP_UNWRITTEN
:
case
IOMAP_HOLE
:
if
(
!
(
vmf
->
flags
&
FAULT_FLAG_WRITE
))
return
dax_load_hole
(
mapping
,
entry
,
vmf
);
if
(
!
(
vmf
->
flags
&
FAULT_FLAG_WRITE
))
{
locked_status
=
dax_load_hole
(
mapping
,
entry
,
vmf
);
break
;
}
/*FALLTHRU*/
default:
WARN_ON_ONCE
(
1
);
...
...
@@ -1457,15 +1354,218 @@ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
break
;
}
finish_iomap:
if
(
ops
->
iomap_end
)
{
if
(
error
)
{
/* keep previous error */
ops
->
iomap_end
(
inode
,
pos
,
PAGE_SIZE
,
0
,
flags
,
&
iomap
);
}
else
{
error
=
ops
->
iomap_end
(
inode
,
pos
,
PAGE_SIZE
,
PAGE_SIZE
,
flags
,
&
iomap
);
}
}
unlock_entry:
put_locked_mapping_entry
(
mapping
,
vmf
->
pgoff
,
entry
);
if
(
!
locked_status
||
error
)
put_locked_mapping_entry
(
mapping
,
vmf
->
pgoff
,
entry
);
out:
if
(
error
==
-
ENOMEM
)
return
VM_FAULT_OOM
|
major
;
/* -EBUSY is fine, somebody else faulted on the same PTE */
if
(
error
<
0
&&
error
!=
-
EBUSY
)
return
VM_FAULT_SIGBUS
|
major
;
if
(
locked_status
)
{
WARN_ON_ONCE
(
error
);
/* -EBUSY from ops->iomap_end? */
return
locked_status
;
}
return
VM_FAULT_NOPAGE
|
major
;
}
EXPORT_SYMBOL_GPL
(
iomap_dax_fault
);
EXPORT_SYMBOL_GPL
(
dax_iomap_fault
);
#ifdef CONFIG_FS_DAX_PMD
/*
* The 'colour' (ie low bits) within a PMD of a page offset. This comes up
* more often than one might expect in the below functions.
*/
#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
/*
 * dax_pmd_insert_mapping - try to map a PMD_SIZE chunk of the device at @pmd
 * @vma:     VMA the fault belongs to; its file's mapping receives the entry
 * @pmd:     PMD slot handed to vmf_insert_pfn_pmd()
 * @vmf:     fault description forwarded to dax_insert_mapping_entry()
 * @address: faulting address, forwarded to vmf_insert_pfn_pmd()
 * @iomap:   extent supplying the block device and the sector for @pos
 * @pos:     file offset used to compute the starting sector
 * @write:   forwarded to vmf_insert_pfn_pmd()
 * @entryp:  in: current mapping entry; out: replaced by the entry returned
 *           from dax_insert_mapping_entry() when that call succeeds
 *
 * Returns VM_FAULT_FALLBACK if dax_map_atomic() fails, if it maps fewer than
 * PMD_SIZE bytes, if the resulting pfn has any PG_PMD_COLOUR bit set, if the
 * pfn is not a devmap pfn, or if the mapping entry cannot be inserted.
 * Otherwise returns whatever vmf_insert_pfn_pmd() returns.
 */
static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd,
		struct vm_fault *vmf, unsigned long address,
		struct iomap *iomap, loff_t pos, bool write, void **entryp)
{
	struct address_space *mapping = vma->vm_file->f_mapping;
	struct block_device *bdev = iomap->bdev;
	struct blk_dax_ctl dax = {
		.sector = dax_iomap_sector(iomap, pos),
		.size = PMD_SIZE,
	};
	long length = dax_map_atomic(bdev, &dax);
	void *ret;

	/* No dax_unmap_atomic() on this path: the map call itself failed. */
	if (length < 0) /* dax_map_atomic() failed */
		return VM_FAULT_FALLBACK;
	/* Every check below must undo the successful map before bailing out. */
	if (length < PMD_SIZE)
		goto unmap_fallback;
	/* Low pfn bits set: the pfn is not aligned to a PMD-sized page run. */
	if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR)
		goto unmap_fallback;
	if (!pfn_t_devmap(dax.pfn))
		goto unmap_fallback;

	/* Only dax.pfn and dax.sector are used from here on. */
	dax_unmap_atomic(bdev, &dax);

	ret = dax_insert_mapping_entry(mapping, vmf, *entryp, dax.sector,
			RADIX_DAX_PMD);
	if (IS_ERR(ret))
		return VM_FAULT_FALLBACK;
	/* Hand the (possibly new) entry back so the caller unlocks the right one. */
	*entryp = ret;

	return vmf_insert_pfn_pmd(vma, address, pmd, dax.pfn, write);

 unmap_fallback:
	dax_unmap_atomic(bdev, &dax);
	return VM_FAULT_FALLBACK;
}
/*
 * dax_pmd_load_hole - back a PMD-sized range with the huge zero page
 * @vma:     VMA the fault belongs to
 * @pmd:     PMD slot to populate; must still be empty (pmd_none) under its lock
 * @vmf:     fault description forwarded to dax_insert_mapping_entry()
 * @address: faulting address; masked with PMD_MASK to get the PMD start
 * @iomap:   not referenced in this function
 * @entryp:  in: current mapping entry; out: replaced by the entry returned
 *           from dax_insert_mapping_entry() when that call succeeds
 *
 * Records a RADIX_DAX_PMD | RADIX_DAX_HZP entry (sector 0) in the mapping and
 * installs a huge PMD pointing at the huge zero page.
 *
 * Returns VM_FAULT_NOPAGE once the PMD has been set. Returns
 * VM_FAULT_FALLBACK if the huge zero page cannot be obtained, if the mapping
 * entry cannot be inserted, or if the PMD is already populated.
 */
static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd,
		struct vm_fault *vmf, unsigned long address,
		struct iomap *iomap, void **entryp)
{
	struct address_space *mapping = vma->vm_file->f_mapping;
	unsigned long pmd_addr = address & PMD_MASK;
	struct page *zero_page;
	spinlock_t *ptl;
	pmd_t pmd_entry;
	void *ret;

	zero_page = mm_get_huge_zero_page(vma->vm_mm);

	if (unlikely(!zero_page))
		return VM_FAULT_FALLBACK;

	ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0,
			RADIX_DAX_PMD | RADIX_DAX_HZP);
	if (IS_ERR(ret))
		return VM_FAULT_FALLBACK;
	/* Hand the (possibly new) entry back so the caller unlocks the right one. */
	*entryp = ret;

	ptl = pmd_lock(vma->vm_mm, pmd);
	/*
	 * Something is already installed at this PMD: leave it alone. Note
	 * that the HZP mapping entry inserted above stays in place.
	 */
	if (!pmd_none(*pmd)) {
		spin_unlock(ptl);
		return VM_FAULT_FALLBACK;
	}

	pmd_entry = mk_pmd(zero_page, vma->vm_page_prot);
	pmd_entry = pmd_mkhuge(pmd_entry);
	set_pmd_at(vma->vm_mm, pmd_addr, pmd, pmd_entry);
	spin_unlock(ptl);
	return VM_FAULT_NOPAGE;
}
/**
 * dax_iomap_pmd_fault - handle a PMD-sized page fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @address: The faulting address; masked with PMD_MASK to get the PMD start
 * @pmd: The PMD slot to populate
 * @flags: Fault flags; FAULT_FLAG_WRITE marks a write fault
 * @ops: iomap ops passed from the file system; ->iomap_begin is required,
 *       ->iomap_end is called only if it is non-NULL
 *
 * Returns VM_FAULT_SIGBUS if the PMD-aligned page offset lies beyond the last
 * page of the file. Returns VM_FAULT_FALLBACK (after split_huge_pmd() and
 * bumping THP_FAULT_FALLBACK) whenever a PMD mapping cannot be set up:
 * a write fault on a non-shared VMA, a PMD range that is not fully inside the
 * VMA or the file, failure to grab a PMD mapping entry, an ->iomap_begin
 * error, an extent that does not cover the whole PMD range, an unexpected
 * extent type, or an ->iomap_end error. Otherwise returns the result of
 * dax_pmd_insert_mapping() or dax_pmd_load_hole().
 */
int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
		pmd_t *pmd, unsigned int flags, struct iomap_ops *ops)
{
	struct address_space *mapping = vma->vm_file->f_mapping;
	unsigned long pmd_addr = address & PMD_MASK;
	bool write = flags & FAULT_FLAG_WRITE;
	unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
	struct inode *inode = mapping->host;
	/* Every exit path that does not set up a mapping reports fallback. */
	int result = VM_FAULT_FALLBACK;
	struct iomap iomap = { 0 };
	pgoff_t max_pgoff, pgoff;
	/*
	 * NOTE(review): vmf is not zero-initialised; only pgoff, flags and
	 * gfp_mask are set below before it is passed to the helpers.
	 */
	struct vm_fault vmf;
	void *entry;
	loff_t pos;
	int error;

	/* Fall back to PTEs if we're going to COW */
	if (write && !(vma->vm_flags & VM_SHARED))
		goto fallback;

	/* If the PMD would extend outside the VMA */
	if (pmd_addr < vma->vm_start)
		goto fallback;
	if ((pmd_addr + PMD_SIZE) > vma->vm_end)
		goto fallback;

	/*
	 * Check whether offset isn't beyond end of file now. Caller is
	 * supposed to hold locks serializing us with truncate / punch hole so
	 * this is a reliable test.
	 */
	pgoff = linear_page_index(vma, pmd_addr);
	max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;

	/* Direct return: this path skips the fallback accounting at the end. */
	if (pgoff > max_pgoff)
		return VM_FAULT_SIGBUS;

	/* If the PMD would extend beyond the file size */
	if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
		goto fallback;

	/*
	 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
	 * PMD or a HZP entry.  If it can't (because a 4k page is already in
	 * the tree, for instance), it will return -EEXIST and we just fall
	 * back to 4k entries.
	 */
	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
	if (IS_ERR(entry))
		goto fallback;

	/*
	 * Note that we don't use iomap_apply here.  We aren't doing I/O, only
	 * setting up a mapping, so really we're using iomap_begin() as a way
	 * to look up our filesystem block.
	 */
	pos = (loff_t)pgoff << PAGE_SHIFT;
	error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
	/* iomap_begin failed, so there is nothing for iomap_end to finish. */
	if (error)
		goto unlock_entry;
	/* The returned extent must cover the entire PMD-sized range. */
	if (iomap.offset + iomap.length < pos + PMD_SIZE)
		goto finish_iomap;

	vmf.pgoff = pgoff;
	vmf.flags = flags;
	vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;

	switch (iomap.type) {
	case IOMAP_MAPPED:
		result = dax_pmd_insert_mapping(vma, pmd, &vmf, address,
				&iomap, pos, write, &entry);
		break;
	case IOMAP_UNWRITTEN:
	case IOMAP_HOLE:
		/* A write fault is not expected to see a hole here. */
		if (WARN_ON_ONCE(write))
			goto finish_iomap;
		result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap,
				&entry);
		break;
	default:
		WARN_ON_ONCE(1);
		break;
	}

 finish_iomap:
	if (ops->iomap_end) {
		if (result == VM_FAULT_FALLBACK) {
			/* Nothing was mapped: report 0 bytes, ignore the result. */
			ops->iomap_end(inode, pos, PMD_SIZE, 0, iomap_flags,
					&iomap);
		} else {
			error = ops->iomap_end(inode, pos, PMD_SIZE, PMD_SIZE,
					iomap_flags, &iomap);
			/* An iomap_end failure downgrades the fault to fallback. */
			if (error)
				result = VM_FAULT_FALLBACK;
		}
	}
 unlock_entry:
	/* entry may have been replaced by the helpers via &entry. */
	put_locked_mapping_entry(mapping, pgoff, entry);
 fallback:
	if (result == VM_FAULT_FALLBACK) {
		split_huge_pmd(vma, pmd, address);
		count_vm_event(THP_FAULT_FALLBACK);
	}
	return result;
}
EXPORT_SYMBOL_GPL
(
dax_iomap_pmd_fault
);
#endif
/* CONFIG_FS_DAX_PMD */
#endif
/* CONFIG_FS_IOMAP */
fs/ext2/file.c
View file @
a2f6d9c4
...
...
@@ -38,7 +38,7 @@ static ssize_t ext2_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
return
0
;
/* skip atime */
inode_lock_shared
(
inode
);
ret
=
iomap_dax
_rw
(
iocb
,
to
,
&
ext2_iomap_ops
);
ret
=
dax_iomap
_rw
(
iocb
,
to
,
&
ext2_iomap_ops
);
inode_unlock_shared
(
inode
);
file_accessed
(
iocb
->
ki_filp
);
...
...
@@ -62,7 +62,7 @@ static ssize_t ext2_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
if
(
ret
)
goto
out_unlock
;
ret
=
iomap_dax
_rw
(
iocb
,
from
,
&
ext2_iomap_ops
);
ret
=
dax_iomap
_rw
(
iocb
,
from
,
&
ext2_iomap_ops
);
if
(
ret
>
0
&&
iocb
->
ki_pos
>
i_size_read
(
inode
))
{
i_size_write
(
inode
,
iocb
->
ki_pos
);
mark_inode_dirty
(
inode
);
...
...
@@ -99,7 +99,7 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
}
down_read
(
&
ei
->
dax_sem
);
ret
=
iomap_dax
_fault
(
vma
,
vmf
,
&
ext2_iomap_ops
);
ret
=
dax_iomap
_fault
(
vma
,
vmf
,
&
ext2_iomap_ops
);
up_read
(
&
ei
->
dax_sem
);
if
(
vmf
->
flags
&
FAULT_FLAG_WRITE
)
...
...
@@ -107,27 +107,6 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
return
ret
;
}
static
int
ext2_dax_pmd_fault
(
struct
vm_area_struct
*
vma
,
unsigned
long
addr
,
pmd_t
*
pmd
,
unsigned
int
flags
)
{
struct
inode
*
inode
=
file_inode
(
vma
->
vm_file
);
struct
ext2_inode_info
*
ei
=
EXT2_I
(
inode
);
int
ret
;
if
(
flags
&
FAULT_FLAG_WRITE
)
{
sb_start_pagefault
(
inode
->
i_sb
);
file_update_time
(
vma
->
vm_file
);
}
down_read
(
&
ei
->
dax_sem
);
ret
=
dax_pmd_fault
(
vma
,
addr
,
pmd
,
flags
,
ext2_get_block
);
up_read
(
&
ei
->
dax_sem
);
if
(
flags
&
FAULT_FLAG_WRITE
)
sb_end_pagefault
(
inode
->
i_sb
);
return
ret
;
}
static
int
ext2_dax_pfn_mkwrite
(
struct
vm_area_struct
*
vma
,
struct
vm_fault
*
vmf
)
{
...
...
@@ -154,7 +133,11 @@ static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma,
static
const
struct
vm_operations_struct
ext2_dax_vm_ops
=
{
.
fault
=
ext2_dax_fault
,
.
pmd_fault
=
ext2_dax_pmd_fault
,
/*
* .pmd_fault is not supported for DAX because allocation in ext2
* cannot be reliably aligned to huge page sizes and so pmd faults
* will always fail and fall back to regular faults.
*/
.
page_mkwrite
=
ext2_dax_fault
,
.
pfn_mkwrite
=
ext2_dax_pfn_mkwrite
,
};
...
...
@@ -166,7 +149,7 @@ static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
file_accessed
(
file
);
vma
->
vm_ops
=
&
ext2_dax_vm_ops
;
vma
->
vm_flags
|=
VM_MIXEDMAP
|
VM_HUGEPAGE
;
vma
->
vm_flags
|=
VM_MIXEDMAP
;
return
0
;
}
#else
...
...
fs/ext4/inode.c
View file @
a2f6d9c4
...
...
@@ -767,6 +767,9 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
ext4_update_bh_state
(
bh
,
map
.
m_flags
);
bh
->
b_size
=
inode
->
i_sb
->
s_blocksize
*
map
.
m_len
;
ret
=
0
;
}
else
if
(
ret
==
0
)
{
/* hole case, need to fill in bh->b_size */
bh
->
b_size
=
inode
->
i_sb
->
s_blocksize
*
map
.
m_len
;
}
return
ret
;
}
...
...
fs/iomap.c
View file @
a2f6d9c4
...
...
@@ -467,8 +467,9 @@ int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
offset
=
page_offset
(
page
);
while
(
length
>
0
)
{
ret
=
iomap_apply
(
inode
,
offset
,
length
,
IOMAP_WRITE
,
ops
,
page
,
iomap_page_mkwrite_actor
);
ret
=
iomap_apply
(
inode
,
offset
,
length
,
IOMAP_WRITE
|
IOMAP_FAULT
,
ops
,
page
,
iomap_page_mkwrite_actor
);
if
(
unlikely
(
ret
<=
0
))
goto
out_unlock
;
offset
+=
ret
;
...
...
fs/xfs/xfs_aops.c
View file @
a2f6d9c4
...
...
@@ -1298,8 +1298,7 @@ __xfs_get_blocks(
sector_t
iblock
,
struct
buffer_head
*
bh_result
,
int
create
,
bool
direct
,
bool
dax_fault
)
bool
direct
)
{
struct
xfs_inode
*
ip
=
XFS_I
(
inode
);
struct
xfs_mount
*
mp
=
ip
->
i_mount
;
...
...
@@ -1420,13 +1419,8 @@ __xfs_get_blocks(
if
(
ISUNWRITTEN
(
&
imap
))
set_buffer_unwritten
(
bh_result
);
/* direct IO needs special help */
if
(
create
)
{
if
(
dax_fault
)
ASSERT
(
!
ISUNWRITTEN
(
&
imap
));
else
xfs_map_direct
(
inode
,
bh_result
,
&
imap
,
offset
,
is_cow
);
}
if
(
create
)
xfs_map_direct
(
inode
,
bh_result
,
&
imap
,
offset
,
is_cow
);
}
/*
...
...
@@ -1466,7 +1460,7 @@ xfs_get_blocks(
struct
buffer_head
*
bh_result
,
int
create
)
{
return
__xfs_get_blocks
(
inode
,
iblock
,
bh_result
,
create
,
false
,
false
);
return
__xfs_get_blocks
(
inode
,
iblock
,
bh_result
,
create
,
false
);
}
int
...
...
@@ -1476,17 +1470,7 @@ xfs_get_blocks_direct(
struct
buffer_head
*
bh_result
,
int
create
)
{
return
__xfs_get_blocks
(
inode
,
iblock
,
bh_result
,
create
,
true
,
false
);
}
int
xfs_get_blocks_dax_fault
(
struct
inode
*
inode
,
sector_t
iblock
,
struct
buffer_head
*
bh_result
,
int
create
)
{
return
__xfs_get_blocks
(
inode
,
iblock
,
bh_result
,
create
,
true
,
true
);
return
__xfs_get_blocks
(
inode
,
iblock
,
bh_result
,
create
,
true
);
}
/*
...
...
fs/xfs/xfs_aops.h
View file @
a2f6d9c4
...
...
@@ -59,9 +59,6 @@ int xfs_get_blocks(struct inode *inode, sector_t offset,
struct
buffer_head
*
map_bh
,
int
create
);
int
xfs_get_blocks_direct
(
struct
inode
*
inode
,
sector_t
offset
,
struct
buffer_head
*
map_bh
,
int
create
);
int
xfs_get_blocks_dax_fault
(
struct
inode
*
inode
,
sector_t
offset
,
struct
buffer_head
*
map_bh
,
int
create
);
int
xfs_end_io_direct_write
(
struct
kiocb
*
iocb
,
loff_t
offset
,
ssize_t
size
,
void
*
private
);
int
xfs_setfilesize
(
struct
xfs_inode
*
ip
,
xfs_off_t
offset
,
size_t
size
);
...
...
fs/xfs/xfs_file.c
View file @
a2f6d9c4
...
...
@@ -318,7 +318,7 @@ xfs_file_dax_read(
return
0
;
/* skip atime */
xfs_rw_ilock
(
ip
,
XFS_IOLOCK_SHARED
);
ret
=
iomap_dax
_rw
(
iocb
,
to
,
&
xfs_iomap_ops
);
ret
=
dax_iomap
_rw
(
iocb
,
to
,
&
xfs_iomap_ops
);
xfs_rw_iunlock
(
ip
,
XFS_IOLOCK_SHARED
);
file_accessed
(
iocb
->
ki_filp
);
...
...
@@ -653,7 +653,7 @@ xfs_file_dax_write(
trace_xfs_file_dax_write
(
ip
,
count
,
pos
);
ret
=
iomap_dax
_rw
(
iocb
,
from
,
&
xfs_iomap_ops
);
ret
=
dax_iomap
_rw
(
iocb
,
from
,
&
xfs_iomap_ops
);
if
(
ret
>
0
&&
iocb
->
ki_pos
>
i_size_read
(
inode
))
{
i_size_write
(
inode
,
iocb
->
ki_pos
);
error
=
xfs_setfilesize
(
ip
,
pos
,
ret
);
...
...
@@ -1474,7 +1474,7 @@ xfs_filemap_page_mkwrite(
xfs_ilock
(
XFS_I
(
inode
),
XFS_MMAPLOCK_SHARED
);
if
(
IS_DAX
(
inode
))
{
ret
=
iomap_dax
_fault
(
vma
,
vmf
,
&
xfs_iomap_ops
);
ret
=
dax_iomap
_fault
(
vma
,
vmf
,
&
xfs_iomap_ops
);
}
else
{
ret
=
iomap_page_mkwrite
(
vma
,
vmf
,
&
xfs_iomap_ops
);
ret
=
block_page_mkwrite_return
(
ret
);
...
...
@@ -1508,7 +1508,7 @@ xfs_filemap_fault(
* changes to xfs_get_blocks_direct() to map unwritten extent
* ioend for conversion on read-only mappings.
*/
ret
=
iomap_dax
_fault
(
vma
,
vmf
,
&
xfs_iomap_ops
);
ret
=
dax_iomap
_fault
(
vma
,
vmf
,
&
xfs_iomap_ops
);
}
else
ret
=
filemap_fault
(
vma
,
vmf
);
xfs_iunlock
(
XFS_I
(
inode
),
XFS_MMAPLOCK_SHARED
);
...
...
@@ -1545,7 +1545,7 @@ xfs_filemap_pmd_fault(
}
xfs_ilock
(
XFS_I
(
inode
),
XFS_MMAPLOCK_SHARED
);
ret
=
dax_
pmd_fault
(
vma
,
addr
,
pmd
,
flags
,
xfs_get_blocks_dax_fault
);
ret
=
dax_
iomap_pmd_fault
(
vma
,
addr
,
pmd
,
flags
,
&
xfs_iomap_ops
);
xfs_iunlock
(
XFS_I
(
inode
),
XFS_MMAPLOCK_SHARED
);
if
(
flags
&
FAULT_FLAG_WRITE
)
...
...
include/linux/dax.h
View file @
a2f6d9c4
...
...
@@ -8,21 +8,46 @@
struct
iomap_ops
;
/* We use lowest available exceptional entry bit for locking */
/*
* We use lowest available bit in exceptional entry for locking, one bit for
* the entry size (PMD) and two more to tell us if the entry is a huge zero
* page (HZP) or an empty entry that is just used for locking. In total four
* special bits.
*
* If the PMD bit isn't set the entry has size PAGE_SIZE, and if the HZP and
* EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
* block allocation.
*/
#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
#define RADIX_DAX_HZP (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
ssize_t
iomap_dax_rw
(
struct
kiocb
*
iocb
,
struct
iov_iter
*
iter
,
/*
 * Extract the sector stored in a DAX radix tree entry: everything above the
 * low RADIX_DAX_SHIFT bits of the entry value.
 */
static inline unsigned long dax_radix_sector(void *entry)
{
	unsigned long raw = (unsigned long)entry;

	return raw >> RADIX_DAX_SHIFT;
}
/*
 * Build a locked exceptional radix tree entry: @sector is stored above
 * RADIX_DAX_SHIFT, and the low bits carry the exceptional-entry marker,
 * the caller's @flags and RADIX_DAX_ENTRY_LOCK.
 */
static inline void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
{
	unsigned long bits = (unsigned long)sector << RADIX_DAX_SHIFT;

	bits |= RADIX_TREE_EXCEPTIONAL_ENTRY;
	bits |= flags;
	bits |= RADIX_DAX_ENTRY_LOCK;

	return (void *)bits;
}
ssize_t
dax_iomap_rw
(
struct
kiocb
*
iocb
,
struct
iov_iter
*
iter
,
struct
iomap_ops
*
ops
);
ssize_t
dax_do_io
(
struct
kiocb
*
,
struct
inode
*
,
struct
iov_iter
*
,
get_block_t
,
dio_iodone_t
,
int
flags
);
int
dax_zero_page_range
(
struct
inode
*
,
loff_t
from
,
unsigned
len
,
get_block_t
);
int
dax_truncate_page
(
struct
inode
*
,
loff_t
from
,
get_block_t
);
int
iomap_dax
_fault
(
struct
vm_area_struct
*
vma
,
struct
vm_fault
*
vmf
,
int
dax_iomap
_fault
(
struct
vm_area_struct
*
vma
,
struct
vm_fault
*
vmf
,
struct
iomap_ops
*
ops
);
int
dax_fault
(
struct
vm_area_struct
*
,
struct
vm_fault
*
,
get_block_t
);
int
dax_delete_mapping_entry
(
struct
address_space
*
mapping
,
pgoff_t
index
);
void
dax_wake_mapping_entry_waiter
(
struct
address_space
*
mapping
,
pgoff_t
index
,
bool
wake_all
);
pgoff_t
index
,
void
*
entry
,
bool
wake_all
);
#ifdef CONFIG_FS_DAX
struct
page
*
read_dax_sector
(
struct
block_device
*
bdev
,
sector_t
n
);
...
...
@@ -48,15 +73,32 @@ static inline int __dax_zero_page_range(struct block_device *bdev,
}
#endif
#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
int
dax_pmd_fault
(
struct
vm_area_struct
*
,
unsigned
long
addr
,
pmd_t
*
,
unsigned
int
flags
,
get_block_t
);
#else
static
inline
int
dax_pmd_fault
(
struct
vm_area_struct
*
vma
,
unsigned
long
addr
,
pmd_t
*
pmd
,
unsigned
int
flags
,
get_block_t
gb
)
{
return
VM_FAULT_FALLBACK
;
}
#ifdef CONFIG_FS_DAX_PMD
/*
 * Order (log2 of the number of pages) covered by a DAX radix tree entry:
 * PMD_SHIFT - PAGE_SHIFT when the RADIX_DAX_PMD bit is set, 0 otherwise.
 */
static inline unsigned int dax_radix_order(void *entry)
{
	unsigned long raw = (unsigned long)entry;

	return (raw & RADIX_DAX_PMD) ? PMD_SHIFT - PAGE_SHIFT : 0;
}
int
dax_iomap_pmd_fault
(
struct
vm_area_struct
*
vma
,
unsigned
long
address
,
pmd_t
*
pmd
,
unsigned
int
flags
,
struct
iomap_ops
*
ops
);
#else
/*
 * Variant used when CONFIG_FS_DAX_PMD is not set: @entry is ignored and the
 * order is always 0.
 */
static inline unsigned int dax_radix_order(void *entry)
{
	(void)entry;

	return 0;
}
/*
 * Stub used when CONFIG_FS_DAX_PMD is not set: all arguments are ignored and
 * the fault is always reported as VM_FAULT_FALLBACK.
 */
static inline int dax_iomap_pmd_fault(struct vm_area_struct *vma,
		unsigned long address, pmd_t *pmd, unsigned int flags,
		struct iomap_ops *ops)
{
	return VM_FAULT_FALLBACK;
}
#endif
int
dax_pfn_mkwrite
(
struct
vm_area_struct
*
,
struct
vm_fault
*
);
#define dax_mkwrite(vma, vmf, gb) dax_fault(vma, vmf, gb)
...
...
include/linux/iomap.h
View file @
a2f6d9c4
...
...
@@ -49,6 +49,7 @@ struct iomap {
#define IOMAP_WRITE (1 << 0)
/* writing, must allocate blocks */
#define IOMAP_ZERO (1 << 1)
/* zeroing operation, may skip holes */
#define IOMAP_REPORT (1 << 2)
/* report extent status, e.g. FIEMAP */
#define IOMAP_FAULT (1 << 3)
/* mapping for page fault */
struct
iomap_ops
{
/*
...
...
mm/filemap.c
View file @
a2f6d9c4
...
...
@@ -137,13 +137,12 @@ static int page_cache_tree_insert(struct address_space *mapping,
}
else
{
/* DAX can replace empty locked entry with a hole */
WARN_ON_ONCE
(
p
!=
(
void
*
)(
RADIX_TREE_EXCEPTIONAL_ENTRY
|
RADIX_DAX_ENTRY_LOCK
));
dax_radix_locked_entry
(
0
,
RADIX_DAX_EMPTY
));
/* DAX accounts exceptional entries as normal pages */
if
(
node
)
workingset_node_pages_dec
(
node
);
/* Wakeup waiters for exceptional entry lock */
dax_wake_mapping_entry_waiter
(
mapping
,
page
->
index
,
dax_wake_mapping_entry_waiter
(
mapping
,
page
->
index
,
p
,
false
);
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment