Commit 7b6be844 authored by Dan Williams

dax: refactor dax-fs into a generic provider of 'struct dax_device' instances

We want dax capable drivers to be able to publish a set of dax
operations [1]. However, we do not want to further abuse block_devices
to advertise these operations. Instead we will attach these operations
to a dax device and add a lookup mechanism to go from block device path
to a dax device. A dax capable driver like pmem or brd is responsible
for registering a dax device, alongside a block device, and then a dax
capable filesystem is responsible for retrieving the dax device by path
name if it wants to call dax_operations.

For now, we refactor the dax pseudo-fs to be a generic facility rather
than an implementation detail of the device-dax use case. A "dax
device" is just an inode plus dax infrastructure, and "Device DAX" is a
mapping service layered on top of that base 'struct dax_device'.
"Filesystem DAX" is then a mapping service that layers a filesystem on
top of that same base device. Filesystem DAX is associated with a
block_device for now, but it may be associated directly with a dax
device in the future, or with new pmem-only filesystems.

[1]: https://lkml.org/lkml/2017/1/19/880

Suggested-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
parent 5f0694b3
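
As a sketch of the registration flow described in the commit message (assuming
only the API declared in the refactored drivers/dax/dax.h below; the my_driver
type and functions are hypothetical, not part of this patch), a dax-capable
driver allocates a dax_device next to its block device and tears it down with
kill_dax() followed by put_dax():

/* Hypothetical driver-side sketch; only alloc_dax(), kill_dax() and
 * put_dax() come from this patch, the my_driver bits are illustrative. */
#include <linux/errno.h>
#include "dax.h"	/* the refactored header in drivers/dax/ */

struct my_driver {
	struct dax_device *dax_dev;
	void *virt_addr;
};

static int my_driver_enable_dax(struct my_driver *drv)
{
	/* publish a dax_device; @private lets later lookups reach driver state */
	drv->dax_dev = alloc_dax(drv);
	if (!drv->dax_dev)
		return -ENOMEM;
	return 0;
}

static void my_driver_disable_dax(struct my_driver *drv)
{
	/* mark the device dead and wait out in-flight readers (SRCU grace
	 * period), then drop the reference on the backing dax inode */
	kill_dax(drv->dax_dev);
	put_dax(drv->dax_dev);
}
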
drivers/Makefile
@@ -71,7 +71,7 @@ obj-$(CONFIG_PARPORT) += parport/
 obj-$(CONFIG_NVM) += lightnvm/
 obj-y += base/ block/ misc/ mfd/ nfc/
 obj-$(CONFIG_LIBNVDIMM) += nvdimm/
-obj-$(CONFIG_DEV_DAX) += dax/
+obj-$(CONFIG_DAX) += dax/
 obj-$(CONFIG_DMA_SHARED_BUFFER) += dma-buf/
 obj-$(CONFIG_NUBUS) += nubus/
 obj-y += macintosh/
drivers/dax/Kconfig
-menuconfig DEV_DAX
+menuconfig DAX
 	tristate "DAX: direct access to differentiated memory"
+	select SRCU
 	default m if NVDIMM_DAX
+
+if DAX
+
+config DEV_DAX
+	tristate "Device DAX: direct access mapping device"
 	depends on TRANSPARENT_HUGEPAGE
-	select SRCU
 	help
 	  Support raw access to differentiated (persistence, bandwidth,
 	  latency...) memory via an mmap(2) capable character
@@ -11,7 +16,6 @@ menuconfig DEV_DAX
 	  baseline memory pool. Mappings of a /dev/daxX.Y device impose
 	  restrictions that make the mapping behavior deterministic.
 
-if DEV_DAX
 
 config DEV_DAX_PMEM
 	tristate "PMEM DAX: direct access to persistent memory"
drivers/dax/Makefile
-obj-$(CONFIG_DEV_DAX) += dax.o
+obj-$(CONFIG_DAX) += dax.o
+obj-$(CONFIG_DEV_DAX) += device_dax.o
 obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o
 
+dax-y := super.o
 dax_pmem-y := pmem.o
+device_dax-y := device.o
drivers/dax/dax.h
 /*
- * Copyright(c) 2016 Intel Corporation. All rights reserved.
+ * Copyright(c) 2016 - 2017 Intel Corporation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of version 2 of the GNU General Public License as
@@ -12,14 +12,12 @@
  */
 #ifndef __DAX_H__
 #define __DAX_H__
-struct device;
-struct dev_dax;
-struct resource;
-struct dax_region;
-void dax_region_put(struct dax_region *dax_region);
-struct dax_region *alloc_dax_region(struct device *parent,
-		int region_id, struct resource *res, unsigned int align,
-		void *addr, unsigned long flags);
-struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region,
-		struct resource *res, int count);
+struct dax_device;
+struct dax_device *alloc_dax(void *private);
+void put_dax(struct dax_device *dax_dev);
+bool dax_alive(struct dax_device *dax_dev);
+void kill_dax(struct dax_device *dax_dev);
+struct dax_device *inode_dax(struct inode *inode);
+struct inode *dax_inode(struct dax_device *dax_dev);
+void *dax_get_private(struct dax_device *dax_dev);
 #endif /* __DAX_H__ */
drivers/dax/device-dax.h (new file)
/*
* Copyright(c) 2016 Intel Corporation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#ifndef __DEVICE_DAX_H__
#define __DEVICE_DAX_H__
struct device;
struct dev_dax;
struct resource;
struct dax_region;
void dax_region_put(struct dax_region *dax_region);
struct dax_region *alloc_dax_region(struct device *parent,
int region_id, struct resource *res, unsigned int align,
void *addr, unsigned long flags);
struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region,
struct resource *res, int count);
#endif /* __DEVICE_DAX_H__ */
(diff collapsed in the original view: drivers/dax/dax.c renamed to drivers/dax/device.c, the device-dax mapping driver)
drivers/dax/pmem.c
@@ -16,7 +16,7 @@
 #include <linux/pfn_t.h>
 #include "../nvdimm/pfn.h"
 #include "../nvdimm/nd.h"
-#include "dax.h"
+#include "device-dax.h"
 
 struct dax_pmem {
 	struct device *dev;
drivers/dax/super.c (new file)
/*
* Copyright(c) 2017 Intel Corporation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/pagemap.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/magic.h>
#include <linux/cdev.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/fs.h>
static int nr_dax = CONFIG_NR_DEV_DAX;
module_param(nr_dax, int, S_IRUGO);
MODULE_PARM_DESC(nr_dax, "max number of dax device instances");
static dev_t dax_devt;
DEFINE_STATIC_SRCU(dax_srcu);
static struct vfsmount *dax_mnt;
static DEFINE_IDA(dax_minor_ida);
static struct kmem_cache *dax_cache __read_mostly;
static struct super_block *dax_superblock __read_mostly;
int dax_read_lock(void)
{
return srcu_read_lock(&dax_srcu);
}
EXPORT_SYMBOL_GPL(dax_read_lock);
void dax_read_unlock(int id)
{
srcu_read_unlock(&dax_srcu, id);
}
EXPORT_SYMBOL_GPL(dax_read_unlock);
/**
* struct dax_device - anchor object for dax services
* @inode: core vfs
* @cdev: optional character interface for "device dax"
* @private: dax driver private data
* @alive: !alive + rcu grace period == no new operations / mappings
*/
struct dax_device {
struct inode inode;
struct cdev cdev;
void *private;
bool alive;
};
bool dax_alive(struct dax_device *dax_dev)
{
lockdep_assert_held(&dax_srcu);
return dax_dev->alive;
}
EXPORT_SYMBOL_GPL(dax_alive);
/*
* Note, rcu is not protecting the liveness of dax_dev, rcu is ensuring
* that any fault handlers or operations that might have seen
* dax_alive(), have completed. Any operations that start after
* synchronize_srcu() has run will abort upon seeing !dax_alive().
*/
void kill_dax(struct dax_device *dax_dev)
{
if (!dax_dev)
return;
dax_dev->alive = false;
synchronize_srcu(&dax_srcu);
dax_dev->private = NULL;
}
EXPORT_SYMBOL_GPL(kill_dax);
static struct inode *dax_alloc_inode(struct super_block *sb)
{
struct dax_device *dax_dev;
dax_dev = kmem_cache_alloc(dax_cache, GFP_KERNEL);
return &dax_dev->inode;
}
static struct dax_device *to_dax_dev(struct inode *inode)
{
return container_of(inode, struct dax_device, inode);
}
static void dax_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
struct dax_device *dax_dev = to_dax_dev(inode);
ida_simple_remove(&dax_minor_ida, MINOR(inode->i_rdev));
kmem_cache_free(dax_cache, dax_dev);
}
static void dax_destroy_inode(struct inode *inode)
{
struct dax_device *dax_dev = to_dax_dev(inode);
WARN_ONCE(dax_dev->alive,
"kill_dax() must be called before final iput()\n");
call_rcu(&inode->i_rcu, dax_i_callback);
}
static const struct super_operations dax_sops = {
.statfs = simple_statfs,
.alloc_inode = dax_alloc_inode,
.destroy_inode = dax_destroy_inode,
.drop_inode = generic_delete_inode,
};
static struct dentry *dax_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
return mount_pseudo(fs_type, "dax:", &dax_sops, NULL, DAXFS_MAGIC);
}
static struct file_system_type dax_fs_type = {
.name = "dax",
.mount = dax_mount,
.kill_sb = kill_anon_super,
};
static int dax_test(struct inode *inode, void *data)
{
dev_t devt = *(dev_t *) data;
return inode->i_rdev == devt;
}
static int dax_set(struct inode *inode, void *data)
{
dev_t devt = *(dev_t *) data;
inode->i_rdev = devt;
return 0;
}
static struct dax_device *dax_dev_get(dev_t devt)
{
struct dax_device *dax_dev;
struct inode *inode;
inode = iget5_locked(dax_superblock, hash_32(devt + DAXFS_MAGIC, 31),
dax_test, dax_set, &devt);
if (!inode)
return NULL;
dax_dev = to_dax_dev(inode);
if (inode->i_state & I_NEW) {
dax_dev->alive = true;
inode->i_cdev = &dax_dev->cdev;
inode->i_mode = S_IFCHR;
inode->i_flags = S_DAX;
mapping_set_gfp_mask(&inode->i_data, GFP_USER);
unlock_new_inode(inode);
}
return dax_dev;
}
struct dax_device *alloc_dax(void *private)
{
struct dax_device *dax_dev;
dev_t devt;
int minor;
minor = ida_simple_get(&dax_minor_ida, 0, nr_dax, GFP_KERNEL);
if (minor < 0)
return NULL;
devt = MKDEV(MAJOR(dax_devt), minor);
dax_dev = dax_dev_get(devt);
if (!dax_dev)
goto err_inode;
dax_dev->private = private;
return dax_dev;
err_inode:
ida_simple_remove(&dax_minor_ida, minor);
return NULL;
}
EXPORT_SYMBOL_GPL(alloc_dax);
void put_dax(struct dax_device *dax_dev)
{
if (!dax_dev)
return;
iput(&dax_dev->inode);
}
EXPORT_SYMBOL_GPL(put_dax);
/**
* inode_dax: convert a public inode into its dax_dev
* @inode: An inode with i_cdev pointing to a dax_dev
*
* Note this is not equivalent to to_dax_dev() which is for private
* internal use where we know the inode filesystem type == dax_fs_type.
*/
struct dax_device *inode_dax(struct inode *inode)
{
struct cdev *cdev = inode->i_cdev;
return container_of(cdev, struct dax_device, cdev);
}
EXPORT_SYMBOL_GPL(inode_dax);
struct inode *dax_inode(struct dax_device *dax_dev)
{
return &dax_dev->inode;
}
EXPORT_SYMBOL_GPL(dax_inode);
void *dax_get_private(struct dax_device *dax_dev)
{
return dax_dev->private;
}
EXPORT_SYMBOL_GPL(dax_get_private);
static void init_once(void *_dax_dev)
{
struct dax_device *dax_dev = _dax_dev;
struct inode *inode = &dax_dev->inode;
inode_init_once(inode);
}
static int __dax_fs_init(void)
{
int rc;
dax_cache = kmem_cache_create("dax_cache", sizeof(struct dax_device), 0,
(SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (!dax_cache)
return -ENOMEM;
rc = register_filesystem(&dax_fs_type);
if (rc)
goto err_register_fs;
dax_mnt = kern_mount(&dax_fs_type);
if (IS_ERR(dax_mnt)) {
rc = PTR_ERR(dax_mnt);
goto err_mount;
}
dax_superblock = dax_mnt->mnt_sb;
return 0;
err_mount:
unregister_filesystem(&dax_fs_type);
err_register_fs:
kmem_cache_destroy(dax_cache);
return rc;
}
static void __dax_fs_exit(void)
{
kern_unmount(dax_mnt);
unregister_filesystem(&dax_fs_type);
kmem_cache_destroy(dax_cache);
}
static int __init dax_fs_init(void)
{
int rc;
rc = __dax_fs_init();
if (rc)
return rc;
nr_dax = max(nr_dax, 256);
rc = alloc_chrdev_region(&dax_devt, 0, nr_dax, "dax");
if (rc)
__dax_fs_exit();
return rc;
}
static void __exit dax_fs_exit(void)
{
unregister_chrdev_region(dax_devt, nr_dax);
ida_destroy(&dax_minor_ida);
__dax_fs_exit();
}
MODULE_AUTHOR("Intel Corporation");
MODULE_LICENSE("GPL v2");
subsys_initcall(dax_fs_init);
module_exit(dax_fs_exit);
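
To illustrate the SRCU scheme implemented above (a sketch only; my_dax_op() is
a hypothetical caller, not part of this patch), a consumer brackets its work
with dax_read_lock()/dax_read_unlock() and re-checks dax_alive() before
touching the driver's private data:

/* Hypothetical consumer-side sketch of the dax_read_lock()/dax_alive()
 * pattern established by super.c above. */
#include <linux/errno.h>
#include "dax.h"	/* dax_read_lock(), dax_alive(), dax_get_private() */

static int my_dax_op(struct dax_device *dax_dev)
{
	int id, rc = -ENXIO;

	id = dax_read_lock();
	if (dax_alive(dax_dev)) {
		void *data = dax_get_private(dax_dev);

		/* @data is the pointer passed to alloc_dax(); it remains
		 * valid until dax_read_unlock() even if kill_dax() races */
		rc = data ? 0 : -ENXIO;
	}
	dax_read_unlock(id);
	return rc;
}
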
include/linux/dax.h
@@ -8,6 +8,9 @@
 
 struct iomap_ops;
 
+int dax_read_lock(void);
+void dax_read_unlock(int id);
+
 /*
  * We use lowest available bit in exceptional entry for locking, one bit for
  * the entry size (PMD) and two more to tell us if the entry is a huge zero
tools/testing/nvdimm/Makefile
@@ -28,7 +28,10 @@ obj-$(CONFIG_ND_BTT) += nd_btt.o
 obj-$(CONFIG_ND_BLK) += nd_blk.o
 obj-$(CONFIG_X86_PMEM_LEGACY) += nd_e820.o
 obj-$(CONFIG_ACPI_NFIT) += nfit.o
-obj-$(CONFIG_DEV_DAX) += dax.o
+ifeq ($(CONFIG_DAX),m)
+obj-$(CONFIG_DAX) += dax.o
+endif
+obj-$(CONFIG_DEV_DAX) += device_dax.o
 obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o
 
 nfit-y := $(ACPI_SRC)/core.o
@@ -48,9 +51,12 @@ nd_blk-y += config_check.o
 nd_e820-y := $(NVDIMM_SRC)/e820.o
 nd_e820-y += config_check.o
 
-dax-y := $(DAX_SRC)/dax.o
+dax-y := $(DAX_SRC)/super.o
 dax-y += config_check.o
 
+device_dax-y := $(DAX_SRC)/device.o
+device_dax-y += config_check.o
+
 dax_pmem-y := $(DAX_SRC)/pmem.o
 dax_pmem-y += config_check.o