Commit 7b6be844 authored by Dan Williams

dax: refactor dax-fs into a generic provider of 'struct dax_device' instances

We want dax capable drivers to be able to publish a set of dax
operations [1]. However, we do not want to further abuse block_devices
to advertise these operations. Instead we will attach these operations
to a dax device and add a lookup mechanism to go from block device path
to a dax device. A dax capable driver like pmem or brd is responsible
for registering a dax device, alongside a block device, and then a dax
capable filesystem is responsible for retrieving the dax device by path
name if it wants to call dax_operations.

For now, we refactor the dax pseudo-fs to be a generic facility rather
than an implementation detail of the device-dax use case. A "dax
device" is just an inode plus dax infrastructure, and "Device DAX" is a
mapping service layered on top of that base 'struct dax_device'.
"Filesystem DAX" is then a mapping service that layers a filesystem on
top of that same base device. Filesystem DAX is associated with a
block_device for now, but it may be associated directly with a dax
device in the future, or with new pmem-only filesystems.

[1]: https://lkml.org/lkml/2017/1/19/880

Suggested-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
parent 5f0694b3
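
As a sketch of the registration flow described in the commit message (assuming
only the API declared in the refactored drivers/dax/dax.h below; the my_driver
type and functions are hypothetical, not part of this patch), a dax-capable
driver allocates a dax_device next to its block device and tears it down with
kill_dax() followed by put_dax():

/* Hypothetical driver-side sketch; only alloc_dax(), kill_dax() and
 * put_dax() come from this patch, the my_driver bits are illustrative. */
#include <linux/errno.h>
#include "dax.h"	/* the refactored header in drivers/dax/ */

struct my_driver {
	struct dax_device *dax_dev;
	void *virt_addr;
};

static int my_driver_enable_dax(struct my_driver *drv)
{
	/* publish a dax_device; @private lets later lookups reach driver state */
	drv->dax_dev = alloc_dax(drv);
	if (!drv->dax_dev)
		return -ENOMEM;
	return 0;
}

static void my_driver_disable_dax(struct my_driver *drv)
{
	/* mark the device dead and wait out in-flight readers (SRCU grace
	 * period), then drop the reference on the backing dax inode */
	kill_dax(drv->dax_dev);
	put_dax(drv->dax_dev);
}
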
drivers/Makefile
@@ -71,7 +71,7 @@ obj-$(CONFIG_PARPORT) += parport/
 obj-$(CONFIG_NVM) += lightnvm/
 obj-y += base/ block/ misc/ mfd/ nfc/
 obj-$(CONFIG_LIBNVDIMM) += nvdimm/
-obj-$(CONFIG_DEV_DAX) += dax/
+obj-$(CONFIG_DAX) += dax/
 obj-$(CONFIG_DMA_SHARED_BUFFER) += dma-buf/
 obj-$(CONFIG_NUBUS) += nubus/
 obj-y += macintosh/
drivers/dax/Kconfig
-menuconfig DEV_DAX
+menuconfig DAX
 	tristate "DAX: direct access to differentiated memory"
+	select SRCU
 	default m if NVDIMM_DAX
+
+if DAX
+
+config DEV_DAX
+	tristate "Device DAX: direct access mapping device"
 	depends on TRANSPARENT_HUGEPAGE
-	select SRCU
 	help
 	  Support raw access to differentiated (persistence, bandwidth,
 	  latency...) memory via an mmap(2) capable character
@@ -11,7 +16,6 @@ menuconfig DEV_DAX
 	  baseline memory pool. Mappings of a /dev/daxX.Y device impose
 	  restrictions that make the mapping behavior deterministic.
 
-if DEV_DAX
 
 config DEV_DAX_PMEM
 	tristate "PMEM DAX: direct access to persistent memory"
drivers/dax/Makefile
-obj-$(CONFIG_DEV_DAX) += dax.o
+obj-$(CONFIG_DAX) += dax.o
+obj-$(CONFIG_DEV_DAX) += device_dax.o
 obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o
 
+dax-y := super.o
 dax_pmem-y := pmem.o
+device_dax-y := device.o
drivers/dax/dax.h
 /*
- * Copyright(c) 2016 Intel Corporation. All rights reserved.
+ * Copyright(c) 2016 - 2017 Intel Corporation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of version 2 of the GNU General Public License as
@@ -12,14 +12,12 @@
  */
 #ifndef __DAX_H__
 #define __DAX_H__
-struct device;
-struct dev_dax;
-struct resource;
-struct dax_region;
-void dax_region_put(struct dax_region *dax_region);
-struct dax_region *alloc_dax_region(struct device *parent,
-		int region_id, struct resource *res, unsigned int align,
-		void *addr, unsigned long flags);
-struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region,
-		struct resource *res, int count);
+struct dax_device;
+struct dax_device *alloc_dax(void *private);
+void put_dax(struct dax_device *dax_dev);
+bool dax_alive(struct dax_device *dax_dev);
+void kill_dax(struct dax_device *dax_dev);
+struct dax_device *inode_dax(struct inode *inode);
+struct inode *dax_inode(struct dax_device *dax_dev);
+void *dax_get_private(struct dax_device *dax_dev);
 #endif /* __DAX_H__ */
drivers/dax/device-dax.h (new file)
/*
* Copyright(c) 2016 Intel Corporation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#ifndef __DEVICE_DAX_H__
#define __DEVICE_DAX_H__
struct device;
struct dev_dax;
struct resource;
struct dax_region;
void dax_region_put(struct dax_region *dax_region);
struct dax_region *alloc_dax_region(struct device *parent,
int region_id, struct resource *res, unsigned int align,
void *addr, unsigned long flags);
struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region,
struct resource *res, int count);
#endif /* __DEVICE_DAX_H__ */
(diff collapsed in the original view: drivers/dax/dax.c renamed to drivers/dax/device.c, the device-dax mapping driver)
drivers/dax/pmem.c
@@ -16,7 +16,7 @@
 #include <linux/pfn_t.h>
 #include "../nvdimm/pfn.h"
 #include "../nvdimm/nd.h"
-#include "dax.h"
+#include "device-dax.h"
 
 struct dax_pmem {
 	struct device *dev;
drivers/dax/super.c (new file)
/*
* Copyright(c) 2017 Intel Corporation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/pagemap.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/magic.h>
#include <linux/cdev.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/fs.h>
static int nr_dax = CONFIG_NR_DEV_DAX;
module_param(nr_dax, int, S_IRUGO);
MODULE_PARM_DESC(nr_dax, "max number of dax device instances");
static dev_t dax_devt;
DEFINE_STATIC_SRCU(dax_srcu);
static struct vfsmount *dax_mnt;
static DEFINE_IDA(dax_minor_ida);
static struct kmem_cache *dax_cache __read_mostly;
static struct super_block *dax_superblock __read_mostly;
int dax_read_lock(void)
{
return srcu_read_lock(&dax_srcu);
}
EXPORT_SYMBOL_GPL(dax_read_lock);
void dax_read_unlock(int id)
{
srcu_read_unlock(&dax_srcu, id);
}
EXPORT_SYMBOL_GPL(dax_read_unlock);
/**
* struct dax_device - anchor object for dax services
* @inode: core vfs
* @cdev: optional character interface for "device dax"
* @private: dax driver private data
* @alive: !alive + rcu grace period == no new operations / mappings
*/
struct dax_device {
struct inode inode;
struct cdev cdev;
void *private;
bool alive;
};
bool dax_alive(struct dax_device *dax_dev)
{
lockdep_assert_held(&dax_srcu);
return dax_dev->alive;
}
EXPORT_SYMBOL_GPL(dax_alive);
/*
* Note, rcu is not protecting the liveness of dax_dev, rcu is ensuring
* that any fault handlers or operations that might have seen
* dax_alive(), have completed. Any operations that start after
* synchronize_srcu() has run will abort upon seeing !dax_alive().
*/
void kill_dax(struct dax_device *dax_dev)
{
if (!dax_dev)
return;
dax_dev->alive = false;
synchronize_srcu(&dax_srcu);
dax_dev->private = NULL;
}
EXPORT_SYMBOL_GPL(kill_dax);
static struct inode *dax_alloc_inode(struct super_block *sb)
{
struct dax_device *dax_dev;
dax_dev = kmem_cache_alloc(dax_cache, GFP_KERNEL);
return &dax_dev->inode;
}
static struct dax_device *to_dax_dev(struct inode *inode)
{
return container_of(inode, struct dax_device, inode);
}
static void dax_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
struct dax_device *dax_dev = to_dax_dev(inode);
ida_simple_remove(&dax_minor_ida, MINOR(inode->i_rdev));
kmem_cache_free(dax_cache, dax_dev);
}
static void dax_destroy_inode(struct inode *inode)
{
struct dax_device *dax_dev = to_dax_dev(inode);
WARN_ONCE(dax_dev->alive,
"kill_dax() must be called before final iput()\n");
call_rcu(&inode->i_rcu, dax_i_callback);
}
static const struct super_operations dax_sops = {
.statfs = simple_statfs,
.alloc_inode = dax_alloc_inode,
.destroy_inode = dax_destroy_inode,
.drop_inode = generic_delete_inode,
};
static struct dentry *dax_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
return mount_pseudo(fs_type, "dax:", &dax_sops, NULL, DAXFS_MAGIC);
}
static struct file_system_type dax_fs_type = {
.name = "dax",
.mount = dax_mount,
.kill_sb = kill_anon_super,
};
static int dax_test(struct inode *inode, void *data)
{
dev_t devt = *(dev_t *) data;
return inode->i_rdev == devt;
}
static int dax_set(struct inode *inode, void *data)
{
dev_t devt = *(dev_t *) data;
inode->i_rdev = devt;
return 0;
}
static struct dax_device *dax_dev_get(dev_t devt)
{
struct dax_device *dax_dev;
struct inode *inode;
inode = iget5_locked(dax_superblock, hash_32(devt + DAXFS_MAGIC, 31),
dax_test, dax_set, &devt);
if (!inode)
return NULL;
dax_dev = to_dax_dev(inode);
if (inode->i_state & I_NEW) {
dax_dev->alive = true;
inode->i_cdev = &dax_dev->cdev;
inode->i_mode = S_IFCHR;
inode->i_flags = S_DAX;
mapping_set_gfp_mask(&inode->i_data, GFP_USER);
unlock_new_inode(inode);
}
return dax_dev;
}
struct dax_device *alloc_dax(void *private)
{
struct dax_device *dax_dev;
dev_t devt;
int minor;
minor = ida_simple_get(&dax_minor_ida, 0, nr_dax, GFP_KERNEL);
if (minor < 0)
return NULL;
devt = MKDEV(MAJOR(dax_devt), minor);
dax_dev = dax_dev_get(devt);
if (!dax_dev)
goto err_inode;
dax_dev->private = private;
return dax_dev;
err_inode:
ida_simple_remove(&dax_minor_ida, minor);
return NULL;
}
EXPORT_SYMBOL_GPL(alloc_dax);
void put_dax(struct dax_device *dax_dev)
{
if (!dax_dev)
return;
iput(&dax_dev->inode);
}
EXPORT_SYMBOL_GPL(put_dax);
/**
* inode_dax: convert a public inode into its dax_dev
* @inode: An inode with i_cdev pointing to a dax_dev
*
* Note this is not equivalent to to_dax_dev() which is for private
* internal use where we know the inode filesystem type == dax_fs_type.
*/
struct dax_device *inode_dax(struct inode *inode)
{
struct cdev *cdev = inode->i_cdev;
return container_of(cdev, struct dax_device, cdev);
}
EXPORT_SYMBOL_GPL(inode_dax);
struct inode *dax_inode(struct dax_device *dax_dev)
{
return &dax_dev->inode;
}
EXPORT_SYMBOL_GPL(dax_inode);
void *dax_get_private(struct dax_device *dax_dev)
{
return dax_dev->private;
}
EXPORT_SYMBOL_GPL(dax_get_private);
static void init_once(void *_dax_dev)
{
struct dax_device *dax_dev = _dax_dev;
struct inode *inode = &dax_dev->inode;
inode_init_once(inode);
}
static int __dax_fs_init(void)
{
int rc;
dax_cache = kmem_cache_create("dax_cache", sizeof(struct dax_device), 0,
(SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (!dax_cache)
return -ENOMEM;
rc = register_filesystem(&dax_fs_type);
if (rc)
goto err_register_fs;
dax_mnt = kern_mount(&dax_fs_type);
if (IS_ERR(dax_mnt)) {
rc = PTR_ERR(dax_mnt);
goto err_mount;
}
dax_superblock = dax_mnt->mnt_sb;
return 0;
err_mount:
unregister_filesystem(&dax_fs_type);
err_register_fs:
kmem_cache_destroy(dax_cache);
return rc;
}
static void __dax_fs_exit(void)
{
kern_unmount(dax_mnt);
unregister_filesystem(&dax_fs_type);
kmem_cache_destroy(dax_cache);
}
static int __init dax_fs_init(void)
{
int rc;
rc = __dax_fs_init();
if (rc)
return rc;
nr_dax = max(nr_dax, 256);
rc = alloc_chrdev_region(&dax_devt, 0, nr_dax, "dax");
if (rc)
__dax_fs_exit();
return rc;
}
static void __exit dax_fs_exit(void)
{
unregister_chrdev_region(dax_devt, nr_dax);
ida_destroy(&dax_minor_ida);
__dax_fs_exit();
}
MODULE_AUTHOR("Intel Corporation");
MODULE_LICENSE("GPL v2");
subsys_initcall(dax_fs_init);
module_exit(dax_fs_exit);
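
To illustrate the SRCU scheme implemented above (a sketch only; my_dax_op() is
a hypothetical caller, not part of this patch), a consumer brackets its work
with dax_read_lock()/dax_read_unlock() and re-checks dax_alive() before
touching the driver's private data:

/* Hypothetical consumer-side sketch of the dax_read_lock()/dax_alive()
 * pattern established by super.c above. */
#include <linux/errno.h>
#include "dax.h"	/* dax_read_lock(), dax_alive(), dax_get_private() */

static int my_dax_op(struct dax_device *dax_dev)
{
	int id, rc = -ENXIO;

	id = dax_read_lock();
	if (dax_alive(dax_dev)) {
		void *data = dax_get_private(dax_dev);

		/* @data is the pointer passed to alloc_dax(); it remains
		 * valid until dax_read_unlock() even if kill_dax() races */
		rc = data ? 0 : -ENXIO;
	}
	dax_read_unlock(id);
	return rc;
}
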
include/linux/dax.h
@@ -8,6 +8,9 @@
 
 struct iomap_ops;
 
+int dax_read_lock(void);
+void dax_read_unlock(int id);
+
 /*
  * We use lowest available bit in exceptional entry for locking, one bit for
  * the entry size (PMD) and two more to tell us if the entry is a huge zero
tools/testing/nvdimm/Makefile
@@ -28,7 +28,10 @@ obj-$(CONFIG_ND_BTT) += nd_btt.o
 obj-$(CONFIG_ND_BLK) += nd_blk.o
 obj-$(CONFIG_X86_PMEM_LEGACY) += nd_e820.o
 obj-$(CONFIG_ACPI_NFIT) += nfit.o
-obj-$(CONFIG_DEV_DAX) += dax.o
+ifeq ($(CONFIG_DAX),m)
+obj-$(CONFIG_DAX) += dax.o
+endif
+obj-$(CONFIG_DEV_DAX) += device_dax.o
 obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o
 
 nfit-y := $(ACPI_SRC)/core.o
@@ -48,9 +51,12 @@ nd_blk-y += config_check.o
 nd_e820-y := $(NVDIMM_SRC)/e820.o
 nd_e820-y += config_check.o
 
-dax-y := $(DAX_SRC)/dax.o
+dax-y := $(DAX_SRC)/super.o
 dax-y += config_check.o
 
+device_dax-y := $(DAX_SRC)/device.o
+device_dax-y += config_check.o
+
 dax_pmem-y := $(DAX_SRC)/pmem.o
 dax_pmem-y += config_check.o