Commit d5ff0814 authored by Linus Torvalds

Merge branch 'libnvdimm-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm

Pull nvdimm fixes from Dan Williams:
 "A small crop of lockdep, sleeping while atomic, and other fixes /
  band-aids in advance of the full-blown reworks targeting the next
  merge window. The largest change here is "libnvdimm: fix blk free
  space accounting" which deletes a pile of buggy code that better
  testing would have caught before merging. The next change that is
  borderline too big for a late rc is switching the device-dax locking
  from rcu to srcu; I couldn't think of a smaller way to make that fix.

  The __copy_user_nocache fix will have a full replacement in 4.12 to
  move those pmem special case considerations into the pmem driver. The
  "libnvdimm: band aid btt vs clear poison locking" commit admits that
  our error clearing support for btt went in broken, so we just disable
  it in 4.11 and -stable. A replacement / full fix is in the pipeline
  for 4.12.

  Some of these would have been caught earlier had DEBUG_ATOMIC_SLEEP
  been enabled on my development station. I wonder if we should have:

      config DEBUG_ATOMIC_SLEEP
        default PROVE_LOCKING

  ...since I mistakenly thought I got both with PROVE_LOCKING=y.

  These have received a build success notification from the 0day robot,
  and some have appeared in a -next release with no reported issues"

* 'libnvdimm-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm:
  x86, pmem: fix broken __copy_user_nocache cache-bypass assumptions
  device-dax: switch to srcu, fix rcu_read_lock() vs pte allocation
  libnvdimm: band aid btt vs clear poison locking
  libnvdimm: fix reconfig_mutex, mmap_sem, and jbd2_handle lockdep splat
  libnvdimm: fix blk free space accounting
  acpi, nfit, libnvdimm: fix interleave set cookie calculation (64-bit comparison)
parents 403a39f8 11e63f6d
@@ -55,7 +55,8 @@ static inline int arch_memcpy_from_pmem(void *dst, const void *src, size_t n)
  * @size: number of bytes to write back
  *
  * Write back a cache range using the CLWB (cache line write back)
- * instruction.
+ * instruction. Note that @size is internally rounded up to be cache
+ * line size aligned.
  */
 static inline void arch_wb_cache_pmem(void *addr, size_t size)
 {
@@ -69,15 +70,6 @@ static inline void arch_wb_cache_pmem(void *addr, size_t size)
                 clwb(p);
 }
 
-/*
- * copy_from_iter_nocache() on x86 only uses non-temporal stores for iovec
- * iterators, so for other types (bvec & kvec) we must do a cache write-back.
- */
-static inline bool __iter_needs_pmem_wb(struct iov_iter *i)
-{
-        return iter_is_iovec(i) == false;
-}
-
 /**
  * arch_copy_from_iter_pmem - copy data from an iterator to PMEM
  * @addr: PMEM destination address
@@ -94,7 +86,35 @@ static inline size_t arch_copy_from_iter_pmem(void *addr, size_t bytes,
         /* TODO: skip the write-back by always using non-temporal stores */
         len = copy_from_iter_nocache(addr, bytes, i);
 
-        if (__iter_needs_pmem_wb(i))
+        /*
+         * In the iovec case on x86_64 copy_from_iter_nocache() uses
+         * non-temporal stores for the bulk of the transfer, but we need
+         * to manually flush if the transfer is unaligned. A cached
+         * memory copy is used when destination or size is not naturally
+         * aligned. That is:
+         *   - Require 8-byte alignment when size is 8 bytes or larger.
+         *   - Require 4-byte alignment when size is 4 bytes.
+         *
+         * In the non-iovec case the entire destination needs to be
+         * flushed.
+         */
+        if (iter_is_iovec(i)) {
+                unsigned long flushed, dest = (unsigned long) addr;
+
+                if (bytes < 8) {
+                        if (!IS_ALIGNED(dest, 4) || (bytes != 4))
+                                arch_wb_cache_pmem(addr, 1);
+                } else {
+                        if (!IS_ALIGNED(dest, 8)) {
+                                dest = ALIGN(dest, boot_cpu_data.x86_clflush_size);
+                                arch_wb_cache_pmem(addr, 1);
+                        }
+
+                        flushed = dest - (unsigned long) addr;
+                        if (bytes > flushed && !IS_ALIGNED(bytes - flushed, 8))
+                                arch_wb_cache_pmem(addr + bytes - 1, 1);
+                }
+        } else
                 arch_wb_cache_pmem(addr, bytes);
 
         return len;
...
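The new comment spells out when the x86_64 nocache copy falls back to cached stores, which is what the added flush logic has to clean up after. As a rough userspace restatement of that destination-alignment rule (the helper name and the whole-transfer granularity are mine; the hunk itself only flushes the affected head and tail cache lines):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define IS_ALIGNED(x, a) (((x) & ((a) - 1)) == 0)

    /* Conservative stand-in: "might this iovec transfer have gone through
     * the cache and therefore still need an explicit write-back?" */
    static bool may_need_manual_flush(uintptr_t dest, size_t bytes)
    {
        if (bytes < 8)
            return !IS_ALIGNED(dest, 4) || bytes != 4;
        return !IS_ALIGNED(dest, 8) || !IS_ALIGNED(bytes, 8);
    }

    int main(void)
    {
        /* aligned 4-byte write: pure non-temporal store, nothing to flush */
        printf("%d\n", may_need_manual_flush(0x1004, 4));  /* prints 0 */
        /* unaligned 16-byte write: cached head/tail, flush required */
        printf("%d\n", may_need_manual_flush(0x1001, 16)); /* prints 1 */
        return 0;
    }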
@@ -1617,7 +1617,11 @@ static int cmp_map(const void *m0, const void *m1)
         const struct nfit_set_info_map *map0 = m0;
         const struct nfit_set_info_map *map1 = m1;
 
-        return map0->region_offset - map1->region_offset;
+        if (map0->region_offset < map1->region_offset)
+                return -1;
+        else if (map0->region_offset > map1->region_offset)
+                return 1;
+        return 0;
 }
 
 /* Retrieve the nth entry referencing this spa */
...
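The deleted one-liner funnels the difference of two 64-bit region_offset values through an int return, so offsets that differ only above bit 31 compare as equal (or with a flipped sign) and the sort behind the interleave-set cookie can produce the wrong order. A minimal userspace reproduction of the truncation, with a stand-in struct:

    #include <stdint.h>
    #include <stdio.h>

    /* stand-in for the 64-bit field in struct nfit_set_info_map */
    struct map { uint64_t region_offset; };

    /* old comparator: the u64 difference is narrowed to int on return */
    static int cmp_map_broken(const struct map *a, const struct map *b)
    {
        return a->region_offset - b->region_offset;
    }

    int main(void)
    {
        struct map lo = { .region_offset = 0 };
        struct map hi = { .region_offset = 1ULL << 32 }; /* differs only above bit 31 */

        /* the low 32 bits of the difference are zero, so "equal" */
        printf("%d\n", cmp_map_broken(&hi, &lo));        /* prints 0 */
        return 0;
    }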
@@ -2,6 +2,7 @@ menuconfig DEV_DAX
         tristate "DAX: direct access to differentiated memory"
         default m if NVDIMM_DAX
         depends on TRANSPARENT_HUGEPAGE
+        select SRCU
         help
           Support raw access to differentiated (persistence, bandwidth,
           latency...) memory via an mmap(2) capable character
...
@@ -25,6 +25,7 @@
 #include "dax.h"
 
 static dev_t dax_devt;
+DEFINE_STATIC_SRCU(dax_srcu);
 static struct class *dax_class;
 static DEFINE_IDA(dax_minor_ida);
 static int nr_dax = CONFIG_NR_DEV_DAX;
@@ -60,7 +61,7 @@ struct dax_region {
  * @region - parent region
  * @dev - device backing the character device
  * @cdev - core chardev data
- * @alive - !alive + rcu grace period == no new mappings can be established
+ * @alive - !alive + srcu grace period == no new mappings can be established
  * @id - child id in the region
  * @num_resources - number of physical address extents in this device
  * @res - array of physical address ranges
@@ -569,7 +570,7 @@ static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
 static int dax_dev_huge_fault(struct vm_fault *vmf,
                 enum page_entry_size pe_size)
 {
-        int rc;
+        int rc, id;
         struct file *filp = vmf->vma->vm_file;
         struct dax_dev *dax_dev = filp->private_data;
 
@@ -578,7 +579,7 @@ static int dax_dev_huge_fault(struct vm_fault *vmf,
                         ? "write" : "read",
                         vmf->vma->vm_start, vmf->vma->vm_end);
 
-        rcu_read_lock();
+        id = srcu_read_lock(&dax_srcu);
         switch (pe_size) {
         case PE_SIZE_PTE:
                 rc = __dax_dev_pte_fault(dax_dev, vmf);
@@ -592,7 +593,7 @@ static int dax_dev_huge_fault(struct vm_fault *vmf,
         default:
                 return VM_FAULT_FALLBACK;
         }
-        rcu_read_unlock();
+        srcu_read_unlock(&dax_srcu, id);
 
         return rc;
 }
@@ -713,11 +714,11 @@ static void unregister_dax_dev(void *dev)
         * Note, rcu is not protecting the liveness of dax_dev, rcu is
         * ensuring that any fault handlers that might have seen
         * dax_dev->alive == true, have completed. Any fault handlers
-        * that start after synchronize_rcu() has started will abort
+        * that start after synchronize_srcu() has started will abort
         * upon seeing dax_dev->alive == false.
         */
        dax_dev->alive = false;
-       synchronize_rcu();
+       synchronize_srcu(&dax_srcu);
        unmap_mapping_range(dax_dev->inode->i_mapping, 0, 0, 1);
        cdev_del(cdev);
        device_unregister(dev);
...
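For context on the conversion: an srcu_read_lock() section, unlike rcu_read_lock(), is allowed to sleep, so a fault handler that may block allocating page tables stays legal while teardown still gets a grace-period guarantee from synchronize_srcu(). The lifecycle, reduced to a sketch with invented names (not the driver's symbols):

    #include <linux/errno.h>
    #include <linux/srcu.h>
    #include <linux/types.h>

    DEFINE_STATIC_SRCU(example_srcu);

    static bool example_alive = true;

    /* read side: may sleep (e.g. allocate page tables) inside the section */
    static int example_fault(void)
    {
        int id, rc = -ENXIO;

        id = srcu_read_lock(&example_srcu);
        if (example_alive)
            rc = 0;  /* ... service the fault, possibly sleeping ... */
        srcu_read_unlock(&example_srcu, id);
        return rc;
    }

    /* teardown: after this returns, no reader can still see alive == true */
    static void example_unregister(void)
    {
        example_alive = false;
        synchronize_srcu(&example_srcu);
        /* now safe to unmap and delete the character device */
    }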
@@ -934,8 +934,14 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
         rc = nd_desc->ndctl(nd_desc, nvdimm, cmd, buf, buf_len, NULL);
         if (rc < 0)
                 goto out_unlock;
+        nvdimm_bus_unlock(&nvdimm_bus->dev);
+
         if (copy_to_user(p, buf, buf_len))
                 rc = -EFAULT;
+
+        vfree(buf);
+        return rc;
+
  out_unlock:
         nvdimm_bus_unlock(&nvdimm_bus->dev);
  out:
...
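The splat being fixed comes from doing copy_to_user() while still holding the bus lock: the user copy can fault and sleep, which entangles the bus lock with the mmap_sem and jbd2_handle ordering named in the patch title. Stripped to its shape, with made-up names, the change is:

    #include <linux/errno.h>
    #include <linux/mutex.h>
    #include <linux/uaccess.h>

    /* before: user memory is touched with the lock held */
    static long example_ioctl_bad(struct mutex *bus_lock, void __user *p,
            void *buf, size_t n)
    {
        long rc = 0;

        mutex_lock(bus_lock);
        /* ... run the command, fill buf ... */
        if (copy_to_user(p, buf, n))  /* may fault and sleep under bus_lock */
            rc = -EFAULT;
        mutex_unlock(bus_lock);
        return rc;
    }

    /* after: drop the lock first, only then copy out the result */
    static long example_ioctl_good(struct mutex *bus_lock, void __user *p,
            void *buf, size_t n)
    {
        long rc = 0;

        mutex_lock(bus_lock);
        /* ... run the command, fill buf ... */
        mutex_unlock(bus_lock);

        if (copy_to_user(p, buf, n))
            rc = -EFAULT;
        return rc;
    }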
@@ -243,7 +243,15 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns,
         }
 
         if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align))) {
-                if (IS_ALIGNED(offset, 512) && IS_ALIGNED(size, 512)) {
+                /*
+                 * FIXME: nsio_rw_bytes() may be called from atomic
+                 * context in the btt case and nvdimm_clear_poison()
+                 * takes a sleeping lock. Until the locking can be
+                 * reworked this capability requires that the namespace
+                 * is not claimed by btt.
+                 */
+                if (IS_ALIGNED(offset, 512) && IS_ALIGNED(size, 512)
+                                && (!ndns->claim || !is_nd_btt(ndns->claim))) {
                         long cleared;
 
                         cleared = nvdimm_clear_poison(&ndns->dev, offset, size);
...
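The FIXME is a textbook sleeping-while-atomic hazard: the btt path can reach nsio_rw_bytes() in atomic context, and nvdimm_clear_poison() takes a sleeping lock. In isolation (all names invented), the pattern that CONFIG_DEBUG_ATOMIC_SLEEP flags looks like:

    #include <linux/mutex.h>
    #include <linux/spinlock.h>

    static DEFINE_SPINLOCK(example_map_lock);
    static DEFINE_MUTEX(example_clear_lock);

    static void example_clear_poison(void)
    {
        mutex_lock(&example_clear_lock);  /* may sleep */
        /* ... ask firmware to clear the bad range ... */
        mutex_unlock(&example_clear_lock);
    }

    static void example_btt_write(void)
    {
        spin_lock(&example_map_lock);     /* atomic context begins */
        example_clear_poison();           /* "sleeping function called from invalid context" */
        spin_unlock(&example_map_lock);
    }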
@@ -395,7 +395,7 @@ EXPORT_SYMBOL_GPL(nvdimm_create);
 
 int alias_dpa_busy(struct device *dev, void *data)
 {
-        resource_size_t map_end, blk_start, new, busy;
+        resource_size_t map_end, blk_start, new;
         struct blk_alloc_info *info = data;
         struct nd_mapping *nd_mapping;
         struct nd_region *nd_region;
@@ -436,29 +436,19 @@ int alias_dpa_busy(struct device *dev, void *data)
  retry:
         /*
          * Find the free dpa from the end of the last pmem allocation to
-         * the end of the interleave-set mapping that is not already
-         * covered by a blk allocation.
+         * the end of the interleave-set mapping.
          */
-        busy = 0;
         for_each_dpa_resource(ndd, res) {
+                if (strncmp(res->name, "pmem", 4) != 0)
+                        continue;
                 if ((res->start >= blk_start && res->start < map_end)
                                 || (res->end >= blk_start
                                         && res->end <= map_end)) {
-                        if (strncmp(res->name, "pmem", 4) == 0) {
-                                new = max(blk_start, min(map_end + 1,
-                                                        res->end + 1));
-                                if (new != blk_start) {
-                                        blk_start = new;
-                                        goto retry;
-                                }
-                        } else
-                                busy += min(map_end, res->end)
-                                        - max(nd_mapping->start, res->start) + 1;
-                } else if (nd_mapping->start > res->start
-                                && map_end < res->end) {
-                        /* total eclipse of the PMEM region mapping */
-                        busy += nd_mapping->size;
-                        break;
+                        new = max(blk_start, min(map_end + 1, res->end + 1));
+                        if (new != blk_start) {
+                                blk_start = new;
+                                goto retry;
+                        }
                 }
         }
@@ -470,50 +460,9 @@ int alias_dpa_busy(struct device *dev, void *data)
                 return 1;
         }
 
-        info->available -= blk_start - nd_mapping->start + busy;
+        info->available -= blk_start - nd_mapping->start;
 
         return 0;
 }
 
-static int blk_dpa_busy(struct device *dev, void *data)
-{
-        struct blk_alloc_info *info = data;
-        struct nd_mapping *nd_mapping;
-        struct nd_region *nd_region;
-        resource_size_t map_end;
-        int i;
-
-        if (!is_nd_pmem(dev))
-                return 0;
-
-        nd_region = to_nd_region(dev);
-        for (i = 0; i < nd_region->ndr_mappings; i++) {
-                nd_mapping = &nd_region->mapping[i];
-                if (nd_mapping->nvdimm == info->nd_mapping->nvdimm)
-                        break;
-        }
-
-        if (i >= nd_region->ndr_mappings)
-                return 0;
-
-        map_end = nd_mapping->start + nd_mapping->size - 1;
-        if (info->res->start >= nd_mapping->start
-                        && info->res->start < map_end) {
-                if (info->res->end <= map_end) {
-                        info->busy = 0;
-                        return 1;
-                } else {
-                        info->busy -= info->res->end - map_end;
-                        return 0;
-                }
-        } else if (info->res->end >= nd_mapping->start
-                        && info->res->end <= map_end) {
-                info->busy -= nd_mapping->start - info->res->start;
-                return 0;
-        } else {
-                info->busy -= nd_mapping->size;
-                return 0;
-        }
-}
-
 /**
@@ -545,11 +494,7 @@ resource_size_t nd_blk_available_dpa(struct nd_region *nd_region)
         for_each_dpa_resource(ndd, res) {
                 if (strncmp(res->name, "blk", 3) != 0)
                         continue;
-
-                info.res = res;
-                info.busy = resource_size(res);
-                device_for_each_child(&nvdimm_bus->dev, &info, blk_dpa_busy);
-                info.available -= info.busy;
+                info.available -= resource_size(res);
         }
 
         return info.available;
...
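With blk_dpa_busy() gone, the accounting reduces to: the size of the DIMM's BLK mapping, minus everything at or below the end of the highest aliased pmem allocation, minus the blk allocations that already exist. A toy restatement of that arithmetic (types and layout invented; it assumes the pmem ranges sit inside the mapping and are sorted by start, where the driver instead loops with a retry):

    #include <stdint.h>
    #include <stdio.h>

    struct range { uint64_t start, end; };  /* inclusive end, like struct resource */

    static uint64_t blk_available(struct range map, const struct range *pmem,
            int npmem, const uint64_t *blk_sizes, int nblk)
    {
        uint64_t blk_start = map.start, avail;
        int i;

        /* advance past aliased pmem allocations (pmem grows up, blk down) */
        for (i = 0; i < npmem; i++)
            if (pmem[i].end + 1 > blk_start)
                blk_start = pmem[i].end + 1;

        avail = map.end - map.start + 1;
        avail -= blk_start - map.start;  /* space aliased by pmem */
        for (i = 0; i < nblk; i++)
            avail -= blk_sizes[i];       /* space already allocated to blk */
        return avail;
    }

    int main(void)
    {
        struct range map = { 0, (1ULL << 30) - 1 };          /* 1 GiB mapping */
        struct range pmem[] = { { 0, (256ULL << 20) - 1 } }; /* 256 MiB pmem  */
        uint64_t blk[] = { 128ULL << 20 };                   /* 128 MiB blk   */

        printf("%llu MiB available for blk\n", (unsigned long long)
                (blk_available(map, pmem, 1, blk, 1) >> 20)); /* 640 MiB */
        return 0;
    }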