Commit f768b35a authored by Linus Torvalds

Merge tag 'xfs-6.3-fixes-4' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

Pull xfs percpu counter fixes from Darrick Wong:
 "We discovered a filesystem summary counter corruption problem that was
  traced to cpu hot-remove racing with the call to percpu_counter_sum
  that sets the free block count in the superblock when writing it to
  disk. The root cause is that percpu_counter_sum doesn't cull from
  dying cpus and hence misses those counter values if the cpu shutdown
  hooks have not yet run to merge the values.

  I'm hoping this is a fairly painless fix to the problem, since the
  dying cpu mask should generally be empty. It's been in for-next for a
  week without any complaints from the bots.

   - Fix a race in the percpu counters summation code where the
     summation failed to add in the values for any CPUs that were dying
     but not yet dead. This fixes some minor discrepancies and incorrect
     assertions when running generic/650"

* tag 'xfs-6.3-fixes-4' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux:
  pcpcntr: remove percpu_counter_sum_all()
  fork: remove use of percpu_counter_sum_all
  pcpcntrs: fix dying cpu summation race
  cpumask: introduce for_each_cpu_or
parents d7044263 e9b60c7f
......@@ -350,6 +350,23 @@ unsigned int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int sta
#define for_each_cpu_andnot(cpu, mask1, mask2) \
for_each_andnot_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits)
/**
 * for_each_cpu_or - iterate over every cpu present in either mask
 * @cpu: the (optionally unsigned) integer iterator
 * @mask1: the first cpumask pointer
 * @mask2: the second cpumask pointer
 *
 * This saves a temporary CPU mask in many places. It is equivalent to:
 *	struct cpumask tmp;
 *	cpumask_or(&tmp, mask1, mask2);
 *	for_each_cpu(cpu, &tmp)
 *		...
 *
 * Expands to for_each_or_bit() over the bitmaps returned by cpumask_bits()
 * for each mask, bounded by small_cpumask_bits.
 *
 * After the loop, cpu is >= nr_cpu_ids.
 */
#define for_each_cpu_or(cpu, mask1, mask2) \
for_each_or_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits)
/**
* cpumask_any_but - return a "random" in a cpumask, but not this one.
* @mask: the cpumask to search
......
......@@ -14,6 +14,8 @@ unsigned long _find_next_and_bit(const unsigned long *addr1, const unsigned long
unsigned long nbits, unsigned long start);
unsigned long _find_next_andnot_bit(const unsigned long *addr1, const unsigned long *addr2,
unsigned long nbits, unsigned long start);
unsigned long _find_next_or_bit(const unsigned long *addr1, const unsigned long *addr2,
unsigned long nbits, unsigned long start);
unsigned long _find_next_zero_bit(const unsigned long *addr, unsigned long nbits,
unsigned long start);
extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size);
......@@ -127,6 +129,36 @@ unsigned long find_next_andnot_bit(const unsigned long *addr1,
}
#endif
#ifndef find_next_or_bit
/**
 * find_next_or_bit - find the next bit that is set in either of two bitmaps
 * @addr1: The first address to base the search on
 * @addr2: The second address to base the search on
 * @size: The bitmap size in bits
 * @offset: The bitnumber to start searching at
 *
 * Returns the number of the first bit at or after @offset that is set in
 * @addr1 or in @addr2.
 * If no such bit is set, returns @size.
 */
static inline
unsigned long find_next_or_bit(const unsigned long *addr1,
		const unsigned long *addr2, unsigned long size,
		unsigned long offset)
{
	unsigned long merged;

	/* General case: hand off to the out-of-line implementation. */
	if (!small_const_nbits(size))
		return _find_next_or_bit(addr1, addr2, size, offset);

	/* Nothing left to search once the start offset is past the end. */
	if (unlikely(offset >= size))
		return size;

	/* Single-word case: OR the words and keep only bits [offset, size). */
	merged = (*addr1 | *addr2) & GENMASK(size - 1, offset);
	if (!merged)
		return size;

	return __ffs(merged);
}
#endif
#ifndef find_next_zero_bit
/**
* find_next_zero_bit - find the next cleared bit in a memory region
......@@ -536,6 +568,11 @@ unsigned long find_next_bit_le(const void *addr, unsigned
(bit) = find_next_andnot_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\
(bit)++)
/*
 * Iterate @bit over every bit number below @size that is set in @addr1 or in
 * @addr2, starting the search at bit 0. Each step resumes the search with
 * find_next_or_bit() from the bit after the previous hit.
 */
#define for_each_or_bit(bit, addr1, addr2, size) \
for ((bit) = 0; \
(bit) = find_next_or_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\
(bit)++)
/*
 * Same as for_each_set_bit(), but there is no initialisation of @bit: the
 * search starts from whatever value @bit holds on entry.
 */
#define for_each_set_bit_from(bit, addr, size) \
for (; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++)
......
......@@ -45,7 +45,6 @@ void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount,
s32 batch);
s64 __percpu_counter_sum(struct percpu_counter *fbc);
s64 percpu_counter_sum_all(struct percpu_counter *fbc);
int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch);
void percpu_counter_sync(struct percpu_counter *fbc);
......@@ -196,11 +195,6 @@ static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
return percpu_counter_read(fbc);
}
/*
 * Inline variant of percpu_counter_sum_all(): it only forwards to
 * percpu_counter_read() and does no per-cpu iteration of its own.
 * NOTE(review): presumably the non-SMP stub; confirm against the enclosing
 * #ifdef, which is not visible in this hunk.
 */
static inline s64 percpu_counter_sum_all(struct percpu_counter *fbc)
{
return percpu_counter_read(fbc);
}
static inline bool percpu_counter_initialized(struct percpu_counter *fbc)
{
return true;
......
......@@ -755,11 +755,6 @@ static void check_mm(struct mm_struct *mm)
for (i = 0; i < NR_MM_COUNTERS; i++) {
long x = percpu_counter_sum(&mm->rss_stat[i]);
if (likely(!x))
continue;
/* Making sure this is not due to race with CPU offlining. */
x = percpu_counter_sum_all(&mm->rss_stat[i]);
if (unlikely(x))
pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
mm, resident_page_types[i], x);
......
......@@ -182,6 +182,15 @@ unsigned long _find_next_andnot_bit(const unsigned long *addr1, const unsigned l
EXPORT_SYMBOL(_find_next_andnot_bit);
#endif
#ifndef find_next_or_bit
/*
 * Out-of-line implementation that find_next_or_bit() falls back to when the
 * size is not a small compile-time constant. Each word examined is the
 * bitwise OR of the corresponding words of @addr1 and @addr2; the second
 * FIND_NEXT_BIT argument is intentionally left empty. @nbits and @start are
 * passed through unchanged.
 * NOTE(review): the return convention comes from FIND_NEXT_BIT, which is not
 * visible here; confirm it matches the "returns size when nothing is found"
 * contract documented for find_next_or_bit().
 */
unsigned long _find_next_or_bit(const unsigned long *addr1, const unsigned long *addr2,
unsigned long nbits, unsigned long start)
{
return FIND_NEXT_BIT(addr1[idx] | addr2[idx], /* nop */, nbits, start);
}
EXPORT_SYMBOL(_find_next_or_bit);
#endif
#ifndef find_next_zero_bit
unsigned long _find_next_zero_bit(const unsigned long *addr, unsigned long nbits,
unsigned long start)
......
......@@ -122,8 +122,19 @@ void percpu_counter_sync(struct percpu_counter *fbc)
}
EXPORT_SYMBOL(percpu_counter_sync);
static s64 __percpu_counter_sum_mask(struct percpu_counter *fbc,
const struct cpumask *cpu_mask)
/*
* Add up all the per-cpu counts, return the result. This is a more accurate
* but much slower version of percpu_counter_read_positive().
*
* We use the cpu mask of (cpu_online_mask | cpu_dying_mask) to capture sums
* from CPUs that are in the process of being taken offline. Dying cpus have
* been removed from the online mask, but may not have had the hotplug dead
* notifier called to fold the percpu count back into the global counter sum.
* By including dying CPUs in the iteration mask, we avoid this race condition
* so __percpu_counter_sum() just does the right thing when CPUs are being taken
* offline.
*/
s64 __percpu_counter_sum(struct percpu_counter *fbc)
{
s64 ret;
int cpu;
......@@ -131,35 +142,15 @@ static s64 __percpu_counter_sum_mask(struct percpu_counter *fbc,
raw_spin_lock_irqsave(&fbc->lock, flags);
ret = fbc->count;
for_each_cpu(cpu, cpu_mask) {
for_each_cpu_or(cpu, cpu_online_mask, cpu_dying_mask) {
s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
ret += *pcount;
}
raw_spin_unlock_irqrestore(&fbc->lock, flags);
return ret;
}
/*
 * Add up all the per-cpu counts, return the result. This is a more accurate
 * but much slower version of percpu_counter_read_positive().
 *
 * This wrapper sums over cpu_online_mask only, by delegating to
 * __percpu_counter_sum_mask().
 */
s64 __percpu_counter_sum(struct percpu_counter *fbc)
{
return __percpu_counter_sum_mask(fbc, cpu_online_mask);
}
EXPORT_SYMBOL(__percpu_counter_sum);
/*
 * This is a slower version of percpu_counter_sum() as it traverses all
 * possible CPUs (cpu_possible_mask). Use this only in cases where accurate
 * data is needed in the presence of CPUs getting offlined.
 */
s64 percpu_counter_sum_all(struct percpu_counter *fbc)
{
return __percpu_counter_sum_mask(fbc, cpu_possible_mask);
}
EXPORT_SYMBOL(percpu_counter_sum_all);
int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp,
struct lock_class_key *key)
{
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment