Commit 638c62ac authored by Marko Mäkelä

MDEV-34983: Remove x86 asm from InnoDB

Starting with GCC 7 and clang 15, single-bit operations such as
fetch_or(1) & 1 are translated into 80386 instructions such as
LOCK BTS, instead of using the generic translation pattern
of emitting a loop around LOCK CMPXCHG.
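
For illustration, the pattern in question boils down to something like the
following stand-alone sketch (not code from this commit):

  #include <atomic>

  std::atomic<unsigned> m;

  bool test_and_set_bit_0()
  {
    /* Per the paragraph above: GCC 7+ and clang 15+ compile this to
    LOCK BTS on IA-32/AMD64, while older compilers emit a loop around
    LOCK CMPXCHG. */
    return m.fetch_or(1) & 1;
  }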

Given that the oldest currently supported GNU/Linux distributions
ship GCC 7, and that older versions of GCC are out of support,
let us remove some work-arounds that are not strictly necessary.
If someone compiles the code using an older compiler, it will work
but possibly less efficiently.

srw_mutex_impl::HOLDER: Changed from 1U<<31 to 1 in order to
work around https://github.com/llvm/llvm-project/issues/37322
which is specific to setting the most significant bit.

srw_mutex_impl::WAITER: The weight that each waiting request adds to
the lock word. This used to be 1, which would now collide with HOLDER.
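
As a small illustration of the new encoding (a sketch, not code from the
commit; the real constants live in srw_mutex_impl):

  #include <cstdint>

  constexpr uint32_t HOLDER= 1; // bit 0: the lock is being held
  constexpr uint32_t WAITER= 2; // weight of the holder and of each waiter

  constexpr bool is_held(uint32_t lk) { return lk & HOLDER; }
  // number of threads currently holding or waiting for the lock
  constexpr uint32_t holder_and_waiters(uint32_t lk) { return lk / WAITER; }

  static_assert(is_held(HOLDER + WAITER), "held, no waiters");
  static_assert(holder_and_waiters(HOLDER + WAITER + 3 * WAITER) == 4,
                "holder plus three waiters");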

fil_space_t::set_stopping(): Remove this unused function.

With MSVC, we still need the _interlockedbittestandset() intrinsic to
generate LOCK BTS.
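
A minimal sketch of that MSVC intrinsic (illustrative only, not code from
this commit):

  #include <intrin.h>

  volatile long flags;

  bool set_bit_0()
  {
    /* On IA-32/AMD64 this emits LOCK BTS and returns the previous value
    of bit 0; as noted above, MSVC needs the intrinsic because it does
    not translate std::atomic fetch_or() into LOCK BTS. */
    return _interlockedbittestandset(&flags, 0) != 0;
  }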
parent 71649b93
@@ -528,9 +528,6 @@ struct fil_space_t final
   /** Close each file. Only invoked on fil_system.temp_space. */
   void close();
-  /** Note that operations on the tablespace must stop. */
-  inline void set_stopping();
   /** Drop the tablespace and wait for any pending operations to cease
   @param id tablespace identifier
   @param detached_handle pointer to file to be closed later, or nullptr
@@ -589,32 +586,14 @@ struct fil_space_t final
   /** Clear the NEEDS_FSYNC flag */
   void clear_flush()
   {
-#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
-    static_assert(NEEDS_FSYNC == 1U << 28, "compatibility");
-    __asm__ __volatile__("lock btrl $28, %0" : "+m" (n_pending));
-#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
-    static_assert(NEEDS_FSYNC == 1U << 28, "compatibility");
-    _interlockedbittestandreset(reinterpret_cast<volatile long*>
-                                (&n_pending), 28);
-#else
     n_pending.fetch_and(~NEEDS_FSYNC, std::memory_order_release);
-#endif
   }
 private:
   /** Clear the CLOSING flag */
   void clear_closing()
   {
-#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
-    static_assert(CLOSING == 1U << 29, "compatibility");
-    __asm__ __volatile__("lock btrl $29, %0" : "+m" (n_pending));
-#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
-    static_assert(CLOSING == 1U << 29, "compatibility");
-    _interlockedbittestandreset(reinterpret_cast<volatile long*>
-                                (&n_pending), 29);
-#else
     n_pending.fetch_and(~CLOSING, std::memory_order_relaxed);
-#endif
   }
   /** @return pending operations (and flags) */
@@ -1605,21 +1584,6 @@ inline void fil_space_t::reacquire()
 #endif /* SAFE_MUTEX */
 }
-/** Note that operations on the tablespace must stop. */
-inline void fil_space_t::set_stopping()
-{
-  mysql_mutex_assert_owner(&fil_system.mutex);
-#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
-  static_assert(STOPPING_WRITES == 1U << 30, "compatibility");
-  __asm__ __volatile__("lock btsl $30, %0" : "+m" (n_pending));
-#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
-  static_assert(STOPPING_WRITES == 1U << 30, "compatibility");
-  _interlockedbittestandset(reinterpret_cast<volatile long*>(&n_pending), 30);
-#else
-  n_pending.fetch_or(STOPPING_WRITES, std::memory_order_relaxed);
-#endif
-}
 /** Flush pending writes from the file system cache to the file. */
 template<bool have_reference> inline void fil_space_t::flush()
 {
......
@@ -39,15 +39,7 @@ class rw_lock
   /** Start waiting for an exclusive lock. */
   void write_lock_wait_start()
   {
-#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
-    static_assert(WRITER_WAITING == 1U << 30, "compatibility");
-    __asm__ __volatile__("lock btsl $30, %0" : "+m" (lock));
-#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
-    static_assert(WRITER_WAITING == 1U << 30, "compatibility");
-    _interlockedbittestandset(reinterpret_cast<volatile long*>(&lock), 30);
-#else
     lock.fetch_or(WRITER_WAITING, std::memory_order_relaxed);
-#endif
   }
   /** Start waiting for an exclusive lock.
   @return current value of the lock word */
......
@@ -92,11 +92,13 @@ template<bool spinloop>
 class srw_mutex_impl final
 {
   friend ssux_lock_impl<spinloop>;
-  /** The lock word, containing HOLDER + 1 if the lock is being held,
-  plus the number of waiters */
+  /** The lock word, containing HOLDER + WAITER if the lock is being held,
+  plus WAITER times the number of waiters */
   std::atomic<uint32_t> lock;
   /** Identifies that the lock is being held */
-  static constexpr uint32_t HOLDER= 1U << 31;
+  static constexpr uint32_t HOLDER= 1;
+  /** Identifies a lock waiter */
+  static constexpr uint32_t WAITER= 2;
 #ifdef SUX_LOCK_GENERIC
 public:
@@ -144,7 +146,7 @@ class srw_mutex_impl final
   bool wr_lock_try()
   {
     uint32_t lk= 0;
-    return lock.compare_exchange_strong(lk, HOLDER + 1,
+    return lock.compare_exchange_strong(lk, HOLDER + WAITER,
                                         std::memory_order_acquire,
                                         std::memory_order_relaxed);
   }
@@ -152,8 +154,9 @@ class srw_mutex_impl final
   void wr_lock() { if (!wr_lock_try()) wait_and_lock(); }
   void wr_unlock()
   {
-    const uint32_t lk= lock.fetch_sub(HOLDER + 1, std::memory_order_release);
-    if (lk != HOLDER + 1)
+    const uint32_t lk=
+      lock.fetch_sub(HOLDER + WAITER, std::memory_order_release);
+    if (lk != HOLDER + WAITER)
     {
       DBUG_ASSERT(lk & HOLDER);
       wake();
@@ -269,10 +272,14 @@ class ssux_lock_impl
   {
     writer.wr_lock();
 #if defined __i386__||defined __x86_64__||defined _M_IX86||defined _M_X64
-    /* On IA-32 and AMD64, this type of fetch_or() can only be implemented
-    as a loop around LOCK CMPXCHG. In this particular case, setting the
-    most significant bit using fetch_add() is equivalent, and is
-    translated into a simple LOCK XADD. */
+    /* On IA-32 and AMD64, a fetch_XXX() that needs to return the
+    previous value of the word state can only be implemented
+    efficiently for fetch_add() or fetch_sub(), both of which
+    translate into a 80486 LOCK XADD instruction. Anything else would
+    translate into a loop around LOCK CMPXCHG. In this particular
+    case, we know that the bit was previously clear, and therefore
+    setting (actually toggling) the most significant bit using
+    fetch_add() or fetch_sub() is equivalent. */
     static_assert(WRITER == 1U << 31, "compatibility");
     if (uint32_t lk= readers.fetch_add(WRITER, std::memory_order_acquire))
       wr_wait(lk);
......
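
To make the comment in the hunk above concrete, the fetch_add() trick can be
sketched in isolation as follows (illustrative only, not code from this
commit; the names mirror ssux_lock_impl):

  #include <atomic>
  #include <cstdint>

  constexpr uint32_t WRITER= 1U << 31;
  std::atomic<uint32_t> readers;

  uint32_t set_writer_bit()
  {
    /* Precondition: the WRITER bit is clear, because the caller already
    holds the writer mutex.  Adding 1U << 31 then sets exactly that bit
    (no carry can propagate out of a clear bit 31), so fetch_add() returns
    the same previous value as fetch_or(WRITER) would, but compiles to a
    single LOCK XADD instead of a LOCK CMPXCHG loop on IA-32/AMD64. */
    return readers.fetch_add(WRITER, std::memory_order_acquire);
  }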
@@ -85,26 +85,12 @@ struct alignas(CPU_LEVEL1_DCACHE_LINESIZE) trx_rseg_t
   /** Set the SKIP bit */
   void ref_set_skip()
   {
-    static_assert(SKIP == 1U, "compatibility");
-#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
-    __asm__ __volatile__("lock btsl $0, %0" : "+m" (ref));
-#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
-    _interlockedbittestandset(reinterpret_cast<volatile long*>(&ref), 0);
-#else
     ref.fetch_or(SKIP, std::memory_order_relaxed);
-#endif
   }
   /** Clear a bit in ref */
   void ref_reset_skip()
   {
-    static_assert(SKIP == 1U, "compatibility");
-#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
-    __asm__ __volatile__("lock btrl $0, %0" : "+m" (ref));
-#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
-    _interlockedbittestandreset(reinterpret_cast<volatile long*>(&ref), 0);
-#else
     ref.fetch_and(~SKIP, std::memory_order_relaxed);
-#endif
   }
 public:
......
@@ -345,15 +345,7 @@ struct trx_lock_t
   /** Flag the lock owner as a victim in Galera conflict resolution. */
   void set_wsrep_victim()
   {
-# if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
-    /* There is no 8-bit version of the 80386 BTS instruction.
-    Technically, this is the wrong addressing mode (16-bit), but
-    there are other data members stored after the byte. */
-    __asm__ __volatile__("lock btsw $1, %0"
-                         : "+m" (was_chosen_as_deadlock_victim));
-# else
     was_chosen_as_deadlock_victim.fetch_or(2);
-# endif
   }
 #else /* defined(UNIV_DEBUG) || !defined(DBUG_OFF) */
@@ -1038,15 +1030,7 @@ struct trx_t : ilist_node<>
   void reset_skip_lock_inheritance()
   {
-#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
-    __asm__("lock btrl $31, %0" : : "m"(skip_lock_inheritance_and_n_ref));
-#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
-    _interlockedbittestandreset(
-        reinterpret_cast<volatile long *>(&skip_lock_inheritance_and_n_ref),
-        31);
-#else
     skip_lock_inheritance_and_n_ref.fetch_and(~1U << 31);
-#endif
   }
   /** @return whether the table has lock on
......
@@ -269,44 +269,10 @@ template void ssux_lock_impl<false>::wake();
 template void srw_mutex_impl<true>::wake();
 template void ssux_lock_impl<true>::wake();
-/*
-Unfortunately, compilers targeting IA-32 or AMD64 currently cannot
-translate the following single-bit operations into Intel 80386 instructions:
-  m.fetch_or(1<<b) & 1<<b        LOCK BTS b, m
-  m.fetch_and(~(1<<b)) & 1<<b    LOCK BTR b, m
-  m.fetch_xor(1<<b) & 1<<b       LOCK BTC b, m
-Hence, we will manually translate fetch_or() using GCC-style inline
-assembler code or a Microsoft intrinsic function.
-*/
-#if defined __clang_major__ && __clang_major__ < 10
-/* Only clang-10 introduced support for asm goto */
-#elif defined __APPLE__
-/* At least some versions of Apple Xcode do not support asm goto */
-#elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
-# define IF_FETCH_OR_GOTO(mem, bit, label) \
-  __asm__ goto("lock btsl $" #bit ", %0\n\t" \
-               "jc %l1" : : "m" (mem) : "cc", "memory" : label);
-# define IF_NOT_FETCH_OR_GOTO(mem, bit, label) \
-  __asm__ goto("lock btsl $" #bit ", %0\n\t" \
-               "jnc %l1" : : "m" (mem) : "cc", "memory" : label);
-#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
-# define IF_FETCH_OR_GOTO(mem, bit, label) \
-  if (_interlockedbittestandset(reinterpret_cast<volatile long*>(&mem), bit)) \
-    goto label;
-# define IF_NOT_FETCH_OR_GOTO(mem, bit, label) \
-  if (!_interlockedbittestandset(reinterpret_cast<volatile long*>(&mem), bit))\
-    goto label;
-#endif
 template<bool spinloop>
 void srw_mutex_impl<spinloop>::wait_and_lock()
 {
-  uint32_t lk= 1 + lock.fetch_add(1, std::memory_order_relaxed);
+  uint32_t lk= WAITER + lock.fetch_add(WAITER, std::memory_order_relaxed);
   if (spinloop)
   {
@@ -318,10 +284,16 @@ void srw_mutex_impl<spinloop>::wait_and_lock()
       lk= lock.load(std::memory_order_relaxed);
       if (!(lk & HOLDER))
       {
-#ifdef IF_NOT_FETCH_OR_GOTO
-        static_assert(HOLDER == (1U << 31), "compatibility");
-        IF_NOT_FETCH_OR_GOTO(*this, 31, acquired);
-        lk|= HOLDER;
+#if defined __i386__||defined __x86_64__||defined _M_IX86||defined _M_X64
+        lk|= HOLDER;
+# ifdef _MSC_VER
+        static_assert(HOLDER == (1U << 0), "compatibility");
+        if (!_interlockedbittestandset
+            (reinterpret_cast<volatile long*>(&lock), 0))
+# else
+        if (!(lock.fetch_or(HOLDER, std::memory_order_relaxed) & HOLDER))
+# endif
+          goto acquired;
 #else
         if (!((lk= lock.fetch_or(HOLDER, std::memory_order_relaxed)) & HOLDER))
           goto acquired;
@@ -339,16 +311,22 @@ void srw_mutex_impl<spinloop>::wait_and_lock()
     if (lk & HOLDER)
     {
       wait(lk);
-#ifdef IF_FETCH_OR_GOTO
+#if defined __i386__||defined __x86_64__||defined _M_IX86||defined _M_X64
 reload:
 #endif
       lk= lock.load(std::memory_order_relaxed);
     }
     else
     {
-#ifdef IF_FETCH_OR_GOTO
-      static_assert(HOLDER == (1U << 31), "compatibility");
-      IF_FETCH_OR_GOTO(*this, 31, reload);
+#if defined __i386__||defined __x86_64__||defined _M_IX86||defined _M_X64
+# ifdef _MSC_VER
+      static_assert(HOLDER == (1U << 0), "compatibility");
+      if (_interlockedbittestandset
+          (reinterpret_cast<volatile long*>(&lock), 0))
+# else
+      if (lock.fetch_or(HOLDER, std::memory_order_relaxed) & HOLDER)
+# endif
+        goto reload;
 #else
       if ((lk= lock.fetch_or(HOLDER, std::memory_order_relaxed)) & HOLDER)
         continue;
@@ -416,7 +394,8 @@ void ssux_lock_impl<spinloop>::rd_wait()
   /* Subscribe to writer.wake() or write.wake_all() calls by
   concurrently executing rd_wait() or writer.wr_unlock(). */
-  uint32_t wl= 1 + writer.lock.fetch_add(1, std::memory_order_acquire);
+  uint32_t wl= writer.WAITER +
+    writer.lock.fetch_add(writer.WAITER, std::memory_order_acquire);
   for (;;)
   {
@@ -440,13 +419,13 @@ void ssux_lock_impl<spinloop>::rd_wait()
   }
   /* Unsubscribe writer.wake() and writer.wake_all(). */
-  wl= writer.lock.fetch_sub(1, std::memory_order_release);
+  wl= writer.lock.fetch_sub(writer.WAITER, std::memory_order_release);
   ut_ad(wl);
   /* Wake any other threads that may be blocked in writer.wait().
   All other waiters than this rd_wait() would end up acquiring writer.lock
   and waking up other threads on unlock(). */
-  if (wl > 1)
+  if (wl > writer.WAITER)
     writer.wake_all();
 }
......