Commit 335ebe30, authored by Ard Biesheuvel, committed by Catalin Marinas

lib/raid6: arm: optimize away a mask operation in NEON recovery routine

The NEON recovery code was modeled after the x86 SIMD code, and for
some reason, that code uses a 16-bit-wide signed shift and a mask to
perform what amounts to an 8-bit unsigned shift. So fold the ops
together.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
parent 1ad3935b
...@@ -56,14 +56,14 @@ void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp, ...@@ -56,14 +56,14 @@ void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp,
px = veorq_u8(vld1q_u8(p), vld1q_u8(dp)); px = veorq_u8(vld1q_u8(p), vld1q_u8(dp));
vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq)); vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq));
vy = (uint8x16_t)vshrq_n_s16((int16x8_t)vx, 4); vy = vshrq_n_u8(vx, 4);
vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f)); vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f));
vy = vqtbl1q_u8(qm1, vandq_u8(vy, x0f)); vy = vqtbl1q_u8(qm1, vy);
qx = veorq_u8(vx, vy); qx = veorq_u8(vx, vy);
vy = (uint8x16_t)vshrq_n_s16((int16x8_t)px, 4); vy = vshrq_n_u8(px, 4);
vx = vqtbl1q_u8(pm0, vandq_u8(px, x0f)); vx = vqtbl1q_u8(pm0, vandq_u8(px, x0f));
vy = vqtbl1q_u8(pm1, vandq_u8(vy, x0f)); vy = vqtbl1q_u8(pm1, vy);
vx = veorq_u8(vx, vy); vx = veorq_u8(vx, vy);
db = veorq_u8(vx, qx); db = veorq_u8(vx, qx);
...@@ -97,9 +97,9 @@ void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq, ...@@ -97,9 +97,9 @@ void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq,
vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq)); vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq));
vy = (uint8x16_t)vshrq_n_s16((int16x8_t)vx, 4); vy = vshrq_n_u8(vx, 4);
vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f)); vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f));
vy = vqtbl1q_u8(qm1, vandq_u8(vy, x0f)); vy = vqtbl1q_u8(qm1, vy);
vx = veorq_u8(vx, vy); vx = veorq_u8(vx, vy);
vy = veorq_u8(vx, vld1q_u8(p)); vy = veorq_u8(vx, vld1q_u8(p));
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment