ARM: 7493/1: use generic unaligned.h (commit d25c881a)
    Author: Rob Herring
    This moves ARM over to the asm-generic/unaligned.h header. This has the
    benefit of better generated code, especially for ARMv7 on gcc 4.7+
    compilers.
    
    As Arnd Bergmann points out: the asm-generic version uses the "struct"
    implementation for native-endian unaligned access and the "byteshift"
    implementation for the opposite endianness. The current ARM version,
    however, uses the "byteshift" implementation for both.
    
    Thanks to Nicolas Pitre for the excellent analysis:
    
    Test case:
    
    #include <asm/unaligned.h>	/* provides get_unaligned() */

    int foo (int *x) { return get_unaligned(x); }
    long long bar (long long *x) { return get_unaligned(x); }
    
    With the current ARM version:
    
    foo:
    	ldrb	r3, [r0, #2]	@ zero_extendqisi2	@ MEM[(const u8 *)x_1(D) + 2B], MEM[(const u8 *)x_1(D) + 2B]
    	ldrb	r1, [r0, #1]	@ zero_extendqisi2	@ MEM[(const u8 *)x_1(D) + 1B], MEM[(const u8 *)x_1(D) + 1B]
    	ldrb	r2, [r0, #0]	@ zero_extendqisi2	@ MEM[(const u8 *)x_1(D)], MEM[(const u8 *)x_1(D)]
    	mov	r3, r3, asl #16	@ tmp154, MEM[(const u8 *)x_1(D) + 2B],
    	ldrb	r0, [r0, #3]	@ zero_extendqisi2	@ MEM[(const u8 *)x_1(D) + 3B], MEM[(const u8 *)x_1(D) + 3B]
    	orr	r3, r3, r1, asl #8	@, tmp155, tmp154, MEM[(const u8 *)x_1(D) + 1B],
    	orr	r3, r3, r2	@ tmp157, tmp155, MEM[(const u8 *)x_1(D)]
    	orr	r0, r3, r0, asl #24	@,, tmp157, MEM[(const u8 *)x_1(D) + 3B],
    	bx	lr	@
    
    bar:
    	stmfd	sp!, {r4, r5, r6, r7}	@,
    	mov	r2, #0	@ tmp184,
    	ldrb	r5, [r0, #6]	@ zero_extendqisi2	@ MEM[(const u8 *)x_1(D) + 6B], MEM[(const u8 *)x_1(D) + 6B]
    	ldrb	r4, [r0, #5]	@ zero_extendqisi2	@ MEM[(const u8 *)x_1(D) + 5B], MEM[(const u8 *)x_1(D) + 5B]
    	ldrb	ip, [r0, #2]	@ zero_extendqisi2	@ MEM[(const u8 *)x_1(D) + 2B], MEM[(const u8 *)x_1(D) + 2B]
    	ldrb	r1, [r0, #4]	@ zero_extendqisi2	@ MEM[(const u8 *)x_1(D) + 4B], MEM[(const u8 *)x_1(D) + 4B]
    	mov	r5, r5, asl #16	@ tmp175, MEM[(const u8 *)x_1(D) + 6B],
    	ldrb	r7, [r0, #1]	@ zero_extendqisi2	@ MEM[(const u8 *)x_1(D) + 1B], MEM[(const u8 *)x_1(D) + 1B]
    	orr	r5, r5, r4, asl #8	@, tmp176, tmp175, MEM[(const u8 *)x_1(D) + 5B],
    	ldrb	r6, [r0, #7]	@ zero_extendqisi2	@ MEM[(const u8 *)x_1(D) + 7B], MEM[(const u8 *)x_1(D) + 7B]
    	orr	r5, r5, r1	@ tmp178, tmp176, MEM[(const u8 *)x_1(D) + 4B]
    	ldrb	r4, [r0, #0]	@ zero_extendqisi2	@ MEM[(const u8 *)x_1(D)], MEM[(const u8 *)x_1(D)]
    	mov	ip, ip, asl #16	@ tmp188, MEM[(const u8 *)x_1(D) + 2B],
    	ldrb	r1, [r0, #3]	@ zero_extendqisi2	@ MEM[(const u8 *)x_1(D) + 3B], MEM[(const u8 *)x_1(D) + 3B]
    	orr	ip, ip, r7, asl #8	@, tmp189, tmp188, MEM[(const u8 *)x_1(D) + 1B],
    	orr	r3, r5, r6, asl #24	@,, tmp178, MEM[(const u8 *)x_1(D) + 7B],
    	orr	ip, ip, r4	@ tmp191, tmp189, MEM[(const u8 *)x_1(D)]
    	orr	ip, ip, r1, asl #24	@, tmp194, tmp191, MEM[(const u8 *)x_1(D) + 3B],
    	mov	r1, r3	@,
    	orr	r0, r2, ip	@ tmp171, tmp184, tmp194
    	ldmfd	sp!, {r4, r5, r6, r7}
    	bx	lr
    
    In both cases the code is slightly suboptimal.  One may wonder, for
    example, why r2 is wasted on the constant 0 in the second case.  And
    all the movs could be folded into subsequent orrs, etc.
    
    Now with the asm-generic version:
    
    foo:
    	ldr	r0, [r0, #0]	@ unaligned	@,* x
    	bx	lr	@
    
    bar:
    	mov	r3, r0	@ x, x
    	ldr	r0, [r0, #0]	@ unaligned	@,* x
    	ldr	r1, [r3, #4]	@ unaligned	@,
    	bx	lr	@
    
    This is way better, of course, but only because this was compiled for
    ARMv7, where the compiler knows that the hardware can do unaligned
    word accesses.  From foo() alone this isn't obvious, but if we remove
    the get_unaligned() from bar as follows:
    
    long long bar (long long *x) { return *x; }
    
    then the resulting code is:
    
    bar:
    	ldmia	r0, {r0, r1}	@ x,,
    	bx	lr	@
    
    So this proves that the presumed aligned vs unaligned case does
    influence the instructions the compiler may use, and that the above
    unaligned code results are not just an accident.
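
    As a minimal illustration of that point (my sketch, not part of the
    original test case), the only difference between these two functions
    is the alignment the compiler is allowed to assume:

    #include <stdint.h>

    struct una_s64 { int64_t v; } __attribute__((packed));

    /* alignment assumed: ldrd/ldmia are fair game */
    int64_t bar_aligned(int64_t *x)
    {
    	return *x;
    }

    /* no alignment assumed: the compiler must restrict itself to
     * accesses that are legal at misaligned addresses on the
     * selected -march */
    int64_t bar_unaligned(const void *x)
    {
    	return ((const struct una_s64 *)x)->v;
    }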
    
    Still... this isn't fully conclusive without at least looking at the
    resulting assembly from a pre-ARMv6 compilation.  Let's see with an
    ARMv5 target:
    
    foo:
    	ldrb	r3, [r0, #0]	@ zero_extendqisi2	@ tmp139,* x
    	ldrb	r1, [r0, #1]	@ zero_extendqisi2	@ tmp140,
    	ldrb	r2, [r0, #2]	@ zero_extendqisi2	@ tmp143,
    	ldrb	r0, [r0, #3]	@ zero_extendqisi2	@ tmp146,
    	orr	r3, r3, r1, asl #8	@, tmp142, tmp139, tmp140,
    	orr	r3, r3, r2, asl #16	@, tmp145, tmp142, tmp143,
    	orr	r0, r3, r0, asl #24	@,, tmp145, tmp146,
    	bx	lr	@
    
    bar:
    	stmfd	sp!, {r4, r5, r6, r7}	@,
    	ldrb	r2, [r0, #0]	@ zero_extendqisi2	@ tmp139,* x
    	ldrb	r7, [r0, #1]	@ zero_extendqisi2	@ tmp140,
    	ldrb	r3, [r0, #4]	@ zero_extendqisi2	@ tmp149,
    	ldrb	r6, [r0, #5]	@ zero_extendqisi2	@ tmp150,
    	ldrb	r5, [r0, #2]	@ zero_extendqisi2	@ tmp143,
    	ldrb	r4, [r0, #6]	@ zero_extendqisi2	@ tmp153,
    	ldrb	r1, [r0, #7]	@ zero_extendqisi2	@ tmp156,
    	ldrb	ip, [r0, #3]	@ zero_extendqisi2	@ tmp146,
    	orr	r2, r2, r7, asl #8	@, tmp142, tmp139, tmp140,
    	orr	r3, r3, r6, asl #8	@, tmp152, tmp149, tmp150,
    	orr	r2, r2, r5, asl #16	@, tmp145, tmp142, tmp143,
    	orr	r3, r3, r4, asl #16	@, tmp155, tmp152, tmp153,
    	orr	r0, r2, ip, asl #24	@,, tmp145, tmp146,
    	orr	r1, r3, r1, asl #24	@,, tmp155, tmp156,
    	ldmfd	sp!, {r4, r5, r6, r7}
    	bx	lr
    
    Compared to the initial results, this is really nicely optimized and I
    couldn't do much better if I were to hand-code it myself.
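
    As a usage aside (an illustrative snippet of mine, not from this
    commit): in-kernel callers just include the header and use the
    accessors, e.g. to read a 32-bit little-endian field at a possibly
    misaligned offset in a byte buffer:

    #include <linux/types.h>
    #include <asm/unaligned.h>

    static u32 read_le32_field(const u8 *buf)
    {
    	/* buf + 4 need not be 4-byte aligned; get_unaligned_le32()
    	 * handles that portably across architectures */
    	return get_unaligned_le32(buf + 4);
    }
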
    Signed-off-by: Rob Herring <rob.herring@calxeda.com>
    Reviewed-by: Nicolas Pitre <nico@linaro.org>
    Tested-by: Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
    Reviewed-by: Arnd Bergmann <arnd@arndb.de>
    Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>