From 58cef2ea14e2ae91af3c0910029570beff88e687 Mon Sep 17 00:00:00 2001 From: David Mosberger <davidm@wailua.hpl.hp.com> Date: Tue, 2 Apr 2002 03:14:43 -0800 Subject: [PATCH] arch/ia64/lib/copy_page_mck.S: Tweak for better performance when data is in L2 or L3 cache. --- arch/ia64/lib/copy_page_mck.S | 53 ++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/arch/ia64/lib/copy_page_mck.S b/arch/ia64/lib/copy_page_mck.S index 49b381c98306..59d293314625 100644 --- a/arch/ia64/lib/copy_page_mck.S +++ b/arch/ia64/lib/copy_page_mck.S @@ -17,28 +17,28 @@ * cycle * * Principle of operation: + * First, note that L1 has a line-size of 64 bytes and L2 a line-size of 128 bytes. + * To avoid secondary misses in L2, we prefetch both source and destination with a line-size + * of 128 bytes. When both of these lines are in the L2 and the first half of the + * source line is in L1, we start copying the remaining words. The second half of the + * source line is prefetched in an earlier iteration, so that by the time we start + * accessing it, it's also present in the L1. + * * We use a software-pipelined loop to control the overall operation. The pipeline - * has 2*PREFETCH_DIST+2 stages. The first PREFETCH_DIST stages are used for prefetching + * has 2*PREFETCH_DIST+K stages. The first PREFETCH_DIST stages are used for prefetching * source cache-lines. The second PREFETCH_DIST stages are used for prefetching destination - * cache-lines, the two last stages are used to copy the cache-line words not copied by + * cache-lines, the last K stages are used to copy the cache-line words not copied by * the prefetches. The four relevant points in the pipelined are called A, B, C, D: * p[A] is TRUE if a source-line should be prefetched, p[B] is TRUE if a destination-line - * should be prefetched, p[C] is TRUE if at least one more cacheline needs to be copied, - * and p[D] is TRUE if a cachline needs to be copied. - * - * Note that L1 has a line-size of 64 bytes and L2 a line-size of 128 bytes. To avoid - * secondary misses in L2, we prefetch both source and destination with a line-size - * of 128 bytes. When both of these lines are in the L2 and the first half of the - * source line is in L1, we start copying the remaining words. The second half of the - * source line is prefetched in the previous iteration, so that by the time we start - * accessing it, it's also present in the L1. + * should be prefetched, p[C] is TRUE if the second half of an L2 line should be brought + * into L1D and p[D] is TRUE if a cacheline needs to be copied. * * This all sounds very complicated, but thanks to the modulo-scheduled loop support, * the resulting code is very regular and quite easy to follow (once you get the idea). * * As a secondary optimization, the first 2*PREFETCH_DIST iterations are implemented * as the separate .prefetch_loop. Logically, this loop performs exactly like the - * main-loop (.line_copy), but has all know-to-be-predicated-off instructions removed, + * main-loop (.line_copy), but has all known-to-be-predicated-off instructions removed, * so that each loop iteration is faster (again, good for cached case). * * When reading the code, it helps to keep the following picture in mind: @@ -49,13 +49,13 @@ * | t2 | t3 | | * | t4 | t5 | | * | t6 | t7 | | 128 bytes - * | n8 | t9 | | (L2 cache line) + * | n[y] | t9 | | (L2 cache line) * | t10 | t11 | | * | t12 | t13 | | * | t14 | t15 | v * +------+------+--- * - * Here, v[x] is copied by the (memory) prefetch. n8 is loaded in the previous iteration + * Here, v[x] is copied by the (memory) prefetch. n[y] is loaded at p[C] * to fetch the second-half of the L2 cache line into L1, and the tX words are copied in * an order that avoids bank conflicts. */ @@ -79,22 +79,21 @@ #define t5 t1 // alias! #define t6 t2 // alias! #define t7 t3 // alias! -#define n8 r21 #define t9 t5 // alias! #define t10 t4 // alias! #define t11 t7 // alias! #define t12 t6 // alias! #define t14 t10 // alias! -#define t13 r22 -#define t15 r23 +#define t13 r21 +#define t15 r22 -#define saved_lc r24 -#define saved_pr r25 +#define saved_lc r23 +#define saved_pr r24 #define A 0 #define B (PREFETCH_DIST) #define C (B + PREFETCH_DIST) -#define D (C + 1) +#define D (C + 3) #define N (D + 1) #define Nrot ((N + 7) & ~7) @@ -102,7 +101,7 @@ GLOBAL_ENTRY(copy_page) .prologue alloc r8 = ar.pfs, 2, Nrot-2, 0, Nrot - .rotr v[2*PREFETCH_DIST] + .rotr v[2*PREFETCH_DIST], n[D-C+1] .rotp p[N] .save ar.lc, saved_lc @@ -124,6 +123,9 @@ GLOBAL_ENTRY(copy_page) add src1 = 3*8, in1 // first t3 src add dst0 = 8, in0 // first t1 dst add dst1 = 3*8, in0 // first t3 dst + nop.m 0 + nop.m 0 + nop.i 0 ;; // same as .line_copy loop, but with all predicated-off instructions removed: .prefetch_loop: @@ -135,15 +137,14 @@ GLOBAL_ENTRY(copy_page) mov ar.lc = (PAGE_SIZE/128) - (2*PREFETCH_DIST) - 1 mov ar.ec = N // # of stages in pipeline ;; - .align 32 .line_copy: (p[D]) ld8 t2 = [src0], 3*8 // M0 (p[D]) ld8 t4 = [src1], 3*8 // M1 (p[B]) st8 [dst_pre_mem] = v[B], 128 // M2 prefetch dst from memory -(p[D]) st8 [dst_pre_l2] = n8, 128 // M3 prefetch dst from L2 +(p[D]) st8 [dst_pre_l2] = n[D-C], 128 // M3 prefetch dst from L2 ;; (p[A]) ld8 v[A] = [src_pre_mem], 128 // M0 prefetch src from memory -(p[C]) ld8 n8 = [src_pre_l2], 128 // M1 prefetch src from L2 +(p[C]) ld8 n[0] = [src_pre_l2], 128 // M1 prefetch src from L2 (p[D]) st8 [dst0] = t1, 8 // M2 (p[D]) st8 [dst1] = t3, 8 // M3 ;; @@ -172,8 +173,8 @@ GLOBAL_ENTRY(copy_page) (p[D]) st8 [dst0] = t12, 8 (p[D]) st8 [dst1] = t14, 8 ;; -(p[C]) ld8 t1 = [src0], 8 -(p[C]) ld8 t3 = [src1], 8 +(p[D-1])ld8 t1 = [src0], 8 +(p[D-1])ld8 t3 = [src1], 8 (p[D]) st8 [dst0] = t13, 4*8 (p[D]) st8 [dst1] = t15, 4*8 br.ctop.sptk .line_copy -- 2.30.9