Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
L
linux
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
nexedi
linux
Commits
5b9c44cf
Commit
5b9c44cf
authored
Mar 26, 2002
by
David Mosberger
Browse files
Options
Browse Files
Download
Plain Diff
Merge wailua.hpl.hp.com:/bk/vanilla/linux-2.5
into wailua.hpl.hp.com:/bk/lia64/to-linus-2.5
parents
085c9a18
3c4cefb3
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
188 additions
and
1 deletion
+188
-1
arch/ia64/lib/copy_page_mck.S
arch/ia64/lib/copy_page_mck.S
+184
-0
include/asm-ia64/thread_info.h
include/asm-ia64/thread_info.h
+4
-1
No files found.
arch/ia64/lib/copy_page_mck.S
0 → 100644
View file @
5b9c44cf
/*
*
McKinley
-
optimized
version
of
copy_page
()
.
*
*
Copyright
(
C
)
2002
Hewlett
-
Packard
Co
*
David
Mosberger
<
davidm
@
hpl
.
hp
.
com
>
*
*
Inputs
:
*
in0
:
address
of
target
page
*
in1
:
address
of
source
page
*
Output
:
*
no
return
value
*
*
General
idea
:
*
-
use
regular
loads
and
stores
to
prefetch
data
to
avoid
consuming
M
-
slot
just
for
*
lfetches
=>
good
for
in
-
cache
performance
*
-
avoid
l2
bank
-
conflicts
by
not
storing
into
the
same
16
-
byte
bank
within
a
single
*
cycle
*
*
Principle
of
operation
:
*
We
use
a
software
-
pipelined
loop
to
control
the
overall
operation
.
The
pipeline
*
has
2
*
PREFETCH_DIST
+
2
stages
.
The
first
PREFETCH_DIST
stages
are
used
for
prefetching
*
source
cache
-
lines
.
The
second
PREFETCH_DIST
stages
are
used
for
prefetching
destination
*
cache
-
lines
,
the
two
last
stages
are
used
to
copy
the
cache
-
line
words
not
copied
by
*
the
prefetches
.
The
four
relevant
points
in
the
pipeline
are
called
A
,
B
,
C
,
D
:
*
p
[
A
]
is
TRUE
if
a
source
-
line
should
be
prefetched
,
p
[
B
]
is
TRUE
if
a
destination
-
line
*
should
be
prefetched
,
p
[
C
]
is
TRUE
if
at
least
one
more
cacheline
needs
to
be
copied
,
*
and
p
[
D
]
is
TRUE
if
a
cacheline
needs
to
be
copied
.
*
*
Note
that
L1
has
a
line
-
size
of
64
bytes
and
L2
a
line
-
size
of
128
bytes
.
To
avoid
*
secondary
misses
in
L2
,
we
prefetch
both
source
and
destination
with
a
line
-
size
*
of
128
bytes
.
When
both
of
these
lines
are
in
the
L2
and
the
first
half
of
the
*
source
line
is
in
L1
,
we
start
copying
the
remaining
words
.
The
second
half
of
the
*
source
line
is
prefetched
in
the
previous
iteration
,
so
that
by
the
time
we
start
*
accessing
it
,
it
's also present in the L1.
*
*
This
all
sounds
very
complicated
,
but
thanks
to
the
modulo
-
scheduled
loop
support
,
*
the
resulting
code
is
very
regular
and
quite
easy
to
follow
(
once
you
get
the
idea
)
.
*
*
As
a
secondary
optimization
,
the
first
2
*
PREFETCH_DIST
iterations
are
implemented
*
as
the
separate
.
prefetch_loop
.
Logically
,
this
loop
performs
exactly
like
the
*
main
-
loop
(
.
line_copy
),
but
has
all
known
-
to
-
be
-
predicated
-
off
instructions
removed
,
*
so
that
each
loop
iteration
is
faster
(
again
,
good
for
cached
case
)
.
*
*
When
reading
the
code
,
it
helps
to
keep
the
following
picture
in
mind
:
*
*
bank
0
bank
1
*
+------+------+---
*
| v[x] |
t1
|
^
*
| t2 |
t3
| |
*
| t4 |
t5
| |
*
| t6 |
t7
| |
128
bytes
*
| n8 |
t9
| |
(
L2
cache
line
)
*
| t10 |
t11
| |
*
| t12 |
t13
| |
*
| t14 |
t15
|
v
*
+------+------+---
*
*
Here
,
v
[
x
]
is
copied
by
the
(
memory
)
prefetch
.
n8
is
loaded
in
the
previous
iteration
*
to
fetch
the
second
-
half
of
the
L2
cache
line
into
L1
,
and
the
tX
words
are
copied
in
*
an
order
that
avoids
bank
conflicts
.
*/
#include <asm/asmmacro.h>
#include <asm/page.h>
// Number of 128-byte lines by which each prefetch stream runs ahead.
#define PREFETCH_DIST 8 // McKinley sustains 16 outstanding L2 misses (8 ld, 8 st)
// Pointers used by the word-by-word copy in .line_copy; copy_page starts them
// at offsets 8 and 3*8 into the source (src0/src1) and target (dst0/dst1) page.
#define src0 r2
#define src1 r3
#define dst0 r9
#define dst1 r10
// Prefetch-from-memory pointers; start at the page base and advance by 128.
#define src_pre_mem r11
#define dst_pre_mem r14
// Prefetch-from-L2 pointers; start 8*8 bytes into the page and advance by 128.
#define src_pre_l2 r15
#define dst_pre_l2 r16
// Temporaries for the words of one 128-byte line (see the picture in the file
// header).  Names marked "alias!" reuse the register of an earlier temporary,
// so only r17-r20, r22 and r23 are actually occupied by tX values.
#define t1 r17
#define t2 r18
#define t3 r19
#define t4 r20
#define t5 t1 // alias!
#define t6 t2 // alias!
#define t7 t3 // alias!
// n8 is loaded through src_pre_l2 and stored through dst_pre_l2.
#define n8 r21
#define t9 t5 // alias!
#define t10 t4 // alias!
#define t11 t7 // alias!
#define t12 t6 // alias!
#define t14 t10 // alias!
#define t13 r22
#define t15 r23
// Hold the caller's ar.lc and pr so copy_page can restore them before returning.
#define saved_lc r24
#define saved_pr r25
// Pipeline stage indices into the rotating predicates p[] (points A, B, C, D
// of the file header).  With PREFETCH_DIST == 8: A=0, B=8, C=16, D=17.
#define A 0
#define B (PREFETCH_DIST)
#define C (B + PREFETCH_DIST)
#define D (C + 1)
// N is the number of pipeline stages (loaded into ar.ec for .line_copy).
#define N (D + 1)
// N rounded up to a multiple of 8; used as the rotating-register count in alloc.
#define Nrot ((N + 7) & ~7)
/*
 * copy_page(to, from) -- copy one PAGE_SIZE page, 128 bytes per iteration.
 *
 *	in0:	address of target page
 *	in1:	address of source page
 *	no return value
 *
 * Caller's ar.lc and pr are saved in saved_lc/saved_pr and restored on exit;
 * r8 receives the previous ar.pfs from alloc.  See the file header comment
 * for the meaning of the pipeline points A, B, C and D.
 */
GLOBAL_ENTRY(copy_page)
	.prologue
	alloc r8 = ar.pfs, 2, Nrot-2, 0, Nrot

	.rotr v[2*PREFETCH_DIST]
	.rotp p[N]

	.save ar.lc, saved_lc
	mov saved_lc = ar.lc
	.save pr, saved_pr
	mov saved_pr = pr
	.body

	mov src_pre_mem = in1
	mov pr.rot = 0x10000
	mov ar.ec = 1				// special unrolled loop

	mov dst_pre_mem = in0
	mov ar.lc = 2*PREFETCH_DIST - 1

	add src_pre_l2 = 8*8, in1
	add dst_pre_l2 = 8*8, in0
	add src0 = 8, in1			// first t1 src
	add src1 = 3*8, in1			// first t3 src
	add dst0 = 8, in0			// first t1 dst
	add dst1 = 3*8, in0			// first t3 dst
	// Main-loop trip count.  It is staged in a general register because the
	// immediate form of "mov ar.lc = imm" only takes an 8-bit immediate, and
	// with 64KB pages the count (495) does not fit.  t1 is free here: it is
	// not written until the first (p[C]) load at the end of .line_copy.
	mov t1 = (PAGE_SIZE/128) - (2*PREFETCH_DIST) - 1
	;;
	// same as .line_copy loop, but with all predicated-off instructions removed:
.prefetch_loop:
(p[A])	ld8 v[A] = [src_pre_mem], 128		// M0
(p[B])	st8 [dst_pre_mem] = v[B], 128		// M2
	br.ctop.sptk .prefetch_loop
	;;
	cmp.eq p16, p0 = r0, r0			// reset p16 to 1 (br.ctop cleared it to zero)
	mov ar.lc = t1				// register form: count may exceed 8 bits
	mov ar.ec = N				// # of stages in pipeline
	;;
	.align 32
.line_copy:
(p[D])	ld8 t2 = [src0], 3*8			// M0
(p[D])	ld8 t4 = [src1], 3*8			// M1
(p[B])	st8 [dst_pre_mem] = v[B], 128		// M2 prefetch dst from memory
(p[D])	st8 [dst_pre_l2] = n8, 128		// M3 prefetch dst from L2
	;;
(p[A])	ld8 v[A] = [src_pre_mem], 128		// M0 prefetch src from memory
(p[C])	ld8 n8 = [src_pre_l2], 128		// M1 prefetch src from L2
(p[D])	st8 [dst0] = t1, 8			// M2
(p[D])	st8 [dst1] = t3, 8			// M3
	;;
(p[D])	ld8 t5 = [src0], 8
(p[D])	ld8 t7 = [src1], 3*8
(p[D])	st8 [dst0] = t2, 3*8
(p[D])	st8 [dst1] = t4, 3*8
	;;
(p[D])	ld8 t6 = [src0], 3*8
(p[D])	ld8 t10 = [src1], 8
(p[D])	st8 [dst0] = t5, 8
(p[D])	st8 [dst1] = t7, 3*8
	;;
(p[D])	ld8 t9 = [src0], 3*8
(p[D])	ld8 t11 = [src1], 3*8
(p[D])	st8 [dst0] = t6, 3*8
(p[D])	st8 [dst1] = t10, 8
	;;
(p[D])	ld8 t12 = [src0], 8
(p[D])	ld8 t14 = [src1], 8
(p[D])	st8 [dst0] = t9, 3*8
(p[D])	st8 [dst1] = t11, 3*8
	;;
(p[D])	ld8 t13 = [src0], 4*8
(p[D])	ld8 t15 = [src1], 4*8
(p[D])	st8 [dst0] = t12, 8
(p[D])	st8 [dst1] = t14, 8
	;;
	// t1/t3 of the *next* line are loaded one stage early (p[C]) so they are
	// ready for the first stores of the following iteration.
(p[C])	ld8 t1 = [src0], 8
(p[C])	ld8 t3 = [src1], 8
(p[D])	st8 [dst0] = t13, 4*8
(p[D])	st8 [dst1] = t15, 4*8
	br.ctop.sptk .line_copy
	;;
	mov ar.lc = saved_lc			// restore caller's loop count
	mov pr = saved_pr, -1			// restore all caller predicates
	br.ret.sptk.many rp
END(copy_page)
include/asm-ia64/thread_info.h
View file @
5b9c44cf
...
@@ -12,7 +12,8 @@
...
@@ -12,7 +12,8 @@
#define TI_EXEC_DOMAIN 0x00
#define TI_EXEC_DOMAIN 0x00
#define TI_FLAGS 0x08
#define TI_FLAGS 0x08
#define TI_CPU 0x0c
#define TI_CPU 0x0c
#define TI_ADDR_LIMI 0x10
#define TI_ADDR_LIMIT 0x10
#define TI_PRE_COUNT 0x18
#ifndef __ASSEMBLY__
#ifndef __ASSEMBLY__
...
@@ -26,6 +27,7 @@ struct thread_info {
...
@@ -26,6 +27,7 @@ struct thread_info {
__u32
flags
;
/* thread_info flags (see TIF_*) */
__u32
flags
;
/* thread_info flags (see TIF_*) */
__u32
cpu
;
/* current CPU */
__u32
cpu
;
/* current CPU */
mm_segment_t
addr_limit
;
/* user-level address space limit */
mm_segment_t
addr_limit
;
/* user-level address space limit */
__s32
preempt_count
;
/* 0=preemptable, <0=BUG; will also serve as bh-counter */
};
};
#define INIT_THREAD_SIZE
/* tell sched.h not to declare the thread_union */
#define INIT_THREAD_SIZE
/* tell sched.h not to declare the thread_union */
...
@@ -37,6 +39,7 @@ struct thread_info {
...
@@ -37,6 +39,7 @@ struct thread_info {
flags: 0, \
flags: 0, \
cpu: 0, \
cpu: 0, \
addr_limit: KERNEL_DS, \
addr_limit: KERNEL_DS, \
preempt_count: 0, \
}
}
/* how to get the thread information struct from C */
/* how to get the thread information struct from C */
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment