Commit d524c813 authored by Dave Airlie's avatar Dave Airlie

drm: radeon hyperz support..

HyperZ is an undocumented feature (outside of ATI) of the radeon
chips, this is a reverse engineered implementation.

From: Roland Scheidegger <rscheidegger_lists@hispeed.ch> and
Stephane Marchesin <marchesin@icps.u-strasbg.fr>
Signed-off-by: default avatarDave Airlie <airlied@linux.ie>
parent 4a066c6b
......@@ -1346,7 +1346,7 @@ static int radeon_do_init_cp( drm_device_t *dev, drm_radeon_init_t *init )
*/
dev_priv->depth_clear.rb3d_cntl = (RADEON_PLANE_MASK_ENABLE |
(dev_priv->color_fmt << 10) |
(1<<15));
(dev_priv->microcode_version == UCODE_R100 ? RADEON_ZBLOCK16 : 0));
dev_priv->depth_clear.rb3d_zstencilcntl =
(dev_priv->depth_fmt |
......
......@@ -193,7 +193,10 @@ typedef union {
#define RADEON_FRONT 0x1
#define RADEON_BACK 0x2
#define RADEON_DEPTH 0x4
#define RADEON_STENCIL 0x8
#define RADEON_STENCIL 0x8
#define RADEON_CLEAR_FASTZ 0x80000000
#define RADEON_USE_HIERZ 0x40000000
#define RADEON_USE_COMP_ZBUF 0x20000000
/* Primitive types
*/
......
......@@ -38,7 +38,7 @@
#define DRIVER_NAME "radeon"
#define DRIVER_DESC "ATI Radeon"
#define DRIVER_DATE "20020828"
#define DRIVER_DATE "20041207"
/* Interface history:
*
......@@ -74,9 +74,11 @@
* and GL_EXT_blend_[func|equation]_separate on r200
* 1.12- Add R300 CP microcode support - this just loads the CP on r300
* (No 3D support yet - just microcode loading)
* 1.13- Add packet R200_EMIT_TCL_POINT_SPRITE_CNTL for ARB_point_parameters
* - Add hyperz support, add hyperz flags to clear ioctl.
*/
#define DRIVER_MAJOR 1
#define DRIVER_MINOR 12
#define DRIVER_MINOR 13
#define DRIVER_PATCHLEVEL 0
#define GET_RING_HEAD(dev_priv) DRM_READ32( (dev_priv)->ring_rptr, 0 )
......@@ -461,7 +463,9 @@ extern int radeon_postcleanup( struct drm_device *dev );
# define RADEON_ROP_ENABLE (1 << 6)
# define RADEON_STENCIL_ENABLE (1 << 7)
# define RADEON_Z_ENABLE (1 << 8)
# define RADEON_ZBLOCK16 (1 << 15)
#define RADEON_RB3D_DEPTHOFFSET 0x1c24
#define RADEON_RB3D_DEPTHCLEARVALUE 0x3230
#define RADEON_RB3D_DEPTHPITCH 0x1c28
#define RADEON_RB3D_PLANEMASK 0x1d84
#define RADEON_RB3D_STENCILREFMASK 0x1d7c
......@@ -474,11 +478,15 @@ extern int radeon_postcleanup( struct drm_device *dev );
#define RADEON_RB3D_ZSTENCILCNTL 0x1c2c
# define RADEON_Z_TEST_MASK (7 << 4)
# define RADEON_Z_TEST_ALWAYS (7 << 4)
# define RADEON_Z_HIERARCHY_ENABLE (1 << 8)
# define RADEON_STENCIL_TEST_ALWAYS (7 << 12)
# define RADEON_STENCIL_S_FAIL_REPLACE (2 << 16)
# define RADEON_STENCIL_ZPASS_REPLACE (2 << 20)
# define RADEON_STENCIL_ZFAIL_REPLACE (2 << 24)
# define RADEON_Z_COMPRESSION_ENABLE (1 << 28)
# define RADEON_FORCE_Z_DIRTY (1 << 29)
# define RADEON_Z_WRITE_ENABLE (1 << 30)
# define RADEON_Z_DECOMPRESSION_ENABLE (1 << 31)
#define RADEON_RBBM_SOFT_RESET 0x00f0
# define RADEON_SOFT_RESET_CP (1 << 0)
# define RADEON_SOFT_RESET_HI (1 << 1)
......@@ -586,7 +594,7 @@ extern int radeon_postcleanup( struct drm_device *dev );
# define RADEON_WAIT_3D_IDLECLEAN (1 << 17)
# define RADEON_WAIT_HOST_IDLECLEAN (1 << 18)
#define RADEON_RB3D_ZMASKOFFSET 0x1c34
#define RADEON_RB3D_ZMASKOFFSET 0x3234
#define RADEON_RB3D_ZSTENCILCNTL 0x1c2c
# define RADEON_DEPTH_FORMAT_16BIT_INT_Z (0 << 0)
# define RADEON_DEPTH_FORMAT_24BIT_INT_Z (2 << 0)
......@@ -641,6 +649,8 @@ extern int radeon_postcleanup( struct drm_device *dev );
# define RADEON_3D_DRAW_IMMD 0x00002900
# define RADEON_3D_DRAW_INDX 0x00002A00
# define RADEON_3D_LOAD_VBPNTR 0x00002F00
# define RADEON_3D_CLEAR_ZMASK 0x00003200
# define RADEON_3D_CLEAR_HIZ 0x00003700
# define RADEON_CNTL_HOSTDATA_BLT 0x00009400
# define RADEON_CNTL_PAINT_MULTI 0x00009A00
# define RADEON_CNTL_BITBLT_MULTI 0x00009B00
......
......@@ -808,6 +808,153 @@ static void radeon_cp_dispatch_clear( drm_device_t *dev,
}
}
}
/* hyper z clear */
/* no docs available, based on reverse engeneering by Stephane Marchesin */
if ((flags & (RADEON_DEPTH | RADEON_STENCIL)) && (flags & RADEON_CLEAR_FASTZ)) {
int i;
int depthpixperline = dev_priv->depth_fmt==RADEON_DEPTH_FORMAT_16BIT_INT_Z?
(dev_priv->depth_pitch / 2): (dev_priv->depth_pitch / 4);
u32 clearmask;
u32 tempRB3D_DEPTHCLEARVALUE = clear->clear_depth |
((clear->depth_mask & 0xff) << 24);
/* Make sure we restore the 3D state next time.
* we haven't touched any "normal" state - still need this?
*/
dev_priv->sarea_priv->ctx_owner = 0;
if ((dev_priv->flags & CHIP_HAS_HIERZ) && (flags & RADEON_USE_HIERZ)) {
/* FIXME : reverse engineer that for Rx00 cards */
/* FIXME : the mask supposedly contains low-res z values. So can't set
just to the max (0xff? or actually 0x3fff?), need to take z clear
value into account? */
/* pattern seems to work for r100, though get slight
rendering errors with glxgears. If hierz is not enabled for r100,
only 4 bits which indicate clear (15,16,31,32, all zero) matter, the
other ones are ignored, and the same clear mask can be used. That's
very different behaviour than R200 which needs different clear mask
and different number of tiles to clear if hierz is enabled or not !?!
*/
clearmask = (0xff<<22)|(0xff<<6)| 0x003f003f;
}
else {
/* clear mask : chooses the clearing pattern.
rv250: could be used to clear only parts of macrotiles
(but that would get really complicated...)?
bit 0 and 1 (either or both of them ?!?!) are used to
not clear tile (or maybe one of the bits indicates if the tile is
compressed or not), bit 2 and 3 to not clear tile 1,...,.
Pattern is as follows:
| 0,1 | 4,5 | 8,9 |12,13|16,17|20,21|24,25|28,29|
bits -------------------------------------------------
| 2,3 | 6,7 |10,11|14,15|18,19|22,23|26,27|30,31|
rv100: clearmask covers 2x8 4x1 tiles, but one clear still
covers 256 pixels ?!?
*/
clearmask = 0x0;
}
BEGIN_RING( 8 );
RADEON_WAIT_UNTIL_2D_IDLE();
OUT_RING_REG( RADEON_RB3D_DEPTHCLEARVALUE,
tempRB3D_DEPTHCLEARVALUE);
/* what offset is this exactly ? */
OUT_RING_REG( RADEON_RB3D_ZMASKOFFSET, 0 );
/* need ctlstat, otherwise get some strange black flickering */
OUT_RING_REG( RADEON_RB3D_ZCACHE_CTLSTAT, RADEON_RB3D_ZC_FLUSH_ALL );
ADVANCE_RING();
for (i = 0; i < nbox; i++) {
int tileoffset, nrtilesx, nrtilesy, j;
/* it looks like r200 needs rv-style clears, at least if hierz is not enabled? */
if ((dev_priv->flags&CHIP_HAS_HIERZ) && !(dev_priv->microcode_version==UCODE_R200)) {
/* FIXME : figure this out for r200 (when hierz is enabled). Or
maybe r200 actually doesn't need to put the low-res z value into
the tile cache like r100, but just needs to clear the hi-level z-buffer?
Works for R100, both with hierz and without.
R100 seems to operate on 2x1 8x8 tiles, but...
odd: offset/nrtiles need to be 64 pix (4 block) aligned? Potentially
problematic with resolutions which are not 64 pix aligned? */
tileoffset = ((pbox[i].y1 >> 3) * depthpixperline + pbox[i].x1) >> 6;
nrtilesx = ((pbox[i].x2 & ~63) - (pbox[i].x1 & ~63)) >> 4;
nrtilesy = (pbox[i].y2 >> 3) - (pbox[i].y1 >> 3);
for (j = 0; j <= nrtilesy; j++) {
BEGIN_RING( 4 );
OUT_RING( CP_PACKET3( RADEON_3D_CLEAR_ZMASK, 2 ) );
/* first tile */
OUT_RING( tileoffset * 8 );
/* the number of tiles to clear */
OUT_RING( nrtilesx + 4 );
/* clear mask : chooses the clearing pattern. */
OUT_RING( clearmask );
ADVANCE_RING();
tileoffset += depthpixperline >> 6;
}
}
else if (dev_priv->microcode_version==UCODE_R200) {
/* works for rv250. */
/* find first macro tile (8x2 4x4 z-pixels on rv250) */
tileoffset = ((pbox[i].y1 >> 3) * depthpixperline + pbox[i].x1) >> 5;
nrtilesx = (pbox[i].x2 >> 5) - (pbox[i].x1 >> 5);
nrtilesy = (pbox[i].y2 >> 3) - (pbox[i].y1 >> 3);
for (j = 0; j <= nrtilesy; j++) {
BEGIN_RING( 4 );
OUT_RING( CP_PACKET3( RADEON_3D_CLEAR_ZMASK, 2 ) );
/* first tile */
/* judging by the first tile offset needed, could possibly
directly address/clear 4x4 tiles instead of 8x2 * 4x4
macro tiles, though would still need clear mask for
right/bottom if truely 4x4 granularity is desired ? */
OUT_RING( tileoffset * 16 );
/* the number of tiles to clear */
OUT_RING( nrtilesx + 1 );
/* clear mask : chooses the clearing pattern. */
OUT_RING( clearmask );
ADVANCE_RING();
tileoffset += depthpixperline >> 5;
}
}
else { /* rv 100 */
/* rv100 might not need 64 pix alignment, who knows */
/* offsets are, hmm, weird */
tileoffset = ((pbox[i].y1 >> 4) * depthpixperline + pbox[i].x1) >> 6;
nrtilesx = ((pbox[i].x2 & ~63) - (pbox[i].x1 & ~63)) >> 4;
nrtilesy = (pbox[i].y2 >> 4) - (pbox[i].y1 >> 4);
for (j = 0; j <= nrtilesy; j++) {
BEGIN_RING( 4 );
OUT_RING( CP_PACKET3( RADEON_3D_CLEAR_ZMASK, 2 ) );
OUT_RING( tileoffset * 128 );
/* the number of tiles to clear */
OUT_RING( nrtilesx + 4 );
/* clear mask : chooses the clearing pattern. */
OUT_RING( clearmask );
ADVANCE_RING();
tileoffset += depthpixperline >> 6;
}
}
}
/* TODO don't always clear all hi-level z tiles */
if ((dev_priv->flags & CHIP_HAS_HIERZ) && (dev_priv->microcode_version==UCODE_R200)
&& (flags & RADEON_USE_HIERZ))
/* r100 and cards without hierarchical z-buffer have no high-level z-buffer */
/* FIXME : the mask supposedly contains low-res z values. So can't set
just to the max (0xff? or actually 0x3fff?), need to take z clear
value into account? */
{
BEGIN_RING( 4 );
OUT_RING( CP_PACKET3( RADEON_3D_CLEAR_HIZ, 2 ) );
OUT_RING( 0x0 ); /* First tile */
OUT_RING( 0x3cc0 );
OUT_RING( (0xff<<22)|(0xff<<6)| 0x003f003f);
ADVANCE_RING();
}
}
/* We have to clear the depth and/or stencil buffers by
* rendering a quad into just those buffers. Thus, we have to
......@@ -833,7 +980,6 @@ static void radeon_cp_dispatch_clear( drm_device_t *dev,
tempRE_CNTL = 0;
tempRB3D_CNTL = depth_clear->rb3d_cntl;
tempRB3D_CNTL &= ~(1<<15); /* unset radeon magic flag */
tempRB3D_ZSTENCILCNTL = depth_clear->rb3d_zstencilcntl;
tempRB3D_STENCILREFMASK = 0x0;
......@@ -884,6 +1030,14 @@ static void radeon_cp_dispatch_clear( drm_device_t *dev,
tempRB3D_STENCILREFMASK = 0x00000000;
}
if (flags & RADEON_USE_COMP_ZBUF) {
tempRB3D_ZSTENCILCNTL |= RADEON_Z_COMPRESSION_ENABLE |
RADEON_Z_DECOMPRESSION_ENABLE;
}
if (flags & RADEON_USE_HIERZ) {
tempRB3D_ZSTENCILCNTL |= RADEON_Z_HIERARCHY_ENABLE;
}
BEGIN_RING( 26 );
RADEON_WAIT_UNTIL_2D_IDLE();
......@@ -938,6 +1092,8 @@ static void radeon_cp_dispatch_clear( drm_device_t *dev,
}
else if ( (flags & (RADEON_DEPTH | RADEON_STENCIL)) ) {
int tempRB3D_ZSTENCILCNTL = depth_clear->rb3d_zstencilcntl;
rb3d_cntl = depth_clear->rb3d_cntl;
if ( flags & RADEON_DEPTH ) {
......@@ -954,6 +1110,14 @@ static void radeon_cp_dispatch_clear( drm_device_t *dev,
rb3d_stencilrefmask = 0x00000000;
}
if (flags & RADEON_USE_COMP_ZBUF) {
tempRB3D_ZSTENCILCNTL |= RADEON_Z_COMPRESSION_ENABLE |
RADEON_Z_DECOMPRESSION_ENABLE;
}
if (flags & RADEON_USE_HIERZ) {
tempRB3D_ZSTENCILCNTL |= RADEON_Z_HIERARCHY_ENABLE;
}
BEGIN_RING( 13 );
RADEON_WAIT_UNTIL_2D_IDLE();
......@@ -961,8 +1125,7 @@ static void radeon_cp_dispatch_clear( drm_device_t *dev,
OUT_RING( 0x00000000 );
OUT_RING( rb3d_cntl );
OUT_RING_REG( RADEON_RB3D_ZSTENCILCNTL,
depth_clear->rb3d_zstencilcntl );
OUT_RING_REG( RADEON_RB3D_ZSTENCILCNTL, tempRB3D_ZSTENCILCNTL );
OUT_RING_REG( RADEON_RB3D_STENCILREFMASK,
rb3d_stencilrefmask );
OUT_RING_REG( RADEON_RB3D_PLANEMASK,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment