Commit 6b332fa2 authored by Arun Siluvery's avatar Arun Siluvery Committed by Tvrtko Ursulin

drm/i915/guc: reset GuC and retry on firmware load failure

Due to timing issues in the HW, some of the status bits required for GuC
authentication occasionally don't get set; when that happens, the GuC
cannot be initialized and we will be left with a wedged GPU. The W/A
suggested is to perform a soft reset of the GuC and attempt to reload
the F/W again for few times before giving up.

As the failure is dependent on timing, tests performed by triggering
manual full gpu reset (i915_wedged) showed that we could sometimes hit
this after several thousand iterations, but sometimes tests ran even
longer without any issues. Reset and reload mechanism proved helpful
when we indeed hit f/w load failure, so it is better to include this
to improve driver stability.

This change implements the following WAs,

	WaEnableuKernelHeaderValidFix:skl,bxt
	WaEnableGuCBootHashCheckNotSet:skl,bxt
Signed-off-by: default avatarArun Siluvery <arun.siluvery@linux.intel.com>
Signed-off-by: default avatarDave Gordon <david.s.gordon@intel.com>
Reviewed-by: default avatarAlex Dai <yu.dai@intel.com>
Signed-off-by: default avatarTvrtko Ursulin <tvrtko.ursulin@intel.com>
parent 168cf367
......@@ -2747,6 +2747,7 @@ extern long i915_compat_ioctl(struct file *filp, unsigned int cmd,
extern int intel_gpu_reset(struct drm_device *dev, u32 engine_mask);
extern bool intel_has_gpu_reset(struct drm_device *dev);
extern int i915_reset(struct drm_device *dev);
extern int intel_guc_reset(struct drm_i915_private *dev_priv);
extern void intel_engine_init_hangcheck(struct intel_engine_cs *engine);
extern unsigned long i915_chipset_val(struct drm_i915_private *dev_priv);
extern unsigned long i915_mch_val(struct drm_i915_private *dev_priv);
......
......@@ -27,6 +27,7 @@
/* Definitions of GuC H/W registers, bits, etc */
#define GUC_STATUS _MMIO(0xc000)
#define GS_MIA_IN_RESET (1 << 0)
#define GS_BOOTROM_SHIFT 1
#define GS_BOOTROM_MASK (0x7F << GS_BOOTROM_SHIFT)
#define GS_BOOTROM_RSA_FAILED (0x50 << GS_BOOTROM_SHIFT)
......
......@@ -165,6 +165,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
#define GEN6_GRDOM_MEDIA (1 << 2)
#define GEN6_GRDOM_BLT (1 << 3)
#define GEN6_GRDOM_VECS (1 << 4)
#define GEN9_GRDOM_GUC (1 << 5)
#define GEN8_GRDOM_MEDIA2 (1 << 7)
#define RING_PP_DIR_BASE(ring) _MMIO((ring)->mmio_base+0x228)
......
......@@ -353,6 +353,24 @@ static int guc_ucode_xfer(struct drm_i915_private *dev_priv)
return ret;
}
static int i915_reset_guc(struct drm_i915_private *dev_priv)
{
int ret;
u32 guc_status;
ret = intel_guc_reset(dev_priv);
if (ret) {
DRM_ERROR("GuC reset failed, ret = %d\n", ret);
return ret;
}
guc_status = I915_READ(GUC_STATUS);
WARN(!(guc_status & GS_MIA_IN_RESET),
"GuC status: 0x%x, MIA core expected to be in reset\n", guc_status);
return ret;
}
/**
* intel_guc_ucode_load() - load GuC uCode into the device
* @dev: drm device
......@@ -417,9 +435,36 @@ int intel_guc_ucode_load(struct drm_device *dev)
if (err)
goto fail;
/*
* WaEnableuKernelHeaderValidFix:skl,bxt
* For BXT, this is only upto B0 but below WA is required for later
* steppings also so this is extended as well.
*/
/* WaEnableGuCBootHashCheckNotSet:skl,bxt */
err = guc_ucode_xfer(dev_priv);
if (err) {
int retries = 3;
DRM_ERROR("GuC fw load failed, err=%d, attempting reset and retry\n", err);
while (retries--) {
err = i915_reset_guc(dev_priv);
if (err)
break;
err = guc_ucode_xfer(dev_priv);
if (!err) {
DRM_DEBUG_DRIVER("GuC fw reload succeeded after reset\n");
break;
}
DRM_DEBUG_DRIVER("GuC fw reload retries left: %d\n", retries);
}
if (err) {
DRM_ERROR("GuC fw reload attempt failed, ret=%d\n", err);
goto fail;
}
}
guc_fw->guc_fw_load_status = GUC_FIRMWARE_SUCCESS;
......
......@@ -1673,6 +1673,25 @@ bool intel_has_gpu_reset(struct drm_device *dev)
return intel_get_gpu_reset(dev) != NULL;
}
int intel_guc_reset(struct drm_i915_private *dev_priv)
{
int ret;
unsigned long irqflags;
if (!i915.enable_guc_submission)
return -EINVAL;
intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
spin_lock_irqsave(&dev_priv->uncore.lock, irqflags);
ret = gen6_hw_domain_reset(dev_priv, GEN9_GRDOM_GUC);
spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags);
intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
return ret;
}
bool intel_uncore_unclaimed_mmio(struct drm_i915_private *dev_priv)
{
return check_for_unclaimed_mmio(dev_priv);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment