edac: cortex_arm64: Poll to check for cache errors

By design, the Cortex-A53/A57 processors are incapable of
generating interrupts or PMU events when a single-bit
error is observed in the L2 caches.
Hence, we need to poll the L2MERRSR register periodically to
check for single-bit errors. We need to do this for the L2
cache on both clusters.

Change-Id: I76a440b820f23c9667a5596cf550ff7725ec1cf5
Signed-off-by: Rohit Vaswani <rvaswani@codeaurora.org>
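As a rough illustration of what each poll amounts to (a sketch only, not the driver's code: the s3_1_c15_c2_3 encoding for L2MERRSR_EL1 and the valid bit at [31] are assumptions taken from the Cortex-A53/A57 TRMs, and the function name is made up for this example):

/*
 * Sketch: read the IMPLEMENTATION DEFINED L2MERRSR_EL1 register on the
 * local CPU and report/clear any recorded single-bit error. Encoding and
 * bit layout are assumptions; the driver's real check may differ.
 */
#include <linux/printk.h>
#include <linux/types.h>

static void l2merrsr_poll_sketch(void)
{
	u64 l2merrsr;

	asm volatile("mrs %0, s3_1_c15_c2_3" : "=r" (l2merrsr));

	if (l2merrsr & (1ULL << 31)) {	/* bit[31]: error syndrome valid */
		pr_err("L2 single-bit error recorded, L2MERRSR=0x%llx\n",
		       l2merrsr);
		/* clear the syndrome so the next poll starts fresh */
		asm volatile("msr s3_1_c15_c2_3, %0" : : "r" (0ULL));
	}
}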

@@ -22,6 +22,7 @@ Optional properties:
 - reg: Should contain physical address of the CCI register space
 - reg-names: Should contain 'cci'. Must be present if 'reg' property is present
 - qcom,apply-cti-pmu-wa: Indicates if the driver needs to apply the CTI PMU Workaround. Relevant for 8994V1.
+- poll-delay-msec: Indicates how often the edac check callback should be called. Time in msec.

 Example:
 	cpu_cache_erp {

@@ -15,10 +15,10 @@

 #ifdef CONFIG_EDAC_CORTEX_ARM64
 void arm64_erp_local_dbe_handler(void);
-void arm64_check_cache_ecc(void);
+void arm64_check_cache_ecc(void *info);
 #else
 static inline void arm64_erp_local_dbe_handler(void) { }
-static inline void arm64_check_cache_ecc(void) { }
+static inline void arm64_check_cache_ecc(void *info) { }
 #endif

 static inline void atomic_scrub(void *addr, int size)

@@ -557,7 +557,7 @@ static void ipi_cpu_stop(unsigned int cpu, struct pt_regs *regs)
 		pr_crit("CPU%u: stopping\n", cpu);
 		show_regs(regs);
 		dump_stack();
-		arm64_check_cache_ecc();
+		arm64_check_cache_ecc(NULL);
 		raw_spin_unlock(&stop_lock);
 	}


@@ -786,7 +786,7 @@ static int msm_cti_pmu_wa_cpu_notify(struct notifier_block *self,
 	return NOTIFY_OK;
 }

-void arm64_check_cache_ecc(void)
+void arm64_check_cache_ecc(void *info)
 {
 	if (panic_handler_drvdata)
 		check_sbe_event(panic_handler_drvdata);
@@ -795,17 +795,36 @@ void arm64_check_cache_ecc(void)
 static int arm64_erp_panic_notify(struct notifier_block *this,
 				unsigned long event, void *ptr)
 {
-	arm64_check_cache_ecc();
+	arm64_check_cache_ecc(NULL);

 	return NOTIFY_OK;
 }

+static void arm64_monitor_cache_errors(struct edac_device_ctl_info *edev)
+{
+	struct cpumask cluster_mask, old_mask;
+	int cpu;
+
+	cpumask_clear(&cluster_mask);
+	cpumask_clear(&old_mask);
+
+	for_each_possible_cpu(cpu) {
+		cpumask_copy(&cluster_mask, topology_core_cpumask(cpu));
+		if (cpumask_equal(&cluster_mask, &old_mask))
+			continue;
+		cpumask_copy(&old_mask, &cluster_mask);
+		smp_call_function_any(&cluster_mask,
+				      arm64_check_cache_ecc, NULL, 0);
+	}
+}
+
 static int arm64_cpu_erp_probe(struct platform_device *pdev)
 {
 	struct device *dev = &pdev->dev;
 	struct erp_drvdata *drv;
 	struct resource *r;
 	int cpu;
+	u32 poll_msec;
 	struct erp_drvdata * __percpu *drv_cpu =
 		alloc_percpu(struct erp_drvdata *);

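Two details of the dispatch above are worth spelling out: topology_core_cpumask() returns the same mask for every CPU in a cluster, which is what lets the loop visit each cluster exactly once, and smp_call_function_any() expects an smp_call_func_t, i.e. void (*)(void *), which is why arm64_check_cache_ecc() now takes a void *info parameter. A self-contained sketch of the same pattern, with illustrative names that are not the driver's:

#include <linux/cpumask.h>
#include <linux/printk.h>
#include <linux/smp.h>
#include <linux/topology.h>

/* Must match smp_call_func_t; runs on whichever CPU the core picks. */
static void cache_check_on_this_cpu(void *info)
{
	pr_info("cache ECC check on CPU%d\n", raw_smp_processor_id());
}

static void check_each_cluster_once(void)
{
	struct cpumask visited;
	int cpu;

	cpumask_clear(&visited);
	for_each_online_cpu(cpu) {
		const struct cpumask *cluster = topology_core_cpumask(cpu);

		if (cpumask_intersects(cluster, &visited))
			continue;	/* this cluster is already covered */
		cpumask_or(&visited, &visited, cluster);
		/* wait == 0: fire-and-forget IPI, don't block the poller */
		smp_call_function_any(cluster, cache_check_on_this_cpu,
				      NULL, 0);
	}
}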
@@ -823,18 +842,24 @@ static int arm64_cpu_erp_probe(struct platform_device *pdev)
 	if (!drv->edev_ctl)
 		return -ENOMEM;

+	rc = of_property_read_u32(pdev->dev.of_node, "poll-delay-ms",
+				  &poll_msec);
+	if (!rc) {
+		drv->edev_ctl->edac_check = arm64_monitor_cache_errors;
+		drv->edev_ctl->poll_msec = poll_msec;
+		drv->edev_ctl->defer_work = 1;
+	}
 	drv->edev_ctl->dev = dev;
 	drv->edev_ctl->mod_name = dev_name(dev);
 	drv->edev_ctl->dev_name = dev_name(dev);
 	drv->edev_ctl->ctl_name = "cache";
+	drv->edev_ctl->panic_on_ce = panic_on_ce;
+	drv->edev_ctl->panic_on_ue = ARM64_ERP_PANIC_ON_UE;

 	rc = edac_device_add_device(drv->edev_ctl);
 	if (rc)
 		goto out_mem;

-	drv->edev_ctl->panic_on_ce = panic_on_ce;
-	drv->edev_ctl->panic_on_ue = ARM64_ERP_PANIC_ON_UE;
-
 	r = platform_get_resource_byname(pdev, IORESOURCE_MEM, "cci");
 	if (r)
 		drv->cci_base = devm_ioremap_resource(dev, r);
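For context on what these new assignments do: once edac_check and poll_msec are populated, the EDAC core's device workqueue invokes the edac_check hook roughly every poll_msec milliseconds, and anything the hook finds is reported through the edac_device helpers. A minimal sketch of such a hook, assuming the legacy edac_device interface of this kernel generation (l2_error_seen() is a hypothetical stand-in for the real L2MERRSR check):

#include <linux/types.h>
#include "edac_core.h"	/* struct edac_device_ctl_info, edac_device_handle_ce() */

static bool l2_error_seen(void);	/* hypothetical helper */

static void example_edac_check(struct edac_device_ctl_info *edev)
{
	/* called from the EDAC poll workqueue every poll_msec milliseconds */
	if (l2_error_seen())
		edac_device_handle_ce(edev, 0, 0, "L2 single-bit error");
}

Presumably this is also why the panic_on_ce/panic_on_ue assignments move ahead of edac_device_add_device(): registration is what arms the polling workqueue, so everything the callback and its report path rely on has to be configured first.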
@@ -921,6 +946,7 @@ out_irq:
 	abort_handler_drvdata = drv;
 	panic_handler_drvdata = drv;
 	return 0;
+out_dev: