android_kernel_samsung_msm8976/arch/powerpc/platforms/pseries/eeh_event.c
Linas Vepstas 054d8ff377 [PATCH] powerpc/pseries: avoid crash in PCI code if mem system not up
The powerpc code is currently performing PCI setup before memory
initialization.  PCI setup touches PCI config space registers.  If the PCI
card is bad, this will evoke an error, which currently can't be handled,
as the PCI error recovery code expects kmalloc() to be functional.  This
patch will cause the system to punt instead of crashing with

cpu 0x0: Vector: 300 (Data Access) at [c0000000004434d0]
    pc: c0000000000c06b4: .kmem_cache_alloc+0x8c/0xf4
    lr: c00000000004ad6c: .eeh_send_failure_event+0x48/0xfc

This patch will also print name of the offending pci device.

Signed-off-by: Linas Vepstas <linas@austin.ibm.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
2006-05-03 23:06:40 +10:00

154 lines
4.3 KiB
C

/*
* eeh_event.c
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* Copyright (c) 2005 Linas Vepstas <linas@linas.org>
*/
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/workqueue.h>
#include <asm/eeh_event.h>
#include <asm/ppc-pci.h>
/** Overview:
* EEH error states may be detected within exception handlers;
* however, the recovery processing needs to occur asynchronously
* in a normal kernel context and not an interrupt context.
* This pair of routines creates an event and queues it onto a
* work-queue, where a worker thread can drive recovery.
*/
/* EEH event workqueue setup. */

/*
 * Protects eeh_eventlist.  Always taken with the irqsave variants, since
 * events may be queued from interrupt context.  DEFINE_SPINLOCK() is used
 * rather than the deprecated SPIN_LOCK_UNLOCKED initializer.
 */
static DEFINE_SPINLOCK(eeh_eventlist_lock);

/* Events queued by eeh_send_failure_event(), drained by eeh_event_handler(). */
LIST_HEAD(eeh_eventlist);

static void eeh_thread_launcher(void *);

/* Work item scheduled for every queued event; its callback spawns the handler thread. */
DECLARE_WORK(eeh_event_wq, eeh_thread_launcher, NULL);

/* Serialize reset sequences for a given pci device */
DEFINE_MUTEX(eeh_event_mutex);
/**
 * eeh_event_handler - dispatch EEH events.
 * @dummy - unused
 *
 * The detection of a frozen slot can occur inside an interrupt,
 * where it can be hard to do anything about it.  The goal of this
 * routine is to pull these detection events out of the context
 * of the interrupt handler, and re-dispatch them for processing
 * at a later time in a normal context.
 *
 * Runs as the body of the "eehd" kernel thread.  It drains eeh_eventlist,
 * handling one event at a time under eeh_event_mutex.  After an event has
 * been handled its pci_dev reference is dropped and the event is freed.
 *
 * Returns 0 once the event list is empty.
 */
static int eeh_event_handler(void * dummy)
{
	unsigned long flags;
	struct eeh_event *event;

	daemonize ("eehd");

	while (1) {
		/*
		 * NOTE(review): the task state is set here but this loop never
		 * calls schedule() itself -- confirm this is intentional.
		 */
		set_current_state(TASK_INTERRUPTIBLE);

		spin_lock_irqsave(&eeh_eventlist_lock, flags);
		event = NULL;

		/* Unqueue the event, get ready to process. */
		if (!list_empty(&eeh_eventlist)) {
			event = list_entry(eeh_eventlist.next, struct eeh_event, list);
			list_del(&event->list);
		}
		spin_unlock_irqrestore(&eeh_eventlist_lock, flags);

		/* Nothing left to process: let the thread exit. */
		if (event == NULL)
			break;

		/* Serialize processing of EEH events */
		mutex_lock(&eeh_event_mutex);
		eeh_mark_slot(event->dn, EEH_MODE_RECOVERING);

		/*
		 * eeh_send_failure_event() accepts a NULL pci_dev, so the
		 * device name must not be looked up unconditionally.
		 */
		printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n",
		       event->dev ? pci_name(event->dev) : "<null>");

		handle_eeh_events(event);

		eeh_clear_slot(event->dn, EEH_MODE_RECOVERING);
		pci_dev_put(event->dev);
		kfree(event);
		mutex_unlock(&eeh_event_mutex);
	}

	return 0;
}
/**
 * eeh_thread_launcher - start a kernel thread to process queued events
 * @dummy - unused
 *
 * Work callback for eeh_event_wq.  Spawns a kernel thread running
 * eeh_event_handler() and logs an error if the thread cannot be created.
 */
static void eeh_thread_launcher(void *dummy)
{
	long rc = kernel_thread(eeh_event_handler, NULL, CLONE_KERNEL);

	/* A non-negative result means the thread was created. */
	if (rc >= 0)
		return;

	printk(KERN_ERR "Failed to start EEH daemon\n");
}
/**
 * eeh_send_failure_event - generate a PCI error event
 * @dn: device node of the affected device
 * @dev: pci device; may be NULL
 * @state: PCI channel state to record in the event
 * @time_unavail: stored in the event as given
 *
 * This routine can be called within an interrupt context;
 * the actual event will be delivered in a normal context
 * (from a workqueue).
 *
 * Returns 0 if the event was queued, or 1 if it was dropped because
 * memory initialization has not completed yet or the allocation failed.
 */
int eeh_send_failure_event (struct device_node *dn,
                            struct pci_dev *dev,
                            enum pci_channel_state state,
                            int time_unavail)
{
	struct eeh_event *ev;
	unsigned long irq_flags;

	/* Too early to allocate an event: report the device and give up. */
	if (!mem_init_done) {
		char *loc_code;

		printk(KERN_ERR "EEH: event during early boot not handled\n");
		loc_code = (char *) get_property(dn, "ibm,loc-code", NULL);
		printk(KERN_ERR "EEH: device node = %s\n", dn->full_name);
		printk(KERN_ERR "EEH: PCI location = %s\n", loc_code);
		return 1;
	}

	/* GFP_ATOMIC: the caller may be running in interrupt context. */
	ev = kmalloc(sizeof(*ev), GFP_ATOMIC);
	if (!ev) {
		printk (KERN_ERR "EEH: out of memory, event not handled\n");
		return 1;
	}

	ev->dn = dn;
	ev->state = state;
	ev->time_unavail = time_unavail;
	ev->dev = dev;

	/* Hold a device reference until eeh_event_handler() drops it. */
	if (dev)
		pci_dev_get(dev);

	/* We may or may not be called in an interrupt context */
	spin_lock_irqsave(&eeh_eventlist_lock, irq_flags);
	list_add(&ev->list, &eeh_eventlist);
	spin_unlock_irqrestore(&eeh_eventlist_lock, irq_flags);

	/* Kick the work item so the event gets processed. */
	schedule_work(&eeh_event_wq);

	return 0;
}
/********************** END OF FILE ******************************/