M7350v1_en_gpl

This commit is contained in:
T
2024-09-09 08:52:07 +00:00
commit f9cc65cfda
65988 changed files with 26357421 additions and 0 deletions
@@ -0,0 +1,132 @@
config PPC_PSERIES
depends on PPC64 && PPC_BOOK3S
bool "IBM pSeries & new (POWER5-based) iSeries"
select HAVE_PCSPKR_PLATFORM
select MPIC
select OF_DYNAMIC
select PCI_MSI
select PPC_XICS
select PPC_ICP_NATIVE
select PPC_ICP_HV
select PPC_ICS_RTAS
select PPC_I8259
select PPC_RTAS
select PPC_RTAS_DAEMON
select RTAS_ERROR_LOGGING
select PPC_UDBG_16550
select PPC_NATIVE
select PPC_PCI_CHOICE if EXPERT
select ZLIB_DEFLATE
default y
config PPC_SPLPAR
depends on PPC_PSERIES
bool "Support for shared-processor logical partitions"
default n
help
Enabling this option will make the kernel run more efficiently
on logically-partitioned pSeries systems which use shared
processors, that is, which share physical processors between
two or more partitions.
config EEH
bool
depends on PPC_PSERIES && PCI
default y
config PSERIES_MSI
bool
depends on PCI_MSI && EEH
default y
config PSERIES_ENERGY
tristate "pSeries energy management capabilities driver"
depends on PPC_PSERIES
default y
help
Provides interface to platform energy management capabilities
on supported PSERIES platforms.
Provides: /sys/devices/system/cpu/pseries_(de)activation_hint_list
and /sys/devices/system/cpu/cpuN/pseries_(de)activation_hint
config SCANLOG
tristate "Scanlog dump interface"
depends on RTAS_PROC && PPC_PSERIES
config IO_EVENT_IRQ
bool "IO Event Interrupt support"
depends on PPC_PSERIES
default y
help
Select this option, if you want to enable support for IO Event
interrupts. IO event interrupt is a mechanism provided by RTAS
to return information about hardware error and non-error events
which may need OS attention. RTAS returns events for multiple
event types and scopes. Device drivers can register their handlers
to receive events.
This option will only enable the IO event platform code. You
will still need to enable or compile the actual drivers
that use this infrastructure to handle IO event interrupts.
Say Y if you are unsure.
config LPARCFG
bool "LPAR Configuration Data"
depends on PPC_PSERIES
help
Provide system capacity information via human readable
<key word>=<value> pairs through a /proc/ppc64/lparcfg interface.
config PPC_PSERIES_DEBUG
depends on PPC_PSERIES && PPC_EARLY_DEBUG
bool "Enable extra debug logging in platforms/pseries"
help
Say Y here if you want the pseries core to produce a bunch of
debug messages to the system log. Select this if you are having a
problem with the pseries core and want to see more of what is
going on. This does not enable debugging in lpar.c, which must
be manually done due to its verbosity.
default y
config PPC_SMLPAR
bool "Support for shared-memory logical partitions"
depends on PPC_PSERIES
select LPARCFG
default n
help
Select this option to enable shared memory partition support.
With this option a system running in an LPAR can be given more
memory than physically available and will allow firmware to
balance memory across many LPARs.
config CMM
tristate "Collaborative memory management"
depends on PPC_SMLPAR
default y
help
Select this option, if you want to enable the kernel interface
to reduce the memory size of the system. This is accomplished
by allocating pages of memory and put them "on hold". This only
makes sense for a system running in an LPAR where the unused pages
will be reused for other LPARs. The interface allows firmware to
balance memory across many LPARs.
config DTL
bool "Dispatch Trace Log"
depends on PPC_SPLPAR && DEBUG_FS
help
SPLPAR machines can log hypervisor preempt & dispatch events to a
kernel buffer. Saying Y here will enable logging these events,
which are accessible through a debugfs file.
Say N if you are unsure.
config PSERIES_IDLE
bool "Cpuidle driver for pSeries platforms"
depends on CPU_IDLE
depends on PPC_PSERIES
default y
help
Select this option to enable processor idle state management
through cpuidle subsystem.
@@ -0,0 +1,29 @@
# Extra compiler flag applied when CONFIG_PPC64 is set.
ccflags-$(CONFIG_PPC64) := -mno-minimal-toc
# Define DEBUG for the objects built here when PPC_PSERIES_DEBUG is enabled.
ccflags-$(CONFIG_PPC_PSERIES_DEBUG) += -DDEBUG
# Objects that are always built.
obj-y := lpar.o hvCall.o nvram.o reconfig.o \
setup.o iommu.o event_sources.o ras.o \
firmware.o power.o dlpar.o mobility.o
# Objects built only when the matching config option is enabled.
obj-$(CONFIG_SMP) += smp.o
obj-$(CONFIG_SCANLOG) += scanlog.o
obj-$(CONFIG_EEH) += eeh.o eeh_dev.o eeh_cache.o eeh_driver.o \
eeh_event.o eeh_sysfs.o eeh_pseries.o
obj-$(CONFIG_KEXEC) += kexec.o
obj-$(CONFIG_PCI) += pci.o pci_dlpar.o
obj-$(CONFIG_PSERIES_MSI) += msi.o
obj-$(CONFIG_PSERIES_ENERGY) += pseries_energy.o
obj-$(CONFIG_HOTPLUG_CPU) += hotplug-cpu.o
obj-$(CONFIG_MEMORY_HOTPLUG) += hotplug-memory.o
obj-$(CONFIG_HVC_CONSOLE) += hvconsole.o
obj-$(CONFIG_HVCS) += hvcserver.o
obj-$(CONFIG_HCALL_STATS) += hvCall_inst.o
obj-$(CONFIG_CMM) += cmm.o
obj-$(CONFIG_DTL) += dtl.o
obj-$(CONFIG_IO_EVENT_IRQ) += io_event_irq.o
obj-$(CONFIG_PSERIES_IDLE) += processor_idle.o
# suspend.o is only considered when CONFIG_PPC_PSERIES is exactly "y".
ifeq ($(CONFIG_PPC_PSERIES),y)
obj-$(CONFIG_SUSPEND) += suspend.o
endif
+742
View File
@@ -0,0 +1,742 @@
/*
* Collaborative memory management interface.
*
* Copyright (C) 2008 IBM Corporation
* Author(s): Brian King (brking@linux.vnet.ibm.com),
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
*/
#include <linux/ctype.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/gfp.h>
#include <linux/init.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/oom.h>
#include <linux/reboot.h>
#include <linux/sched.h>
#include <linux/stringify.h>
#include <linux/swap.h>
#include <linux/device.h>
#include <asm/firmware.h>
#include <asm/hvcall.h>
#include <asm/mmu.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
#include <linux/memory.h>
#include "plpar_wrappers.h"
#define CMM_DRIVER_VERSION "1.0.0"
#define CMM_DEFAULT_DELAY 1
#define CMM_HOTPLUG_DELAY 5
#define CMM_DEBUG 0
#define CMM_DISABLE 0
#define CMM_OOM_KB 1024
#define CMM_MIN_MEM_MB 256
#define KB2PAGES(_p) ((_p)>>(PAGE_SHIFT-10))
#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10))
/*
* The priority level tries to ensure that this notifier is called as
* late as possible to reduce thrashing in the shared memory pool.
*/
#define CMM_MEM_HOTPLUG_PRI 1
#define CMM_MEM_ISOLATE_PRI 15
/* Backing storage for the module parameters declared below. */
/* Seconds the CMM thread sleeps between polls. */
static unsigned int delay = CMM_DEFAULT_DELAY;
/* Seconds the CMM thread sleeps after it notices a hotplug operation. */
static unsigned int hotplug_delay = CMM_HOTPLUG_DELAY;
/* Kilobytes released by the OOM notifier. */
static unsigned int oom_kb = CMM_OOM_KB;
/* Non-zero enables cmm_dbg() output. */
static unsigned int cmm_debug = CMM_DEBUG;
/* Non-zero means the CMM thread is not (or no longer) running. */
static unsigned int cmm_disabled = CMM_DISABLE;
/* Lower bound, in MB, used when computing the loan target. */
static unsigned long min_mem_mb = CMM_MIN_MEM_MB;
/* Device registered on the "cmm" subsystem to carry the sysfs attributes. */
static struct device cmm_dev;
/* Module identification. */
MODULE_AUTHOR("Brian King <brking@linux.vnet.ibm.com>");
MODULE_DESCRIPTION("IBM System p Collaborative Memory Manager");
MODULE_LICENSE("GPL");
MODULE_VERSION(CMM_DRIVER_VERSION);

/*
 * Runtime-tunable parameters. Each MODULE_PARM_DESC() must name the same
 * parameter as the module_param_named() it documents, otherwise the
 * description is attached to the wrong parameter in modinfo output.
 */
module_param_named(delay, delay, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(delay, "Delay (in seconds) between polls to query hypervisor paging requests. "
		 "[Default=" __stringify(CMM_DEFAULT_DELAY) "]");
module_param_named(hotplug_delay, hotplug_delay, uint, S_IRUGO | S_IWUSR);
/* Was wrongly registered under "delay"; it describes hotplug_delay. */
MODULE_PARM_DESC(hotplug_delay, "Delay (in seconds) after memory hotplug remove "
		 "before loaning resumes. "
		 "[Default=" __stringify(CMM_HOTPLUG_DELAY) "]");
module_param_named(oom_kb, oom_kb, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(oom_kb, "Amount of memory in kb to free on OOM. "
		 "[Default=" __stringify(CMM_OOM_KB) "]");
module_param_named(min_mem_mb, min_mem_mb, ulong, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(min_mem_mb, "Minimum amount of memory (in MB) to not balloon. "
		 "[Default=" __stringify(CMM_MIN_MEM_MB) "]");
module_param_named(debug, cmm_debug, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(debug, "Enable module debugging logging. Set to 1 to enable. "
		 "[Default=" __stringify(CMM_DEBUG) "]");
/*
 * Number of page addresses stored in one struct cmm_page_array, chosen so
 * that the next pointer, the index and the page[] array together fit in a
 * single page.
 */
#define CMM_NR_PAGES ((PAGE_SIZE - sizeof(void *) - sizeof(unsigned long)) / sizeof(unsigned long))

/*
 * Debug message, printed only when the "debug" module parameter is non-zero.
 * Wrapped in do { } while (0) so the macro expands to exactly one statement
 * and cannot capture a following "else" when used in an unbraced if/else.
 */
#define cmm_dbg(...) \
	do { \
		if (cmm_debug) \
			printk(KERN_INFO "cmm: "__VA_ARGS__); \
	} while (0)
/*
* One node of the singly linked list that records loaned pages. Nodes are
* allocated with __get_free_page(), and CMM_NR_PAGES is sized so that one
* node fits in a page.
*/
struct cmm_page_array {
/* Next (older) node in the list, or NULL. */
struct cmm_page_array *next;
/* Number of entries of page[] currently in use. */
unsigned long index;
/* Addresses (as returned by __get_free_page()) of the loaned pages. */
unsigned long page[CMM_NR_PAGES];
};
static unsigned long loaned_pages;
static unsigned long loaned_pages_target;
static unsigned long oom_freed_pages;
static struct cmm_page_array *cmm_page_list;
static DEFINE_SPINLOCK(cmm_lock);
static DEFINE_MUTEX(hotplug_mutex);
static int hotplug_occurred; /* protected by the hotplug mutex */
static struct task_struct *cmm_thread_ptr;
/**
 * cmm_alloc_pages - Allocate pages and hand them to the hypervisor as loaned
 * @nr:	number of pages to allocate
 *
 * Stops early when a memory hotplug operation is running or has been
 * flagged, when a page (or a new list node) cannot be allocated, or when
 * the hypervisor refuses the loan.
 *
 * Return value:
 *	number of requested pages that were NOT allocated
 **/
static long cmm_alloc_pages(long nr)
{
	struct cmm_page_array *head, *new_head;
	unsigned long page_addr;
	long hrc;

	cmm_dbg("Begin request for %ld pages\n", nr);

	for (; nr; nr--) {
		/* Exit if a hotplug operation is in progress or occurred */
		if (!mutex_trylock(&hotplug_mutex))
			break;
		if (hotplug_occurred) {
			mutex_unlock(&hotplug_mutex);
			break;
		}
		mutex_unlock(&hotplug_mutex);

		page_addr = __get_free_page(GFP_NOIO | __GFP_NOWARN |
					    __GFP_NORETRY | __GFP_NOMEMALLOC);
		if (!page_addr)
			break;

		spin_lock(&cmm_lock);
		head = cmm_page_list;
		if (!head || head->index >= CMM_NR_PAGES) {
			/*
			 * The head node is missing or full. Drop the lock to
			 * allocate a new node, then re-check under the lock
			 * because the list may have changed meanwhile.
			 */
			spin_unlock(&cmm_lock);
			new_head = (struct cmm_page_array *)__get_free_page(
					GFP_NOIO | __GFP_NOWARN |
					__GFP_NORETRY | __GFP_NOMEMALLOC);
			if (!new_head) {
				pr_info("%s: Can not allocate new page list\n", __func__);
				free_page(page_addr);
				break;
			}

			spin_lock(&cmm_lock);
			head = cmm_page_list;
			if (head && head->index < CMM_NR_PAGES) {
				/* Room appeared meanwhile; node not needed. */
				free_page((unsigned long) new_head);
			} else {
				new_head->next = head;
				new_head->index = 0;
				head = new_head;
				cmm_page_list = head;
			}
		}

		hrc = plpar_page_set_loaned(__pa(page_addr));
		if (hrc) {
			pr_err("%s: Can not set page to loaned. rc=%ld\n", __func__, hrc);
			spin_unlock(&cmm_lock);
			free_page(page_addr);
			break;
		}

		head->page[head->index++] = page_addr;
		loaned_pages++;
		totalram_pages--;
		spin_unlock(&cmm_lock);
	}

	cmm_dbg("End request with %ld pages unfulfilled\n", nr);
	return nr;
}
/**
 * cmm_free_pages - Take loaned pages back and return them to the allocator
 * @nr:	number of pages to free
 *
 * Pages are popped from the head node of the list; a node that becomes
 * empty is unlinked and freed as well.
 *
 * Return value:
 *	number of requested pages that were NOT freed
 **/
static long cmm_free_pages(long nr)
{
	struct cmm_page_array *head;
	unsigned long page_addr;

	cmm_dbg("Begin free of %ld pages.\n", nr);

	spin_lock(&cmm_lock);
	for (head = cmm_page_list; nr; nr--) {
		if (!head || !head->index)
			break;

		page_addr = head->page[--head->index];
		if (head->index == 0) {
			/* Head node is now empty: advance and release it. */
			head = head->next;
			free_page((unsigned long) cmm_page_list);
			cmm_page_list = head;
		}

		plpar_page_set_active(__pa(page_addr));
		free_page(page_addr);
		loaned_pages--;
		totalram_pages++;
	}
	spin_unlock(&cmm_lock);

	cmm_dbg("End request with %ld pages unfulfilled\n", nr);
	return nr;
}
/**
 * cmm_oom_notify - OOM notifier
 * @self:	notifier block struct
 * @dummy:	not used
 * @parm:	in/out - incremented by the number of pages freed
 *
 * Frees up to oom_kb worth of loaned pages, pins the loan target to the
 * resulting loaned page count and accounts the pages in oom_freed_pages.
 *
 * Return value:
 *	NOTIFY_OK
 **/
static int cmm_oom_notify(struct notifier_block *self,
			  unsigned long dummy, void *parm)
{
	unsigned long *freed = parm;
	long remaining;

	cmm_dbg("OOM processing started\n");

	remaining = cmm_free_pages(KB2PAGES(oom_kb));
	loaned_pages_target = loaned_pages;

	*freed += KB2PAGES(oom_kb) - remaining;
	oom_freed_pages += KB2PAGES(oom_kb) - remaining;

	cmm_dbg("OOM processing complete\n");
	return NOTIFY_OK;
}
/**
* cmm_get_mpp - Read memory performance parameters
*
* Makes hcall to query the current page loan request from the hypervisor.
*
* Return value:
* nothing
**/
static void cmm_get_mpp(void)
{
int rc;
struct hvcall_mpp_data mpp_data;
signed long active_pages_target, page_loan_request, target;
signed long total_pages = totalram_pages + loaned_pages;
signed long min_mem_pages = (min_mem_mb * 1024 * 1024) / PAGE_SIZE;
rc = h_get_mpp(&mpp_data);
if (rc != H_SUCCESS)
return;
page_loan_request = div_s64((s64)mpp_data.loan_request, PAGE_SIZE);
target = page_loan_request + (signed long)loaned_pages;
if (target < 0 || total_pages < min_mem_pages)
target = 0;
if (target > oom_freed_pages)
target -= oom_freed_pages;
else
target = 0;
active_pages_target = total_pages - target;
if (min_mem_pages > active_pages_target)
target = total_pages - min_mem_pages;
if (target < 0)
target = 0;
loaned_pages_target = target;
cmm_dbg("delta = %ld, loaned = %lu, target = %lu, oom = %lu, totalram = %lu\n",
page_loan_request, loaned_pages, loaned_pages_target,
oom_freed_pages, totalram_pages);
}
/* Notifier block passed to register_oom_notifier() in cmm_init(). */
static struct notifier_block cmm_oom_nb = {
.notifier_call = cmm_oom_notify
};
/**
 * cmm_thread - CMM task thread
 * @dummy:	not used
 *
 * Every "delay" seconds: refresh the loan target from the hypervisor and
 * allocate or free pages to move loaned_pages towards it. Loaning is
 * skipped while a memory hotplug operation holds hotplug_mutex, and paused
 * for "hotplug_delay" seconds after one has been flagged. The loop ends
 * when the thread is asked to stop or a sleep is interrupted.
 *
 * Return value:
 *	0
 **/
static int cmm_thread(void *dummy)
{
	unsigned long remaining;

	for (;;) {
		remaining = msleep_interruptible(delay * 1000);
		if (kthread_should_stop() || remaining)
			break;

		if (!mutex_trylock(&hotplug_mutex)) {
			cmm_dbg("Hotplug operation in progress, activity "
					"suspended\n");
			continue;
		}

		if (hotplug_occurred) {
			hotplug_occurred = 0;
			mutex_unlock(&hotplug_mutex);
			cmm_dbg("Hotplug operation has occurred, "
					"loaning activity suspended "
					"for %d seconds.\n",
					hotplug_delay);
			remaining = msleep_interruptible(hotplug_delay *
					1000);
			if (kthread_should_stop() || remaining)
				break;
			continue;
		}
		mutex_unlock(&hotplug_mutex);

		cmm_get_mpp();

		if (loaned_pages_target > loaned_pages) {
			/* On a short allocation, stop chasing the old target. */
			if (cmm_alloc_pages(loaned_pages_target - loaned_pages))
				loaned_pages_target = loaned_pages;
		} else if (loaned_pages_target < loaned_pages) {
			cmm_free_pages(loaned_pages - loaned_pages_target);
		}
	}
	return 0;
}
/*
* CMM_SHOW - generate a read-only sysfs attribute.
* Defines show_<name>(), which sprintf()s the given format and arguments
* into the sysfs buffer, plus the matching S_IRUGO DEVICE_ATTR.
*/
#define CMM_SHOW(name, format, args...) \
static ssize_t show_##name(struct device *dev, \
struct device_attribute *attr, \
char *buf) \
{ \
return sprintf(buf, format, ##args); \
} \
static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
/* loaned_kb: pages currently loaned, reported in kilobytes. */
CMM_SHOW(loaned_kb, "%lu\n", PAGES2KB(loaned_pages));
/* loaned_target_kb: current loan target, reported in kilobytes. */
CMM_SHOW(loaned_target_kb, "%lu\n", PAGES2KB(loaned_pages_target));
/*
 * sysfs "show" handler for oom_freed_kb: prints the amount of memory
 * released by the OOM notifier, in kilobytes.
 */
static ssize_t show_oom_pages(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	unsigned long freed_kb = PAGES2KB(oom_freed_pages);

	return sprintf(buf, "%lu\n", freed_kb);
}
/*
 * sysfs "store" handler for oom_freed_kb. The only accepted value is 0,
 * which resets the OOM-freed counter.
 *
 * Returns @count on success, -EPERM without CAP_SYS_ADMIN, or -EBADMSG
 * when the written number is not 0.
 */
static ssize_t store_oom_pages(struct device *dev,
			       struct device_attribute *attr,
			       const char *buf, size_t count)
{
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (simple_strtoul (buf, NULL, 10) != 0)
		return -EBADMSG;

	oom_freed_pages = 0;
	return count;
}
/* oom_freed_kb: world-readable, writable by the owner. */
static DEVICE_ATTR(oom_freed_kb, S_IWUSR | S_IRUGO,
show_oom_pages, store_oom_pages);
/* Attributes created on the cmm device by cmm_sysfs_register(). */
static struct device_attribute *cmm_attrs[] = {
&dev_attr_loaned_kb,
&dev_attr_loaned_target_kb,
&dev_attr_oom_freed_kb,
};
/* Subsystem registered with subsys_system_register(); the device's bus. */
static struct bus_type cmm_subsys = {
.name = "cmm",
.dev_name = "cmm",
};
/**
 * cmm_sysfs_register - Register with sysfs
 * @dev:	device to register on the cmm subsystem
 *
 * Registers the subsystem, then the device, then every attribute in
 * cmm_attrs[]. On failure everything created so far is torn down again.
 *
 * Return value:
 *	0 on success / other on failure
 **/
static int cmm_sysfs_register(struct device *dev)
{
	int idx, err;

	err = subsys_system_register(&cmm_subsys, NULL);
	if (err)
		return err;

	dev->id = 0;
	dev->bus = &cmm_subsys;

	err = device_register(dev);
	if (err)
		goto subsys_unregister;

	for (idx = 0; idx < ARRAY_SIZE(cmm_attrs); idx++) {
		err = device_create_file(dev, cmm_attrs[idx]);
		if (err)
			goto fail;
	}

	return 0;

fail:
	/* Remove only the attribute files that were actually created. */
	while (--idx >= 0)
		device_remove_file(dev, cmm_attrs[idx]);
	device_unregister(dev);
subsys_unregister:
	bus_unregister(&cmm_subsys);
	return err;
}
/**
 * cmm_unregister_sysfs - Unregister from sysfs
 * @dev:	device previously passed to cmm_sysfs_register()
 *
 * Removes every attribute file, then the device, then the subsystem.
 **/
static void cmm_unregister_sysfs(struct device *dev)
{
	int idx = 0;

	while (idx < ARRAY_SIZE(cmm_attrs)) {
		device_remove_file(dev, cmm_attrs[idx]);
		idx++;
	}

	device_unregister(dev);
	bus_unregister(&cmm_subsys);
}
/**
 * cmm_reboot_notifier - Make sure pages are not still marked as "loaned"
 * @nb:		notifier block struct
 * @action:	reboot action; only SYS_RESTART is handled
 * @unused:	not used
 *
 * On restart, stops the CMM thread (if any) and takes back every loaned
 * page.
 *
 * Return value:
 *	NOTIFY_DONE
 **/
static int cmm_reboot_notifier(struct notifier_block *nb,
			       unsigned long action, void *unused)
{
	if (action != SYS_RESTART)
		return NOTIFY_DONE;

	if (cmm_thread_ptr)
		kthread_stop(cmm_thread_ptr);
	cmm_thread_ptr = NULL;
	cmm_free_pages(loaned_pages);

	return NOTIFY_DONE;
}
/* Notifier block passed to register_reboot_notifier() in cmm_init(). */
static struct notifier_block cmm_reboot_nb = {
.notifier_call = cmm_reboot_notifier,
};
/**
 * cmm_count_pages - Count the number of pages loaned in a particular range.
 * @arg:	memory_isolate_notify structure with address range and count
 *
 * Adds to @arg->pages_found one for every loaned page, and one for every
 * list-management page, whose address lies inside the range.
 *
 * Return value:
 *	0 on success
 **/
static unsigned long cmm_count_pages(void *arg)
{
	struct memory_isolate_notify *marg = arg;
	struct cmm_page_array *node;
	unsigned long start = (unsigned long)pfn_to_kaddr(marg->start_pfn);
	unsigned long end = start + (marg->nr_pages << PAGE_SHIFT);
	unsigned long slot;

	spin_lock(&cmm_lock);
	for (node = cmm_page_list; node; node = node->next) {
		/* The list node itself occupies a page. */
		if ((unsigned long)node >= start && (unsigned long)node < end)
			marg->pages_found++;

		for (slot = 0; slot < node->index; slot++) {
			if (node->page[slot] >= start && node->page[slot] < end)
				marg->pages_found++;
		}
	}
	spin_unlock(&cmm_lock);

	return 0;
}
/**
 * cmm_memory_isolate_cb - Handle memory isolation notifier calls
 * @self:	notifier block struct
 * @action:	action to take; only MEM_ISOLATE_COUNT is handled
 * @arg:	struct memory_isolate_notify data for handler
 *
 * Return value:
 *	NOTIFY_OK or notifier error based on subfunction return value
 **/
static int cmm_memory_isolate_cb(struct notifier_block *self,
				 unsigned long action, void *arg)
{
	int err = 0;

	switch (action) {
	case MEM_ISOLATE_COUNT:
		err = cmm_count_pages(arg);
		break;
	default:
		break;
	}

	return notifier_from_errno(err);
}
/* Notifier block for register_memory_isolate_notifier() in cmm_init(). */
static struct notifier_block cmm_mem_isolate_nb = {
.notifier_call = cmm_memory_isolate_cb,
.priority = CMM_MEM_ISOLATE_PRI
};
/**
 * cmm_mem_going_offline - Unloan pages where memory is to be removed
 * @arg:	memory_notify structure with page range to be offlined
 *
 * Pass 1 walks every list node and, for each loaned page inside the range,
 * marks it active again, frees it and refills the slot with the last entry
 * of the head node. Pass 2 relocates list nodes that themselves live in the
 * range by copying them into a freshly allocated page.
 *
 * Return value:
 *	0 on success, -ENOMEM if a replacement list node cannot be allocated.
 *	The value is passed to notifier_from_errno() by the caller, so it
 *	must be a negative errno.
 **/
static int cmm_mem_going_offline(void *arg)
{
	struct memory_notify *marg = arg;
	unsigned long start_page = (unsigned long)pfn_to_kaddr(marg->start_pfn);
	unsigned long end_page = start_page + (marg->nr_pages << PAGE_SHIFT);
	struct cmm_page_array *pa_curr, *pa_last, *npa;
	unsigned long idx;
	unsigned long freed = 0;

	cmm_dbg("Memory going offline, searching 0x%lx (%ld pages).\n",
			start_page, marg->nr_pages);
	spin_lock(&cmm_lock);

	/* Search the page list for pages in the range to be offlined */
	pa_last = pa_curr = cmm_page_list;
	while (pa_curr) {
		/* idx is unsigned: "(idx + 1) > 0" stops after idx == 0. */
		for (idx = (pa_curr->index - 1); (idx + 1) > 0; idx--) {
			if ((pa_curr->page[idx] < start_page) ||
			    (pa_curr->page[idx] >= end_page))
				continue;

			plpar_page_set_active(__pa(pa_curr->page[idx]));
			free_page(pa_curr->page[idx]);
			freed++;
			loaned_pages--;
			totalram_pages++;
			/* Fill the hole with the last entry of the head node. */
			pa_curr->page[idx] = pa_last->page[--pa_last->index];
			if (pa_last->index == 0) {
				/* Head node drained: unlink and free it. */
				if (pa_curr == pa_last)
					pa_curr = pa_last->next;
				pa_last = pa_last->next;
				free_page((unsigned long)cmm_page_list);
				cmm_page_list = pa_last;
				/*
				 * NOTE(review): when pa_curr was the drained
				 * head it has already been advanced here and
				 * is advanced again after the loop - confirm
				 * this cannot skip a node or hit NULL.
				 */
				continue;
			}
		}
		pa_curr = pa_curr->next;
	}

	/* Search for page list structures in the range to be offlined */
	pa_last = NULL;
	pa_curr = cmm_page_list;
	while (pa_curr) {
		if (((unsigned long)pa_curr >= start_page) &&
		    ((unsigned long)pa_curr < end_page)) {
			npa = (struct cmm_page_array *)__get_free_page(
					GFP_NOIO | __GFP_NOWARN |
					__GFP_NORETRY | __GFP_NOMEMALLOC);
			if (!npa) {
				spin_unlock(&cmm_lock);
				cmm_dbg("Failed to allocate memory for list "
						"management. Memory hotplug "
						"failed.\n");
				/* Negative errno, as notifier_from_errno() expects. */
				return -ENOMEM;
			}
			memcpy(npa, pa_curr, PAGE_SIZE);
			if (pa_curr == cmm_page_list)
				cmm_page_list = npa;
			if (pa_last)
				pa_last->next = npa;
			free_page((unsigned long) pa_curr);
			freed++;
			pa_curr = npa;
		}

		pa_last = pa_curr;
		pa_curr = pa_curr->next;
	}

	spin_unlock(&cmm_lock);
	cmm_dbg("Released %ld pages in the search range.\n", freed);

	return 0;
}
/**
 * cmm_memory_cb - Handle memory hotplug notifier calls
 * @self:	notifier block struct
 * @action:	action to take
 * @arg:	struct memory_notify data for handler
 *
 * MEM_GOING_OFFLINE takes hotplug_mutex, flags the hotplug and unloans the
 * affected pages; the mutex stays held until MEM_OFFLINE or
 * MEM_CANCEL_OFFLINE arrives. All other actions are ignored.
 *
 * Return value:
 *	NOTIFY_OK or notifier error based on subfunction return value
 *
 **/
static int cmm_memory_cb(struct notifier_block *self,
			 unsigned long action, void *arg)
{
	int err = 0;

	switch (action) {
	case MEM_GOING_OFFLINE:
		mutex_lock(&hotplug_mutex);
		hotplug_occurred = 1;
		err = cmm_mem_going_offline(arg);
		break;
	case MEM_OFFLINE:
	case MEM_CANCEL_OFFLINE:
		mutex_unlock(&hotplug_mutex);
		cmm_dbg("Memory offline operation complete.\n");
		break;
	default:
		/* MEM_GOING_ONLINE, MEM_ONLINE, MEM_CANCEL_ONLINE, others */
		break;
	}

	return notifier_from_errno(err);
}
/* Notifier block passed to register_memory_notifier() in cmm_init(). */
static struct notifier_block cmm_mem_nb = {
.notifier_call = cmm_memory_cb,
.priority = CMM_MEM_HOTPLUG_PRI
};
/**
 * cmm_init - Module initialization
 *
 * Registers the OOM, reboot, memory hotplug and memory isolate notifiers
 * and the sysfs interface, then starts the CMM thread unless the module
 * was loaded disabled. Every failure unwinds what was set up before it and
 * is reported with a non-zero error code.
 *
 * Return value:
 *	0 on success / other on failure
 **/
static int cmm_init(void)
{
	int rc;

	if (!firmware_has_feature(FW_FEATURE_CMO))
		return -EOPNOTSUPP;

	rc = register_oom_notifier(&cmm_oom_nb);
	if (rc < 0)
		return rc;

	rc = register_reboot_notifier(&cmm_reboot_nb);
	if (rc)
		goto out_oom_notifier;

	rc = cmm_sysfs_register(&cmm_dev);
	if (rc)
		goto out_reboot_notifier;

	/*
	 * Capture the return codes here: previously a failure jumped to the
	 * cleanup path with rc still 0, so init reported success after
	 * having torn everything down.
	 */
	rc = register_memory_notifier(&cmm_mem_nb);
	if (rc)
		goto out_unregister_sysfs;

	rc = register_memory_isolate_notifier(&cmm_mem_isolate_nb);
	if (rc)
		goto out_memory_notifier;

	if (cmm_disabled)
		return 0;

	cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread");
	if (IS_ERR(cmm_thread_ptr)) {
		rc = PTR_ERR(cmm_thread_ptr);
		/* Never leave an ERR_PTR where kthread_stop() could see it. */
		cmm_thread_ptr = NULL;
		goto out_isolate_notifier;
	}

	return 0;

out_isolate_notifier:
	unregister_memory_isolate_notifier(&cmm_mem_isolate_nb);
out_memory_notifier:
	unregister_memory_notifier(&cmm_mem_nb);
out_unregister_sysfs:
	cmm_unregister_sysfs(&cmm_dev);
out_reboot_notifier:
	unregister_reboot_notifier(&cmm_reboot_nb);
out_oom_notifier:
	unregister_oom_notifier(&cmm_oom_nb);
	return rc;
}
/**
 * cmm_exit - Module exit
 *
 * Stops the CMM thread, drops all notifiers, takes back every loaned page
 * and removes the sysfs interface.
 *
 * Return value:
 *	nothing
 **/
static void cmm_exit(void)
{
	struct task_struct *worker = cmm_thread_ptr;

	if (worker)
		kthread_stop(worker);

	unregister_oom_notifier(&cmm_oom_nb);
	unregister_reboot_notifier(&cmm_reboot_nb);
	unregister_memory_notifier(&cmm_mem_nb);
	unregister_memory_isolate_notifier(&cmm_mem_isolate_nb);

	/* Nothing may stay loaned once the module is gone. */
	cmm_free_pages(loaned_pages);
	cmm_unregister_sysfs(&cmm_dev);
}
/**
* cmm_set_disable - Disable/Enable CMM
*
* Return value:
* 0 on success / other on failure
**/
static int cmm_set_disable(const char *val, struct kernel_param *kp)
{
int disable = simple_strtoul(val, NULL, 10);
if (disable != 0 && disable != 1)
return -EINVAL;
if (disable && !cmm_disabled) {
if (cmm_thread_ptr)
kthread_stop(cmm_thread_ptr);
cmm_thread_ptr = NULL;
cmm_free_pages(loaned_pages);
} else if (!disable && cmm_disabled) {
cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread");
if (IS_ERR(cmm_thread_ptr))
return PTR_ERR(cmm_thread_ptr);
}
cmm_disabled = disable;
return 0;
}
/* "disable" parameter: writes go through cmm_set_disable(), reads use param_get_uint. */
module_param_call(disable, cmm_set_disable, param_get_uint,
&cmm_disabled, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(disable, "Disable CMM. Set to 1 to disable. "
"[Default=" __stringify(CMM_DISABLE) "]");
/* Module entry and exit points. */
module_init(cmm_init);
module_exit(cmm_exit);
@@ -0,0 +1,565 @@
/*
* Support for dynamic reconfiguration for PCI, Memory, and CPU
* Hotplug and Dynamic Logical Partitioning on RPA platforms.
*
* Copyright (C) 2009 Nathan Fontenot
* Copyright (C) 2009 IBM Corporation
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 as published by the Free Software Foundation.
*/
#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/notifier.h>
#include <linux/proc_fs.h>
#include <linux/spinlock.h>
#include <linux/cpu.h>
#include <linux/slab.h>
#include "offline_states.h"
#include <asm/prom.h>
#include <asm/machdep.h>
#include <asm/uaccess.h>
#include <asm/rtas.h>
#include <asm/pSeries_reconfig.h>
/*
* Header at the start of the configure-connector work buffer. The offset
* fields are byte offsets from the start of this structure.
*/
struct cc_workarea {
/* DRC index of the connector, filled in by the caller. */
u32 drc_index;
/* Set to 0 by the caller before the first call. */
u32 zero;
/* Offset of the NUL-terminated node or property name. */
u32 name_offset;
/* Length in bytes of the property value. */
u32 prop_length;
/* Offset of the property value. */
u32 prop_offset;
};
/*
 * Free a property built by dlpar_parse_cc_property(): its name, its value
 * and the structure itself. kfree() accepts NULL, so a partially built
 * property is fine.
 */
void dlpar_free_cc_property(struct property *prop)
{
	kfree(prop->name);
	kfree(prop->value);

	kfree(prop);
}
/*
 * Build a struct property from the name/value that the last
 * configure-connector call left in the work area.
 *
 * Returns the new property, or NULL if any allocation fails. The caller
 * owns the result and releases it with dlpar_free_cc_property().
 */
static struct property *dlpar_parse_cc_property(struct cc_workarea *ccwa)
{
	struct property *prop;
	char *name;
	char *value;

	prop = kzalloc(sizeof(*prop), GFP_KERNEL);
	if (!prop)
		return NULL;

	name = (char *)ccwa + ccwa->name_offset;
	prop->name = kstrdup(name, GFP_KERNEL);
	/* A property with a NULL name must never be handed to callers. */
	if (!prop->name) {
		dlpar_free_cc_property(prop);
		return NULL;
	}

	prop->length = ccwa->prop_length;
	value = (char *)ccwa + ccwa->prop_offset;
	prop->value = kmemdup(value, prop->length, GFP_KERNEL);
	if (!prop->value) {
		dlpar_free_cc_property(prop);
		return NULL;
	}

	return prop;
}
/*
 * Allocate a device_node for the node name that the last
 * configure-connector call left in the work area.
 *
 * Returns the new node (only full_name is set), or NULL on allocation
 * failure.
 */
static struct device_node *dlpar_parse_cc_node(struct cc_workarea *ccwa)
{
	struct device_node *node;
	char *cc_name = (char *)ccwa + ccwa->name_offset;

	node = kzalloc(sizeof(*node), GFP_KERNEL);
	if (!node)
		return NULL;

	/* The configure connector reported name does not contain a
	 * preceding '/', so one is prepended to form the full_name.
	 */
	node->full_name = kasprintf(GFP_KERNEL, "/%s", cc_name);
	if (node->full_name)
		return node;

	kfree(node);
	return NULL;
}
static void dlpar_free_one_cc_node(struct device_node *dn)
{
struct property *prop;
while (dn->properties) {
prop = dn->properties;
dn->properties = prop->next;
dlpar_free_cc_property(prop);
}
kfree(dn->full_name);
kfree(dn);
}
/*
 * Recursively free a tree built by dlpar_configure_connector(): first the
 * children of @dn, then its following siblings, then @dn itself.
 */
void dlpar_free_cc_nodes(struct device_node *dn)
{
	struct device_node *first_child = dn->child;
	struct device_node *next_sibling = dn->sibling;

	if (first_child)
		dlpar_free_cc_nodes(first_child);

	if (next_sibling)
		dlpar_free_cc_nodes(next_sibling);

	dlpar_free_one_cc_node(dn);
}
/* Return codes of the configure-connector RTAS call, as handled in
* dlpar_configure_connector(). */
/* Configuration finished; the loop ends. */
#define COMPLETE 0
/* Work area describes a sibling of the last node. */
#define NEXT_SIBLING 1
/* Work area describes a child of the last node. */
#define NEXT_CHILD 2
/* Work area describes a property of the last node. */
#define NEXT_PROPERTY 3
/* Move back up to the parent of the last node. */
#define PREV_PARENT 4
/* Treated as an error by dlpar_configure_connector(). */
#define MORE_MEMORY 5
/* Nothing to record; the call is simply repeated. */
#define CALL_AGAIN -2
/* Treated as an error by dlpar_configure_connector(). */
#define ERR_CFG_USE -9003
/*
* dlpar_configure_connector - build a device-node tree for a DRC index
* @drc_index: connector index written into the work area
*
* Calls the "ibm,configure-connector" RTAS service repeatedly and uses each
* return code to add a node or property, or move within the tree being
* built.
*
* Returns the first node created, or NULL if the service is unknown, an
* allocation fails or RTAS reports an error (any partial tree is freed).
* The only caller in this file, dlpar_cpu_probe(), releases the tree with
* dlpar_free_cc_nodes() on its own error paths.
*
* NOTE(review): NEXT_SIBLING, NEXT_PROPERTY and PREV_PARENT dereference
* last_dn without a NULL check, so this appears to rely on firmware
* returning NEXT_CHILD first - confirm.
*/
struct device_node *dlpar_configure_connector(u32 drc_index)
{
struct device_node *dn;
struct device_node *first_dn = NULL;
struct device_node *last_dn = NULL;
struct property *property;
struct property *last_property = NULL;
struct cc_workarea *ccwa;
char *data_buf;
int cc_token;
int rc = -1;
cc_token = rtas_token("ibm,configure-connector");
if (cc_token == RTAS_UNKNOWN_SERVICE)
return NULL;
/* Private copy of the work area, kept across RTAS calls. */
data_buf = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
if (!data_buf)
return NULL;
ccwa = (struct cc_workarea *)&data_buf[0];
ccwa->drc_index = drc_index;
ccwa->zero = 0;
do {
/* Since we release the rtas_data_buf lock between configure
* connector calls we want to re-populate the rtas_data_buffer
* with the contents of the previous call.
*/
spin_lock(&rtas_data_buf_lock);
memcpy(rtas_data_buf, data_buf, RTAS_DATA_BUF_SIZE);
rc = rtas_call(cc_token, 2, 1, NULL, rtas_data_buf, NULL);
memcpy(data_buf, rtas_data_buf, RTAS_DATA_BUF_SIZE);
spin_unlock(&rtas_data_buf_lock);
switch (rc) {
case COMPLETE:
break;
case NEXT_SIBLING:
/* New node shares the parent of the last node. */
dn = dlpar_parse_cc_node(ccwa);
if (!dn)
goto cc_error;
dn->parent = last_dn->parent;
last_dn->sibling = dn;
last_dn = dn;
break;
case NEXT_CHILD:
/* The very first node becomes the root of the result. */
dn = dlpar_parse_cc_node(ccwa);
if (!dn)
goto cc_error;
if (!first_dn)
first_dn = dn;
else {
dn->parent = last_dn;
if (last_dn)
last_dn->child = dn;
}
last_dn = dn;
break;
case NEXT_PROPERTY:
/* Append to the last node's property list. */
property = dlpar_parse_cc_property(ccwa);
if (!property)
goto cc_error;
if (!last_dn->properties)
last_dn->properties = property;
else
last_property->next = property;
last_property = property;
break;
case PREV_PARENT:
last_dn = last_dn->parent;
break;
case CALL_AGAIN:
break;
case MORE_MEMORY:
case ERR_CFG_USE:
default:
printk(KERN_ERR "Unexpected Error (%d) "
"returned from configure-connector\n", rc);
goto cc_error;
}
} while (rc);
/* Reached on success (rc == 0) and on every error (rc != 0). */
cc_error:
kfree(data_buf);
if (rc) {
if (first_dn)
dlpar_free_cc_nodes(first_dn);
return NULL;
}
return first_dn;
}
/*
 * Find the device-tree node that is the parent of @path, i.e. the node
 * named by everything before the last '/'. A path directly below the root
 * yields the root node.
 *
 * Returns the node found by of_find_node_by_path() (the caller in this
 * file drops a reference with of_node_put()), or NULL if @path contains no
 * '/', the temporary buffer cannot be allocated or the lookup fails.
 */
static struct device_node *derive_parent(const char *path)
{
	struct device_node *parent;
	char *last_slash;

	last_slash = strrchr(path, '/');
	/* Without any '/' the length computation below would be invalid. */
	if (!last_slash)
		return NULL;

	if (last_slash == path) {
		parent = of_find_node_by_path("/");
	} else {
		char *parent_path;
		int parent_path_len = last_slash - path + 1;

		parent_path = kmalloc(parent_path_len, GFP_KERNEL);
		if (!parent_path)
			return NULL;

		/* Copies everything before the last '/' plus a NUL. */
		strlcpy(parent_path, path, parent_path_len);
		parent = of_find_node_by_path(parent_path);
		kfree(parent_path);
	}

	return parent;
}
/*
 * Attach a node built by dlpar_configure_connector() to the live device
 * tree below the parent derived from its full_name.
 *
 * Returns 0 on success, -ENOMEM if no parent could be found, or the error
 * from the PSERIES_RECONFIG_ADD notifier chain. On failure the node is not
 * attached and stays owned by the caller.
 */
int dlpar_attach_node(struct device_node *dn)
{
#ifdef CONFIG_PROC_DEVICETREE
	struct proc_dir_entry *ent;
#endif
	int rc;

	of_node_set_flag(dn, OF_DYNAMIC);
	kref_init(&dn->kref);
	dn->parent = derive_parent(dn->full_name);
	if (!dn->parent)
		return -ENOMEM;

	rc = pSeries_reconfig_notify(PSERIES_RECONFIG_ADD, dn);
	if (rc) {
		printk(KERN_ERR "Failed to add device node %s\n",
		       dn->full_name);
		/*
		 * Drop the parent reference taken by derive_parent(), as the
		 * success path does below; otherwise it is leaked.
		 */
		of_node_put(dn->parent);
		return rc;
	}

	of_attach_node(dn);

#ifdef CONFIG_PROC_DEVICETREE
	ent = proc_mkdir(strrchr(dn->full_name, '/') + 1, dn->parent->pde);
	if (ent)
		proc_device_tree_add_node(dn, ent);
#endif

	of_node_put(dn->parent);
	return 0;
}
/*
 * Remove @dn from the live device tree: drop its /proc entries (when
 * CONFIG_PROC_DEVICETREE is set), run the PSERIES_RECONFIG_REMOVE
 * notifiers, detach the node and drop a reference.
 *
 * Always returns 0.
 */
int dlpar_detach_node(struct device_node *dn)
{
#ifdef CONFIG_PROC_DEVICETREE
	struct device_node *parent = dn->parent;
	struct property *prop;

	for (prop = dn->properties; prop; prop = prop->next)
		remove_proc_entry(prop->name, dn->pde);

	if (dn->pde)
		remove_proc_entry(dn->pde->name, parent->pde);
#endif

	pSeries_reconfig_notify(PSERIES_RECONFIG_REMOVE, dn);
	of_detach_node(dn);
	of_node_put(dn); /* Must decrement the refcount */

	return 0;
}
#define DR_ENTITY_SENSE 9003
#define DR_ENTITY_PRESENT 1
#define DR_ENTITY_UNUSABLE 2
#define ALLOCATION_STATE 9003
#define ALLOC_UNUSABLE 0
#define ALLOC_USABLE 1
#define ISOLATION_STATE 9001
#define ISOLATE 0
#define UNISOLATE 1
/*
 * Acquire the connector @drc_index: it must currently be sensed as
 * DR_ENTITY_UNUSABLE; it is then marked usable and unisolated. If the
 * unisolate step fails the allocation state is set back to unusable.
 *
 * Returns 0 on success, -1 if the sensor query fails or reports another
 * state, or the rtas_set_indicator() error.
 */
int dlpar_acquire_drc(u32 drc_index)
{
	int dr_status, rc;

	rc = rtas_call(rtas_token("get-sensor-state"), 2, 2, &dr_status,
		       DR_ENTITY_SENSE, drc_index);
	if (rc || dr_status != DR_ENTITY_UNUSABLE)
		return -1;

	rc = rtas_set_indicator(ALLOCATION_STATE, drc_index, ALLOC_USABLE);
	if (rc)
		return rc;

	rc = rtas_set_indicator(ISOLATION_STATE, drc_index, UNISOLATE);
	if (!rc)
		return 0;

	/* Undo the allocation-state change made above. */
	rtas_set_indicator(ALLOCATION_STATE, drc_index, ALLOC_UNUSABLE);
	return rc;
}
/*
 * Release the connector @drc_index: it must currently be sensed as
 * DR_ENTITY_PRESENT; it is then isolated and marked unusable. If the
 * second step fails the connector is unisolated again.
 *
 * Returns 0 on success, -1 if the sensor query fails or reports another
 * state, or the rtas_set_indicator() error.
 */
int dlpar_release_drc(u32 drc_index)
{
	int dr_status, rc;

	rc = rtas_call(rtas_token("get-sensor-state"), 2, 2, &dr_status,
		       DR_ENTITY_SENSE, drc_index);
	if (rc || dr_status != DR_ENTITY_PRESENT)
		return -1;

	rc = rtas_set_indicator(ISOLATION_STATE, drc_index, ISOLATE);
	if (rc)
		return rc;

	rc = rtas_set_indicator(ALLOCATION_STATE, drc_index, ALLOC_UNUSABLE);
	if (!rc)
		return 0;

	/* Undo the isolation made above. */
	rtas_set_indicator(ISOLATION_STATE, drc_index, UNISOLATE);
	return rc;
}
#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
/*
* Bring online every thread listed in the node's
* "ibm,ppc-interrupt-server#s" property (one u32 per thread).
*
* Returns 0 on success, -EINVAL if the property is missing, or the first
* error returned by cpu_up().
*/
static int dlpar_online_cpu(struct device_node *dn)
{
int rc = 0;
unsigned int cpu;
int len, nthreads, i;
const u32 *intserv;
intserv = of_get_property(dn, "ibm,ppc-interrupt-server#s", &len);
if (!intserv)
return -EINVAL;
nthreads = len / sizeof(u32);
cpu_maps_update_begin();
for (i = 0; i < nthreads; i++) {
/* Find the present cpu whose hardware id matches this thread. */
for_each_present_cpu(cpu) {
if (get_hard_smp_processor_id(cpu) != intserv[i])
continue;
BUG_ON(get_cpu_current_state(cpu)
!= CPU_STATE_OFFLINE);
/* The cpu maps lock is released around cpu_up() ... */
cpu_maps_update_done();
rc = cpu_up(cpu);
/* ... so the error exit must not release it again. */
if (rc)
goto out;
cpu_maps_update_begin();
break;
}
/* NOTE(review): assumes the iterator ends at num_possible_cpus()
* when no cpu matched - confirm. */
if (cpu == num_possible_cpus())
printk(KERN_WARNING "Could not find cpu to online "
"with physical id 0x%x\n", intserv[i]);
}
cpu_maps_update_done();
out:
return rc;
}
/*
 * Handler installed as ppc_md.cpu_probe: add the cpu whose DRC index is
 * given as a number in @buf. Configures the connector, renames the node so
 * it lives under /cpus, acquires the DRC, attaches the node and onlines
 * the cpu threads.
 *
 * Returns @count on success or a negative error code.
 */
static ssize_t dlpar_cpu_probe(const char *buf, size_t count)
{
	struct device_node *node;
	unsigned long drc_index;
	char *fixed_name;
	int rc;

	cpu_hotplug_driver_lock();

	if (strict_strtoul(buf, 0, &drc_index)) {
		rc = -EINVAL;
		goto unlock;
	}

	node = dlpar_configure_connector(drc_index);
	if (!node) {
		rc = -EINVAL;
		goto unlock;
	}

	/* configure-connector reports cpus as living in the base
	 * directory of the device tree.  CPUs actually live in the
	 * cpus directory so we need to fixup the full_name.
	 */
	fixed_name = kasprintf(GFP_KERNEL, "/cpus%s", node->full_name);
	if (!fixed_name) {
		dlpar_free_cc_nodes(node);
		rc = -ENOMEM;
		goto unlock;
	}
	kfree(node->full_name);
	node->full_name = fixed_name;

	if (dlpar_acquire_drc(drc_index)) {
		dlpar_free_cc_nodes(node);
		rc = -EINVAL;
		goto unlock;
	}

	rc = dlpar_attach_node(node);
	if (rc) {
		dlpar_release_drc(drc_index);
		dlpar_free_cc_nodes(node);
		goto unlock;
	}

	rc = dlpar_online_cpu(node);

unlock:
	cpu_hotplug_driver_unlock();
	return rc ? rc : count;
}
/*
* Take offline every thread listed in the node's
* "ibm,ppc-interrupt-server#s" property (one u32 per thread).
*
* Returns 0 on success, -EINVAL if the property is missing, or the first
* error returned by cpu_down().
*/
static int dlpar_offline_cpu(struct device_node *dn)
{
int rc = 0;
unsigned int cpu;
int len, nthreads, i;
const u32 *intserv;
intserv = of_get_property(dn, "ibm,ppc-interrupt-server#s", &len);
if (!intserv)
return -EINVAL;
nthreads = len / sizeof(u32);
cpu_maps_update_begin();
for (i = 0; i < nthreads; i++) {
/* Find the present cpu whose hardware id matches this thread. */
for_each_present_cpu(cpu) {
if (get_hard_smp_processor_id(cpu) != intserv[i])
continue;
/* Already offline: nothing to do for this thread. */
if (get_cpu_current_state(cpu) == CPU_STATE_OFFLINE)
break;
if (get_cpu_current_state(cpu) == CPU_STATE_ONLINE) {
set_preferred_offline_state(cpu, CPU_STATE_OFFLINE);
/* The cpu maps lock is released around cpu_down(), so the
* error exit must not release it again. */
cpu_maps_update_done();
rc = cpu_down(cpu);
if (rc)
goto out;
cpu_maps_update_begin();
break;
}
/*
* The cpu is in CPU_STATE_INACTIVE.
* Upgrade its state to CPU_STATE_OFFLINE.
*/
set_preferred_offline_state(cpu, CPU_STATE_OFFLINE);
BUG_ON(plpar_hcall_norets(H_PROD, intserv[i])
!= H_SUCCESS);
__cpu_die(cpu);
break;
}
/* NOTE(review): assumes the iterator ends at num_possible_cpus()
* when no cpu matched - confirm. */
if (cpu == num_possible_cpus())
printk(KERN_WARNING "Could not find cpu to offline "
"with physical id 0x%x\n", intserv[i]);
}
cpu_maps_update_done();
out:
return rc;
}
/*
 * Handle a cpu release request.
 *
 * @buf is the device-tree path of the cpu node to remove.  The cpu's
 * threads are taken offline, its DRC is released and the node is detached
 * from the device tree.  If detaching fails the DRC is re-acquired.
 *
 * The reference obtained from of_find_node_by_path() is dropped on every
 * exit path, including the dlpar_detach_node() failure path, which
 * previously left it held.
 *
 * Returns @count on success or a negative errno.
 */
static ssize_t dlpar_cpu_release(const char *buf, size_t count)
{
	struct device_node *dn;
	const u32 *drc_index;
	int rc;

	dn = of_find_node_by_path(buf);
	if (!dn)
		return -EINVAL;

	drc_index = of_get_property(dn, "ibm,my-drc-index", NULL);
	if (!drc_index) {
		of_node_put(dn);
		return -EINVAL;
	}

	cpu_hotplug_driver_lock();

	rc = dlpar_offline_cpu(dn);
	if (rc) {
		/* callers only ever see -EINVAL for an offline failure */
		rc = -EINVAL;
		goto out;
	}

	rc = dlpar_release_drc(*drc_index);
	if (rc)
		goto out;

	rc = dlpar_detach_node(dn);
	if (rc) {
		/* undo the release; drc_index is still valid, dn is still held */
		dlpar_acquire_drc(*drc_index);
	}

out:
	/* drop the of_find_node_by_path() reference on all paths */
	of_node_put(dn);
	cpu_hotplug_driver_unlock();
	return rc ? rc : count;
}
/*
 * Install the DLPAR cpu probe/release handlers into the machine
 * description.  Always returns 0.
 */
static int __init pseries_dlpar_init(void)
{
ppc_md.cpu_probe = dlpar_cpu_probe;
ppc_md.cpu_release = dlpar_cpu_release;
return 0;
}
/* registered as a device initcall for the pseries machine */
machine_device_initcall(pseries, pseries_dlpar_init);
#endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */
+396
View File
@@ -0,0 +1,396 @@
/*
* Virtual Processor Dispatch Trace Log
*
* (C) Copyright IBM Corporation 2009
*
* Author: Jeremy Kerr <jk@ozlabs.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/debugfs.h>
#include <linux/spinlock.h>
#include <asm/smp.h>
#include <asm/uaccess.h>
#include <asm/firmware.h>
#include <asm/lppaca.h>
#include <asm/debug.h>
#include "plpar_wrappers.h"
/* Per-cpu state behind one debugfs dispatch-trace-log file. */
struct dtl {
/* log buffer; non-NULL only while a reader holds the file open */
struct dtl_entry *buf;
/* debugfs entry created by dtl_setup_file() */
struct dentry *file;
/* logical cpu number this log belongs to */
int cpu;
/* number of entries in buf, recorded when the buffer is installed */
int buf_entries;
/* index of the next entry to hand to the reader */
u64 last_idx;
/* taken by dtl_enable(), dtl_disable() and dtl_file_read() */
spinlock_t lock;
};
/* One struct dtl per cpu; initialised in dtl_init(). */
static DEFINE_PER_CPU(struct dtl, cpu_dtl);
/*
* Dispatch trace log event mask:
* 0x7: 0x1: voluntary virtual processor waits
* 0x2: time-slice preempts
* 0x4: virtual partition memory page faults
*/
static u8 dtl_event_mask = 0x7;
/*
* Size of per-cpu log buffers. Firmware requires that the buffer does
* not cross a 4k boundary.
*/
static int dtl_buf_entries = N_DISPATCH_LOG;
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
/*
 * Per-cpu ring that consume_dtle() copies entries into when the cpu
 * accounting code owns the hypervisor's DTL buffer.
 */
struct dtl_ring {
/* count of entries published so far; bumped last by consume_dtle() */
u64 write_index;
/* next slot to fill; NULL means logging into this ring is disabled */
struct dtl_entry *write_ptr;
/* first slot of the ring */
struct dtl_entry *buf;
/* one past the last slot of the ring */
struct dtl_entry *buf_end;
/* dtl_enable_mask value saved by dtl_start(), restored by dtl_stop() */
u8 saved_dtl_mask;
};
static DEFINE_PER_CPU(struct dtl_ring, dtl_rings);
/* number of started rings; dtl_consumer is cleared when it drops to zero */
static atomic_t dtl_count;
/*
* The cpu accounting code controls the DTL ring buffer, and we get
* given entries as they are processed.
*/
/*
 * Copy one dispatch trace log entry into this cpu's ring.
 *
 * @dtle:  entry handed to us by the accounting code
 * @index: that entry's index in the hypervisor log
 */
static void consume_dtle(struct dtl_entry *dtle, u64 index)
{
struct dtl_ring *dtlr = &__get_cpu_var(dtl_rings);
struct dtl_entry *wp = dtlr->write_ptr;
struct lppaca *vpa = local_paca->lppaca_ptr;
/* no write pointer: logging into our ring is not enabled */
if (!wp)
return;
/* copy first, then validate: the slot is only published further down */
*wp = *dtle;
barrier();
/* check for hypervisor ring buffer overflow, ignore this entry if so */
if (index + N_DISPATCH_LOG < vpa->dtl_idx)
return;
/* advance, wrapping back to the start of the ring */
++wp;
if (wp == dtlr->buf_end)
wp = dtlr->buf;
dtlr->write_ptr = wp;
/* incrementing write_index makes the new entry visible */
smp_wmb();
++dtlr->write_index;
}
/*
 * Start logging for dtl->cpu (CONFIG_VIRT_CPU_ACCOUNTING variant): point
 * the per-cpu ring at dtl->buf, enable the requested events and install
 * consume_dtle() as the consumer.  Always returns 0.
 */
static int dtl_start(struct dtl *dtl)
{
struct dtl_ring *dtlr = &per_cpu(dtl_rings, dtl->cpu);
dtlr->buf = dtl->buf;
dtlr->buf_end = dtl->buf + dtl->buf_entries;
dtlr->write_index = 0;
/* setting write_ptr enables logging into our buffer */
smp_wmb();
dtlr->write_ptr = dtl->buf;
/* enable event logging */
/* remember the previous mask so dtl_stop() can put it back */
dtlr->saved_dtl_mask = lppaca_of(dtl->cpu).dtl_enable_mask;
lppaca_of(dtl->cpu).dtl_enable_mask |= dtl_event_mask;
dtl_consumer = consume_dtle;
atomic_inc(&dtl_count);
return 0;
}
/*
 * Stop logging for dtl->cpu (CONFIG_VIRT_CPU_ACCOUNTING variant).
 */
static void dtl_stop(struct dtl *dtl)
{
struct dtl_ring *dtlr = &per_cpu(dtl_rings, dtl->cpu);
/* clearing write_ptr first makes consume_dtle() bail out early */
dtlr->write_ptr = NULL;
smp_wmb();
dtlr->buf = NULL;
/* restore dtl_enable_mask */
lppaca_of(dtl->cpu).dtl_enable_mask = dtlr->saved_dtl_mask;
/* last active ring gone: remove the consumer hook */
if (atomic_dec_and_test(&dtl_count))
dtl_consumer = NULL;
}
static u64 dtl_current_index(struct dtl *dtl)
{
return per_cpu(dtl_rings, dtl->cpu).write_index;
}
#else /* CONFIG_VIRT_CPU_ACCOUNTING */
/*
 * Start logging for dtl->cpu (variant without CONFIG_VIRT_CPU_ACCOUNTING):
 * register dtl->buf with the hypervisor and enable the requested events.
 *
 * Returns 0 on success or -EIO if registration fails.
 */
static int dtl_start(struct dtl *dtl)
{
unsigned long addr;
int ret, hwcpu;
/* Register our dtl buffer with the hypervisor. The HV expects the
* buffer size to be passed in the second word of the buffer */
((u32 *)dtl->buf)[1] = DISPATCH_LOG_BYTES;
hwcpu = get_hard_smp_processor_id(dtl->cpu);
/* registration is done with the buffer's physical address */
addr = __pa(dtl->buf);
ret = register_dtl(hwcpu, addr);
if (ret) {
printk(KERN_WARNING "%s: DTL registration for cpu %d (hw %d) "
"failed with %d\n", __func__, dtl->cpu, hwcpu, ret);
return -EIO;
}
/* set our initial buffer indices */
lppaca_of(dtl->cpu).dtl_idx = 0;
/* ensure that our updates to the lppaca fields have occurred before
* we actually enable the logging */
smp_wmb();
/* enable event logging */
lppaca_of(dtl->cpu).dtl_enable_mask = dtl_event_mask;
return 0;
}
/*
 * Stop logging for dtl->cpu (variant without CONFIG_VIRT_CPU_ACCOUNTING):
 * clear the event mask, then unregister the buffer from the hypervisor.
 */
static void dtl_stop(struct dtl *dtl)
{
	int hw_id;

	lppaca_of(dtl->cpu).dtl_enable_mask = 0x0;

	hw_id = get_hard_smp_processor_id(dtl->cpu);
	unregister_dtl(hw_id);
}
/* Current dtl_idx value from the lppaca of dtl->cpu. */
static u64 dtl_current_index(struct dtl *dtl)
{
	u64 idx = lppaca_of(dtl->cpu).dtl_idx;

	return idx;
}
#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
/*
 * Allocate a log buffer for @dtl and start logging into it.
 *
 * Only one reader is allowed at a time.  The buffer is allocated before
 * taking the lock, so the "already in use" test is repeated under the lock
 * and the fresh buffer is discarded if the race was lost.
 *
 * Returns 0 on success, -ENOMEM if there is no cache or the allocation
 * fails, -EBUSY if a buffer is already installed, or the dtl_start() error.
 */
static int dtl_enable(struct dtl *dtl)
{
	struct dtl_entry *new_buf;
	long int entries;
	long int err = -EBUSY;

	if (!dtl_cache)
		return -ENOMEM;

	/* only allow one reader */
	if (dtl->buf)
		return -EBUSY;

	entries = dtl_buf_entries;
	new_buf = kmem_cache_alloc_node(dtl_cache, GFP_KERNEL,
					cpu_to_node(dtl->cpu));
	if (!new_buf) {
		printk(KERN_WARNING "%s: buffer alloc failed for cpu %d\n",
		       __func__, dtl->cpu);
		return -ENOMEM;
	}

	spin_lock(&dtl->lock);
	if (!dtl->buf) {
		/* keep the allocation size around for use during read */
		dtl->buf_entries = entries;
		dtl->buf = new_buf;
		dtl->last_idx = 0;
		err = dtl_start(dtl);
		if (err)
			dtl->buf = NULL;
	}
	spin_unlock(&dtl->lock);

	if (!err)
		return 0;

	/* lost the race or failed to start: the buffer was never published */
	kmem_cache_free(dtl_cache, new_buf);
	return err;
}
/*
 * Stop logging for @dtl and free its buffer, all under dtl->lock.
 */
static void dtl_disable(struct dtl *dtl)
{
spin_lock(&dtl->lock);
/* stop logging before the buffer goes back to the cache */
dtl_stop(dtl);
kmem_cache_free(dtl_cache, dtl->buf);
/* a NULL buf lets dtl_enable() accept the next reader */
dtl->buf = NULL;
dtl->buf_entries = 0;
spin_unlock(&dtl->lock);
}
/* file interface */
/*
 * open() handler: enable logging for the cpu behind this inode and, on
 * success, stash the struct dtl in the file for the read handler.
 * Returns 0 or the dtl_enable() error.
 */
static int dtl_file_open(struct inode *inode, struct file *filp)
{
	struct dtl *dtl = inode->i_private;
	int err = dtl_enable(dtl);

	if (!err)
		filp->private_data = dtl;
	return err;
}
/* release() handler: stop logging and free the buffer.  Always returns 0. */
static int dtl_file_release(struct inode *inode, struct file *filp)
{
	dtl_disable(inode->i_private);

	return 0;
}
/*
 * read() handler: copy whole log entries to user space.
 *
 * @len must be a multiple of sizeof(struct dtl_entry), otherwise -EINVAL
 * is returned.  Returns the number of bytes copied, 0 if no new entries
 * are available, or -EFAULT if the copy to user space fails.
 */
static ssize_t dtl_file_read(struct file *filp, char __user *buf, size_t len,
loff_t *pos)
{
long int rc, n_read, n_req, read_size;
struct dtl *dtl;
u64 cur_idx, last_idx, i;
if ((len % sizeof(struct dtl_entry)) != 0)
return -EINVAL;
dtl = filp->private_data;
/* requested number of entries to read */
n_req = len / sizeof(struct dtl_entry);
/* actual number of entries read */
n_read = 0;
spin_lock(&dtl->lock);
cur_idx = dtl_current_index(dtl);
last_idx = dtl->last_idx;
/* reader is a full buffer or more behind: skip ahead to the oldest entries still kept */
if (last_idx + dtl->buf_entries <= cur_idx)
last_idx = cur_idx - dtl->buf_entries + 1;
/* never hand out more than what has been logged so far */
if (last_idx + n_req > cur_idx)
n_req = cur_idx - last_idx;
/* claim the range while still holding the lock; copying happens after unlock */
if (n_req > 0)
dtl->last_idx = last_idx + n_req;
spin_unlock(&dtl->lock);
if (n_req <= 0)
return 0;
/* position of the first entry inside the circular buffer */
i = last_idx % dtl->buf_entries;
/* read the tail of the buffer if we've wrapped */
if (i + n_req > dtl->buf_entries) {
read_size = dtl->buf_entries - i;
rc = copy_to_user(buf, &dtl->buf[i],
read_size * sizeof(struct dtl_entry));
if (rc)
return -EFAULT;
i = 0;
n_req -= read_size;
n_read += read_size;
buf += read_size * sizeof(struct dtl_entry);
}
/* .. and now the head */
rc = copy_to_user(buf, &dtl->buf[i], n_req * sizeof(struct dtl_entry));
if (rc)
return -EFAULT;
n_read += n_req;
return n_read * sizeof(struct dtl_entry);
}
/* File operations for the per-cpu debugfs files; seeking is not supported. */
static const struct file_operations dtl_fops = {
.open = dtl_file_open,
.release = dtl_file_release,
.read = dtl_file_read,
.llseek = no_llseek,
};
/* debugfs directory that holds all dtl files; created in dtl_init() */
static struct dentry *dtl_dir;
/*
 * Create the debugfs file "cpu-<n>" for @dtl inside dtl_dir.
 *
 * The name is formatted with snprintf() bounded by the buffer size, so an
 * unexpectedly wide cpu number is truncated rather than written past the
 * end of the stack buffer.
 *
 * Returns 0 on success or -ENOMEM if the file could not be created.
 */
static int dtl_setup_file(struct dtl *dtl)
{
	char name[10];

	snprintf(name, sizeof(name), "cpu-%d", dtl->cpu);

	dtl->file = debugfs_create_file(name, 0400, dtl_dir, dtl, &dtl_fops);
	if (!dtl->file)
		return -ENOMEM;

	return 0;
}
/*
 * Create the "dtl" debugfs directory, its two control files and one log
 * file per possible cpu.
 *
 * Returns 0 on success, -ENODEV if the firmware lacks FW_FEATURE_SPLPAR,
 * -ENOMEM if the directory or control files cannot be created, or the
 * dtl_setup_file() error.
 */
static int dtl_init(void)
{
struct dentry *event_mask_file, *buf_entries_file;
int rc, i;
if (!firmware_has_feature(FW_FEATURE_SPLPAR))
return -ENODEV;
/* set up common debugfs structure */
rc = -ENOMEM;
dtl_dir = debugfs_create_dir("dtl", powerpc_debugfs_root);
if (!dtl_dir) {
printk(KERN_WARNING "%s: can't create dtl root dir\n",
__func__);
goto err;
}
event_mask_file = debugfs_create_x8("dtl_event_mask", 0600,
dtl_dir, &dtl_event_mask);
/*
 * NOTE(review): dtl_buf_entries is declared as int in this file but is
 * passed to debugfs_create_u32() - confirm the pointer types agree.
 */
buf_entries_file = debugfs_create_u32("dtl_buf_entries", 0400,
dtl_dir, &dtl_buf_entries);
if (!event_mask_file || !buf_entries_file) {
printk(KERN_WARNING "%s: can't create dtl files\n", __func__);
goto err_remove_dir;
}
/* set up the per-cpu log structures */
for_each_possible_cpu(i) {
struct dtl *dtl = &per_cpu(cpu_dtl, i);
spin_lock_init(&dtl->lock);
dtl->cpu = i;
rc = dtl_setup_file(dtl);
if (rc)
goto err_remove_dir;
}
return 0;
/* removing the directory recursively also removes every file made so far */
err_remove_dir:
debugfs_remove_recursive(dtl_dir);
err:
return rc;
}
arch_initcall(dtl_init);
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,318 @@
/*
* PCI address cache; allows the lookup of PCI devices based on I/O address
*
* Copyright IBM Corporation 2004
* Copyright Linas Vepstas <linas@austin.ibm.com> 2004
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/list.h>
#include <linux/pci.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/atomic.h>
#include <asm/pci-bridge.h>
#include <asm/ppc-pci.h>
/**
* The pci address cache subsystem. This subsystem places
* PCI device address resources into a red-black tree, sorted
* according to the address range, so that given only an i/o
* address, the corresponding PCI device can be **quickly**
* found. It is safe to perform an address lookup in an interrupt
* context; this ability is an important feature.
*
* Currently, the only customer of this code is the EEH subsystem;
* thus, this code has been somewhat tailored to suit EEH better.
* In particular, the cache does *not* hold the addresses of devices
* for which EEH is not enabled.
*
* (Implementation Note: The RB tree seems to be better/faster
* than any hash algo I could think of for this problem, even
* with the penalty of slow pointer chases for d-cache misses).
*/
/* One cached address range, linked into the red-black tree. */
struct pci_io_addr_range {
/* tree linkage */
struct rb_node rb_node;
/* lowest address of the range (inclusive) */
unsigned long addr_lo;
/* highest address of the range (inclusive) */
unsigned long addr_hi;
/* owning device; a reference is held while the range is in the tree */
struct pci_dev *pcidev;
/* resource flags as passed to pci_addr_cache_insert() */
unsigned int flags;
};
/* The single address cache: the tree root plus the lock guarding it. */
static struct pci_io_addr_cache {
struct rb_root rb_root;
/* taken with interrupts disabled by the public lookup/insert/remove calls */
spinlock_t piar_lock;
} pci_io_addr_cache_root;
/*
 * Walk the tree looking for the range that contains @addr.  On a hit a
 * reference is taken on the device before it is returned; NULL means no
 * cached range covers the address.
 */
static inline struct pci_dev *__pci_addr_cache_get_device(unsigned long addr)
{
	struct rb_node *node = pci_io_addr_cache_root.rb_root.rb_node;

	while (node) {
		struct pci_io_addr_range *range =
			rb_entry(node, struct pci_io_addr_range, rb_node);

		if (addr < range->addr_lo) {
			node = node->rb_left;
			continue;
		}
		if (addr > range->addr_hi) {
			node = node->rb_right;
			continue;
		}

		/* addr_lo <= addr <= addr_hi */
		pci_dev_get(range->pcidev);
		return range->pcidev;
	}

	return NULL;
}
/**
* pci_addr_cache_get_device - Get device, given only address
* @addr: mmio (PIO) phys address or i/o port number
*
* Given an mmio phys address, or a port number, find a pci device
* that implements this address. Be sure to pci_dev_put the device
* when finished. I/O port numbers are assumed to be offset
* from zero (that is, they do *not* have pci_io_addr added in).
* It is safe to call this function within an interrupt.
*/
struct pci_dev *pci_addr_cache_get_device(unsigned long addr)
{
	struct pci_dev *found;
	unsigned long irq_flags;

	/* the lookup itself runs under the cache lock with irqs off */
	spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, irq_flags);
	found = __pci_addr_cache_get_device(addr);
	spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, irq_flags);

	return found;
}
#ifdef DEBUG
/*
* Handy-dandy debug print routine, does nothing more
* than print out the contents of our addr cache.
*/
static void pci_addr_cache_print(struct pci_io_addr_cache *cache)
{
struct rb_node *n;
int cnt = 0;
n = rb_first(&cache->rb_root);
while (n) {
struct pci_io_addr_range *piar;
piar = rb_entry(n, struct pci_io_addr_range, rb_node);
printk(KERN_DEBUG "PCI: %s addr range %d [%lx-%lx]: %s\n",
(piar->flags & IORESOURCE_IO) ? "i/o" : "mem", cnt,
piar->addr_lo, piar->addr_hi, pci_name(piar->pcidev));
cnt++;
n = rb_next(n);
}
}
#endif
/* Insert address range into the rb tree. */
static struct pci_io_addr_range *
pci_addr_cache_insert(struct pci_dev *dev, unsigned long alo,
unsigned long ahi, unsigned int flags)
{
struct rb_node **p = &pci_io_addr_cache_root.rb_root.rb_node;
struct rb_node *parent = NULL;
struct pci_io_addr_range *piar;
/* Walk tree, find a place to insert into tree */
while (*p) {
parent = *p;
piar = rb_entry(parent, struct pci_io_addr_range, rb_node);
if (ahi < piar->addr_lo) {
p = &parent->rb_left;
} else if (alo > piar->addr_hi) {
p = &parent->rb_right;
} else {
if (dev != piar->pcidev ||
alo != piar->addr_lo || ahi != piar->addr_hi) {
printk(KERN_WARNING "PIAR: overlapping address range\n");
}
return piar;
}
}
piar = kmalloc(sizeof(struct pci_io_addr_range), GFP_ATOMIC);
if (!piar)
return NULL;
pci_dev_get(dev);
piar->addr_lo = alo;
piar->addr_hi = ahi;
piar->pcidev = dev;
piar->flags = flags;
#ifdef DEBUG
printk(KERN_DEBUG "PIAR: insert range=[%lx:%lx] dev=%s\n",
alo, ahi, pci_name(dev));
#endif
rb_link_node(&piar->rb_node, parent, p);
rb_insert_color(&piar->rb_node, &pci_io_addr_cache_root.rb_root);
return piar;
}
/*
 * Add every I/O and memory resource of @dev to the address cache.
 * Devices without an OF node or EEH device are reported and skipped, as
 * are devices for which EEH is unsupported or marked no-check.
 */
static void __pci_addr_cache_insert_device(struct pci_dev *dev)
{
	struct device_node *node = pci_device_to_OF_node(dev);
	struct eeh_dev *edev;
	int bar;

	if (!node) {
		printk(KERN_WARNING "PCI: no pci dn found for dev=%s\n", pci_name(dev));
		return;
	}

	edev = of_node_to_eeh_dev(node);
	if (!edev) {
		pr_warning("PCI: no EEH dev found for dn=%s\n",
			   node->full_name);
		return;
	}

	/* Skip any devices for which EEH is not enabled. */
	if (!(edev->mode & EEH_MODE_SUPPORTED) ||
	    edev->mode & EEH_MODE_NOCHECK) {
#ifdef DEBUG
		pr_info("PCI: skip building address cache for=%s - %s\n",
			pci_name(dev), node->full_name);
#endif
		return;
	}

	/* Walk the device's resources and poke each usable one into the tree */
	for (bar = 0; bar < DEVICE_COUNT_RESOURCE; bar++) {
		unsigned int res_flags = pci_resource_flags(dev, bar);
		unsigned long lo, hi;

		/* only bus addresses are of interest, not dma or other stuff */
		if (!(res_flags & (IORESOURCE_IO | IORESOURCE_MEM)))
			continue;

		lo = pci_resource_start(dev, bar);
		hi = pci_resource_end(dev, bar);

		/* all-zeroes and all-ones bounds are not cached */
		if (lo == 0 || ~lo == 0 || hi == 0 || ~hi == 0)
			continue;

		pci_addr_cache_insert(dev, lo, hi, res_flags);
	}
}
/**
* pci_addr_cache_insert_device - Add a device to the address cache
* @dev: PCI device whose I/O addresses we are interested in.
*
* In order to support the fast lookup of devices based on addresses,
* we maintain a cache of devices that can be quickly searched.
* This routine adds a device to that cache.
*/
void pci_addr_cache_insert_device(struct pci_dev *dev)
{
	unsigned long irq_flags;
	int is_bridge = (dev->class >> 16) == PCI_BASE_CLASS_BRIDGE;

	/* PCI bridges are never cached */
	if (is_bridge)
		return;

	spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, irq_flags);
	__pci_addr_cache_insert_device(dev);
	spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, irq_flags);
}
/*
 * Remove every cached range that belongs to @dev, dropping the device
 * reference held by each one.
 */
static inline void __pci_addr_cache_remove_device(struct pci_dev *dev)
{
struct rb_node *n;
/* the walk starts over from the first node after every removal */
restart:
n = rb_first(&pci_io_addr_cache_root.rb_root);
while (n) {
struct pci_io_addr_range *piar;
piar = rb_entry(n, struct pci_io_addr_range, rb_node);
if (piar->pcidev == dev) {
rb_erase(n, &pci_io_addr_cache_root.rb_root);
/* balances the pci_dev_get() done when the range was inserted */
pci_dev_put(piar->pcidev);
kfree(piar);
goto restart;
}
n = rb_next(n);
}
}
/**
* pci_addr_cache_remove_device - remove pci device from addr cache
* @dev: device to remove
*
* Remove a device from the addr-cache tree.
* This is potentially expensive, since it will walk
* the tree multiple times (once per resource).
* But so what; device removal doesn't need to be that fast.
*/
void pci_addr_cache_remove_device(struct pci_dev *dev)
{
	unsigned long irq_flags;

	/* removal runs under the cache lock with irqs off */
	spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, irq_flags);
	__pci_addr_cache_remove_device(dev);
	spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, irq_flags);
}
/**
* pci_addr_cache_build - Build a cache of I/O addresses
*
* Build a cache of pci i/o addresses. This cache will be used to
* find the pci device that corresponds to a given address.
* This routine scans all pci busses to build the cache.
* Must be run late in boot process, after the pci controllers
* have been scanned for devices (after all device resources are known).
*/
void __init pci_addr_cache_build(void)
{
	struct pci_dev *pdev = NULL;

	spin_lock_init(&pci_io_addr_cache_root.piar_lock);

	for_each_pci_dev(pdev) {
		struct device_node *node;
		struct eeh_dev *edev;

		pci_addr_cache_insert_device(pdev);

		/* bind the PCI device to its EEH device, when it has one */
		node = pci_device_to_OF_node(pdev);
		edev = node ? of_node_to_eeh_dev(node) : NULL;
		if (!edev)
			continue;

		pci_dev_get(pdev); /* matching put is in eeh_remove_device() */
		pdev->dev.archdata.edev = edev;
		edev->pdev = pdev;

		eeh_sysfs_add_device(pdev);
	}

#ifdef DEBUG
	/* Verify the tree built above by echoing back the list of addrs. */
	pci_addr_cache_print(&pci_io_addr_cache_root);
#endif
}
@@ -0,0 +1,102 @@
/*
* This file implements dynamic creation of EEH devices, which are bound
* to an OF node and a PCI device simultaneously. The EEH devices provide
* fundamental information that the EEH core components need to work properly.
* Besides, we have to support multiple situations where dynamic creation of
* an EEH device is required:
*
* 1) Before PCI enumeration starts, we need to create EEH devices according to
* the PCI sensitive OF nodes.
* 2) When PCI enumeration is done, we need to do the binding between the PCI
* device and the associated EEH device.
* 3) DR (Dynamic Reconfiguration) would create a PCI sensitive OF node. The EEH
* device will be created when a PCI sensitive OF node is detected from DR.
* 4) PCI hotplug needs to redo the binding between PCI device and EEH device. If
* a PHB is newly inserted, we also need to create EEH devices accordingly.
*
* Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2012.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/export.h>
#include <linux/gfp.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/pci.h>
#include <linux/string.h>
#include <asm/pci-bridge.h>
#include <asm/ppc-pci.h>
/**
* eeh_dev_init - Create EEH device according to OF node
* @dn: device node
* @data: PHB
*
* It will create an EEH device according to the given OF node. The function
* might be called by PCI enumeration, DR, PHB hotplug.
*/
void * __devinit eeh_dev_init(struct device_node *dn, void *data)
{
/* data is the PHB that owns this node */
struct pci_controller *phb = data;
struct eeh_dev *edev;
/* Allocate EEH device */
edev = zalloc_maybe_bootmem(sizeof(*edev), GFP_KERNEL);
if (!edev) {
pr_warning("%s: out of memory\n", __func__);
return NULL;
}
/* Associate EEH device with OF node */
PCI_DN(dn)->edev = edev;
edev->dn = dn;
edev->phb = phb;
/*
 * NULL is returned on success as well as on allocation failure;
 * presumably so that traverse_pci_devices() keeps walking - confirm
 * against that function's contract.
 */
return NULL;
}
/**
* eeh_dev_phb_init_dynamic - Create EEH devices for devices included in PHB
* @phb: PHB
*
* Scan the PHB OF node and its child association, then create the
* EEH devices accordingly
*/
void __devinit eeh_dev_phb_init_dynamic(struct pci_controller *phb)
{
	/* the PHB's own OF node gets an EEH device first ... */
	eeh_dev_init(phb->dn, phb);

	/* ... followed by every OF node below it */
	traverse_pci_devices(phb->dn, eeh_dev_init, phb);
}
/**
* eeh_dev_phb_init - Create EEH devices for devices included in existing PHBs
*
* Scan all the existing PHBs and create EEH devices for their OF
* nodes and their children OF nodes
*/
void __init eeh_dev_phb_init(void)
{
struct pci_controller *phb, *tmp;
/* visit every PHB on hose_list and create the EEH devices beneath it */
list_for_each_entry_safe(phb, tmp, &hose_list, list_node)
eeh_dev_phb_init_dynamic(phb);
}
@@ -0,0 +1,538 @@
/*
* PCI Error Recovery Driver for RPA-compliant PPC64 platform.
* Copyright IBM Corp. 2004 2005
* Copyright Linas Vepstas <linas@linas.org> 2004, 2005
*
* All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or (at
* your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
* NON INFRINGEMENT. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*
* Send comments and feedback to Linas Vepstas <linas@austin.ibm.com>
*/
#include <linux/delay.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/pci.h>
#include <asm/eeh.h>
#include <asm/eeh_event.h>
#include <asm/ppc-pci.h>
#include <asm/pci-bridge.h>
#include <asm/prom.h>
#include <asm/rtas.h>
/**
* eeh_pcid_name - Retrieve name of PCI device driver
* @pdev: PCI device
*
* This routine is used to retrieve the name of PCI device driver
* if that's valid.
*/
static inline const char *eeh_pcid_name(struct pci_dev *pdev)
{
	/* no device, or no driver bound to it: report an empty name */
	if (!pdev || !pdev->dev.driver)
		return "";

	return pdev->dev.driver->name;
}
#if 0
/*
 * Debug helper, compiled out by the surrounding "#if 0": recursively print
 * a pci_dn and its children, indented by @dent spaces per level.
 */
static void print_device_node_tree(struct pci_dn *pdn, int dent)
{
int i;
struct device_node *pc;
if (!pdn)
return;
for (i = 0; i < dent; i++)
printk(" ");
printk("dn=%s mode=%x \tcfg_addr=%x pe_addr=%x \tfull=%s\n",
pdn->node->name, pdn->eeh_mode, pdn->eeh_config_addr,
pdn->eeh_pe_config_addr, pdn->node->full_name);
/* children are printed three columns further in */
dent += 3;
pc = pdn->node->child;
while (pc) {
print_device_node_tree(PCI_DN(pc), dent);
pc = pc->sibling;
}
}
#endif
/**
* eeh_disable_irq - Disable interrupt for the recovering device
* @dev: PCI device
*
* This routine must be called when reporting temporary or permanent
* error to the particular PCI device to disable interrupt of that
* device. If the device has enabled MSI or MSI-X interrupt, we needn't
* do real work because EEH should freeze DMA transfers for those PCI
* devices encountering EEH errors, which includes MSI or MSI-X.
*/
static void eeh_disable_irq(struct pci_dev *dev)
{
	struct eeh_dev *edev = pci_dev_to_eeh_dev(dev);
	int uses_msi = dev->msi_enabled || dev->msix_enabled;

	/*
	 * MSI and MSI-X interrupts are left alone: they are effectively
	 * disabled by the DMA Stopped state when an EEH error occurs.
	 * A line with no handler attached needs no work either.
	 */
	if (uses_msi || !irq_has_action(dev->irq))
		return;

	/* remember that we did it, so eeh_enable_irq() can undo it */
	edev->mode |= EEH_MODE_IRQ_DISABLED;
	disable_irq_nosync(dev->irq);
}
/**
* eeh_enable_irq - Enable interrupt for the recovering device
* @dev: PCI device
*
* This routine must be called to enable interrupt while failed
* device could be resumed.
*/
static void eeh_enable_irq(struct pci_dev *dev)
{
	struct eeh_dev *edev = pci_dev_to_eeh_dev(dev);

	/* only re-enable a line that eeh_disable_irq() turned off */
	if (!(edev->mode & EEH_MODE_IRQ_DISABLED))
		return;

	edev->mode &= ~EEH_MODE_IRQ_DISABLED;
	enable_irq(dev->irq);
}
/**
* eeh_report_error - Report pci error to each device driver
* @dev: PCI device
* @userdata: return value
*
* Report an EEH error to each device driver, collect up and
* merge the device driver responses. Cumulative response
* passed back in "userdata".
*/
static int eeh_report_error(struct pci_dev *dev, void *userdata)
{
enum pci_ers_result rc, *res = userdata;
struct pci_driver *driver = dev->driver;
dev->error_state = pci_channel_io_frozen;
if (!driver)
return 0;
eeh_disable_irq(dev);
if (!driver->err_handler ||
!driver->err_handler->error_detected)
return 0;
rc = driver->err_handler->error_detected(dev, pci_channel_io_frozen);
/* A driver that needs a reset trumps all others */
if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
if (*res == PCI_ERS_RESULT_NONE) *res = rc;
return 0;
}
/**
* eeh_report_mmio_enabled - Tell drivers that MMIO has been enabled
* @dev: PCI device
* @userdata: return value
*
* Tells each device driver that IO ports, MMIO and config space I/O
* are now enabled. Collects up and merges the device driver responses.
* Cumulative response passed back in "userdata".
*/
static int eeh_report_mmio_enabled(struct pci_dev *dev, void *userdata)
{
enum pci_ers_result rc, *res = userdata;
struct pci_driver *driver = dev->driver;
if (!driver ||
!driver->err_handler ||
!driver->err_handler->mmio_enabled)
return 0;
rc = driver->err_handler->mmio_enabled(dev);
/* A driver that needs a reset trumps all others */
if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
if (*res == PCI_ERS_RESULT_NONE) *res = rc;
return 0;
}
/**
* eeh_report_reset - Tell device that slot has been reset
* @dev: PCI device
* @userdata: return value
*
* This routine must be called while EEH tries to reset particular
* PCI device so that the associated PCI device driver could take
* some actions, usually to save data the driver needs so that the
* driver can work again while the device is recovered.
*/
static int eeh_report_reset(struct pci_dev *dev, void *userdata)
{
/* userdata points at the cumulative result shared by all devices */
enum pci_ers_result rc, *res = userdata;
struct pci_driver *driver = dev->driver;
if (!driver)
return 0;
/* the channel is usable again, so the interrupt can be turned back on */
dev->error_state = pci_channel_io_normal;
eeh_enable_irq(dev);
if (!driver->err_handler ||
!driver->err_handler->slot_reset)
return 0;
rc = driver->err_handler->slot_reset(dev);
/* an empty or "recovered" cumulative result is replaced by this answer */
if ((*res == PCI_ERS_RESULT_NONE) ||
(*res == PCI_ERS_RESULT_RECOVERED)) *res = rc;
/* a request for another reset overrides an earlier "disconnect" */
if (*res == PCI_ERS_RESULT_DISCONNECT &&
rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
return 0;
}
/**
* eeh_report_resume - Tell device to resume normal operations
* @dev: PCI device
* @userdata: return value
*
* This routine must be called to notify the device driver that it
* could resume so that the device driver can do some initialization
* to make the recovered device work again.
*/
static int eeh_report_resume(struct pci_dev *dev, void *userdata)
{
	struct pci_driver *drv = dev->driver;

	dev->error_state = pci_channel_io_normal;

	if (!drv)
		return 0;

	eeh_enable_irq(dev);

	/* the resume callback is optional */
	if (drv->err_handler && drv->err_handler->resume)
		drv->err_handler->resume(dev);

	return 0;
}
/**
* eeh_report_failure - Tell device driver that device is dead.
* @dev: PCI device
* @userdata: return value
*
* This informs the device driver that the device is permanently
* dead, and that no further recovery attempts will be made on it.
*/
static int eeh_report_failure(struct pci_dev *dev, void *userdata)
{
	struct pci_driver *drv = dev->driver;

	dev->error_state = pci_channel_io_perm_failure;

	if (!drv)
		return 0;

	eeh_disable_irq(dev);

	/* the driver's answer is ignored here */
	if (drv->err_handler && drv->err_handler->error_detected)
		drv->err_handler->error_detected(dev,
						 pci_channel_io_perm_failure);

	return 0;
}
/**
* eeh_reset_device - Perform actual reset of a pci slot
* @edev: PE associated EEH device
* @bus: PCI bus corresponding to the isolated slot
*
* This routine must be called to do reset on the indicated PE.
* During the reset, udev might be invoked because those affected
* PCI devices will be removed and then added.
*/
static int eeh_reset_device(struct eeh_dev *edev, struct pci_bus *bus)
{
struct device_node *dn;
int cnt, rc;
/* pcibios will clear the counter; save the value */
cnt = edev->freeze_count;
/* a NULL bus means the PCI devices are left in place across the reset */
if (bus)
pcibios_remove_pci_devices(bus);
/* Reset the pci controller. (Asserts RST#; resets config space).
* Reconfigure bridges and devices. Don't try to bring the system
* up if the reset failed for some reason.
*/
rc = eeh_reset_pe(edev);
if (rc)
return rc;
/* Walk over all functions on this device. */
dn = eeh_dev_to_of_node(edev);
/* no bus for this node but the parent has an EEH device: start from the parent's first child */
if (!pcibios_find_pci_bus(dn) && of_node_to_eeh_dev(dn->parent))
dn = dn->parent->child;
while (dn) {
/* NOTE(review): pedev is dereferenced without a NULL check - confirm every sibling has an EEH device */
struct eeh_dev *pedev = of_node_to_eeh_dev(dn);
/* On Power4, always true because eeh_pe_config_addr=0 */
if (edev->pe_config_addr == pedev->pe_config_addr) {
eeh_ops->configure_bridge(dn);
eeh_restore_bars(pedev);
}
dn = dn->sibling;
}
/* Give the system 5 seconds to finish running the user-space
* hotplug shutdown scripts, e.g. ifdown for ethernet. Yes,
* this is a hack, but if we don't do this, and try to bring
* the device up before the scripts have taken it down,
* potentially weird things happen.
*/
if (bus) {
ssleep(5);
pcibios_add_pci_devices(bus);
}
/* put back the freeze count saved at the top */
edev->freeze_count = cnt;
return 0;
}
/* The longest amount of time to wait for a pci device
* to come back on line, in seconds.
*/
#define MAX_WAIT_FOR_RECOVERY 150
/**
* handle_eeh_events - Reset a PCI device after hard lockup.
* @event: EEH event
*
* While PHB detects address or data parity errors on particular PCI
* slot, the associated PE will be frozen. Besides, DMA's occurring
* to wild addresses (which usually happen due to bugs in device
* drivers or in PCI adapter firmware) can cause EEH error. #SERR,
* #PERR or other misc PCI-related errors also can trigger EEH errors.
*
* Recovery process consists of unplugging the device driver (which
* generated hotplug events to userspace), then issuing a PCI #RST to
* the device, then reconfiguring the PCI config space for all bridges
* & devices under this slot, and then finally restarting the device
* drivers (which cause a second set of hotplug events to go out to
* userspace).
*/
struct eeh_dev *handle_eeh_events(struct eeh_event *event)
{
struct device_node *frozen_dn;
struct eeh_dev *frozen_edev;
struct pci_bus *frozen_bus;
int rc = 0;
enum pci_ers_result result = PCI_ERS_RESULT_NONE;
const char *location, *pci_str, *drv_str, *bus_pci_str, *bus_drv_str;
frozen_dn = eeh_find_device_pe(eeh_dev_to_of_node(event->edev));
if (!frozen_dn) {
location = of_get_property(eeh_dev_to_of_node(event->edev), "ibm,loc-code", NULL);
location = location ? location : "unknown";
printk(KERN_ERR "EEH: Error: Cannot find partition endpoint "
"for location=%s pci addr=%s\n",
location, eeh_pci_name(eeh_dev_to_pci_dev(event->edev)));
return NULL;
}
frozen_bus = pcibios_find_pci_bus(frozen_dn);
location = of_get_property(frozen_dn, "ibm,loc-code", NULL);
location = location ? location : "unknown";
/* There are two different styles for coming up with the PE.
* In the old style, it was the highest EEH-capable device
* which was always an EADS pci bridge. In the new style,
* there might not be any EADS bridges, and even when there are,
* the firmware marks them as "EEH incapable". So another
* two-step is needed to find the pci bus..
*/
if (!frozen_bus)
frozen_bus = pcibios_find_pci_bus(frozen_dn->parent);
if (!frozen_bus) {
printk(KERN_ERR "EEH: Cannot find PCI bus "
"for location=%s dn=%s\n",
location, frozen_dn->full_name);
return NULL;
}
frozen_edev = of_node_to_eeh_dev(frozen_dn);
frozen_edev->freeze_count++;
pci_str = eeh_pci_name(eeh_dev_to_pci_dev(event->edev));
drv_str = eeh_pcid_name(eeh_dev_to_pci_dev(event->edev));
if (frozen_edev->freeze_count > EEH_MAX_ALLOWED_FREEZES)
goto excess_failures;
printk(KERN_WARNING
"EEH: This PCI device has failed %d times in the last hour:\n",
frozen_edev->freeze_count);
if (frozen_edev->pdev) {
bus_pci_str = pci_name(frozen_edev->pdev);
bus_drv_str = eeh_pcid_name(frozen_edev->pdev);
printk(KERN_WARNING
"EEH: Bus location=%s driver=%s pci addr=%s\n",
location, bus_drv_str, bus_pci_str);
}
printk(KERN_WARNING
"EEH: Device location=%s driver=%s pci addr=%s\n",
location, drv_str, pci_str);
/* Walk the various device drivers attached to this slot through
* a reset sequence, giving each an opportunity to do what it needs
* to accomplish the reset. Each child gets a report of the
* status ... if any child can't handle the reset, then the entire
* slot is dlpar removed and added.
*/
pci_walk_bus(frozen_bus, eeh_report_error, &result);
/* Get the current PCI slot state. This can take a long time,
* sometimes over 3 seconds for certain systems.
*/
rc = eeh_ops->wait_state(eeh_dev_to_of_node(frozen_edev), MAX_WAIT_FOR_RECOVERY*1000);
if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) {
printk(KERN_WARNING "EEH: Permanent failure\n");
goto hard_fail;
}
/* Since rtas may enable MMIO when posting the error log,
* don't post the error log until after all dev drivers
* have been informed.
*/
eeh_slot_error_detail(frozen_edev, EEH_LOG_TEMP);
/* If all device drivers were EEH-unaware, then shut
* down all of the device drivers, and hope they
* go down willingly, without panicing the system.
*/
if (result == PCI_ERS_RESULT_NONE) {
rc = eeh_reset_device(frozen_edev, frozen_bus);
if (rc) {
printk(KERN_WARNING "EEH: Unable to reset, rc=%d\n", rc);
goto hard_fail;
}
}
/* If all devices reported they can proceed, then re-enable MMIO */
if (result == PCI_ERS_RESULT_CAN_RECOVER) {
rc = eeh_pci_enable(frozen_edev, EEH_OPT_THAW_MMIO);
if (rc < 0)
goto hard_fail;
if (rc) {
result = PCI_ERS_RESULT_NEED_RESET;
} else {
result = PCI_ERS_RESULT_NONE;
pci_walk_bus(frozen_bus, eeh_report_mmio_enabled, &result);
}
}
/* If all devices reported they can proceed, then re-enable DMA */
if (result == PCI_ERS_RESULT_CAN_RECOVER) {
rc = eeh_pci_enable(frozen_edev, EEH_OPT_THAW_DMA);
if (rc < 0)
goto hard_fail;
if (rc)
result = PCI_ERS_RESULT_NEED_RESET;
else
result = PCI_ERS_RESULT_RECOVERED;
}
/* If any device has a hard failure, then shut off everything. */
if (result == PCI_ERS_RESULT_DISCONNECT) {
printk(KERN_WARNING "EEH: Device driver gave up\n");
goto hard_fail;
}
/* If any device called out for a reset, then reset the slot */
if (result == PCI_ERS_RESULT_NEED_RESET) {
rc = eeh_reset_device(frozen_edev, NULL);
if (rc) {
printk(KERN_WARNING "EEH: Cannot reset, rc=%d\n", rc);
goto hard_fail;
}
result = PCI_ERS_RESULT_NONE;
pci_walk_bus(frozen_bus, eeh_report_reset, &result);
}
/* All devices should claim they have recovered by now. */
if ((result != PCI_ERS_RESULT_RECOVERED) &&
(result != PCI_ERS_RESULT_NONE)) {
printk(KERN_WARNING "EEH: Not recovered\n");
goto hard_fail;
}
/* Tell all device drivers that they can resume operations */
pci_walk_bus(frozen_bus, eeh_report_resume, NULL);
return frozen_edev;
excess_failures:
/*
* About 90% of all real-life EEH failures in the field
* are due to poorly seated PCI cards. Only 10% or so are
* due to actual, failed cards.
*/
printk(KERN_ERR
"EEH: PCI device at location=%s driver=%s pci addr=%s\n"
"has failed %d times in the last hour "
"and has been permanently disabled.\n"
"Please try reseating this device or replacing it.\n",
location, drv_str, pci_str, frozen_edev->freeze_count);
goto perm_error;
hard_fail:
printk(KERN_ERR
"EEH: Unable to recover from failure of PCI device "
"at location=%s driver=%s pci addr=%s\n"
"Please try reseating this device or replacing it.\n",
location, drv_str, pci_str);
perm_error:
eeh_slot_error_detail(frozen_edev, EEH_LOG_PERM);
/* Notify all devices that they're about to go down. */
pci_walk_bus(frozen_bus, eeh_report_failure, NULL);
/* Shut down the device drivers for good. */
pcibios_remove_pci_devices(frozen_bus);
return NULL;
}
@@ -0,0 +1,159 @@
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* Copyright (c) 2005 Linas Vepstas <linas@linas.org>
*/
#include <linux/delay.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/sched.h>
#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <asm/eeh_event.h>
#include <asm/ppc-pci.h>
/** Overview:
* EEH error states may be detected within exception handlers;
* however, the recovery processing needs to occur asynchronously
* in a normal kernel context and not an interrupt context.
* This pair of routines creates an event and queues it onto a
* work-queue, where a worker thread can drive recovery.
*/
/* EEH event workqueue setup. */
/* Taken (irqsave) around the eeh_eventlist accesses in this file. */
static DEFINE_SPINLOCK(eeh_eventlist_lock);
/* Pending events: added by eeh_send_failure_event(), removed by eeh_event_handler(). */
LIST_HEAD(eeh_eventlist);
static void eeh_thread_launcher(struct work_struct *);
/* Work item scheduled after an event is queued; it runs eeh_thread_launcher(). */
DECLARE_WORK(eeh_event_wq, eeh_thread_launcher);
/* Serialize reset sequences for a given pci device */
DEFINE_MUTEX(eeh_event_mutex);
/**
 * eeh_event_handler - Dispatch EEH events.
 * @dummy - unused
 *
 * The detection of a frozen slot can occur inside an interrupt,
 * where it can be hard to do anything about it.  The goal of this
 * routine is to pull these detection events out of the context
 * of the interrupt handler, and re-dispatch them for processing
 * at a later time in a normal context.
 *
 * Takes one event off eeh_eventlist (if any), runs the recovery under
 * eeh_event_mutex and frees the event.  Always returns 0.
 */
static int eeh_event_handler(void * dummy)
{
	unsigned long flags;
	struct eeh_event *event;
	struct eeh_dev *edev;

	set_task_comm(current, "eehd");

	spin_lock_irqsave(&eeh_eventlist_lock, flags);
	event = NULL;

	/* Unqueue the event, get ready to process. */
	if (!list_empty(&eeh_eventlist)) {
		event = list_entry(eeh_eventlist.next, struct eeh_event, list);
		list_del(&event->list);
	}
	spin_unlock_irqrestore(&eeh_eventlist_lock, flags);

	if (event == NULL)
		return 0;

	/* Serialize processing of EEH events */
	mutex_lock(&eeh_event_mutex);
	edev = event->edev;
	eeh_mark_slot(eeh_dev_to_of_node(edev), EEH_MODE_RECOVERING);

	printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n",
	       eeh_pci_name(edev->pdev));

	set_current_state(TASK_INTERRUPTIBLE);	/* Don't add to load average */
	edev = handle_eeh_events(event);

	/*
	 * handle_eeh_events() returns NULL whenever recovery did not
	 * succeed; in that case there is no eeh_dev to dereference, so
	 * the slot flag and the pci_dev reference must be left alone.
	 */
	if (edev) {
		eeh_clear_slot(eeh_dev_to_of_node(edev), EEH_MODE_RECOVERING);
		pci_dev_put(edev->pdev);
	}

	kfree(event);
	mutex_unlock(&eeh_event_mutex);

	/* If there are no new errors after an hour, clear the counter. */
	if (edev && edev->freeze_count > 0) {
		msleep_interruptible(3600*1000);
		if (edev->freeze_count > 0)
			edev->freeze_count--;
	}

	return 0;
}
/**
 * eeh_thread_launcher - Spawn the kernel thread that services EEH events
 * @dummy - unused
 *
 * Work handler attached to eeh_event_wq.  It starts eeh_event_handler()
 * in a new kernel thread and logs an error when the thread cannot be
 * created.
 */
static void eeh_thread_launcher(struct work_struct *dummy)
{
	long rc = kernel_thread(eeh_event_handler, NULL, CLONE_KERNEL);

	if (rc >= 0)
		return;

	printk(KERN_ERR "Failed to start EEH daemon\n");
}
/**
 * eeh_send_failure_event - Generate a PCI error event
 * @edev: EEH device
 *
 * Safe to call from interrupt context: the event is only allocated
 * (GFP_ATOMIC) and queued here, and the real work is deferred to the
 * eeh_event_wq work item.
 *
 * Returns 0 when the event was queued, 1 when it could not be handled
 * (memory allocator not yet up, or allocation failure).
 */
int eeh_send_failure_event(struct eeh_dev *edev)
{
	struct device_node *dn = eeh_dev_to_of_node(edev);
	struct eeh_event *ev;
	unsigned long irq_flags;
	const char *loc;

	/* Nothing can be allocated this early; just say where it happened */
	if (!mem_init_done) {
		printk(KERN_ERR "EEH: event during early boot not handled\n");
		loc = of_get_property(dn, "ibm,loc-code", NULL);
		printk(KERN_ERR "EEH: device node = %s\n", dn->full_name);
		printk(KERN_ERR "EEH: PCI location = %s\n", loc);
		return 1;
	}

	ev = kmalloc(sizeof(*ev), GFP_ATOMIC);
	if (!ev) {
		printk(KERN_ERR "EEH: out of memory, event not handled\n");
		return 1;
	}

	/* Hold a reference on the pci_dev while the event is pending */
	if (edev->pdev)
		pci_dev_get(edev->pdev);
	ev->edev = edev;

	/* We may or may not be called in an interrupt context */
	spin_lock_irqsave(&eeh_eventlist_lock, irq_flags);
	list_add(&ev->list, &eeh_eventlist);
	spin_unlock_irqrestore(&eeh_eventlist_lock, irq_flags);

	schedule_work(&eeh_event_wq);

	return 0;
}
@@ -0,0 +1,565 @@
/*
* The file intends to implement the platform dependent EEH operations on pseries.
* Actually, the pseries platform is built based on RTAS heavily. That means the
* pseries platform dependent EEH operations will be built on RTAS calls. The functions
* are devired from arch/powerpc/platforms/pseries/eeh.c and necessary cleanup has
* been done.
*
* Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2011.
* Copyright IBM Corporation 2001, 2005, 2006
* Copyright Dave Engebretsen & Todd Inglett 2001
* Copyright Linas Vepstas 2005, 2006
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/atomic.h>
#include <linux/delay.h>
#include <linux/export.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/of.h>
#include <linux/pci.h>
#include <linux/proc_fs.h>
#include <linux/rbtree.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/spinlock.h>
#include <asm/eeh.h>
#include <asm/eeh_event.h>
#include <asm/io.h>
#include <asm/machdep.h>
#include <asm/ppc-pci.h>
#include <asm/rtas.h>
/* RTAS tokens */
/* Each is looked up by name in pseries_eeh_init() and compared against
* RTAS_UNKNOWN_SERVICE before the corresponding call is used.
*/
static int ibm_set_eeh_option;
static int ibm_set_slot_reset;
static int ibm_read_slot_reset_state;
static int ibm_read_slot_reset_state2;
static int ibm_slot_error_detail;
static int ibm_get_config_addr_info;
static int ibm_get_config_addr_info2;
static int ibm_configure_bridge;
static int ibm_configure_pe;
/*
* Buffer for reporting slot-error-detail rtas calls. It's here
* in BSS, and not dynamically alloced, so that it ends up in
* RMO where RTAS can access it.
*/
static unsigned char slot_errbuf[RTAS_ERROR_LOG_MAX];
/* Held by pseries_eeh_get_log() while slot_errbuf is filled and logged. */
static DEFINE_SPINLOCK(slot_errbuf_lock);
/* Number of bytes of slot_errbuf used per call; set in pseries_eeh_init(). */
static int eeh_error_buf_size;
/**
 * pseries_eeh_init - EEH platform dependent initialization
 *
 * Resolves the RTAS tokens used by the pseries EEH callbacks, checks
 * that every mandatory service (or at least one of each old/new pair)
 * exists, and settles the size of the error log buffer.
 *
 * Returns 0 on success, -EINVAL when a required RTAS service is missing.
 */
static int pseries_eeh_init(void)
{
	int log_size;

	/* figure out EEH RTAS function call tokens */
	ibm_set_eeh_option = rtas_token("ibm,set-eeh-option");
	ibm_set_slot_reset = rtas_token("ibm,set-slot-reset");
	ibm_read_slot_reset_state2 = rtas_token("ibm,read-slot-reset-state2");
	ibm_read_slot_reset_state = rtas_token("ibm,read-slot-reset-state");
	ibm_slot_error_detail = rtas_token("ibm,slot-error-detail");
	ibm_get_config_addr_info2 = rtas_token("ibm,get-config-addr-info2");
	ibm_get_config_addr_info = rtas_token("ibm,get-config-addr-info");
	ibm_configure_pe = rtas_token("ibm,configure-pe");
	ibm_configure_bridge = rtas_token("ibm,configure-bridge");

	/* necessary sanity check: bail out on the first missing service */
	if (ibm_set_eeh_option == RTAS_UNKNOWN_SERVICE) {
		pr_warning("%s: RTAS service <ibm,set-eeh-option> invalid\n",
			__func__);
		return -EINVAL;
	}

	if (ibm_set_slot_reset == RTAS_UNKNOWN_SERVICE) {
		pr_warning("%s: RTAS service <ibm, set-slot-reset> invalid\n",
			__func__);
		return -EINVAL;
	}

	if (ibm_read_slot_reset_state2 == RTAS_UNKNOWN_SERVICE &&
	    ibm_read_slot_reset_state == RTAS_UNKNOWN_SERVICE) {
		pr_warning("%s: RTAS service <ibm,read-slot-reset-state2> and "
			"<ibm,read-slot-reset-state> invalid\n",
			__func__);
		return -EINVAL;
	}

	if (ibm_slot_error_detail == RTAS_UNKNOWN_SERVICE) {
		pr_warning("%s: RTAS service <ibm,slot-error-detail> invalid\n",
			__func__);
		return -EINVAL;
	}

	if (ibm_get_config_addr_info2 == RTAS_UNKNOWN_SERVICE &&
	    ibm_get_config_addr_info == RTAS_UNKNOWN_SERVICE) {
		pr_warning("%s: RTAS service <ibm,get-config-addr-info2> and "
			"<ibm,get-config-addr-info> invalid\n",
			__func__);
		return -EINVAL;
	}

	if (ibm_configure_pe == RTAS_UNKNOWN_SERVICE &&
	    ibm_configure_bridge == RTAS_UNKNOWN_SERVICE) {
		pr_warning("%s: RTAS service <ibm,configure-pe> and "
			"<ibm,configure-bridge> invalid\n",
			__func__);
		return -EINVAL;
	}

	/* Initialize error log lock and size */
	spin_lock_init(&slot_errbuf_lock);

	log_size = rtas_token("rtas-error-log-max");
	if (log_size == RTAS_UNKNOWN_SERVICE) {
		pr_warning("%s: unknown EEH error log size\n",
			__func__);
		log_size = 1024;
	} else if (log_size > RTAS_ERROR_LOG_MAX) {
		/* Never hand RTAS more than slot_errbuf can hold */
		pr_warning("%s: EEH error log size %d exceeds the maximal %d\n",
			__func__, log_size, RTAS_ERROR_LOG_MAX);
		log_size = RTAS_ERROR_LOG_MAX;
	}
	eeh_error_buf_size = log_size;

	return 0;
}
/**
* pseries_eeh_set_option - Initialize EEH or MMIO/DMA reenable
* @dn: device node
* @option: operation to be issued
*
* The function is used to control the EEH functionality globally.
* Currently, following options are support according to PAPR:
* Enable EEH, Disable EEH, Enable MMIO and Enable DMA
*/
static int pseries_eeh_set_option(struct device_node *dn, int option)
{
int ret = 0;
struct eeh_dev *edev;
const u32 *reg;
int config_addr;
edev = of_node_to_eeh_dev(dn);
/*
* When we're enabling or disabling EEH functioality on
* the particular PE, the PE config address is possibly
* unavailable. Therefore, we have to figure it out from
* the FDT node.
*/
switch (option) {
case EEH_OPT_DISABLE:
case EEH_OPT_ENABLE:
reg = of_get_property(dn, "reg", NULL);
config_addr = reg[0];
break;
case EEH_OPT_THAW_MMIO:
case EEH_OPT_THAW_DMA:
config_addr = edev->config_addr;
if (edev->pe_config_addr)
config_addr = edev->pe_config_addr;
break;
default:
pr_err("%s: Invalid option %d\n",
__func__, option);
return -EINVAL;
}
ret = rtas_call(ibm_set_eeh_option, 4, 1, NULL,
config_addr, BUID_HI(edev->phb->buid),
BUID_LO(edev->phb->buid), option);
return ret;
}
/**
 * pseries_eeh_get_pe_addr - Retrieve PE address
 * @dn: device node
 *
 * Two RTAS calls can supply the PE address; the newer
 * ibm,get-config-addr-info2 is tried first and the older
 * ibm,get-config-addr-info is the fallback.  The device's config
 * address must already have been worked out from the FDT node before
 * this is called.
 *
 * Returns the PE config address, or 0 when no valid address could be
 * obtained.
 */
static int pseries_eeh_get_pe_addr(struct device_node *dn)
{
	struct eeh_dev *edev = of_node_to_eeh_dev(dn);
	int rets[3];

	if (ibm_get_config_addr_info2 != RTAS_UNKNOWN_SERVICE) {
		/*
		 * First ask (option 1) whether a PE is associated with
		 * the device at all; without one the PE address is
		 * meaningless.
		 */
		if (rtas_call(ibm_get_config_addr_info2, 4, 2, rets,
			      edev->config_addr, BUID_HI(edev->phb->buid),
			      BUID_LO(edev->phb->buid), 1) ||
		    rets[0] == 0)
			return 0;

		/* Then (option 0) retrieve the associated PE config address */
		if (rtas_call(ibm_get_config_addr_info2, 4, 2, rets,
			      edev->config_addr, BUID_HI(edev->phb->buid),
			      BUID_LO(edev->phb->buid), 0)) {
			pr_warning("%s: Failed to get PE address for %s\n",
				__func__, dn->full_name);
			return 0;
		}

		return rets[0];
	}

	/* Neither flavour of the call is available */
	if (ibm_get_config_addr_info == RTAS_UNKNOWN_SERVICE)
		return 0;

	if (rtas_call(ibm_get_config_addr_info, 4, 2, rets,
		      edev->config_addr, BUID_HI(edev->phb->buid),
		      BUID_LO(edev->phb->buid), 0)) {
		pr_warning("%s: Failed to get PE address for %s\n",
			__func__, dn->full_name);
		return 0;
	}

	return rets[0];
}
/**
 * pseries_eeh_get_state - Retrieve PE state
 * @dn: PE associated device node
 * @state: if non-NULL, receives the third RTAS output (rets[2]) when the
 *         PE state is temporarily unavailable; pseries_eeh_wait_state()
 *         uses that value as the time to sleep before retrying
 *
 * Retrieve the state of the specified PE. On RTAS compliant
 * pseries platform, there already has one dedicated RTAS function
 * for the purpose. It's notable that the associated PE config address
 * might be ready when calling the function. Therefore, endeavour to
 * use the PE config address if possible. Further more, there're 2
 * RTAS calls for the purpose, we need to try the new one and back
 * to the old one if the new one couldn't work properly.
 *
 * Returns a mask of EEH_STATE_* flags, EEH_STATE_UNAVAILABLE,
 * EEH_STATE_NOT_SUPPORT, or the non-zero status of a failed RTAS call.
 */
static int pseries_eeh_get_state(struct device_node *dn, int *state)
{
	struct eeh_dev *edev;
	int config_addr;
	int ret;
	int rets[4];
	int result;

	/* Figure out PE config address if possible */
	edev = of_node_to_eeh_dev(dn);
	config_addr = edev->config_addr;
	if (edev->pe_config_addr)
		config_addr = edev->pe_config_addr;

	if (ibm_read_slot_reset_state2 != RTAS_UNKNOWN_SERVICE) {
		ret = rtas_call(ibm_read_slot_reset_state2, 3, 4, rets,
				config_addr, BUID_HI(edev->phb->buid),
				BUID_LO(edev->phb->buid));
	} else if (ibm_read_slot_reset_state != RTAS_UNKNOWN_SERVICE) {
		/* Fake PE unavailable info */
		rets[2] = 0;
		ret = rtas_call(ibm_read_slot_reset_state, 3, 3, rets,
				config_addr, BUID_HI(edev->phb->buid),
				BUID_LO(edev->phb->buid));
	} else {
		return EEH_STATE_NOT_SUPPORT;
	}

	if (ret)
		return ret;

	/* A zero rets[1] is reported as "not supported" */
	if (!rets[1])
		return EEH_STATE_NOT_SUPPORT;

	/* Parse the result out */
	switch (rets[0]) {
	case 0:
		result = EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE;
		break;
	case 1:
		result = EEH_STATE_RESET_ACTIVE |
			 EEH_STATE_MMIO_ACTIVE |
			 EEH_STATE_DMA_ACTIVE;
		break;
	case 2:
		result = 0;
		break;
	case 4:
		result = EEH_STATE_MMIO_ENABLED;
		break;
	case 5:
		if (rets[2]) {
			if (state)
				*state = rets[2];
			result = EEH_STATE_UNAVAILABLE;
		} else {
			result = EEH_STATE_NOT_SUPPORT;
		}
		/*
		 * Must not fall through: the default case would replace
		 * EEH_STATE_UNAVAILABLE with EEH_STATE_NOT_SUPPORT and the
		 * caller would never wait for the PE to become available.
		 */
		break;
	default:
		result = EEH_STATE_NOT_SUPPORT;
		break;
	}

	return result;
}
/**
 * pseries_eeh_reset - Reset the specified PE
 * @dn: PE associated device node
 * @option: reset option
 *
 * Issues ibm,set-slot-reset for the PE.  When a fundamental reset is
 * requested and the call answers -8, the request is repeated as a hot
 * reset.
 *
 * Returns the status of the last RTAS call made.
 */
static int pseries_eeh_reset(struct device_node *dn, int option)
{
	struct eeh_dev *edev = of_node_to_eeh_dev(dn);
	int addr;
	int rc;

	/* Figure out PE address: PE config address wins when it is set */
	addr = edev->pe_config_addr ? edev->pe_config_addr
				    : edev->config_addr;

	/* Reset PE through RTAS call */
	rc = rtas_call(ibm_set_slot_reset, 4, 1, NULL,
		       addr, BUID_HI(edev->phb->buid),
		       BUID_LO(edev->phb->buid), option);
	if (rc != -8 || option != EEH_RESET_FUNDAMENTAL)
		return rc;

	/* If fundamental-reset not supported, try hot-reset */
	return rtas_call(ibm_set_slot_reset, 4, 1, NULL,
			 addr, BUID_HI(edev->phb->buid),
			 BUID_LO(edev->phb->buid), EEH_RESET_HOT);
}
/**
 * pseries_eeh_wait_state - Wait for PE state
 * @dn: PE associated device node
 * @max_wait: maximal period in milliseconds
 *
 * Wait for the state of associated PE. It might take some time
 * to retrieve the PE's state.
 *
 * Returns whatever pseries_eeh_get_state() reports once the state is
 * no longer EEH_STATE_UNAVAILABLE, or EEH_STATE_NOT_SUPPORT when the
 * @max_wait budget has been used up.
 */
static int pseries_eeh_wait_state(struct device_node *dn, int max_wait)
{
	int ret;
	int mwait;

	/*
	 * According to PAPR, the state of PE might be temporarily
	 * unavailable. Under the circumstance, we have to wait
	 * for indicated time determined by firmware. The maximal
	 * wait time is 5 minutes, which is acquired from the original
	 * EEH implementation. Also, the original implementation
	 * also defined the minimal wait time as 1 second.
	 * Both limits are in milliseconds, as is the value slept with
	 * msleep() below.
	 */
#define EEH_STATE_MIN_WAIT_TIME	(1000)
#define EEH_STATE_MAX_WAIT_TIME	(300 * 1000)

	/* The loop is only left through one of the returns inside it */
	while (1) {
		ret = pseries_eeh_get_state(dn, &mwait);

		/*
		 * If the PE's state is temporarily unavailable,
		 * we have to wait for the specified time. Otherwise,
		 * the PE's state will be returned immediately.
		 */
		if (ret != EEH_STATE_UNAVAILABLE)
			return ret;

		if (max_wait <= 0) {
			pr_warning("%s: Timeout when getting PE's state (%d)\n",
				__func__, max_wait);
			return EEH_STATE_NOT_SUPPORT;
		}

		/* Clamp the firmware-suggested delay to sane bounds */
		if (mwait <= 0) {
			pr_warning("%s: Firmware returned bad wait value %d\n",
				__func__, mwait);
			mwait = EEH_STATE_MIN_WAIT_TIME;
		} else if (mwait > EEH_STATE_MAX_WAIT_TIME) {
			pr_warning("%s: Firmware returned too long wait value %d\n",
				__func__, mwait);
			mwait = EEH_STATE_MAX_WAIT_TIME;
		}

		max_wait -= mwait;
		msleep(mwait);
	}
}
/**
 * pseries_eeh_get_log - Retrieve error log
 * @dn: device node
 * @severity: temporary or permanent error log
 * @drv_log: driver log to be combined with retrieved error log
 * @len: length of driver log
 *
 * Asks RTAS (ibm,slot-error-detail) for the error detail of the PE,
 * passing @drv_log along and collecting the answer in slot_errbuf.
 * On success the buffer is handed to log_error().
 *
 * Returns the status of the RTAS call.
 */
static int pseries_eeh_get_log(struct device_node *dn, int severity, char *drv_log, unsigned long len)
{
	struct eeh_dev *edev = of_node_to_eeh_dev(dn);
	unsigned long irq_flags;
	int pe_addr;
	int rc;

	/* slot_errbuf is shared, so keep it to ourselves for the duration */
	spin_lock_irqsave(&slot_errbuf_lock, irq_flags);
	memset(slot_errbuf, 0, eeh_error_buf_size);

	/* Figure out the PE address */
	pe_addr = edev->pe_config_addr ? edev->pe_config_addr
				       : edev->config_addr;

	rc = rtas_call(ibm_slot_error_detail, 8, 1, NULL, pe_addr,
		       BUID_HI(edev->phb->buid), BUID_LO(edev->phb->buid),
		       virt_to_phys(drv_log), len,
		       virt_to_phys(slot_errbuf), eeh_error_buf_size,
		       severity);
	if (rc == 0)
		log_error(slot_errbuf, ERR_TYPE_RTAS_LOG, 0);

	spin_unlock_irqrestore(&slot_errbuf_lock, irq_flags);

	return rc;
}
/**
 * pseries_eeh_configure_bridge - Configure PCI bridges in the indicated PE
 * @dn: PE associated device node
 *
 * Reconfigures the bridges of the given PE through RTAS, using
 * ibm,configure-pe when it exists and ibm,configure-bridge otherwise.
 *
 * Returns the RTAS call status (a warning is logged when it is
 * non-zero), or -EFAULT when neither call is available.
 */
static int pseries_eeh_configure_bridge(struct device_node *dn)
{
	struct eeh_dev *edev = of_node_to_eeh_dev(dn);
	int pe_addr;
	int token;
	int rc;

	/* Figure out the PE address */
	pe_addr = edev->pe_config_addr ? edev->pe_config_addr
				       : edev->config_addr;

	/* Use new configure-pe function, if supported */
	if (ibm_configure_pe != RTAS_UNKNOWN_SERVICE)
		token = ibm_configure_pe;
	else if (ibm_configure_bridge != RTAS_UNKNOWN_SERVICE)
		token = ibm_configure_bridge;
	else
		return -EFAULT;

	/* Both calls take the same arguments */
	rc = rtas_call(token, 3, 1, NULL,
		       pe_addr, BUID_HI(edev->phb->buid),
		       BUID_LO(edev->phb->buid));
	if (rc != 0)
		pr_warning("%s: Unable to configure bridge %d for %s\n",
			__func__, rc, dn->full_name);

	return rc;
}
/**
 * pseries_eeh_read_config - Read PCI config space
 * @dn: device node
 * @where: PCI address
 * @size: size to read
 * @val: return value
 *
 * Forwards the read to rtas_read_config() for the pci_dn that belongs
 * to @dn and returns its result.
 */
static int pseries_eeh_read_config(struct device_node *dn, int where, int size, u32 *val)
{
	return rtas_read_config(PCI_DN(dn), where, size, val);
}
/**
 * pseries_eeh_write_config - Write PCI config space
 * @dn: device node
 * @where: PCI address
 * @size: size to write
 * @val: value to be written
 *
 * Forwards the write to rtas_write_config() for the pci_dn that belongs
 * to @dn and returns its result.
 */
static int pseries_eeh_write_config(struct device_node *dn, int where, int size, u32 val)
{
	return rtas_write_config(PCI_DN(dn), where, size, val);
}
/*
* EEH callbacks for the pseries platform. Every entry points at the
* matching pseries_eeh_* function; the table is handed to
* eeh_ops_register() by eeh_pseries_init().
*/
static struct eeh_ops pseries_eeh_ops = {
.name = "pseries",
.init = pseries_eeh_init,
.set_option = pseries_eeh_set_option,
.get_pe_addr = pseries_eeh_get_pe_addr,
.get_state = pseries_eeh_get_state,
.reset = pseries_eeh_reset,
.wait_state = pseries_eeh_wait_state,
.get_log = pseries_eeh_get_log,
.configure_bridge = pseries_eeh_configure_bridge,
.read_config = pseries_eeh_read_config,
.write_config = pseries_eeh_write_config
};
/**
 * eeh_pseries_init - Register platform dependent EEH operations
 *
 * Hands pseries_eeh_ops to eeh_ops_register() and returns its result.
 * This has to run before any other EEH related function is used.
 */
int __init eeh_pseries_init(void)
{
	int rc = eeh_ops_register(&pseries_eeh_ops);

	return rc;
}
@@ -0,0 +1,84 @@
/*
* Sysfs entries for PCI Error Recovery for PAPR-compliant platform.
* Copyright IBM Corporation 2007
* Copyright Linas Vepstas <linas@austin.ibm.com> 2007
*
* All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or (at
* your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
* NON INFRINGEMENT. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*
* Send comments and feedback to Linas Vepstas <linas@austin.ibm.com>
*/
#include <linux/pci.h>
#include <linux/stat.h>
#include <asm/ppc-pci.h>
#include <asm/pci-bridge.h>
/**
* EEH_SHOW_ATTR -- Create sysfs entry for eeh statistic
* @_name: name of file in sysfs directory
* @_memb: name of member in struct eeh_dev to access
* @_format: printf format for display
*
* All of the attributes look very similar, so just
* auto-gen a cut-n-paste routine to display them.
*
* Each use defines a show routine eeh_show_<_name>() and a read-only
* (S_IRUGO) device attribute dev_attr_<_name>. The show routine
* returns 0 (empty output) when the PCI device has no eeh_dev.
*/
#define EEH_SHOW_ATTR(_name,_memb,_format) \
static ssize_t eeh_show_##_name(struct device *dev, \
struct device_attribute *attr, char *buf) \
{ \
struct pci_dev *pdev = to_pci_dev(dev); \
struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev); \
\
if (!edev) \
return 0; \
\
return sprintf(buf, _format "\n", edev->_memb); \
} \
static DEVICE_ATTR(_name, S_IRUGO, eeh_show_##_name, NULL);
/* Attributes created/removed by eeh_sysfs_add_device()/eeh_sysfs_remove_device(). */
EEH_SHOW_ATTR(eeh_mode, mode, "0x%x");
EEH_SHOW_ATTR(eeh_config_addr, config_addr, "0x%x");
EEH_SHOW_ATTR(eeh_pe_config_addr, pe_config_addr, "0x%x");
EEH_SHOW_ATTR(eeh_check_count, check_count, "%d" );
EEH_SHOW_ATTR(eeh_freeze_count, freeze_count, "%d" );
EEH_SHOW_ATTR(eeh_false_positives, false_positives, "%d" );
/*
 * Create the EEH sysfs files for @pdev.  Every file is attempted even
 * if an earlier one failed; a single warning is printed when at least
 * one of them could not be created.
 */
void eeh_sysfs_add_device(struct pci_dev *pdev)
{
	struct device_attribute *attrs[] = {
		&dev_attr_eeh_mode,
		&dev_attr_eeh_config_addr,
		&dev_attr_eeh_pe_config_addr,
		&dev_attr_eeh_check_count,
		&dev_attr_eeh_false_positives,
		&dev_attr_eeh_freeze_count,
	};
	unsigned int idx;
	int failed = 0;

	for (idx = 0; idx < sizeof(attrs) / sizeof(attrs[0]); idx++) {
		if (device_create_file(&pdev->dev, attrs[idx]))
			failed = 1;
	}

	if (failed)
		printk(KERN_WARNING "EEH: Unable to create sysfs entries\n");
}
/* Remove the EEH sysfs files that eeh_sysfs_add_device() creates for @pdev. */
void eeh_sysfs_remove_device(struct pci_dev *pdev)
{
	struct device_attribute *attrs[] = {
		&dev_attr_eeh_mode,
		&dev_attr_eeh_config_addr,
		&dev_attr_eeh_pe_config_addr,
		&dev_attr_eeh_check_count,
		&dev_attr_eeh_false_positives,
		&dev_attr_eeh_freeze_count,
	};
	unsigned int idx;

	for (idx = 0; idx < sizeof(attrs) / sizeof(attrs[0]); idx++)
		device_remove_file(&pdev->dev, attrs[idx]);
}
@@ -0,0 +1,86 @@
/*
* Copyright (C) 2001 Dave Engebretsen IBM Corporation
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <asm/prom.h>
#include "pseries.h"
/*
 * Map every interrupt listed for @np (at most 16) and request each of
 * them with @handler and @name.
 *
 * The interrupts come from the obsolete "open-pic-interrupt" property
 * when the node has one, otherwise from normal interrupt tree parsing.
 * Interrupts that cannot be mapped are reported and skipped; the first
 * request_irq() failure is reported and ends the function.
 */
void request_event_sources_irqs(struct device_node *np,
				irq_handler_t handler,
				const char *name)
{
	enum { MAX_EVENT_IRQS = 16 };
	unsigned int virqs[MAX_EVENT_IRQS];
	unsigned int opic_cells;
	const u32 *opic;
	struct of_irq oirq;
	int nr_virqs = 0;
	int i, index;

	opic = of_get_property(np, "open-pic-interrupt", &opic_cells);
	if (opic) {
		/*
		 * Obsolete property present: map those interrupts using
		 * the default interrupt host and default trigger.
		 */
		opic_cells /= sizeof(u32);
		for (i = 0; i < opic_cells && nr_virqs < MAX_EVENT_IRQS; i++) {
			virqs[nr_virqs] = irq_create_mapping(NULL, opic[i]);
			if (virqs[nr_virqs] != NO_IRQ) {
				nr_virqs++;
				continue;
			}
			pr_err("event-sources: Unable to allocate "
			       "interrupt number for %s\n",
			       np->full_name);
			WARN_ON(1);
		}
	} else {
		/* Else use normal interrupt tree parsing */
		index = 0;
		while (of_irq_map_one(np, index, &oirq) == 0 &&
		       nr_virqs < MAX_EVENT_IRQS) {
			virqs[nr_virqs] = irq_create_of_mapping(oirq.controller,
								oirq.specifier,
								oirq.size);
			if (virqs[nr_virqs] != NO_IRQ) {
				nr_virqs++;
			} else {
				pr_err("event-sources: Unable to allocate "
				       "interrupt number for %s\n",
				       np->full_name);
				WARN_ON(1);
			}
			index++;
		}
	}

	/* Now request them */
	for (i = 0; i < nr_virqs; i++) {
		if (!request_irq(virqs[i], handler, 0, name, NULL))
			continue;

		pr_err("event-sources: Unable to request interrupt "
		       "%d for %s\n", virqs[i], np->full_name);
		WARN_ON(1);
		return;
	}
}
@@ -0,0 +1,87 @@
/*
* pSeries firmware setup code.
*
* Portions from arch/powerpc/platforms/pseries/setup.c:
* Copyright (C) 1995 Linus Torvalds
* Adapted from 'alpha' version by Gary Thomas
* Modified by Cort Dougan (cort@cs.nmt.edu)
* Modified by PPC64 Team, IBM Corp
*
* Portions from arch/powerpc/kernel/firmware.c
* Copyright (C) 2001 Ben. Herrenschmidt (benh@kernel.crashing.org)
* Modifications for ppc64:
* Copyright (C) 2003 Dave Engebretsen <engebret@us.ibm.com>
* Copyright (C) 2005 Stephen Rothwell, IBM Corporation
*
* Copyright 2006 IBM Corporation.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <asm/firmware.h>
#include <asm/prom.h>
#include <asm/udbg.h>
#include "pseries.h"
typedef struct {
unsigned long val;
char * name;
} firmware_feature_t;
/*
 * Lookup table used by fw_feature_init() to translate function-name strings
 * into FW_FEATURE_* bits. The array is sized FIRMWARE_MAX_FEATURES; slots
 * without an initializer are zero-filled, so their name is NULL and they are
 * skipped by the lookup loop.
 */
static __initdata firmware_feature_t
firmware_features_table[FIRMWARE_MAX_FEATURES] = {
{FW_FEATURE_PFT, "hcall-pft"},
{FW_FEATURE_TCE, "hcall-tce"},
{FW_FEATURE_SPRG0, "hcall-sprg0"},
{FW_FEATURE_DABR, "hcall-dabr"},
{FW_FEATURE_COPY, "hcall-copy"},
{FW_FEATURE_ASR, "hcall-asr"},
{FW_FEATURE_DEBUG, "hcall-debug"},
{FW_FEATURE_PERF, "hcall-perf"},
{FW_FEATURE_DUMP, "hcall-dump"},
{FW_FEATURE_INTERRUPT, "hcall-interrupt"},
{FW_FEATURE_MIGRATE, "hcall-migrate"},
{FW_FEATURE_PERFMON, "hcall-perfmon"},
{FW_FEATURE_CRQ, "hcall-crq"},
{FW_FEATURE_VIO, "hcall-vio"},
{FW_FEATURE_RDMA, "hcall-rdma"},
{FW_FEATURE_LLAN, "hcall-lLAN"},
{FW_FEATURE_BULK_REMOVE, "hcall-bulk"},
{FW_FEATURE_XDABR, "hcall-xdabr"},
{FW_FEATURE_MULTITCE, "hcall-multi-tce"},
{FW_FEATURE_SPLPAR, "hcall-splpar"},
{FW_FEATURE_VPHN, "hcall-vphn"},
};
/*
 * Build up the firmware features bitmask using the contents of
 * device-tree/ibm,hypertas-functions. Ultimately this functionality may
 * be moved into prom.c prom_init().
 *
 * @hypertas: buffer holding a sequence of NUL-terminated function names
 * @len:      size of that buffer in bytes
 *
 * Every name found in firmware_features_table has its feature bits OR-ed
 * into powerpc_firmware_features; unknown names are ignored.
 */
void __init fw_feature_init(const char *hypertas, unsigned long len)
{
	const char *end = hypertas + len;
	const char *fn;

	pr_debug(" -> fw_feature_init()\n");

	for (fn = hypertas; fn < end; fn += strlen(fn) + 1) {
		int idx = 0;

		/* stop at the first table row whose name equals this string */
		while (idx < FIRMWARE_MAX_FEATURES) {
			const char *known = firmware_features_table[idx].name;

			if (known && !strcmp(known, fn)) {
				powerpc_firmware_features |=
					firmware_features_table[idx].val;
				break;
			}
			idx++;
		}
	}

	pr_debug(" <- fw_feature_init()\n");
}
@@ -0,0 +1,414 @@
/*
* pseries CPU Hotplug infrastructure.
*
* Split out from arch/powerpc/platforms/pseries/setup.c
* arch/powerpc/kernel/rtas.c, and arch/powerpc/platforms/pseries/smp.c
*
* Peter Bergner, IBM March 2001.
* Copyright (C) 2001 IBM.
* Dave Engebretsen, Peter Bergner, and
* Mike Corrigan {engebret|bergner|mikec}@us.ibm.com
* Plus various changes from other IBM teams...
*
* Copyright (C) 2006 Michael Ellerman, IBM Corporation
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/sched.h> /* for idle_task_exit */
#include <linux/cpu.h>
#include <asm/prom.h>
#include <asm/rtas.h>
#include <asm/firmware.h>
#include <asm/machdep.h>
#include <asm/vdso_datapage.h>
#include <asm/pSeries_reconfig.h>
#include <asm/xics.h>
#include "plpar_wrappers.h"
#include "offline_states.h"
/*
 * RTAS argument block used by rtas_stop_self().
 * This version can't take the spinlock, because it never returns.
 * The token starts out as RTAS_UNKNOWN_SERVICE and is filled in by
 * pseries_cpu_hotplug_init(); rets points at the first args[] slot.
 */
static struct rtas_args rtas_stop_self_args = {
.token = RTAS_UNKNOWN_SERVICE,
.nargs = 0,
.nret = 1,
.rets = &rtas_stop_self_args.args[0],
};
/* Per-cpu: state the CPU should enter the next time it is taken offline. */
static DEFINE_PER_CPU(enum cpu_state_vals, preferred_offline_state) =
CPU_STATE_OFFLINE;
/* Per-cpu: state most recently recorded through set_cpu_current_state(). */
static DEFINE_PER_CPU(enum cpu_state_vals, current_state) = CPU_STATE_OFFLINE;
/* Value copied into preferred_offline_state by set_default_offline_state(). */
static enum cpu_state_vals default_offline_state = CPU_STATE_OFFLINE;
/* Set from the "cede_offline=" boot parameter; enabled unless turned off. */
static int cede_offline_enabled __read_mostly = 1;
/*
 * Parse the "cede_offline=" boot parameter.
 *
 * "on" sets cede_offline_enabled, "off" clears it. Returns 1 when the value
 * was one of those two strings and 0 for anything else.
 */
static int __init setup_cede_offline(char *str)
{
	if (strcmp(str, "on") == 0) {
		cede_offline_enabled = 1;
		return 1;
	}

	if (strcmp(str, "off") == 0) {
		cede_offline_enabled = 0;
		return 1;
	}

	return 0;
}
__setup("cede_offline=", setup_cede_offline);
enum cpu_state_vals get_cpu_current_state(int cpu)
{
return per_cpu(current_state, cpu);
}
void set_cpu_current_state(int cpu, enum cpu_state_vals state)
{
per_cpu(current_state, cpu) = state;
}
enum cpu_state_vals get_preferred_offline_state(int cpu)
{
return per_cpu(preferred_offline_state, cpu);
}
void set_preferred_offline_state(int cpu, enum cpu_state_vals state)
{
per_cpu(preferred_offline_state, cpu) = state;
}
/* Reset the preferred offline state of @cpu to default_offline_state. */
void set_default_offline_state(int cpu)
{
	set_preferred_offline_state(cpu, default_offline_state);
}
/*
 * Stop the calling CPU by entering RTAS with rtas_stop_self_args.
 * Interrupts are disabled first. enter_rtas() is not expected to come back;
 * if it does, the kernel panics.
 */
static void rtas_stop_self(void)
{
struct rtas_args *args = &rtas_stop_self_args;
local_irq_disable();
/* the token is looked up in pseries_cpu_hotplug_init() */
BUG_ON(args->token == RTAS_UNKNOWN_SERVICE);
printk("cpu %u (hwid %u) Ready to die...\n",
smp_processor_id(), hard_smp_processor_id());
enter_rtas(__pa(args));
panic("Alas, I survived.\n");
}
/*
 * Offline path executed by the CPU that is going away (installed as
 * ppc_md.cpu_die by pseries_cpu_hotplug_init()).
 *
 * If the preferred offline state is CPU_STATE_INACTIVE the CPU records that
 * state and cedes in a loop until its preferred state changes. When the new
 * preferred state is CPU_STATE_ONLINE it resumes via start_secondary_resume().
 * In every other case it records CPU_STATE_OFFLINE and stops itself through
 * rtas_stop_self(). This function does not return.
 */
static void pseries_mach_cpu_die(void)
{
unsigned int cpu = smp_processor_id();
unsigned int hwcpu = hard_smp_processor_id();
u8 cede_latency_hint = 0;
local_irq_disable();
idle_task_exit();
xics_teardown_cpu();
if (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) {
set_cpu_current_state(cpu, CPU_STATE_INACTIVE);
if (ppc_md.suspend_disable_cpu)
ppc_md.suspend_disable_cpu();
cede_latency_hint = 2;
/* flag the lppaca as idle (and donated, if dedicated) while ceded */
get_lppaca()->idle = 1;
if (!get_lppaca()->shared_proc)
get_lppaca()->donate_dedicated_cpu = 1;
/* keep ceding until someone changes our preferred state */
while (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) {
extended_cede_processor(cede_latency_hint);
}
if (!get_lppaca()->shared_proc)
get_lppaca()->donate_dedicated_cpu = 0;
get_lppaca()->idle = 0;
if (get_preferred_offline_state(cpu) == CPU_STATE_ONLINE) {
unregister_slb_shadow(hwcpu);
/*
 * Call to start_secondary_resume() will not return.
 * Kernel stack will be reset and start_secondary()
 * will be called to continue the online operation.
 */
start_secondary_resume();
}
}
/* Requested state is CPU_STATE_OFFLINE at this point */
WARN_ON(get_preferred_offline_state(cpu) != CPU_STATE_OFFLINE);
set_cpu_current_state(cpu, CPU_STATE_OFFLINE);
unregister_slb_shadow(hwcpu);
rtas_stop_self();
/* Should never get here... */
BUG();
for(;;);
}
/*
 * Take the calling CPU out of service (installed as smp_ops->cpu_disable).
 * Marks it offline, decrements the vdso processor count, hands boot_cpuid to
 * another online CPU if needed and migrates its interrupts away.
 * Always returns 0.
 */
static int pseries_cpu_disable(void)
{
int cpu = smp_processor_id();
set_cpu_online(cpu, false);
vdso_data->processorCount--;
/*fix boot_cpuid here*/
if (cpu == boot_cpuid)
boot_cpuid = cpumask_any(cpu_online_mask);
/* FIXME: abstract this to not be platform specific later on */
xics_migrate_irqs_away();
return 0;
}
/*
 * pseries_cpu_die: Wait for the cpu to die.
 * @cpu: logical processor id of the CPU whose death we're awaiting.
 *
 * Runs in the context of the thread performing the cpu-offline and gives
 * the target CPU time to finish going away so that the CPU_DEAD
 * notifications can then be sent. For the INACTIVE preference it polls the
 * recorded state for up to 5000 one-millisecond sleeps; for the OFFLINE
 * preference it queries the stopped state up to 25 times.
 *
 * The counterpart run by @cpu itself is pseries_mach_cpu_die().
 */
static void pseries_cpu_die(unsigned int cpu)
{
	unsigned int pcpu = get_hard_smp_processor_id(cpu);
	int status = 1;
	int attempt;

	if (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) {
		attempt = 0;
		while (attempt < 5000) {
			if (get_cpu_current_state(cpu) == CPU_STATE_INACTIVE) {
				status = 0;
				break;
			}
			msleep(1);
			attempt++;
		}
	} else if (get_preferred_offline_state(cpu) == CPU_STATE_OFFLINE) {
		attempt = 0;
		while (attempt < 25) {
			status = smp_query_cpu_stopped(pcpu);
			if (status == QCSS_STOPPED ||
			    status == QCSS_HARDWARE_ERROR)
				break;
			cpu_relax();
			attempt++;
		}
	}

	if (status)
		printk("Querying DEAD? cpu %i (%i) shows %i\n",
		       cpu, pcpu, status);

	/* Isolation and deallocation are definitely done by
	 * drslot_chrp_cpu. If they were not they would be
	 * done here. Change isolate state to Isolate and
	 * change allocation-state to Unusable.
	 */
	paca[cpu].cpu_start = 0;
}
/*
* Update cpu_present_mask and paca(s) for a new cpu node. The wrinkle
* here is that a cpu device node may represent up to two logical cpus
* in the SMT case. We must honor the assumption in other code that
* the logical ids for sibling SMT threads x and y are adjacent, such
* that x^1 == y and y^1 == x.
*/
static int pseries_add_processor(struct device_node *np)
{
unsigned int cpu;
cpumask_var_t candidate_mask, tmp;
int err = -ENOSPC, len, nthreads, i;
const u32 *intserv;
intserv = of_get_property(np, "ibm,ppc-interrupt-server#s", &len);
if (!intserv)
return 0;
zalloc_cpumask_var(&candidate_mask, GFP_KERNEL);
zalloc_cpumask_var(&tmp, GFP_KERNEL);
nthreads = len / sizeof(u32);
for (i = 0; i < nthreads; i++)
cpumask_set_cpu(i, tmp);
cpu_maps_update_begin();
BUG_ON(!cpumask_subset(cpu_present_mask, cpu_possible_mask));
/* Get a bitmap of unoccupied slots. */
cpumask_xor(candidate_mask, cpu_possible_mask, cpu_present_mask);
if (cpumask_empty(candidate_mask)) {
/* If we get here, it most likely means that NR_CPUS is
* less than the partition's max processors setting.
*/
printk(KERN_ERR "Cannot add cpu %s; this system configuration"
" supports %d logical cpus.\n", np->full_name,
cpumask_weight(cpu_possible_mask));
goto out_unlock;
}
while (!cpumask_empty(tmp))
if (cpumask_subset(tmp, candidate_mask))
/* Found a range where we can insert the new cpu(s) */
break;
else
cpumask_shift_left(tmp, tmp, nthreads);
if (cpumask_empty(tmp)) {
printk(KERN_ERR "Unable to find space in cpu_present_mask for"
" processor %s with %d thread(s)\n", np->name,
nthreads);
goto out_unlock;
}
for_each_cpu(cpu, tmp) {
BUG_ON(cpu_present(cpu));
set_cpu_present(cpu, true);
set_hard_smp_processor_id(cpu, *intserv++);
}
err = 0;
out_unlock:
cpu_maps_update_done();
free_cpumask_var(candidate_mask);
free_cpumask_var(tmp);
return err;
}
/*
* Update the present map for a cpu node which is going away, and set
* the hard id in the paca(s) to -1 to be consistent with boot time
* convention for non-present cpus.
*/
static void pseries_remove_processor(struct device_node *np)
{
unsigned int cpu;
int len, nthreads, i;
const u32 *intserv;
intserv = of_get_property(np, "ibm,ppc-interrupt-server#s", &len);
if (!intserv)
return;
nthreads = len / sizeof(u32);
cpu_maps_update_begin();
for (i = 0; i < nthreads; i++) {
for_each_present_cpu(cpu) {
if (get_hard_smp_processor_id(cpu) != intserv[i])
continue;
BUG_ON(cpu_online(cpu));
set_cpu_present(cpu, false);
set_hard_smp_processor_id(cpu, -1);
break;
}
if (cpu >= nr_cpu_ids)
printk(KERN_WARNING "Could not find cpu to remove "
"with physical id 0x%x\n", intserv[i]);
}
cpu_maps_update_done();
}
/*
 * Reconfiguration notifier callback: adds or removes the processor
 * described by @node for PSERIES_RECONFIG_ADD / PSERIES_RECONFIG_REMOVE and
 * ignores every other action. The result is converted with
 * notifier_from_errno().
 */
static int pseries_smp_notifier(struct notifier_block *nb,
				unsigned long action, void *node)
{
	int err = 0;

	if (action == PSERIES_RECONFIG_ADD)
		err = pseries_add_processor(node);
	else if (action == PSERIES_RECONFIG_REMOVE)
		pseries_remove_processor(node);

	return notifier_from_errno(err);
}
/* Notifier block registered by pseries_cpu_hotplug_init() on LPAR systems. */
static struct notifier_block pseries_smp_nb = {
.notifier_call = pseries_smp_notifier,
};
#define MAX_CEDE_LATENCY_LEVELS 4
#define CEDE_LATENCY_PARAM_LENGTH 10
#define CEDE_LATENCY_PARAM_MAX_LENGTH \
(MAX_CEDE_LATENCY_LEVELS * CEDE_LATENCY_PARAM_LENGTH * sizeof(char))
#define CEDE_LATENCY_TOKEN 45
static char cede_parameters[CEDE_LATENCY_PARAM_MAX_LENGTH];
/*
 * Clear cede_parameters and ask RTAS ("ibm,get-system-parameter") to fill
 * it with the CEDE_LATENCY_TOKEN parameter. Returns the rtas_call() status.
 */
static int parse_cede_parameters(void)
{
	int token;

	memset(cede_parameters, 0, sizeof(cede_parameters));

	token = rtas_token("ibm,get-system-parameter");
	return rtas_call(token, 3, 1, NULL,
			 CEDE_LATENCY_TOKEN,
			 __pa(cede_parameters),
			 CEDE_LATENCY_PARAM_MAX_LENGTH);
}
/*
 * Initcall that wires up pSeries CPU hotplug.
 *
 * Hotplug is left disabled (returning 0) on systems with an "open-pic"
 * interrupt controller and when firmware lacks the "stop-self" or
 * "query-cpu-stopped-state" RTAS services. Otherwise the die/disable hooks
 * are installed and, on LPAR, the reconfig notifier is registered and the
 * default offline state is switched to CPU_STATE_INACTIVE when cede_offline
 * is enabled and the cede parameters can be read.
 */
static int __init pseries_cpu_hotplug_init(void)
{
	struct device_node *np;
	const char *typep;
	int cpu;
	int qcss_tok;

	for_each_node_by_name(np, "interrupt-controller") {
		typep = of_get_property(np, "compatible", NULL);
		/* A node without "compatible" must not be passed to strstr() */
		if (!typep)
			continue;

		if (strstr(typep, "open-pic")) {
			/* leaving the iterator early: drop its reference */
			of_node_put(np);

			printk(KERN_INFO "CPU Hotplug not supported on "
			       "systems using MPIC\n");
			return 0;
		}
	}

	rtas_stop_self_args.token = rtas_token("stop-self");
	qcss_tok = rtas_token("query-cpu-stopped-state");

	if (rtas_stop_self_args.token == RTAS_UNKNOWN_SERVICE ||
	    qcss_tok == RTAS_UNKNOWN_SERVICE) {
		printk(KERN_INFO "CPU Hotplug not supported by firmware "
		       "- disabling.\n");
		return 0;
	}

	ppc_md.cpu_die = pseries_mach_cpu_die;
	smp_ops->cpu_disable = pseries_cpu_disable;
	smp_ops->cpu_die = pseries_cpu_die;

	/* Processors can be added/removed only on LPAR */
	if (firmware_has_feature(FW_FEATURE_LPAR)) {
		pSeries_reconfig_notifier_register(&pseries_smp_nb);
		cpu_maps_update_begin();
		if (cede_offline_enabled && parse_cede_parameters() == 0) {
			default_offline_state = CPU_STATE_INACTIVE;
			for_each_online_cpu(cpu)
				set_default_offline_state(cpu);
		}
		cpu_maps_update_done();
	}

	return 0;
}
arch_initcall(pseries_cpu_hotplug_init);
@@ -0,0 +1,236 @@
/*
* pseries Memory Hotplug infrastructure.
*
* Copyright (C) 2008 Badari Pulavarty, IBM Corporation
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/of.h>
#include <linux/memblock.h>
#include <linux/vmalloc.h>
#include <linux/memory.h>
#include <asm/firmware.h>
#include <asm/machdep.h>
#include <asm/pSeries_reconfig.h>
#include <asm/sparsemem.h>
/*
 * Determine the memory block (LMB) size in bytes.
 *
 * The "ibm,lmb-size" property of /ibm,dynamic-reconfiguration-memory is used
 * when that node exists. Otherwise, on pseries, the size of /memory@0 gives
 * the address of the next memory node, whose own size is used. If neither
 * lookup succeeds the result is MIN_MEMORY_BLOCK_SIZE.
 *
 * Sizes are kept in unsigned long throughout so that a 64-bit property or
 * resource size is not truncated before it is returned.
 */
static unsigned long get_memblock_size(void)
{
	struct device_node *np;
	unsigned long memblock_size = MIN_MEMORY_BLOCK_SIZE;
	struct resource r;

	np = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (np) {
		const __be64 *size;

		size = of_get_property(np, "ibm,lmb-size", NULL);
		if (size)
			memblock_size = be64_to_cpup(size);
		of_node_put(np);
	} else if (machine_is(pseries)) {
		/* This fallback really only applies to pseries */
		unsigned long memzero_size = 0;

		np = of_find_node_by_path("/memory@0");
		if (np) {
			if (!of_address_to_resource(np, 0, &r))
				memzero_size = resource_size(&r);
			of_node_put(np);
		}

		if (memzero_size) {
			/* We now know the size of memory@0, use this to find
			 * the first memoryblock and get its size.
			 */
			char buf[64];

			snprintf(buf, sizeof(buf), "/memory@%lx", memzero_size);
			np = of_find_node_by_path(buf);
			if (np) {
				if (!of_address_to_resource(np, 0, &r))
					memblock_size = resource_size(&r);
				of_node_put(np);
			}
		}
	}

	return memblock_size;
}
/*
 * WARNING: This is going to override the generic definition whenever
 * pseries is built-in regardless of what platform is active at boot
 * time. This is fine for now as this is the only "option" and it
 * should work everywhere. If not, we'll have to turn this into a
 * ppc_md. callback
 *
 * Returns the value computed by get_memblock_size().
 */
unsigned long memory_block_size_bytes(void)
{
	unsigned long bytes = get_memblock_size();

	return bytes;
}
/*
 * Remove the memory block of @memblock_size bytes starting at physical
 * address @base.
 *
 * If the first pfn of the range is not valid only the memblock region is
 * dropped. Otherwise the pages are removed from their zone, the memblock
 * region is dropped, the section mapping is removed and vmalloc aliases are
 * flushed. Returns 0 on success or the error from __remove_pages() /
 * remove_section_mapping().
 *
 * NOTE(review): @memblock_size is unsigned int while pseries_drconf_memory()
 * passes an unsigned long; confirm block sizes always fit in 32 bits.
 */
static int pseries_remove_memblock(unsigned long base, unsigned int memblock_size)
{
unsigned long start, start_pfn;
struct zone *zone;
int ret;
start_pfn = base >> PAGE_SHIFT;
/* no struct page for this range: just forget the memblock region */
if (!pfn_valid(start_pfn)) {
memblock_remove(base, memblock_size);
return 0;
}
zone = page_zone(pfn_to_page(start_pfn));
/*
 * Remove section mappings and sysfs entries for the
 * section of the memory we are removing.
 *
 * NOTE: Ideally, this should be done in generic code like
 * remove_memory(). But remove_memory() gets called by writing
 * to sysfs "state" file and we can't remove sysfs entries
 * while writing to it. So we have to defer it to here.
 */
ret = __remove_pages(zone, start_pfn, memblock_size >> PAGE_SHIFT);
if (ret)
return ret;
/*
 * Update memory regions for memory remove
 */
memblock_remove(base, memblock_size);
/*
 * Remove htab bolted mappings for this section of memory
 */
start = (unsigned long)__va(base);
ret = remove_section_mapping(start, start + memblock_size);
/* Ensure all vmalloc mappings are flushed in case they also
 * hit that section of memory
 */
vm_unmap_aliases();
return ret;
}
/*
 * Handle removal of device node @np.
 *
 * Nodes whose device_type is not "memory" are ignored (returns 0). For
 * memory nodes the base address and size are read from the "reg" property
 * and passed to pseries_remove_memblock(); a missing "reg" yields -EINVAL.
 */
static int pseries_remove_memory(struct device_node *np)
{
	const char *type = of_get_property(np, "device_type", NULL);
	const unsigned int *regs;
	unsigned long base;
	unsigned int lmb_size;

	/* Check to see if we are actually removing memory */
	if (!type || strcmp(type, "memory"))
		return 0;

	/* Find the base address and size of the memblock */
	regs = of_get_property(np, "reg", NULL);
	if (!regs)
		return -EINVAL;

	base = *(unsigned long *)regs;
	lmb_size = regs[3];

	return pseries_remove_memblock(base, lmb_size);
}
/*
 * Handle addition of device node @np.
 *
 * Nodes whose device_type is not "memory" are ignored (returns 0). For
 * memory nodes the base address and size are read from the "reg" property
 * and registered with memblock_add(). Returns -EINVAL when "reg" is missing
 * or memblock_add() fails, 0 otherwise.
 */
static int pseries_add_memory(struct device_node *np)
{
	const char *type = of_get_property(np, "device_type", NULL);
	const unsigned int *regs;
	unsigned long base;
	unsigned int lmb_size;

	/* Check to see if we are actually adding memory */
	if (!type || strcmp(type, "memory"))
		return 0;

	/* Find the base and size of the memblock */
	regs = of_get_property(np, "reg", NULL);
	if (!regs)
		return -EINVAL;

	base = *(unsigned long *)regs;
	lmb_size = regs[3];

	/* Update memory region to represent the memory add */
	if (memblock_add(base, lmb_size) < 0)
		return -EINVAL;

	return 0;
}
/*
 * Add or remove one memory block at physical address *@base, sized by
 * get_memblock_size(), according to @action (PSERIES_DRCONF_MEM_ADD or
 * PSERIES_DRCONF_MEM_REMOVE). Returns -EINVAL for a zero block size, an
 * unknown action or a failed memblock_add().
 */
static int pseries_drconf_memory(unsigned long *base, unsigned int action)
{
	unsigned long memblock_size = get_memblock_size();

	if (!memblock_size)
		return -EINVAL;

	switch (action) {
	case PSERIES_DRCONF_MEM_ADD:
		if (memblock_add(*base, memblock_size) < 0)
			return -EINVAL;
		return 0;
	case PSERIES_DRCONF_MEM_REMOVE:
		return pseries_remove_memblock(*base, memblock_size);
	default:
		return -EINVAL;
	}
}
/*
 * Reconfiguration notifier callback for memory: dispatches node add/remove
 * and DRCONF add/remove actions to their handlers and ignores anything
 * else. The handler result is converted with notifier_from_errno().
 */
static int pseries_memory_notifier(struct notifier_block *nb,
				   unsigned long action, void *node)
{
	int err = 0;

	if (action == PSERIES_RECONFIG_ADD)
		err = pseries_add_memory(node);
	else if (action == PSERIES_RECONFIG_REMOVE)
		err = pseries_remove_memory(node);
	else if (action == PSERIES_DRCONF_MEM_ADD ||
		 action == PSERIES_DRCONF_MEM_REMOVE)
		err = pseries_drconf_memory(node, action);

	return notifier_from_errno(err);
}
/* Notifier block registered by pseries_memory_hotplug_init() on LPAR. */
static struct notifier_block pseries_mem_nb = {
.notifier_call = pseries_memory_notifier,
};
/*
 * Register the memory reconfiguration notifier when running with the LPAR
 * firmware feature. Always returns 0.
 */
static int __init pseries_memory_hotplug_init(void)
{
	if (!firmware_has_feature(FW_FEATURE_LPAR))
		return 0;

	pSeries_reconfig_notifier_register(&pseries_mem_nb);
	return 0;
}
machine_device_initcall(pseries, pseries_memory_hotplug_init);
@@ -0,0 +1,270 @@
/*
* This file contains the generic code to perform a call to the
* pSeries LPAR hypervisor.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <asm/hvcall.h>
#include <asm/processor.h>
#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>
#include <asm/ptrace.h>
/*
 * Byte offset from r1 of the stack slot associated with register i:
 * STK_PARM(r3) is 48 and each following register is 8 bytes further on.
 * NOTE(review): relies on the rN symbols expanding to the number N -
 * confirm against asm/ppc_asm.h.
 */
#define STK_PARM(i) (48 + ((i)-3)*8)
#ifdef CONFIG_TRACEPOINTS
/*
 * 64-bit counter placed in the TOC so the macros below can load it relative
 * to r2. A zero value makes them skip the tracing calls.
 */
.section ".toc","aw"
.globl hcall_tracepoint_refcount
hcall_tracepoint_refcount:
.llong 0
.section ".text"
/*
 * precall must preserve all registers. use unused STK_PARM()
 * areas to save snapshots and opcode. We branch around this
 * in early init (eg when populating the MMU hashtable) by using an
 * unconditional cpu feature.
 *
 * Sequence: load hcall_tracepoint_refcount, stash it at 32(r1) for the
 * postcall macro and skip to 1: when it is zero. Otherwise save LR and
 * r3-r10 in their stack slots, pass the address of the FIRST_REG slot in
 * r4, call .__trace_hcall_entry on a fresh stack frame, then restore
 * r3-r10 and LR.
 */
#define HCALL_INST_PRECALL(FIRST_REG) \
BEGIN_FTR_SECTION; \
b 1f; \
END_FTR_SECTION(0, 1); \
ld r12,hcall_tracepoint_refcount@toc(r2); \
std r12,32(r1); \
cmpdi r12,0; \
beq+ 1f; \
mflr r0; \
std r3,STK_PARM(r3)(r1); \
std r4,STK_PARM(r4)(r1); \
std r5,STK_PARM(r5)(r1); \
std r6,STK_PARM(r6)(r1); \
std r7,STK_PARM(r7)(r1); \
std r8,STK_PARM(r8)(r1); \
std r9,STK_PARM(r9)(r1); \
std r10,STK_PARM(r10)(r1); \
std r0,16(r1); \
addi r4,r1,STK_PARM(FIRST_REG); \
stdu r1,-STACK_FRAME_OVERHEAD(r1); \
bl .__trace_hcall_entry; \
addi r1,r1,STACK_FRAME_OVERHEAD; \
ld r0,16(r1); \
ld r3,STK_PARM(r3)(r1); \
ld r4,STK_PARM(r4)(r1); \
ld r5,STK_PARM(r5)(r1); \
ld r6,STK_PARM(r6)(r1); \
ld r7,STK_PARM(r7)(r1); \
ld r8,STK_PARM(r8)(r1); \
ld r9,STK_PARM(r9)(r1); \
ld r10,STK_PARM(r10)(r1); \
mtlr r0; \
1:
/*
 * postcall is performed immediately before function return which
 * allows liberal use of volatile registers. We branch around this
 * in early init (eg when populating the MMU hashtable) by using an
 * unconditional cpu feature.
 *
 * Sequence: reload the value the precall macro stashed at 32(r1) and skip
 * to 1: when it is zero. Otherwise fetch the r3 value saved by the precall
 * from the STK_PARM(r3) slot, park the hcall return value there, and call
 * .__trace_hcall_exit with r3 = saved value, r4 = return value and r5 as
 * set up by the wrapper macro. r3 and LR are restored afterwards.
 */
#define __HCALL_INST_POSTCALL \
BEGIN_FTR_SECTION; \
b 1f; \
END_FTR_SECTION(0, 1); \
ld r12,32(r1); \
cmpdi r12,0; \
beq+ 1f; \
mflr r0; \
ld r6,STK_PARM(r3)(r1); \
std r3,STK_PARM(r3)(r1); \
mr r4,r3; \
mr r3,r6; \
std r0,16(r1); \
stdu r1,-STACK_FRAME_OVERHEAD(r1); \
bl .__trace_hcall_exit; \
addi r1,r1,STACK_FRAME_OVERHEAD; \
ld r0,16(r1); \
ld r3,STK_PARM(r3)(r1); \
mtlr r0; \
1:
/* Postcall for hcalls without a return buffer: r5 is passed as 0. */
#define HCALL_INST_POSTCALL_NORETS \
li r5,0; \
__HCALL_INST_POSTCALL
/* Postcall for hcalls with a return buffer: r5 is copied from BUFREG. */
#define HCALL_INST_POSTCALL(BUFREG) \
mr r5,BUFREG; \
__HCALL_INST_POSTCALL
#else
/* Without CONFIG_TRACEPOINTS the instrumentation macros expand to nothing. */
#define HCALL_INST_PRECALL(FIRST_ARG)
#define HCALL_INST_POSTCALL_NORETS
#define HCALL_INST_POSTCALL(BUFREG)
#endif
.text
/*
 * plpar_hcall_norets: hypervisor call that returns only a status in r3.
 * The condition register is saved at 8(r1) across the call and restored
 * before returning; arguments are passed through in the registers they
 * arrived in.
 */
_GLOBAL(plpar_hcall_norets)
HMT_MEDIUM
mfcr r0
stw r0,8(r1)
HCALL_INST_PRECALL(r4)
HVSC /* invoke the hypervisor */
HCALL_INST_POSTCALL_NORETS
lwz r0,8(r1)
mtcrf 0xff,r0
blr /* return r3 = status */
/*
 * plpar_hcall: hypervisor call with a return buffer.
 * On entry r4 holds the return-buffer pointer and r5-r10 the hcall
 * arguments. The buffer pointer is parked in its stack slot, the arguments
 * are shifted down into r4-r9, and after the call r4-r7 are stored to
 * offsets 0-24 of the buffer. The condition register is preserved via
 * 8(r1).
 */
_GLOBAL(plpar_hcall)
HMT_MEDIUM
mfcr r0
stw r0,8(r1)
HCALL_INST_PRECALL(r5)
std r4,STK_PARM(r4)(r1) /* Save ret buffer */
mr r4,r5
mr r5,r6
mr r6,r7
mr r7,r8
mr r8,r9
mr r9,r10
HVSC /* invoke the hypervisor */
ld r12,STK_PARM(r4)(r1)
std r4, 0(r12)
std r5, 8(r12)
std r6, 16(r12)
std r7, 24(r12)
HCALL_INST_POSTCALL(r12)
lwz r0,8(r1)
mtcrf 0xff,r0
blr /* return r3 = status */
/*
 * plpar_hcall_raw can be called in real mode. kexec/kdump need some
 * hypervisor calls to be executed in real mode. So plpar_hcall_raw
 * does not access the per cpu hypervisor call statistics variables,
 * since these variables may not be present in the RMO region.
 *
 * Register handling is the same as plpar_hcall, minus the
 * HCALL_INST_PRECALL / HCALL_INST_POSTCALL instrumentation.
 */
_GLOBAL(plpar_hcall_raw)
HMT_MEDIUM
mfcr r0
stw r0,8(r1)
std r4,STK_PARM(r4)(r1) /* Save ret buffer */
mr r4,r5
mr r5,r6
mr r6,r7
mr r7,r8
mr r8,r9
mr r9,r10
HVSC /* invoke the hypervisor */
ld r12,STK_PARM(r4)(r1)
std r4, 0(r12)
std r5, 8(r12)
std r6, 16(r12)
std r7, 24(r12)
lwz r0,8(r1)
mtcrf 0xff,r0
blr /* return r3 = status */
/*
 * plpar_hcall9: hypervisor call with up to nine arguments and nine return
 * values. As in plpar_hcall, r4 is the return-buffer pointer and r5-r10
 * are shifted down into r4-r9; arguments 7-9 are loaded from stack slots
 * into r10-r12. After the call r4-r11 and the returned r12 (moved through
 * r0, because r12 is reused for the buffer pointer) are stored to offsets
 * 0-64 of the buffer.
 */
_GLOBAL(plpar_hcall9)
HMT_MEDIUM
mfcr r0
stw r0,8(r1)
HCALL_INST_PRECALL(r5)
std r4,STK_PARM(r4)(r1) /* Save ret buffer */
mr r4,r5
mr r5,r6
mr r6,r7
mr r7,r8
mr r8,r9
mr r9,r10
ld r10,STK_PARM(r11)(r1) /* put arg7 in R10 */
ld r11,STK_PARM(r12)(r1) /* put arg8 in R11 */
ld r12,STK_PARM(r13)(r1) /* put arg9 in R12 */
HVSC /* invoke the hypervisor */
mr r0,r12
ld r12,STK_PARM(r4)(r1)
std r4, 0(r12)
std r5, 8(r12)
std r6, 16(r12)
std r7, 24(r12)
std r8, 32(r12)
std r9, 40(r12)
std r10,48(r12)
std r11,56(r12)
std r0, 64(r12)
HCALL_INST_POSTCALL(r12)
lwz r0,8(r1)
mtcrf 0xff,r0
blr /* return r3 = status */
/*
 * See plpar_hcall_raw to see why this is needed.
 * Same register handling as plpar_hcall9, without the HCALL_INST_PRECALL /
 * HCALL_INST_POSTCALL instrumentation.
 */
_GLOBAL(plpar_hcall9_raw)
HMT_MEDIUM
mfcr r0
stw r0,8(r1)
std r4,STK_PARM(r4)(r1) /* Save ret buffer */
mr r4,r5
mr r5,r6
mr r6,r7
mr r7,r8
mr r8,r9
mr r9,r10
ld r10,STK_PARM(r11)(r1) /* put arg7 in R10 */
ld r11,STK_PARM(r12)(r1) /* put arg8 in R11 */
ld r12,STK_PARM(r13)(r1) /* put arg9 in R12 */
HVSC /* invoke the hypervisor */
mr r0,r12
ld r12,STK_PARM(r4)(r1)
std r4, 0(r12)
std r5, 8(r12)
std r6, 16(r12)
std r7, 24(r12)
std r8, 32(r12)
std r9, 40(r12)
std r10,48(r12)
std r11,56(r12)
std r0, 64(r12)
lwz r0,8(r1)
mtcrf 0xff,r0
blr /* return r3 = status */
@@ -0,0 +1,165 @@
/*
* Copyright (C) 2006 Mike Kravetz IBM Corporation
*
* Hypervisor Call Instrumentation
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/cpumask.h>
#include <asm/hvcall.h>
#include <asm/firmware.h>
#include <asm/cputable.h>
#include <asm/trace.h>
/*
 * Per-cpu array of hcall statistics, indexed by opcode / 4 in the probe
 * functions and exposed per cpu through debugfs by hcall_inst_init().
 */
DEFINE_PER_CPU(struct hcall_stats[HCALL_STAT_ARRAY_SIZE], hcall_stats);
/*
 * Routines for displaying the statistics in debugfs
 */

/*
 * seq_file start: the iterator cookie is the position plus one, cast to a
 * pointer, so that position 0 does not look like the NULL end marker.
 * Iteration ends once the position reaches HCALL_STAT_ARRAY_SIZE - 1.
 */
static void *hc_start(struct seq_file *m, loff_t *pos)
{
	if ((int)*pos >= (HCALL_STAT_ARRAY_SIZE-1))
		return NULL;

	return (void *)(unsigned long)(*pos + 1);
}
/* seq_file next: advance the position and reuse hc_start() for the cookie. */
static void *hc_next(struct seq_file *m, void *p, loff_t * pos)
{
	*pos += 1;

	return hc_start(m, pos);
}
/* seq_file stop: nothing to release, the iterator holds no resources. */
static void hc_stop(struct seq_file *m, void *p)
{
}
/*
 * seq_file show: print one line for the statistics slot selected by the
 * iterator cookie @p, but only when that slot has recorded calls. The line
 * carries the slot index shifted left by two, the call count and the
 * accumulated timebase, plus the accumulated PURR when the CPU has
 * CPU_FTR_PURR. Always returns 0.
 */
static int hc_show(struct seq_file *m, void *p)
{
	const unsigned long idx = (unsigned long)p;
	struct hcall_stats *stats = m->private;
	struct hcall_stats *entry = &stats[idx];

	if (!entry->num_calls)
		return 0;

	if (!cpu_has_feature(CPU_FTR_PURR)) {
		seq_printf(m, "%lu %lu %lu\n", idx<<2,
			   entry->num_calls,
			   entry->tb_total);
		return 0;
	}

	seq_printf(m, "%lu %lu %lu %lu\n", idx<<2,
		   entry->num_calls,
		   entry->tb_total,
		   entry->purr_total);
	return 0;
}
/* seq_file iterator used for every per-cpu hcall statistics file. */
static const struct seq_operations hcall_inst_seq_ops = {
.start = hc_start,
.next = hc_next,
.stop = hc_stop,
.show = hc_show
};
/*
 * Open handler for the per-cpu statistics files.
 *
 * Sets up the seq_file iterator and points its private data at the inode's
 * i_private (the per-cpu hcall_stats array given to debugfs_create_file()).
 * Returns 0 on success or the error from seq_open(); in the error case the
 * seq_file is not touched, since it was never set up.
 */
static int hcall_inst_seq_open(struct inode *inode, struct file *file)
{
	struct seq_file *seq;
	int rc;

	rc = seq_open(file, &hcall_inst_seq_ops);
	if (rc)
		return rc;

	seq = file->private_data;
	seq->private = file->f_path.dentry->d_inode->i_private;

	return 0;
}
/* File operations for the per-cpu statistics files (seq_file based). */
static const struct file_operations hcall_inst_seq_fops = {
.open = hcall_inst_seq_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
};
#define HCALL_ROOT_DIR "hcall_inst"
#define CPU_NAME_BUF_SIZE 32
/*
 * Tracepoint probe run on hcall entry: snapshot the timebase and PURR into
 * this cpu's statistics slot for @opcode. Opcodes above MAX_HCALL_OPCODE
 * are ignored.
 */
static void probe_hcall_entry(void *ignored, unsigned long opcode, unsigned long *args)
{
	struct hcall_stats *slot;

	if (opcode <= MAX_HCALL_OPCODE) {
		slot = &__get_cpu_var(hcall_stats)[opcode / 4];
		slot->tb_start = mftb();
		slot->purr_start = mfspr(SPRN_PURR);
	}
}
/*
 * Tracepoint probe run on hcall exit: count the call and add the timebase
 * and PURR elapsed since the entry snapshot to this cpu's statistics slot
 * for @opcode. Opcodes above MAX_HCALL_OPCODE are ignored.
 */
static void probe_hcall_exit(void *ignored, unsigned long opcode, unsigned long retval,
			     unsigned long *retbuf)
{
	struct hcall_stats *slot;

	if (opcode <= MAX_HCALL_OPCODE) {
		slot = &__get_cpu_var(hcall_stats)[opcode / 4];
		slot->num_calls++;
		slot->tb_total += mftb() - slot->tb_start;
		slot->purr_total += mfspr(SPRN_PURR) - slot->purr_start;
	}
}
/*
 * Initcall: set up hcall instrumentation on LPAR systems.
 *
 * Registers the entry/exit tracepoint probes and creates one debugfs file
 * per possible cpu under HCALL_ROOT_DIR. Returns 0 on success or when the
 * LPAR firmware feature is absent, -EINVAL if a probe cannot be registered
 * and -ENOMEM if a debugfs entry cannot be created. On any failure
 * everything set up so far is undone, so no probe keeps running without a
 * way to read its statistics.
 */
static int __init hcall_inst_init(void)
{
	struct dentry *hcall_root;
	struct dentry *hcall_file;
	char cpu_name_buf[CPU_NAME_BUF_SIZE];
	int cpu;
	int rc;

	if (!firmware_has_feature(FW_FEATURE_LPAR))
		return 0;

	if (register_trace_hcall_entry(probe_hcall_entry, NULL))
		return -EINVAL;

	if (register_trace_hcall_exit(probe_hcall_exit, NULL)) {
		rc = -EINVAL;
		goto out_unreg_entry;
	}

	hcall_root = debugfs_create_dir(HCALL_ROOT_DIR, NULL);
	if (!hcall_root) {
		rc = -ENOMEM;
		goto out_unreg_exit;
	}

	for_each_possible_cpu(cpu) {
		snprintf(cpu_name_buf, CPU_NAME_BUF_SIZE, "cpu%d", cpu);
		hcall_file = debugfs_create_file(cpu_name_buf, S_IRUGO,
						 hcall_root,
						 per_cpu(hcall_stats, cpu),
						 &hcall_inst_seq_fops);
		if (!hcall_file) {
			rc = -ENOMEM;
			goto out_remove_dir;
		}
	}

	return 0;

out_remove_dir:
	/* also removes any per-cpu files already created */
	debugfs_remove_recursive(hcall_root);
out_unreg_exit:
	unregister_trace_hcall_exit(probe_hcall_exit, NULL);
out_unreg_entry:
	unregister_trace_hcall_entry(probe_hcall_entry, NULL);
	return rc;
}
__initcall(hcall_inst_init);
@@ -0,0 +1,81 @@
/*
* hvconsole.c
* Copyright (C) 2004 Hollis Blanchard, IBM Corporation
* Copyright (C) 2004 IBM Corporation
*
* Additional Author(s):
* Ryan S. Arnold <rsa@us.ibm.com>
*
* LPAR console support.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/errno.h>
#include <asm/hvcall.h>
#include <asm/hvconsole.h>
#include "plpar_wrappers.h"
/**
 * hvc_get_chars - retrieve characters from firmware for denoted vterm adapter
 * @vtermno: The vtermno or unit_address of the adapter from which to fetch the
 *	data.
 * @buf: The character buffer into which to put the character data fetched from
 *	firmware.
 * @count: not used by this function.
 *
 * Returns the number of characters reported by plpar_get_term_char(), or 0
 * when that call does not return H_SUCCESS.
 */
int hvc_get_chars(uint32_t vtermno, char *buf, int count)
{
	unsigned long got;

	if (plpar_get_term_char(vtermno, &got, buf) != H_SUCCESS)
		return 0;

	return got;
}
EXPORT_SYMBOL(hvc_get_chars);
/**
 * hvc_put_chars: send characters to firmware for denoted vterm adapter
 * @vtermno: The vtermno or unit_address of the adapter from which the data
 *	originated.
 * @buf: The character buffer that contains the character data to send to
 *	firmware.
 * @count: Send this number of characters (clamped to MAX_VIO_PUT_CHARS).
 *
 * Returns the number of characters sent on H_SUCCESS, -EAGAIN on H_BUSY and
 * -EIO for any other hypervisor status.
 *
 * NOTE(review): two unsigned longs are always read from @buf regardless of
 * @count - confirm that callers pass a suitably sized and aligned buffer.
 */
int hvc_put_chars(uint32_t vtermno, const char *buf, int count)
{
	unsigned long *words = (unsigned long *) buf;
	long status;

	/* hcall will ret H_PARAMETER if 'count' exceeds firmware max.*/
	if (count > MAX_VIO_PUT_CHARS)
		count = MAX_VIO_PUT_CHARS;

	status = plpar_hcall_norets(H_PUT_TERM_CHAR, vtermno, count, words[0],
				    words[1]);

	switch (status) {
	case H_SUCCESS:
		return count;
	case H_BUSY:
		return -EAGAIN;
	default:
		return -EIO;
	}
}
EXPORT_SYMBOL(hvc_put_chars);
@@ -0,0 +1,251 @@
/*
* hvcserver.c
* Copyright (C) 2004 Ryan S Arnold, IBM Corporation
*
* PPC64 virtual I/O console server support.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <asm/hvcall.h>
#include <asm/hvcserver.h>
#include <asm/io.h>
/* Version string reported through MODULE_VERSION() below. */
#define HVCS_ARCH_VERSION "1.0.0"
MODULE_AUTHOR("Ryan S. Arnold <rsa@us.ibm.com>");
MODULE_DESCRIPTION("IBM hvcs ppc64 API");
MODULE_LICENSE("GPL");
MODULE_VERSION(HVCS_ARCH_VERSION);
/*
 * Convert arch specific return codes into relevant errnos. The hvcs
 * functions aren't performance sensitive, so this conversion isn't an
 * issue.
 *
 * H_SUCCESS maps to 0, H_PARAMETER to -EINVAL, H_HARDWARE to -EIO, every
 * busy code to -EBUSY and anything else (including H_FUNCTION) to -EPERM.
 */
static int hvcs_convert(long to_convert)
{
	int err;

	switch (to_convert) {
	case H_SUCCESS:
		err = 0;
		break;
	case H_PARAMETER:
		err = -EINVAL;
		break;
	case H_HARDWARE:
		err = -EIO;
		break;
	case H_BUSY:
	case H_LONG_BUSY_ORDER_1_MSEC:
	case H_LONG_BUSY_ORDER_10_MSEC:
	case H_LONG_BUSY_ORDER_100_MSEC:
	case H_LONG_BUSY_ORDER_1_SEC:
	case H_LONG_BUSY_ORDER_10_SEC:
	case H_LONG_BUSY_ORDER_100_SEC:
		err = -EBUSY;
		break;
	case H_FUNCTION: /* fall through */
	default:
		err = -EPERM;
		break;
	}

	return err;
}
/**
* hvcs_free_partner_info - free pi allocated by hvcs_get_partner_info
* @head: list_head pointer for an allocated list of partner info structs to
* free.
*
* This function is used to free the partner info list that was returned by
* calling hvcs_get_partner_info().
*/
int hvcs_free_partner_info(struct list_head *head)
{
struct hvcs_partner_info *pi;
struct list_head *element;
if (!head)
return -EINVAL;
while (!list_empty(head)) {
element = head->next;
pi = list_entry(element, struct hvcs_partner_info, node);
list_del(element);
kfree(pi);
}
return 0;
}
EXPORT_SYMBOL(hvcs_free_partner_info);
/*
 * Helper function for hvcs_get_partner_info: issue one
 * H_VTERM_PARTNER_INFO hcall, passing the previously returned partition ID
 * and unit address plus the physical address of @pi_buff, and translate the
 * hypervisor status into an errno with hvcs_convert().
 */
static int hvcs_next_partner(uint32_t unit_address,
		unsigned long last_p_partition_ID,
		unsigned long last_p_unit_address, unsigned long *pi_buff)
{
	return hvcs_convert(plpar_hcall_norets(H_VTERM_PARTNER_INFO,
					       unit_address,
					       last_p_partition_ID,
					       last_p_unit_address,
					       virt_to_phys(pi_buff)));
}
/**
 * hvcs_get_partner_info - Get all of the partner info for a vty-server adapter
 * @unit_address: The unit_address of the vty-server adapter for which this
 *	function is fetching partner info.
 * @head: An initialized list_head pointer to an empty list to use to return the
 *	list of partner info fetched from the hypervisor to the caller.
 * @pi_buff: A page sized buffer pre-allocated prior to calling this function
 *	that is to be used by firmware as an iterator to keep track
 *	of the partner info retrieval.
 *
 * This function returns 0 on success, or if there is no partner info.
 * It returns -EINVAL when @head or @pi_buff is NULL, -ENOMEM when a list
 * entry cannot be allocated (any entries already collected are freed), and
 * the converted hypervisor error when the very first query fails. A failing
 * query after at least one partner has been collected is not reported as an
 * error.
 *
 * The pi_buff is pre-allocated prior to calling this function because this
 * function may be called with a spin_lock held and kmalloc of a page is not
 * recommended as GFP_ATOMIC.
 *
 * As read by this function, the first long of the buffer holds the partner
 * partition ID, the second long holds the partner unit address, and the
 * Converged Location Code string starts at pi_buff[2] (diff size than the
 * unsigned longs, hence the casting you see below).
 *
 * Invocation of this function should always be followed by an invocation of
 * hvcs_free_partner_info() using a pointer to the SAME list head instance
 * that was passed as a parameter to this function.
 */
int hvcs_get_partner_info(uint32_t unit_address, struct list_head *head,
		unsigned long *pi_buff)
{
	/*
	 * Dealt with as longs because of the hcall interface even though the
	 * values are uint32_t.
	 */
	unsigned long last_p_partition_ID;
	unsigned long last_p_unit_address;
	struct hvcs_partner_info *next_partner_info;
	int retval;

	/* invalid parameters: check before the buffer is touched */
	if (!head || !pi_buff)
		return -EINVAL;

	memset(pi_buff, 0x00, PAGE_SIZE);
	last_p_partition_ID = last_p_unit_address = ~0UL;
	INIT_LIST_HEAD(head);

	for (;;) {
		retval = hvcs_next_partner(unit_address, last_p_partition_ID,
					   last_p_unit_address, pi_buff);
		if (retval) {
			/*
			 * Don't indicate that we've failed if we have
			 * any list elements.
			 */
			if (!list_empty(head))
				return 0;
			return retval;
		}

		last_p_partition_ID = pi_buff[0];
		last_p_unit_address = pi_buff[1];

		/* This indicates that there are no further partners */
		if (last_p_partition_ID == ~0UL
		    && last_p_unit_address == ~0UL)
			break;

		/* This is a very small struct and will be freed soon in
		 * hvcs_free_partner_info(). */
		next_partner_info = kmalloc(sizeof(*next_partner_info),
					    GFP_ATOMIC);
		if (!next_partner_info) {
			printk(KERN_WARNING "HVCONSOLE: kmalloc() failed to"
			       " allocate partner info struct.\n");
			hvcs_free_partner_info(head);
			return -ENOMEM;
		}

		next_partner_info->unit_address
			= (unsigned int)last_p_unit_address;
		next_partner_info->partition_ID
			= (unsigned int)last_p_partition_ID;

		/*
		 * Bound the copy by the destination size so an over-long
		 * string from firmware cannot overflow location_code; the
		 * result is always NUL-terminated.
		 */
		strlcpy(&next_partner_info->location_code[0],
			(char *)&pi_buff[2],
			sizeof(next_partner_info->location_code));

		list_add_tail(&(next_partner_info->node), head);
	}

	return 0;
}
EXPORT_SYMBOL(hvcs_get_partner_info);
/**
 * hvcs_register_connection - establish a connection between this vty-server and
 *	a vty.
 * @unit_address: The unit address of the vty-server adapter that is to
 *	establish a connection.
 * @p_partition_ID: The partition ID of the vty adapter that is to be connected.
 * @p_unit_address: The unit address of the vty adapter to which the vty-server
 *	is to be connected.
 *
 * A first -EINVAL may mean that the partner info for the target unit address
 * is stale: the caller must invoke hvcs_get_partner_info() and then call this
 * function again. A second -EINVAL most likely means that the partner is
 * already registered to a different vty-server adapter. It can also mean
 * that one of the parms is not valid, for instance because the link between
 * the vty-server adapter and the vty adapter was removed. Firmware
 * implemented it this way.
 */
int hvcs_register_connection( uint32_t unit_address,
		uint32_t p_partition_ID, uint32_t p_unit_address)
{
	long hcall_rc = plpar_hcall_norets(H_REGISTER_VTERM, unit_address,
					   p_partition_ID, p_unit_address);

	return hvcs_convert(hcall_rc);
}
EXPORT_SYMBOL(hvcs_register_connection);
/**
 * hvcs_free_connection - free the connection between a vty-server and vty
 * @unit_address: The unit address of the vty-server that is to have its
 *	connection severed.
 *
 * Severs the partner connection between a vty-server adapter and a vty
 * adapter by issuing H_FREE_VTERM.
 *
 * If -EBUSY is returned continue to call this function until 0 is returned.
 */
int hvcs_free_connection(uint32_t unit_address)
{
	long hcall_rc = plpar_hcall_norets(H_FREE_VTERM, unit_address);

	return hvcs_convert(hcall_rc);
}
EXPORT_SYMBOL(hvcs_free_connection);
@@ -0,0 +1,165 @@
/*
* Copyright 2010 2011 Mark Nelson and Tseng-Hui (Frank) Lin, IBM Corporation
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/of.h>
#include <linux/list.h>
#include <linux/notifier.h>
#include <asm/machdep.h>
#include <asm/rtas.h>
#include <asm/irq.h>
#include <asm/io_event_irq.h>
#include "pseries.h"
/*
* IO event interrupt is a mechanism provided by RTAS to return
* information about hardware error and non-error events. Device
* drivers can register their event handlers to receive events.
* Device drivers are expected to use atomic_notifier_chain_register()
* and atomic_notifier_chain_unregister() to register and unregister
* their event handlers. Since multiple IO event types and scopes
* share an IO event interrupt, the event handlers are called one
* by one until the IO event is claimed by one of the handlers.
* The event handlers are expected to return NOTIFY_OK if the
* event is handled by the event handler or NOTIFY_DONE if the
* event does not belong to the handler.
*
* Usage:
*
* Notifier function:
* #include <asm/io_event_irq.h>
* int event_handler(struct notifier_block *nb, unsigned long val, void *data) {
* p = (struct pseries_io_event_sect_data *) data;
* if (! is_my_event(p->scope, p->event_type)) return NOTIFY_DONE;
* :
* :
* return NOTIFY_OK;
* }
* struct notifier_block event_nb = {
* .notifier_call = event_handler,
* }
*
* Registration:
* atomic_notifier_chain_register(&pseries_ioei_notifier_list, &event_nb);
*
* Unregistration:
* atomic_notifier_chain_unregister(&pseries_ioei_notifier_list, &event_nb);
*/
ATOMIC_NOTIFIER_HEAD(pseries_ioei_notifier_list);
EXPORT_SYMBOL_GPL(pseries_ioei_notifier_list);
static int ioei_check_exception_token;
static char ioei_rtas_buf[RTAS_DATA_BUF_SIZE] __cacheline_aligned;
/**
 * Find the data portion of an IO Event section from event log.
 * @elog: RTAS error/event log.
 *
 * Return:
 *	pointer to a valid IO event section data. NULL if the log is not of
 *	type RTAS_TYPE_IO or contains no IO Event section; a warning is
 *	printed (once) in either case.
 */
static struct pseries_io_event * ioei_find_event(struct rtas_error_log *elog)
{
	struct pseries_errorlog *sect;

	/* We should only ever get called for io-event interrupts, but if
	 * we do get called for another type then something went wrong so
	 * make some noise about it.
	 * RTAS_TYPE_IO only exists in extended event log version 6 or later.
	 * No need to check event log version.
	 */
	if (unlikely(elog->type != RTAS_TYPE_IO)) {
		/* Terminate the line so later output is not glued onto it. */
		printk_once(KERN_WARNING "io_event_irq: Unexpected event type %d\n",
			    elog->type);
		return NULL;
	}

	sect = get_pseries_errorlog(elog, PSERIES_ELOG_SECT_ID_IO_EVENT);
	if (unlikely(!sect)) {
		printk_once(KERN_WARNING "io_event_irq: RTAS extended event "
			    "log does not contain an IO Event section. "
			    "Could be a bug in system firmware!\n");
		return NULL;
	}

	return (struct pseries_io_event *) &sect->data;
}
/*
 * PAPR:
 * - check-exception returns the first found error or event and clears that
 *   error or event so it is reported once.
 * - Each interrupt returns one event. If a platform chooses to report
 *   multiple events through a single interrupt, it must ensure that the
 *   interrupt remains asserted until check-exception has been used to
 *   process all outstanding events for that interrupt.
 *
 * Implementation notes:
 * - Events must be processed in the order they are returned. Hence,
 *   sequential in nature.
 * - The owner of an event is determined by combinations of scope,
 *   event type, and sub-type. There is no easy way to pre-sort clients
 *   by scope or event type alone. For example, Torrent ISR route change
 *   event is reported with scope 0x00 (Not Applicable) rather than
 *   0x3B (Torrent-hub). It is better to let the clients identify
 *   who owns the event.
 */
static irqreturn_t ioei_interrupt(int irq, void *dev_id)
{
	struct pseries_io_event *io_event;

	/* Keep draining until check-exception stops returning 0. */
	while (rtas_call(ioei_check_exception_token, 6, 1, NULL,
			 RTAS_VECTOR_EXTERNAL_INTERRUPT,
			 virq_to_hw(irq),
			 RTAS_IO_EVENTS, 1 /* Time Critical */,
			 __pa(ioei_rtas_buf),
			 RTAS_DATA_BUF_SIZE) == 0) {
		io_event = ioei_find_event(
				(struct rtas_error_log *)ioei_rtas_buf);

		/* Logs without an IO event section are silently skipped. */
		if (io_event)
			atomic_notifier_call_chain(&pseries_ioei_notifier_list,
						   0, io_event);
	}

	return IRQ_HANDLED;
}
/*
 * Look up the check-exception RTAS token and hook ioei_interrupt() onto the
 * interrupts listed under /event-sources/ibm,io-events. Returns -ENODEV when
 * either the RTAS service or the device tree node is missing, 0 otherwise.
 */
static int __init ioei_init(void)
{
	struct device_node *events_np;

	ioei_check_exception_token = rtas_token("check-exception");
	if (ioei_check_exception_token == RTAS_UNKNOWN_SERVICE)
		return -ENODEV;

	events_np = of_find_node_by_path("/event-sources/ibm,io-events");
	if (!events_np)
		return -ENODEV;

	request_event_sources_irqs(events_np, ioei_interrupt, "IO_EVENT");
	pr_info("IBM I/O event interrupts enabled\n");
	of_node_put(events_np);

	return 0;
}
machine_subsys_initcall(pseries, ioei_init);
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,76 @@
/*
* Copyright 2006 Michael Ellerman, IBM Corporation
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
#include <linux/interrupt.h>
#include <asm/machdep.h>
#include <asm/page.h>
#include <asm/firmware.h>
#include <asm/kexec.h>
#include <asm/mpic.h>
#include <asm/xics.h>
#include <asm/smp.h>
#include "pseries.h"
#include "plpar_wrappers.h"
/*
 * Common kexec CPU teardown: on SPLPAR firmware, and only when not crashing,
 * deregister this CPU's DTL (if enabled), SLB shadow buffer and VPA from the
 * hypervisor. Failures are logged and otherwise ignored.
 */
static void pseries_kexec_cpu_down(int crash_shutdown, int secondary)
{
	int this_cpu, this_hwcpu, rc;

	/* Don't risk a hypervisor call if we're crashing */
	if (!firmware_has_feature(FW_FEATURE_SPLPAR) || crash_shutdown)
		return;

	this_cpu = smp_processor_id();
	this_hwcpu = hard_smp_processor_id();

	if (get_lppaca()->dtl_enable_mask) {
		rc = unregister_dtl(this_hwcpu);
		if (rc)
			pr_err("WARNING: DTL deregistration for cpu "
			       "%d (hw %d) failed with %d\n",
			       this_cpu, this_hwcpu, rc);
	}

	rc = unregister_slb_shadow(this_hwcpu);
	if (rc)
		pr_err("WARNING: SLB shadow buffer deregistration "
		       "for cpu %d (hw %d) failed with %d\n",
		       this_cpu, this_hwcpu, rc);

	rc = unregister_vpa(this_hwcpu);
	if (rc)
		pr_err("WARNING: VPA deregistration for cpu %d "
		       "(hw %d) failed with %d\n",
		       this_cpu, this_hwcpu, rc);
}
/*
 * kexec CPU-down handler for the MPIC case: run the common pSeries teardown,
 * then mpic_teardown_this_cpu() for this CPU.
 */
static void pseries_kexec_cpu_down_mpic(int crash_shutdown, int secondary)
{
pseries_kexec_cpu_down(crash_shutdown, secondary);
mpic_teardown_this_cpu(secondary);
}
/* Install the MPIC variant as the machine's kexec_cpu_down hook. */
void __init setup_kexec_cpu_down_mpic(void)
{
ppc_md.kexec_cpu_down = pseries_kexec_cpu_down_mpic;
}
/*
 * kexec CPU-down handler for the XICS case: run the common pSeries teardown,
 * then xics_kexec_teardown_cpu() for this CPU.
 */
static void pseries_kexec_cpu_down_xics(int crash_shutdown, int secondary)
{
pseries_kexec_cpu_down(crash_shutdown, secondary);
xics_kexec_teardown_cpu(secondary);
}
/* Install the XICS variant as the machine's kexec_cpu_down hook. */
void __init setup_kexec_cpu_down_xics(void)
{
ppc_md.kexec_cpu_down = pseries_kexec_cpu_down_xics;
}
@@ -0,0 +1,641 @@
/*
* pSeries_lpar.c
* Copyright (C) 2001 Todd Inglett, IBM Corporation
*
* pSeries LPAR support.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/* Enables debugging of low-level hash table routines - careful! */
#undef DEBUG
#include <linux/kernel.h>
#include <linux/dma-mapping.h>
#include <linux/console.h>
#include <linux/export.h>
#include <asm/processor.h>
#include <asm/mmu.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/machdep.h>
#include <asm/abs_addr.h>
#include <asm/mmu_context.h>
#include <asm/iommu.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <asm/prom.h>
#include <asm/cputable.h>
#include <asm/udbg.h>
#include <asm/smp.h>
#include <asm/trace.h>
#include <asm/firmware.h>
#include "plpar_wrappers.h"
#include "pseries.h"
/* in hvCall.S */
EXPORT_SYMBOL(plpar_hcall);
EXPORT_SYMBOL(plpar_hcall9);
EXPORT_SYMBOL(plpar_hcall_norets);
extern void pSeries_find_serial_port(void);
/*
 * Register the per-CPU hypervisor areas for logical CPU @cpu: the VPA
 * (lppaca), then - on SPLPAR firmware - the SLB shadow buffer, and finally
 * the dispatch trace log if one has been allocated for that CPU.
 *
 * A VPA registration failure aborts the remaining registrations; SLB shadow
 * and DTL failures are only logged.
 */
void vpa_init(int cpu)
{
	int hwcpu = get_hard_smp_processor_id(cpu);
	unsigned long addr;
	long ret;
	struct paca_struct *pp;
	struct dtl_entry *dtl;

	if (cpu_has_feature(CPU_FTR_ALTIVEC))
		lppaca_of(cpu).vmxregs_in_use = 1;

	addr = __pa(&lppaca_of(cpu));
	ret = register_vpa(hwcpu, addr);

	if (ret) {
		pr_err("WARNING: VPA registration for cpu %d (hw %d) of area "
		       "%lx failed with %ld\n", cpu, hwcpu, addr, ret);
		return;
	}
	/*
	 * PAPR says this feature is SLB-Buffer but firmware never
	 * reports that.  All SPLPAR support SLB shadow buffer.
	 */
	addr = __pa(&slb_shadow[cpu]);
	if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
		ret = register_slb_shadow(hwcpu, addr);
		if (ret)
			pr_err("WARNING: SLB shadow buffer registration for "
			       "cpu %d (hw %d) of area %lx failed with %ld\n",
			       cpu, hwcpu, addr, ret);
	}

	/*
	 * Register dispatch trace log, if one has been allocated.
	 */
	pp = &paca[cpu];
	dtl = pp->dispatch_log;
	if (dtl) {
		pp->dtl_ridx = 0;
		pp->dtl_curr = dtl;
		lppaca_of(cpu).dtl_idx = 0;

		/* hypervisor reads buffer length from this field */
		dtl->enqueue_to_dispatch_time = DISPATCH_LOG_BYTES;
		ret = register_dtl(hwcpu, __pa(dtl));
		if (ret)
			/*
			 * Report the CPU being set up (@cpu), matching the
			 * messages above and the hwcpu printed next to it,
			 * rather than whichever CPU runs this code.
			 */
			pr_err("WARNING: DTL registration of cpu %d (hw %d) "
			       "failed with %ld\n", cpu, hwcpu, ret);
		lppaca_of(cpu).dtl_enable_mask = 2;
	}
}
/*
 * Insert a hash page table entry into @hpte_group through the hypervisor.
 *
 * Returns -1 when the hypervisor reports H_PTEG_FULL, -2 on any other
 * hypervisor error, and otherwise the slot index within the group (low three
 * bits) with bit 3 set when HPTE_V_SECONDARY was requested in @vflags.
 * Debug output is suppressed for bolted entries.
 */
static long pSeries_lpar_hpte_insert(unsigned long hpte_group,
			      unsigned long va, unsigned long pa,
			      unsigned long rflags, unsigned long vflags,
			      int psize, int ssize)
{
	unsigned long lpar_rc;
	unsigned long flags;
	unsigned long slot;
	unsigned long hpte_v, hpte_r;

	if (!(vflags & HPTE_V_BOLTED))
		pr_devel("hpte_insert(group=%lx, va=%016lx, pa=%016lx, "
			 "rflags=%lx, vflags=%lx, psize=%d)\n",
			 hpte_group, va, pa, rflags, vflags, psize);

	hpte_v = hpte_encode_v(va, psize, ssize) | vflags | HPTE_V_VALID;
	hpte_r = hpte_encode_r(pa, psize) | rflags;

	if (!(vflags & HPTE_V_BOLTED))
		pr_devel(" hpte_v=%016lx, hpte_r=%016lx\n", hpte_v, hpte_r);

	/* Now fill in the actual HPTE */
	/* Set CEC cookie to 0         */
	/* Zero page = 0               */
	/* I-cache Invalidate = 0      */
	/* I-cache synchronize = 0     */
	/* Exact = 0                   */
	flags = 0;

	/*
	 * Make pHyp happy: cache-inhibited but not write-through mappings
	 * must not carry the coherent bit. This must be a logical AND; the
	 * bitwise form always evaluated to 0 because !(x) is 0 or 1 and
	 * never overlaps the _PAGE_NO_CACHE bit.
	 */
	if ((rflags & _PAGE_NO_CACHE) && !(rflags & _PAGE_WRITETHRU))
		hpte_r &= ~_PAGE_COHERENT;
	if (firmware_has_feature(FW_FEATURE_XCMO) && !(hpte_r & HPTE_R_N))
		flags |= H_COALESCE_CAND;

	lpar_rc = plpar_pte_enter(flags, hpte_group, hpte_v, hpte_r, &slot);
	if (unlikely(lpar_rc == H_PTEG_FULL)) {
		if (!(vflags & HPTE_V_BOLTED))
			pr_devel(" full\n");
		return -1;
	}

	/*
	 * Since we try and ioremap PHBs we don't own, the pte insert
	 * will fail. However we must catch the failure in hash_page
	 * or we will loop forever, so return -2 in this case.
	 */
	if (unlikely(lpar_rc != H_SUCCESS)) {
		if (!(vflags & HPTE_V_BOLTED))
			pr_devel(" lpar err %lu\n", lpar_rc);
		return -2;
	}
	if (!(vflags & HPTE_V_BOLTED))
		pr_devel(" -> slot: %lu\n", slot & 7);

	/* Because of iSeries, we have to pass down the secondary
	 * bucket bit here as well
	 */
	return (slot & 7) | (!!(vflags & HPTE_V_SECONDARY) << 3);
}
static DEFINE_SPINLOCK(pSeries_lpar_tlbie_lock);
/*
 * Evict one entry from @hpte_group, starting at a timebase-derived slot and
 * probing each of the HPTES_PER_GROUP slots at most once. Returns the number
 * of failed probes that preceded the successful removal, or -1 when nothing
 * could be removed. Any hypervisor status other than H_SUCCESS or
 * H_NOT_FOUND is fatal.
 */
static long pSeries_lpar_hpte_remove(unsigned long hpte_group)
{
	/* pick a random slot to start at */
	unsigned long offset = mftb() & 0x7;
	unsigned long unused_v, unused_r;
	unsigned long rc;
	int probe;

	for (probe = 0; probe < HPTES_PER_GROUP; probe++) {
		/* don't remove a bolted entry */
		rc = plpar_pte_remove(H_ANDCOND, hpte_group + offset,
				      (0x1UL << 4), &unused_v, &unused_r);
		if (rc == H_SUCCESS)
			return probe;

		BUG_ON(rc != H_NOT_FOUND);

		/* Advance to the next slot, wrapping within the group. */
		offset = (offset + 1) & 0x7;
	}

	return -1;
}
/*
 * Walk the whole hash page table four entries at a time and remove every
 * valid entry that is not part of the VRMA. Batches that cannot be read are
 * skipped; removal results are not checked.
 */
static void pSeries_lpar_hptab_clear(void)
{
	struct {
		unsigned long pteh;
		unsigned long ptel;
	} ptes[4];
	/* table size in bytes (1 << ppc64_pft_size), shifted down by 4 */
	unsigned long nr_hptes = (1UL << ppc64_pft_size) >> 4;
	unsigned long base, k;

	/* Read in batches of 4,
	 * invalidate only valid entries not in the VRMA
	 * hpte_count will be a multiple of 4
	 */
	for (base = 0; base < nr_hptes; base += 4) {
		if (plpar_pte_read_4_raw(0, base, (void *)ptes) != H_SUCCESS)
			continue;

		for (k = 0; k < 4; k++) {
			unsigned long v = ptes[k].pteh;

			if ((v & HPTE_V_VRMA_MASK) != HPTE_V_VRMA_MASK &&
			    (v & HPTE_V_VALID))
				plpar_pte_remove_raw(0, base + k, 0,
					&(ptes[k].pteh), &(ptes[k].ptel));
		}
	}
}
/*
 * Build the AVPN and B (segment size) fields of the first HPTE doubleword
 * for @va, for matching against an existing PTE. The bottom 7 bits of the
 * result are zero.
 */
static inline unsigned long hpte_encode_avpn(unsigned long va, int psize,
					     int ssize)
{
	unsigned long avpn = (va >> 23) & ~(mmu_psize_defs[psize].avpnm);
	unsigned long seg_bits = ((unsigned long) ssize) << HPTE_V_SSIZE_SHIFT;

	return (avpn << HPTE_V_AVPN_SHIFT) | seg_bits;
}
/*
 * Change the protection bits of the HPTE in @slot, matching on its AVPN.
 *
 * NOTE: the linux "newpp" bits and the low 3 bits of the hcall flags happen
 * to line up, so no transform is needed. The high bits of newpp are masked
 * off out of paranoia; they could probably be assumed to be zero already.
 *
 * Returns 0 on success and -1 when the hypervisor no longer finds a matching
 * entry; any other hypervisor status is fatal.
 */
static long pSeries_lpar_hpte_updatepp(unsigned long slot,
				       unsigned long newpp,
				       unsigned long va,
				       int psize, int ssize, int local)
{
	unsigned long protect_flags = (newpp & 7) | H_AVPN;
	unsigned long avpn = hpte_encode_avpn(va, psize, ssize);
	unsigned long rc;

	pr_devel("    update: avpnv=%016lx, hash=%016lx, f=%lx, psize: %d ...",
		 avpn, slot, protect_flags, psize);

	rc = plpar_pte_protect(protect_flags, slot, avpn);
	if (rc != H_NOT_FOUND) {
		pr_devel("ok\n");
		BUG_ON(rc != H_SUCCESS);
		return 0;
	}

	pr_devel("not found !\n");
	return -1;
}
/*
 * Read the HPTE in @slot through the hypervisor and return its first
 * doubleword; the second doubleword is discarded. A failed read is fatal.
 */
static unsigned long pSeries_lpar_hpte_getword0(unsigned long slot)
{
	unsigned long word0, word1_unused;
	unsigned long rc;

	/*
	 * flags == 0:
	 *   Read 1 pte at a time
	 *   Do not need RPN to logical page translation
	 *   No cross CEC PFT access
	 */
	rc = plpar_pte_read(0, slot, &word0, &word1_unused);
	BUG_ON(rc != H_SUCCESS);

	return word0;
}
/*
 * Search the primary hash group for a valid HPTE matching @va and return its
 * global slot number, or -1 if no entry in the group matches.
 */
static long pSeries_lpar_hpte_find(unsigned long va, int psize, int ssize)
{
	unsigned long hash = hpt_hash(va, mmu_psize_defs[psize].shift, ssize);
	unsigned long want_v = hpte_encode_avpn(va, psize, ssize);
	unsigned long n, hpte_v;
	long slot;

	/* Bolted entries are always in the primary group */
	slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;

	for (n = 0; n < HPTES_PER_GROUP; n++, slot++) {
		hpte_v = pSeries_lpar_hpte_getword0(slot);

		/* HPTE matches */
		if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID))
			return slot;
	}

	return -1;
}
/*
 * Change the protection bits of the bolted kernel mapping for @ea. The entry
 * must exist and the hypervisor update must succeed; either failure is
 * fatal.
 */
static void pSeries_lpar_hpte_updateboltedpp(unsigned long newpp,
					     unsigned long ea,
					     int psize, int ssize)
{
	unsigned long kernel_vsid = get_kernel_vsid(ea, ssize);
	unsigned long va = hpt_va(ea, kernel_vsid, ssize);
	unsigned long slot, rc;

	slot = pSeries_lpar_hpte_find(va, psize, ssize);
	BUG_ON(slot == -1);

	/* Only the low three protection bits are passed; no AVPN match. */
	rc = plpar_pte_protect(newpp & 7, slot, 0);
	BUG_ON(rc != H_SUCCESS);
}
/*
 * Remove the HPTE in @slot if its AVPN still matches @va. An entry that is
 * already gone (H_NOT_FOUND) is tolerated; any other failure is fatal.
 */
static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long va,
					 int psize, int ssize, int local)
{
	unsigned long avpn;
	unsigned long rc;
	unsigned long old_v, old_r;

	pr_devel("    inval : slot=%lx, va=%016lx, psize: %d, local: %d\n",
		 slot, va, psize, local);

	avpn = hpte_encode_avpn(va, psize, ssize);
	rc = plpar_pte_remove(H_AVPN, slot, avpn, &old_v, &old_r);

	BUG_ON(rc != H_NOT_FOUND && rc != H_SUCCESS);
}
/*
 * Remove the bolted kernel mapping for @ea. The entry must exist; failing to
 * find it is fatal.
 */
static void pSeries_lpar_hpte_removebolted(unsigned long ea,
					   int psize, int ssize)
{
	unsigned long kernel_vsid = get_kernel_vsid(ea, ssize);
	unsigned long va = hpt_va(ea, kernel_vsid, ssize);
	unsigned long slot = pSeries_lpar_hpte_find(va, psize, ssize);

	BUG_ON(slot == -1);

	pSeries_lpar_hpte_invalidate(slot, va, psize, ssize, 0);
}
/* Flag bits for H_BULK_REMOVE */
#define HBR_REQUEST 0x4000000000000000UL
#define HBR_RESPONSE 0x8000000000000000UL
#define HBR_END 0xc000000000000000UL
#define HBR_AVPN 0x0200000000000000UL
#define HBR_ANDCOND 0x0100000000000000UL
/*
 * Invalidate the first @number entries of this CPU's ppc64_tlb_batch.
 *
 * Take a spinlock around flushes to avoid bouncing the hypervisor tlbie
 * lock. The lock is skipped when the MMU advertises MMU_FTR_LOCKLESS_TLBIE.
 *
 * Without FW_FEATURE_BULK_REMOVE each HPTE is invalidated individually.
 * With it, (request, AVPN) pairs are accumulated in param[] and sent to
 * H_BULK_REMOVE four pairs at a time; a trailing partial batch is closed
 * with HBR_END. Any H_BULK_REMOVE failure is fatal.
 */
static void pSeries_lpar_flush_hash_range(unsigned long number, int local)
{
unsigned long i, pix, rc;
unsigned long flags = 0;
struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
/* hcall argument/return buffer: up to 4 (request, avpn) pairs */
unsigned long param[9];
unsigned long va;
unsigned long hash, index, shift, hidx, slot;
real_pte_t pte;
int psize, ssize;
if (lock_tlbie)
spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags);
psize = batch->psize;
ssize = batch->ssize;
/* pix: number of param[] words filled for the pending bulk request */
pix = 0;
for (i = 0; i < number; i++) {
va = batch->vaddr[i];
pte = batch->pte[i];
pte_iterate_hashed_subpages(pte, psize, va, index, shift) {
hash = hpt_hash(va, shift, ssize);
hidx = __rpte_to_hidx(pte, index);
/* entry lives in the secondary group: use the inverted hash */
if (hidx & _PTEIDX_SECONDARY)
hash = ~hash;
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
slot += hidx & _PTEIDX_GROUP_IX;
if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
pSeries_lpar_hpte_invalidate(slot, va, psize,
ssize, local);
} else {
param[pix] = HBR_REQUEST | HBR_AVPN | slot;
param[pix+1] = hpte_encode_avpn(va, psize,
ssize);
pix += 2;
/* four pairs queued: issue the hcall and start a new batch */
if (pix == 8) {
rc = plpar_hcall9(H_BULK_REMOVE, param,
param[0], param[1], param[2],
param[3], param[4], param[5],
param[6], param[7]);
BUG_ON(rc != H_SUCCESS);
pix = 0;
}
}
} pte_iterate_hashed_end();
}
/* flush a partially filled batch, terminated by HBR_END */
if (pix) {
param[pix] = HBR_END;
rc = plpar_hcall9(H_BULK_REMOVE, param, param[0], param[1],
param[2], param[3], param[4], param[5],
param[6], param[7]);
BUG_ON(rc != H_SUCCESS);
}
if (lock_tlbie)
spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags);
}
/*
 * Handler for the "bulk_remove=" boot parameter: "bulk_remove=off" clears
 * FW_FEATURE_BULK_REMOVE if firmware advertised it. Any other value is
 * accepted and ignored. Always returns 1.
 */
static int __init disable_bulk_remove(char *str)
{
	if (strcmp(str, "off") == 0 &&
	    firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
		/* Terminate the line so later output is not glued onto it. */
		printk(KERN_INFO "Disabling BULK_REMOVE firmware feature\n");
		powerpc_firmware_features &= ~FW_FEATURE_BULK_REMOVE;
	}
	return 1;
}

__setup("bulk_remove=", disable_bulk_remove);
/*
 * Point the machine-dependent hash page table hooks in ppc_md at the LPAR
 * (hypervisor-call based) implementations defined in this file.
 */
void __init hpte_init_lpar(void)
{
ppc_md.hpte_invalidate = pSeries_lpar_hpte_invalidate;
ppc_md.hpte_updatepp = pSeries_lpar_hpte_updatepp;
ppc_md.hpte_updateboltedpp = pSeries_lpar_hpte_updateboltedpp;
ppc_md.hpte_insert = pSeries_lpar_hpte_insert;
ppc_md.hpte_remove = pSeries_lpar_hpte_remove;
ppc_md.hpte_removebolted = pSeries_lpar_hpte_removebolted;
ppc_md.flush_hash_range = pSeries_lpar_flush_hash_range;
ppc_md.hpte_clear_all = pSeries_lpar_hptab_clear;
}
#ifdef CONFIG_PPC_SMLPAR
#define CMO_FREE_HINT_DEFAULT 1
static int cmo_free_hint_flag = CMO_FREE_HINT_DEFAULT;
/*
 * Handler for the "cmo_free_hint=" boot parameter (case-insensitive).
 * "no"/"off" disables free page hinting and returns 1. Every other value
 * enables it; the return value is 1 for "yes"/"on" and 0 for anything else.
 */
static int __init cmo_free_hint(char *str)
{
	char *value = strstrip(str);

	if (strcasecmp(value, "no") == 0 || strcasecmp(value, "off") == 0) {
		printk(KERN_INFO "cmo_free_hint: CMO free page hinting is not active.\n");
		cmo_free_hint_flag = 0;
		return 1;
	}

	/* Anything else, recognised or not, leaves hinting switched on. */
	cmo_free_hint_flag = 1;
	printk(KERN_INFO "cmo_free_hint: CMO free page hinting is active.\n");

	return (strcasecmp(value, "yes") == 0 ||
		strcasecmp(value, "on") == 0) ? 1 : 0;
}

__setup("cmo_free_hint=", cmo_free_hint);
/*
 * Issue H_PAGE_INIT with @state for every CMO-page-sized piece of the
 * 2^@order kernel pages starting at @page. Hcall results are ignored.
 */
static void pSeries_set_page_state(struct page *page, int order,
				   unsigned long state)
{
	unsigned long cmo_page_sz = cmo_get_page_size();
	unsigned long addr = __pa((unsigned long)page_address(page));
	int nr_pages = 1 << order;
	int i, j;

	for (i = 0; i < nr_pages; i++) {
		for (j = 0; j < PAGE_SIZE; j += cmo_page_sz)
			plpar_hcall_norets(H_PAGE_INIT, state, addr + j, 0);
		addr += PAGE_SIZE;
	}
}
/*
 * Mark freed pages as unused towards the hypervisor, provided hinting has
 * not been disabled via cmo_free_hint_flag and firmware has FW_FEATURE_CMO.
 */
void arch_free_page(struct page *page, int order)
{
	if (cmo_free_hint_flag && firmware_has_feature(FW_FEATURE_CMO))
		pSeries_set_page_state(page, order, H_PAGE_SET_UNUSED);
}
EXPORT_SYMBOL(arch_free_page);
#endif
#ifdef CONFIG_TRACEPOINTS
/*
* We optimise our hcall path by placing hcall_tracepoint_refcount
* directly in the TOC so we can check if the hcall tracepoints are
* enabled via a single load.
*/
/* NB: reg/unreg are called while guarded with the tracepoints_mutex */
extern long hcall_tracepoint_refcount;
/*
* Since the tracing code might execute hcalls we need to guard against
* recursion. One example of this are spinlocks calling H_YIELD on
* shared processor partitions.
*/
static DEFINE_PER_CPU(unsigned int, hcall_trace_depth);
/* Tracepoint registration hook: bump hcall_tracepoint_refcount. */
void hcall_tracepoint_regfunc(void)
{
hcall_tracepoint_refcount++;
}
/* Tracepoint unregistration hook: drop hcall_tracepoint_refcount. */
void hcall_tracepoint_unregfunc(void)
{
hcall_tracepoint_refcount--;
}
/*
 * Fire the hcall_entry tracepoint for @opcode, unless the opcode is H_CEDE
 * or this CPU is already inside an hcall trace hook (hcall_trace_depth != 0).
 * Runs with local interrupts disabled.
 */
void __trace_hcall_entry(unsigned long opcode, unsigned long *args)
{
unsigned long flags;
unsigned int *depth;
/*
* We cannot call tracepoints inside RCU idle regions which
* means we must not trace H_CEDE.
*/
if (opcode == H_CEDE)
return;
local_irq_save(flags);
depth = &__get_cpu_var(hcall_trace_depth);
/* nested hcall issued from the trace hook itself: do not recurse */
if (*depth)
goto out;
(*depth)++;
/*
* Not re-enabled in this function; __trace_hcall_exit() calls
* preempt_enable() on its traced path.
*/
preempt_disable();
trace_hcall_entry(opcode, args);
(*depth)--;
out:
local_irq_restore(flags);
}
/*
 * Fire the hcall_exit tracepoint for @opcode, unless the opcode is H_CEDE or
 * this CPU is already inside an hcall trace hook (hcall_trace_depth != 0).
 * Runs with local interrupts disabled.
 */
void __trace_hcall_exit(long opcode, unsigned long retval,
unsigned long *retbuf)
{
unsigned long flags;
unsigned int *depth;
/* H_CEDE is never traced, mirroring __trace_hcall_entry() */
if (opcode == H_CEDE)
return;
local_irq_save(flags);
depth = &__get_cpu_var(hcall_trace_depth);
/* nested hcall issued from the trace hook itself: do not recurse */
if (*depth)
goto out;
(*depth)++;
trace_hcall_exit(opcode, retval, retbuf);
/* counterpart of the preempt_disable() in __trace_hcall_entry() */
preempt_enable();
(*depth)--;
out:
local_irq_restore(flags);
}
#endif
/**
 * h_get_mpp - query the hypervisor with H_GET_MPP
 * @mpp_data: filled in from the seven values returned by the hcall.
 *
 * H_GET_MPP hcall returns info in 7 parms. The return buffer is
 * zero-initialized (as in h_get_mpp_x()) so that a failed hcall leaves
 * zeroes in @mpp_data instead of uninitialized stack contents.
 *
 * Returns the hcall status; @mpp_data is written regardless of it.
 */
int h_get_mpp(struct hvcall_mpp_data *mpp_data)
{
	int rc;
	unsigned long retbuf[PLPAR_HCALL9_BUFSIZE] = { 0 };

	rc = plpar_hcall9(H_GET_MPP, retbuf);

	mpp_data->entitled_mem = retbuf[0];
	mpp_data->mapped_mem = retbuf[1];

	/* retbuf[2]: group number in bits 16-31, pool number in bits 0-15 */
	mpp_data->group_num = (retbuf[2] >> 2 * 8) & 0xffff;
	mpp_data->pool_num = retbuf[2] & 0xffff;

	/* retbuf[3]: two one-byte weights on top of a 48-bit entitlement */
	mpp_data->mem_weight = (retbuf[3] >> 7 * 8) & 0xff;
	mpp_data->unallocated_mem_weight = (retbuf[3] >> 6 * 8) & 0xff;
	mpp_data->unallocated_entitlement = retbuf[3] & 0xffffffffffff;

	mpp_data->pool_size = retbuf[4];
	mpp_data->loan_request = retbuf[5];
	mpp_data->backing_mem = retbuf[6];

	return rc;
}
EXPORT_SYMBOL(h_get_mpp);
/*
 * Query the hypervisor with H_GET_MPP_X and copy the first four returned
 * values into @mpp_x_data. Returns the hcall status; @mpp_x_data is written
 * regardless of it.
 */
int h_get_mpp_x(struct hvcall_mpp_x_data *mpp_x_data)
{
int rc;
/* zeroed up front, so a failed hcall yields zeroes in mpp_x_data */
unsigned long retbuf[PLPAR_HCALL9_BUFSIZE] = { 0 };
rc = plpar_hcall9(H_GET_MPP_X, retbuf);
mpp_x_data->coalesced_bytes = retbuf[0];
mpp_x_data->pool_coalesced_bytes = retbuf[1];
mpp_x_data->pool_purr_cycles = retbuf[2];
mpp_x_data->pool_spurr_cycles = retbuf[3];
return rc;
}
@@ -0,0 +1,363 @@
/*
* Support for Partition Mobility/Migration
*
* Copyright (C) 2010 Nathan Fontenot
* Copyright (C) 2010 IBM Corporation
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 as published by the Free Software Foundation.
*/
#include <linux/kernel.h>
#include <linux/kobject.h>
#include <linux/smp.h>
#include <linux/stat.h>
#include <linux/completion.h>
#include <linux/device.h>
#include <linux/delay.h>
#include <linux/slab.h>
#include <asm/rtas.h>
#include "pseries.h"
static struct kobject *mobility_kobj;
/*
 * Header that update_dt_node() overlays on the start of its RTAS work
 * buffer for the ibm,update-properties call.
 */
struct update_props_workarea {
u32 phandle;	/* set by update_dt_node() before the RTAS call */
u32 state;	/* not referenced by update_dt_node() - presumably firmware state; TODO confirm */
u64 reserved;
u32 nprops;	/* read by update_dt_node() as its per-call property loop bound */
};
#define NODE_ACTION_MASK 0xff000000
#define NODE_COUNT_MASK 0x00ffffff
#define DELETE_DT_NODE 0x01000000
#define UPDATE_DT_NODE 0x02000000
#define ADD_DT_NODE 0x03000000
/*
 * Run RTAS service @token using the shared rtas_data_buf as a bounce buffer.
 *
 * Under rtas_data_buf_lock, RTAS_DATA_BUF_SIZE bytes are copied from @buf
 * into rtas_data_buf, the call is made, and the same number of bytes is
 * copied back into @buf whatever the call returned. @buf must therefore be
 * at least RTAS_DATA_BUF_SIZE bytes long.
 *
 * Returns the rtas_call() result.
 */
static int mobility_rtas_call(int token, char *buf)
{
int rc;
spin_lock(&rtas_data_buf_lock);
memcpy(rtas_data_buf, buf, RTAS_DATA_BUF_SIZE);
rc = rtas_call(token, 2, 1, NULL, rtas_data_buf, 1);
memcpy(buf, rtas_data_buf, RTAS_DATA_BUF_SIZE);
spin_unlock(&rtas_data_buf_lock);
return rc;
}
/*
 * Detach the device tree node identified by @phandle.
 *
 * Returns -ENOENT if no node has that phandle, 0 otherwise (the result of
 * dlpar_detach_node() is not checked).
 */
static int delete_dt_node(u32 phandle)
{
	struct device_node *dn;

	dn = of_find_node_by_phandle(phandle);
	if (!dn)
		return -ENOENT;

	dlpar_detach_node(dn);

	/* Drop the reference taken by of_find_node_by_phandle(). */
	of_node_put(dn);
	return 0;
}
/*
 * Apply one property value (or one fragment of it) to device node @dn.
 *
 * @prop carries state between calls. When *@prop is NULL a new property
 * named @name is allocated from @value; otherwise @value is appended to the
 * partially received property already in *@prop. Once the final fragment
 * has arrived the property is added to, or replaces its namesake in, @dn and
 * *@prop is reset to NULL so that the next call starts a fresh property.
 *
 * @vd is the value descriptor: the fragment length, with the top bit set
 * (two's complement encoding) when more fragments follow.
 *
 * Returns 0 on success or -ENOMEM on allocation failure.
 */
static int update_dt_property(struct device_node *dn, struct property **prop,
			      const char *name, u32 vd, char *value)
{
	struct property *new_prop = *prop;
	struct property *old_prop;
	int more = 0;

	/* A negative 'vd' value indicates that only part of the new property
	 * value is contained in the buffer and we need to call
	 * ibm,update-properties again to get the rest of the value.
	 *
	 * A negative value is also the two's complement of the actual value.
	 */
	if (vd & 0x80000000) {
		vd = ~vd + 1;
		more = 1;
	}

	if (new_prop) {
		/* partial property fixup */
		char *new_data = kzalloc(new_prop->length + vd, GFP_KERNEL);
		if (!new_data)
			return -ENOMEM;

		memcpy(new_data, new_prop->value, new_prop->length);
		memcpy(new_data + new_prop->length, value, vd);

		kfree(new_prop->value);
		new_prop->value = new_data;
		new_prop->length += vd;
	} else {
		new_prop = kzalloc(sizeof(*new_prop), GFP_KERNEL);
		if (!new_prop)
			return -ENOMEM;

		new_prop->name = kstrdup(name, GFP_KERNEL);
		if (!new_prop->name) {
			kfree(new_prop);
			return -ENOMEM;
		}

		new_prop->length = vd;
		new_prop->value = kzalloc(new_prop->length, GFP_KERNEL);
		if (!new_prop->value) {
			kfree(new_prop->name);
			kfree(new_prop);
			return -ENOMEM;
		}

		memcpy(new_prop->value, value, vd);
		*prop = new_prop;
	}

	if (!more) {
		old_prop = of_find_property(dn, new_prop->name, NULL);
		if (old_prop)
			prom_update_property(dn, new_prop, old_prop);
		else
			prom_add_property(dn, new_prop);

		/*
		 * The property now belongs to the device tree. Clear the
		 * caller's pointer (not just the local copy), otherwise the
		 * next property would be appended to this one and its value
		 * freed while still installed in the tree.
		 */
		*prop = NULL;
	}

	return 0;
}
/*
 * Refresh the properties of the device tree node identified by @phandle by
 * calling ibm,update-properties repeatedly for as long as RTAS returns 1
 * (more data pending).
 *
 * Returns -EINVAL if the RTAS service is unavailable, -ENOMEM if the work
 * buffer cannot be allocated, -ENOENT if the node is not found, and 0
 * otherwise - including when the RTAS call or an individual property update
 * fails (the latter is only logged).
 */
static int update_dt_node(u32 phandle)
{
	struct update_props_workarea *upwa;
	struct device_node *dn;
	struct property *prop = NULL;
	int i, rc, rtas_rc;
	char *prop_data;
	char *rtas_buf;
	int update_properties_token;

	update_properties_token = rtas_token("ibm,update-properties");
	if (update_properties_token == RTAS_UNKNOWN_SERVICE)
		return -EINVAL;

	rtas_buf = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
	if (!rtas_buf)
		return -ENOMEM;

	dn = of_find_node_by_phandle(phandle);
	if (!dn) {
		kfree(rtas_buf);
		return -ENOENT;
	}

	upwa = (struct update_props_workarea *)&rtas_buf[0];
	upwa->phandle = phandle;

	do {
		/*
		 * Keep the RTAS status in its own variable: it drives the
		 * loop condition and must not be overwritten by the result
		 * of update_dt_property() below.
		 */
		rtas_rc = mobility_rtas_call(update_properties_token,
					     rtas_buf);
		if (rtas_rc < 0)
			break;

		prop_data = rtas_buf + sizeof(*upwa);

		for (i = 0; i < upwa->nprops; i++) {
			char *prop_name;
			u32 vd;

			prop_name = prop_data + 1;
			prop_data += strlen(prop_name) + 1;
			/*
			 * NOTE(review): this reads the value descriptor as a
			 * single char, so the 0x80000000 case below cannot
			 * match - confirm the descriptor width against the
			 * ibm,update-properties buffer layout.
			 */
			vd = *prop_data++;

			switch (vd) {
			case 0x00000000:
				/* name only property, nothing to do */
				break;

			case 0x80000000:
				prop = of_find_property(dn, prop_name, NULL);
				prom_remove_property(dn, prop);
				prop = NULL;
				break;

			default:
				rc = update_dt_property(dn, &prop, prop_name,
							vd, prop_data);
				if (rc) {
					printk(KERN_ERR "Could not update %s"
					       " property\n", prop_name);
				}

				prop_data += vd;
				break;
			}
		}
	} while (rtas_rc == 1);

	of_node_put(dn);
	kfree(rtas_buf);
	return 0;
}
/*
 * Configure the connector @drc_index and attach the resulting node(s) under
 * the node identified by @parent_phandle.
 *
 * Returns -ENOENT if the connector yields no node or the parent cannot be
 * found, otherwise the result of dlpar_attach_node(). The configured nodes
 * are freed on every failure path.
 */
static int add_dt_node(u32 parent_phandle, u32 drc_index)
{
	struct device_node *new_dn;
	struct device_node *parent;
	int rc;

	new_dn = dlpar_configure_connector(drc_index);
	if (!new_dn)
		return -ENOENT;

	parent = of_find_node_by_phandle(parent_phandle);
	if (parent) {
		new_dn->parent = parent;
		rc = dlpar_attach_node(new_dn);
	} else {
		rc = -ENOENT;
	}

	if (rc)
		dlpar_free_cc_nodes(new_dn);

	if (parent)
		of_node_put(parent);

	return rc;
}
/*
 * Bring the device tree up to date by calling ibm,update-nodes for as long
 * as RTAS returns 1, applying every delete/update/add action it reports.
 *
 * Returns -EINVAL if the RTAS service is unavailable, -ENOMEM if the work
 * buffer cannot be allocated, otherwise the status of the last RTAS call.
 * Results of the individual node operations are not checked.
 */
static int pseries_devicetree_update(void)
{
char *rtas_buf;
u32 *data;
int update_nodes_token;
int rc;
update_nodes_token = rtas_token("ibm,update-nodes");
if (update_nodes_token == RTAS_UNKNOWN_SERVICE)
return -EINVAL;
rtas_buf = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
if (!rtas_buf)
return -ENOMEM;
do {
rc = mobility_rtas_call(update_nodes_token, rtas_buf);
/* only 0 and 1 carry data to parse; anything else ends the update */
if (rc && rc != 1)
break;
/* action records start after the first four u32 words of the buffer */
data = (u32 *)rtas_buf + 4;
/* each record: one word of (action | node count), then the node entries */
while (*data & NODE_ACTION_MASK) {
int i;
u32 action = *data & NODE_ACTION_MASK;
int node_count = *data & NODE_COUNT_MASK;
data++;
for (i = 0; i < node_count; i++) {
u32 phandle = *data++;
u32 drc_index;
switch (action) {
case DELETE_DT_NODE:
delete_dt_node(phandle);
break;
case UPDATE_DT_NODE:
update_dt_node(phandle);
break;
case ADD_DT_NODE:
/* add entries carry a second word: the drc index */
drc_index = *data++;
add_dt_node(phandle, drc_index);
break;
}
}
}
} while (rc == 1);
kfree(rtas_buf);
return rc;
}
/*
 * Post-migration fixup: update the device tree, ask RTAS to activate the
 * firmware, then update the device tree once more. Each failure is logged
 * and ends the sequence.
 */
void post_mobility_fixup(void)
{
	int activate_fw_token;
	int status;

	status = pseries_devicetree_update();
	if (status) {
		printk(KERN_ERR "Initial post-mobility device tree update "
		       "failed: %d\n", status);
		return;
	}

	activate_fw_token = rtas_token("ibm,activate-firmware");
	if (activate_fw_token == RTAS_UNKNOWN_SERVICE) {
		printk(KERN_ERR "Could not make post-mobility "
		       "activate-fw call.\n");
		return;
	}

	status = rtas_call(activate_fw_token, 0, 1, NULL);
	if (status) {
		printk(KERN_ERR "Post-mobility activate-fw failed: %d\n",
		       status);
		return;
	}

	status = pseries_devicetree_update();
	if (status)
		printk(KERN_ERR "Secondary post-mobility device tree "
		       "update failed: %d\n", status);
}
/*
 * sysfs store handler for the "migration" attribute. Parses a 64-bit stream
 * id from @buf, passes it to ibm,suspend-me as two 32-bit halves and retries
 * once a second while RTAS reports RTAS_NOT_SUSPENDABLE. On success the
 * post-mobility fixup runs and @count is returned; otherwise the parse
 * error, the suspend error or the non-zero RTAS status is returned.
 */
static ssize_t migrate_store(struct class *class, struct class_attribute *attr,
			     const char *buf, size_t count)
{
	struct rtas_args args;
	u64 stream;
	int err;

	err = strict_strtoull(buf, 0, &stream);
	if (err)
		return err;

	memset(&args, 0, sizeof(args));
	args.token = rtas_token("ibm,suspend-me");
	args.nargs = 2;
	args.nret = 1;

	args.args[0] = stream >> 32 ;
	args.args[1] = stream & 0xffffffff;
	args.rets = &args.args[args.nargs];

	for (;;) {
		args.rets[0] = 0;
		err = rtas_ibm_suspend_me(&args);
		if (err || args.rets[0] != RTAS_NOT_SUSPENDABLE)
			break;
		ssleep(1);
	}

	if (err)
		return err;
	if (args.rets[0])
		return args.rets[0];

	post_mobility_fixup();
	return count;
}
static CLASS_ATTR(migration, S_IWUSR, NULL, migrate_store);
/*
 * Create the "mobility" kobject under kernel_kobj and add the "migration"
 * attribute file to it.
 *
 * Returns 0 on success, -ENOMEM if the kobject cannot be created, or the
 * error from sysfs_create_file(). If the attribute file cannot be created
 * the kobject is released again so that a failed init does not leave an
 * empty directory (and its memory) behind.
 */
static int __init mobility_sysfs_init(void)
{
	int rc;

	mobility_kobj = kobject_create_and_add("mobility", kernel_kobj);
	if (!mobility_kobj)
		return -ENOMEM;

	rc = sysfs_create_file(mobility_kobj, &class_attr_migration.attr);
	if (rc) {
		/* Drop the reference taken by kobject_create_and_add(). */
		kobject_put(mobility_kobj);
		mobility_kobj = NULL;
	}

	return rc;
}
device_initcall(mobility_sysfs_init);
+491
View File
@@ -0,0 +1,491 @@
/*
* Copyright 2006 Jake Moilanen <moilanen@austin.ibm.com>, IBM Corp.
* Copyright 2006-2007 Michael Ellerman, IBM Corp.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; version 2 of the
* License.
*
*/
#include <linux/device.h>
#include <linux/irq.h>
#include <linux/msi.h>
#include <asm/rtas.h>
#include <asm/hw_irq.h>
#include <asm/ppc-pci.h>
/*
 * RTAS tokens looked up once in rtas_msi_init():
 * query_token  - "ibm,query-interrupt-source-number"
 * change_token - "ibm,change-msi"
 */
static int query_token, change_token;
/* Function codes passed as the 'func' argument of rtas_change_msi(). */
#define RTAS_QUERY_FN 0
#define RTAS_CHANGE_FN 1
#define RTAS_RESET_FN 2
#define RTAS_CHANGE_MSI_FN 3
#define RTAS_CHANGE_MSIX_FN 4
/*
 * Map a PCI device to its pci_dn via the device's OF node.
 * Returns NULL (after a debug message) if either lookup step fails.
 */
static struct pci_dn *get_pdn(struct pci_dev *pdev)
{
	struct device_node *np = pci_device_to_OF_node(pdev);
	struct pci_dn *result;

	if (np == NULL) {
		dev_dbg(&pdev->dev, "rtas_msi: No OF device node\n");
		return NULL;
	}

	result = PCI_DN(np);
	if (result == NULL)
		dev_dbg(&pdev->dev, "rtas_msi: No PCI DN\n");

	return result;
}
/* RTAS Helpers */
/*
 * Issue the "ibm,change-msi" RTAS call for the device described by @pdn.
 *
 * @func is one of the RTAS_*_FN codes and @num_irqs the number of
 * interrupts requested. The call is repeated for as long as
 * rtas_busy_delay() says so, feeding the second return word back in as the
 * sequence number of the next attempt.
 *
 * Returns the first return word of the call (per the comment below, the
 * number of irqs allocated) when the call status is 0, otherwise a
 * negative value.
 */
static int rtas_change_msi(struct pci_dn *pdn, u32 func, u32 num_irqs)
{
u32 addr, seq_num, rtas_ret[3];
unsigned long buid;
int rc;
addr = rtas_config_addr(pdn->busno, pdn->devfn, 0);
buid = pdn->phb->buid;
seq_num = 1;
do {
/* The explicit MSI / MSI-X functions are called with nret = 4, the others with nret = 3. */
if (func == RTAS_CHANGE_MSI_FN || func == RTAS_CHANGE_MSIX_FN)
rc = rtas_call(change_token, 6, 4, rtas_ret, addr,
BUID_HI(buid), BUID_LO(buid),
func, num_irqs, seq_num);
else
rc = rtas_call(change_token, 6, 3, rtas_ret, addr,
BUID_HI(buid), BUID_LO(buid),
func, num_irqs, seq_num);
/* Sequence number for the next attempt comes from the second return word. */
seq_num = rtas_ret[1];
} while (rtas_busy_delay(rc));
/*
* If the RTAS call succeeded, return the number of irqs allocated.
* If not, make sure we return a negative error code.
*/
if (rc == 0)
rc = rtas_ret[0];
else if (rc > 0)
rc = -rc;
pr_debug("rtas_msi: ibm,change_msi(func=%d,num=%d), got %d rc = %d\n",
func, num_irqs, rtas_ret[0], rc);
return rc;
}
/*
 * Ask firmware to set the number of MSIs for @pdev to zero.
 *
 * The explicit MSI function is tried first (per the original note,
 * disabling MSI through it also disables MSI-X). If that does not return
 * 0 the older change function is tried; a second failure is only logged.
 */
static void rtas_disable_msi(struct pci_dev *pdev)
{
	struct pci_dn *pdn = get_pdn(pdev);

	if (!pdn)
		return;

	if (rtas_change_msi(pdn, RTAS_CHANGE_MSI_FN, 0) == 0)
		return;

	/* May have failed because the explicit interface is not present. */
	if (rtas_change_msi(pdn, RTAS_CHANGE_FN, 0) != 0)
		pr_debug("rtas_msi: Setting MSIs to 0 failed!\n");
}
static int rtas_query_irq_number(struct pci_dn *pdn, int offset)
{
u32 addr, rtas_ret[2];
unsigned long buid;
int rc;
addr = rtas_config_addr(pdn->busno, pdn->devfn, 0);
buid = pdn->phb->buid;
do {
rc = rtas_call(query_token, 4, 3, rtas_ret, addr,
BUID_HI(buid), BUID_LO(buid), offset);
} while (rtas_busy_delay(rc));
if (rc) {
pr_debug("rtas_msi: error (%d) querying source number\n", rc);
return rc;
}
return rtas_ret[0];
}
/*
 * Undo rtas_setup_msi_irqs(): detach and dispose of every mapped MSI
 * descriptor of @pdev, then tell firmware to disable MSIs for the device.
 */
static void rtas_teardown_msi_irqs(struct pci_dev *pdev)
{
	struct msi_desc *desc;

	list_for_each_entry(desc, &pdev->msi_list, list) {
		if (desc->irq != NO_IRQ) {
			irq_set_msi_desc(desc->irq, NULL);
			irq_dispose_mapping(desc->irq);
		}
	}

	rtas_disable_msi(pdev);
}
/*
 * Compare a request for @nvec vectors with the count in the device node
 * property @prop_name.
 *
 * Returns 0 if the property value is at least @nvec, the property value
 * if it is smaller but non-zero, -ENOSPC if it is zero, -ENOENT if the
 * property is missing and -ENODEV if the device has no pci_dn.
 */
static int check_req(struct pci_dev *pdev, int nvec, char *prop_name)
{
	struct pci_dn *pdn = get_pdn(pdev);
	struct device_node *np;
	const u32 *prop;

	if (!pdn)
		return -ENODEV;

	np = pdn->node;
	prop = of_get_property(np, prop_name, NULL);
	if (prop == NULL) {
		pr_debug("rtas_msi: No %s on %s\n", prop_name, np->full_name);
		return -ENOENT;
	}

	if (!(*prop < nvec))
		return 0;

	pr_debug("rtas_msi: %s requests < %d MSIs\n", prop_name, nvec);

	/* Be paranoid: a zero count must not be mistaken for success. */
	if (*prop == 0)
		return -ENOSPC;

	return *prop;
}
static int check_req_msi(struct pci_dev *pdev, int nvec)
{
return check_req(pdev, nvec, "ibm,req#msi");
}
static int check_req_msix(struct pci_dev *pdev, int nvec)
{
return check_req(pdev, nvec, "ibm,req#msi-x");
}
/* Quota calculation */
/*
 * Walk from the device's OF node towards the root looking for a node with
 * an "ibm,pe-total-#msi" property.
 *
 * On success stores the property value in *@total and returns the node
 * that carries it; the reference obtained during the walk is handed to the
 * caller. Returns NULL (leaving *@total untouched) if no node has the
 * property.
 */
static struct device_node *find_pe_total_msi(struct pci_dev *dev, int *total)
{
	struct device_node *np;
	const u32 *prop;

	for (np = of_node_get(pci_device_to_OF_node(dev)); np;
	     np = of_get_next_parent(np)) {
		prop = of_get_property(np, "ibm,pe-total-#msi", NULL);
		if (!prop)
			continue;

		pr_debug("rtas_msi: found prop on dn %s\n",
			 np->full_name);
		*total = *prop;
		return np;
	}

	return NULL;
}
/*
 * Fallback PE lookup for firmware without "ibm,pe-total-#msi": locate the
 * device's PE node through eeh_find_device_pe() and return that node's
 * parent, with *@total set to a hard-coded 8.
 *
 * Returns NULL (leaving *@total untouched) if any lookup step fails.
 */
static struct device_node *find_pe_dn(struct pci_dev *dev, int *total)
{
	struct device_node *dev_dn, *pe_dn, *parent;

	dev_dn = pci_device_to_OF_node(dev);
	if (!dev_dn)
		return NULL;

	pe_dn = eeh_find_device_pe(dev_dn);
	if (!pe_dn)
		return NULL;

	/* We actually want the parent */
	parent = of_get_parent(pe_dn);
	if (!parent)
		return NULL;

	/* Hardcode of 8 for old firmwares */
	*total = 8;
	pr_debug("rtas_msi: using PE dn %s\n", parent->full_name);

	return parent;
}
/* Scratch state shared by the traverse_pci_devices() callbacks below. */
struct msi_counts {
struct device_node *requestor; /* node of the device asking for MSIs */
int num_devices; /* nodes counted by count_non_bridge_devices() */
int request; /* number of MSIs the requestor asked for */
int quota; /* per-device share of the PE total */
int spare; /* sum of quota left unused by under-quota devices */
int over_quota; /* number of devices wanting more than quota */
};
/*
 * traverse_pci_devices() callback: bump counts->num_devices for every node
 * whose "class-code" (taken as 0 when the property is absent) is not that
 * of a PCI-to-PCI bridge. Always returns NULL.
 */
static void *count_non_bridge_devices(struct device_node *dn, void *data)
{
	struct msi_counts *counts = data;
	const u32 *prop;
	u32 class = 0;

	pr_debug("rtas_msi: counting %s\n", dn->full_name);

	prop = of_get_property(dn, "class-code", NULL);
	if (prop)
		class = *prop;

	if ((class >> 8) != PCI_CLASS_BRIDGE_PCI)
		counts->num_devices++;

	return NULL;
}
/*
 * traverse_pci_devices() callback: work out how many MSIs the node wants
 * and account for it against counts->quota.
 *
 * For the requestor the wanted number is counts->request. For any other
 * node it is the larger of "ibm,req#msi" and "ibm,req#msi-x" (0 if neither
 * exists), since we cannot know which of the two a driver will use.
 * Always returns NULL.
 */
static void *count_spare_msis(struct device_node *dn, void *data)
{
	struct msi_counts *counts = data;
	const u32 *msi, *msix;
	int wanted = 0;

	if (dn == counts->requestor) {
		wanted = counts->request;
	} else {
		msi = of_get_property(dn, "ibm,req#msi", NULL);
		if (msi)
			wanted = *msi;

		msix = of_get_property(dn, "ibm,req#msi-x", NULL);
		if (msix && (int)*msix > wanted)
			wanted = (int)*msix;
	}

	if (wanted < counts->quota)
		counts->spare += counts->quota - wanted;
	else if (wanted > counts->quota)
		counts->over_quota++;

	return NULL;
}
/*
 * Clamp a request for @request MSIs to this device's share of its PE's
 * total.
 *
 * The PE node and its total come from find_pe_total_msi(), falling back to
 * find_pe_dn(). The base quota is total / number of non-bridge devices
 * under the PE. A request above that quota may additionally use a share of
 * the MSIs other devices leave unused.
 *
 * Returns the possibly reduced request. If no PE is found, or no devices
 * are counted under it, @request is returned unchanged.
 */
static int msi_quota_for_device(struct pci_dev *dev, int request)
{
struct device_node *pe_dn;
struct msi_counts counts;
int total;
pr_debug("rtas_msi: calc quota for %s, request %d\n", pci_name(dev),
request);
pe_dn = find_pe_total_msi(dev, &total);
if (!pe_dn)
pe_dn = find_pe_dn(dev, &total);
if (!pe_dn) {
pr_err("rtas_msi: couldn't find PE for %s\n", pci_name(dev));
goto out;
}
pr_debug("rtas_msi: found PE %s\n", pe_dn->full_name);
memset(&counts, 0, sizeof(struct msi_counts));
/* Work out how many devices we have below this PE */
traverse_pci_devices(pe_dn, count_non_bridge_devices, &counts);
/* Also guards the division just below. */
if (counts.num_devices == 0) {
pr_err("rtas_msi: found 0 devices under PE for %s\n",
pci_name(dev));
goto out;
}
counts.quota = total / counts.num_devices;
/* Within the base quota: grant the request as is. */
if (request <= counts.quota)
goto out;
/* else, we have some more calculating to do */
counts.requestor = pci_device_to_OF_node(dev);
counts.request = request;
traverse_pci_devices(pe_dn, count_spare_msis, &counts);
/* If the quota isn't an integer multiple of the total, we can
* use the remainder as spare MSIs for anyone that wants them. */
counts.spare += total % counts.num_devices;
/* Divide any spare by the number of over-quota requestors */
if (counts.over_quota)
counts.quota += counts.spare / counts.over_quota;
/* And finally clamp the request to the possibly adjusted quota */
request = min(counts.quota, request);
pr_debug("rtas_msi: request clamped to quota %d\n", request);
out:
/* Drops the reference held on the PE node; pe_dn may be NULL here. */
of_node_put(pe_dn);
return request;
}
/*
 * ppc_md.msi_check_device hook: decide whether @nvec vectors of the given
 * capability @type can be granted to @pdev.
 *
 * Returns 0 if the request is acceptable, a positive smaller count if only
 * that many can be granted (from the firmware property or from the PE
 * quota), or a negative error from check_req().
 */
static int rtas_msi_check_device(struct pci_dev *pdev, int nvec, int type)
{
	int quota;
	int rc;

	rc = (type == PCI_CAP_ID_MSIX) ? check_req_msix(pdev, nvec)
				       : check_req_msi(pdev, nvec);
	if (rc)
		return rc;

	quota = msi_quota_for_device(pdev, nvec);

	return (quota && quota < nvec) ? quota : 0;
}
/*
 * Verify that the MSI-X entries queued on @pdev are numbered 0, 1, 2, ...
 * in list order.
 *
 * There is no way to tell firmware that we want a discontiguous or
 * non-zero based range of MSI-X entries, so such requests are rejected.
 * Returns 0 if the numbering is acceptable, -EINVAL otherwise.
 */
static int check_msix_entries(struct pci_dev *pdev)
{
	struct msi_desc *desc;
	int next = 0;

	list_for_each_entry(desc, &pdev->msi_list, list) {
		if (desc->msi_attrib.entry_nr == next) {
			next++;
			continue;
		}

		pr_debug("rtas_msi: bad MSI-X entries.\n");
		return -EINVAL;
	}

	return 0;
}
/*
 * ppc_md.setup_msi_irqs hook: have firmware allocate @nvec MSI or MSI-X
 * interrupts for @pdev and bind each one to an entry of pdev->msi_list.
 *
 * Returns 0 on success. Returns -ENODEV if the device has no pci_dn,
 * -EINVAL for an unsupported MSI-X entry layout, the result of
 * rtas_change_msi() if firmware did not grant exactly @nvec interrupts,
 * the negative result of rtas_query_irq_number() if a source number cannot
 * be obtained, or -ENOSPC if a source cannot be mapped to a virq.
 */
static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
{
	struct pci_dn *pdn;
	int hwirq, virq, i, rc;
	struct msi_desc *entry;
	struct msi_msg msg;

	pdn = get_pdn(pdev);
	if (!pdn)
		return -ENODEV;

	if (type == PCI_CAP_ID_MSIX && check_msix_entries(pdev))
		return -EINVAL;

	/*
	 * Try the new more explicit firmware interface, if that fails fall
	 * back to the old interface. The old interface is known to never
	 * return MSI-Xs.
	 */
	if (type == PCI_CAP_ID_MSI) {
		rc = rtas_change_msi(pdn, RTAS_CHANGE_MSI_FN, nvec);

		if (rc < 0) {
			pr_debug("rtas_msi: trying the old firmware call.\n");
			rc = rtas_change_msi(pdn, RTAS_CHANGE_FN, nvec);
		}
	} else
		rc = rtas_change_msi(pdn, RTAS_CHANGE_MSIX_FN, nvec);

	if (rc != nvec) {
		pr_debug("rtas_msi: rtas_change_msi() failed\n");
		return rc;
	}

	i = 0;
	list_for_each_entry(entry, &pdev->msi_list, list) {
		hwirq = rtas_query_irq_number(pdn, i++);
		if (hwirq < 0) {
			/*
			 * Report the failing query result; rc still holds the
			 * successful allocation count at this point.
			 */
			pr_debug("rtas_msi: error (%d) getting hwirq\n", hwirq);
			return hwirq;
		}

		virq = irq_create_mapping(NULL, hwirq);

		if (virq == NO_IRQ) {
			pr_debug("rtas_msi: Failed mapping hwirq %d\n", hwirq);
			return -ENOSPC;
		}

		dev_dbg(&pdev->dev, "rtas_msi: allocated virq %d\n", virq);
		irq_set_msi_desc(virq, entry);

		/* Read config space back so we can restore after reset */
		read_msi_msg(virq, &msg);
		entry->msg = msg;
	}

	return 0;
}
/*
 * ppc_md.pci_irq_fixup hook: for a device that has an LSI and also
 * advertises MSI or MSI-X support, ask firmware to disable any MSIs that
 * are already configured.
 */
static void rtas_msi_pci_irq_fixup(struct pci_dev *pdev)
{
/* No LSI -> leave MSIs (if any) configured */
if (pdev->irq == NO_IRQ) {
dev_dbg(&pdev->dev, "rtas_msi: no LSI, nothing to do.\n");
return;
}
/* No MSI -> MSIs can't have been assigned by fw, leave LSI */
/* Both checks non-zero: the device cannot take even one MSI or MSI-X vector. */
if (check_req_msi(pdev, 1) && check_req_msix(pdev, 1)) {
dev_dbg(&pdev->dev, "rtas_msi: no req#msi/x, nothing to do.\n");
return;
}
dev_dbg(&pdev->dev, "rtas_msi: disabling existing MSI.\n");
rtas_disable_msi(pdev);
}
/*
 * Look up the RTAS tokens needed for MSI support and, if both exist,
 * install the RTAS MSI callbacks in ppc_md.
 *
 * Returns 0 on success, -1 if either token is unavailable.
 */
static int rtas_msi_init(void)
{
	int have_query, have_change;

	query_token = rtas_token("ibm,query-interrupt-source-number");
	change_token = rtas_token("ibm,change-msi");

	have_query = (query_token != RTAS_UNKNOWN_SERVICE);
	have_change = (change_token != RTAS_UNKNOWN_SERVICE);

	if (!(have_query && have_change)) {
		pr_debug("rtas_msi: no RTAS tokens, no MSI support.\n");
		return -1;
	}

	pr_debug("rtas_msi: Registering RTAS MSI callbacks.\n");

	/* Warn if another backend already installed a hook, then override. */
	WARN_ON(ppc_md.setup_msi_irqs);
	ppc_md.setup_msi_irqs = rtas_setup_msi_irqs;
	ppc_md.teardown_msi_irqs = rtas_teardown_msi_irqs;
	ppc_md.msi_check_device = rtas_msi_check_device;

	WARN_ON(ppc_md.pci_irq_fixup);
	ppc_md.pci_irq_fixup = rtas_msi_pci_irq_fixup;

	return 0;
}
arch_initcall(rtas_msi_init);
@@ -0,0 +1,679 @@
/*
* c 2001 PPC 64 Team, IBM Corp
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* /dev/nvram driver for PPC64
*
* This perhaps should live in drivers/char
*/
#include <linux/types.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/kmsg_dump.h>
#include <linux/ctype.h>
#include <linux/zlib.h>
#include <asm/uaccess.h>
#include <asm/nvram.h>
#include <asm/rtas.h>
#include <asm/prom.h>
#include <asm/machdep.h>
/* Max bytes to read/write in one go */
#define NVRW_CNT 0x20
static unsigned int nvram_size;
static int nvram_fetch, nvram_store;
static char nvram_buf[NVRW_CNT]; /* assume this is in the first 4GB */
static DEFINE_SPINLOCK(nvram_lock);
/*
 * Header written in front of the data in an OS log partition by
 * nvram_write_os_partition().
 */
struct err_log_info {
int error_type; /* err_type passed by the writer */
unsigned int seq_num; /* error_log_cnt passed by the writer */
};
/*
 * Description of one OS-owned NVRAM partition. name, req_size and min_size
 * are set statically; size and index are filled in by
 * pseries_nvram_init_os_partition(). index stays -1 until then.
 */
struct nvram_os_partition {
const char *name;
int req_size; /* desired size, in bytes */
int min_size; /* minimum acceptable size (0 means req_size) */
long size; /* size of data portion (excluding err_log_info) */
long index; /* offset of data portion of partition */
};
/* Partition for RTAS error logs; index -1 means "not set up yet". */
static struct nvram_os_partition rtas_log_partition = {
.name = "ibm,rtas-log",
.req_size = 2079,
.min_size = 1055,
.index = -1
};
/* Partition for oops/panic reports; index -1 means "not set up yet". */
static struct nvram_os_partition oops_log_partition = {
.name = "lnx,oops-log",
.req_size = 4000,
.min_size = 2000,
.index = -1
};
/*
 * NULL-terminated list of partition names passed to
 * nvram_remove_partition() when obsolete OS partitions are recycled.
 * NOTE(review): presumably the names listed here are the ones to keep -
 * confirm against nvram_remove_partition().
 */
static const char *pseries_nvram_os_partitions[] = {
"ibm,rtas-log",
"lnx,oops-log",
NULL
};
/* Forward declaration: needed by the dumper descriptor below. */
static void oops_to_nvram(struct kmsg_dumper *dumper,
enum kmsg_dump_reason reason,
const char *old_msgs, unsigned long old_len,
const char *new_msgs, unsigned long new_len);
/* Registered with kmsg_dump_register() in nvram_init_oops_partition(). */
static struct kmsg_dumper nvram_kmsg_dumper = {
.dump = oops_to_nvram
};
/* See clobbering_unread_rtas_event() */
#define NVRAM_RTAS_READ_TIMEOUT 5 /* seconds */
/*
 * Set by nvram_write_error_log() on a successful write, reset to 0 by
 * nvram_clear_error_log().
 */
static unsigned long last_unread_rtas_event; /* timestamp */
/*
* For capturing and compressing an oops or panic report...
* big_oops_buf[] holds the uncompressed text we're capturing.
*
* oops_buf[] holds the compressed text, preceded by a prefix.
* The prefix is just a u16 holding the length of the compressed* text.
* (*Or uncompressed, if compression fails.) oops_buf[] gets written
* to NVRAM.
*
* oops_len points to the prefix. oops_data points to the compressed text.
*
* +- oops_buf
* | +- oops_data
* v v
* +------------+-----------------------------------------------+
* | length | text |
* | (2 bytes) | (oops_data_sz bytes) |
* +------------+-----------------------------------------------+
* ^
* +- oops_len
*
* We preallocate these buffers during init to avoid kmalloc during oops/panic.
*/
/* Size of big_oops_buf in bytes. */
static size_t big_oops_buf_sz;
/* Uncompressed capture buffer and the buffer that is written to NVRAM. */
static char *big_oops_buf, *oops_buf;
/* Points at the u16 length prefix at the start of oops_buf. */
static u16 *oops_len;
/* Points just past the length prefix inside oops_buf. */
static char *oops_data;
/* Bytes available at oops_data. */
static size_t oops_data_sz;
/* Compression parameters */
#define COMPR_LEVEL 6
#define WINDOW_BITS 12
#define MEM_LEVEL 4
/* Single zlib stream; its workspace is preallocated during init. */
static struct z_stream_s stream;
/*
 * Read up to @count bytes of NVRAM starting at offset *@index into @buf.
 *
 * The transfer is clamped to the end of NVRAM and performed through the
 * bounce buffer in chunks of at most NVRW_CNT bytes with nvram_lock held.
 *
 * Returns the number of bytes read and advances *@index on success, 0 if
 * *@index is at or past the end, -ENODEV if NVRAM or the fetch service is
 * unavailable, or -EIO (leaving *@index untouched) if an RTAS call fails
 * or transfers an unexpected number of bytes.
 */
static ssize_t pSeries_nvram_read(char *buf, size_t count, loff_t *index)
{
	unsigned long irq_flags;
	unsigned long chunk;
	unsigned int pos;
	char *dst = buf;
	int done;

	if (nvram_size == 0 || nvram_fetch == RTAS_UNKNOWN_SERVICE)
		return -ENODEV;

	if (*index >= nvram_size)
		return 0;

	pos = *index;
	if (pos + count > nvram_size)
		count = nvram_size - pos;

	spin_lock_irqsave(&nvram_lock, irq_flags);

	while (count != 0) {
		chunk = count;
		if (chunk > NVRW_CNT)
			chunk = NVRW_CNT;

		if ((rtas_call(nvram_fetch, 3, 2, &done, pos, __pa(nvram_buf),
			       chunk) != 0) || chunk != done) {
			spin_unlock_irqrestore(&nvram_lock, irq_flags);
			return -EIO;
		}

		memcpy(dst, nvram_buf, chunk);

		dst += chunk;
		pos += chunk;
		count -= chunk;
	}

	spin_unlock_irqrestore(&nvram_lock, irq_flags);

	*index = pos;
	return dst - buf;
}
/*
 * Write up to @count bytes from @buf to NVRAM starting at offset *@index.
 *
 * The transfer is clamped to the end of NVRAM and performed through the
 * bounce buffer in chunks of at most NVRW_CNT bytes with nvram_lock held.
 *
 * Returns the number of bytes written and advances *@index on success, 0
 * if *@index is at or past the end, -ENODEV if NVRAM or the store service
 * is unavailable, or -EIO (leaving *@index untouched) if an RTAS call
 * fails or transfers an unexpected number of bytes.
 */
static ssize_t pSeries_nvram_write(char *buf, size_t count, loff_t *index)
{
	unsigned long irq_flags;
	unsigned long chunk;
	unsigned int pos;
	const char *src = buf;
	int done;

	if (nvram_size == 0 || nvram_store == RTAS_UNKNOWN_SERVICE)
		return -ENODEV;

	if (*index >= nvram_size)
		return 0;

	pos = *index;
	if (pos + count > nvram_size)
		count = nvram_size - pos;

	spin_lock_irqsave(&nvram_lock, irq_flags);

	while (count != 0) {
		chunk = count;
		if (chunk > NVRW_CNT)
			chunk = NVRW_CNT;

		memcpy(nvram_buf, src, chunk);

		if ((rtas_call(nvram_store, 3, 2, &done, pos, __pa(nvram_buf),
			       chunk) != 0) || chunk != done) {
			spin_unlock_irqrestore(&nvram_lock, irq_flags);
			return -EIO;
		}

		src += chunk;
		pos += chunk;
		count -= chunk;
	}

	spin_unlock_irqrestore(&nvram_lock, irq_flags);

	*index = pos;
	return src - buf;
}
/*
 * Return the NVRAM size in bytes, or -ENODEV if no NVRAM size is known.
 *
 * The two results are returned from separate statements on purpose: in
 * "nvram_size ? nvram_size : -ENODEV" the usual arithmetic conversions
 * turn -ENODEV into an unsigned int, which then widens to a large positive
 * ssize_t instead of a negative error code.
 */
static ssize_t pSeries_nvram_get_size(void)
{
	if (nvram_size == 0)
		return -ENODEV;

	return nvram_size;
}
/* nvram_write_os_partition, nvram_write_error_log
*
* We need to buffer the error logs into nvram to ensure that we have
* the failure information to decode. If we have a severe error there
* is no way to guarantee that the OS or the machine is in a state to
* get back to user land and write the error to disk. For example if
* the SCSI device driver causes a Machine Check by writing to a bad
* IO address, there is no way of guaranteeing that the device driver
* is in any state in which it would also be able to write the error data
* captured to disk, thus we buffer it in NVRAM for analysis on the
* next boot.
*
* In NVRAM the partition containing the error log buffer will look like:
* Header (in bytes):
* +-----------+----------+--------+------------+------------------+
* | signature | checksum | length | name | data |
* |0 |1 |2 3|4 15|16 length-1|
* +-----------+----------+--------+------------+------------------+
*
* The 'data' section would look like (in bytes):
* +--------------+------------+-----------------------------------+
* | event_logged | sequence # | error log |
* |0 3|4 7|8 error_log_size-1|
* +--------------+------------+-----------------------------------+
*
* event_logged: 0 if event has not been logged to syslog, 1 if it has
* sequence #: The unique sequence # for each event. (until it wraps)
* error log: The error log from event_scan
*/
int nvram_write_os_partition(struct nvram_os_partition *part, char * buff,
int length, unsigned int err_type, unsigned int error_log_cnt)
{
int rc;
loff_t tmp_index;
struct err_log_info info;
if (part->index == -1) {
return -ESPIPE;
}
if (length > part->size) {
length = part->size;
}
info.error_type = err_type;
info.seq_num = error_log_cnt;
tmp_index = part->index;
rc = ppc_md.nvram_write((char *)&info, sizeof(struct err_log_info), &tmp_index);
if (rc <= 0) {
pr_err("%s: Failed nvram_write (%d)\n", __FUNCTION__, rc);
return rc;
}
rc = ppc_md.nvram_write(buff, length, &tmp_index);
if (rc <= 0) {
pr_err("%s: Failed nvram_write (%d)\n", __FUNCTION__, rc);
return rc;
}
return 0;
}
/*
 * Write an RTAS error log to the ibm,rtas-log partition and, on success,
 * remember when it was written (see clobbering_unread_rtas_event()).
 * Returns the result of nvram_write_os_partition().
 */
int nvram_write_error_log(char * buff, int length,
			  unsigned int err_type, unsigned int error_log_cnt)
{
	int rc;

	rc = nvram_write_os_partition(&rtas_log_partition, buff, length,
				      err_type, error_log_cnt);
	if (rc == 0)
		last_unread_rtas_event = get_seconds();

	return rc;
}
/* nvram_read_error_log
*
* Reads nvram for error log for at most 'length'
*/
int nvram_read_error_log(char * buff, int length,
unsigned int * err_type, unsigned int * error_log_cnt)
{
int rc;
loff_t tmp_index;
struct err_log_info info;
if (rtas_log_partition.index == -1)
return -1;
if (length > rtas_log_partition.size)
length = rtas_log_partition.size;
tmp_index = rtas_log_partition.index;
rc = ppc_md.nvram_read((char *)&info, sizeof(struct err_log_info), &tmp_index);
if (rc <= 0) {
printk(KERN_ERR "nvram_read_error_log: Failed nvram_read (%d)\n", rc);
return rc;
}
rc = ppc_md.nvram_read(buff, length, &tmp_index);
if (rc <= 0) {
printk(KERN_ERR "nvram_read_error_log: Failed nvram_read (%d)\n", rc);
return rc;
}
*error_log_cnt = info.seq_num;
*err_type = info.error_type;
return 0;
}
/* This doesn't actually zero anything, but it sets the event_logged
* word to tell that this event is safely in syslog.
*/
/*
 * Overwrite the first word of the ibm,rtas-log data area with
 * ERR_FLAG_ALREADY_LOGGED and forget the unread-event timestamp.
 *
 * Returns 0 on success, -1 if the partition has not been set up, or the
 * result of the failing ppc_md.nvram_write() call.
 */
int nvram_clear_error_log(void)
{
	int logged_flag = ERR_FLAG_ALREADY_LOGGED;
	loff_t pos;
	int rc;

	if (rtas_log_partition.index == -1)
		return -1;

	pos = rtas_log_partition.index;

	rc = ppc_md.nvram_write((char *)&logged_flag, sizeof(logged_flag),
				&pos);
	if (rc <= 0) {
		printk(KERN_ERR "nvram_clear_error_log: Failed nvram_write (%d)\n", rc);
		return rc;
	}

	last_unread_rtas_event = 0;

	return 0;
}
/* pseries_nvram_init_os_partition
*
* This sets up a partition with an "OS" signature.
*
* The general strategy is the following:
* 1.) If a partition with the indicated name already exists...
* - If it's large enough, use it.
* - Otherwise, recycle it and keep going.
* 2.) Search for a free partition that is large enough.
* 3.) If there's not a free partition large enough, recycle any obsolete
* OS partitions and try again.
* 4.) Will first try getting a chunk that will satisfy the requested size.
* 5.) If a chunk of the requested size cannot be allocated, then try finding
* a chunk that will satisfy the minimum needed.
*
* Returns 0 on success, else -1.
*/
/*
 * Find or create the NVRAM partition described by @part and record its
 * data offset (part->index) and usable data size (part->size, i.e. the
 * partition size minus the err_log_info header).
 */
static int __init pseries_nvram_init_os_partition(struct nvram_os_partition
*part)
{
loff_t p;
int size;
/* Scan nvram for partitions */
nvram_scan_partitions();
/* Look for ours */
p = nvram_find_partition(part->name, NVRAM_SIG_OS, &size);
/* Found one but too small, remove it */
if (p && size < part->min_size) {
pr_info("nvram: Found too small %s partition,"
" removing it...\n", part->name);
nvram_remove_partition(part->name, NVRAM_SIG_OS, NULL);
p = 0;
}
/* Create one if we didn't find */
if (!p) {
p = nvram_create_partition(part->name, NVRAM_SIG_OS,
part->req_size, part->min_size);
/* Out of room: recycle obsolete OS partitions and retry once. */
if (p == -ENOSPC) {
pr_info("nvram: No room to create %s partition, "
"deleting any obsolete OS partitions...\n",
part->name);
nvram_remove_partition(NULL, NVRAM_SIG_OS,
pseries_nvram_os_partitions);
p = nvram_create_partition(part->name, NVRAM_SIG_OS,
part->req_size, part->min_size);
}
}
/* Zero or a negative value means neither lookup nor creation worked. */
if (p <= 0) {
pr_err("nvram: Failed to find or create %s"
" partition, err %d\n", part->name, (int)p);
return -1;
}
part->index = p;
part->size = nvram_get_partition_size(p) - sizeof(struct err_log_info);
return 0;
}
/*
 * Set up everything needed to log oops/panic reports to NVRAM.
 *
 * Uses the lnx,oops-log partition if it can be found or created;
 * otherwise, when @rtas_partition_exists is non-zero, shares the
 * ibm,rtas-log partition. Preallocates the NVRAM image buffer and, when
 * memory allows, the uncompressed capture buffer and zlib workspace, then
 * registers the kmsg dumper.
 *
 * If registration fails all buffers are freed and the file-scope pointers
 * are reset to NULL so that no stale pointer to freed memory is left
 * behind.
 */
static void __init nvram_init_oops_partition(int rtas_partition_exists)
{
	int rc;

	rc = pseries_nvram_init_os_partition(&oops_log_partition);
	if (rc != 0) {
		if (!rtas_partition_exists)
			return;
		pr_notice("nvram: Using %s partition to log both"
			" RTAS errors and oops/panic reports\n",
			rtas_log_partition.name);
		memcpy(&oops_log_partition, &rtas_log_partition,
			sizeof(rtas_log_partition));
	}

	oops_buf = kmalloc(oops_log_partition.size, GFP_KERNEL);
	if (!oops_buf) {
		pr_err("nvram: No memory for %s partition\n",
			oops_log_partition.name);
		return;
	}

	/* Layout of oops_buf: u16 length prefix followed by the text. */
	oops_len = (u16*) oops_buf;
	oops_data = oops_buf + sizeof(u16);
	oops_data_sz = oops_log_partition.size - sizeof(u16);

	/*
	 * Figure compression (preceded by elimination of each line's <n>
	 * severity prefix) will reduce the oops/panic report to at most
	 * 45% of its original size.
	 */
	big_oops_buf_sz = (oops_data_sz * 100) / 45;
	big_oops_buf = kmalloc(big_oops_buf_sz, GFP_KERNEL);
	if (big_oops_buf) {
		stream.workspace = kmalloc(zlib_deflate_workspacesize(
				WINDOW_BITS, MEM_LEVEL), GFP_KERNEL);
		if (!stream.workspace) {
			pr_err("nvram: No memory for compression workspace; "
				"skipping compression of %s partition data\n",
				oops_log_partition.name);
			kfree(big_oops_buf);
			big_oops_buf = NULL;
		}
	} else {
		pr_err("No memory for uncompressed %s data; "
			"skipping compression\n", oops_log_partition.name);
		stream.workspace = NULL;
	}

	rc = kmsg_dump_register(&nvram_kmsg_dumper);
	if (rc != 0) {
		pr_err("nvram: kmsg_dump_register() failed; returned %d\n", rc);
		kfree(oops_buf);
		kfree(big_oops_buf);
		kfree(stream.workspace);
		/* Do not leave pointers to freed memory in file-scope state. */
		oops_buf = NULL;
		oops_len = NULL;
		oops_data = NULL;
		oops_data_sz = 0;
		big_oops_buf = NULL;
		big_oops_buf_sz = 0;
		stream.workspace = NULL;
	}
}
/*
 * Initcall: set up the RTAS log partition, then the oops partition. The
 * oops setup is told whether the RTAS partition is available so that it
 * can fall back to sharing it. Always returns 0.
 */
static int __init pseries_nvram_init_log_partitions(void)
{
int rc;
rc = pseries_nvram_init_os_partition(&rtas_log_partition);
nvram_init_oops_partition(rc == 0);
return 0;
}
machine_arch_initcall(pseries, pseries_nvram_init_log_partitions);
/*
 * Discover the NVRAM device and install the pSeries NVRAM accessors.
 *
 * Reads the NVRAM size from the "#bytes" property of the first node of
 * type "nvram", looks up the fetch/store RTAS tokens and hooks the
 * read/write/size callbacks into ppc_md.
 *
 * Returns 0 on success, -ENODEV if there is no nvram node, or -EIO if the
 * "#bytes" property is missing or has an unexpected length.
 */
int __init pSeries_nvram_init(void)
{
	struct device_node *np;
	const unsigned int *bytes;
	unsigned int proplen;

	np = of_find_node_by_type(NULL, "nvram");
	if (!np)
		return -ENODEV;

	bytes = of_get_property(np, "#bytes", &proplen);
	if (!bytes || proplen != sizeof(unsigned int)) {
		of_node_put(np);
		return -EIO;
	}

	nvram_size = *bytes;

	nvram_fetch = rtas_token("nvram-fetch");
	nvram_store = rtas_token("nvram-store");
	printk(KERN_INFO "PPC64 nvram contains %d bytes\n", nvram_size);
	of_node_put(np);

	ppc_md.nvram_read	= pSeries_nvram_read;
	ppc_md.nvram_write	= pSeries_nvram_write;
	ppc_md.nvram_size	= pSeries_nvram_get_size;

	return 0;
}
/*
* Try to capture the last capture_len bytes of the printk buffer. Return
* the amount actually captured.
*/
/*
 * Copy the most recent @capture_len bytes of the log, which is split into
 * an older part (@old_msgs) followed by a newer part (@new_msgs), into
 * @captured. Returns the number of bytes stored, which is smaller than
 * @capture_len when the two parts together are shorter than that.
 */
static size_t capture_last_msgs(const char *old_msgs, size_t old_len,
				const char *new_msgs, size_t new_len,
				char *captured, size_t capture_len)
{
	size_t room_for_old;
	size_t from_old;

	/* The newer part alone fills the buffer: keep only its tail. */
	if (new_len >= capture_len) {
		memcpy(captured, new_msgs + (new_len - capture_len),
		       capture_len);
		return capture_len;
	}

	/* Otherwise take all of the newer part, preceded by the end of old. */
	room_for_old = capture_len - new_len;
	from_old = (old_len < room_for_old) ? old_len : room_for_old;

	memcpy(captured, old_msgs + (old_len - from_old), from_old);
	memcpy(captured + from_old, new_msgs, new_len);

	return from_old + new_len;
}
/*
* Are we using the ibm,rtas-log for oops/panic reports? And if so,
* would logging this oops/panic overwrite an RTAS event that rtas_errd
* hasn't had a chance to read and process? Return 1 if so, else 0.
*
* We assume that if rtas_errd hasn't read the RTAS event in
* NVRAM_RTAS_READ_TIMEOUT seconds, it's probably not going to.
*/
/*
 * Return 1 if the oops log shares its partition with the RTAS log and an
 * RTAS event was written there no more than NVRAM_RTAS_READ_TIMEOUT
 * seconds ago without having been cleared since; otherwise return 0.
 */
static int clobbering_unread_rtas_event(void)
{
	if (oops_log_partition.index != rtas_log_partition.index)
		return 0;

	if (!last_unread_rtas_event)
		return 0;

	return get_seconds() - last_unread_rtas_event <=
					NVRAM_RTAS_READ_TIMEOUT;
}
/* Squeeze out each line's <n> severity prefix. */
/*
 * Remove, in place, a "<n>" severity prefix (n being a single digit) from
 * the start of each line in the first @len bytes of @buf. The very start
 * of the buffer counts as the start of a line, and at most one prefix is
 * removed per line. Returns the length of the compacted text.
 */
static size_t elide_severities(char *buf, size_t len)
{
	size_t rd = 0;		/* next byte to examine */
	size_t wr = 0;		/* next byte to write */
	int at_line_start = 1;

	while (rd < len) {
		if (at_line_start && len - rd >= 3 &&
		    buf[rd] == '<' && isdigit(buf[rd + 1]) &&
		    buf[rd + 2] == '>') {
			/* Skip the prefix; what follows is mid-line text. */
			rd += 3;
			at_line_start = 0;
			continue;
		}

		at_line_start = (buf[rd] == '\n');
		buf[wr++] = buf[rd++];
	}

	return wr;
}
/* Derived from logfs_compress() */
/*
 * Deflate @inlen bytes at @in into the @outlen-byte buffer at @out using
 * the file-scope zlib stream.
 *
 * Returns the compressed length, or -EIO if any zlib step fails or the
 * output is not smaller than the input.
 */
static int nvram_compress(const void *in, void *out, size_t inlen,
			  size_t outlen)
{
	if (zlib_deflateInit2(&stream, COMPR_LEVEL, Z_DEFLATED, WINDOW_BITS,
			      MEM_LEVEL, Z_DEFAULT_STRATEGY) != Z_OK)
		return -EIO;

	stream.next_in = in;
	stream.avail_in = inlen;
	stream.total_in = 0;
	stream.next_out = out;
	stream.avail_out = outlen;
	stream.total_out = 0;

	/* Everything must be consumed and emitted in a single pass. */
	if (zlib_deflate(&stream, Z_FINISH) != Z_STREAM_END)
		return -EIO;

	if (zlib_deflateEnd(&stream) != Z_OK)
		return -EIO;

	/* Compression that does not shrink the data counts as a failure. */
	if (stream.total_out >= stream.total_in)
		return -EIO;

	return stream.total_out;
}
/* Compress the text from big_oops_buf into oops_buf. */
/*
 * Compress @text_len bytes of big_oops_buf into oops_data and store the
 * compressed length in the prefix at *oops_len.
 * Returns 0 on success, -1 (after logging) if compression failed.
 */
static int zip_oops(size_t text_len)
{
	int zipped_len;

	zipped_len = nvram_compress(big_oops_buf, oops_data, text_len,
				    oops_data_sz);
	if (zipped_len >= 0) {
		*oops_len = (u16) zipped_len;
		return 0;
	}

	pr_err("nvram: compression failed; returned %d\n", zipped_len);
	pr_err("nvram: logging uncompressed oops/panic report\n");
	return -1;
}
/*
* This is our kmsg_dump callback, called after an oops or panic report
* has been written to the printk buffer. We want to capture as much
* of the printk buffer as possible. First, capture as much as we can
* that we think will compress sufficiently to fit in the lnx,oops-log
* partition. If that's too much, go back and capture uncompressed text.
*/
static void oops_to_nvram(struct kmsg_dumper *dumper,
enum kmsg_dump_reason reason,
const char *old_msgs, unsigned long old_len,
const char *new_msgs, unsigned long new_len)
{
/* Running count of reports; passed as the sequence number of each write. */
static unsigned int oops_count = 0;
/* Set once a panic has been seen; suppresses later EMERG dumps. */
static bool panicking = false;
/* Taken with trylock below: a contended call returns without logging. */
static DEFINE_SPINLOCK(lock);
unsigned long flags;
size_t text_len;
unsigned int err_type = ERR_TYPE_KERNEL_PANIC_GZ;
/* Stays non-zero unless compression succeeds. */
int rc = -1;
switch (reason) {
case KMSG_DUMP_RESTART:
case KMSG_DUMP_HALT:
case KMSG_DUMP_POWEROFF:
/* These are almost always orderly shutdowns. */
return;
case KMSG_DUMP_OOPS:
break;
case KMSG_DUMP_PANIC:
panicking = true;
break;
case KMSG_DUMP_EMERG:
if (panicking)
/* Panic report already captured. */
return;
break;
default:
pr_err("%s: ignoring unrecognized KMSG_DUMP_* reason %d\n",
__FUNCTION__, (int) reason);
return;
}
/* Do not overwrite a recent RTAS event in a shared partition. */
if (clobbering_unread_rtas_event())
return;
if (!spin_trylock_irqsave(&lock, flags))
return;
/* Compressed path: only available if the big buffer was allocated. */
if (big_oops_buf) {
text_len = capture_last_msgs(old_msgs, old_len,
new_msgs, new_len, big_oops_buf, big_oops_buf_sz);
text_len = elide_severities(big_oops_buf, text_len);
rc = zip_oops(text_len);
}
/* Fallback: store as much raw text as fits in the partition. */
if (rc != 0) {
text_len = capture_last_msgs(old_msgs, old_len,
new_msgs, new_len, oops_data, oops_data_sz);
err_type = ERR_TYPE_KERNEL_PANIC;
*oops_len = (u16) text_len;
}
/* Write the length prefix plus text; the write result is deliberately ignored. */
(void) nvram_write_os_partition(&oops_log_partition, oops_buf,
(int) (sizeof(*oops_len) + *oops_len), err_type, ++oops_count);
spin_unlock_irqrestore(&lock, flags);
}
@@ -0,0 +1,37 @@
#ifndef _OFFLINE_STATES_H_
#define _OFFLINE_STATES_H_
/* Cpu offline states go here */
enum cpu_state_vals {
CPU_STATE_OFFLINE,
CPU_STATE_INACTIVE,
CPU_STATE_ONLINE,
CPU_MAX_OFFLINE_STATES
};
#ifdef CONFIG_HOTPLUG_CPU
/* Real implementations are provided elsewhere when CPU hotplug is enabled. */
extern enum cpu_state_vals get_cpu_current_state(int cpu);
extern void set_cpu_current_state(int cpu, enum cpu_state_vals state);
extern void set_preferred_offline_state(int cpu, enum cpu_state_vals state);
extern void set_default_offline_state(int cpu);
#else
/* Without CPU hotplug every cpu reports CPU_STATE_ONLINE and the setters are no-ops. */
static inline enum cpu_state_vals get_cpu_current_state(int cpu)
{
return CPU_STATE_ONLINE;
}
static inline void set_cpu_current_state(int cpu, enum cpu_state_vals state)
{
}
static inline void set_preferred_offline_state(int cpu, enum cpu_state_vals state)
{
}
static inline void set_default_offline_state(int cpu)
{
}
#endif
/* Declared regardless of CONFIG_HOTPLUG_CPU. */
extern enum cpu_state_vals get_preferred_offline_state(int cpu);
#endif
+109
View File
@@ -0,0 +1,109 @@
/*
* Copyright (C) 2001 Dave Engebretsen, IBM Corporation
* Copyright (C) 2003 Anton Blanchard <anton@au.ibm.com>, IBM
*
* pSeries specific routines for PCI.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/init.h>
#include <linux/ioport.h>
#include <linux/kernel.h>
#include <linux/pci.h>
#include <linux/string.h>
#include <asm/eeh.h>
#include <asm/pci-bridge.h>
#include <asm/prom.h>
#include <asm/ppc-pci.h>
#if 0
void pcibios_name_device(struct pci_dev *dev)
{
struct device_node *dn;
/*
* Add IBM loc code (slot) as a prefix to the device names for service
*/
dn = pci_device_to_OF_node(dev);
if (dn) {
const char *loc_code = of_get_property(dn, "ibm,loc-code", 0);
if (loc_code) {
int loc_len = strlen(loc_code);
if (loc_len < sizeof(dev->dev.name)) {
memmove(dev->dev.name+loc_len+1, dev->dev.name,
sizeof(dev->dev.name)-loc_len-1);
memcpy(dev->dev.name, loc_code, loc_len);
dev->dev.name[loc_len] = ' ';
dev->dev.name[sizeof(dev->dev.name)-1] = '\0';
}
}
}
}
DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pcibios_name_device);
#endif
/*
 * Reserve the I/O port ranges of the legacy PIC, DMA and timer devices.
 * Does nothing when there is no ISA I/O base. The results of the
 * individual reservations are not checked.
 */
static void __init pSeries_request_regions(void)
{
	static const struct {
		unsigned long start;
		unsigned long len;
		const char *name;
	} legacy_ports[] = {
		{ 0x20, 0x20, "pic1" },
		{ 0xa0, 0x20, "pic2" },
		{ 0x00, 0x20, "dma1" },
		{ 0x40, 0x20, "timer" },
		{ 0x80, 0x10, "dma page reg" },
		{ 0xc0, 0x20, "dma2" },
	};
	unsigned int i;

	if (!isa_io_base)
		return;

	for (i = 0; i < ARRAY_SIZE(legacy_ports); i++)
		request_region(legacy_ports[i].start, legacy_ports[i].len,
			       legacy_ports[i].name);
}
/*
 * Final PCI fixup for pSeries: reserve the legacy I/O port ranges, then
 * call pci_addr_cache_build().
 */
void __init pSeries_final_fixup(void)
{
pSeries_request_regions();
pci_addr_cache_build();
}
/*
* Assume the winbond 82c105 is the IDE controller on a
* p610/p615/p630. We should probably be more careful in case
* someone tries to plug in a similar adapter.
*/
static void fixup_winbond_82c105(struct pci_dev* dev)
{
int i;
unsigned int reg;
if (!machine_is(pseries))
return;
printk("Using INTC for W82c105 IDE controller.\n");
pci_read_config_dword(dev, 0x40, &reg);
/* Enable LEGIRQ to use INTC instead of ISA interrupts */
pci_write_config_dword(dev, 0x40, reg | (1<<11));
for (i = 0; i < DEVICE_COUNT_RESOURCE; ++i) {
/* zap the 2nd function of the winbond chip */
if (dev->resource[i].flags & IORESOURCE_IO
&& dev->bus->number == 0 && dev->devfn == 0x81)
dev->resource[i].flags &= ~IORESOURCE_IO;
if (dev->resource[i].start == 0 && dev->resource[i].end) {
dev->resource[i].flags = 0;
dev->resource[i].end = 0;
}
}
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_WINBOND, PCI_DEVICE_ID_WINBOND_82C105,
fixup_winbond_82c105);
@@ -0,0 +1,212 @@
/*
* PCI Dynamic LPAR, PCI Hot Plug and PCI EEH recovery code
* for RPA-compliant PPC64 platform.
* Copyright (C) 2003 Linda Xie <lxie@us.ibm.com>
* Copyright (C) 2005 International Business Machines
*
* Updates, 2005, John Rose <johnrose@austin.ibm.com>
* Updates, 2005, Linas Vepstas <linas@austin.ibm.com>
*
* All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or (at
* your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
* NON INFRINGEMENT. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/pci.h>
#include <linux/export.h>
#include <asm/pci-bridge.h>
#include <asm/ppc-pci.h>
#include <asm/firmware.h>
#include <asm/eeh.h>
static struct pci_bus *
find_bus_among_children(struct pci_bus *bus,
struct device_node *dn)
{
struct pci_bus *child = NULL;
struct list_head *tmp;
struct device_node *busdn;
busdn = pci_bus_to_OF_node(bus);
if (busdn == dn)
return bus;
list_for_each(tmp, &bus->children) {
child = find_bus_among_children(pci_bus_b(tmp), dn);
if (child)
break;
};
return child;
}
/*
 * Look up the PCI bus that corresponds to device-tree node @dn.
 *
 * The search starts at the root bus of the PHB recorded in the node's
 * pci_dn data.  Returns NULL when the node has no pci_dn, no PHB, or the
 * PHB has no bus, or when no bus under the PHB matches @dn.
 */
struct pci_bus *
pcibios_find_pci_bus(struct device_node *dn)
{
	struct pci_dn *pdn = dn->data;

	if (pdn && pdn->phb && pdn->phb->bus)
		return find_bus_among_children(pdn->phb->bus, dn);

	return NULL;
}
EXPORT_SYMBOL_GPL(pcibios_find_pci_bus);
/**
* pcibios_remove_pci_devices - remove all devices under this bus
* @bus: PCI bus whose devices, and those of every child bus, are removed
*
* Remove all of the PCI devices under this bus both from the
* linux pci device tree, and from the powerpc EEH address cache.
* Child buses are handled first, by recursion.
*/
void pcibios_remove_pci_devices(struct pci_bus *bus)
{
struct pci_dev *dev, *tmp;
struct pci_bus *child_bus;
/* First go down child busses */
list_for_each_entry(child_bus, &bus->children, node)
pcibios_remove_pci_devices(child_bus);
pr_debug("PCI: Removing devices on bus %04x:%02x\n",
pci_domain_nr(bus), bus->number);
/* _safe walk: presumably each removal unlinks dev from bus->devices. */
list_for_each_entry_safe(dev, tmp, &bus->devices, bus_list) {
pr_debug(" * Removing %s...\n", pci_name(dev));
/* EEH state is dropped before the device itself is removed. */
eeh_remove_bus_device(dev);
pci_stop_and_remove_bus_device(dev);
}
}
EXPORT_SYMBOL_GPL(pcibios_remove_pci_devices);
/**
 * pcibios_add_pci_devices - adds new pci devices to bus
 * @bus: PCI bus to (re)scan for new devices
 *
 * This routine will find and fixup new pci devices under
 * the indicated bus. This routine presumes that there
 * might already be some devices under this bridge, so
 * it carefully tries to add only new devices.  (And that
 * is how this routine differs from other, similar pcibios
 * routines.)
 *
 * The probe method comes from ppc_md.pci_probe_mode() when the platform
 * provides it, otherwise the normal (config-space) probe is used.
 */
void pcibios_add_pci_devices(struct pci_bus * bus)
{
	int slotno, num, mode, pass, max;
	struct pci_dev *dev;
	struct device_node *dn = pci_bus_to_OF_node(bus);

	eeh_add_device_tree_early(dn);

	mode = PCI_PROBE_NORMAL;
	if (ppc_md.pci_probe_mode)
		mode = ppc_md.pci_probe_mode(bus);

	if (mode == PCI_PROBE_DEVTREE) {
		/* use ofdt-based probe */
		of_rescan_bus(dn, bus);
	} else if (mode == PCI_PROBE_NORMAL &&
		   dn->child && PCI_DN(dn->child)) {
		/*
		 * use legacy probe
		 *
		 * The slot number is taken from the first child node, so
		 * the scan is skipped when the bus node has no child or
		 * the child carries no pci_dn; dereferencing either
		 * unconditionally would be a NULL pointer access.
		 */
		slotno = PCI_SLOT(PCI_DN(dn->child)->devfn);
		num = pci_scan_slot(bus, PCI_DEVFN(slotno, 0));
		if (!num)
			return;
		pcibios_setup_bus_devices(bus);
		max = bus->secondary;
		/* Two passes over the bridges, as pci_scan_bridge() expects. */
		for (pass = 0; pass < 2; pass++)
			list_for_each_entry(dev, &bus->devices, bus_list) {
				if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE ||
				    dev->hdr_type == PCI_HEADER_TYPE_CARDBUS)
					max = pci_scan_bridge(bus, dev,
							      max, pass);
			}
	}
	pcibios_finish_adding_to_bus(bus);
}
EXPORT_SYMBOL_GPL(pcibios_add_pci_devices);
/*
 * Set up a PCI host bridge (PHB) that was added at run time.
 *
 * @dn: device-tree node of the new PHB.
 *
 * Allocates a pci_controller for @dn, initialises it (RTAS setup, bridge
 * ranges from the device tree, per-PHB device data, EEH devices), scans
 * the PHB and finishes adding its root bus.
 *
 * Returns the new controller, or NULL when the allocation fails.
 */
struct pci_controller * __devinit init_phb_dynamic(struct device_node *dn)
{
struct pci_controller *phb;
pr_debug("PCI: Initializing new hotplug PHB %s\n", dn->full_name);
phb = pcibios_alloc_controller(dn);
if (!phb)
return NULL;
rtas_setup_phb(phb);
pci_process_bridge_OF_ranges(phb, dn, 0);
pci_devs_phb_init_dynamic(phb);
/* Create EEH devices for the PHB */
eeh_dev_phb_init_dynamic(phb);
/* Early EEH setup for the subtree is only done when the PHB has children. */
if (dn->child)
eeh_add_device_tree_early(dn);
pcibios_scan_phb(phb);
pcibios_finish_adding_to_bus(phb->bus);
return phb;
}
EXPORT_SYMBOL_GPL(init_phb_dynamic);
/*
 * RPA-specific bits for removing PHBs
 *
 * @phb: controller to tear down; its root bus must have no child buses
 *       and no devices.
 *
 * Returns 0 on success, -EBUSY when the root bus is not empty, and 1
 * (a positive value, not an errno) when unmapping the IO space fails.
 * NOTE(review): callers that only test for "rc < 0" would treat the
 * unmap failure as success - confirm against the callers.
 */
int remove_phb_dynamic(struct pci_controller *phb)
{
struct pci_bus *b = phb->bus;
struct resource *res;
int rc, i;
pr_debug("PCI: Removing PHB %04x:%02x...\n",
pci_domain_nr(b), b->number);
/* We cannot remove a root bus that has children */
if (!(list_empty(&b->children) && list_empty(&b->devices)))
return -EBUSY;
/* We -know- there aren't any child devices anymore at this stage
* and thus, we can safely unmap the IO space as it's not in use
*/
res = &phb->io_resource;
if (res->flags & IORESOURCE_IO) {
rc = pcibios_unmap_io_space(b);
if (rc) {
printk(KERN_ERR "%s: failed to unmap IO on bus %s\n",
__func__, b->name);
return 1;
}
}
/* Unregister the bridge device from sysfs and remove the PCI bus */
device_unregister(b->bridge);
phb->bus = NULL;
pci_remove_bus(b);
/* Now release the IO resource */
if (res->flags & IORESOURCE_IO)
release_resource(res);
/* Release memory resources */
/* Only the first three mem_resources[] slots are examined. */
for (i = 0; i < 3; ++i) {
res = &phb->mem_resources[i];
if (!(res->flags & IORESOURCE_MEM))
continue;
release_resource(res);
}
/* Free pci_controller data structure */
pcibios_free_controller(phb);
return 0;
}
EXPORT_SYMBOL_GPL(remove_phb_dynamic);
@@ -0,0 +1,276 @@
#ifndef _PSERIES_PLPAR_WRAPPERS_H
#define _PSERIES_PLPAR_WRAPPERS_H
#include <linux/string.h>
#include <asm/hvcall.h>
#include <asm/paca.h>
#include <asm/page.h>
/* Get state of physical CPU from query_cpu_stopped */
int smp_query_cpu_stopped(unsigned int pcpu);
#define QCSS_STOPPED 0
#define QCSS_STOPPING 1
#define QCSS_NOT_STOPPED 2
#define QCSS_HARDWARE_ERROR -1
#define QCSS_HARDWARE_BUSY -2
/* Issue the H_POLL_PENDING hypercall and return its status. */
static inline long poll_pending(void)
{
return plpar_hcall_norets(H_POLL_PENDING);
}
/* Read the cede_latency_hint field of this CPU's lppaca. */
static inline u8 get_cede_latency_hint(void)
{
return get_lppaca()->gpr5_dword.fields.cede_latency_hint;
}
/* Store @latency_hint into the cede_latency_hint field of this CPU's lppaca. */
static inline void set_cede_latency_hint(u8 latency_hint)
{
get_lppaca()->gpr5_dword.fields.cede_latency_hint = latency_hint;
}
/* Issue the H_CEDE hypercall and return its status. */
static inline long cede_processor(void)
{
return plpar_hcall_norets(H_CEDE);
}
/*
 * Cede the processor with a temporary latency hint.
 *
 * The current hint is saved, @latency_hint is installed for the duration
 * of the H_CEDE call, and the saved hint is restored afterwards.
 * @latency_hint is narrowed to u8 when it is stored.
 *
 * Returns the status of the H_CEDE hypercall.
 */
static inline long extended_cede_processor(unsigned long latency_hint)
{
long rc;
u8 old_latency_hint = get_cede_latency_hint();
set_cede_latency_hint(latency_hint);
rc = cede_processor();
set_cede_latency_hint(old_latency_hint);
return rc;
}
/*
 * Issue H_REGISTER_VPA.
 *
 * @flags: sub-function selector; shifted left by 45 (63 - 18) before use
 * @cpu:   second hypercall argument
 * @vpa:   third hypercall argument (the register/unregister wrappers
 *         pass an address or 0 here)
 *
 * Returns the hypercall status.
 */
static inline long vpa_call(unsigned long flags, unsigned long cpu,
unsigned long vpa)
{
/* flags are in bits 16-18 (counting from most significant bit) */
flags = flags << (63 - 18);
return plpar_hcall_norets(H_REGISTER_VPA, flags, cpu, vpa);
}
/*
 * Fixed-selector wrappers around vpa_call().  Each passes one constant
 * flag value; the unregister variants pass 0 as the address.
 */

/* vpa_call() with selector 0x5 and address 0. */
static inline long unregister_vpa(unsigned long cpu)
{
return vpa_call(0x5, cpu, 0);
}
/* vpa_call() with selector 0x1 and address @vpa. */
static inline long register_vpa(unsigned long cpu, unsigned long vpa)
{
return vpa_call(0x1, cpu, vpa);
}
/* vpa_call() with selector 0x7 and address 0. */
static inline long unregister_slb_shadow(unsigned long cpu)
{
return vpa_call(0x7, cpu, 0);
}
/* vpa_call() with selector 0x3 and address @vpa. */
static inline long register_slb_shadow(unsigned long cpu, unsigned long vpa)
{
return vpa_call(0x3, cpu, vpa);
}
/* vpa_call() with selector 0x6 and address 0. */
static inline long unregister_dtl(unsigned long cpu)
{
return vpa_call(0x6, cpu, 0);
}
/* vpa_call() with selector 0x2 and address @vpa. */
static inline long register_dtl(unsigned long cpu, unsigned long vpa)
{
return vpa_call(0x2, cpu, vpa);
}
/*
 * Mark one kernel page as loaned, one CMO page at a time.
 *
 * @vpa: start address of the PAGE_SIZE region.
 *
 * H_PAGE_INIT/H_PAGE_SET_LOANED is issued for each cmo_get_page_size()
 * chunk until one call returns non-zero.  On failure, the chunks that
 * were already switched are set back with H_PAGE_SET_ACTIVE, in reverse
 * order; the results of those undo calls are ignored.
 *
 * Returns 0 on success or the status of the failing hypercall.
 */
static inline long plpar_page_set_loaned(unsigned long vpa)
{
unsigned long cmo_page_sz = cmo_get_page_size();
long rc = 0;
int i;
for (i = 0; !rc && i < PAGE_SIZE; i += cmo_page_sz)
rc = plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_LOANED, vpa + i, 0);
/* i is one chunk past the failed one here; step back and undo the rest. */
for (i -= cmo_page_sz; rc && i != 0; i -= cmo_page_sz)
plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_ACTIVE,
vpa + i - cmo_page_sz, 0);
return rc;
}
/*
 * Mark one kernel page as active, one CMO page at a time.
 *
 * @vpa: start address of the PAGE_SIZE region.
 *
 * H_PAGE_INIT/H_PAGE_SET_ACTIVE is issued for each cmo_get_page_size()
 * chunk until one call returns non-zero.  On failure, the chunks that
 * were already switched are set back with H_PAGE_SET_LOANED, in reverse
 * order; the results of those undo calls are ignored.
 *
 * Returns 0 on success or the status of the failing hypercall.
 */
static inline long plpar_page_set_active(unsigned long vpa)
{
unsigned long cmo_page_sz = cmo_get_page_size();
long rc = 0;
int i;
for (i = 0; !rc && i < PAGE_SIZE; i += cmo_page_sz)
rc = plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_ACTIVE, vpa + i, 0);
/* i is one chunk past the failed one here; step back and undo the rest. */
for (i -= cmo_page_sz; rc && i != 0; i -= cmo_page_sz)
plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_LOANED,
vpa + i - cmo_page_sz, 0);
return rc;
}
extern void vpa_init(int cpu);
/*
 * Issue H_ENTER with (@flags, @hpte_group, @hpte_v, @hpte_r).
 *
 * @slot: receives the first hypercall return word.  It is written
 *        regardless of the status, so it is only meaningful on success.
 *
 * Returns the hypercall status.
 */
static inline long plpar_pte_enter(unsigned long flags,
unsigned long hpte_group, unsigned long hpte_v,
unsigned long hpte_r, unsigned long *slot)
{
long rc;
unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
rc = plpar_hcall(H_ENTER, retbuf, flags, hpte_group, hpte_v, hpte_r);
*slot = retbuf[0];
return rc;
}
/*
 * Issue H_REMOVE with (@flags, @ptex, @avpn).
 *
 * @old_pteh_ret, @old_ptel_ret: receive the first and second hypercall
 * return words; both are written regardless of the status.
 *
 * Returns the hypercall status.
 */
static inline long plpar_pte_remove(unsigned long flags, unsigned long ptex,
unsigned long avpn, unsigned long *old_pteh_ret,
unsigned long *old_ptel_ret)
{
long rc;
unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
rc = plpar_hcall(H_REMOVE, retbuf, flags, ptex, avpn);
*old_pteh_ret = retbuf[0];
*old_ptel_ret = retbuf[1];
return rc;
}
/*
 * plpar_pte_remove_raw can be called in real mode. It calls plpar_hcall_raw
 *
 * Same arguments and outputs as plpar_pte_remove(): H_REMOVE is issued
 * with (@flags, @ptex, @avpn) and the first two return words are stored
 * through @old_pteh_ret and @old_ptel_ret.  Returns the hypercall status.
 */
static inline long plpar_pte_remove_raw(unsigned long flags, unsigned long ptex,
unsigned long avpn, unsigned long *old_pteh_ret,
unsigned long *old_ptel_ret)
{
long rc;
unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
rc = plpar_hcall_raw(H_REMOVE, retbuf, flags, ptex, avpn);
*old_pteh_ret = retbuf[0];
*old_ptel_ret = retbuf[1];
return rc;
}
/*
 * Issue H_READ with (@flags, @ptex).
 *
 * @old_pteh_ret, @old_ptel_ret: receive the first and second hypercall
 * return words; both are written regardless of the status.
 *
 * Returns the hypercall status.
 */
static inline long plpar_pte_read(unsigned long flags, unsigned long ptex,
unsigned long *old_pteh_ret, unsigned long *old_ptel_ret)
{
long rc;
unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
rc = plpar_hcall(H_READ, retbuf, flags, ptex);
*old_pteh_ret = retbuf[0];
*old_ptel_ret = retbuf[1];
return rc;
}
/*
 * plpar_pte_read_raw can be called in real mode. It calls plpar_hcall_raw
 *
 * Same arguments and outputs as plpar_pte_read(): H_READ is issued with
 * (@flags, @ptex) and the first two return words are stored through
 * @old_pteh_ret and @old_ptel_ret.  Returns the hypercall status.
 */
static inline long plpar_pte_read_raw(unsigned long flags, unsigned long ptex,
unsigned long *old_pteh_ret, unsigned long *old_ptel_ret)
{
long rc;
unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
rc = plpar_hcall_raw(H_READ, retbuf, flags, ptex);
*old_pteh_ret = retbuf[0];
*old_ptel_ret = retbuf[1];
return rc;
}
/*
 * plpar_pte_read_4_raw can be called in real mode.
 * ptes must be 8*sizeof(unsigned long)
 *
 * Issues H_READ with H_READ_4 or-ed into @flags and copies the first
 * eight return words into @ptes, regardless of the status.
 *
 * Returns the hypercall status.
 */
static inline long plpar_pte_read_4_raw(unsigned long flags, unsigned long ptex,
unsigned long *ptes)
{
long rc;
unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
rc = plpar_hcall9_raw(H_READ, retbuf, flags | H_READ_4, ptex);
memcpy(ptes, retbuf, 8*sizeof(unsigned long));
return rc;
}
/* Issue H_PROTECT with (@flags, @ptex, @avpn) and return its status. */
static inline long plpar_pte_protect(unsigned long flags, unsigned long ptex,
unsigned long avpn)
{
return plpar_hcall_norets(H_PROTECT, flags, ptex, avpn);
}
/*
 * Issue H_GET_TCE with (@liobn, @ioba).
 *
 * @tce_ret: receives the first hypercall return word; it is written
 *           regardless of the status.
 *
 * Returns the hypercall status.
 */
static inline long plpar_tce_get(unsigned long liobn, unsigned long ioba,
unsigned long *tce_ret)
{
long rc;
unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
rc = plpar_hcall(H_GET_TCE, retbuf, liobn, ioba);
*tce_ret = retbuf[0];
return rc;
}
/* Issue H_PUT_TCE with (@liobn, @ioba, @tceval) and return its status. */
static inline long plpar_tce_put(unsigned long liobn, unsigned long ioba,
unsigned long tceval)
{
return plpar_hcall_norets(H_PUT_TCE, liobn, ioba, tceval);
}
/* Issue H_PUT_TCE_INDIRECT with (@liobn, @ioba, @page, @count); returns its status. */
static inline long plpar_tce_put_indirect(unsigned long liobn,
unsigned long ioba, unsigned long page, unsigned long count)
{
return plpar_hcall_norets(H_PUT_TCE_INDIRECT, liobn, ioba, page, count);
}
/* Issue H_STUFF_TCE with (@liobn, @ioba, @tceval, @count); returns its status. */
static inline long plpar_tce_stuff(unsigned long liobn, unsigned long ioba,
unsigned long tceval, unsigned long count)
{
return plpar_hcall_norets(H_STUFF_TCE, liobn, ioba, tceval, count);
}
/*
 * Issue H_GET_TERM_CHAR for terminal @termno.
 *
 * @len_ret: receives the first hypercall return word.
 * @buf_ret: receives the second and third return words, i.e.
 *           2 * sizeof(unsigned long) bytes; the buffer must be at least
 *           that large.  No alignment is required.
 *
 * Both outputs are written regardless of the status.
 * Returns the hypercall status.
 */
static inline long plpar_get_term_char(unsigned long termno,
		unsigned long *len_ret, char *buf_ret)
{
	long rc;
	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];

	rc = plpar_hcall(H_GET_TERM_CHAR, retbuf, termno);

	*len_ret = retbuf[0];
	/*
	 * Copy bytewise instead of storing through a char buffer cast to
	 * unsigned long *: the caller's buffer has no alignment guarantee.
	 */
	memcpy(buf_ret, &retbuf[1], 2 * sizeof(unsigned long));

	return rc;
}
/*
 * Issue H_PUT_TERM_CHAR for terminal @termno.
 *
 * @len:    length argument passed to the hypercall.
 * @buffer: source data.  2 * sizeof(unsigned long) bytes are always read
 *          from it, whatever @len is, so the buffer must be at least
 *          that large.  No alignment is required.
 *
 * Returns the hypercall status.
 */
static inline long plpar_put_term_char(unsigned long termno, unsigned long len,
		const char *buffer)
{
	unsigned long lbuf[2];

	/*
	 * Load the two argument words bytewise rather than reading through
	 * a cast pointer: no alignment assumption, and const is preserved.
	 */
	memcpy(lbuf, buffer, sizeof(lbuf));

	return plpar_hcall_norets(H_PUT_TERM_CHAR, termno, len, lbuf[0],
				  lbuf[1]);
}
#endif /* _PSERIES_PLPAR_WRAPPERS_H */
@@ -0,0 +1,81 @@
/*
* Interface for power-management for ppc64 compliant platform
*
* Manish Ahuja <mahuja@us.ibm.com>
*
* Feb 2007
*
* Copyright (C) 2007 IBM Corporation.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; version 2 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/kobject.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/init.h>
/* Auto power-on flag exposed through the "auto_poweron" sysfs attribute. */
unsigned long rtas_poweron_auto; /* default and normal state is 0 */
/* sysfs show handler: prints rtas_poweron_auto in decimal plus a newline. */
static ssize_t auto_poweron_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sprintf(buf, "%lu\n", rtas_poweron_auto);
}
/*
 * sysfs store handler for "auto_poweron".
 *
 * Accepts input that parses as the unsigned decimal number 0 or 1 and
 * stores it in rtas_poweron_auto.
 *
 * Returns @n when the value was accepted, -EINVAL otherwise.
 */
static ssize_t auto_poweron_store(struct kobject *kobj,
				  struct kobj_attribute *attr,
				  const char *buf, size_t n)
{
	unsigned long requested;

	if (sscanf(buf, "%lu", &requested) != 1)
		return -EINVAL;

	if (requested != 0 && requested != 1)
		return -EINVAL;

	rtas_poweron_auto = requested;
	return n;
}
/* "auto_poweron" attribute, mode 0644, backed by the show/store handlers above. */
static struct kobj_attribute auto_poweron_attr =
__ATTR(auto_poweron, 0644, auto_poweron_show, auto_poweron_store);
#ifndef CONFIG_PM
/* Without CONFIG_PM this file defines the "power" kobject itself. */
struct kobject *power_kobj;
/* NULL-terminated attribute list for the "power" kobject. */
static struct attribute *g[] = {
&auto_poweron_attr.attr,
NULL,
};
/* Unnamed attribute group: the files are created directly in the kobject. */
static struct attribute_group attr_group = {
.attrs = g,
};
/*
 * Built only without CONFIG_PM: create the top-level "power" kobject and
 * populate it with the auto_poweron attribute group.
 *
 * Returns -ENOMEM when the kobject cannot be created, otherwise the
 * result of sysfs_create_group().
 */
static int __init pm_init(void)
{
	struct kobject *kobj = kobject_create_and_add("power", NULL);

	power_kobj = kobj;
	if (kobj == NULL)
		return -ENOMEM;

	return sysfs_create_group(kobj, &attr_group);
}
core_initcall(pm_init);
#else
/*
 * Built only with CONFIG_PM: add the auto_poweron file to the existing
 * power_kobj.  Returns the result of sysfs_create_file().
 */
static int __init apo_pm_init(void)
{
	int rc = sysfs_create_file(power_kobj, &auto_poweron_attr.attr);

	return rc;
}
__initcall(apo_pm_init);
#endif
@@ -0,0 +1,343 @@
/*
* processor_idle - idle state cpuidle driver.
* Adapted from drivers/idle/intel_idle.c and
* drivers/acpi/processor_idle.c
*
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/moduleparam.h>
#include <linux/cpuidle.h>
#include <linux/cpu.h>
#include <asm/paca.h>
#include <asm/reg.h>
#include <asm/machdep.h>
#include <asm/firmware.h>
#include <asm/runlatch.h>
#include "plpar_wrappers.h"
#include "pseries.h"
/* The cpuidle driver object; its state table is filled in at init time. */
struct cpuidle_driver pseries_idle_driver = {
.name = "pseries_idle",
.owner = THIS_MODULE,
};
/* Size of each state table below. */
#define MAX_IDLE_STATE_COUNT 2
/* Highest state-table index that pseries_cpuidle_driver_init() will copy. */
static int max_idle_state = MAX_IDLE_STATE_COUNT - 1;
/* Per-CPU cpuidle devices, allocated by pseries_idle_devices_init(). */
static struct cpuidle_device __percpu *pseries_cpuidle_devices;
/* Selected by pseries_idle_probe(): shared_states or dedicated_states. */
static struct cpuidle_state *cpuidle_state_table;
/*
 * Set the target residency of state 0 of the currently registered
 * cpuidle driver to @snooze.  Does nothing when no driver is registered.
 */
void update_smt_snooze_delay(int snooze)
{
	struct cpuidle_driver *active = cpuidle_get_driver();

	if (!active)
		return;

	active->states[0].target_residency = snooze;
}
/*
 * Common entry bookkeeping for the idle loops.
 *
 * @in_purr:   receives the PURR register value at entry.
 * @kt_before: receives the wall-clock time at entry.
 *
 * Also sets the lppaca "idle" flag for this CPU.
 */
static inline void idle_loop_prolog(unsigned long *in_purr, ktime_t *kt_before)
{
*kt_before = ktime_get_real();
*in_purr = mfspr(SPRN_PURR);
/*
* Indicate to the HV that we are idle. Now would be
* a good time to find other work to dispatch.
*/
get_lppaca()->idle = 1;
}
/*
 * Common exit bookkeeping for the idle loops.
 *
 * Adds the PURR delta since @in_purr to the lppaca wait_state_cycles
 * counter and clears the lppaca "idle" flag.
 *
 * Returns the wall-clock time elapsed since @kt_before, in microseconds.
 */
static inline s64 idle_loop_epilog(unsigned long in_purr, ktime_t kt_before)
{
get_lppaca()->wait_state_cycles += mfspr(SPRN_PURR) - in_purr;
get_lppaca()->idle = 0;
return ktime_to_us(ktime_sub(ktime_get_real(), kt_before));
}
/*
 * cpuidle "snooze" state: busy-wait at low thread priority.
 *
 * The wait length comes from drv->states[0].target_residency:
 *   0   - no polling at all,
 *   < 0 - poll until a reschedule is needed or the CPU goes offline,
 *   > 0 - additionally stop after that many microseconds (converted to
 *         timebase ticks).
 *
 * Records the measured residency in dev->last_residency and returns
 * @index unchanged.
 *
 * NOTE(review): the early exit through "out" skips clear_thread_flag(),
 * smp_mb() and local_irq_disable() - confirm the caller expects to be
 * returned to with TIF_POLLING_NRFLAG set and interrupts enabled.
 */
static int snooze_loop(struct cpuidle_device *dev,
struct cpuidle_driver *drv,
int index)
{
unsigned long in_purr;
ktime_t kt_before;
unsigned long start_snooze;
long snooze = drv->states[0].target_residency;
idle_loop_prolog(&in_purr, &kt_before);
if (snooze) {
/* Despite its name, start_snooze is the timebase value at which polling ends. */
start_snooze = get_tb() + snooze * tb_ticks_per_usec;
local_irq_enable();
set_thread_flag(TIF_POLLING_NRFLAG);
while ((snooze < 0) || (get_tb() < start_snooze)) {
if (need_resched() || cpu_is_offline(dev->cpu))
goto out;
ppc64_runlatch_off();
HMT_low();
HMT_very_low();
}
HMT_medium();
clear_thread_flag(TIF_POLLING_NRFLAG);
smp_mb();
local_irq_disable();
}
out:
HMT_medium();
dev->last_residency =
(int)idle_loop_epilog(in_purr, kt_before);
return index;
}
/*
 * Hard-disable interrupts, then cede the processor only if the paca
 * records no interrupt as having happened in the meantime.
 */
static void check_and_cede_processor(void)
{
/*
* Interrupts are soft-disabled at this point,
* but not hard disabled. So an interrupt might have
* occurred before entering NAP, and would be potentially
* lost (edge events, decrementer events, etc...) unless
* we first hard disable then check.
*/
hard_irq_disable();
if (get_paca()->irq_happened == 0)
cede_processor();
}
/*
 * cpuidle "CEDE" state for dedicated-processor partitions.
 *
 * Sets the lppaca donate_dedicated_cpu flag around the cede and clears it
 * afterwards.  Records the measured residency in dev->last_residency and
 * returns @index unchanged.
 */
static int dedicated_cede_loop(struct cpuidle_device *dev,
struct cpuidle_driver *drv,
int index)
{
unsigned long in_purr;
ktime_t kt_before;
idle_loop_prolog(&in_purr, &kt_before);
get_lppaca()->donate_dedicated_cpu = 1;
ppc64_runlatch_off();
HMT_medium();
check_and_cede_processor();
get_lppaca()->donate_dedicated_cpu = 0;
dev->last_residency =
(int)idle_loop_epilog(in_purr, kt_before);
return index;
}
/*
 * cpuidle state for shared-processor partitions: cede to the hypervisor.
 *
 * Records the measured residency in dev->last_residency and returns
 * @index unchanged.
 */
static int shared_cede_loop(struct cpuidle_device *dev,
struct cpuidle_driver *drv,
int index)
{
unsigned long in_purr;
ktime_t kt_before;
idle_loop_prolog(&in_purr, &kt_before);
/*
* Yield the processor to the hypervisor. We return if
* an external interrupt occurs (which are driven prior
* to returning here) or if a prod occurs from another
* processor. When returning here, external interrupts
* are enabled.
*/
check_and_cede_processor();
dev->last_residency =
(int)idle_loop_epilog(in_purr, kt_before);
return index;
}
/*
* States for dedicated partition case.
*
* Index 0 polls in snooze_loop(); index 1 cedes via dedicated_cede_loop().
*/
static struct cpuidle_state dedicated_states[MAX_IDLE_STATE_COUNT] = {
{ /* Snooze */
.name = "snooze",
.desc = "snooze",
.flags = CPUIDLE_FLAG_TIME_VALID,
.exit_latency = 0,
.target_residency = 0,
.enter = &snooze_loop },
{ /* CEDE */
.name = "CEDE",
.desc = "CEDE",
.flags = CPUIDLE_FLAG_TIME_VALID,
.exit_latency = 1,
.target_residency = 10,
.enter = &dedicated_cede_loop },
};
/*
* States for shared partition case.
*
* Only the first slot is initialised.  The remaining slot is zero-filled,
* so its .enter is NULL and pseries_cpuidle_driver_init() skips it.
*/
static struct cpuidle_state shared_states[MAX_IDLE_STATE_COUNT] = {
{ /* Shared Cede */
.name = "Shared Cede",
.desc = "Shared Cede",
.flags = CPUIDLE_FLAG_TIME_VALID,
.exit_latency = 0,
.target_residency = 0,
.enter = &shared_cede_loop },
};
/*
 * Re-initialise the cpuidle device of @cpu by disabling and re-enabling
 * it.  Nothing is done when there is no per-CPU device pointer or no
 * cpuidle driver is registered.  Always returns 0.
 */
int pseries_notify_cpuidle_add_cpu(int cpu)
{
	struct cpuidle_device *idle_dev;

	idle_dev = per_cpu_ptr(pseries_cpuidle_devices, cpu);
	if (!idle_dev || !cpuidle_get_driver())
		return 0;

	cpuidle_disable_device(idle_dev);
	cpuidle_enable_device(idle_dev);
	return 0;
}
/*
* pseries_cpuidle_driver_init()
*/
static int pseries_cpuidle_driver_init(void)
{
int idle_state;
struct cpuidle_driver *drv = &pseries_idle_driver;
drv->state_count = 0;
for (idle_state = 0; idle_state < MAX_IDLE_STATE_COUNT; ++idle_state) {
if (idle_state > max_idle_state)
break;
/* is the state not enabled? */
if (cpuidle_state_table[idle_state].enter == NULL)
continue;
drv->states[drv->state_count] = /* structure copy */
cpuidle_state_table[idle_state];
if (cpuidle_state_table == dedicated_states)
drv->states[drv->state_count].target_residency =
__get_cpu_var(smt_snooze_delay);
drv->state_count += 1;
}
return 0;
}
/* pseries_idle_devices_uninit(void)
* unregister cpuidle devices and de-allocate memory
*/
static void pseries_idle_devices_uninit(void)
{
int i;
struct cpuidle_device *dev;
for_each_possible_cpu(i) {
dev = per_cpu_ptr(pseries_cpuidle_devices, i);
cpuidle_unregister_device(dev);
}
free_percpu(pseries_cpuidle_devices);
return;
}
/* pseries_idle_devices_init()
 * allocate, initialize and register cpuidle device
 *
 * One cpuidle_device is set up for every possible CPU, using the state
 * count of pseries_idle_driver.
 *
 * Returns 0 on success, -ENOMEM when the per-CPU allocation fails and
 * -EIO as soon as one device fails to register (earlier devices stay
 * registered).
 */
static int pseries_idle_devices_init(void)
{
	struct cpuidle_driver *drv = &pseries_idle_driver;
	int cpu;

	pseries_cpuidle_devices = alloc_percpu(struct cpuidle_device);
	if (!pseries_cpuidle_devices)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct cpuidle_device *idle_dev =
			per_cpu_ptr(pseries_cpuidle_devices, cpu);

		idle_dev->state_count = drv->state_count;
		idle_dev->cpu = cpu;

		if (cpuidle_register_device(idle_dev) != 0) {
			printk(KERN_DEBUG
				"cpuidle_register_device %d failed!\n", cpu);
			return -EIO;
		}
	}

	return 0;
}
/*
 * pseries_idle_probe()
 * Choose state table for shared versus dedicated partition
 *
 * Returns -ENODEV without the SPLPAR firmware feature or when cpuidle
 * has been overridden, -EPERM when max_idle_state is 0, and 0 once
 * cpuidle_state_table has been selected.
 */
static int pseries_idle_probe(void)
{
	if (!firmware_has_feature(FW_FEATURE_SPLPAR) ||
	    cpuidle_disable != IDLE_NO_OVERRIDE)
		return -ENODEV;

	if (max_idle_state == 0) {
		printk(KERN_DEBUG "pseries processor idle disabled.\n");
		return -EPERM;
	}

	cpuidle_state_table = get_lppaca()->shared_proc ?
				shared_states : dedicated_states;

	return 0;
}
/*
 * Module init: probe, build the state table, register the driver and
 * then the per-CPU devices.
 *
 * Returns 0 on success or the error of the first step that fails.
 */
static int __init pseries_processor_idle_init(void)
{
int retval;
retval = pseries_idle_probe();
if (retval)
return retval;
pseries_cpuidle_driver_init();
retval = cpuidle_register_driver(&pseries_idle_driver);
if (retval) {
printk(KERN_DEBUG "Registration of pseries driver failed.\n");
return retval;
}
retval = pseries_idle_devices_init();
if (retval) {
/*
 * This relies on pseries_idle_devices_uninit() coping with a failed
 * or only partially completed pseries_idle_devices_init().
 */
pseries_idle_devices_uninit();
cpuidle_unregister_driver(&pseries_idle_driver);
return retval;
}
printk(KERN_DEBUG "pseries_idle_driver registered\n");
return 0;
}
/* Module exit: drop the per-CPU devices first, then the driver. */
static void __exit pseries_processor_idle_exit(void)
{
	pseries_idle_devices_uninit();
	cpuidle_unregister_driver(&pseries_idle_driver);
}
module_init(pseries_processor_idle_init);
module_exit(pseries_processor_idle_exit);
MODULE_AUTHOR("Deepthi Dharwar <deepthi@linux.vnet.ibm.com>");
MODULE_DESCRIPTION("Cpuidle driver for POWER");
MODULE_LICENSE("GPL");
@@ -0,0 +1,63 @@
/*
* Copyright 2006 IBM Corporation.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#ifndef _PSERIES_PSERIES_H
#define _PSERIES_PSERIES_H
#include <linux/interrupt.h>
struct device_node;
extern void request_event_sources_irqs(struct device_node *np,
irq_handler_t handler, const char *name);
#include <linux/of.h>
extern void __init fw_feature_init(const char *hypertas, unsigned long len);
struct pt_regs;
extern int pSeries_system_reset_exception(struct pt_regs *regs);
extern int pSeries_machine_check_exception(struct pt_regs *regs);
/*
 * SMP bring-up hooks for the MPIC and XICS interrupt controllers.
 * Without CONFIG_SMP they collapse to empty inline stubs so callers need
 * no #ifdef of their own.
 */
#ifdef CONFIG_SMP
extern void smp_init_pseries_mpic(void);
extern void smp_init_pseries_xics(void);
#else
static inline void smp_init_pseries_mpic(void) { }
static inline void smp_init_pseries_xics(void) { }
#endif
/*
 * kexec CPU-down setup hooks for the XICS and MPIC interrupt controllers.
 * Without CONFIG_KEXEC they collapse to empty inline stubs.
 */
#ifdef CONFIG_KEXEC
extern void setup_kexec_cpu_down_xics(void);
extern void setup_kexec_cpu_down_mpic(void);
#else
static inline void setup_kexec_cpu_down_xics(void) { }
static inline void setup_kexec_cpu_down_mpic(void) { }
#endif
extern void pSeries_final_fixup(void);
/* Poweron flag used for enabling auto ups restart */
extern unsigned long rtas_poweron_auto;
/* Provided by HVC VIO */
extern void hvc_vio_init_early(void);
/* Dynamic logical Partitioning/Mobility */
extern void dlpar_free_cc_nodes(struct device_node *);
extern void dlpar_free_cc_property(struct property *);
extern struct device_node *dlpar_configure_connector(u32);
extern int dlpar_attach_node(struct device_node *);
extern int dlpar_detach_node(struct device_node *);
/* Snooze Delay, pseries_idle */
DECLARE_PER_CPU(long, smt_snooze_delay);
#endif /* _PSERIES_PSERIES_H */
@@ -0,0 +1,323 @@
/*
* POWER platform energy management driver
* Copyright (C) 2010 IBM Corporation
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 2 as published by the Free Software Foundation.
*
* This pseries platform device driver provides access to
* platform energy management capabilities.
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/seq_file.h>
#include <linux/device.h>
#include <linux/cpu.h>
#include <linux/of.h>
#include <asm/cputhreads.h>
#include <asm/page.h>
#include <asm/hvcall.h>
#define MODULE_VERS "1.0"
#define MODULE_NAME "pseries_energy"
/* Driver flags */
static int sysfs_entries;
/* Helper routines */
/*
 * Detect firmware support for the H_BEST_ENERGY hcall.
 *
 * Scans the NUL-separated strings of the "ibm,hypertas-functions"
 * property of the /rtas node for one beginning with
 * "hcall-best-energy-1".
 *
 * Returns 1 when such a string is found, 0 otherwise (including when
 * the node or the property is missing).
 */
static int check_for_h_best_energy(void)
{
	struct device_node *rtas_node;
	const char *list, *entry, *list_end;
	int list_len;
	int found = 0;

	rtas_node = of_find_node_by_path("/rtas");
	if (!rtas_node)
		return 0;

	list = of_get_property(rtas_node, "ibm,hypertas-functions",
			       &list_len);
	if (list) {
		/* hypertas will have list of strings with hcall names */
		list_end = list + list_len;
		for (entry = list; entry < list_end;
		     entry += strlen(entry) + 1) {
			if (strncmp("hcall-best-energy-1", entry, 19) == 0) {
				found = 1; /* Found the string */
				break;
			}
		}
	}

	of_node_put(rtas_node);
	return found;
}
/* Helper Routines to convert between drc_index to cpu numbers */

/*
 * Translate logical cpu number @cpu into the drc_index of its core.
 *
 * The index is read from the "ibm,drc-indexes" property of /cpus, whose
 * first element is the number of entries that follow.
 *
 * Returns the drc_index, or 0 (after logging a warning) when the node or
 * property is missing or the core number is outside the list.
 */
static u32 cpu_to_drc_index(int cpu)
{
	struct device_node *dn = NULL;
	const int *indexes;
	int i;
	int rc = 1;
	u32 ret = 0;

	dn = of_find_node_by_path("/cpus");
	if (dn == NULL)
		goto err;
	indexes = of_get_property(dn, "ibm,drc-indexes", NULL);
	if (indexes == NULL)
		goto err_of_node_put;
	/* Convert logical cpu number to core number */
	i = cpu_core_index_of_thread(cpu);
	/*
	 * The first element indexes[0] is the number of drc_indexes
	 * returned in the list.  Hence i+1 will get the drc_index
	 * corresponding to core number i, and valid core numbers are
	 * 0 .. indexes[0] - 1.  Anything else would read past the list,
	 * so warn and fail instead of returning whatever is there.
	 */
	if (WARN_ON(i < 0 || i >= indexes[0]))
		goto err_of_node_put;
	ret = indexes[i + 1];
	rc = 0;

err_of_node_put:
	of_node_put(dn);
err:
	if (rc)
		printk(KERN_WARNING "cpu_to_drc_index(%d) failed\n", cpu);
	return ret;
}
/*
 * Translate @drc_index into the logical number of the first thread of
 * the matching core.
 *
 * The list searched is the "ibm,drc-indexes" property of /cpus, whose
 * first element is the number of entries that follow.
 *
 * Returns the cpu number, or 0 (after logging a warning) when the node
 * or property is missing or @drc_index is not in the list.
 */
static int drc_index_to_cpu(u32 drc_index)
{
	struct device_node *dn = NULL;
	const int *indexes;
	int i, cpu = 0;
	int rc = 1;

	dn = of_find_node_by_path("/cpus");
	if (dn == NULL)
		goto err;
	indexes = of_get_property(dn, "ibm,drc-indexes", NULL);
	if (indexes == NULL)
		goto err_of_node_put;
	/*
	 * First element in the array is the number of drc_indexes
	 * returned. Search through the list to find the matching
	 * drc_index and get the core number
	 */
	for (i = 0; i < indexes[0]; i++) {
		if (indexes[i + 1] == drc_index)
			break;
	}
	/*
	 * No match: report the failure rather than converting the
	 * one-past-the-end core number into a cpu that may not exist.
	 */
	if (i >= indexes[0])
		goto err_of_node_put;
	/* Convert core number to logical cpu number */
	cpu = cpu_first_thread_of_core(i);
	rc = 0;

err_of_node_put:
	of_node_put(dn);
err:
	if (rc)
		printk(KERN_WARNING "drc_index_to_cpu(%u) failed\n",
		       drc_index);
	return cpu;
}
/*
* pseries hypervisor call H_BEST_ENERGY provides hints to OS on
* preferred logical cpus to activate or deactivate for optimized
* energy consumption.
*/
#define FLAGS_MODE1 0x004E200000080E01
#define FLAGS_MODE2 0x004E200000080401
#define FLAGS_ACTIVATE 0x100
/*
 * Format the list of cpus the hypervisor suggests to activate or
 * deactivate.
 *
 * @page:     output buffer supplied by the sysfs show handler.
 * @activate: non-zero for the activation list, zero for deactivation.
 *
 * Calls H_BEST_ENERGY with FLAGS_MODE1 (plus FLAGS_ACTIVATE when
 * @activate is set) and a zeroed page as result buffer.  The first
 * return word is used as the number of entries; the drc index of entry i
 * is read from 32-bit word 2*i+1 of the buffer.  A cpu is listed only if
 * its online state fits the request: offline cpus for activation, online
 * cpus for deactivation.
 *
 * Returns the number of bytes written (0 when no cpu qualifies),
 * -ENOMEM when the buffer page cannot be allocated, or -EINVAL when the
 * hypercall fails.
 *
 * NOTE(review): neither the entry count nor the sprintf() output is
 * bounded against the page size here - presumably the hypervisor never
 * reports more entries than fit; confirm.
 */
static ssize_t get_best_energy_list(char *page, int activate)
{
int rc, cnt, i, cpu;
unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
unsigned long flags = 0;
u32 *buf_page;
char *s = page;
buf_page = (u32 *) get_zeroed_page(GFP_KERNEL);
if (!buf_page)
return -ENOMEM;
flags = FLAGS_MODE1;
if (activate)
flags |= FLAGS_ACTIVATE;
rc = plpar_hcall9(H_BEST_ENERGY, retbuf, flags, 0, __pa(buf_page),
0, 0, 0, 0, 0, 0);
if (rc != H_SUCCESS) {
free_page((unsigned long) buf_page);
return -EINVAL;
}
cnt = retbuf[0];
for (i = 0; i < cnt; i++) {
cpu = drc_index_to_cpu(buf_page[2*i+1]);
if ((cpu_online(cpu) && !activate) ||
(!cpu_online(cpu) && activate))
s += sprintf(s, "%d,", cpu);
}
if (s > page) { /* Something to show */
s--; /* Suppress last comma */
s += sprintf(s, "\n");
}
free_page((unsigned long) buf_page);
return s-page;
}
/*
 * Format the per-cpu activation or deactivation hint.
 *
 * @dev:      cpu device; dev->id is used as the logical cpu number.
 * @page:     output buffer supplied by the sysfs show handler.
 * @activate: non-zero for the activation hint, zero for deactivation.
 *
 * Calls H_BEST_ENERGY with FLAGS_MODE2 (plus FLAGS_ACTIVATE when
 * @activate is set) and the drc index of the cpu, then prints the upper
 * 32 bits of the second return word.
 *
 * Returns the number of bytes written, or -EINVAL when the hypercall
 * fails.
 */
static ssize_t get_best_energy_data(struct device *dev,
char *page, int activate)
{
int rc;
unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
unsigned long flags = 0;
flags = FLAGS_MODE2;
if (activate)
flags |= FLAGS_ACTIVATE;
rc = plpar_hcall9(H_BEST_ENERGY, retbuf, flags,
cpu_to_drc_index(dev->id),
0, 0, 0, 0, 0, 0, 0);
if (rc != H_SUCCESS)
return -EINVAL;
return sprintf(page, "%lu\n", retbuf[1] >> 32);
}
/*
 * Wrapper functions
 *
 * sysfs show callbacks; each forwards to one of the two helpers above
 * with a fixed "activate" argument (1 = activate, 0 = deactivate).
 */

/* Show handler for the activation list. */
static ssize_t cpu_activate_hint_list_show(struct device *dev,
struct device_attribute *attr, char *page)
{
return get_best_energy_list(page, 1);
}
/* Show handler for the deactivation list. */
static ssize_t cpu_deactivate_hint_list_show(struct device *dev,
struct device_attribute *attr, char *page)
{
return get_best_energy_list(page, 0);
}
/* Show handler for the per-cpu activation hint of @dev. */
static ssize_t percpu_activate_hint_show(struct device *dev,
struct device_attribute *attr, char *page)
{
return get_best_energy_data(dev, page, 1);
}
/* Show handler for the per-cpu deactivation hint of @dev. */
static ssize_t percpu_deactivate_hint_show(struct device *dev,
struct device_attribute *attr, char *page)
{
return get_best_energy_data(dev, page, 0);
}
/*
* Create sysfs interface:
* /sys/devices/system/cpu/pseries_activate_hint_list
* /sys/devices/system/cpu/pseries_deactivate_hint_list
* Comma separated list of cpus to activate or deactivate
* /sys/devices/system/cpu/cpuN/pseries_activate_hint
* /sys/devices/system/cpu/cpuN/pseries_deactivate_hint
* Per-cpu value of the hint
*
* All four attributes are read-only (mode 0444, no store callback).
*/
struct device_attribute attr_cpu_activate_hint_list =
__ATTR(pseries_activate_hint_list, 0444,
cpu_activate_hint_list_show, NULL);
struct device_attribute attr_cpu_deactivate_hint_list =
__ATTR(pseries_deactivate_hint_list, 0444,
cpu_deactivate_hint_list_show, NULL);
struct device_attribute attr_percpu_activate_hint =
__ATTR(pseries_activate_hint, 0444,
percpu_activate_hint_show, NULL);
struct device_attribute attr_percpu_deactivate_hint =
__ATTR(pseries_deactivate_hint, 0444,
percpu_deactivate_hint_show, NULL);
static int __init pseries_energy_init(void)
{
int cpu, err;
struct device *cpu_dev;
if (!check_for_h_best_energy()) {
printk(KERN_INFO "Hypercall H_BEST_ENERGY not supported\n");
return 0;
}
/* Create the sysfs files */
err = device_create_file(cpu_subsys.dev_root,
&attr_cpu_activate_hint_list);
if (!err)
err = device_create_file(cpu_subsys.dev_root,
&attr_cpu_deactivate_hint_list);
if (err)
return err;
for_each_possible_cpu(cpu) {
cpu_dev = get_cpu_device(cpu);
err = device_create_file(cpu_dev,
&attr_percpu_activate_hint);
if (err)
break;
err = device_create_file(cpu_dev,
&attr_percpu_deactivate_hint);
if (err)
break;
}
if (err)
return err;
sysfs_entries = 1; /* Removed entries on cleanup */
return 0;
}
/*
 * Module exit: remove the sysfs files created by pseries_energy_init().
 * Nothing is done when init did not create them (sysfs_entries == 0).
 */
static void __exit pseries_energy_cleanup(void)
{
	int cpu;
	struct device *cpu_dev;

	if (!sysfs_entries)
		return;

	/* Remove the sysfs files */
	device_remove_file(cpu_subsys.dev_root, &attr_cpu_activate_hint_list);
	device_remove_file(cpu_subsys.dev_root, &attr_cpu_deactivate_hint_list);

	for_each_possible_cpu(cpu) {
		cpu_dev = get_cpu_device(cpu);
		/*
		 * A possible cpu need not have a registered device; taking
		 * &cpu_dev->kobj of a NULL device would hand sysfs a bogus
		 * kobject pointer.
		 */
		if (!cpu_dev)
			continue;
		sysfs_remove_file(&cpu_dev->kobj,
				  &attr_percpu_activate_hint.attr);
		sysfs_remove_file(&cpu_dev->kobj,
				  &attr_percpu_deactivate_hint.attr);
	}
}
module_init(pseries_energy_init);
module_exit(pseries_energy_cleanup);
MODULE_DESCRIPTION("Driver for pSeries platform energy management");
MODULE_AUTHOR("Vaidyanathan Srinivasan");
MODULE_LICENSE("GPL");
+408
View File
@@ -0,0 +1,408 @@
/*
* Copyright (C) 2001 Dave Engebretsen IBM Corporation
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/sched.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/of.h>
#include <linux/fs.h>
#include <linux/reboot.h>
#include <asm/machdep.h>
#include <asm/rtas.h>
#include <asm/firmware.h>
#include "pseries.h"
static unsigned char ras_log_buf[RTAS_ERROR_LOG_MAX];
static DEFINE_SPINLOCK(ras_log_buf_lock);
static char global_mce_data_buf[RTAS_ERROR_LOG_MAX];
static DEFINE_PER_CPU(__u64, mce_data_buf);
static int ras_check_exception_token;
#define EPOW_SENSOR_TOKEN 9
#define EPOW_SENSOR_INDEX 0
static irqreturn_t ras_epow_interrupt(int irq, void *dev_id);
static irqreturn_t ras_error_interrupt(int irq, void *dev_id);
/*
 * Initialize handlers for the set of interrupts caused by hardware errors
 * and power system events.
 *
 * Looks up the "check-exception" RTAS token, then registers
 * ras_error_interrupt() for the internal-errors event source node and
 * ras_epow_interrupt() for the epow-events node, when those nodes exist.
 * Always returns 0.
 */
static int __init init_ras_IRQ(void)
{
	struct device_node *src;

	ras_check_exception_token = rtas_token("check-exception");

	/* Internal Errors */
	src = of_find_node_by_path("/event-sources/internal-errors");
	if (src) {
		request_event_sources_irqs(src, ras_error_interrupt,
					   "RAS_ERROR");
		of_node_put(src);
	}

	/* EPOW Events */
	src = of_find_node_by_path("/event-sources/epow-events");
	if (src) {
		request_event_sources_irqs(src, ras_epow_interrupt,
					   "RAS_EPOW");
		of_node_put(src);
	}

	return 0;
}
subsys_initcall(init_ras_IRQ);
/* Event modifier values dispatched on by handle_system_shutdown(). */
#define EPOW_SHUTDOWN_NORMAL 1
#define EPOW_SHUTDOWN_ON_UPS 2
#define EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS 3
#define EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH 4
/*
 * React to an EPOW "system shutdown" event.
 *
 * @event_modifier: one of the EPOW_SHUTDOWN_* values.
 *
 * Every recognised modifier except EPOW_SHUTDOWN_ON_UPS requests an
 * orderly power off; the UPS case is only reported.  Unknown modifiers
 * are logged and otherwise ignored.
 */
static void handle_system_shutdown(char event_modifier)
{
	switch (event_modifier) {
	case EPOW_SHUTDOWN_NORMAL:
		pr_emerg("Firmware initiated power off\n");
		orderly_poweroff(1);
		break;

	case EPOW_SHUTDOWN_ON_UPS:
		pr_emerg("Loss of power reported by firmware, system is "
			 "running on UPS/battery\n");
		break;

	case EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS:
		pr_emerg("Loss of system critical functions reported by "
			 "firmware\n");
		pr_emerg("Check RTAS error log for details\n");
		orderly_poweroff(1);
		break;

	case EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH:
		pr_emerg("Ambient temperature too high reported by firmware\n");
		pr_emerg("Check RTAS error log for details\n");
		orderly_poweroff(1);
		break;

	default:
		pr_err("Unknown power/cooling shutdown event (modifier %d)\n",
		       event_modifier);
		break;
	}
}
/*
 * Layout overlaid on the data of a PSERIES_ELOG_SECT_ID_EPOW section
 * (see rtas_parse_epow_errlog()).  Only the low 4 bits of sensor_value
 * and event_modifier are interpreted by this file.
 */
struct epow_errorlog {
unsigned char sensor_value;
unsigned char event_modifier;
unsigned char extended_modifier;
unsigned char reserved;
unsigned char platform_reason;
};
/* EPOW action codes (low 4 bits of sensor_value) handled by rtas_parse_epow_errlog(). */
#define EPOW_RESET 0
#define EPOW_WARN_COOLING 1
#define EPOW_WARN_POWER 2
#define EPOW_SYSTEM_SHUTDOWN 3
#define EPOW_SYSTEM_HALT 4
#define EPOW_MAIN_ENCLOSURE 5
#define EPOW_POWER_OFF 7
/*
 * Decode the EPOW section of an RTAS error log and act on it.
 *
 * @log: RTAS error log to search for a PSERIES_ELOG_SECT_ID_EPOW section.
 *
 * Returns silently when the log carries no EPOW section.  Otherwise the
 * action code (low 4 bits of sensor_value) selects between logging a
 * warning, requesting an orderly power off, or powering off immediately.
 */
void rtas_parse_epow_errlog(struct rtas_error_log *log)
{
	struct pseries_errorlog *pseries_log;
	struct epow_errorlog *epow_log;
	char action_code;
	char modifier;

	pseries_log = get_pseries_errorlog(log, PSERIES_ELOG_SECT_ID_EPOW);
	if (pseries_log == NULL)
		return;

	epow_log = (struct epow_errorlog *)pseries_log->data;
	action_code = epow_log->sensor_value & 0xF;	/* bottom 4 bits */
	modifier = epow_log->event_modifier & 0xF;	/* bottom 4 bits */

	switch (action_code) {
	case EPOW_RESET:
		pr_err("Non critical power or cooling issue cleared\n");
		break;

	case EPOW_WARN_COOLING:
		pr_err("Non critical cooling issue reported by firmware\n");
		pr_err("Check RTAS error log for details\n");
		break;

	case EPOW_WARN_POWER:
		pr_err("Non critical power issue reported by firmware\n");
		pr_err("Check RTAS error log for details\n");
		break;

	case EPOW_SYSTEM_SHUTDOWN:
		/*
		 * Pass the masked value: only the bottom 4 bits carry the
		 * modifier, so the raw byte could miss every case label.
		 */
		handle_system_shutdown(modifier);
		break;

	case EPOW_SYSTEM_HALT:
		pr_emerg("Firmware initiated power off\n");
		orderly_poweroff(1);
		break;

	case EPOW_MAIN_ENCLOSURE:
	case EPOW_POWER_OFF:
		pr_emerg("Critical power/cooling issue reported by firmware\n");
		pr_emerg("Check RTAS error log for details\n");
		pr_emerg("Immediate power off\n");
		emergency_sync();
		kernel_power_off();
		break;

	default:
		pr_err("Unknown power/cooling event (action code %d)\n",
		       action_code);
		break;
	}
}
/*
 * Handle environmental and power warning (EPOW) interrupts.
 *
 * Reads the EPOW sensor to decide whether the event is time critical,
 * asks RTAS check-exception for the event log (into ras_log_buf, under
 * ras_log_buf_lock), logs it and parses the EPOW section.
 * Always returns IRQ_HANDLED.
 */
static irqreturn_t ras_epow_interrupt(int irq, void *dev_id)
{
	int status;
	/*
	 * Start from a defined value so a failed sensor read is treated as
	 * "not time critical" instead of reading an uninitialized variable.
	 */
	int state = 0;
	int critical;

	status = rtas_get_sensor(EPOW_SENSOR_TOKEN, EPOW_SENSOR_INDEX, &state);

	if (state > 3)
		critical = 1;		/* Time Critical */
	else
		critical = 0;

	spin_lock(&ras_log_buf_lock);

	status = rtas_call(ras_check_exception_token, 6, 1, NULL,
			   RTAS_VECTOR_EXTERNAL_INTERRUPT,
			   virq_to_hw(irq),
			   RTAS_EPOW_WARNING,
			   critical, __pa(&ras_log_buf),
				rtas_get_error_log_max());

	log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0);

	rtas_parse_epow_errlog((struct rtas_error_log *)ras_log_buf);

	spin_unlock(&ras_log_buf_lock);
	return IRQ_HANDLED;
}
/*
* Handle hardware error interrupts.
*
* RTAS check-exception is called to collect data on the exception. If
* the error is deemed recoverable, we log a warning and return.
* For nonrecoverable errors, an error is logged and we stop all processing
* as quickly as possible in order to prevent propagation of the failure.
*/
static irqreturn_t ras_error_interrupt(int irq, void *dev_id)
{
struct rtas_error_log *rtas_elog;
int status;
int fatal;
spin_lock(&ras_log_buf_lock);
status = rtas_call(ras_check_exception_token, 6, 1, NULL,
RTAS_VECTOR_EXTERNAL_INTERRUPT,
virq_to_hw(irq),
RTAS_INTERNAL_ERROR, 1 /* Time Critical */,
__pa(&ras_log_buf),
rtas_get_error_log_max());
rtas_elog = (struct rtas_error_log *)ras_log_buf;
if ((status == 0) && (rtas_elog->severity >= RTAS_SEVERITY_ERROR_SYNC))
fatal = 1;
else
fatal = 0;
/* format and print the extended information */
log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, fatal);
if (fatal) {
pr_emerg("Fatal hardware error reported by firmware");
pr_emerg("Check RTAS error log for details");
pr_emerg("Immediate power off");
emergency_sync();
kernel_power_off();
} else {
pr_err("Recoverable hardware error reported by firmware");
}
spin_unlock(&ras_log_buf_lock);
return IRQ_HANDLED;
}
/*
 * Some versions of FWNMI place the buffer inside the 4kB page starting at
 * 0x7000. Other versions place it inside the rtas buffer. We check both.
 *
 * Evaluates to true when (A) lies in [0x7000, 0x7ff0) or in
 * [rtas.base, rtas.base + rtas.size - 16).  Note that (A) is expanded
 * several times, so it must not have side effects.
 */
#define VALID_FWNMI_BUFFER(A) \
((((A) >= 0x7000) && ((A) < 0x7ff0)) || \
(((A) >= rtas.base) && ((A) < (rtas.base + rtas.size - 16))))
/*
 * Get the error information for errors coming through the
 * FWNMI vectors.  The pt_regs' r3 will be updated to reflect
 * the actual r3 if possible, and a ptr to the error log entry
 * will be returned if found.
 *
 * If the RTAS error is not of the extended type, then we put it in a per
 * cpu 64bit buffer. If it is the extended type we use global_mce_data_buf.
 *
 * The global_mce_data_buf does not have any locks or protection around it,
 * if a second machine check comes in, or a system reset is done
 * before we have logged the error, then we will get corruption in the
 * error log.  This is preferable over holding off on calling
 * ibm,nmi-interlock which would result in us checkstopping if a
 * second machine check did come in.
 *
 * Returns NULL when r3 does not point into a valid FWNMI buffer.
 */
static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs)
{
	unsigned long *savep;
	struct rtas_error_log *h, *errhdr = NULL;

	if (!VALID_FWNMI_BUFFER(regs->gpr[3])) {
		printk(KERN_ERR "FWNMI: corrupt r3 0x%016lx\n", regs->gpr[3]);
		return NULL;
	}

	savep = __va(regs->gpr[3]);
	regs->gpr[3] = savep[0];	/* restore original r3 */

	/* If it isn't an extended log we can use the per cpu 64bit buffer */
	h = (struct rtas_error_log *)&savep[1];
	if (!h->extended) {
		memcpy(&__get_cpu_var(mce_data_buf), h, sizeof(__u64));
		errhdr = (struct rtas_error_log *)&__get_cpu_var(mce_data_buf);
	} else {
		int len;

		/*
		 * Clamp the copy to the destination size.  Taking the
		 * maximum here would always copy at least
		 * RTAS_ERROR_LOG_MAX bytes and could write past the end of
		 * global_mce_data_buf.
		 */
		len = min_t(int, 8+h->extended_log_length, RTAS_ERROR_LOG_MAX);
		memset(global_mce_data_buf, 0, RTAS_ERROR_LOG_MAX);
		memcpy(global_mce_data_buf, h, len);
		errhdr = (struct rtas_error_log *)global_mce_data_buf;
	}

	return errhdr;
}
/*
 * Call this when done with the data returned by fwnmi_get_errinfo().
 * It issues the RTAS "ibm,nmi-interlock" call, which releases the saved
 * data area for other CPUs in the partition to receive FWNMI errors.
 * A non-zero RTAS status is logged.
 */
static void fwnmi_release_errinfo(void)
{
	int rc;

	rc = rtas_call(rtas_token("ibm,nmi-interlock"), 0, 1, NULL);
	if (rc == 0)
		return;

	printk(KERN_ERR "FWNMI: nmi-interlock failed: %d\n", rc);
}
/*
 * System reset hook.  When FWNMI is active the error information is
 * fetched (which also restores r3 in @regs) and the FWNMI save area is
 * released again.  Always returns 0, i.e. the reset still has to be
 * performed by the caller.
 */
int pSeries_system_reset_exception(struct pt_regs *regs)
{
	struct rtas_error_log *errhdr;

	if (!fwnmi_active)
		return 0; /* need to perform reset */

	errhdr = fwnmi_get_errinfo(regs);
	if (errhdr) {
		/* XXX Should look at FWNMI information */
	}
	fwnmi_release_errinfo();

	return 0; /* need to perform reset */
}
/*
 * See if we can recover from a machine check exception.
 * This is only called on power4 (or above) and only via
 * the Firmware Non-Maskable Interrupts (fwnmi) handler
 * which provides the error analysis for us.
 *
 * The error log is passed to log_error() in every case.
 *
 * Return 1 if corrected (or delivered a signal).
 * Return 0 if there is nothing we can do.
 */
static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err)
{
	int handled = 0;

	/* If MSR_RI isn't set, we cannot recover */
	if (regs->msr & MSR_RI) {
		if (err->disposition == RTAS_DISP_FULLY_RECOVERED) {
			/* Platform corrected itself */
			handled = 1;
		} else if (err->disposition == RTAS_DISP_LIMITED_RECOVERY) {
			/* Platform corrected itself but could be degraded */
			printk(KERN_ERR "MCE: limited recovery, system may "
			       "be degraded\n");
			handled = 1;
		} else if (user_mode(regs) && !is_global_init(current) &&
			   err->severity == RTAS_SEVERITY_ERROR_SYNC) {
			/*
			 * If we received a synchronous error when in userspace
			 * kill the task. Firmware may report details of the
			 * fail asynchronously, so we can't rely on the target
			 * and type fields being valid here.
			 */
			printk(KERN_ERR "MCE: uncorrectable error, killing task "
			       "%s:%d\n", current->comm, current->pid);

			_exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
			handled = 1;
		}
	}

	log_error((char *)err, ERR_TYPE_RTAS_LOG, 0);

	return handled;
}
/*
 * Handle a machine check.
 *
 * Note that on Power 4 and beyond Firmware Non-Maskable Interrupts (fwnmi)
 * should be present.  If so the handler which called us tells us if the
 * error was recovered (never true if RI=0).
 *
 * On hardware prior to Power 4 these exceptions were asynchronous which
 * means we can't tell exactly where it occurred and so we can't recover.
 *
 * Returns 1 when recover_mce() reports the error as handled, 0 otherwise
 * (including when FWNMI is not active or no error log was found).
 */
int pSeries_machine_check_exception(struct pt_regs *regs)
{
	struct rtas_error_log *errp;

	if (!fwnmi_active)
		return 0;

	errp = fwnmi_get_errinfo(regs);
	fwnmi_release_errinfo();

	if (!errp)
		return 0;

	return recover_mce(regs, errp) ? 1 : 0;
}
@@ -0,0 +1,564 @@
/*
* pSeries_reconfig.c - support for dynamic reconfiguration (including PCI
* Hotplug and Dynamic Logical Partitioning on RPA platforms).
*
* Copyright (C) 2005 Nathan Lynch
* Copyright (C) 2005 IBM Corporation
*
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 as published by the Free Software Foundation.
*/
#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/notifier.h>
#include <linux/proc_fs.h>
#include <linux/slab.h>
#include <asm/prom.h>
#include <asm/machdep.h>
#include <asm/uaccess.h>
#include <asm/pSeries_reconfig.h>
#include <asm/mmu.h>
/*
* Routines for "runtime" addition and removal of device tree nodes.
*/
#ifdef CONFIG_PROC_DEVICETREE
/*
 * Add a node to /proc/device-tree: create a directory named after the
 * last component of the node's full_name under the parent's proc entry
 * and, if that worked, populate it via proc_device_tree_add_node().
 */
static void add_node_proc_entries(struct device_node *np)
{
	const char *leaf = strrchr(np->full_name, '/') + 1;
	struct proc_dir_entry *dir;

	dir = proc_mkdir(leaf, np->parent->pde);
	if (!dir)
		return;

	proc_device_tree_add_node(np, dir);
}
/*
 * Remove a node from /proc/device-tree: first one proc entry per
 * property of the node, then the node's own directory (if it has one)
 * from the parent's proc entry.
 */
static void remove_node_proc_entries(struct device_node *np)
{
	struct device_node *parent = np->parent;
	struct property *prop;

	for (prop = np->properties; prop; prop = prop->next)
		remove_proc_entry(prop->name, np->pde);

	if (!np->pde)
		return;

	remove_proc_entry(np->pde->name, parent->pde);
}
#else /* !CONFIG_PROC_DEVICETREE */
/* No-op variant used when CONFIG_PROC_DEVICETREE is not set. */
static void add_node_proc_entries(struct device_node *np)
{
}
/* No-op variant used when CONFIG_PROC_DEVICETREE is not set. */
static void remove_node_proc_entries(struct device_node *np)
{
}
#endif /* CONFIG_PROC_DEVICETREE */
/**
 * derive_parent - basically like dirname(1)
 * @path: the full_name of a node to be added to the tree
 *
 * Returns the node which should be the parent of the node
 * described by path.  E.g., for path = "/foo/bar", returns
 * the node with full_name = "/foo".
 *
 * Returns ERR_PTR(-EINVAL) if @path is "/", contains no '/' at all, or
 * the parent node cannot be found, and ERR_PTR(-ENOMEM) if the temporary
 * copy of the parent path cannot be allocated.
 */
static struct device_node *derive_parent(const char *path)
{
	struct device_node *parent;
	const char *last_slash;
	char *parent_path = NULL;	/* heap copy; NULL when parent is "/" */

	/* reject if path is "/" */
	if (!strcmp(path, "/"))
		return ERR_PTR(-EINVAL);

	/* a path without any '/' has no parent we could derive */
	last_slash = strrchr(path, '/');
	if (!last_slash)
		return ERR_PTR(-EINVAL);

	if (last_slash != path) {
		size_t parent_path_len = last_slash - path + 1;

		parent_path = kmalloc(parent_path_len, GFP_KERNEL);
		if (!parent_path)
			return ERR_PTR(-ENOMEM);
		strlcpy(parent_path, path, parent_path_len);
	}

	parent = of_find_node_by_path(parent_path ? parent_path : "/");

	/*
	 * Free the copy on every path (kfree(NULL) is a no-op).  Tracking
	 * the allocation by pointer rather than by comparing the string
	 * with "/" also covers a copy that happens to equal "/".
	 */
	kfree(parent_path);

	if (!parent)
		return ERR_PTR(-EINVAL);

	return parent;
}
/* Notifier chain called by pSeries_reconfig_notify() on device-tree reconfiguration events. */
static BLOCKING_NOTIFIER_HEAD(pSeries_reconfig_chain);
/*
 * Add @nb to the pSeries reconfiguration notifier chain.
 * Returns the result of blocking_notifier_chain_register().
 */
int pSeries_reconfig_notifier_register(struct notifier_block *nb)
{
	int rc;

	rc = blocking_notifier_chain_register(&pSeries_reconfig_chain, nb);
	return rc;
}
/*
 * Remove @nb from the pSeries reconfiguration notifier chain.  The result
 * of the unregister call is not reported to the caller.
 */
void pSeries_reconfig_notifier_unregister(struct notifier_block *nb)
{
	(void)blocking_notifier_chain_unregister(&pSeries_reconfig_chain, nb);
}
/*
 * Run the pSeries reconfiguration notifier chain for @action with
 * payload @p and translate the chain result into an errno value via
 * notifier_to_errno().
 */
int pSeries_reconfig_notify(unsigned long action, void *p)
{
	int chain_rc;

	chain_rc = blocking_notifier_call_chain(&pSeries_reconfig_chain,
						action, p);

	return notifier_to_errno(chain_rc);
}
static int pSeries_reconfig_add_node(const char *path, struct property *proplist)
{
struct device_node *np;
int err = -ENOMEM;
np = kzalloc(sizeof(*np), GFP_KERNEL);
if (!np)
goto out_err;
np->full_name = kstrdup(path, GFP_KERNEL);
if (!np->full_name)
goto out_err;
np->properties = proplist;
of_node_set_flag(np, OF_DYNAMIC);
kref_init(&np->kref);
np->parent = derive_parent(path);
if (IS_ERR(np->parent)) {
err = PTR_ERR(np->parent);
goto out_err;
}
err = pSeries_reconfig_notify(PSERIES_RECONFIG_ADD, np);
if (err) {
printk(KERN_ERR "Failed to add device node %s\n", path);
goto out_err;
}
of_attach_node(np);
add_node_proc_entries(np);
of_node_put(np->parent);
return 0;
out_err:
if (np) {
of_node_put(np->parent);
kfree(np->full_name);
kfree(np);
}
return err;
}
/*
 * Detach @np from the device tree.
 *
 * Returns -EINVAL if the node has no parent and -EBUSY if it still has
 * a child.  Otherwise the proc entries are removed, the reconfiguration
 * chain is notified (its result is not checked), the node is detached
 * and one reference on it is dropped; returns 0.
 */
static int pSeries_reconfig_remove_node(struct device_node *np)
{
	struct device_node *parent;
	struct device_node *child;

	parent = of_get_parent(np);
	if (!parent)
		return -EINVAL;

	child = of_get_next_child(np, NULL);
	if (child) {
		of_node_put(child);
		of_node_put(parent);
		return -EBUSY;
	}

	remove_node_proc_entries(np);

	pSeries_reconfig_notify(PSERIES_RECONFIG_REMOVE, np);
	of_detach_node(np);

	of_node_put(parent);
	of_node_put(np); /* Must decrement the refcount */
	return 0;
}
/*
* /proc/powerpc/ofdt - yucky binary interface for adding and removing
* OF device nodes. Should be deprecated as soon as we get an
* in-kernel wrapper for the RTAS ibm,configure-connector call.
*/
/*
 * Free every property on the singly linked list starting at @prop,
 * including each property's name and value buffers.  A NULL list is
 * accepted.
 */
static void release_prop_list(const struct property *prop)
{
	while (prop) {
		struct property *following = prop->next;

		kfree(prop->name);
		kfree(prop->value);
		kfree(prop);

		prop = following;
	}
}
/**
 * parse_next_property - process the next property from raw input buffer
 * @buf: input buffer, must be nul-terminated
 * @end: end of the input buffer + 1, for validation
 * @name: return value; set to property name in buf
 * @length: return value; set to length of value
 * @value: return value; set to the property value in buf
 *
 * The expected layout is "<name> <decimal length> <value bytes>".
 * The separator after the name is overwritten with a NUL in place.
 *
 * Note that the caller must make copies of the name and value returned,
 * this function does no allocation or copying of the data.  Return value
 * is set to the next name in buf, or NULL on error.
 */
static char * parse_next_property(char *buf, char *end, char **name, int *length,
				  unsigned char **value)
{
	char *tmp;

	*name = buf;

	tmp = strchr(buf, ' ');
	if (!tmp) {
		printk(KERN_ERR "property parse failed in %s at line %d\n",
		       __func__, __LINE__);
		return NULL;
	}
	*tmp = '\0';

	if (++tmp >= end) {
		printk(KERN_ERR "property parse failed in %s at line %d\n",
		       __func__, __LINE__);
		return NULL;
	}

	/* now we're on the length */
	*length = -1;
	*length = simple_strtoul(tmp, &tmp, 10);
	/*
	 * Reject any value that does not fit a non-negative int: a negative
	 * length would move the cursor backwards below and later be used
	 * as a copy size by the caller.
	 */
	if (*length < 0) {
		printk(KERN_ERR "property parse failed in %s at line %d\n",
		       __func__, __LINE__);
		return NULL;
	}
	if (*tmp != ' ' || ++tmp >= end) {
		printk(KERN_ERR "property parse failed in %s at line %d\n",
		       __func__, __LINE__);
		return NULL;
	}

	/* now we're on the value */
	*value = tmp;
	/* the value must lie entirely inside the buffer */
	if (*length > end - tmp) {
		printk(KERN_ERR "property parse failed in %s at line %d\n",
		       __func__, __LINE__);
		return NULL;
	}
	tmp += *length;
	if (tmp < end && *tmp != ' ' && *tmp != '\0') {
		printk(KERN_ERR "property parse failed in %s at line %d\n",
		       __func__, __LINE__);
		return NULL;
	}
	tmp++;

	/* and now we should be on the next name, or the end */
	return tmp;
}
/*
 * Allocate a property holding private copies of @name and of @length
 * bytes of @value (the value copy gets one extra terminating NUL byte),
 * and link it in front of @last.
 *
 * Returns the new property, or NULL if any allocation fails.
 */
static struct property *new_property(const char *name, const int length,
				     const unsigned char *value, struct property *last)
{
	struct property *prop;

	prop = kzalloc(sizeof(*prop), GFP_KERNEL);
	if (!prop)
		return NULL;

	prop->name = kmalloc(strlen(name) + 1, GFP_KERNEL);
	if (!prop->name)
		goto fail;

	prop->value = kmalloc(length + 1, GFP_KERNEL);
	if (!prop->value)
		goto fail;

	strcpy(prop->name, name);
	memcpy(prop->value, value, length);
	((char *)prop->value)[length] = 0;
	prop->length = length;
	prop->next = last;
	return prop;

fail:
	/* members not yet allocated are still NULL from kzalloc() */
	kfree(prop->name);
	kfree(prop->value);
	kfree(prop);
	return NULL;
}
/*
 * Handle the "add_node" command: @buf holds "<path> <properties...>".
 *
 * Fails with -EINVAL if the path is not followed by a space, if a node
 * with that path already exists, or if a property cannot be parsed, and
 * with -ENOMEM if a property cannot be allocated.  On any failure the
 * properties built so far are released.
 */
static int do_add_node(char *buf, size_t bufsize)
{
	char *path, *end, *name;
	struct device_node *existing;
	struct property *proplist = NULL;
	unsigned char* value;
	int length, rv = 0;

	end = buf + bufsize;

	path = buf;
	buf = strchr(buf, ' ');
	if (!buf)
		return -EINVAL;
	*buf = '\0';
	buf++;

	existing = of_find_node_by_path(path);
	if (existing) {
		of_node_put(existing);
		return -EINVAL;
	}

	/* rv = build_prop_list(tmp, bufsize - (tmp - buf), &proplist); */
	while (buf < end) {
		struct property *prev = proplist;

		buf = parse_next_property(buf, end, &name, &length, &value);
		if (!buf)
			break;

		proplist = new_property(name, length, value, prev);
		if (!proplist) {
			rv = -ENOMEM;
			proplist = prev;
			goto out;
		}
	}
	if (!buf) {
		rv = -EINVAL;
		goto out;
	}

	rv = pSeries_reconfig_add_node(path, proplist);

out:
	if (rv)
		release_prop_list(proplist);
	return rv;
}
/*
 * Handle the "remove_node" command: @buf is the full path of the node.
 * Returns -ENODEV if no such node exists, otherwise the result of
 * pSeries_reconfig_remove_node().
 */
static int do_remove_node(char *buf)
{
	struct device_node *node = of_find_node_by_path(buf);
	int rv;

	if (node)
		rv = pSeries_reconfig_remove_node(node);
	else
		rv = -ENODEV;

	of_node_put(node);
	return rv;
}
/*
 * Parse a leading "<phandle> " token from @buf and look the node up.
 *
 * The separating space is overwritten with a NUL.  *@npp is set to the
 * node found for the phandle, or NULL.  Returns a pointer just past the
 * separator, or NULL if @buf contains no space (in which case *@npp is
 * NULL as well).  @bufsize is not used.
 */
static char *parse_node(char *buf, size_t bufsize, struct device_node **npp)
{
	char *sep;
	phandle handle;

	*npp = NULL;

	sep = strchr(buf, ' ');
	if (!sep)
		return NULL;
	*sep = '\0';

	handle = simple_strtoul(buf, NULL, 0);
	*npp = of_find_node_by_phandle(handle);

	return sep + 1;
}
/*
 * Handle the "add_property" command: @buf holds
 * "<phandle> <name> <length> <value>".
 *
 * Returns -ENODEV if the node cannot be found, -EINVAL if the property
 * cannot be parsed, -ENOMEM if it cannot be allocated, and 0 otherwise.
 */
static int do_add_property(char *buf, size_t bufsize)
{
	struct device_node *np;
	struct property *prop;
	unsigned char *value;
	char *name;
	char *end = buf + bufsize;
	int length;

	buf = parse_node(buf, bufsize, &np);
	if (!np)
		return -ENODEV;

	if (!parse_next_property(buf, end, &name, &length, &value))
		return -EINVAL;

	prop = new_property(name, length, value, NULL);
	if (!prop)
		return -ENOMEM;

	/*
	 * NOTE(review): the result of prom_add_property() is discarded here,
	 * whereas do_update_property() returns it -- confirm whether a
	 * failure should be reported to the writer.
	 */
	prom_add_property(np, prop);

	return 0;
}
/*
 * Handle the "remove_property" command: @buf holds "<phandle> <name>".
 * Anything after the name (from the next space on) is cut off.
 *
 * Returns -ENODEV if the node cannot be found, -EINVAL for an empty
 * name, otherwise the result of prom_remove_property().
 */
static int do_remove_property(char *buf, size_t bufsize)
{
	struct device_node *np;
	struct property *prop;
	char *sep;

	buf = parse_node(buf, bufsize, &np);
	if (!np)
		return -ENODEV;

	sep = strchr(buf, ' ');
	if (sep)
		*sep = '\0';

	if (buf[0] == '\0')
		return -EINVAL;

	prop = of_find_property(np, buf, NULL);

	return prom_remove_property(np, prop);
}
/*
 * Handle the "update_property" command: @buf holds
 * "<phandle> <name> <length> <value>[ <name> <length> <value>]".
 *
 * Replaces the named property of the node with a freshly allocated one,
 * or adds it if the node has no property of that name.  Updates of
 * "slb-size"/"ibm,slb-size" are also passed to slb_set_size().
 *
 * Returns -ENODEV if the node is not found (or the property is missing
 * and the name is empty), -EINVAL on parse errors, -ENOMEM on allocation
 * failure, or the error from the prom_*_property()/notifier calls.
 */
static int do_update_property(char *buf, size_t bufsize)
{
	struct device_node *np;
	unsigned char *value;
	char *name, *end, *next_prop;
	int rc, length;
	struct property *newprop, *oldprop;

	/*
	 * Compute the end of the input before parse_node() advances buf,
	 * as do_add_property() does; otherwise "end" lands past the real
	 * end of the buffer and the parser's bounds checks are too lax.
	 */
	end = buf + bufsize;
	buf = parse_node(buf, bufsize, &np);

	if (!np)
		return -ENODEV;

	next_prop = parse_next_property(buf, end, &name, &length, &value);
	if (!next_prop)
		return -EINVAL;

	newprop = new_property(name, length, value, NULL);
	if (!newprop)
		return -ENOMEM;

	/* NOTE(review): reads sizeof(int) bytes of value regardless of length -- confirm callers always send 4 bytes */
	if (!strcmp(name, "slb-size") || !strcmp(name, "ibm,slb-size"))
		slb_set_size(*(int *)value);

	oldprop = of_find_property(np, name,NULL);
	if (!oldprop) {
		if (strlen(name))
			return prom_add_property(np, newprop);
		/* newprop was never handed to the tree: free it here */
		release_prop_list(newprop);
		return -ENODEV;
	}

	rc = prom_update_property(np, newprop, oldprop);
	if (rc)
		return rc;

	/* For memory under the ibm,dynamic-reconfiguration-memory node
	 * of the device tree, adding and removing memory is just an update
	 * to the ibm,dynamic-memory property instead of adding/removing a
	 * memory node in the device tree.  For these cases we still need to
	 * involve the notifier chain.
	 */
	if (!strcmp(name, "ibm,dynamic-memory")) {
		int action;

		next_prop = parse_next_property(next_prop, end, &name,
						&length, &value);
		if (!next_prop)
			return -EINVAL;

		if (!strcmp(name, "add"))
			action = PSERIES_DRCONF_MEM_ADD;
		else
			action = PSERIES_DRCONF_MEM_REMOVE;

		rc = pSeries_reconfig_notify(action, value);
		if (rc) {
			/* notifier refused: put the old property back */
			prom_update_property(np, oldprop, newprop);
			return rc;
		}
	}

	return 0;
}
/**
 * ofdt_write - perform operations on the Open Firmware device tree
 *
 * @file: not used
 * @buf: command and arguments
 * @count: size of the command buffer
 * @off: not used
 *
 * The input has the form "<command> <arguments>".  The commands
 * dispatched here are add_node, remove_node, add_property,
 * remove_property and update_property; anything else, or input without
 * a space after the command, yields -EINVAL.
 *
 * Returns @count on success, otherwise the (non-zero) result of the
 * failing step: -ENOMEM, -EFAULT, -EINVAL or the handler's return value.
 */
static ssize_t ofdt_write(struct file *file, const char __user *buf, size_t count,
			  loff_t *off)
{
	char *kbuf;
	char *args;
	size_t args_len;
	int rv = 0;

	kbuf = kmalloc(count + 1, GFP_KERNEL);
	if (!kbuf) {
		rv = -ENOMEM;
		goto out;
	}
	if (copy_from_user(kbuf, buf, count)) {
		rv = -EFAULT;
		goto out;
	}

	/* the parsers below rely on a NUL-terminated buffer */
	kbuf[count] = '\0';

	args = strchr(kbuf, ' ');
	if (!args) {
		rv = -EINVAL;
		goto out;
	}
	*args = '\0';
	args++;
	args_len = count - (args - kbuf);

	if (strcmp(kbuf, "add_node") == 0)
		rv = do_add_node(args, args_len);
	else if (strcmp(kbuf, "remove_node") == 0)
		rv = do_remove_node(args);
	else if (strcmp(kbuf, "add_property") == 0)
		rv = do_add_property(args, args_len);
	else if (strcmp(kbuf, "remove_property") == 0)
		rv = do_remove_property(args, args_len);
	else if (strcmp(kbuf, "update_property") == 0)
		rv = do_update_property(args, args_len);
	else
		rv = -EINVAL;
out:
	kfree(kbuf);
	return rv ? rv : count;
}
/* File operations for the ofdt proc file: write-only, seeking is a no-op. */
static const struct file_operations ofdt_fops = {
.write = ofdt_write,
.llseek = noop_llseek,
};
/*
 * Create /proc/powerpc/ofdt, writable by its owner only (S_IWUSR), when
 * running on a pseries machine.  Failure to create the entry is not
 * reported; the initcall always returns 0.
 */
static int proc_ppc64_create_ofdt(void)
{
	struct proc_dir_entry *ent;

	if (machine_is(pseries)) {
		ent = proc_create("powerpc/ofdt", S_IWUSR, NULL, &ofdt_fops);
		if (ent)
			ent->size = 0;
	}

	return 0;
}
__initcall(proc_ppc64_create_ofdt);
@@ -0,0 +1,214 @@
/*
* c 2001 PPC 64 Team, IBM Corp
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* scan-log-data driver for PPC64 Todd Inglett <tinglett@vnet.ibm.com>
*
* When ppc64 hardware fails the service processor dumps internal state
* of the system. After a reboot the operating system can access a dump
* of this data using this driver. A dump exists if the device-tree
* /chosen/ibm,scan-log-data property exists.
*
* This driver exports /proc/powerpc/rtas/scan-log-dump which can be read.
* The driver supports only sequential reads.
*
* The driver looks at a write to the driver for the single word "reset".
* If given, the driver will reset the scanlog so the platform can free it.
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/errno.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <asm/rtas.h>
#include <asm/prom.h>
#define MODULE_VERS "1.0"
#define MODULE_NAME "scanlog"
/* Status returns from ibm,scan-log-dump */
#define SCANLOG_COMPLETE 0
#define SCANLOG_HWERROR -1
#define SCANLOG_CONTINUE 1
static unsigned int ibm_scan_log_dump; /* RTAS token */
static struct proc_dir_entry *proc_ppc64_scan_log_dump; /* The proc file */
/*
 * Read the next chunk of the scan-log dump.
 *
 * The per-file work buffer (the proc entry's data) is copied into
 * rtas_data_buf, ibm,scan-log-dump is called, and the buffer is copied
 * back, all under rtas_data_buf_lock.  The call is repeated, sleeping in
 * between, until RTAS reports data, completion or an error.
 *
 * Returns the number of bytes copied to @buf, 0 at end of the dump,
 * -EINVAL for reads smaller than 1024 bytes, -EFAULT if @buf is not
 * writable, and -EIO on a hardware error, an unknown RTAS status, or a
 * data window reported by RTAS that does not fit the work buffer.
 */
static ssize_t scanlog_read(struct file *file, char __user *buf,
			    size_t count, loff_t *ppos)
{
	struct inode * inode = file->f_path.dentry->d_inode;
	struct proc_dir_entry *dp;
	unsigned int *data;
	int status;
	unsigned long len, off;
	unsigned int wait_time;

	dp = PDE(inode);
	data = (unsigned int *)dp->data;

	if (count > RTAS_DATA_BUF_SIZE)
		count = RTAS_DATA_BUF_SIZE;

	if (count < 1024) {
		/* This is the min supported by this RTAS call.  Rather
		 * than do all the buffering we insist the user code handle
		 * larger reads.  As long as cp works... :)
		 */
		printk(KERN_ERR "scanlog: cannot perform a small read (%zu)\n", count);
		return -EINVAL;
	}

	if (!access_ok(VERIFY_WRITE, buf, count))
		return -EFAULT;

	for (;;) {
		wait_time = 500;	/* default wait if no data */
		spin_lock(&rtas_data_buf_lock);
		memcpy(rtas_data_buf, data, RTAS_DATA_BUF_SIZE);
		status = rtas_call(ibm_scan_log_dump, 2, 1, NULL,
				   (u32) __pa(rtas_data_buf), (u32) count);
		memcpy(data, rtas_data_buf, RTAS_DATA_BUF_SIZE);
		spin_unlock(&rtas_data_buf_lock);

		pr_debug("scanlog: status=%d, data[0]=%x, data[1]=%x, " \
			 "data[2]=%x\n", status, data[0], data[1], data[2]);
		switch (status) {
		case SCANLOG_COMPLETE:
			pr_debug("scanlog: hit eof\n");
			return 0;
		case SCANLOG_HWERROR:
			pr_debug("scanlog: hardware error reading data\n");
			return -EIO;
		case SCANLOG_CONTINUE:
			/* We may or may not have data yet */
			len = data[1];
			off = data[2];
			if (len > 0) {
				/*
				 * The window comes from firmware; refuse
				 * anything reaching outside the work buffer
				 * rather than copying unrelated memory out.
				 */
				if (off > RTAS_DATA_BUF_SIZE ||
				    len > RTAS_DATA_BUF_SIZE - off)
					return -EIO;
				if (copy_to_user(buf, ((char *)data)+off, len))
					return -EFAULT;
				return len;
			}
			/* Break to sleep default time */
			break;
		default:
			/* Assume extended busy */
			wait_time = rtas_busy_delay_time(status);
			if (!wait_time) {
				printk(KERN_ERR "scanlog: unknown error " \
				       "from rtas: %d\n", status);
				return -EIO;
			}
		}
		/* Apparently no data yet.  Wait and try again. */
		msleep_interruptible(wait_time);
	}
	/*NOTREACHED*/
}
/*
 * Accept a short command written to the proc file.  At most 19 bytes are
 * looked at; if they start with "reset", ibm,scan-log-dump is called
 * with zero arguments.  Any other input is ignored.
 *
 * Returns the (possibly truncated) byte count, or -EFAULT if the user
 * buffer cannot be read.
 */
static ssize_t scanlog_write(struct file * file, const char __user * buf,
			     size_t count, loff_t *ppos)
{
	char cmd[20];
	int status;

	if (count > sizeof(cmd) - 1)
		count = sizeof(cmd) - 1;

	if (copy_from_user(cmd, buf, count))
		return -EFAULT;
	cmd[count] = 0;

	if (buf && strncmp(cmd, "reset", 5) == 0) {
		pr_debug("scanlog: reset scanlog\n");
		status = rtas_call(ibm_scan_log_dump, 2, 1, NULL, 0, 0);
		pr_debug("scanlog: rtas returns %d\n", status);
	}

	return count;
}
static int scanlog_open(struct inode * inode, struct file * file)
{
struct proc_dir_entry *dp = PDE(inode);
unsigned int *data = (unsigned int *)dp->data;
if (data[0] != 0) {
/* This imperfect test stops a second copy of the
* data (or a reset while data is being copied)
*/
return -EBUSY;
}
data[0] = 0; /* re-init so we restart the scan */
return 0;
}
static int scanlog_release(struct inode * inode, struct file * file)
{
struct proc_dir_entry *dp = PDE(inode);
unsigned int *data = (unsigned int *)dp->data;
data[0] = 0;
return 0;
}
/* File operations for the scan-log-dump proc file created in scanlog_init(). */
const struct file_operations scanlog_fops = {
.owner = THIS_MODULE,
.read = scanlog_read,
.write = scanlog_write,
.open = scanlog_open,
.release = scanlog_release,
.llseek = noop_llseek,
};
/*
 * Module init: look up the ibm,scan-log-dump RTAS token, allocate the
 * zeroed work buffer and create the proc file with that buffer as its
 * data.
 *
 * Returns 0 on success, -ENODEV if the RTAS service is unknown, and
 * -ENOMEM if the buffer or the proc entry cannot be created.
 */
static int __init scanlog_init(void)
{
	struct proc_dir_entry *ent;
	void *data;

	ibm_scan_log_dump = rtas_token("ibm,scan-log-dump");
	if (ibm_scan_log_dump == RTAS_UNKNOWN_SERVICE)
		return -ENODEV;

	/* Ideally we could allocate a buffer < 4G */
	data = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
	if (!data)
		return -ENOMEM;

	ent = proc_create_data("powerpc/rtas/scan-log-dump", S_IRUSR, NULL,
			       &scanlog_fops, data);
	if (!ent) {
		kfree(data);
		return -ENOMEM;
	}

	proc_ppc64_scan_log_dump = ent;

	return 0;
}
/*
 * Module exit: remove the proc file and free its work buffer.
 *
 * The entry is removed before the buffer is freed so that the file
 * operations, which dereference the entry's data, cannot be reached
 * through the proc file once the buffer is gone.
 */
static void __exit scanlog_cleanup(void)
{
	void *data;

	if (!proc_ppc64_scan_log_dump)
		return;

	/* fetch the pointer first; the entry must not be touched after removal */
	data = proc_ppc64_scan_log_dump->data;
	remove_proc_entry("scan-log-dump", proc_ppc64_scan_log_dump->parent);
	kfree(data);
}
/* Module entry/exit hooks and license for the scanlog driver. */
module_init(scanlog_init);
module_exit(scanlog_cleanup);
MODULE_LICENSE("GPL");
@@ -0,0 +1,660 @@
/*
* 64-bit pSeries and RS/6000 setup code.
*
* Copyright (C) 1995 Linus Torvalds
* Adapted from 'alpha' version by Gary Thomas
* Modified by Cort Dougan (cort@cs.nmt.edu)
* Modified by PPC64 Team, IBM Corp
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
/*
* bootup setup stuff..
*/
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/stddef.h>
#include <linux/unistd.h>
#include <linux/user.h>
#include <linux/tty.h>
#include <linux/major.h>
#include <linux/interrupt.h>
#include <linux/reboot.h>
#include <linux/init.h>
#include <linux/ioport.h>
#include <linux/console.h>
#include <linux/pci.h>
#include <linux/utsname.h>
#include <linux/adb.h>
#include <linux/export.h>
#include <linux/delay.h>
#include <linux/irq.h>
#include <linux/seq_file.h>
#include <linux/root_dev.h>
#include <linux/cpuidle.h>
#include <asm/mmu.h>
#include <asm/processor.h>
#include <asm/io.h>
#include <asm/pgtable.h>
#include <asm/prom.h>
#include <asm/rtas.h>
#include <asm/pci-bridge.h>
#include <asm/iommu.h>
#include <asm/dma.h>
#include <asm/machdep.h>
#include <asm/irq.h>
#include <asm/time.h>
#include <asm/nvram.h>
#include <asm/pmc.h>
#include <asm/mpic.h>
#include <asm/xics.h>
#include <asm/ppc-pci.h>
#include <asm/i8259.h>
#include <asm/udbg.h>
#include <asm/smp.h>
#include <asm/firmware.h>
#include <asm/eeh.h>
#include <asm/pSeries_reconfig.h>
#include "plpar_wrappers.h"
#include "pseries.h"
/* NOTE(review): CMO_* values default to -1 / one IOMMU page here; where they are updated is not visible in this part of the file. */
int CMO_PrPSP = -1;
int CMO_SecPSP = -1;
unsigned long CMO_PageSize = (ASM_CONST(1) << IOMMU_PAGE_SHIFT);
EXPORT_SYMBOL(CMO_PageSize);
/* Set to 1 by fwnmi_init() when the ibm,nmi-register RTAS call succeeds. */
int fwnmi_active; /* TRUE if an FWNMI handler is present */
/* open-pic interrupt controller node recorded by pseries_discover_pic(). */
static struct device_node *pSeries_mpic_node;
/*
 * Emit the "machine" line for /proc/cpuinfo-style output: "CHRP "
 * followed by the "model" property of the device-tree root (an empty
 * string if the root node cannot be found).
 */
static void pSeries_show_cpuinfo(struct seq_file *m)
{
	const char *model = "";
	struct device_node *root;

	root = of_find_node_by_path("/");
	if (root != NULL)
		model = of_get_property(root, "model", NULL);

	seq_printf(m, "machine\t\t: CHRP %s\n", model);

	of_node_put(root);
}
/*
 * Initialize firmware assisted non-maskable interrupts if the firmware
 * supports this feature: register the system-reset and machine-check
 * entry points through the ibm,nmi-register RTAS call and set
 * fwnmi_active when that call returns 0.
 */
static void __init fwnmi_init(void)
{
	unsigned long system_reset_addr, machine_check_addr;
	int token;
	int rc;

	token = rtas_token("ibm,nmi-register");
	if (token == RTAS_UNKNOWN_SERVICE)
		return;

	/* If the kernel's not linked at zero we point the firmware at low
	 * addresses anyway, and use a trampoline to get to the real code. */
	system_reset_addr  = __pa(system_reset_fwnmi) - PHYSICAL_START;
	machine_check_addr = __pa(machine_check_fwnmi) - PHYSICAL_START;

	rc = rtas_call(token, 2, 1, NULL, system_reset_addr,
		       machine_check_addr);
	if (rc == 0)
		fwnmi_active = 1;
}
/*
 * Chained handler for the i8259 cascade interrupt: fetch the pending
 * interrupt from the i8259, dispatch it if there is one, then signal
 * end-of-interrupt on the parent chip.
 */
static void pseries_8259_cascade(unsigned int irq, struct irq_desc *desc)
{
	struct irq_chip *chip = irq_desc_get_chip(desc);
	unsigned int pending = i8259_irq();

	if (pending != NO_IRQ)
		generic_handle_irq(pending);

	chip->irq_eoi(&desc->irq_data);
}
/*
 * Find a "chrp,iic" compatible interrupt controller and, if present,
 * initialise the i8259 behind it as a cascade.
 *
 * The cascade interrupt is mapped from the controller node.  The node's
 * ancestors are then walked; each "pci" ancestor carrying an
 * "8259-interrupt-acknowledge" property supplies the interrupt
 * acknowledge address passed to i8259_init().
 */
static void __init pseries_setup_i8259_cascade(void)
{
	struct device_node *np, *old, *found = NULL;
	unsigned int cascade;
	const u32 *addrp;
	unsigned long intack = 0;
	int naddr;

	for_each_node_by_type(np, "interrupt-controller") {
		if (of_device_is_compatible(np, "chrp,iic")) {
			found = np;
			break;
		}
	}

	if (found == NULL) {
		printk(KERN_DEBUG "pic: no ISA interrupt controller\n");
		return;
	}

	cascade = irq_of_parse_and_map(found, 0);
	if (cascade == NO_IRQ) {
		printk(KERN_ERR "pic: failed to map cascade interrupt\n");
		/* drop the reference that the success path releases below */
		of_node_put(found);
		return;
	}
	pr_debug("pic: cascade mapped to irq %d\n", cascade);

	for (old = of_node_get(found); old != NULL ; old = np) {
		np = of_get_parent(old);
		of_node_put(old);
		if (np == NULL)
			break;
		if (strcmp(np->name, "pci") != 0)
			continue;
		addrp = of_get_property(np, "8259-interrupt-acknowledge", NULL);
		if (addrp == NULL)
			continue;
		naddr = of_n_addr_cells(np);
		/* last cell is the low word, the one before it the high word */
		intack = addrp[naddr-1];
		if (naddr > 1)
			intack |= ((unsigned long)addrp[naddr-2]) << 32;
	}
	if (intack)
		printk(KERN_DEBUG "pic: PCI 8259 intack at 0x%016lx\n", intack);
	i8259_init(found, intack);
	of_node_put(found);
	irq_set_chained_handler(cascade, pseries_8259_cascade);
}
/*
 * Interrupt setup for machines with an MPIC (open-pic) controller.
 *
 * The controller address is the first entry of the root node's
 * "platform-open-pic" property; the remaining entries are registered as
 * ISU addresses.  BUGs if the address is missing or the MPIC cannot be
 * allocated.  Finishes by looking for an i8259 cascade.
 */
static void __init pseries_mpic_init_IRQ(void)
{
	struct device_node *root;
	const unsigned int *opprop;
	unsigned long openpic_addr = 0;
	int naddr, isu, cell, opplen;
	struct mpic *mpic;

	root = of_find_node_by_path("/");
	naddr = of_n_addr_cells(root);
	opprop = of_get_property(root, "platform-open-pic", &opplen);
	if (opprop) {
		openpic_addr = of_read_number(opprop, naddr);
		printk(KERN_DEBUG "OpenPIC addr: %lx\n", openpic_addr);
	}
	of_node_put(root);

	BUG_ON(openpic_addr == 0);

	/* Setup the openpic driver */
	mpic = mpic_alloc(pSeries_mpic_node, openpic_addr,
			  MPIC_NO_RESET, 16, 0, " MPIC     ");
	BUG_ON(mpic == NULL);

	/* Add ISUs: every naddr-cell entry after the first one */
	opplen /= sizeof(u32);
	isu = 0;
	for (cell = naddr; cell < opplen; cell += naddr) {
		unsigned long isuaddr = of_read_number(opprop + cell, naddr);

		mpic_assign_isu(mpic, isu, isuaddr);
		isu++;
	}

	/* Setup top-level get_irq */
	ppc_md.get_irq = mpic_get_irq;

	/* All ISUs are setup, complete initialization */
	mpic_init(mpic);

	/* Look for cascade */
	pseries_setup_i8259_cascade();
}
/*
 * Interrupt setup for machines with an XICS controller: initialise XICS,
 * then look for an i8259 cascade.
 */
static void __init pseries_xics_init_IRQ(void)
{
	xics_init();

	/* Look for cascade */
	pseries_setup_i8259_cascade();
}
/*
 * Issue the H_PERFMON hypervisor call with the top bit (bit 63) in the
 * "set" mask and an empty "reset" mask.  The call's result is ignored.
 */
static void pseries_lpar_enable_pmcs(void)
{
	const unsigned long set = 1UL << 63;
	const unsigned long reset = 0;

	plpar_hcall_norets(H_PERFMON, set, reset);
}
/*
 * Pick the interrupt controller backend.
 *
 * Walks all nodes named "interrupt-controller" and inspects their
 * "compatible" property: "open-pic" selects the MPIC setup path (and
 * records the node in pSeries_mpic_node), "ppc-xicp" selects the XICS
 * path.  The first match wins.  Nodes without a "compatible" property
 * are skipped.  If nothing matches an error is logged.
 */
static void __init pseries_discover_pic(void)
{
	struct device_node *np;
	const char *typep;

	for (np = NULL; (np = of_find_node_by_name(np,
						   "interrupt-controller"));) {
		typep = of_get_property(np, "compatible", NULL);
		/* strstr() must not be handed a NULL haystack */
		if (!typep)
			continue;
		if (strstr(typep, "open-pic")) {
			pSeries_mpic_node = of_node_get(np);
			ppc_md.init_IRQ       = pseries_mpic_init_IRQ;
			setup_kexec_cpu_down_mpic();
			smp_init_pseries_mpic();
			/*
			 * Leaving the walk early: release the iterator's
			 * reference (pSeries_mpic_node holds its own).
			 */
			of_node_put(np);
			return;
		} else if (strstr(typep, "ppc-xicp")) {
			ppc_md.init_IRQ       = pseries_xics_init_IRQ;
			setup_kexec_cpu_down_xics();
			smp_init_pseries_xics();
			/* leaving the walk early: release the iterator's reference */
			of_node_put(np);
			return;
		}
	}
	printk(KERN_ERR "pSeries_discover_pic: failed to recognize"
	       " interrupt-controller\n");
}
/*
 * Reconfiguration notifier for device-tree nodes.
 *
 * On PSERIES_RECONFIG_ADD, if the new node's parent carries PCI data, the
 * node inherits the parent's PHB via update_dn_pci_info() and an EEH
 * device is created for it.
 *
 * @nb:     notifier block (unused)
 * @action: reconfiguration action code
 * @node:   the struct device_node the action refers to
 *
 * Returns NOTIFY_OK for PSERIES_RECONFIG_ADD and NOTIFY_DONE for every
 * other action.
 */
static int pci_dn_reconfig_notifier(struct notifier_block *nb, unsigned long action, void *node)
{
	struct device_node *np = node;
	struct pci_dn *pci;

	if (action != PSERIES_RECONFIG_ADD)
		return NOTIFY_DONE;

	/* A node without a parent has no PCI data to inherit */
	pci = np->parent ? np->parent->data : NULL;
	if (pci) {
		update_dn_pci_info(np, pci->phb);

		/* Create EEH device for the OF node */
		eeh_dev_init(np, pci->phb);
	}

	return NOTIFY_OK;
}
/* Notifier block that routes reconfiguration events to pci_dn_reconfig_notifier() */
static struct notifier_block pci_dn_reconfig_nb = {
.notifier_call = pci_dn_reconfig_notifier,
};
/*
 * Slab cache ("dtl") from which dispatch trace log buffers are allocated;
 * created by alloc_dispatch_log_kmem_cache(), NULL if creation failed.
 */
struct kmem_cache *dtl_cache;
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
/*
 * Allocate space for the dispatch trace log for all possible cpus
 * and register the buffers with the hypervisor.  This is used for
 * computing time stolen by the hypervisor.
 *
 * Does nothing without FW_FEATURE_SPLPAR or when dtl_cache was not
 * created. Allocation stops at the first failure; cpus from that point
 * on get no log. If the current (boot) cpu ended up without a log,
 * registration is skipped instead of dereferencing a NULL buffer.
 *
 * Always returns 0.
 */
static int alloc_dispatch_logs(void)
{
	int cpu, ret;
	struct paca_struct *pp;
	struct dtl_entry *dtl;

	if (!firmware_has_feature(FW_FEATURE_SPLPAR))
		return 0;

	if (!dtl_cache)
		return 0;

	for_each_possible_cpu(cpu) {
		pp = &paca[cpu];
		dtl = kmem_cache_alloc(dtl_cache, GFP_KERNEL);
		if (!dtl) {
			pr_warn("Failed to allocate dispatch trace log for cpu %d\n",
				cpu);
			pr_warn("Stolen time statistics will be unreliable\n");
			break;
		}

		pp->dtl_ridx = 0;
		pp->dispatch_log = dtl;
		pp->dispatch_log_end = dtl + N_DISPATCH_LOG;
		pp->dtl_curr = dtl;
	}

	/* Register the DTL for the current (boot) cpu */
	dtl = get_paca()->dispatch_log;
	if (!dtl) {
		/*
		 * The allocation loop gave up before this cpu got a buffer
		 * (already reported above); there is nothing to register.
		 */
		return 0;
	}
	get_paca()->dtl_ridx = 0;
	get_paca()->dtl_curr = dtl;
	get_paca()->lppaca_ptr->dtl_idx = 0;

	/* hypervisor reads buffer length from this field */
	dtl->enqueue_to_dispatch_time = DISPATCH_LOG_BYTES;
	ret = register_dtl(hard_smp_processor_id(), __pa(dtl));
	if (ret)
		pr_err("WARNING: DTL registration of cpu %d (hw %d) failed "
		       "with %d\n", smp_processor_id(),
		       hard_smp_processor_id(), ret);
	get_paca()->lppaca_ptr->dtl_enable_mask = 2;

	return 0;
}
#else /* !CONFIG_VIRT_CPU_ACCOUNTING */
/* Without CONFIG_VIRT_CPU_ACCOUNTING there is nothing to allocate. */
static inline int alloc_dispatch_logs(void)
{
	return 0;
}
#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
/*
 * Early initcall: create the "dtl" slab cache (object size and alignment
 * both DISPATCH_LOG_BYTES) and, if that worked, allocate the dispatch
 * trace logs. A failed cache creation is only warned about; the initcall
 * still returns 0.
 */
static int alloc_dispatch_log_kmem_cache(void)
{
	dtl_cache = kmem_cache_create("dtl", DISPATCH_LOG_BYTES,
				      DISPATCH_LOG_BYTES, 0, NULL);
	if (dtl_cache)
		return alloc_dispatch_logs();

	pr_warn("Failed to create dispatch trace log buffer cache\n");
	pr_warn("Stolen time statistics will be unreliable\n");
	return 0;
}
early_initcall(alloc_dispatch_log_kmem_cache);
/*
 * Idle hook: hand over to the cpuidle framework (and through it the
 * pseries back-end driver). Only if that reports an error is the default
 * handling run, dropping the thread priority to low and then very low.
 */
static void pSeries_idle(void)
{
	if (!cpuidle_idle_call())
		return;

	/*
	 * cpuidle failed: fall back to low thread priority and possibly
	 * low power mode.
	 */
	HMT_low();
	HMT_very_low();
}
/*
 * Platform setup_arch hook: discover the interrupt controller, set
 * boot-time defaults, bring up PCI host bridges and EEH, nvram, and
 * install the idle and PMC hooks that depend on firmware features.
 */
static void __init pSeries_setup_arch(void)
{
	panic_timeout = 10;

	/* Discover PIC type and setup ppc_md accordingly */
	pseries_discover_pic();

	/* init to some ~sane value until calibrate_delay() runs */
	loops_per_jiffy = 50000000;

	fwnmi_init();

	/* By default, only probe PCI (can be overridden by rtas_pci) */
	pci_add_flags(PCI_PROBE_ONLY);

	/* Find and initialize PCI host bridges */
	init_pci_config_tokens();
	eeh_pseries_init();
	find_and_init_phbs();
	pSeries_reconfig_notifier_register(&pci_dn_reconfig_nb);
	eeh_init();

	pSeries_nvram_init();

	/* Shared-processor partitions: register the VPA and use our idle */
	if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
		vpa_init(boot_cpuid);
		ppc_md.power_save = pSeries_idle;
	}

	/* PMCs are enabled via hcall on LPAR, natively otherwise */
	ppc_md.enable_pmcs = firmware_has_feature(FW_FEATURE_LPAR) ?
				pseries_lpar_enable_pmcs : power4_enable_pmcs;
}
/*
 * Arch initcall (pseries only): write "Linux ppc64" and then the kernel
 * version string through ppc_md.progress. Always returns 0.
 */
static int __init pSeries_init_panel(void)
{
	/* Manually leave the kernel version on the panel. */
	ppc_md.progress("Linux ppc64\n", 0);
	ppc_md.progress(init_utsname()->version, 0);

	return 0;
}
machine_arch_initcall(pseries, pSeries_init_panel);
/*
 * Set the DABR through the H_SET_DABR hcall.
 * Returns the hcall's status.
 */
static int pseries_set_dabr(unsigned long dabr)
{
	return plpar_hcall_norets(H_SET_DABR, dabr);
}
/*
 * Set the DABR through the H_SET_XDABR hcall, asking for matches on both
 * kernel and userspace accesses. Returns the hcall's status.
 */
static int pseries_set_xdabr(unsigned long dabr)
{
	/* We want to catch accesses from kernel and userspace */
	return plpar_hcall_norets(H_SET_XDABR, dabr,
				  H_DABRX_KERNEL | H_DABRX_USER);
}
#define CMO_CHARACTERISTICS_TOKEN 44
#define CMO_MAXLENGTH 1026
/*
 * Set FW_FEATURE_XCMO when CMO is enabled and h_get_mpp_x() succeeds
 * (returns 0); clear it in every other case.
 */
void pSeries_coalesce_init(void)
{
	struct hvcall_mpp_x_data mpp_x_data;

	if (!firmware_has_feature(FW_FEATURE_CMO) || h_get_mpp_x(&mpp_x_data)) {
		powerpc_firmware_features &= ~FW_FEATURE_XCMO;
		return;
	}

	powerpc_firmware_features |= FW_FEATURE_XCMO;
}
/**
 * pSeries_cmo_feature_init - FW_FEATURE_CMO is not stored in
 * ibm,hypertas-functions, handle that here.
 * (Stolen from parse_system_parameter_string)
 *
 * Fetches system parameter CMO_CHARACTERISTICS_TOKEN through the
 * "ibm,get-system-parameter" RTAS call into rtas_data_buf (holding
 * rtas_data_buf_lock) and scans the returned text for comma-separated
 * key=value pairs. The values of "CMOPageSize", "PrPSP" and "SecPSP" are
 * recorded. If either PSP value is not -1 afterwards, FW_FEATURE_CMO is
 * set and pSeries_coalesce_init() is called.
 *
 * The debug messages still refer to the function as fw_cmo_feature_init().
 */
void pSeries_cmo_feature_init(void)
{
char *ptr, *key, *value, *end;
int call_status;
/* Page order used when the string carries no "CMOPageSize" key */
int page_order = IOMMU_PAGE_SHIFT;
pr_debug(" -> fw_cmo_feature_init()\n");
spin_lock(&rtas_data_buf_lock);
memset(rtas_data_buf, 0, RTAS_DATA_BUF_SIZE);
call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
NULL,
CMO_CHARACTERISTICS_TOKEN,
__pa(rtas_data_buf),
RTAS_DATA_BUF_SIZE);
if (call_status != 0) {
/* Any non-zero status is treated as "no CMO" */
spin_unlock(&rtas_data_buf_lock);
pr_debug("CMO not available\n");
pr_debug(" <- fw_cmo_feature_init()\n");
return;
}
end = rtas_data_buf + CMO_MAXLENGTH - 2;
ptr = rtas_data_buf + 2; /* step over strlen value */
/* key == value means "no '=' seen yet for the current pair" */
key = value = ptr;
/*
 * NOTE(review): the loop stops at the first NUL byte, so the
 * ptr[0] == '\0' test below cannot be true inside the loop; a final
 * pair is only processed if it is followed by ','. Confirm that the
 * firmware string always ends with a comma.
 */
while (*ptr && (ptr <= end)) {
/* Separate the key and value by replacing '=' with '\0' and
 * point the value at the string after the '='
 */
if (ptr[0] == '=') {
ptr[0] = '\0';
value = ptr + 1;
} else if (ptr[0] == '\0' || ptr[0] == ',') {
/* Terminate the string containing the key/value pair */
ptr[0] = '\0';
if (key == value) {
pr_debug("Malformed key/value pair\n");
/* Never found a '=', end processing */
break;
}
if (0 == strcmp(key, "CMOPageSize"))
page_order = simple_strtol(value, NULL, 10);
else if (0 == strcmp(key, "PrPSP"))
CMO_PrPSP = simple_strtol(value, NULL, 10);
else if (0 == strcmp(key, "SecPSP"))
CMO_SecPSP = simple_strtol(value, NULL, 10);
/* The next pair starts right after the separator */
value = key = ptr + 1;
}
ptr++;
}
/* Page size is returned as the power of 2 of the page size,
 * convert to the page size in bytes before returning
 */
CMO_PageSize = 1 << page_order;
pr_debug("CMO_PageSize = %lu\n", CMO_PageSize);
/* CMO counts as enabled as soon as either PSP value differs from -1 */
if (CMO_PrPSP != -1 || CMO_SecPSP != -1) {
pr_info("CMO enabled\n");
pr_debug("CMO enabled, PrPSP=%d, SecPSP=%d\n", CMO_PrPSP,
CMO_SecPSP);
powerpc_firmware_features |= FW_FEATURE_CMO;
pSeries_coalesce_init();
} else
pr_debug("CMO not enabled, PrPSP=%d, SecPSP=%d\n", CMO_PrPSP,
CMO_SecPSP);
spin_unlock(&rtas_data_buf_lock);
pr_debug(" <- fw_cmo_feature_init()\n");
}
/*
 * Early initialization. Relocation is on but do not reference unbolted pages.
 *
 * Sets up the early hypervisor console on LPAR (CONFIG_HVC_CONSOLE only),
 * selects the set_dabr implementation the firmware supports, then runs
 * CMO feature detection and early IOMMU initialisation.
 */
static void __init pSeries_init_early(void)
{
	pr_debug(" -> pSeries_init_early()\n");

#ifdef CONFIG_HVC_CONSOLE
	if (firmware_has_feature(FW_FEATURE_LPAR))
		hvc_vio_init_early();
#endif

	/* DABR takes precedence over XDABR when both are advertised */
	if (firmware_has_feature(FW_FEATURE_DABR))
		ppc_md.set_dabr = pseries_set_dabr;
	else if (firmware_has_feature(FW_FEATURE_XDABR))
		ppc_md.set_dabr = pseries_set_xdabr;

	pSeries_cmo_feature_init();
	iommu_init_early_pSeries();

	pr_debug(" <- pSeries_init_early()\n");
}
/*
 * Called very early, MMU is off, device-tree isn't unflattened.
 *
 * Flat device-tree scan callback. Only the depth-1 node named "rtas" or
 * "rtas@0" is of interest: if it has an "ibm,hypertas-functions"
 * property, FW_FEATURE_LPAR is set and the property is handed to
 * fw_feature_init().
 *
 * Returns 0 for any other node and 1 once the rtas node has been seen.
 */
static int __init pSeries_probe_hypertas(unsigned long node,
					 const char *uname, int depth,
					 void *data)
{
	const char *hypertas;
	unsigned long len;

	if (depth != 1)
		return 0;
	if (strcmp(uname, "rtas") != 0 && strcmp(uname, "rtas@0") != 0)
		return 0;

	hypertas = of_get_flat_dt_prop(node, "ibm,hypertas-functions", &len);
	if (hypertas) {
		powerpc_firmware_features |= FW_FEATURE_LPAR;
		fw_feature_init(hypertas, len);
	}

	return 1;
}
/*
 * Machine probe: claim the platform when the root node's device_type is
 * "chrp" and it is not a Cell blade, then detect LPAR mode and pick the
 * matching hash page table backend.
 *
 * Returns 1 if this is a pSeries machine, 0 otherwise.
 */
static int __init pSeries_probe(void)
{
	unsigned long root = of_get_flat_dt_root();
	char *dtype = of_get_flat_dt_prop(root, "device_type", NULL);

	if (dtype == NULL || strcmp(dtype, "chrp") != 0)
		return 0;

	/* Cell blades firmware claims to be chrp while it's not. Until this
	 * is fixed, we need to avoid those here.
	 */
	if (of_flat_dt_is_compatible(root, "IBM,CPBW-1.0"))
		return 0;
	if (of_flat_dt_is_compatible(root, "IBM,CBEA"))
		return 0;

	pr_debug("pSeries detected, looking for LPAR capability...\n");

	/* Now try to figure out if we are running on LPAR */
	of_scan_flat_dt(pSeries_probe_hypertas, NULL);

	if (firmware_has_feature(FW_FEATURE_LPAR))
		hpte_init_lpar();
	else
		hpte_init_native();

	pr_debug("Machine is%s LPAR !\n",
		 (powerpc_firmware_features & FW_FEATURE_LPAR) ? "" : " not");

	return 1;
}
/*
 * PCI probe mode for @bus: PCI_PROBE_DEVTREE on LPAR, PCI_PROBE_NORMAL
 * otherwise. @bus itself is not consulted.
 */
static int pSeries_pci_probe_mode(struct pci_bus *bus)
{
	return firmware_has_feature(FW_FEATURE_LPAR) ?
		PCI_PROBE_DEVTREE : PCI_PROBE_NORMAL;
}
/**
 * pSeries_power_off - tell firmware about how to power off the system.
 *
 * Runs the flash termination hook if one is registered, then calls either
 * the "power-off" RTAS token or, when auto power-on was requested and the
 * token exists, "ibm,power-off-ups". With "power-off", power on will only
 * be possible with a power button press; "ibm,power-off-ups" allows auto
 * power-on after power is restored.
 *
 * Never returns: spins forever after the RTAS call.
 */
static void pSeries_power_off(void)
{
	int rc;
	int ups_token = rtas_token("ibm,power-off-ups");

	if (rtas_flash_term_hook)
		rtas_flash_term_hook(SYS_POWER_OFF);

	if (rtas_poweron_auto != 0 && ups_token != RTAS_UNKNOWN_SERVICE) {
		rc = rtas_call(ups_token, 0, 1, NULL);
		printk(KERN_INFO "RTAS ibm,power-off-ups returned %d\n", rc);
	} else {
		rc = rtas_call(rtas_token("power-off"), 2, 1, NULL, -1, -1);
		printk(KERN_INFO "RTAS power-off returned %d\n", rc);
	}

	for (;;);
}
#ifndef CONFIG_PCI
/* Empty stand-in so the .pcibios_fixup entry of the machine description resolves without CONFIG_PCI */
void pSeries_final_fixup(void) { }
#endif
/*
 * Machine description for pSeries: wires the probe/setup/early-init
 * hooks defined in this file together with the RTAS-based restart, halt,
 * panic, time and progress services and the pSeries exception handlers.
 */
define_machine(pseries) {
.name = "pSeries",
.probe = pSeries_probe,
.setup_arch = pSeries_setup_arch,
.init_early = pSeries_init_early,
.show_cpuinfo = pSeries_show_cpuinfo,
.log_error = pSeries_log_error,
.pcibios_fixup = pSeries_final_fixup,
.pci_probe_mode = pSeries_pci_probe_mode,
.restart = rtas_restart,
.power_off = pSeries_power_off,
.halt = rtas_halt,
.panic = rtas_os_term,
.get_boot_time = rtas_get_boot_time,
.get_rtc_time = rtas_get_rtc_time,
.set_rtc_time = rtas_set_rtc_time,
.calibrate_decr = generic_calibrate_decr,
.progress = rtas_progress,
.system_reset_exception = pSeries_system_reset_exception,
.machine_check_exception = pSeries_machine_check_exception,
};
+258
View File
@@ -0,0 +1,258 @@
/*
* SMP support for pSeries machines.
*
* Dave Engebretsen, Peter Bergner, and
* Mike Corrigan {engebret|bergner|mikec}@us.ibm.com
*
* Plus various changes from other IBM teams...
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/cache.h>
#include <linux/err.h>
#include <linux/device.h>
#include <linux/cpu.h>
#include <asm/ptrace.h>
#include <linux/atomic.h>
#include <asm/irq.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/io.h>
#include <asm/prom.h>
#include <asm/smp.h>
#include <asm/paca.h>
#include <asm/machdep.h>
#include <asm/cputable.h>
#include <asm/firmware.h>
#include <asm/rtas.h>
#include <asm/pSeries_reconfig.h>
#include <asm/mpic.h>
#include <asm/vdso_datapage.h>
#include <asm/cputhreads.h>
#include <asm/xics.h>
#include "plpar_wrappers.h"
#include "pseries.h"
#include "offline_states.h"
/*
 * The Primary thread of each non-boot processor was started from the OF client
 * interface by prom_hold_cpus and is spinning on secondary_hold_spinloop.
 *
 * of_spin_mask records those cpus: it is filled in by smp_init_pseries(),
 * consulted by smp_startup_cpu() and a cpu's bit is cleared again in
 * smp_xics_setup_cpu().
 */
static cpumask_var_t of_spin_mask;
/*
 * Query where a cpu is now. Return codes #defined in plpar_wrappers.h
 *
 * @pcpu: hardware cpu id handed to the "query-cpu-stopped-state" RTAS call
 *
 * Returns QCSS_HARDWARE_ERROR when firmware lacks the RTAS service, the
 * RTAS status when the call fails, and the reported cpu state otherwise.
 */
int smp_query_cpu_stopped(unsigned int pcpu)
{
	int stopped_state, rc;
	int token = rtas_token("query-cpu-stopped-state");

	if (token == RTAS_UNKNOWN_SERVICE) {
		printk_once(KERN_INFO
			"Firmware doesn't support query-cpu-stopped-state\n");
		return QCSS_HARDWARE_ERROR;
	}

	rc = rtas_call(token, 1, 2, &stopped_state, pcpu);
	if (rc == 0)
		return stopped_state;

	printk(KERN_ERR
	       "RTAS query-cpu-stopped-state failed: %i\n", rc);
	return rc;
}
/**
 * smp_startup_cpu() - start the given cpu
 * @lcpu: logical cpu number
 *
 * At boot time, there is nothing to do for primary threads which were
 * started from Open Firmware. For anything else, call RTAS with the
 * appropriate start location.
 *
 * Returns:
 * 0 - failure
 * 1 - success
 */
static inline int __devinit smp_startup_cpu(unsigned int lcpu)
{
int status;
/*
 * Entry address handed to RTAS: the first unsigned long stored at
 * generic_secondary_smp_init, truncated to 32 bits and passed through
 * __pa(). NOTE(review): presumably this reads the entry point out of
 * the function descriptor — confirm.
 */
unsigned long start_here = __pa((u32)*((unsigned long *)
generic_secondary_smp_init));
unsigned int pcpu;
int start_cpu;
if (cpumask_test_cpu(lcpu, of_spin_mask))
/* Already started by OF and sitting in spin loop */
return 1;
pcpu = get_hard_smp_processor_id(lcpu);
/* Check to see if the CPU out of FW already for kexec */
if (smp_query_cpu_stopped(pcpu) == QCSS_NOT_STOPPED){
/* Treat it like an OF-started cpu from now on */
cpumask_set_cpu(lcpu, of_spin_mask);
return 1;
}
/* Fixup atomic count: it exited inside IRQ handler. */
task_thread_info(paca[lcpu].__current)->preempt_count = 0;
#ifdef CONFIG_HOTPLUG_CPU
/* A cpu in the INACTIVE state is not started through RTAS */
if (get_cpu_current_state(lcpu) == CPU_STATE_INACTIVE)
goto out;
#endif
/*
 * If the RTAS start-cpu token does not exist then presume the
 * cpu is already spinning.
 */
start_cpu = rtas_token("start-cpu");
if (start_cpu == RTAS_UNKNOWN_SERVICE)
return 1;
/* Arguments: hardware cpu id, entry address, and the cpu id again */
status = rtas_call(start_cpu, 3, 1, NULL, pcpu, start_here, pcpu);
if (status != 0) {
printk(KERN_ERR "start-cpu failed: %i\n", status);
return 0;
}
#ifdef CONFIG_HOTPLUG_CPU
out:
#endif
return 1;
}
/*
 * Per-cpu setup hook for XICS machines.
 *
 * Sets up XICS for every cpu except the boot cpu, registers the VPA on
 * shared-processor partitions, removes the cpu from of_spin_mask, updates
 * the hotplug state bookkeeping (CONFIG_HOTPLUG_CPU only) and notifies
 * cpuidle about the new cpu.
 */
static void __devinit smp_xics_setup_cpu(int cpu)
{
	if (cpu != boot_cpuid)
		xics_setup_cpu();

	if (firmware_has_feature(FW_FEATURE_SPLPAR))
		vpa_init(cpu);

	cpumask_clear_cpu(cpu, of_spin_mask);

#ifdef CONFIG_HOTPLUG_CPU
	set_cpu_current_state(cpu, CPU_STATE_ONLINE);
	set_default_offline_state(cpu);
#endif

	pseries_notify_cpuidle_add_cpu(cpu);
}
/*
 * Kick cpu @nr into the kernel.
 *
 * Starts the cpu if necessary, releases it from its spin loop by setting
 * cpu_start in its paca and, with CONFIG_HOTPLUG_CPU, prods a cpu that is
 * currently in the INACTIVE state.
 *
 * Returns 0 on success, -ENOENT if the cpu could not be started.
 */
static int __devinit smp_pSeries_kick_cpu(int nr)
{
	BUG_ON(nr < 0 || nr >= NR_CPUS);

	if (!smp_startup_cpu(nr))
		return -ENOENT;

	/*
	 * The processor is currently spinning, waiting for the
	 * cpu_start field to become non-zero. After we set cpu_start,
	 * the processor will continue on to secondary_start
	 */
	paca[nr].cpu_start = 1;

#ifdef CONFIG_HOTPLUG_CPU
	set_preferred_offline_state(nr, CPU_STATE_ONLINE);

	if (get_cpu_current_state(nr) == CPU_STATE_INACTIVE) {
		unsigned long hw_id = get_hard_smp_processor_id(nr);
		long prod_rc = plpar_hcall_norets(H_PROD, hw_id);

		if (prod_rc != H_SUCCESS)
			printk(KERN_ERR "Error: Prod to wake up processor %d "
			       "Ret= %ld\n", nr, prod_rc);
	}
#endif

	return 0;
}
/*
 * Decide whether cpu @nr may be brought up.
 *
 * Special case - we inhibit secondary thread startup during boot if the
 * user requests it: while the system is not yet running and the cpu has
 * SMT, only thread 0 of a core is bootable when smt_enabled_at_boot is 0,
 * otherwise only threads numbered below smt_enabled_at_boot.
 *
 * Returns 1 if the cpu may boot, 0 if not.
 */
static int smp_pSeries_cpu_bootable(unsigned int nr)
{
	/* The restriction only applies during boot on SMT-capable cpus */
	if (system_state >= SYSTEM_RUNNING || !cpu_has_feature(CPU_FTR_SMT))
		return 1;

	if (!smt_enabled_at_boot)
		return cpu_thread_in_core(nr) == 0;

	return cpu_thread_in_core(nr) < smt_enabled_at_boot;
}
/* SMP operations installed by smp_init_pseries_mpic() for MPIC-based machines */
static struct smp_ops_t pSeries_mpic_smp_ops = {
.message_pass = smp_mpic_message_pass,
.probe = smp_mpic_probe,
.kick_cpu = smp_pSeries_kick_cpu,
.setup_cpu = smp_mpic_setup_cpu,
};
/* SMP operations installed by smp_init_pseries_xics() for XICS-based machines */
static struct smp_ops_t pSeries_xics_smp_ops = {
.message_pass = NULL, /* Use smp_muxed_ipi_message_pass */
.cause_ipi = NULL, /* Filled at runtime by xics_smp_probe() */
.probe = xics_smp_probe,
.kick_cpu = smp_pSeries_kick_cpu,
.setup_cpu = smp_xics_setup_cpu,
.cpu_bootable = smp_pSeries_cpu_bootable,
};
/*
 * This is called very early.
 *
 * Allocates of_spin_mask and marks the cpus still spinning in hold
 * loops: with SMT only thread 0 of each present core, otherwise every
 * present cpu; the boot cpu is always excluded. Also installs the RTAS
 * timebase give/take hooks when firmware offers "freeze-time-base".
 */
static void __init smp_init_pseries(void)
{
	int cpu;

	pr_debug(" -> smp_init_pSeries()\n");

	alloc_bootmem_cpumask_var(&of_spin_mask);

	/* Mark threads which are still spinning in hold loops. */
	if (!cpu_has_feature(CPU_FTR_SMT)) {
		cpumask_copy(of_spin_mask, cpu_present_mask);
	} else {
		for_each_present_cpu(cpu) {
			if (cpu_thread_in_core(cpu) == 0)
				cpumask_set_cpu(cpu, of_spin_mask);
		}
	}

	cpumask_clear_cpu(boot_cpuid, of_spin_mask);

	/* Non-lpar has additional take/give timebase */
	if (rtas_token("freeze-time-base") != RTAS_UNKNOWN_SERVICE) {
		smp_ops->give_timebase = rtas_give_timebase;
		smp_ops->take_timebase = rtas_take_timebase;
	}

	pr_debug(" <- smp_init_pSeries()\n");
}
/* Select the MPIC SMP operations, then run the common pSeries SMP init. */
void __init smp_init_pseries_mpic(void)
{
	smp_ops = &pSeries_mpic_smp_ops;

	smp_init_pseries();
}
/* Select the XICS SMP operations, then run the common pSeries SMP init. */
void __init smp_init_pseries_xics(void)
{
	smp_ops = &pSeries_xics_smp_ops;

	smp_init_pseries();
}
@@ -0,0 +1,220 @@
/*
* Copyright (C) 2010 Brian King IBM Corporation
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/delay.h>
#include <linux/suspend.h>
#include <linux/stat.h>
#include <asm/firmware.h>
#include <asm/hvcall.h>
#include <asm/machdep.h>
#include <asm/mmu.h>
#include <asm/rtas.h>
#include <asm/topology.h>
/* Stream ID written to the "hibernate" attribute; reset to 0 once the request has been handled */
static u64 stream_id;
/* Device passed to pseries_suspend_sysfs_register() */
static struct device suspend_dev;
/* Completion handed to the RTAS suspend code through suspend_data.complete */
static DECLARE_COMPLETION(suspend_work);
/* State shared with rtas_suspend_cpu() / rtas_suspend_last_cpu() */
static struct rtas_suspend_me_data suspend_data;
/* Non-zero between pseries_prepare_late() and the end of pseries_suspend_enter() */
static atomic_t suspending;
/**
 * pseries_suspend_begin - First phase of hibernation
 * @state: requested suspend state (not examined here)
 *
 * Check to ensure we are in a valid state to hibernate by querying the
 * VASI state of the current stream ID.
 *
 * Return value:
 * 	0 when the state is H_VASI_SUSPENDING, -EAGAIN while it is still
 * 	H_VASI_ENABLED, -EIO for any other state, or the hcall status if
 * 	H_VASI_STATE itself failed
 **/
static int pseries_suspend_begin(suspend_state_t state)
{
	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
	long hrc, vasi;

	/* Make sure the state is valid */
	hrc = plpar_hcall(H_VASI_STATE, retbuf, stream_id);
	if (hrc) {
		pr_err("pseries_suspend_begin: vasi_state returned %ld\n",hrc);
		return hrc;
	}

	vasi = retbuf[0];
	if (vasi == H_VASI_ENABLED)
		return -EAGAIN;

	if (vasi != H_VASI_SUSPENDING) {
		pr_err("pseries_suspend_begin: vasi_state returned state %ld\n",
		       vasi);
		return -EIO;
	}

	return 0;
}
/**
 * pseries_suspend_cpu - Suspend a single CPU
 *
 * Makes the H_JOIN call to suspend the CPU, but only while a suspend is
 * in progress; otherwise does nothing.
 *
 * Return value:
 * 	0 when no suspend is in progress, else the result of
 * 	rtas_suspend_cpu()
 **/
static int pseries_suspend_cpu(void)
{
	if (!atomic_read(&suspending))
		return 0;

	return rtas_suspend_cpu(&suspend_data);
}
/**
 * pseries_suspend_enter - Final phase of hibernation
 * @state: requested suspend state (not examined here)
 *
 * Suspends the last CPU, then clears the "suspending" flag and marks the
 * shared suspend data as done.
 *
 * Return value:
 * 	0 on success / other on failure
 **/
static int pseries_suspend_enter(suspend_state_t state)
{
	int rc;

	rc = rtas_suspend_last_cpu(&suspend_data);

	atomic_set(&suspending, 0);
	atomic_set(&suspend_data.done, 1);

	return rc;
}
/**
 * pseries_prepare_late - Prepare to suspend all other CPUs
 *
 * Raises the "suspending" flag and resets the shared suspend data
 * (working/done/error counters and the completion).
 *
 * Return value:
 * 	always 0
 **/
static int pseries_prepare_late(void)
{
	atomic_set(&suspending, 1);

	atomic_set(&suspend_data.working, 0);
	atomic_set(&suspend_data.done, 0);
	atomic_set(&suspend_data.error, 0);

	suspend_data.complete = &suspend_work;
	INIT_COMPLETION(suspend_work);

	return 0;
}
/**
 * store_hibernate - Initiate partition hibernation
 * @dev:	subsys root device
 * @attr:	device attribute struct
 * @buf:	buffer holding the stream ID as a hexadecimal string
 * @count:	buffer size
 *
 * Write the stream ID received from the HMC to this file
 * to trigger hibernating the partition. The caller needs CAP_SYS_ADMIN.
 * While the VASI state is not ready yet (-EAGAIN) the check is retried
 * once per second.
 *
 * Return value:
 * 	@count on success / other on failure
 **/
static ssize_t store_hibernate(struct device *dev,
			       struct device_attribute *attr,
			       const char *buf, size_t count)
{
	int rc;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	stream_id = simple_strtoul(buf, NULL, 16);

	/* Poll until the hypervisor leaves the "enabled" state */
	for (;;) {
		rc = pseries_suspend_begin(PM_SUSPEND_MEM);
		if (rc != -EAGAIN)
			break;
		ssleep(1);
	}

	if (!rc) {
		/* Topology updates are paused across the suspend */
		stop_topology_update();
		rc = pm_suspend(PM_SUSPEND_MEM);
		start_topology_update();
	}

	stream_id = 0;

	if (!rc)
		rc = count;
	return rc;
}
static DEVICE_ATTR(hibernate, S_IWUSR, NULL, store_hibernate);
/* "power" subsystem registered by pseries_suspend_sysfs_register(); its root device carries the hibernate attribute */
static struct bus_type suspend_subsys = {
.name = "power",
.dev_name = "power",
};
/* Platform suspend callbacks installed by pseries_suspend_init() via suspend_set_ops() */
static const struct platform_suspend_ops pseries_suspend_ops = {
.valid = suspend_valid_only_mem,
.begin = pseries_suspend_begin,
.prepare_late = pseries_prepare_late,
.enter = pseries_suspend_enter,
};
/**
 * pseries_suspend_sysfs_register - Register with sysfs
 * @dev: device to attach to the suspend subsystem
 *
 * Registers the suspend subsystem and creates the "hibernate" attribute
 * on its root device. If the attribute cannot be created the subsystem
 * is unregistered again.
 *
 * Return value:
 * 	0 on success / other on failure
 **/
static int pseries_suspend_sysfs_register(struct device *dev)
{
	int rc;

	rc = subsys_system_register(&suspend_subsys, NULL);
	if (rc)
		return rc;

	dev->id = 0;
	dev->bus = &suspend_subsys;

	rc = device_create_file(suspend_subsys.dev_root, &dev_attr_hibernate);
	if (rc)
		bus_unregister(&suspend_subsys);

	return rc;
}
/**
 * pseries_suspend_init - initcall for pSeries suspend
 *
 * Does nothing unless this is a pSeries LPAR whose firmware provides the
 * "ibm,suspend-me" RTAS service. Otherwise registers the sysfs interface
 * and installs the suspend hooks.
 *
 * Return value:
 * 	0 on success / other on failure
 **/
static int __init pseries_suspend_init(void)
{
	int rc;

	if (!machine_is(pseries))
		return 0;
	if (!firmware_has_feature(FW_FEATURE_LPAR))
		return 0;

	suspend_data.token = rtas_token("ibm,suspend-me");
	if (suspend_data.token == RTAS_UNKNOWN_SERVICE)
		return 0;

	rc = pseries_suspend_sysfs_register(&suspend_dev);
	if (rc)
		return rc;

	ppc_md.suspend_disable_cpu = pseries_suspend_cpu;
	suspend_set_ops(&pseries_suspend_ops);
	return 0;
}
__initcall(pseries_suspend_init);