M7350v1_en_gpl

2024-09-09 08:52:07 +00:00
commit f9cc65cfda
65988 changed files with 26357421 additions and 0 deletions
@@ -0,0 +1,132 @@
+config PPC_PSERIES
+	depends on PPC64 && PPC_BOOK3S
+	bool "IBM pSeries & new (POWER5-based) iSeries"
+	select HAVE_PCSPKR_PLATFORM
+	select MPIC
+	select OF_DYNAMIC
+	select PCI_MSI
+	select PPC_XICS
+	select PPC_ICP_NATIVE
+	select PPC_ICP_HV
+	select PPC_ICS_RTAS
+	select PPC_I8259
+	select PPC_RTAS
+	select PPC_RTAS_DAEMON
+	select RTAS_ERROR_LOGGING
+	select PPC_UDBG_16550
+	select PPC_NATIVE
+	select PPC_PCI_CHOICE if EXPERT
+	select ZLIB_DEFLATE
+	default y
+
+config PPC_SPLPAR
+	depends on PPC_PSERIES
+	bool "Support for shared-processor logical partitions"
+	default n
+	help
+	  Enabling this option will make the kernel run more efficiently
+	  on logically-partitioned pSeries systems which use shared
+	  processors, that is, which share physical processors between
+	  two or more partitions.
+
+config EEH
+	bool
+	depends on PPC_PSERIES && PCI
+	default y
+
+config PSERIES_MSI
+       bool
+       depends on PCI_MSI && EEH
+       default y
+
+config PSERIES_ENERGY
+	tristate "pSeries energy management capabilities driver"
+	depends on PPC_PSERIES
+	default y
+	help
+	  Provides interface to platform energy management capabilities
+	  on supported PSERIES platforms.
+	  Provides: /sys/devices/system/cpu/pseries_(de)activation_hint_list
+	  and /sys/devices/system/cpu/cpuN/pseries_(de)activation_hint
+
+config SCANLOG
+	tristate "Scanlog dump interface"
+	depends on RTAS_PROC && PPC_PSERIES
+
+config IO_EVENT_IRQ
+	bool "IO Event Interrupt support"
+	depends on PPC_PSERIES
+	default y
+	help
+	  Select this option, if you want to enable support for IO Event
+	  interrupts. IO event interrupt is a mechanism provided by RTAS
+	  to return information about hardware error and non-error events
+	  which may need OS attention. RTAS returns events for multiple
+	  event types and scopes. Device drivers can register their handlers
+	  to receive events.
+
+	  This option will only enable the IO event platform code. You
+	  will still need to enable or compile the actual drivers
+	  that use this infrastruture to handle IO event interrupts.
+
+	  Say Y if you are unsure.
+
+config LPARCFG
+	bool "LPAR Configuration Data"
+	depends on PPC_PSERIES
+	help
+	Provide system capacity information via human readable
+	<key word>=<value> pairs through a /proc/ppc64/lparcfg interface.
+
+config PPC_PSERIES_DEBUG
+	depends on PPC_PSERIES && PPC_EARLY_DEBUG
+	bool "Enable extra debug logging in platforms/pseries"
+        help
+	  Say Y here if you want the pseries core to produce a bunch of
+	  debug messages to the system log. Select this if you are having a
+	  problem with the pseries core and want to see more of what is
+	  going on. This does not enable debugging in lpar.c, which must
+	  be manually done due to its verbosity.
+	default y
+
+config PPC_SMLPAR
+	bool "Support for shared-memory logical partitions"
+	depends on PPC_PSERIES
+	select LPARCFG
+	default n
+	help
+	  Select this option to enable shared memory partition support.
+	  With this option a system running in an LPAR can be given more
+	  memory than physically available and will allow firmware to
+	  balance memory across many LPARs.
+
+config CMM
+	tristate "Collaborative memory management"
+	depends on PPC_SMLPAR
+	default y
+	help
+	  Select this option, if you want to enable the kernel interface
+	  to reduce the memory size of the system. This is accomplished
+	  by allocating pages of memory and put them "on hold". This only
+	  makes sense for a system running in an LPAR where the unused pages
+	  will be reused for other LPARs. The interface allows firmware to
+	  balance memory across many LPARs.
+
+config DTL
+	bool "Dispatch Trace Log"
+	depends on PPC_SPLPAR && DEBUG_FS
+	help
+	  SPLPAR machines can log hypervisor preempt & dispatch events to a
+	  kernel buffer. Saying Y here will enable logging these events,
+	  which are accessible through a debugfs file.
+
+	  Say N if you are unsure.
+
+config PSERIES_IDLE
+	bool "Cpuidle driver for pSeries platforms"
+	depends on CPU_IDLE
+	depends on PPC_PSERIES
+	default y
+	help
+	  Select this option to enable processor idle state management
+	  through cpuidle subsystem.
@@ -0,0 +1,29 @@
+ccflags-$(CONFIG_PPC64)			:= -mno-minimal-toc
+ccflags-$(CONFIG_PPC_PSERIES_DEBUG)	+= -DDEBUG
+
+obj-y			:= lpar.o hvCall.o nvram.o reconfig.o \
+			   setup.o iommu.o event_sources.o ras.o \
+			   firmware.o power.o dlpar.o mobility.o
+obj-$(CONFIG_SMP)	+= smp.o
+obj-$(CONFIG_SCANLOG)	+= scanlog.o
+obj-$(CONFIG_EEH)	+= eeh.o eeh_dev.o eeh_cache.o eeh_driver.o \
+			   eeh_event.o eeh_sysfs.o eeh_pseries.o
+obj-$(CONFIG_KEXEC)	+= kexec.o
+obj-$(CONFIG_PCI)	+= pci.o pci_dlpar.o
+obj-$(CONFIG_PSERIES_MSI)	+= msi.o
+obj-$(CONFIG_PSERIES_ENERGY)	+= pseries_energy.o
+
+obj-$(CONFIG_HOTPLUG_CPU)	+= hotplug-cpu.o
+obj-$(CONFIG_MEMORY_HOTPLUG)	+= hotplug-memory.o
+
+obj-$(CONFIG_HVC_CONSOLE)	+= hvconsole.o
+obj-$(CONFIG_HVCS)		+= hvcserver.o
+obj-$(CONFIG_HCALL_STATS)	+= hvCall_inst.o
+obj-$(CONFIG_CMM)		+= cmm.o
+obj-$(CONFIG_DTL)		+= dtl.o
+obj-$(CONFIG_IO_EVENT_IRQ)	+= io_event_irq.o
+obj-$(CONFIG_PSERIES_IDLE)	+= processor_idle.o
+
+ifeq ($(CONFIG_PPC_PSERIES),y)
+obj-$(CONFIG_SUSPEND)		+= suspend.o
+endif
@@ -0,0 +1,742 @@
+/*
+ * Collaborative memory management interface.
+ *
+ * Copyright (C) 2008 IBM Corporation
+ * Author(s): Brian King (brking@linux.vnet.ibm.com),
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#include <linux/ctype.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/gfp.h>
+#include <linux/init.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/oom.h>
+#include <linux/reboot.h>
+#include <linux/sched.h>
+#include <linux/stringify.h>
+#include <linux/swap.h>
+#include <linux/device.h>
+#include <asm/firmware.h>
+#include <asm/hvcall.h>
+#include <asm/mmu.h>
+#include <asm/pgalloc.h>
+#include <asm/uaccess.h>
+#include <linux/memory.h>
+
+#include "plpar_wrappers.h"
+
+#define CMM_DRIVER_VERSION	"1.0.0"
+#define CMM_DEFAULT_DELAY	1
+#define CMM_HOTPLUG_DELAY	5
+#define CMM_DEBUG			0
+#define CMM_DISABLE		0
+#define CMM_OOM_KB		1024
+#define CMM_MIN_MEM_MB		256
+#define KB2PAGES(_p)		((_p)>>(PAGE_SHIFT-10))
+#define PAGES2KB(_p)		((_p)<<(PAGE_SHIFT-10))
+/*
+ * The priority level tries to ensure that this notifier is called as
+ * late as possible to reduce thrashing in the shared memory pool.
+ */
+#define CMM_MEM_HOTPLUG_PRI	1
+#define CMM_MEM_ISOLATE_PRI	15
+
+static unsigned int delay = CMM_DEFAULT_DELAY;
+static unsigned int hotplug_delay = CMM_HOTPLUG_DELAY;
+static unsigned int oom_kb = CMM_OOM_KB;
+static unsigned int cmm_debug = CMM_DEBUG;
+static unsigned int cmm_disabled = CMM_DISABLE;
+static unsigned long min_mem_mb = CMM_MIN_MEM_MB;
+static struct device cmm_dev;
+
+MODULE_AUTHOR("Brian King <brking@linux.vnet.ibm.com>");
+MODULE_DESCRIPTION("IBM System p Collaborative Memory Manager");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(CMM_DRIVER_VERSION);
+
+module_param_named(delay, delay, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(delay, "Delay (in seconds) between polls to query hypervisor paging requests. "
+		 "[Default=" __stringify(CMM_DEFAULT_DELAY) "]");
+module_param_named(hotplug_delay, hotplug_delay, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(delay, "Delay (in seconds) after memory hotplug remove "
+		 "before loaning resumes. "
+		 "[Default=" __stringify(CMM_HOTPLUG_DELAY) "]");
+module_param_named(oom_kb, oom_kb, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(oom_kb, "Amount of memory in kb to free on OOM. "
+		 "[Default=" __stringify(CMM_OOM_KB) "]");
+module_param_named(min_mem_mb, min_mem_mb, ulong, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(min_mem_mb, "Minimum amount of memory (in MB) to not balloon. "
+		 "[Default=" __stringify(CMM_MIN_MEM_MB) "]");
+module_param_named(debug, cmm_debug, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(debug, "Enable module debugging logging. Set to 1 to enable. "
+		 "[Default=" __stringify(CMM_DEBUG) "]");
+
+#define CMM_NR_PAGES ((PAGE_SIZE - sizeof(void *) - sizeof(unsigned long)) / sizeof(unsigned long))
+
+#define cmm_dbg(...) if (cmm_debug) { printk(KERN_INFO "cmm: "__VA_ARGS__); }
+
+struct cmm_page_array {
+	struct cmm_page_array *next;
+	unsigned long index;
+	unsigned long page[CMM_NR_PAGES];
+};
+
+static unsigned long loaned_pages;
+static unsigned long loaned_pages_target;
+static unsigned long oom_freed_pages;
+
+static struct cmm_page_array *cmm_page_list;
+static DEFINE_SPINLOCK(cmm_lock);
+
+static DEFINE_MUTEX(hotplug_mutex);
+static int hotplug_occurred; /* protected by the hotplug mutex */
+
+static struct task_struct *cmm_thread_ptr;
+
+/**
+ * cmm_alloc_pages - Allocate pages and mark them as loaned
+ * @nr:	number of pages to allocate
+ *
+ * Return value:
+ * 	number of pages requested to be allocated which were not
+ **/
+static long cmm_alloc_pages(long nr)
+{
+	struct cmm_page_array *pa, *npa;
+	unsigned long addr;
+	long rc;
+
+	cmm_dbg("Begin request for %ld pages\n", nr);
+
+	while (nr) {
+		/* Exit if a hotplug operation is in progress or occurred */
+		if (mutex_trylock(&hotplug_mutex)) {
+			if (hotplug_occurred) {
+				mutex_unlock(&hotplug_mutex);
+				break;
+			}
+			mutex_unlock(&hotplug_mutex);
+		} else {
+			break;
+		}
+
+		addr = __get_free_page(GFP_NOIO | __GFP_NOWARN |
+				       __GFP_NORETRY | __GFP_NOMEMALLOC);
+		if (!addr)
+			break;
+		spin_lock(&cmm_lock);
+		pa = cmm_page_list;
+		if (!pa || pa->index >= CMM_NR_PAGES) {
+			/* Need a new page for the page list. */
+			spin_unlock(&cmm_lock);
+			npa = (struct cmm_page_array *)__get_free_page(
+					GFP_NOIO | __GFP_NOWARN |
+					__GFP_NORETRY | __GFP_NOMEMALLOC);
+			if (!npa) {
+				pr_info("%s: Can not allocate new page list\n", __func__);
+				free_page(addr);
+				break;
+			}
+			spin_lock(&cmm_lock);
+			pa = cmm_page_list;
+
+			if (!pa || pa->index >= CMM_NR_PAGES) {
+				npa->next = pa;
+				npa->index = 0;
+				pa = npa;
+				cmm_page_list = pa;
+			} else
+				free_page((unsigned long) npa);
+		}
+
+		if ((rc = plpar_page_set_loaned(__pa(addr)))) {
+			pr_err("%s: Can not set page to loaned. rc=%ld\n", __func__, rc);
+			spin_unlock(&cmm_lock);
+			free_page(addr);
+			break;
+		}
+
+		pa->page[pa->index++] = addr;
+		loaned_pages++;
+		totalram_pages--;
+		spin_unlock(&cmm_lock);
+		nr--;
+	}
+
+	cmm_dbg("End request with %ld pages unfulfilled\n", nr);
+	return nr;
+}
+
+/**
+ * cmm_free_pages - Free pages and mark them as active
+ * @nr:	number of pages to free
+ *
+ * Return value:
+ * 	number of pages requested to be freed which were not
+ **/
+static long cmm_free_pages(long nr)
+{
+	struct cmm_page_array *pa;
+	unsigned long addr;
+
+	cmm_dbg("Begin free of %ld pages.\n", nr);
+	spin_lock(&cmm_lock);
+	pa = cmm_page_list;
+	while (nr) {
+		if (!pa || pa->index <= 0)
+			break;
+		addr = pa->page[--pa->index];
+
+		if (pa->index == 0) {
+			pa = pa->next;
+			free_page((unsigned long) cmm_page_list);
+			cmm_page_list = pa;
+		}
+
+		plpar_page_set_active(__pa(addr));
+		free_page(addr);
+		loaned_pages--;
+		nr--;
+		totalram_pages++;
+	}
+	spin_unlock(&cmm_lock);
+	cmm_dbg("End request with %ld pages unfulfilled\n", nr);
+	return nr;
+}
+
+/**
+ * cmm_oom_notify - OOM notifier
+ * @self:	notifier block struct
+ * @dummy:	not used
+ * @parm:	returned - number of pages freed
+ *
+ * Return value:
+ * 	NOTIFY_OK
+ **/
+static int cmm_oom_notify(struct notifier_block *self,
+			  unsigned long dummy, void *parm)
+{
+	unsigned long *freed = parm;
+	long nr = KB2PAGES(oom_kb);
+
+	cmm_dbg("OOM processing started\n");
+	nr = cmm_free_pages(nr);
+	loaned_pages_target = loaned_pages;
+	*freed += KB2PAGES(oom_kb) - nr;
+	oom_freed_pages += KB2PAGES(oom_kb) - nr;
+	cmm_dbg("OOM processing complete\n");
+	return NOTIFY_OK;
+}
+
+/**
+ * cmm_get_mpp - Read memory performance parameters
+ *
+ * Makes hcall to query the current page loan request from the hypervisor.
+ *
+ * Return value:
+ * 	nothing
+ **/
+static void cmm_get_mpp(void)
+{
+	int rc;
+	struct hvcall_mpp_data mpp_data;
+	signed long active_pages_target, page_loan_request, target;
+	signed long total_pages = totalram_pages + loaned_pages;
+	signed long min_mem_pages = (min_mem_mb * 1024 * 1024) / PAGE_SIZE;
+
+	rc = h_get_mpp(&mpp_data);
+
+	if (rc != H_SUCCESS)
+		return;
+
+	page_loan_request = div_s64((s64)mpp_data.loan_request, PAGE_SIZE);
+	target = page_loan_request + (signed long)loaned_pages;
+
+	if (target < 0 || total_pages < min_mem_pages)
+		target = 0;
+
+	if (target > oom_freed_pages)
+		target -= oom_freed_pages;
+	else
+		target = 0;
+
+	active_pages_target = total_pages - target;
+
+	if (min_mem_pages > active_pages_target)
+		target = total_pages - min_mem_pages;
+
+	if (target < 0)
+		target = 0;
+
+	loaned_pages_target = target;
+
+	cmm_dbg("delta = %ld, loaned = %lu, target = %lu, oom = %lu, totalram = %lu\n",
+		page_loan_request, loaned_pages, loaned_pages_target,
+		oom_freed_pages, totalram_pages);
+}
+
+static struct notifier_block cmm_oom_nb = {
+	.notifier_call = cmm_oom_notify
+};
+
+/**
+ * cmm_thread - CMM task thread
+ * @dummy:	not used
+ *
+ * Return value:
+ * 	0
+ **/
+static int cmm_thread(void *dummy)
+{
+	unsigned long timeleft;
+
+	while (1) {
+		timeleft = msleep_interruptible(delay * 1000);
+
+		if (kthread_should_stop() || timeleft)
+			break;
+
+		if (mutex_trylock(&hotplug_mutex)) {
+			if (hotplug_occurred) {
+				hotplug_occurred = 0;
+				mutex_unlock(&hotplug_mutex);
+				cmm_dbg("Hotplug operation has occurred, "
+						"loaning activity suspended "
+						"for %d seconds.\n",
+						hotplug_delay);
+				timeleft = msleep_interruptible(hotplug_delay *
+						1000);
+				if (kthread_should_stop() || timeleft)
+					break;
+				continue;
+			}
+			mutex_unlock(&hotplug_mutex);
+		} else {
+			cmm_dbg("Hotplug operation in progress, activity "
+					"suspended\n");
+			continue;
+		}
+
+		cmm_get_mpp();
+
+		if (loaned_pages_target > loaned_pages) {
+			if (cmm_alloc_pages(loaned_pages_target - loaned_pages))
+				loaned_pages_target = loaned_pages;
+		} else if (loaned_pages_target < loaned_pages)
+			cmm_free_pages(loaned_pages - loaned_pages_target);
+	}
+	return 0;
+}
+
+#define CMM_SHOW(name, format, args...)			\
+	static ssize_t show_##name(struct device *dev,	\
+				   struct device_attribute *attr,	\
+				   char *buf)			\
+	{							\
+		return sprintf(buf, format, ##args);		\
+	}							\
+	static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
+
+CMM_SHOW(loaned_kb, "%lu\n", PAGES2KB(loaned_pages));
+CMM_SHOW(loaned_target_kb, "%lu\n", PAGES2KB(loaned_pages_target));
+
+static ssize_t show_oom_pages(struct device *dev,
+			      struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%lu\n", PAGES2KB(oom_freed_pages));
+}
+
+static ssize_t store_oom_pages(struct device *dev,
+			       struct device_attribute *attr,
+			       const char *buf, size_t count)
+{
+	unsigned long val = simple_strtoul (buf, NULL, 10);
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (val != 0)
+		return -EBADMSG;
+
+	oom_freed_pages = 0;
+	return count;
+}
+
+static DEVICE_ATTR(oom_freed_kb, S_IWUSR | S_IRUGO,
+		   show_oom_pages, store_oom_pages);
+
+static struct device_attribute *cmm_attrs[] = {
+	&dev_attr_loaned_kb,
+	&dev_attr_loaned_target_kb,
+	&dev_attr_oom_freed_kb,
+};
+
+static struct bus_type cmm_subsys = {
+	.name = "cmm",
+	.dev_name = "cmm",
+};
+
+/**
+ * cmm_sysfs_register - Register with sysfs
+ *
+ * Return value:
+ * 	0 on success / other on failure
+ **/
+static int cmm_sysfs_register(struct device *dev)
+{
+	int i, rc;
+
+	if ((rc = subsys_system_register(&cmm_subsys, NULL)))
+		return rc;
+
+	dev->id = 0;
+	dev->bus = &cmm_subsys;
+
+	if ((rc = device_register(dev)))
+		goto subsys_unregister;
+
+	for (i = 0; i < ARRAY_SIZE(cmm_attrs); i++) {
+		if ((rc = device_create_file(dev, cmm_attrs[i])))
+			goto fail;
+	}
+
+	return 0;
+
+fail:
+	while (--i >= 0)
+		device_remove_file(dev, cmm_attrs[i]);
+	device_unregister(dev);
+subsys_unregister:
+	bus_unregister(&cmm_subsys);
+	return rc;
+}
+
+/**
+ * cmm_unregister_sysfs - Unregister from sysfs
+ *
+ **/
+static void cmm_unregister_sysfs(struct device *dev)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(cmm_attrs); i++)
+		device_remove_file(dev, cmm_attrs[i]);
+	device_unregister(dev);
+	bus_unregister(&cmm_subsys);
+}
+
+/**
+ * cmm_reboot_notifier - Make sure pages are not still marked as "loaned"
+ *
+ **/
+static int cmm_reboot_notifier(struct notifier_block *nb,
+			       unsigned long action, void *unused)
+{
+	if (action == SYS_RESTART) {
+		if (cmm_thread_ptr)
+			kthread_stop(cmm_thread_ptr);
+		cmm_thread_ptr = NULL;
+		cmm_free_pages(loaned_pages);
+	}
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block cmm_reboot_nb = {
+	.notifier_call = cmm_reboot_notifier,
+};
+
+/**
+ * cmm_count_pages - Count the number of pages loaned in a particular range.
+ *
+ * @arg: memory_isolate_notify structure with address range and count
+ *
+ * Return value:
+ *      0 on success
+ **/
+static unsigned long cmm_count_pages(void *arg)
+{
+	struct memory_isolate_notify *marg = arg;
+	struct cmm_page_array *pa;
+	unsigned long start = (unsigned long)pfn_to_kaddr(marg->start_pfn);
+	unsigned long end = start + (marg->nr_pages << PAGE_SHIFT);
+	unsigned long idx;
+
+	spin_lock(&cmm_lock);
+	pa = cmm_page_list;
+	while (pa) {
+		if ((unsigned long)pa >= start && (unsigned long)pa < end)
+			marg->pages_found++;
+		for (idx = 0; idx < pa->index; idx++)
+			if (pa->page[idx] >= start && pa->page[idx] < end)
+				marg->pages_found++;
+		pa = pa->next;
+	}
+	spin_unlock(&cmm_lock);
+	return 0;
+}
+
+/**
+ * cmm_memory_isolate_cb - Handle memory isolation notifier calls
+ * @self:	notifier block struct
+ * @action:	action to take
+ * @arg:	struct memory_isolate_notify data for handler
+ *
+ * Return value:
+ *	NOTIFY_OK or notifier error based on subfunction return value
+ **/
+static int cmm_memory_isolate_cb(struct notifier_block *self,
+				 unsigned long action, void *arg)
+{
+	int ret = 0;
+
+	if (action == MEM_ISOLATE_COUNT)
+		ret = cmm_count_pages(arg);
+
+	return notifier_from_errno(ret);
+}
+
+static struct notifier_block cmm_mem_isolate_nb = {
+	.notifier_call = cmm_memory_isolate_cb,
+	.priority = CMM_MEM_ISOLATE_PRI
+};
+
+/**
+ * cmm_mem_going_offline - Unloan pages where memory is to be removed
+ * @arg: memory_notify structure with page range to be offlined
+ *
+ * Return value:
+ *	0 on success
+ **/
+static int cmm_mem_going_offline(void *arg)
+{
+	struct memory_notify *marg = arg;
+	unsigned long start_page = (unsigned long)pfn_to_kaddr(marg->start_pfn);
+	unsigned long end_page = start_page + (marg->nr_pages << PAGE_SHIFT);
+	struct cmm_page_array *pa_curr, *pa_last, *npa;
+	unsigned long idx;
+	unsigned long freed = 0;
+
+	cmm_dbg("Memory going offline, searching 0x%lx (%ld pages).\n",
+			start_page, marg->nr_pages);
+	spin_lock(&cmm_lock);
+
+	/* Search the page list for pages in the range to be offlined */
+	pa_last = pa_curr = cmm_page_list;
+	while (pa_curr) {
+		for (idx = (pa_curr->index - 1); (idx + 1) > 0; idx--) {
+			if ((pa_curr->page[idx] < start_page) ||
+			    (pa_curr->page[idx] >= end_page))
+				continue;
+
+			plpar_page_set_active(__pa(pa_curr->page[idx]));
+			free_page(pa_curr->page[idx]);
+			freed++;
+			loaned_pages--;
+			totalram_pages++;
+			pa_curr->page[idx] = pa_last->page[--pa_last->index];
+			if (pa_last->index == 0) {
+				if (pa_curr == pa_last)
+					pa_curr = pa_last->next;
+				pa_last = pa_last->next;
+				free_page((unsigned long)cmm_page_list);
+				cmm_page_list = pa_last;
+				continue;
+			}
+		}
+		pa_curr = pa_curr->next;
+	}
+
+	/* Search for page list structures in the range to be offlined */
+	pa_last = NULL;
+	pa_curr = cmm_page_list;
+	while (pa_curr) {
+		if (((unsigned long)pa_curr >= start_page) &&
+				((unsigned long)pa_curr < end_page)) {
+			npa = (struct cmm_page_array *)__get_free_page(
+					GFP_NOIO | __GFP_NOWARN |
+					__GFP_NORETRY | __GFP_NOMEMALLOC);
+			if (!npa) {
+				spin_unlock(&cmm_lock);
+				cmm_dbg("Failed to allocate memory for list "
+						"management. Memory hotplug "
+						"failed.\n");
+				return ENOMEM;
+			}
+			memcpy(npa, pa_curr, PAGE_SIZE);
+			if (pa_curr == cmm_page_list)
+				cmm_page_list = npa;
+			if (pa_last)
+				pa_last->next = npa;
+			free_page((unsigned long) pa_curr);
+			freed++;
+			pa_curr = npa;
+		}
+
+		pa_last = pa_curr;
+		pa_curr = pa_curr->next;
+	}
+
+	spin_unlock(&cmm_lock);
+	cmm_dbg("Released %ld pages in the search range.\n", freed);
+
+	return 0;
+}
+
+/**
+ * cmm_memory_cb - Handle memory hotplug notifier calls
+ * @self:	notifier block struct
+ * @action:	action to take
+ * @arg:	struct memory_notify data for handler
+ *
+ * Return value:
+ *	NOTIFY_OK or notifier error based on subfunction return value
+ *
+ **/
+static int cmm_memory_cb(struct notifier_block *self,
+			unsigned long action, void *arg)
+{
+	int ret = 0;
+
+	switch (action) {
+	case MEM_GOING_OFFLINE:
+		mutex_lock(&hotplug_mutex);
+		hotplug_occurred = 1;
+		ret = cmm_mem_going_offline(arg);
+		break;
+	case MEM_OFFLINE:
+	case MEM_CANCEL_OFFLINE:
+		mutex_unlock(&hotplug_mutex);
+		cmm_dbg("Memory offline operation complete.\n");
+		break;
+	case MEM_GOING_ONLINE:
+	case MEM_ONLINE:
+	case MEM_CANCEL_ONLINE:
+		break;
+	}
+
+	return notifier_from_errno(ret);
+}
+
+static struct notifier_block cmm_mem_nb = {
+	.notifier_call = cmm_memory_cb,
+	.priority = CMM_MEM_HOTPLUG_PRI
+};
+
+/**
+ * cmm_init - Module initialization
+ *
+ * Return value:
+ * 	0 on success / other on failure
+ **/
+static int cmm_init(void)
+{
+	int rc = -ENOMEM;
+
+	if (!firmware_has_feature(FW_FEATURE_CMO))
+		return -EOPNOTSUPP;
+
+	if ((rc = register_oom_notifier(&cmm_oom_nb)) < 0)
+		return rc;
+
+	if ((rc = register_reboot_notifier(&cmm_reboot_nb)))
+		goto out_oom_notifier;
+
+	if ((rc = cmm_sysfs_register(&cmm_dev)))
+		goto out_reboot_notifier;
+
+	if (register_memory_notifier(&cmm_mem_nb) ||
+	    register_memory_isolate_notifier(&cmm_mem_isolate_nb))
+		goto out_unregister_notifier;
+
+	if (cmm_disabled)
+		return rc;
+
+	cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread");
+	if (IS_ERR(cmm_thread_ptr)) {
+		rc = PTR_ERR(cmm_thread_ptr);
+		goto out_unregister_notifier;
+	}
+
+	return rc;
+
+out_unregister_notifier:
+	unregister_memory_notifier(&cmm_mem_nb);
+	unregister_memory_isolate_notifier(&cmm_mem_isolate_nb);
+	cmm_unregister_sysfs(&cmm_dev);
+out_reboot_notifier:
+	unregister_reboot_notifier(&cmm_reboot_nb);
+out_oom_notifier:
+	unregister_oom_notifier(&cmm_oom_nb);
+	return rc;
+}
+
+/**
+ * cmm_exit - Module exit
+ *
+ * Return value:
+ * 	nothing
+ **/
+static void cmm_exit(void)
+{
+	if (cmm_thread_ptr)
+		kthread_stop(cmm_thread_ptr);
+	unregister_oom_notifier(&cmm_oom_nb);
+	unregister_reboot_notifier(&cmm_reboot_nb);
+	unregister_memory_notifier(&cmm_mem_nb);
+	unregister_memory_isolate_notifier(&cmm_mem_isolate_nb);
+	cmm_free_pages(loaned_pages);
+	cmm_unregister_sysfs(&cmm_dev);
+}
+
+/**
+ * cmm_set_disable - Disable/Enable CMM
+ *
+ * Return value:
+ * 	0 on success / other on failure
+ **/
+static int cmm_set_disable(const char *val, struct kernel_param *kp)
+{
+	int disable = simple_strtoul(val, NULL, 10);
+
+	if (disable != 0 && disable != 1)
+		return -EINVAL;
+
+	if (disable && !cmm_disabled) {
+		if (cmm_thread_ptr)
+			kthread_stop(cmm_thread_ptr);
+		cmm_thread_ptr = NULL;
+		cmm_free_pages(loaned_pages);
+	} else if (!disable && cmm_disabled) {
+		cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread");
+		if (IS_ERR(cmm_thread_ptr))
+			return PTR_ERR(cmm_thread_ptr);
+	}
+
+	cmm_disabled = disable;
+	return 0;
+}
+
+module_param_call(disable, cmm_set_disable, param_get_uint,
+		  &cmm_disabled, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(disable, "Disable CMM. Set to 1 to disable. "
+		 "[Default=" __stringify(CMM_DISABLE) "]");
+
+module_init(cmm_init);
+module_exit(cmm_exit);
@@ -0,0 +1,565 @@
+/*
+ * Support for dynamic reconfiguration for PCI, Memory, and CPU
+ * Hotplug and Dynamic Logical Partitioning on RPA platforms.
+ *
+ * Copyright (C) 2009 Nathan Fontenot
+ * Copyright (C) 2009 IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kref.h>
+#include <linux/notifier.h>
+#include <linux/proc_fs.h>
+#include <linux/spinlock.h>
+#include <linux/cpu.h>
+#include <linux/slab.h>
+#include "offline_states.h"
+
+#include <asm/prom.h>
+#include <asm/machdep.h>
+#include <asm/uaccess.h>
+#include <asm/rtas.h>
+#include <asm/pSeries_reconfig.h>
+
+struct cc_workarea {
+	u32	drc_index;
+	u32	zero;
+	u32	name_offset;
+	u32	prop_length;
+	u32	prop_offset;
+};
+
+void dlpar_free_cc_property(struct property *prop)
+{
+	kfree(prop->name);
+	kfree(prop->value);
+	kfree(prop);
+}
+
+static struct property *dlpar_parse_cc_property(struct cc_workarea *ccwa)
+{
+	struct property *prop;
+	char *name;
+	char *value;
+
+	prop = kzalloc(sizeof(*prop), GFP_KERNEL);
+	if (!prop)
+		return NULL;
+
+	name = (char *)ccwa + ccwa->name_offset;
+	prop->name = kstrdup(name, GFP_KERNEL);
+
+	prop->length = ccwa->prop_length;
+	value = (char *)ccwa + ccwa->prop_offset;
+	prop->value = kmemdup(value, prop->length, GFP_KERNEL);
+	if (!prop->value) {
+		dlpar_free_cc_property(prop);
+		return NULL;
+	}
+
+	return prop;
+}
+
+static struct device_node *dlpar_parse_cc_node(struct cc_workarea *ccwa)
+{
+	struct device_node *dn;
+	char *name;
+
+	dn = kzalloc(sizeof(*dn), GFP_KERNEL);
+	if (!dn)
+		return NULL;
+
+	/* The configure connector reported name does not contain a
+	 * preceding '/', so we allocate a buffer large enough to
+	 * prepend this to the full_name.
+	 */
+	name = (char *)ccwa + ccwa->name_offset;
+	dn->full_name = kasprintf(GFP_KERNEL, "/%s", name);
+	if (!dn->full_name) {
+		kfree(dn);
+		return NULL;
+	}
+
+	return dn;
+}
+
+static void dlpar_free_one_cc_node(struct device_node *dn)
+{
+	struct property *prop;
+
+	while (dn->properties) {
+		prop = dn->properties;
+		dn->properties = prop->next;
+		dlpar_free_cc_property(prop);
+	}
+
+	kfree(dn->full_name);
+	kfree(dn);
+}
+
+void dlpar_free_cc_nodes(struct device_node *dn)
+{
+	if (dn->child)
+		dlpar_free_cc_nodes(dn->child);
+
+	if (dn->sibling)
+		dlpar_free_cc_nodes(dn->sibling);
+
+	dlpar_free_one_cc_node(dn);
+}
+
+#define COMPLETE	0
+#define NEXT_SIBLING    1
+#define NEXT_CHILD      2
+#define NEXT_PROPERTY   3
+#define PREV_PARENT     4
+#define MORE_MEMORY     5
+#define CALL_AGAIN	-2
+#define ERR_CFG_USE     -9003
+
+struct device_node *dlpar_configure_connector(u32 drc_index)
+{
+	struct device_node *dn;
+	struct device_node *first_dn = NULL;
+	struct device_node *last_dn = NULL;
+	struct property *property;
+	struct property *last_property = NULL;
+	struct cc_workarea *ccwa;
+	char *data_buf;
+	int cc_token;
+	int rc = -1;
+
+	cc_token = rtas_token("ibm,configure-connector");
+	if (cc_token == RTAS_UNKNOWN_SERVICE)
+		return NULL;
+
+	data_buf = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
+	if (!data_buf)
+		return NULL;
+
+	ccwa = (struct cc_workarea *)&data_buf[0];
+	ccwa->drc_index = drc_index;
+	ccwa->zero = 0;
+
+	do {
+		/* Since we release the rtas_data_buf lock between configure
+		 * connector calls we want to re-populate the rtas_data_buffer
+		 * with the contents of the previous call.
+		 */
+		spin_lock(&rtas_data_buf_lock);
+
+		memcpy(rtas_data_buf, data_buf, RTAS_DATA_BUF_SIZE);
+		rc = rtas_call(cc_token, 2, 1, NULL, rtas_data_buf, NULL);
+		memcpy(data_buf, rtas_data_buf, RTAS_DATA_BUF_SIZE);
+
+		spin_unlock(&rtas_data_buf_lock);
+
+		switch (rc) {
+		case COMPLETE:
+			break;
+
+		case NEXT_SIBLING:
+			dn = dlpar_parse_cc_node(ccwa);
+			if (!dn)
+				goto cc_error;
+
+			dn->parent = last_dn->parent;
+			last_dn->sibling = dn;
+			last_dn = dn;
+			break;
+
+		case NEXT_CHILD:
+			dn = dlpar_parse_cc_node(ccwa);
+			if (!dn)
+				goto cc_error;
+
+			if (!first_dn)
+				first_dn = dn;
+			else {
+				dn->parent = last_dn;
+				if (last_dn)
+					last_dn->child = dn;
+			}
+
+			last_dn = dn;
+			break;
+
+		case NEXT_PROPERTY:
+			property = dlpar_parse_cc_property(ccwa);
+			if (!property)
+				goto cc_error;
+
+			if (!last_dn->properties)
+				last_dn->properties = property;
+			else
+				last_property->next = property;
+
+			last_property = property;
+			break;
+
+		case PREV_PARENT:
+			last_dn = last_dn->parent;
+			break;
+
+		case CALL_AGAIN:
+			break;
+
+		case MORE_MEMORY:
+		case ERR_CFG_USE:
+		default:
+			printk(KERN_ERR "Unexpected Error (%d) "
+			       "returned from configure-connector\n", rc);
+			goto cc_error;
+		}
+	} while (rc);
+
+cc_error:
+	kfree(data_buf);
+
+	if (rc) {
+		if (first_dn)
+			dlpar_free_cc_nodes(first_dn);
+
+		return NULL;
+	}
+
+	return first_dn;
+}
+
+static struct device_node *derive_parent(const char *path)
+{
+	struct device_node *parent;
+	char *last_slash;
+
+	last_slash = strrchr(path, '/');
+	if (last_slash == path) {
+		parent = of_find_node_by_path("/");
+	} else {
+		char *parent_path;
+		int parent_path_len = last_slash - path + 1;
+		parent_path = kmalloc(parent_path_len, GFP_KERNEL);
+		if (!parent_path)
+			return NULL;
+
+		strlcpy(parent_path, path, parent_path_len);
+		parent = of_find_node_by_path(parent_path);
+		kfree(parent_path);
+	}
+
+	return parent;
+}
+
+int dlpar_attach_node(struct device_node *dn)
+{
+#ifdef CONFIG_PROC_DEVICETREE
+	struct proc_dir_entry *ent;
+#endif
+	int rc;
+
+	of_node_set_flag(dn, OF_DYNAMIC);
+	kref_init(&dn->kref);
+	dn->parent = derive_parent(dn->full_name);
+	if (!dn->parent)
+		return -ENOMEM;
+
+	rc = pSeries_reconfig_notify(PSERIES_RECONFIG_ADD, dn);
+	if (rc) {
+		printk(KERN_ERR "Failed to add device node %s\n",
+		       dn->full_name);
+		return rc;
+	}
+
+	of_attach_node(dn);
+
+#ifdef CONFIG_PROC_DEVICETREE
+	ent = proc_mkdir(strrchr(dn->full_name, '/') + 1, dn->parent->pde);
+	if (ent)
+		proc_device_tree_add_node(dn, ent);
+#endif
+
+	of_node_put(dn->parent);
+	return 0;
+}
+
+int dlpar_detach_node(struct device_node *dn)
+{
+#ifdef CONFIG_PROC_DEVICETREE
+	struct device_node *parent = dn->parent;
+	struct property *prop = dn->properties;
+
+	while (prop) {
+		remove_proc_entry(prop->name, dn->pde);
+		prop = prop->next;
+	}
+
+	if (dn->pde)
+		remove_proc_entry(dn->pde->name, parent->pde);
+#endif
+
+	pSeries_reconfig_notify(PSERIES_RECONFIG_REMOVE, dn);
+	of_detach_node(dn);
+	of_node_put(dn); /* Must decrement the refcount */
+
+	return 0;
+}
+
+#define DR_ENTITY_SENSE		9003
+#define DR_ENTITY_PRESENT	1
+#define DR_ENTITY_UNUSABLE	2
+#define ALLOCATION_STATE	9003
+#define ALLOC_UNUSABLE		0
+#define ALLOC_USABLE		1
+#define ISOLATION_STATE		9001
+#define ISOLATE			0
+#define UNISOLATE		1
+
+int dlpar_acquire_drc(u32 drc_index)
+{
+	int dr_status, rc;
+
+	rc = rtas_call(rtas_token("get-sensor-state"), 2, 2, &dr_status,
+		       DR_ENTITY_SENSE, drc_index);
+	if (rc || dr_status != DR_ENTITY_UNUSABLE)
+		return -1;
+
+	rc = rtas_set_indicator(ALLOCATION_STATE, drc_index, ALLOC_USABLE);
+	if (rc)
+		return rc;
+
+	rc = rtas_set_indicator(ISOLATION_STATE, drc_index, UNISOLATE);
+	if (rc) {
+		rtas_set_indicator(ALLOCATION_STATE, drc_index, ALLOC_UNUSABLE);
+		return rc;
+	}
+
+	return 0;
+}
+
+int dlpar_release_drc(u32 drc_index)
+{
+	int dr_status, rc;
+
+	rc = rtas_call(rtas_token("get-sensor-state"), 2, 2, &dr_status,
+		       DR_ENTITY_SENSE, drc_index);
+	if (rc || dr_status != DR_ENTITY_PRESENT)
+		return -1;
+
+	rc = rtas_set_indicator(ISOLATION_STATE, drc_index, ISOLATE);
+	if (rc)
+		return rc;
+
+	rc = rtas_set_indicator(ALLOCATION_STATE, drc_index, ALLOC_UNUSABLE);
+	if (rc) {
+		rtas_set_indicator(ISOLATION_STATE, drc_index, UNISOLATE);
+		return rc;
+	}
+
+	return 0;
+}
+
+#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
+
+static int dlpar_online_cpu(struct device_node *dn)
+{
+	int rc = 0;
+	unsigned int cpu;
+	int len, nthreads, i;
+	const u32 *intserv;
+
+	intserv = of_get_property(dn, "ibm,ppc-interrupt-server#s", &len);
+	if (!intserv)
+		return -EINVAL;
+
+	nthreads = len / sizeof(u32);
+
+	cpu_maps_update_begin();
+	for (i = 0; i < nthreads; i++) {
+		for_each_present_cpu(cpu) {
+			if (get_hard_smp_processor_id(cpu) != intserv[i])
+				continue;
+			BUG_ON(get_cpu_current_state(cpu)
+					!= CPU_STATE_OFFLINE);
+			cpu_maps_update_done();
+			rc = cpu_up(cpu);
+			if (rc)
+				goto out;
+			cpu_maps_update_begin();
+
+			break;
+		}
+		if (cpu == num_possible_cpus())
+			printk(KERN_WARNING "Could not find cpu to online "
+			       "with physical id 0x%x\n", intserv[i]);
+	}
+	cpu_maps_update_done();
+
+out:
+	return rc;
+
+}
+
+static ssize_t dlpar_cpu_probe(const char *buf, size_t count)
+{
+	struct device_node *dn;
+	unsigned long drc_index;
+	char *cpu_name;
+	int rc;
+
+	cpu_hotplug_driver_lock();
+	rc = strict_strtoul(buf, 0, &drc_index);
+	if (rc) {
+		rc = -EINVAL;
+		goto out;
+	}
+
+	dn = dlpar_configure_connector(drc_index);
+	if (!dn) {
+		rc = -EINVAL;
+		goto out;
+	}
+
+	/* configure-connector reports cpus as living in the base
+	 * directory of the device tree.  CPUs actually live in the
+	 * cpus directory so we need to fixup the full_name.
+	 */
+	cpu_name = kasprintf(GFP_KERNEL, "/cpus%s", dn->full_name);
+	if (!cpu_name) {
+		dlpar_free_cc_nodes(dn);
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	kfree(dn->full_name);
+	dn->full_name = cpu_name;
+
+	rc = dlpar_acquire_drc(drc_index);
+	if (rc) {
+		dlpar_free_cc_nodes(dn);
+		rc = -EINVAL;
+		goto out;
+	}
+
+	rc = dlpar_attach_node(dn);
+	if (rc) {
+		dlpar_release_drc(drc_index);
+		dlpar_free_cc_nodes(dn);
+		goto out;
+	}
+
+	rc = dlpar_online_cpu(dn);
+out:
+	cpu_hotplug_driver_unlock();
+
+	return rc ? rc : count;
+}
+
+static int dlpar_offline_cpu(struct device_node *dn)
+{
+	int rc = 0;
+	unsigned int cpu;
+	int len, nthreads, i;
+	const u32 *intserv;
+
+	intserv = of_get_property(dn, "ibm,ppc-interrupt-server#s", &len);
+	if (!intserv)
+		return -EINVAL;
+
+	nthreads = len / sizeof(u32);
+
+	cpu_maps_update_begin();
+	for (i = 0; i < nthreads; i++) {
+		for_each_present_cpu(cpu) {
+			if (get_hard_smp_processor_id(cpu) != intserv[i])
+				continue;
+
+			if (get_cpu_current_state(cpu) == CPU_STATE_OFFLINE)
+				break;
+
+			if (get_cpu_current_state(cpu) == CPU_STATE_ONLINE) {
+				set_preferred_offline_state(cpu, CPU_STATE_OFFLINE);
+				cpu_maps_update_done();
+				rc = cpu_down(cpu);
+				if (rc)
+					goto out;
+				cpu_maps_update_begin();
+				break;
+
+			}
+
+			/*
+			 * The cpu is in CPU_STATE_INACTIVE.
+			 * Upgrade it's state to CPU_STATE_OFFLINE.
+			 */
+			set_preferred_offline_state(cpu, CPU_STATE_OFFLINE);
+			BUG_ON(plpar_hcall_norets(H_PROD, intserv[i])
+								!= H_SUCCESS);
+			__cpu_die(cpu);
+			break;
+		}
+		if (cpu == num_possible_cpus())
+			printk(KERN_WARNING "Could not find cpu to offline "
+			       "with physical id 0x%x\n", intserv[i]);
+	}
+	cpu_maps_update_done();
+
+out:
+	return rc;
+
+}
+
+static ssize_t dlpar_cpu_release(const char *buf, size_t count)
+{
+	struct device_node *dn;
+	const u32 *drc_index;
+	int rc;
+
+	dn = of_find_node_by_path(buf);
+	if (!dn)
+		return -EINVAL;
+
+	drc_index = of_get_property(dn, "ibm,my-drc-index", NULL);
+	if (!drc_index) {
+		of_node_put(dn);
+		return -EINVAL;
+	}
+
+	cpu_hotplug_driver_lock();
+	rc = dlpar_offline_cpu(dn);
+	if (rc) {
+		of_node_put(dn);
+		rc = -EINVAL;
+		goto out;
+	}
+
+	rc = dlpar_release_drc(*drc_index);
+	if (rc) {
+		of_node_put(dn);
+		goto out;
+	}
+
+	rc = dlpar_detach_node(dn);
+	if (rc) {
+		dlpar_acquire_drc(*drc_index);
+		goto out;
+	}
+
+	of_node_put(dn);
+out:
+	cpu_hotplug_driver_unlock();
+	return rc ? rc : count;
+}
+
+static int __init pseries_dlpar_init(void)
+{
+	ppc_md.cpu_probe = dlpar_cpu_probe;
+	ppc_md.cpu_release = dlpar_cpu_release;
+
+	return 0;
+}
+machine_device_initcall(pseries, pseries_dlpar_init);
+
+#endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */
@@ -0,0 +1,396 @@
+/*
+ * Virtual Processor Dispatch Trace Log
+ *
+ * (C) Copyright IBM Corporation 2009
+ *
+ * Author: Jeremy Kerr <jk@ozlabs.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/debugfs.h>
+#include <linux/spinlock.h>
+#include <asm/smp.h>
+#include <asm/uaccess.h>
+#include <asm/firmware.h>
+#include <asm/lppaca.h>
+#include <asm/debug.h>
+
+#include "plpar_wrappers.h"
+
+struct dtl {
+	struct dtl_entry	*buf;
+	struct dentry		*file;
+	int			cpu;
+	int			buf_entries;
+	u64			last_idx;
+	spinlock_t		lock;
+};
+static DEFINE_PER_CPU(struct dtl, cpu_dtl);
+
+/*
+ * Dispatch trace log event mask:
+ * 0x7: 0x1: voluntary virtual processor waits
+ *      0x2: time-slice preempts
+ *      0x4: virtual partition memory page faults
+ */
+static u8 dtl_event_mask = 0x7;
+
+
+/*
+ * Size of per-cpu log buffers. Firmware requires that the buffer does
+ * not cross a 4k boundary.
+ */
+static int dtl_buf_entries = N_DISPATCH_LOG;
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+struct dtl_ring {
+	u64	write_index;
+	struct dtl_entry *write_ptr;
+	struct dtl_entry *buf;
+	struct dtl_entry *buf_end;
+	u8	saved_dtl_mask;
+};
+
+static DEFINE_PER_CPU(struct dtl_ring, dtl_rings);
+
+static atomic_t dtl_count;
+
+/*
+ * The cpu accounting code controls the DTL ring buffer, and we get
+ * given entries as they are processed.
+ */
+static void consume_dtle(struct dtl_entry *dtle, u64 index)
+{
+	struct dtl_ring *dtlr = &__get_cpu_var(dtl_rings);
+	struct dtl_entry *wp = dtlr->write_ptr;
+	struct lppaca *vpa = local_paca->lppaca_ptr;
+
+	if (!wp)
+		return;
+
+	*wp = *dtle;
+	barrier();
+
+	/* check for hypervisor ring buffer overflow, ignore this entry if so */
+	if (index + N_DISPATCH_LOG < vpa->dtl_idx)
+		return;
+
+	++wp;
+	if (wp == dtlr->buf_end)
+		wp = dtlr->buf;
+	dtlr->write_ptr = wp;
+
+	/* incrementing write_index makes the new entry visible */
+	smp_wmb();
+	++dtlr->write_index;
+}
+
+static int dtl_start(struct dtl *dtl)
+{
+	struct dtl_ring *dtlr = &per_cpu(dtl_rings, dtl->cpu);
+
+	dtlr->buf = dtl->buf;
+	dtlr->buf_end = dtl->buf + dtl->buf_entries;
+	dtlr->write_index = 0;
+
+	/* setting write_ptr enables logging into our buffer */
+	smp_wmb();
+	dtlr->write_ptr = dtl->buf;
+
+	/* enable event logging */
+	dtlr->saved_dtl_mask = lppaca_of(dtl->cpu).dtl_enable_mask;
+	lppaca_of(dtl->cpu).dtl_enable_mask |= dtl_event_mask;
+
+	dtl_consumer = consume_dtle;
+	atomic_inc(&dtl_count);
+	return 0;
+}
+
+static void dtl_stop(struct dtl *dtl)
+{
+	struct dtl_ring *dtlr = &per_cpu(dtl_rings, dtl->cpu);
+
+	dtlr->write_ptr = NULL;
+	smp_wmb();
+
+	dtlr->buf = NULL;
+
+	/* restore dtl_enable_mask */
+	lppaca_of(dtl->cpu).dtl_enable_mask = dtlr->saved_dtl_mask;
+
+	if (atomic_dec_and_test(&dtl_count))
+		dtl_consumer = NULL;
+}
+
+static u64 dtl_current_index(struct dtl *dtl)
+{
+	return per_cpu(dtl_rings, dtl->cpu).write_index;
+}
+
+#else /* CONFIG_VIRT_CPU_ACCOUNTING */
+
+static int dtl_start(struct dtl *dtl)
+{
+	unsigned long addr;
+	int ret, hwcpu;
+
+	/* Register our dtl buffer with the hypervisor. The HV expects the
+	 * buffer size to be passed in the second word of the buffer */
+	((u32 *)dtl->buf)[1] = DISPATCH_LOG_BYTES;
+
+	hwcpu = get_hard_smp_processor_id(dtl->cpu);
+	addr = __pa(dtl->buf);
+	ret = register_dtl(hwcpu, addr);
+	if (ret) {
+		printk(KERN_WARNING "%s: DTL registration for cpu %d (hw %d) "
+		       "failed with %d\n", __func__, dtl->cpu, hwcpu, ret);
+		return -EIO;
+	}
+
+	/* set our initial buffer indices */
+	lppaca_of(dtl->cpu).dtl_idx = 0;
+
+	/* ensure that our updates to the lppaca fields have occurred before
+	 * we actually enable the logging */
+	smp_wmb();
+
+	/* enable event logging */
+	lppaca_of(dtl->cpu).dtl_enable_mask = dtl_event_mask;
+
+	return 0;
+}
+
+static void dtl_stop(struct dtl *dtl)
+{
+	int hwcpu = get_hard_smp_processor_id(dtl->cpu);
+
+	lppaca_of(dtl->cpu).dtl_enable_mask = 0x0;
+
+	unregister_dtl(hwcpu);
+}
+
+static u64 dtl_current_index(struct dtl *dtl)
+{
+	return lppaca_of(dtl->cpu).dtl_idx;
+}
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
+
+static int dtl_enable(struct dtl *dtl)
+{
+	long int n_entries;
+	long int rc;
+	struct dtl_entry *buf = NULL;
+
+	if (!dtl_cache)
+		return -ENOMEM;
+
+	/* only allow one reader */
+	if (dtl->buf)
+		return -EBUSY;
+
+	n_entries = dtl_buf_entries;
+	buf = kmem_cache_alloc_node(dtl_cache, GFP_KERNEL, cpu_to_node(dtl->cpu));
+	if (!buf) {
+		printk(KERN_WARNING "%s: buffer alloc failed for cpu %d\n",
+				__func__, dtl->cpu);
+		return -ENOMEM;
+	}
+
+	spin_lock(&dtl->lock);
+	rc = -EBUSY;
+	if (!dtl->buf) {
+		/* store the original allocation size for use during read */
+		dtl->buf_entries = n_entries;
+		dtl->buf = buf;
+		dtl->last_idx = 0;
+		rc = dtl_start(dtl);
+		if (rc)
+			dtl->buf = NULL;
+	}
+	spin_unlock(&dtl->lock);
+
+	if (rc)
+		kmem_cache_free(dtl_cache, buf);
+	return rc;
+}
+
+static void dtl_disable(struct dtl *dtl)
+{
+	spin_lock(&dtl->lock);
+	dtl_stop(dtl);
+	kmem_cache_free(dtl_cache, dtl->buf);
+	dtl->buf = NULL;
+	dtl->buf_entries = 0;
+	spin_unlock(&dtl->lock);
+}
+
+/* file interface */
+
+static int dtl_file_open(struct inode *inode, struct file *filp)
+{
+	struct dtl *dtl = inode->i_private;
+	int rc;
+
+	rc = dtl_enable(dtl);
+	if (rc)
+		return rc;
+
+	filp->private_data = dtl;
+	return 0;
+}
+
+static int dtl_file_release(struct inode *inode, struct file *filp)
+{
+	struct dtl *dtl = inode->i_private;
+	dtl_disable(dtl);
+	return 0;
+}
+
+static ssize_t dtl_file_read(struct file *filp, char __user *buf, size_t len,
+		loff_t *pos)
+{
+	long int rc, n_read, n_req, read_size;
+	struct dtl *dtl;
+	u64 cur_idx, last_idx, i;
+
+	if ((len % sizeof(struct dtl_entry)) != 0)
+		return -EINVAL;
+
+	dtl = filp->private_data;
+
+	/* requested number of entries to read */
+	n_req = len / sizeof(struct dtl_entry);
+
+	/* actual number of entries read */
+	n_read = 0;
+
+	spin_lock(&dtl->lock);
+
+	cur_idx = dtl_current_index(dtl);
+	last_idx = dtl->last_idx;
+
+	if (last_idx + dtl->buf_entries <= cur_idx)
+		last_idx = cur_idx - dtl->buf_entries + 1;
+
+	if (last_idx + n_req > cur_idx)
+		n_req = cur_idx - last_idx;
+
+	if (n_req > 0)
+		dtl->last_idx = last_idx + n_req;
+
+	spin_unlock(&dtl->lock);
+
+	if (n_req <= 0)
+		return 0;
+
+	i = last_idx % dtl->buf_entries;
+
+	/* read the tail of the buffer if we've wrapped */
+	if (i + n_req > dtl->buf_entries) {
+		read_size = dtl->buf_entries - i;
+
+		rc = copy_to_user(buf, &dtl->buf[i],
+				read_size * sizeof(struct dtl_entry));
+		if (rc)
+			return -EFAULT;
+
+		i = 0;
+		n_req -= read_size;
+		n_read += read_size;
+		buf += read_size * sizeof(struct dtl_entry);
+	}
+
+	/* .. and now the head */
+	rc = copy_to_user(buf, &dtl->buf[i], n_req * sizeof(struct dtl_entry));
+	if (rc)
+		return -EFAULT;
+
+	n_read += n_req;
+
+	return n_read * sizeof(struct dtl_entry);
+}
+
+static const struct file_operations dtl_fops = {
+	.open		= dtl_file_open,
+	.release	= dtl_file_release,
+	.read		= dtl_file_read,
+	.llseek		= no_llseek,
+};
+
+static struct dentry *dtl_dir;
+
+static int dtl_setup_file(struct dtl *dtl)
+{
+	char name[10];
+
+	sprintf(name, "cpu-%d", dtl->cpu);
+
+	dtl->file = debugfs_create_file(name, 0400, dtl_dir, dtl, &dtl_fops);
+	if (!dtl->file)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static int dtl_init(void)
+{
+	struct dentry *event_mask_file, *buf_entries_file;
+	int rc, i;
+
+	if (!firmware_has_feature(FW_FEATURE_SPLPAR))
+		return -ENODEV;
+
+	/* set up common debugfs structure */
+
+	rc = -ENOMEM;
+	dtl_dir = debugfs_create_dir("dtl", powerpc_debugfs_root);
+	if (!dtl_dir) {
+		printk(KERN_WARNING "%s: can't create dtl root dir\n",
+				__func__);
+		goto err;
+	}
+
+	event_mask_file = debugfs_create_x8("dtl_event_mask", 0600,
+				dtl_dir, &dtl_event_mask);
+	buf_entries_file = debugfs_create_u32("dtl_buf_entries", 0400,
+				dtl_dir, &dtl_buf_entries);
+
+	if (!event_mask_file || !buf_entries_file) {
+		printk(KERN_WARNING "%s: can't create dtl files\n", __func__);
+		goto err_remove_dir;
+	}
+
+	/* set up the per-cpu log structures */
+	for_each_possible_cpu(i) {
+		struct dtl *dtl = &per_cpu(cpu_dtl, i);
+		spin_lock_init(&dtl->lock);
+		dtl->cpu = i;
+
+		rc = dtl_setup_file(dtl);
+		if (rc)
+			goto err_remove_dir;
+	}
+
+	return 0;
+
+err_remove_dir:
+	debugfs_remove_recursive(dtl_dir);
+err:
+	return rc;
+}
+arch_initcall(dtl_init);
@@ -0,0 +1,318 @@
+/*
+ * PCI address cache; allows the lookup of PCI devices based on I/O address
+ *
+ * Copyright IBM Corporation 2004
+ * Copyright Linas Vepstas <linas@austin.ibm.com> 2004
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/list.h>
+#include <linux/pci.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/atomic.h>
+#include <asm/pci-bridge.h>
+#include <asm/ppc-pci.h>
+
+
+/**
+ * The pci address cache subsystem.  This subsystem places
+ * PCI device address resources into a red-black tree, sorted
+ * according to the address range, so that given only an i/o
+ * address, the corresponding PCI device can be **quickly**
+ * found. It is safe to perform an address lookup in an interrupt
+ * context; this ability is an important feature.
+ *
+ * Currently, the only customer of this code is the EEH subsystem;
+ * thus, this code has been somewhat tailored to suit EEH better.
+ * In particular, the cache does *not* hold the addresses of devices
+ * for which EEH is not enabled.
+ *
+ * (Implementation Note: The RB tree seems to be better/faster
+ * than any hash algo I could think of for this problem, even
+ * with the penalty of slow pointer chases for d-cache misses).
+ */
+struct pci_io_addr_range {
+	struct rb_node rb_node;
+	unsigned long addr_lo;
+	unsigned long addr_hi;
+	struct pci_dev *pcidev;
+	unsigned int flags;
+};
+
+static struct pci_io_addr_cache {
+	struct rb_root rb_root;
+	spinlock_t piar_lock;
+} pci_io_addr_cache_root;
+
+static inline struct pci_dev *__pci_addr_cache_get_device(unsigned long addr)
+{
+	struct rb_node *n = pci_io_addr_cache_root.rb_root.rb_node;
+
+	while (n) {
+		struct pci_io_addr_range *piar;
+		piar = rb_entry(n, struct pci_io_addr_range, rb_node);
+
+		if (addr < piar->addr_lo) {
+			n = n->rb_left;
+		} else {
+			if (addr > piar->addr_hi) {
+				n = n->rb_right;
+			} else {
+				pci_dev_get(piar->pcidev);
+				return piar->pcidev;
+			}
+		}
+	}
+
+	return NULL;
+}
+
+/**
+ * pci_addr_cache_get_device - Get device, given only address
+ * @addr: mmio (PIO) phys address or i/o port number
+ *
+ * Given an mmio phys address, or a port number, find a pci device
+ * that implements this address.  Be sure to pci_dev_put the device
+ * when finished.  I/O port numbers are assumed to be offset
+ * from zero (that is, they do *not* have pci_io_addr added in).
+ * It is safe to call this function within an interrupt.
+ */
+struct pci_dev *pci_addr_cache_get_device(unsigned long addr)
+{
+	struct pci_dev *dev;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags);
+	dev = __pci_addr_cache_get_device(addr);
+	spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags);
+	return dev;
+}
+
+#ifdef DEBUG
+/*
+ * Handy-dandy debug print routine, does nothing more
+ * than print out the contents of our addr cache.
+ */
+static void pci_addr_cache_print(struct pci_io_addr_cache *cache)
+{
+	struct rb_node *n;
+	int cnt = 0;
+
+	n = rb_first(&cache->rb_root);
+	while (n) {
+		struct pci_io_addr_range *piar;
+		piar = rb_entry(n, struct pci_io_addr_range, rb_node);
+		printk(KERN_DEBUG "PCI: %s addr range %d [%lx-%lx]: %s\n",
+		       (piar->flags & IORESOURCE_IO) ? "i/o" : "mem", cnt,
+		       piar->addr_lo, piar->addr_hi, pci_name(piar->pcidev));
+		cnt++;
+		n = rb_next(n);
+	}
+}
+#endif
+
+/* Insert address range into the rb tree. */
+static struct pci_io_addr_range *
+pci_addr_cache_insert(struct pci_dev *dev, unsigned long alo,
+		      unsigned long ahi, unsigned int flags)
+{
+	struct rb_node **p = &pci_io_addr_cache_root.rb_root.rb_node;
+	struct rb_node *parent = NULL;
+	struct pci_io_addr_range *piar;
+
+	/* Walk tree, find a place to insert into tree */
+	while (*p) {
+		parent = *p;
+		piar = rb_entry(parent, struct pci_io_addr_range, rb_node);
+		if (ahi < piar->addr_lo) {
+			p = &parent->rb_left;
+		} else if (alo > piar->addr_hi) {
+			p = &parent->rb_right;
+		} else {
+			if (dev != piar->pcidev ||
+			    alo != piar->addr_lo || ahi != piar->addr_hi) {
+				printk(KERN_WARNING "PIAR: overlapping address range\n");
+			}
+			return piar;
+		}
+	}
+	piar = kmalloc(sizeof(struct pci_io_addr_range), GFP_ATOMIC);
+	if (!piar)
+		return NULL;
+
+	pci_dev_get(dev);
+	piar->addr_lo = alo;
+	piar->addr_hi = ahi;
+	piar->pcidev = dev;
+	piar->flags = flags;
+
+#ifdef DEBUG
+	printk(KERN_DEBUG "PIAR: insert range=[%lx:%lx] dev=%s\n",
+	                  alo, ahi, pci_name(dev));
+#endif
+
+	rb_link_node(&piar->rb_node, parent, p);
+	rb_insert_color(&piar->rb_node, &pci_io_addr_cache_root.rb_root);
+
+	return piar;
+}
+
+static void __pci_addr_cache_insert_device(struct pci_dev *dev)
+{
+	struct device_node *dn;
+	struct eeh_dev *edev;
+	int i;
+
+	dn = pci_device_to_OF_node(dev);
+	if (!dn) {
+		printk(KERN_WARNING "PCI: no pci dn found for dev=%s\n", pci_name(dev));
+		return;
+	}
+
+	edev = of_node_to_eeh_dev(dn);
+	if (!edev) {
+		pr_warning("PCI: no EEH dev found for dn=%s\n",
+			dn->full_name);
+		return;
+	}
+
+	/* Skip any devices for which EEH is not enabled. */
+	if (!(edev->mode & EEH_MODE_SUPPORTED) ||
+	    edev->mode & EEH_MODE_NOCHECK) {
+#ifdef DEBUG
+		pr_info("PCI: skip building address cache for=%s - %s\n",
+			pci_name(dev), dn->full_name);
+#endif
+		return;
+	}
+
+	/* Walk resources on this device, poke them into the tree */
+	for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
+		unsigned long start = pci_resource_start(dev,i);
+		unsigned long end = pci_resource_end(dev,i);
+		unsigned int flags = pci_resource_flags(dev,i);
+
+		/* We are interested only bus addresses, not dma or other stuff */
+		if (0 == (flags & (IORESOURCE_IO | IORESOURCE_MEM)))
+			continue;
+		if (start == 0 || ~start == 0 || end == 0 || ~end == 0)
+			 continue;
+		pci_addr_cache_insert(dev, start, end, flags);
+	}
+}
+
+/**
+ * pci_addr_cache_insert_device - Add a device to the address cache
+ * @dev: PCI device whose I/O addresses we are interested in.
+ *
+ * In order to support the fast lookup of devices based on addresses,
+ * we maintain a cache of devices that can be quickly searched.
+ * This routine adds a device to that cache.
+ */
+void pci_addr_cache_insert_device(struct pci_dev *dev)
+{
+	unsigned long flags;
+
+	/* Ignore PCI bridges */
+	if ((dev->class >> 16) == PCI_BASE_CLASS_BRIDGE)
+		return;
+
+	spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags);
+	__pci_addr_cache_insert_device(dev);
+	spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags);
+}
+
+static inline void __pci_addr_cache_remove_device(struct pci_dev *dev)
+{
+	struct rb_node *n;
+
+restart:
+	n = rb_first(&pci_io_addr_cache_root.rb_root);
+	while (n) {
+		struct pci_io_addr_range *piar;
+		piar = rb_entry(n, struct pci_io_addr_range, rb_node);
+
+		if (piar->pcidev == dev) {
+			rb_erase(n, &pci_io_addr_cache_root.rb_root);
+			pci_dev_put(piar->pcidev);
+			kfree(piar);
+			goto restart;
+		}
+		n = rb_next(n);
+	}
+}
+
+/**
+ * pci_addr_cache_remove_device - remove pci device from addr cache
+ * @dev: device to remove
+ *
+ * Remove a device from the addr-cache tree.
+ * This is potentially expensive, since it will walk
+ * the tree multiple times (once per resource).
+ * But so what; device removal doesn't need to be that fast.
+ */
+void pci_addr_cache_remove_device(struct pci_dev *dev)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags);
+	__pci_addr_cache_remove_device(dev);
+	spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags);
+}
+
+/**
+ * pci_addr_cache_build - Build a cache of I/O addresses
+ *
+ * Build a cache of pci i/o addresses.  This cache will be used to
+ * find the pci device that corresponds to a given address.
+ * This routine scans all pci busses to build the cache.
+ * Must be run late in boot process, after the pci controllers
+ * have been scanned for devices (after all device resources are known).
+ */
+void __init pci_addr_cache_build(void)
+{
+	struct device_node *dn;
+	struct eeh_dev *edev;
+	struct pci_dev *dev = NULL;
+
+	spin_lock_init(&pci_io_addr_cache_root.piar_lock);
+
+	for_each_pci_dev(dev) {
+		pci_addr_cache_insert_device(dev);
+
+		dn = pci_device_to_OF_node(dev);
+		if (!dn)
+			continue;
+
+		edev = of_node_to_eeh_dev(dn);
+		if (!edev)
+			continue;
+
+		pci_dev_get(dev);  /* matching put is in eeh_remove_device() */
+		dev->dev.archdata.edev = edev;
+		edev->pdev = dev;
+
+		eeh_sysfs_add_device(dev);
+	}
+
+#ifdef DEBUG
+	/* Verify tree built up above, echo back the list of addrs. */
+	pci_addr_cache_print(&pci_io_addr_cache_root);
+#endif
+}
+
@@ -0,0 +1,102 @@
+/*
+ * The file intends to implement dynamic creation of EEH device, which will
+ * be bound with OF node and PCI device simutaneously. The EEH devices would
+ * be foundamental information for EEH core components to work proerly. Besides,
+ * We have to support multiple situations where dynamic creation of EEH device
+ * is required:
+ *
+ * 1) Before PCI emunation starts, we need create EEH devices according to the
+ *    PCI sensitive OF nodes.
+ * 2) When PCI emunation is done, we need do the binding between PCI device and
+ *    the associated EEH device.
+ * 3) DR (Dynamic Reconfiguration) would create PCI sensitive OF node. EEH device
+ *    will be created while PCI sensitive OF node is detected from DR.
+ * 4) PCI hotplug needs redoing the binding between PCI device and EEH device. If
+ *    PHB is newly inserted, we also need create EEH devices accordingly.
+ *
+ * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2012.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/export.h>
+#include <linux/gfp.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/string.h>
+
+#include <asm/pci-bridge.h>
+#include <asm/ppc-pci.h>
+
+/**
+ * eeh_dev_init - Create EEH device according to OF node
+ * @dn: device node
+ * @data: PHB
+ *
+ * It will create EEH device according to the given OF node. The function
+ * might be called by PCI emunation, DR, PHB hotplug.
+ */
+void * __devinit eeh_dev_init(struct device_node *dn, void *data)
+{
+	struct pci_controller *phb = data;
+	struct eeh_dev *edev;
+
+	/* Allocate EEH device */
+	edev = zalloc_maybe_bootmem(sizeof(*edev), GFP_KERNEL);
+	if (!edev) {
+		pr_warning("%s: out of memory\n", __func__);
+		return NULL;
+	}
+
+	/* Associate EEH device with OF node */
+	PCI_DN(dn)->edev = edev;
+	edev->dn  = dn;
+	edev->phb = phb;
+
+	return NULL;
+}
+
+/**
+ * eeh_dev_phb_init_dynamic - Create EEH devices for devices included in PHB
+ * @phb: PHB
+ *
+ * Scan the PHB OF node and its child association, then create the
+ * EEH devices accordingly
+ */
+void __devinit eeh_dev_phb_init_dynamic(struct pci_controller *phb)
+{
+	struct device_node *dn = phb->dn;
+
+	/* EEH device for PHB */
+	eeh_dev_init(dn, phb);
+
+	/* EEH devices for children OF nodes */
+	traverse_pci_devices(dn, eeh_dev_init, phb);
+}
+
+/**
+ * eeh_dev_phb_init - Create EEH devices for devices included in existing PHBs
+ *
+ * Scan all the existing PHBs and create EEH devices for their OF
+ * nodes and their children OF nodes
+ */
+void __init eeh_dev_phb_init(void)
+{
+	struct pci_controller *phb, *tmp;
+
+	list_for_each_entry_safe(phb, tmp, &hose_list, list_node)
+		eeh_dev_phb_init_dynamic(phb);
+}
@@ -0,0 +1,538 @@
+/*
+ * PCI Error Recovery Driver for RPA-compliant PPC64 platform.
+ * Copyright IBM Corp. 2004 2005
+ * Copyright Linas Vepstas <linas@linas.org> 2004, 2005
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com>
+ */
+#include <linux/delay.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/pci.h>
+#include <asm/eeh.h>
+#include <asm/eeh_event.h>
+#include <asm/ppc-pci.h>
+#include <asm/pci-bridge.h>
+#include <asm/prom.h>
+#include <asm/rtas.h>
+
+/**
+ * eeh_pcid_name - Retrieve name of PCI device driver
+ * @pdev: PCI device
+ *
+ * This routine is used to retrieve the name of PCI device driver
+ * if that's valid.
+ */
+static inline const char *eeh_pcid_name(struct pci_dev *pdev)
+{
+	if (pdev && pdev->dev.driver)
+		return pdev->dev.driver->name;
+	return "";
+}
+
+#if 0
+static void print_device_node_tree(struct pci_dn *pdn, int dent)
+{
+	int i;
+	struct device_node *pc;
+
+	if (!pdn)
+		return;
+	for (i = 0; i < dent; i++)
+		printk(" ");
+	printk("dn=%s mode=%x \tcfg_addr=%x pe_addr=%x \tfull=%s\n",
+		pdn->node->name, pdn->eeh_mode, pdn->eeh_config_addr,
+		pdn->eeh_pe_config_addr, pdn->node->full_name);
+	dent += 3;
+	pc = pdn->node->child;
+	while (pc) {
+		print_device_node_tree(PCI_DN(pc), dent);
+		pc = pc->sibling;
+	}
+}
+#endif
+
+/**
+ * eeh_disable_irq - Disable interrupt for the recovering device
+ * @dev: PCI device
+ *
+ * This routine must be called when reporting temporary or permanent
+ * error to the particular PCI device to disable interrupt of that
+ * device. If the device has enabled MSI or MSI-X interrupt, we needn't
+ * do real work because EEH should freeze DMA transfers for those PCI
+ * devices encountering EEH errors, which includes MSI or MSI-X.
+ */
+static void eeh_disable_irq(struct pci_dev *dev)
+{
+	struct eeh_dev *edev = pci_dev_to_eeh_dev(dev);
+
+	/* Don't disable MSI and MSI-X interrupts. They are
+	 * effectively disabled by the DMA Stopped state
+	 * when an EEH error occurs.
+	 */
+	if (dev->msi_enabled || dev->msix_enabled)
+		return;
+
+	if (!irq_has_action(dev->irq))
+		return;
+
+	edev->mode |= EEH_MODE_IRQ_DISABLED;
+	disable_irq_nosync(dev->irq);
+}
+
+/**
+ * eeh_enable_irq - Enable interrupt for the recovering device
+ * @dev: PCI device
+ *
+ * This routine must be called to enable interrupt while failed
+ * device could be resumed.
+ */
+static void eeh_enable_irq(struct pci_dev *dev)
+{
+	struct eeh_dev *edev = pci_dev_to_eeh_dev(dev);
+
+	if ((edev->mode) & EEH_MODE_IRQ_DISABLED) {
+		edev->mode &= ~EEH_MODE_IRQ_DISABLED;
+		enable_irq(dev->irq);
+	}
+}
+
+/**
+ * eeh_report_error - Report pci error to each device driver
+ * @dev: PCI device
+ * @userdata: return value
+ * 
+ * Report an EEH error to each device driver, collect up and 
+ * merge the device driver responses. Cumulative response 
+ * passed back in "userdata".
+ */
+static int eeh_report_error(struct pci_dev *dev, void *userdata)
+{
+	enum pci_ers_result rc, *res = userdata;
+	struct pci_driver *driver = dev->driver;
+
+	dev->error_state = pci_channel_io_frozen;
+
+	if (!driver)
+		return 0;
+
+	eeh_disable_irq(dev);
+
+	if (!driver->err_handler ||
+	    !driver->err_handler->error_detected)
+		return 0;
+
+	rc = driver->err_handler->error_detected(dev, pci_channel_io_frozen);
+
+	/* A driver that needs a reset trumps all others */
+	if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
+	if (*res == PCI_ERS_RESULT_NONE) *res = rc;
+
+	return 0;
+}
+
+/**
+ * eeh_report_mmio_enabled - Tell drivers that MMIO has been enabled
+ * @dev: PCI device
+ * @userdata: return value
+ *
+ * Tells each device driver that IO ports, MMIO and config space I/O
+ * are now enabled. Collects up and merges the device driver responses.
+ * Cumulative response passed back in "userdata".
+ */
+static int eeh_report_mmio_enabled(struct pci_dev *dev, void *userdata)
+{
+	enum pci_ers_result rc, *res = userdata;
+	struct pci_driver *driver = dev->driver;
+
+	if (!driver ||
+	    !driver->err_handler ||
+	    !driver->err_handler->mmio_enabled)
+		return 0;
+
+	rc = driver->err_handler->mmio_enabled(dev);
+
+	/* A driver that needs a reset trumps all others */
+	if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
+	if (*res == PCI_ERS_RESULT_NONE) *res = rc;
+
+	return 0;
+}
+
+/**
+ * eeh_report_reset - Tell device that slot has been reset
+ * @dev: PCI device
+ * @userdata: return value
+ *
+ * This routine must be called while EEH tries to reset particular
+ * PCI device so that the associated PCI device driver could take
+ * some actions, usually to save data the driver needs so that the
+ * driver can work again while the device is recovered.
+ */
+static int eeh_report_reset(struct pci_dev *dev, void *userdata)
+{
+	enum pci_ers_result rc, *res = userdata;
+	struct pci_driver *driver = dev->driver;
+
+	if (!driver)
+		return 0;
+
+	dev->error_state = pci_channel_io_normal;
+
+	eeh_enable_irq(dev);
+
+	if (!driver->err_handler ||
+	    !driver->err_handler->slot_reset)
+		return 0;
+
+	rc = driver->err_handler->slot_reset(dev);
+	if ((*res == PCI_ERS_RESULT_NONE) ||
+	    (*res == PCI_ERS_RESULT_RECOVERED)) *res = rc;
+	if (*res == PCI_ERS_RESULT_DISCONNECT &&
+	     rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
+
+	return 0;
+}
+
+/**
+ * eeh_report_resume - Tell device to resume normal operations
+ * @dev: PCI device
+ * @userdata: return value
+ *
+ * This routine must be called to notify the device driver that it
+ * could resume so that the device driver can do some initialization
+ * to make the recovered device work again.
+ */
+static int eeh_report_resume(struct pci_dev *dev, void *userdata)
+{
+	struct pci_driver *driver = dev->driver;
+
+	dev->error_state = pci_channel_io_normal;
+
+	if (!driver)
+		return 0;
+
+	eeh_enable_irq(dev);
+
+	if (!driver->err_handler ||
+	    !driver->err_handler->resume)
+		return 0;
+
+	driver->err_handler->resume(dev);
+
+	return 0;
+}
+
+/**
+ * eeh_report_failure - Tell device driver that device is dead.
+ * @dev: PCI device
+ * @userdata: return value
+ *
+ * This informs the device driver that the device is permanently
+ * dead, and that no further recovery attempts will be made on it.
+ */
+static int eeh_report_failure(struct pci_dev *dev, void *userdata)
+{
+	struct pci_driver *driver = dev->driver;
+
+	dev->error_state = pci_channel_io_perm_failure;
+
+	if (!driver)
+		return 0;
+
+	eeh_disable_irq(dev);
+
+	if (!driver->err_handler ||
+	    !driver->err_handler->error_detected)
+		return 0;
+
+	driver->err_handler->error_detected(dev, pci_channel_io_perm_failure);
+
+	return 0;
+}
+
+/**
+ * eeh_reset_device - Perform actual reset of a pci slot
+ * @edev: PE associated EEH device
+ * @bus: PCI bus corresponding to the isolcated slot
+ *
+ * This routine must be called to do reset on the indicated PE.
+ * During the reset, udev might be invoked because those affected
+ * PCI devices will be removed and then added.
+ */
+static int eeh_reset_device(struct eeh_dev *edev, struct pci_bus *bus)
+{
+	struct device_node *dn;
+	int cnt, rc;
+
+	/* pcibios will clear the counter; save the value */
+	cnt = edev->freeze_count;
+
+	if (bus)
+		pcibios_remove_pci_devices(bus);
+
+	/* Reset the pci controller. (Asserts RST#; resets config space).
+	 * Reconfigure bridges and devices. Don't try to bring the system
+	 * up if the reset failed for some reason.
+	 */
+	rc = eeh_reset_pe(edev);
+	if (rc)
+		return rc;
+
+	/* Walk over all functions on this device. */
+	dn = eeh_dev_to_of_node(edev);
+	if (!pcibios_find_pci_bus(dn) && of_node_to_eeh_dev(dn->parent))
+		dn = dn->parent->child;
+
+	while (dn) {
+		struct eeh_dev *pedev = of_node_to_eeh_dev(dn);
+
+		/* On Power4, always true because eeh_pe_config_addr=0 */
+		if (edev->pe_config_addr == pedev->pe_config_addr) {
+			eeh_ops->configure_bridge(dn);
+			eeh_restore_bars(pedev);
+ 		}
+		dn = dn->sibling;
+	}
+
+	/* Give the system 5 seconds to finish running the user-space
+	 * hotplug shutdown scripts, e.g. ifdown for ethernet.  Yes, 
+	 * this is a hack, but if we don't do this, and try to bring 
+	 * the device up before the scripts have taken it down, 
+	 * potentially weird things happen.
+	 */
+	if (bus) {
+		ssleep(5);
+		pcibios_add_pci_devices(bus);
+	}
+	edev->freeze_count = cnt;
+
+	return 0;
+}
+
+/* The longest amount of time to wait for a pci device
+ * to come back on line, in seconds.
+ */
+#define MAX_WAIT_FOR_RECOVERY 150
+
+/**
+ * eeh_handle_event - Reset a PCI device after hard lockup.
+ * @event: EEH event
+ *
+ * While PHB detects address or data parity errors on particular PCI
+ * slot, the associated PE will be frozen. Besides, DMA's occurring
+ * to wild addresses (which usually happen due to bugs in device
+ * drivers or in PCI adapter firmware) can cause EEH error. #SERR,
+ * #PERR or other misc PCI-related errors also can trigger EEH errors.
+ *
+ * Recovery process consists of unplugging the device driver (which
+ * generated hotplug events to userspace), then issuing a PCI #RST to
+ * the device, then reconfiguring the PCI config space for all bridges
+ * & devices under this slot, and then finally restarting the device
+ * drivers (which cause a second set of hotplug events to go out to
+ * userspace).
+ */
+struct eeh_dev *handle_eeh_events(struct eeh_event *event)
+{
+	struct device_node *frozen_dn;
+	struct eeh_dev *frozen_edev;
+	struct pci_bus *frozen_bus;
+	int rc = 0;
+	enum pci_ers_result result = PCI_ERS_RESULT_NONE;
+	const char *location, *pci_str, *drv_str, *bus_pci_str, *bus_drv_str;
+
+	frozen_dn = eeh_find_device_pe(eeh_dev_to_of_node(event->edev));
+	if (!frozen_dn) {
+		location = of_get_property(eeh_dev_to_of_node(event->edev), "ibm,loc-code", NULL);
+		location = location ? location : "unknown";
+		printk(KERN_ERR "EEH: Error: Cannot find partition endpoint "
+		                "for location=%s pci addr=%s\n",
+			location, eeh_pci_name(eeh_dev_to_pci_dev(event->edev)));
+		return NULL;
+	}
+
+	frozen_bus = pcibios_find_pci_bus(frozen_dn);
+	location = of_get_property(frozen_dn, "ibm,loc-code", NULL);
+	location = location ? location : "unknown";
+
+	/* There are two different styles for coming up with the PE.
+	 * In the old style, it was the highest EEH-capable device
+	 * which was always an EADS pci bridge.  In the new style,
+	 * there might not be any EADS bridges, and even when there are,
+	 * the firmware marks them as "EEH incapable". So another
+	 * two-step is needed to find the pci bus..
+	 */
+	if (!frozen_bus)
+		frozen_bus = pcibios_find_pci_bus(frozen_dn->parent);
+
+	if (!frozen_bus) {
+		printk(KERN_ERR "EEH: Cannot find PCI bus "
+		        "for location=%s dn=%s\n",
+		        location, frozen_dn->full_name);
+		return NULL;
+	}
+
+	frozen_edev = of_node_to_eeh_dev(frozen_dn);
+	frozen_edev->freeze_count++;
+	pci_str = eeh_pci_name(eeh_dev_to_pci_dev(event->edev));
+	drv_str = eeh_pcid_name(eeh_dev_to_pci_dev(event->edev));
+
+	if (frozen_edev->freeze_count > EEH_MAX_ALLOWED_FREEZES)
+		goto excess_failures;
+
+	printk(KERN_WARNING
+	   "EEH: This PCI device has failed %d times in the last hour:\n",
+		frozen_edev->freeze_count);
+
+	if (frozen_edev->pdev) {
+		bus_pci_str = pci_name(frozen_edev->pdev);
+		bus_drv_str = eeh_pcid_name(frozen_edev->pdev);
+		printk(KERN_WARNING
+			"EEH: Bus location=%s driver=%s pci addr=%s\n",
+			location, bus_drv_str, bus_pci_str);
+	}
+
+	printk(KERN_WARNING
+		"EEH: Device location=%s driver=%s pci addr=%s\n",
+		location, drv_str, pci_str);
+
+	/* Walk the various device drivers attached to this slot through
+	 * a reset sequence, giving each an opportunity to do what it needs
+	 * to accomplish the reset.  Each child gets a report of the
+	 * status ... if any child can't handle the reset, then the entire
+	 * slot is dlpar removed and added.
+	 */
+	pci_walk_bus(frozen_bus, eeh_report_error, &result);
+
+	/* Get the current PCI slot state. This can take a long time,
+	 * sometimes over 3 seconds for certain systems.
+	 */
+	rc = eeh_ops->wait_state(eeh_dev_to_of_node(frozen_edev), MAX_WAIT_FOR_RECOVERY*1000);
+	if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) {
+		printk(KERN_WARNING "EEH: Permanent failure\n");
+		goto hard_fail;
+	}
+
+	/* Since rtas may enable MMIO when posting the error log,
+	 * don't post the error log until after all dev drivers
+	 * have been informed.
+	 */
+	eeh_slot_error_detail(frozen_edev, EEH_LOG_TEMP);
+
+	/* If all device drivers were EEH-unaware, then shut
+	 * down all of the device drivers, and hope they
+	 * go down willingly, without panicing the system.
+	 */
+	if (result == PCI_ERS_RESULT_NONE) {
+		rc = eeh_reset_device(frozen_edev, frozen_bus);
+		if (rc) {
+			printk(KERN_WARNING "EEH: Unable to reset, rc=%d\n", rc);
+			goto hard_fail;
+		}
+	}
+
+	/* If all devices reported they can proceed, then re-enable MMIO */
+	if (result == PCI_ERS_RESULT_CAN_RECOVER) {
+		rc = eeh_pci_enable(frozen_edev, EEH_OPT_THAW_MMIO);
+
+		if (rc < 0)
+			goto hard_fail;
+		if (rc) {
+			result = PCI_ERS_RESULT_NEED_RESET;
+		} else {
+			result = PCI_ERS_RESULT_NONE;
+			pci_walk_bus(frozen_bus, eeh_report_mmio_enabled, &result);
+		}
+	}
+
+	/* If all devices reported they can proceed, then re-enable DMA */
+	if (result == PCI_ERS_RESULT_CAN_RECOVER) {
+		rc = eeh_pci_enable(frozen_edev, EEH_OPT_THAW_DMA);
+
+		if (rc < 0)
+			goto hard_fail;
+		if (rc)
+			result = PCI_ERS_RESULT_NEED_RESET;
+		else
+			result = PCI_ERS_RESULT_RECOVERED;
+	}
+
+	/* If any device has a hard failure, then shut off everything. */
+	if (result == PCI_ERS_RESULT_DISCONNECT) {
+		printk(KERN_WARNING "EEH: Device driver gave up\n");
+		goto hard_fail;
+	}
+
+	/* If any device called out for a reset, then reset the slot */
+	if (result == PCI_ERS_RESULT_NEED_RESET) {
+		rc = eeh_reset_device(frozen_edev, NULL);
+		if (rc) {
+			printk(KERN_WARNING "EEH: Cannot reset, rc=%d\n", rc);
+			goto hard_fail;
+		}
+		result = PCI_ERS_RESULT_NONE;
+		pci_walk_bus(frozen_bus, eeh_report_reset, &result);
+	}
+
+	/* All devices should claim they have recovered by now. */
+	if ((result != PCI_ERS_RESULT_RECOVERED) &&
+	    (result != PCI_ERS_RESULT_NONE)) {
+		printk(KERN_WARNING "EEH: Not recovered\n");
+		goto hard_fail;
+	}
+
+	/* Tell all device drivers that they can resume operations */
+	pci_walk_bus(frozen_bus, eeh_report_resume, NULL);
+
+	return frozen_edev;
+	
+excess_failures:
+	/*
+	 * About 90% of all real-life EEH failures in the field
+	 * are due to poorly seated PCI cards. Only 10% or so are
+	 * due to actual, failed cards.
+	 */
+	printk(KERN_ERR
+	   "EEH: PCI device at location=%s driver=%s pci addr=%s\n"
+		"has failed %d times in the last hour "
+		"and has been permanently disabled.\n"
+		"Please try reseating this device or replacing it.\n",
+		location, drv_str, pci_str, frozen_edev->freeze_count);
+	goto perm_error;
+
+hard_fail:
+	printk(KERN_ERR
+	   "EEH: Unable to recover from failure of PCI device "
+	   "at location=%s driver=%s pci addr=%s\n"
+	   "Please try reseating this device or replacing it.\n",
+		location, drv_str, pci_str);
+
+perm_error:
+	eeh_slot_error_detail(frozen_edev, EEH_LOG_PERM);
+
+	/* Notify all devices that they're about to go down. */
+	pci_walk_bus(frozen_bus, eeh_report_failure, NULL);
+
+	/* Shut down the device drivers for good. */
+	pcibios_remove_pci_devices(frozen_bus);
+
+	return NULL;
+}
+
@@ -0,0 +1,159 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ *
+ * Copyright (c) 2005 Linas Vepstas <linas@linas.org>
+ */
+
+#include <linux/delay.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include <asm/eeh_event.h>
+#include <asm/ppc-pci.h>
+
+/** Overview:
+ *  EEH error states may be detected within exception handlers;
+ *  however, the recovery processing needs to occur asynchronously
+ *  in a normal kernel context and not an interrupt context.
+ *  This pair of routines creates an event and queues it onto a
+ *  work-queue, where a worker thread can drive recovery.
+ */
+
+/* EEH event workqueue setup. */
+static DEFINE_SPINLOCK(eeh_eventlist_lock);
+LIST_HEAD(eeh_eventlist);
+static void eeh_thread_launcher(struct work_struct *);
+DECLARE_WORK(eeh_event_wq, eeh_thread_launcher);
+
+/* Serialize reset sequences for a given pci device */
+DEFINE_MUTEX(eeh_event_mutex);
+
+/**
+ * eeh_event_handler - Dispatch EEH events.
+ * @dummy - unused
+ *
+ * The detection of a frozen slot can occur inside an interrupt,
+ * where it can be hard to do anything about it.  The goal of this
+ * routine is to pull these detection events out of the context
+ * of the interrupt handler, and re-dispatch them for processing
+ * at a later time in a normal context.
+ */
+static int eeh_event_handler(void * dummy)
+{
+	unsigned long flags;
+	struct eeh_event *event;
+	struct eeh_dev *edev;
+
+	set_task_comm(current, "eehd");
+
+	spin_lock_irqsave(&eeh_eventlist_lock, flags);
+	event = NULL;
+
+	/* Unqueue the event, get ready to process. */
+	if (!list_empty(&eeh_eventlist)) {
+		event = list_entry(eeh_eventlist.next, struct eeh_event, list);
+		list_del(&event->list);
+	}
+	spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
+
+	if (event == NULL)
+		return 0;
+
+	/* Serialize processing of EEH events */
+	mutex_lock(&eeh_event_mutex);
+	edev = event->edev;
+	eeh_mark_slot(eeh_dev_to_of_node(edev), EEH_MODE_RECOVERING);
+
+	printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n",
+	       eeh_pci_name(edev->pdev));
+
+	set_current_state(TASK_INTERRUPTIBLE);	/* Don't add to load average */
+	edev = handle_eeh_events(event);
+
+	eeh_clear_slot(eeh_dev_to_of_node(edev), EEH_MODE_RECOVERING);
+	pci_dev_put(edev->pdev);
+
+	kfree(event);
+	mutex_unlock(&eeh_event_mutex);
+
+	/* If there are no new errors after an hour, clear the counter. */
+	if (edev && edev->freeze_count>0) {
+		msleep_interruptible(3600*1000);
+		if (edev->freeze_count>0)
+			edev->freeze_count--;
+
+	}
+
+	return 0;
+}
+
+/**
+ * eeh_thread_launcher - Start kernel thread to handle EEH events
+ * @dummy - unused
+ *
+ * This routine is called to start the kernel thread for processing
+ * EEH event.
+ */
+static void eeh_thread_launcher(struct work_struct *dummy)
+{
+	if (kernel_thread(eeh_event_handler, NULL, CLONE_KERNEL) < 0)
+		printk(KERN_ERR "Failed to start EEH daemon\n");
+}
+
+/**
+ * eeh_send_failure_event - Generate a PCI error event
+ * @edev: EEH device
+ *
+ * This routine can be called within an interrupt context;
+ * the actual event will be delivered in a normal context
+ * (from a workqueue).
+ */
+int eeh_send_failure_event(struct eeh_dev *edev)
+{
+	unsigned long flags;
+	struct eeh_event *event;
+	struct device_node *dn = eeh_dev_to_of_node(edev);
+	const char *location;
+
+	if (!mem_init_done) {
+		printk(KERN_ERR "EEH: event during early boot not handled\n");
+		location = of_get_property(dn, "ibm,loc-code", NULL);
+		printk(KERN_ERR "EEH: device node = %s\n", dn->full_name);
+		printk(KERN_ERR "EEH: PCI location = %s\n", location);
+		return 1;
+	}
+	event = kmalloc(sizeof(*event), GFP_ATOMIC);
+	if (event == NULL) {
+		printk(KERN_ERR "EEH: out of memory, event not handled\n");
+		return 1;
+ 	}
+
+	if (edev->pdev)
+		pci_dev_get(edev->pdev);
+
+	event->edev = edev;
+
+	/* We may or may not be called in an interrupt context */
+	spin_lock_irqsave(&eeh_eventlist_lock, flags);
+	list_add(&event->list, &eeh_eventlist);
+	spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
+
+	schedule_work(&eeh_event_wq);
+
+	return 0;
+}
@@ -0,0 +1,565 @@
+/*
+ * The file intends to implement the platform dependent EEH operations on pseries.
+ * Actually, the pseries platform is built based on RTAS heavily. That means the
+ * pseries platform dependent EEH operations will be built on RTAS calls. The functions
+ * are devired from arch/powerpc/platforms/pseries/eeh.c and necessary cleanup has
+ * been done.
+ *
+ * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2011.
+ * Copyright IBM Corporation 2001, 2005, 2006
+ * Copyright Dave Engebretsen & Todd Inglett 2001
+ * Copyright Linas Vepstas 2005, 2006
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/atomic.h>
+#include <linux/delay.h>
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/of.h>
+#include <linux/pci.h>
+#include <linux/proc_fs.h>
+#include <linux/rbtree.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+
+#include <asm/eeh.h>
+#include <asm/eeh_event.h>
+#include <asm/io.h>
+#include <asm/machdep.h>
+#include <asm/ppc-pci.h>
+#include <asm/rtas.h>
+
+/* RTAS tokens */
+static int ibm_set_eeh_option;
+static int ibm_set_slot_reset;
+static int ibm_read_slot_reset_state;
+static int ibm_read_slot_reset_state2;
+static int ibm_slot_error_detail;
+static int ibm_get_config_addr_info;
+static int ibm_get_config_addr_info2;
+static int ibm_configure_bridge;
+static int ibm_configure_pe;
+
+/*
+ * Buffer for reporting slot-error-detail rtas calls. Its here
+ * in BSS, and not dynamically alloced, so that it ends up in
+ * RMO where RTAS can access it.
+ */
+static unsigned char slot_errbuf[RTAS_ERROR_LOG_MAX];
+static DEFINE_SPINLOCK(slot_errbuf_lock);
+static int eeh_error_buf_size;
+
+/**
+ * pseries_eeh_init - EEH platform dependent initialization
+ *
+ * EEH platform dependent initialization on pseries.
+ */
+static int pseries_eeh_init(void)
+{
+	/* figure out EEH RTAS function call tokens */
+	ibm_set_eeh_option		= rtas_token("ibm,set-eeh-option");
+	ibm_set_slot_reset		= rtas_token("ibm,set-slot-reset");
+	ibm_read_slot_reset_state2	= rtas_token("ibm,read-slot-reset-state2");
+	ibm_read_slot_reset_state	= rtas_token("ibm,read-slot-reset-state");
+	ibm_slot_error_detail		= rtas_token("ibm,slot-error-detail");
+	ibm_get_config_addr_info2	= rtas_token("ibm,get-config-addr-info2");
+	ibm_get_config_addr_info	= rtas_token("ibm,get-config-addr-info");
+	ibm_configure_pe		= rtas_token("ibm,configure-pe");
+	ibm_configure_bridge		= rtas_token ("ibm,configure-bridge");
+
+	/* necessary sanity check */
+	if (ibm_set_eeh_option == RTAS_UNKNOWN_SERVICE) {
+		pr_warning("%s: RTAS service <ibm,set-eeh-option> invalid\n",
+			__func__);
+		return -EINVAL;
+	} else if (ibm_set_slot_reset == RTAS_UNKNOWN_SERVICE) {
+		pr_warning("%s: RTAS service <ibm, set-slot-reset> invalid\n",
+			__func__);
+		return -EINVAL;
+	} else if (ibm_read_slot_reset_state2 == RTAS_UNKNOWN_SERVICE &&
+		   ibm_read_slot_reset_state == RTAS_UNKNOWN_SERVICE) {
+		pr_warning("%s: RTAS service <ibm,read-slot-reset-state2> and "
+			"<ibm,read-slot-reset-state> invalid\n",
+			__func__);
+		return -EINVAL;
+	} else if (ibm_slot_error_detail == RTAS_UNKNOWN_SERVICE) {
+		pr_warning("%s: RTAS service <ibm,slot-error-detail> invalid\n",
+			__func__);
+		return -EINVAL;
+	} else if (ibm_get_config_addr_info2 == RTAS_UNKNOWN_SERVICE &&
+		   ibm_get_config_addr_info == RTAS_UNKNOWN_SERVICE) {
+		pr_warning("%s: RTAS service <ibm,get-config-addr-info2> and "
+			"<ibm,get-config-addr-info> invalid\n",
+			__func__);
+		return -EINVAL;
+	} else if (ibm_configure_pe == RTAS_UNKNOWN_SERVICE &&
+		   ibm_configure_bridge == RTAS_UNKNOWN_SERVICE) {
+		pr_warning("%s: RTAS service <ibm,configure-pe> and "
+			"<ibm,configure-bridge> invalid\n",
+			__func__);
+		return -EINVAL;
+	}
+
+	/* Initialize error log lock and size */
+	spin_lock_init(&slot_errbuf_lock);
+	eeh_error_buf_size = rtas_token("rtas-error-log-max");
+	if (eeh_error_buf_size == RTAS_UNKNOWN_SERVICE) {
+		pr_warning("%s: unknown EEH error log size\n",
+			__func__);
+		eeh_error_buf_size = 1024;
+	} else if (eeh_error_buf_size > RTAS_ERROR_LOG_MAX) {
+		pr_warning("%s: EEH error log size %d exceeds the maximal %d\n",
+			__func__, eeh_error_buf_size, RTAS_ERROR_LOG_MAX);
+		eeh_error_buf_size = RTAS_ERROR_LOG_MAX;
+	}
+
+	return 0;
+}
+
+/**
+ * pseries_eeh_set_option - Initialize EEH or MMIO/DMA reenable
+ * @dn: device node
+ * @option: operation to be issued
+ *
+ * The function is used to control the EEH functionality globally.
+ * Currently, following options are support according to PAPR:
+ * Enable EEH, Disable EEH, Enable MMIO and Enable DMA
+ */
+static int pseries_eeh_set_option(struct device_node *dn, int option)
+{
+	int ret = 0;
+	struct eeh_dev *edev;
+	const u32 *reg;
+	int config_addr;
+
+	edev = of_node_to_eeh_dev(dn);
+
+	/*
+	 * When we're enabling or disabling EEH functioality on
+	 * the particular PE, the PE config address is possibly
+	 * unavailable. Therefore, we have to figure it out from
+	 * the FDT node.
+	 */
+	switch (option) {
+	case EEH_OPT_DISABLE:
+	case EEH_OPT_ENABLE:
+		reg = of_get_property(dn, "reg", NULL);
+		config_addr = reg[0];
+		break;
+
+	case EEH_OPT_THAW_MMIO:
+	case EEH_OPT_THAW_DMA:
+		config_addr = edev->config_addr;
+		if (edev->pe_config_addr)
+			config_addr = edev->pe_config_addr;
+		break;
+
+	default:
+		pr_err("%s: Invalid option %d\n",
+			__func__, option);
+		return -EINVAL;
+	}
+
+	ret = rtas_call(ibm_set_eeh_option, 4, 1, NULL,
+			config_addr, BUID_HI(edev->phb->buid),
+			BUID_LO(edev->phb->buid), option);
+
+	return ret;
+}
+
+/**
+ * pseries_eeh_get_pe_addr - Retrieve PE address
+ * @dn: device node
+ *
+ * Retrieve the assocated PE address. Actually, there're 2 RTAS
+ * function calls dedicated for the purpose. We need implement
+ * it through the new function and then the old one. Besides,
+ * you should make sure the config address is figured out from
+ * FDT node before calling the function.
+ *
+ * It's notable that zero'ed return value means invalid PE config
+ * address.
+ */
+static int pseries_eeh_get_pe_addr(struct device_node *dn)
+{
+	struct eeh_dev *edev;
+	int ret = 0;
+	int rets[3];
+
+	edev = of_node_to_eeh_dev(dn);
+
+	if (ibm_get_config_addr_info2 != RTAS_UNKNOWN_SERVICE) {
+		/*
+		 * First of all, we need to make sure there has one PE
+		 * associated with the device. Otherwise, PE address is
+		 * meaningless.
+		 */
+		ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets,
+				edev->config_addr, BUID_HI(edev->phb->buid),
+				BUID_LO(edev->phb->buid), 1);
+		if (ret || (rets[0] == 0))
+			return 0;
+
+		/* Retrieve the associated PE config address */
+		ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets,
+				edev->config_addr, BUID_HI(edev->phb->buid),
+				BUID_LO(edev->phb->buid), 0);
+		if (ret) {
+			pr_warning("%s: Failed to get PE address for %s\n",
+				__func__, dn->full_name);
+			return 0;
+		}
+
+		return rets[0];
+	}
+
+	if (ibm_get_config_addr_info != RTAS_UNKNOWN_SERVICE) {
+		ret = rtas_call(ibm_get_config_addr_info, 4, 2, rets,
+				edev->config_addr, BUID_HI(edev->phb->buid),
+				BUID_LO(edev->phb->buid), 0);
+		if (ret) {
+			pr_warning("%s: Failed to get PE address for %s\n",
+				__func__, dn->full_name);
+			return 0;
+		}
+
+		return rets[0];
+	}
+
+	return ret;
+}
+
+/**
+ * pseries_eeh_get_state - Retrieve PE state
+ * @dn: PE associated device node
+ * @state: return value
+ *
+ * Retrieve the state of the specified PE. On RTAS compliant
+ * pseries platform, there already has one dedicated RTAS function
+ * for the purpose. It's notable that the associated PE config address
+ * might be ready when calling the function. Therefore, endeavour to
+ * use the PE config address if possible. Further more, there're 2
+ * RTAS calls for the purpose, we need to try the new one and back
+ * to the old one if the new one couldn't work properly.
+ */
+static int pseries_eeh_get_state(struct device_node *dn, int *state)
+{
+	struct eeh_dev *edev;
+	int config_addr;
+	int ret;
+	int rets[4];
+	int result;
+
+	/* Figure out PE config address if possible */
+	edev = of_node_to_eeh_dev(dn);
+	config_addr = edev->config_addr;
+	if (edev->pe_config_addr)
+		config_addr = edev->pe_config_addr;
+
+	if (ibm_read_slot_reset_state2 != RTAS_UNKNOWN_SERVICE) {
+		ret = rtas_call(ibm_read_slot_reset_state2, 3, 4, rets,
+				config_addr, BUID_HI(edev->phb->buid),
+				BUID_LO(edev->phb->buid));
+	} else if (ibm_read_slot_reset_state != RTAS_UNKNOWN_SERVICE) {
+		/* Fake PE unavailable info */
+		rets[2] = 0;
+		ret = rtas_call(ibm_read_slot_reset_state, 3, 3, rets,
+				config_addr, BUID_HI(edev->phb->buid),
+				BUID_LO(edev->phb->buid));
+	} else {
+		return EEH_STATE_NOT_SUPPORT;
+	}
+
+	if (ret)
+		return ret;
+
+	/* Parse the result out */
+	result = 0;
+	if (rets[1]) {
+		switch(rets[0]) {
+		case 0:
+			result &= ~EEH_STATE_RESET_ACTIVE;
+			result |= EEH_STATE_MMIO_ACTIVE;
+			result |= EEH_STATE_DMA_ACTIVE;
+			break;
+		case 1:
+			result |= EEH_STATE_RESET_ACTIVE;
+			result |= EEH_STATE_MMIO_ACTIVE;
+			result |= EEH_STATE_DMA_ACTIVE;
+			break;
+		case 2:
+			result &= ~EEH_STATE_RESET_ACTIVE;
+			result &= ~EEH_STATE_MMIO_ACTIVE;
+			result &= ~EEH_STATE_DMA_ACTIVE;
+			break;
+		case 4:
+			result &= ~EEH_STATE_RESET_ACTIVE;
+			result &= ~EEH_STATE_MMIO_ACTIVE;
+			result &= ~EEH_STATE_DMA_ACTIVE;
+			result |= EEH_STATE_MMIO_ENABLED;
+			break;
+		case 5:
+			if (rets[2]) {
+				if (state) *state = rets[2];
+				result = EEH_STATE_UNAVAILABLE;
+			} else {
+				result = EEH_STATE_NOT_SUPPORT;
+			}
+		default:
+			result = EEH_STATE_NOT_SUPPORT;
+		}
+	} else {
+		result = EEH_STATE_NOT_SUPPORT;
+	}
+
+	return result;
+}
+
+/**
+ * pseries_eeh_reset - Reset the specified PE
+ * @dn: PE associated device node
+ * @option: reset option
+ *
+ * Reset the specified PE
+ */
+static int pseries_eeh_reset(struct device_node *dn, int option)
+{
+	struct eeh_dev *edev;
+	int config_addr;
+	int ret;
+
+	/* Figure out PE address */
+	edev = of_node_to_eeh_dev(dn);
+	config_addr = edev->config_addr;
+	if (edev->pe_config_addr)
+		config_addr = edev->pe_config_addr;
+
+	/* Reset PE through RTAS call */
+	ret = rtas_call(ibm_set_slot_reset, 4, 1, NULL,
+			config_addr, BUID_HI(edev->phb->buid),
+			BUID_LO(edev->phb->buid), option);
+
+	/* If fundamental-reset not supported, try hot-reset */
+	if (option == EEH_RESET_FUNDAMENTAL &&
+	    ret == -8) {
+		ret = rtas_call(ibm_set_slot_reset, 4, 1, NULL,
+				config_addr, BUID_HI(edev->phb->buid),
+				BUID_LO(edev->phb->buid), EEH_RESET_HOT);
+	}
+
+	return ret;
+}
+
+/**
+ * pseries_eeh_wait_state - Wait for PE state
+ * @dn: PE associated device node
+ * @max_wait: maximal period in microsecond
+ *
+ * Wait for the state of associated PE. It might take some time
+ * to retrieve the PE's state.
+ */
+static int pseries_eeh_wait_state(struct device_node *dn, int max_wait)
+{
+	int ret;
+	int mwait;
+
+	/*
+	 * According to PAPR, the state of PE might be temporarily
+	 * unavailable. Under the circumstance, we have to wait
+	 * for indicated time determined by firmware. The maximal
+	 * wait time is 5 minutes, which is acquired from the original
+	 * EEH implementation. Also, the original implementation
+	 * also defined the minimal wait time as 1 second.
+	 */
+#define EEH_STATE_MIN_WAIT_TIME	(1000)
+#define EEH_STATE_MAX_WAIT_TIME	(300 * 1000)
+
+	while (1) {
+		ret = pseries_eeh_get_state(dn, &mwait);
+
+		/*
+		 * If the PE's state is temporarily unavailable,
+		 * we have to wait for the specified time. Otherwise,
+		 * the PE's state will be returned immediately.
+		 */
+		if (ret != EEH_STATE_UNAVAILABLE)
+			return ret;
+
+		if (max_wait <= 0) {
+			pr_warning("%s: Timeout when getting PE's state (%d)\n",
+				__func__, max_wait);
+			return EEH_STATE_NOT_SUPPORT;
+		}
+
+		if (mwait <= 0) {
+			pr_warning("%s: Firmware returned bad wait value %d\n",
+				__func__, mwait);
+			mwait = EEH_STATE_MIN_WAIT_TIME;
+		} else if (mwait > EEH_STATE_MAX_WAIT_TIME) {
+			pr_warning("%s: Firmware returned too long wait value %d\n",
+				__func__, mwait);
+			mwait = EEH_STATE_MAX_WAIT_TIME;
+		}
+
+		max_wait -= mwait;
+		msleep(mwait);
+	}
+
+	return EEH_STATE_NOT_SUPPORT;
+}
+
+/**
+ * pseries_eeh_get_log - Retrieve error log
+ * @dn: device node
+ * @severity: temporary or permanent error log
+ * @drv_log: driver log to be combined with retrieved error log
+ * @len: length of driver log
+ *
+ * Retrieve the temporary or permanent error from the PE.
+ * Actually, the error will be retrieved through the dedicated
+ * RTAS call.
+ */
+static int pseries_eeh_get_log(struct device_node *dn, int severity, char *drv_log, unsigned long len)
+{
+	struct eeh_dev *edev;
+	int config_addr;
+	unsigned long flags;
+	int ret;
+
+	edev = of_node_to_eeh_dev(dn);
+	spin_lock_irqsave(&slot_errbuf_lock, flags);
+	memset(slot_errbuf, 0, eeh_error_buf_size);
+
+	/* Figure out the PE address */
+	config_addr = edev->config_addr;
+	if (edev->pe_config_addr)
+		config_addr = edev->pe_config_addr;
+
+	ret = rtas_call(ibm_slot_error_detail, 8, 1, NULL, config_addr,
+			BUID_HI(edev->phb->buid), BUID_LO(edev->phb->buid),
+			virt_to_phys(drv_log), len,
+			virt_to_phys(slot_errbuf), eeh_error_buf_size,
+			severity);
+	if (!ret)
+		log_error(slot_errbuf, ERR_TYPE_RTAS_LOG, 0);
+	spin_unlock_irqrestore(&slot_errbuf_lock, flags);
+
+	return ret;
+}
+
+/**
+ * pseries_eeh_configure_bridge - Configure PCI bridges in the indicated PE
+ * @dn: PE associated device node
+ *
+ * The function will be called to reconfigure the bridges included
+ * in the specified PE so that the mulfunctional PE would be recovered
+ * again.
+ */
+static int pseries_eeh_configure_bridge(struct device_node *dn)
+{
+	struct eeh_dev *edev;
+	int config_addr;
+	int ret;
+
+	/* Figure out the PE address */
+	edev = of_node_to_eeh_dev(dn);
+	config_addr = edev->config_addr;
+	if (edev->pe_config_addr)
+		config_addr = edev->pe_config_addr;
+
+	/* Use new configure-pe function, if supported */
+	if (ibm_configure_pe != RTAS_UNKNOWN_SERVICE) {
+		ret = rtas_call(ibm_configure_pe, 3, 1, NULL,
+				config_addr, BUID_HI(edev->phb->buid),
+				BUID_LO(edev->phb->buid));
+	} else if (ibm_configure_bridge != RTAS_UNKNOWN_SERVICE) {
+		ret = rtas_call(ibm_configure_bridge, 3, 1, NULL,
+				config_addr, BUID_HI(edev->phb->buid),
+				BUID_LO(edev->phb->buid));
+	} else {
+		return -EFAULT;
+	}
+
+	if (ret)
+		pr_warning("%s: Unable to configure bridge %d for %s\n",
+			__func__, ret, dn->full_name);
+
+	return ret;
+}
+
+/**
+ * pseries_eeh_read_config - Read PCI config space
+ * @dn: device node
+ * @where: PCI address
+ * @size: size to read
+ * @val: return value
+ *
+ * Read config space from the speicifed device
+ */
+static int pseries_eeh_read_config(struct device_node *dn, int where, int size, u32 *val)
+{
+	struct pci_dn *pdn;
+
+	pdn = PCI_DN(dn);
+
+	return rtas_read_config(pdn, where, size, val);
+}
+
+/**
+ * pseries_eeh_write_config - Write PCI config space
+ * @dn: device node
+ * @where: PCI address
+ * @size: size to write
+ * @val: value to be written
+ *
+ * Write config space to the specified device
+ */
+static int pseries_eeh_write_config(struct device_node *dn, int where, int size, u32 val)
+{
+	struct pci_dn *pdn;
+
+	pdn = PCI_DN(dn);
+
+	return rtas_write_config(pdn, where, size, val);
+}
+
+static struct eeh_ops pseries_eeh_ops = {
+	.name			= "pseries",
+	.init			= pseries_eeh_init,
+	.set_option		= pseries_eeh_set_option,
+	.get_pe_addr		= pseries_eeh_get_pe_addr,
+	.get_state		= pseries_eeh_get_state,
+	.reset			= pseries_eeh_reset,
+	.wait_state		= pseries_eeh_wait_state,
+	.get_log		= pseries_eeh_get_log,
+	.configure_bridge       = pseries_eeh_configure_bridge,
+	.read_config		= pseries_eeh_read_config,
+	.write_config		= pseries_eeh_write_config
+};
+
+/**
+ * eeh_pseries_init - Register platform dependent EEH operations
+ *
+ * EEH initialization on pseries platform. This function should be
+ * called before any EEH related functions.
+ */
+int __init eeh_pseries_init(void)
+{
+	return eeh_ops_register(&pseries_eeh_ops);
+}
@@ -0,0 +1,84 @@
+/*
+ * Sysfs entries for PCI Error Recovery for PAPR-compliant platform.
+ * Copyright IBM Corporation 2007
+ * Copyright Linas Vepstas <linas@austin.ibm.com> 2007
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com>
+ */
+#include <linux/pci.h>
+#include <linux/stat.h>
+#include <asm/ppc-pci.h>
+#include <asm/pci-bridge.h>
+
+/**
+ * EEH_SHOW_ATTR -- Create sysfs entry for eeh statistic
+ * @_name: name of file in sysfs directory
+ * @_memb: name of member in struct pci_dn to access
+ * @_format: printf format for display
+ *
+ * All of the attributes look very similar, so just
+ * auto-gen a cut-n-paste routine to display them.
+ */
+#define EEH_SHOW_ATTR(_name,_memb,_format)               \
+static ssize_t eeh_show_##_name(struct device *dev,      \
+		struct device_attribute *attr, char *buf)          \
+{                                                        \
+	struct pci_dev *pdev = to_pci_dev(dev);               \
+	struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev);      \
+	                                                      \
+	if (!edev)                                            \
+		return 0;                                     \
+	                                                      \
+	return sprintf(buf, _format "\n", edev->_memb);       \
+}                                                        \
+static DEVICE_ATTR(_name, S_IRUGO, eeh_show_##_name, NULL);
+
+EEH_SHOW_ATTR(eeh_mode,            mode,            "0x%x");
+EEH_SHOW_ATTR(eeh_config_addr,     config_addr,     "0x%x");
+EEH_SHOW_ATTR(eeh_pe_config_addr,  pe_config_addr,  "0x%x");
+EEH_SHOW_ATTR(eeh_check_count,     check_count,     "%d"  );
+EEH_SHOW_ATTR(eeh_freeze_count,    freeze_count,    "%d"  );
+EEH_SHOW_ATTR(eeh_false_positives, false_positives, "%d"  );
+
+void eeh_sysfs_add_device(struct pci_dev *pdev)
+{
+	int rc=0;
+
+	rc += device_create_file(&pdev->dev, &dev_attr_eeh_mode);
+	rc += device_create_file(&pdev->dev, &dev_attr_eeh_config_addr);
+	rc += device_create_file(&pdev->dev, &dev_attr_eeh_pe_config_addr);
+	rc += device_create_file(&pdev->dev, &dev_attr_eeh_check_count);
+	rc += device_create_file(&pdev->dev, &dev_attr_eeh_false_positives);
+	rc += device_create_file(&pdev->dev, &dev_attr_eeh_freeze_count);
+
+	if (rc)
+		printk(KERN_WARNING "EEH: Unable to create sysfs entries\n");
+}
+
+void eeh_sysfs_remove_device(struct pci_dev *pdev)
+{
+	device_remove_file(&pdev->dev, &dev_attr_eeh_mode);
+	device_remove_file(&pdev->dev, &dev_attr_eeh_config_addr);
+	device_remove_file(&pdev->dev, &dev_attr_eeh_pe_config_addr);
+	device_remove_file(&pdev->dev, &dev_attr_eeh_check_count);
+	device_remove_file(&pdev->dev, &dev_attr_eeh_false_positives);
+	device_remove_file(&pdev->dev, &dev_attr_eeh_freeze_count);
+}
+
@@ -0,0 +1,86 @@
+/*
+ * Copyright (C) 2001 Dave Engebretsen IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <asm/prom.h>
+
+#include "pseries.h"
+
+void request_event_sources_irqs(struct device_node *np,
+				irq_handler_t handler,
+				const char *name)
+{
+	int i, index, count = 0;
+	struct of_irq oirq;
+	const u32 *opicprop;
+	unsigned int opicplen;
+	unsigned int virqs[16];
+
+	/* Check for obsolete "open-pic-interrupt" property. If present, then
+	 * map those interrupts using the default interrupt host and default
+	 * trigger
+	 */
+	opicprop = of_get_property(np, "open-pic-interrupt", &opicplen);
+	if (opicprop) {
+		opicplen /= sizeof(u32);
+		for (i = 0; i < opicplen; i++) {
+			if (count > 15)
+				break;
+			virqs[count] = irq_create_mapping(NULL, *(opicprop++));
+			if (virqs[count] == NO_IRQ) {
+				pr_err("event-sources: Unable to allocate "
+				       "interrupt number for %s\n",
+				       np->full_name);
+				WARN_ON(1);
+			}
+			else
+				count++;
+
+		}
+	}
+	/* Else use normal interrupt tree parsing */
+	else {
+		/* First try to do a proper OF tree parsing */
+		for (index = 0; of_irq_map_one(np, index, &oirq) == 0;
+		     index++) {
+			if (count > 15)
+				break;
+			virqs[count] = irq_create_of_mapping(oirq.controller,
+							    oirq.specifier,
+							    oirq.size);
+			if (virqs[count] == NO_IRQ) {
+				pr_err("event-sources: Unable to allocate "
+				       "interrupt number for %s\n",
+				       np->full_name);
+				WARN_ON(1);
+			}
+			else
+				count++;
+		}
+	}
+
+	/* Now request them */
+	for (i = 0; i < count; i++) {
+		if (request_irq(virqs[i], handler, 0, name, NULL)) {
+			pr_err("event-sources: Unable to request interrupt "
+			       "%d for %s\n", virqs[i], np->full_name);
+			WARN_ON(1);
+			return;
+		}
+	}
+}
+
@@ -0,0 +1,87 @@
+/*
+ *  pSeries firmware setup code.
+ *
+ *  Portions from arch/powerpc/platforms/pseries/setup.c:
+ *   Copyright (C) 1995  Linus Torvalds
+ *   Adapted from 'alpha' version by Gary Thomas
+ *   Modified by Cort Dougan (cort@cs.nmt.edu)
+ *   Modified by PPC64 Team, IBM Corp
+ *
+ *  Portions from arch/powerpc/kernel/firmware.c
+ *   Copyright (C) 2001 Ben. Herrenschmidt (benh@kernel.crashing.org)
+ *   Modifications for ppc64:
+ *    Copyright (C) 2003 Dave Engebretsen <engebret@us.ibm.com>
+ *    Copyright (C) 2005 Stephen Rothwell, IBM Corporation
+ *
+ *  Copyright 2006 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+
+#include <asm/firmware.h>
+#include <asm/prom.h>
+#include <asm/udbg.h>
+
+#include "pseries.h"
+
+typedef struct {
+    unsigned long val;
+    char * name;
+} firmware_feature_t;
+
+static __initdata firmware_feature_t
+firmware_features_table[FIRMWARE_MAX_FEATURES] = {
+	{FW_FEATURE_PFT,		"hcall-pft"},
+	{FW_FEATURE_TCE,		"hcall-tce"},
+	{FW_FEATURE_SPRG0,		"hcall-sprg0"},
+	{FW_FEATURE_DABR,		"hcall-dabr"},
+	{FW_FEATURE_COPY,		"hcall-copy"},
+	{FW_FEATURE_ASR,		"hcall-asr"},
+	{FW_FEATURE_DEBUG,		"hcall-debug"},
+	{FW_FEATURE_PERF,		"hcall-perf"},
+	{FW_FEATURE_DUMP,		"hcall-dump"},
+	{FW_FEATURE_INTERRUPT,		"hcall-interrupt"},
+	{FW_FEATURE_MIGRATE,		"hcall-migrate"},
+	{FW_FEATURE_PERFMON,		"hcall-perfmon"},
+	{FW_FEATURE_CRQ,		"hcall-crq"},
+	{FW_FEATURE_VIO,		"hcall-vio"},
+	{FW_FEATURE_RDMA,		"hcall-rdma"},
+	{FW_FEATURE_LLAN,		"hcall-lLAN"},
+	{FW_FEATURE_BULK_REMOVE,	"hcall-bulk"},
+	{FW_FEATURE_XDABR,		"hcall-xdabr"},
+	{FW_FEATURE_MULTITCE,		"hcall-multi-tce"},
+	{FW_FEATURE_SPLPAR,		"hcall-splpar"},
+	{FW_FEATURE_VPHN,		"hcall-vphn"},
+};
+
+/* Build up the firmware features bitmask using the contents of
+ * device-tree/ibm,hypertas-functions.  Ultimately this functionality may
+ * be moved into prom.c prom_init().
+ */
+void __init fw_feature_init(const char *hypertas, unsigned long len)
+{
+	const char *s;
+	int i;
+
+	pr_debug(" -> fw_feature_init()\n");
+
+	for (s = hypertas; s < hypertas + len; s += strlen(s) + 1) {
+		for (i = 0; i < FIRMWARE_MAX_FEATURES; i++) {
+			/* check value against table of strings */
+			if (!firmware_features_table[i].name ||
+			    strcmp(firmware_features_table[i].name, s))
+				continue;
+
+			/* we have a match */
+			powerpc_firmware_features |=
+				firmware_features_table[i].val;
+			break;
+		}
+	}
+
+	pr_debug(" <- fw_feature_init()\n");
+}
@@ -0,0 +1,414 @@
+/*
+ * pseries CPU Hotplug infrastructure.
+ *
+ * Split out from arch/powerpc/platforms/pseries/setup.c
+ *  arch/powerpc/kernel/rtas.c, and arch/powerpc/platforms/pseries/smp.c
+ *
+ * Peter Bergner, IBM	March 2001.
+ * Copyright (C) 2001 IBM.
+ * Dave Engebretsen, Peter Bergner, and
+ * Mike Corrigan {engebret|bergner|mikec}@us.ibm.com
+ * Plus various changes from other IBM teams...
+ *
+ * Copyright (C) 2006 Michael Ellerman, IBM Corporation
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/sched.h>	/* for idle_task_exit */
+#include <linux/cpu.h>
+#include <asm/prom.h>
+#include <asm/rtas.h>
+#include <asm/firmware.h>
+#include <asm/machdep.h>
+#include <asm/vdso_datapage.h>
+#include <asm/pSeries_reconfig.h>
+#include <asm/xics.h>
+#include "plpar_wrappers.h"
+#include "offline_states.h"
+
+/* This version can't take the spinlock, because it never returns */
+static struct rtas_args rtas_stop_self_args = {
+	.token = RTAS_UNKNOWN_SERVICE,
+	.nargs = 0,
+	.nret = 1,
+	.rets = &rtas_stop_self_args.args[0],
+};
+
+static DEFINE_PER_CPU(enum cpu_state_vals, preferred_offline_state) =
+							CPU_STATE_OFFLINE;
+static DEFINE_PER_CPU(enum cpu_state_vals, current_state) = CPU_STATE_OFFLINE;
+
+static enum cpu_state_vals default_offline_state = CPU_STATE_OFFLINE;
+
+static int cede_offline_enabled __read_mostly = 1;
+
+/*
+ * Enable/disable cede_offline when available.
+ */
+static int __init setup_cede_offline(char *str)
+{
+	if (!strcmp(str, "off"))
+		cede_offline_enabled = 0;
+	else if (!strcmp(str, "on"))
+		cede_offline_enabled = 1;
+	else
+		return 0;
+	return 1;
+}
+
+__setup("cede_offline=", setup_cede_offline);
+
+enum cpu_state_vals get_cpu_current_state(int cpu)
+{
+	return per_cpu(current_state, cpu);
+}
+
+void set_cpu_current_state(int cpu, enum cpu_state_vals state)
+{
+	per_cpu(current_state, cpu) = state;
+}
+
+enum cpu_state_vals get_preferred_offline_state(int cpu)
+{
+	return per_cpu(preferred_offline_state, cpu);
+}
+
+void set_preferred_offline_state(int cpu, enum cpu_state_vals state)
+{
+	per_cpu(preferred_offline_state, cpu) = state;
+}
+
+void set_default_offline_state(int cpu)
+{
+	per_cpu(preferred_offline_state, cpu) = default_offline_state;
+}
+
+static void rtas_stop_self(void)
+{
+	struct rtas_args *args = &rtas_stop_self_args;
+
+	local_irq_disable();
+
+	BUG_ON(args->token == RTAS_UNKNOWN_SERVICE);
+
+	printk("cpu %u (hwid %u) Ready to die...\n",
+	       smp_processor_id(), hard_smp_processor_id());
+	enter_rtas(__pa(args));
+
+	panic("Alas, I survived.\n");
+}
+
+static void pseries_mach_cpu_die(void)
+{
+	unsigned int cpu = smp_processor_id();
+	unsigned int hwcpu = hard_smp_processor_id();
+	u8 cede_latency_hint = 0;
+
+	local_irq_disable();
+	idle_task_exit();
+	xics_teardown_cpu();
+
+	if (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) {
+		set_cpu_current_state(cpu, CPU_STATE_INACTIVE);
+		if (ppc_md.suspend_disable_cpu)
+			ppc_md.suspend_disable_cpu();
+
+		cede_latency_hint = 2;
+
+		get_lppaca()->idle = 1;
+		if (!get_lppaca()->shared_proc)
+			get_lppaca()->donate_dedicated_cpu = 1;
+
+		while (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) {
+			extended_cede_processor(cede_latency_hint);
+		}
+
+		if (!get_lppaca()->shared_proc)
+			get_lppaca()->donate_dedicated_cpu = 0;
+		get_lppaca()->idle = 0;
+
+		if (get_preferred_offline_state(cpu) == CPU_STATE_ONLINE) {
+			unregister_slb_shadow(hwcpu);
+
+			/*
+			 * Call to start_secondary_resume() will not return.
+			 * Kernel stack will be reset and start_secondary()
+			 * will be called to continue the online operation.
+			 */
+			start_secondary_resume();
+		}
+	}
+
+	/* Requested state is CPU_STATE_OFFLINE at this point */
+	WARN_ON(get_preferred_offline_state(cpu) != CPU_STATE_OFFLINE);
+
+	set_cpu_current_state(cpu, CPU_STATE_OFFLINE);
+	unregister_slb_shadow(hwcpu);
+	rtas_stop_self();
+
+	/* Should never get here... */
+	BUG();
+	for(;;);
+}
+
+static int pseries_cpu_disable(void)
+{
+	int cpu = smp_processor_id();
+
+	set_cpu_online(cpu, false);
+	vdso_data->processorCount--;
+
+	/*fix boot_cpuid here*/
+	if (cpu == boot_cpuid)
+		boot_cpuid = cpumask_any(cpu_online_mask);
+
+	/* FIXME: abstract this to not be platform specific later on */
+	xics_migrate_irqs_away();
+	return 0;
+}
+
+/*
+ * pseries_cpu_die: Wait for the cpu to die.
+ * @cpu: logical processor id of the CPU whose death we're awaiting.
+ *
+ * This function is called from the context of the thread which is performing
+ * the cpu-offline. Here we wait for long enough to allow the cpu in question
+ * to self-destroy so that the cpu-offline thread can send the CPU_DEAD
+ * notifications.
+ *
+ * OTOH, pseries_mach_cpu_die() is called by the @cpu when it wants to
+ * self-destruct.
+ */
+static void pseries_cpu_die(unsigned int cpu)
+{
+	int tries;
+	int cpu_status = 1;
+	unsigned int pcpu = get_hard_smp_processor_id(cpu);
+
+	if (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) {
+		cpu_status = 1;
+		for (tries = 0; tries < 5000; tries++) {
+			if (get_cpu_current_state(cpu) == CPU_STATE_INACTIVE) {
+				cpu_status = 0;
+				break;
+			}
+			msleep(1);
+		}
+	} else if (get_preferred_offline_state(cpu) == CPU_STATE_OFFLINE) {
+
+		for (tries = 0; tries < 25; tries++) {
+			cpu_status = smp_query_cpu_stopped(pcpu);
+			if (cpu_status == QCSS_STOPPED ||
+			    cpu_status == QCSS_HARDWARE_ERROR)
+				break;
+			cpu_relax();
+		}
+	}
+
+	if (cpu_status != 0) {
+		printk("Querying DEAD? cpu %i (%i) shows %i\n",
+		       cpu, pcpu, cpu_status);
+	}
+
+	/* Isolation and deallocation are definitely done by
+	 * drslot_chrp_cpu.  If they were not they would be
+	 * done here.  Change isolate state to Isolate and
+	 * change allocation-state to Unusable.
+	 */
+	paca[cpu].cpu_start = 0;
+}
+
+/*
+ * Update cpu_present_mask and paca(s) for a new cpu node.  The wrinkle
+ * here is that a cpu device node may represent up to two logical cpus
+ * in the SMT case.  We must honor the assumption in other code that
+ * the logical ids for sibling SMT threads x and y are adjacent, such
+ * that x^1 == y and y^1 == x.
+ */
+static int pseries_add_processor(struct device_node *np)
+{
+	unsigned int cpu;
+	cpumask_var_t candidate_mask, tmp;
+	int err = -ENOSPC, len, nthreads, i;
+	const u32 *intserv;
+
+	intserv = of_get_property(np, "ibm,ppc-interrupt-server#s", &len);
+	if (!intserv)
+		return 0;
+
+	zalloc_cpumask_var(&candidate_mask, GFP_KERNEL);
+	zalloc_cpumask_var(&tmp, GFP_KERNEL);
+
+	nthreads = len / sizeof(u32);
+	for (i = 0; i < nthreads; i++)
+		cpumask_set_cpu(i, tmp);
+
+	cpu_maps_update_begin();
+
+	BUG_ON(!cpumask_subset(cpu_present_mask, cpu_possible_mask));
+
+	/* Get a bitmap of unoccupied slots. */
+	cpumask_xor(candidate_mask, cpu_possible_mask, cpu_present_mask);
+	if (cpumask_empty(candidate_mask)) {
+		/* If we get here, it most likely means that NR_CPUS is
+		 * less than the partition's max processors setting.
+		 */
+		printk(KERN_ERR "Cannot add cpu %s; this system configuration"
+		       " supports %d logical cpus.\n", np->full_name,
+		       cpumask_weight(cpu_possible_mask));
+		goto out_unlock;
+	}
+
+	while (!cpumask_empty(tmp))
+		if (cpumask_subset(tmp, candidate_mask))
+			/* Found a range where we can insert the new cpu(s) */
+			break;
+		else
+			cpumask_shift_left(tmp, tmp, nthreads);
+
+	if (cpumask_empty(tmp)) {
+		printk(KERN_ERR "Unable to find space in cpu_present_mask for"
+		       " processor %s with %d thread(s)\n", np->name,
+		       nthreads);
+		goto out_unlock;
+	}
+
+	for_each_cpu(cpu, tmp) {
+		BUG_ON(cpu_present(cpu));
+		set_cpu_present(cpu, true);
+		set_hard_smp_processor_id(cpu, *intserv++);
+	}
+	err = 0;
+out_unlock:
+	cpu_maps_update_done();
+	free_cpumask_var(candidate_mask);
+	free_cpumask_var(tmp);
+	return err;
+}
+
+/*
+ * Update the present map for a cpu node which is going away, and set
+ * the hard id in the paca(s) to -1 to be consistent with boot time
+ * convention for non-present cpus.
+ */
+static void pseries_remove_processor(struct device_node *np)
+{
+	unsigned int cpu;
+	int len, nthreads, i;
+	const u32 *intserv;
+
+	intserv = of_get_property(np, "ibm,ppc-interrupt-server#s", &len);
+	if (!intserv)
+		return;
+
+	nthreads = len / sizeof(u32);
+
+	cpu_maps_update_begin();
+	for (i = 0; i < nthreads; i++) {
+		for_each_present_cpu(cpu) {
+			if (get_hard_smp_processor_id(cpu) != intserv[i])
+				continue;
+			BUG_ON(cpu_online(cpu));
+			set_cpu_present(cpu, false);
+			set_hard_smp_processor_id(cpu, -1);
+			break;
+		}
+		if (cpu >= nr_cpu_ids)
+			printk(KERN_WARNING "Could not find cpu to remove "
+			       "with physical id 0x%x\n", intserv[i]);
+	}
+	cpu_maps_update_done();
+}
+
+static int pseries_smp_notifier(struct notifier_block *nb,
+				unsigned long action, void *node)
+{
+	int err = 0;
+
+	switch (action) {
+	case PSERIES_RECONFIG_ADD:
+		err = pseries_add_processor(node);
+		break;
+	case PSERIES_RECONFIG_REMOVE:
+		pseries_remove_processor(node);
+		break;
+	}
+	return notifier_from_errno(err);
+}
+
+static struct notifier_block pseries_smp_nb = {
+	.notifier_call = pseries_smp_notifier,
+};
+
+#define MAX_CEDE_LATENCY_LEVELS		4
+#define	CEDE_LATENCY_PARAM_LENGTH	10
+#define CEDE_LATENCY_PARAM_MAX_LENGTH	\
+	(MAX_CEDE_LATENCY_LEVELS * CEDE_LATENCY_PARAM_LENGTH * sizeof(char))
+#define CEDE_LATENCY_TOKEN		45
+
+static char cede_parameters[CEDE_LATENCY_PARAM_MAX_LENGTH];
+
+static int parse_cede_parameters(void)
+{
+	memset(cede_parameters, 0, CEDE_LATENCY_PARAM_MAX_LENGTH);
+	return rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
+			 NULL,
+			 CEDE_LATENCY_TOKEN,
+			 __pa(cede_parameters),
+			 CEDE_LATENCY_PARAM_MAX_LENGTH);
+}
+
+static int __init pseries_cpu_hotplug_init(void)
+{
+	struct device_node *np;
+	const char *typep;
+	int cpu;
+	int qcss_tok;
+
+	for_each_node_by_name(np, "interrupt-controller") {
+		typep = of_get_property(np, "compatible", NULL);
+		if (strstr(typep, "open-pic")) {
+			of_node_put(np);
+
+			printk(KERN_INFO "CPU Hotplug not supported on "
+				"systems using MPIC\n");
+			return 0;
+		}
+	}
+
+	rtas_stop_self_args.token = rtas_token("stop-self");
+	qcss_tok = rtas_token("query-cpu-stopped-state");
+
+	if (rtas_stop_self_args.token == RTAS_UNKNOWN_SERVICE ||
+			qcss_tok == RTAS_UNKNOWN_SERVICE) {
+		printk(KERN_INFO "CPU Hotplug not supported by firmware "
+				"- disabling.\n");
+		return 0;
+	}
+
+	ppc_md.cpu_die = pseries_mach_cpu_die;
+	smp_ops->cpu_disable = pseries_cpu_disable;
+	smp_ops->cpu_die = pseries_cpu_die;
+
+	/* Processors can be added/removed only on LPAR */
+	if (firmware_has_feature(FW_FEATURE_LPAR)) {
+		pSeries_reconfig_notifier_register(&pseries_smp_nb);
+		cpu_maps_update_begin();
+		if (cede_offline_enabled && parse_cede_parameters() == 0) {
+			default_offline_state = CPU_STATE_INACTIVE;
+			for_each_online_cpu(cpu)
+				set_default_offline_state(cpu);
+		}
+		cpu_maps_update_done();
+	}
+
+	return 0;
+}
+arch_initcall(pseries_cpu_hotplug_init);
@@ -0,0 +1,236 @@
+/*
+ * pseries Memory Hotplug infrastructure.
+ *
+ * Copyright (C) 2008 Badari Pulavarty, IBM Corporation
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/of.h>
+#include <linux/memblock.h>
+#include <linux/vmalloc.h>
+#include <linux/memory.h>
+
+#include <asm/firmware.h>
+#include <asm/machdep.h>
+#include <asm/pSeries_reconfig.h>
+#include <asm/sparsemem.h>
+
+static unsigned long get_memblock_size(void)
+{
+	struct device_node *np;
+	unsigned int memblock_size = MIN_MEMORY_BLOCK_SIZE;
+	struct resource r;
+
+	np = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
+	if (np) {
+		const __be64 *size;
+
+		size = of_get_property(np, "ibm,lmb-size", NULL);
+		if (size)
+			memblock_size = be64_to_cpup(size);
+		of_node_put(np);
+	} else  if (machine_is(pseries)) {
+		/* This fallback really only applies to pseries */
+		unsigned int memzero_size = 0;
+
+		np = of_find_node_by_path("/memory@0");
+		if (np) {
+			if (!of_address_to_resource(np, 0, &r))
+				memzero_size = resource_size(&r);
+			of_node_put(np);
+		}
+
+		if (memzero_size) {
+			/* We now know the size of memory@0, use this to find
+			 * the first memoryblock and get its size.
+			 */
+			char buf[64];
+
+			sprintf(buf, "/memory@%x", memzero_size);
+			np = of_find_node_by_path(buf);
+			if (np) {
+				if (!of_address_to_resource(np, 0, &r))
+					memblock_size = resource_size(&r);
+				of_node_put(np);
+			}
+		}
+	}
+	return memblock_size;
+}
+
+/* WARNING: This is going to override the generic definition whenever
+ * pseries is built-in regardless of what platform is active at boot
+ * time. This is fine for now as this is the only "option" and it
+ * should work everywhere. If not, we'll have to turn this into a
+ * ppc_md. callback
+ */
+unsigned long memory_block_size_bytes(void)
+{
+	return get_memblock_size();
+}
+
+static int pseries_remove_memblock(unsigned long base, unsigned int memblock_size)
+{
+	unsigned long start, start_pfn;
+	struct zone *zone;
+	int ret;
+
+	start_pfn = base >> PAGE_SHIFT;
+
+	if (!pfn_valid(start_pfn)) {
+		memblock_remove(base, memblock_size);
+		return 0;
+	}
+
+	zone = page_zone(pfn_to_page(start_pfn));
+
+	/*
+	 * Remove section mappings and sysfs entries for the
+	 * section of the memory we are removing.
+	 *
+	 * NOTE: Ideally, this should be done in generic code like
+	 * remove_memory(). But remove_memory() gets called by writing
+	 * to sysfs "state" file and we can't remove sysfs entries
+	 * while writing to it. So we have to defer it to here.
+	 */
+	ret = __remove_pages(zone, start_pfn, memblock_size >> PAGE_SHIFT);
+	if (ret)
+		return ret;
+
+	/*
+	 * Update memory regions for memory remove
+	 */
+	memblock_remove(base, memblock_size);
+
+	/*
+	 * Remove htab bolted mappings for this section of memory
+	 */
+	start = (unsigned long)__va(base);
+	ret = remove_section_mapping(start, start + memblock_size);
+
+	/* Ensure all vmalloc mappings are flushed in case they also
+	 * hit that section of memory
+	 */
+	vm_unmap_aliases();
+
+	return ret;
+}
+
+static int pseries_remove_memory(struct device_node *np)
+{
+	const char *type;
+	const unsigned int *regs;
+	unsigned long base;
+	unsigned int lmb_size;
+	int ret = -EINVAL;
+
+	/*
+	 * Check to see if we are actually removing memory
+	 */
+	type = of_get_property(np, "device_type", NULL);
+	if (type == NULL || strcmp(type, "memory") != 0)
+		return 0;
+
+	/*
+	 * Find the bae address and size of the memblock
+	 */
+	regs = of_get_property(np, "reg", NULL);
+	if (!regs)
+		return ret;
+
+	base = *(unsigned long *)regs;
+	lmb_size = regs[3];
+
+	ret = pseries_remove_memblock(base, lmb_size);
+	return ret;
+}
+
+static int pseries_add_memory(struct device_node *np)
+{
+	const char *type;
+	const unsigned int *regs;
+	unsigned long base;
+	unsigned int lmb_size;
+	int ret = -EINVAL;
+
+	/*
+	 * Check to see if we are actually adding memory
+	 */
+	type = of_get_property(np, "device_type", NULL);
+	if (type == NULL || strcmp(type, "memory") != 0)
+		return 0;
+
+	/*
+	 * Find the base and size of the memblock
+	 */
+	regs = of_get_property(np, "reg", NULL);
+	if (!regs)
+		return ret;
+
+	base = *(unsigned long *)regs;
+	lmb_size = regs[3];
+
+	/*
+	 * Update memory region to represent the memory add
+	 */
+	ret = memblock_add(base, lmb_size);
+	return (ret < 0) ? -EINVAL : 0;
+}
+
+static int pseries_drconf_memory(unsigned long *base, unsigned int action)
+{
+	unsigned long memblock_size;
+	int rc;
+
+	memblock_size = get_memblock_size();
+	if (!memblock_size)
+		return -EINVAL;
+
+	if (action == PSERIES_DRCONF_MEM_ADD) {
+		rc = memblock_add(*base, memblock_size);
+		rc = (rc < 0) ? -EINVAL : 0;
+	} else if (action == PSERIES_DRCONF_MEM_REMOVE) {
+		rc = pseries_remove_memblock(*base, memblock_size);
+	} else {
+		rc = -EINVAL;
+	}
+
+	return rc;
+}
+
+static int pseries_memory_notifier(struct notifier_block *nb,
+				unsigned long action, void *node)
+{
+	int err = 0;
+
+	switch (action) {
+	case PSERIES_RECONFIG_ADD:
+		err = pseries_add_memory(node);
+		break;
+	case PSERIES_RECONFIG_REMOVE:
+		err = pseries_remove_memory(node);
+		break;
+	case PSERIES_DRCONF_MEM_ADD:
+	case PSERIES_DRCONF_MEM_REMOVE:
+		err = pseries_drconf_memory(node, action);
+		break;
+	}
+	return notifier_from_errno(err);
+}
+
+static struct notifier_block pseries_mem_nb = {
+	.notifier_call = pseries_memory_notifier,
+};
+
+static int __init pseries_memory_hotplug_init(void)
+{
+	if (firmware_has_feature(FW_FEATURE_LPAR))
+		pSeries_reconfig_notifier_register(&pseries_mem_nb);
+
+	return 0;
+}
+machine_device_initcall(pseries, pseries_memory_hotplug_init);
@@ -0,0 +1,270 @@
+/*
+ * This file contains the generic code to perform a call to the
+ * pSeries LPAR hypervisor.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <asm/hvcall.h>
+#include <asm/processor.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/ptrace.h>
+	
+#define STK_PARM(i)     (48 + ((i)-3)*8)
+
+#ifdef CONFIG_TRACEPOINTS
+
+	.section	".toc","aw"
+
+	.globl hcall_tracepoint_refcount
+hcall_tracepoint_refcount:
+	.llong	0
+
+	.section	".text"
+
+/*
+ * precall must preserve all registers.  use unused STK_PARM()
+ * areas to save snapshots and opcode. We branch around this
+ * in early init (eg when populating the MMU hashtable) by using an
+ * unconditional cpu feature.
+ */
+#define HCALL_INST_PRECALL(FIRST_REG)				\
+BEGIN_FTR_SECTION;						\
+	b	1f;						\
+END_FTR_SECTION(0, 1);						\
+	ld      r12,hcall_tracepoint_refcount@toc(r2);		\
+	std	r12,32(r1);					\
+	cmpdi	r12,0;						\
+	beq+	1f;						\
+	mflr	r0;						\
+	std	r3,STK_PARM(r3)(r1);				\
+	std	r4,STK_PARM(r4)(r1);				\
+	std	r5,STK_PARM(r5)(r1);				\
+	std	r6,STK_PARM(r6)(r1);				\
+	std	r7,STK_PARM(r7)(r1);				\
+	std	r8,STK_PARM(r8)(r1);				\
+	std	r9,STK_PARM(r9)(r1);				\
+	std	r10,STK_PARM(r10)(r1);				\
+	std	r0,16(r1);					\
+	addi	r4,r1,STK_PARM(FIRST_REG);			\
+	stdu	r1,-STACK_FRAME_OVERHEAD(r1);			\
+	bl	.__trace_hcall_entry;				\
+	addi	r1,r1,STACK_FRAME_OVERHEAD;			\
+	ld	r0,16(r1);					\
+	ld	r3,STK_PARM(r3)(r1);				\
+	ld	r4,STK_PARM(r4)(r1);				\
+	ld	r5,STK_PARM(r5)(r1);				\
+	ld	r6,STK_PARM(r6)(r1);				\
+	ld	r7,STK_PARM(r7)(r1);				\
+	ld	r8,STK_PARM(r8)(r1);				\
+	ld	r9,STK_PARM(r9)(r1);				\
+	ld	r10,STK_PARM(r10)(r1);				\
+	mtlr	r0;						\
+1:
+
+/*
+ * postcall is performed immediately before function return which
+ * allows liberal use of volatile registers.  We branch around this
+ * in early init (eg when populating the MMU hashtable) by using an
+ * unconditional cpu feature.
+ */
+#define __HCALL_INST_POSTCALL					\
+BEGIN_FTR_SECTION;						\
+	b	1f;						\
+END_FTR_SECTION(0, 1);						\
+	ld      r12,32(r1);					\
+	cmpdi	r12,0;						\
+	beq+	1f;						\
+	mflr	r0;						\
+	ld	r6,STK_PARM(r3)(r1);				\
+	std	r3,STK_PARM(r3)(r1);				\
+	mr	r4,r3;						\
+	mr	r3,r6;						\
+	std	r0,16(r1);					\
+	stdu	r1,-STACK_FRAME_OVERHEAD(r1);			\
+	bl	.__trace_hcall_exit;				\
+	addi	r1,r1,STACK_FRAME_OVERHEAD;			\
+	ld	r0,16(r1);					\
+	ld	r3,STK_PARM(r3)(r1);				\
+	mtlr	r0;						\
+1:
+
+#define HCALL_INST_POSTCALL_NORETS				\
+	li	r5,0;						\
+	__HCALL_INST_POSTCALL
+
+#define HCALL_INST_POSTCALL(BUFREG)				\
+	mr	r5,BUFREG;					\
+	__HCALL_INST_POSTCALL
+
+#else
+#define HCALL_INST_PRECALL(FIRST_ARG)
+#define HCALL_INST_POSTCALL_NORETS
+#define HCALL_INST_POSTCALL(BUFREG)
+#endif
+
+	.text
+
+_GLOBAL(plpar_hcall_norets)
+	HMT_MEDIUM
+
+	mfcr	r0
+	stw	r0,8(r1)
+
+	HCALL_INST_PRECALL(r4)
+
+	HVSC				/* invoke the hypervisor */
+
+	HCALL_INST_POSTCALL_NORETS
+
+	lwz	r0,8(r1)
+	mtcrf	0xff,r0
+	blr				/* return r3 = status */
+
+_GLOBAL(plpar_hcall)
+	HMT_MEDIUM
+
+	mfcr	r0
+	stw	r0,8(r1)
+
+	HCALL_INST_PRECALL(r5)
+
+	std     r4,STK_PARM(r4)(r1)     /* Save ret buffer */
+
+	mr	r4,r5
+	mr	r5,r6
+	mr	r6,r7
+	mr	r7,r8
+	mr	r8,r9
+	mr	r9,r10
+
+	HVSC				/* invoke the hypervisor */
+
+	ld	r12,STK_PARM(r4)(r1)
+	std	r4,  0(r12)
+	std	r5,  8(r12)
+	std	r6, 16(r12)
+	std	r7, 24(r12)
+
+	HCALL_INST_POSTCALL(r12)
+
+	lwz	r0,8(r1)
+	mtcrf	0xff,r0
+
+	blr				/* return r3 = status */
+
+/*
+ * plpar_hcall_raw can be called in real mode. kexec/kdump need some
+ * hypervisor calls to be executed in real mode. So plpar_hcall_raw
+ * does not access the per cpu hypervisor call statistics variables,
+ * since these variables may not be present in the RMO region.
+ */
+_GLOBAL(plpar_hcall_raw)
+	HMT_MEDIUM
+
+	mfcr	r0
+	stw	r0,8(r1)
+
+	std     r4,STK_PARM(r4)(r1)     /* Save ret buffer */
+
+	mr	r4,r5
+	mr	r5,r6
+	mr	r6,r7
+	mr	r7,r8
+	mr	r8,r9
+	mr	r9,r10
+
+	HVSC				/* invoke the hypervisor */
+
+	ld	r12,STK_PARM(r4)(r1)
+	std	r4,  0(r12)
+	std	r5,  8(r12)
+	std	r6, 16(r12)
+	std	r7, 24(r12)
+
+	lwz	r0,8(r1)
+	mtcrf	0xff,r0
+
+	blr				/* return r3 = status */
+
+_GLOBAL(plpar_hcall9)
+	HMT_MEDIUM
+
+	mfcr	r0
+	stw	r0,8(r1)
+
+	HCALL_INST_PRECALL(r5)
+
+	std     r4,STK_PARM(r4)(r1)     /* Save ret buffer */
+
+	mr	r4,r5
+	mr	r5,r6
+	mr	r6,r7
+	mr	r7,r8
+	mr	r8,r9
+	mr	r9,r10
+	ld	r10,STK_PARM(r11)(r1)	 /* put arg7 in R10 */
+	ld	r11,STK_PARM(r12)(r1)	 /* put arg8 in R11 */
+	ld	r12,STK_PARM(r13)(r1)    /* put arg9 in R12 */
+
+	HVSC				/* invoke the hypervisor */
+
+	mr	r0,r12
+	ld	r12,STK_PARM(r4)(r1)
+	std	r4,  0(r12)
+	std	r5,  8(r12)
+	std	r6, 16(r12)
+	std	r7, 24(r12)
+	std	r8, 32(r12)
+	std	r9, 40(r12)
+	std	r10,48(r12)
+	std	r11,56(r12)
+	std	r0, 64(r12)
+
+	HCALL_INST_POSTCALL(r12)
+
+	lwz	r0,8(r1)
+	mtcrf	0xff,r0
+
+	blr				/* return r3 = status */
+
+/* See plpar_hcall_raw to see why this is needed */
+_GLOBAL(plpar_hcall9_raw)
+	HMT_MEDIUM
+
+	mfcr	r0
+	stw	r0,8(r1)
+
+	std     r4,STK_PARM(r4)(r1)     /* Save ret buffer */
+
+	mr	r4,r5
+	mr	r5,r6
+	mr	r6,r7
+	mr	r7,r8
+	mr	r8,r9
+	mr	r9,r10
+	ld	r10,STK_PARM(r11)(r1)	 /* put arg7 in R10 */
+	ld	r11,STK_PARM(r12)(r1)	 /* put arg8 in R11 */
+	ld	r12,STK_PARM(r13)(r1)    /* put arg9 in R12 */
+
+	HVSC				/* invoke the hypervisor */
+
+	mr	r0,r12
+	ld	r12,STK_PARM(r4)(r1)
+	std	r4,  0(r12)
+	std	r5,  8(r12)
+	std	r6, 16(r12)
+	std	r7, 24(r12)
+	std	r8, 32(r12)
+	std	r9, 40(r12)
+	std	r10,48(r12)
+	std	r11,56(r12)
+	std	r0, 64(r12)
+
+	lwz	r0,8(r1)
+	mtcrf	0xff,r0
+
+	blr				/* return r3 = status */
@@ -0,0 +1,165 @@
+/*
+ * Copyright (C) 2006 Mike Kravetz IBM Corporation
+ *
+ * Hypervisor Call Instrumentation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/cpumask.h>
+#include <asm/hvcall.h>
+#include <asm/firmware.h>
+#include <asm/cputable.h>
+#include <asm/trace.h>
+
+DEFINE_PER_CPU(struct hcall_stats[HCALL_STAT_ARRAY_SIZE], hcall_stats);
+
+/*
+ * Routines for displaying the statistics in debugfs
+ */
+static void *hc_start(struct seq_file *m, loff_t *pos)
+{
+	if ((int)*pos < (HCALL_STAT_ARRAY_SIZE-1))
+		return (void *)(unsigned long)(*pos + 1);
+
+	return NULL;
+}
+
+static void *hc_next(struct seq_file *m, void *p, loff_t * pos)
+{
+	++*pos;
+
+	return hc_start(m, pos);
+}
+
+static void hc_stop(struct seq_file *m, void *p)
+{
+}
+
+static int hc_show(struct seq_file *m, void *p)
+{
+	unsigned long h_num = (unsigned long)p;
+	struct hcall_stats *hs = m->private;
+
+	if (hs[h_num].num_calls) {
+		if (cpu_has_feature(CPU_FTR_PURR))
+			seq_printf(m, "%lu %lu %lu %lu\n", h_num<<2,
+				   hs[h_num].num_calls,
+				   hs[h_num].tb_total,
+				   hs[h_num].purr_total);
+		else
+			seq_printf(m, "%lu %lu %lu\n", h_num<<2,
+				   hs[h_num].num_calls,
+				   hs[h_num].tb_total);
+	}
+
+	return 0;
+}
+
+static const struct seq_operations hcall_inst_seq_ops = {
+        .start = hc_start,
+        .next  = hc_next,
+        .stop  = hc_stop,
+        .show  = hc_show
+};
+
+static int hcall_inst_seq_open(struct inode *inode, struct file *file)
+{
+	int rc;
+	struct seq_file *seq;
+
+	rc = seq_open(file, &hcall_inst_seq_ops);
+	seq = file->private_data;
+	seq->private = file->f_path.dentry->d_inode->i_private;
+
+	return rc;
+}
+
+static const struct file_operations hcall_inst_seq_fops = {
+	.open = hcall_inst_seq_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release,
+};
+
+#define	HCALL_ROOT_DIR		"hcall_inst"
+#define CPU_NAME_BUF_SIZE	32
+
+
+static void probe_hcall_entry(void *ignored, unsigned long opcode, unsigned long *args)
+{
+	struct hcall_stats *h;
+
+	if (opcode > MAX_HCALL_OPCODE)
+		return;
+
+	h = &__get_cpu_var(hcall_stats)[opcode / 4];
+	h->tb_start = mftb();
+	h->purr_start = mfspr(SPRN_PURR);
+}
+
+static void probe_hcall_exit(void *ignored, unsigned long opcode, unsigned long retval,
+			     unsigned long *retbuf)
+{
+	struct hcall_stats *h;
+
+	if (opcode > MAX_HCALL_OPCODE)
+		return;
+
+	h = &__get_cpu_var(hcall_stats)[opcode / 4];
+	h->num_calls++;
+	h->tb_total += mftb() - h->tb_start;
+	h->purr_total += mfspr(SPRN_PURR) - h->purr_start;
+}
+
+static int __init hcall_inst_init(void)
+{
+	struct dentry *hcall_root;
+	struct dentry *hcall_file;
+	char cpu_name_buf[CPU_NAME_BUF_SIZE];
+	int cpu;
+
+	if (!firmware_has_feature(FW_FEATURE_LPAR))
+		return 0;
+
+	if (register_trace_hcall_entry(probe_hcall_entry, NULL))
+		return -EINVAL;
+
+	if (register_trace_hcall_exit(probe_hcall_exit, NULL)) {
+		unregister_trace_hcall_entry(probe_hcall_entry, NULL);
+		return -EINVAL;
+	}
+
+	hcall_root = debugfs_create_dir(HCALL_ROOT_DIR, NULL);
+	if (!hcall_root)
+		return -ENOMEM;
+
+	for_each_possible_cpu(cpu) {
+		snprintf(cpu_name_buf, CPU_NAME_BUF_SIZE, "cpu%d", cpu);
+		hcall_file = debugfs_create_file(cpu_name_buf, S_IRUGO,
+						 hcall_root,
+						 per_cpu(hcall_stats, cpu),
+						 &hcall_inst_seq_fops);
+		if (!hcall_file)
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+__initcall(hcall_inst_init);
@@ -0,0 +1,81 @@
+/*
+ * hvconsole.c
+ * Copyright (C) 2004 Hollis Blanchard, IBM Corporation
+ * Copyright (C) 2004 IBM Corporation
+ *
+ * Additional Author(s):
+ *  Ryan S. Arnold <rsa@us.ibm.com>
+ *
+ * LPAR console support.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/errno.h>
+#include <asm/hvcall.h>
+#include <asm/hvconsole.h>
+#include "plpar_wrappers.h"
+
+/**
+ * hvc_get_chars - retrieve characters from firmware for denoted vterm adatper
+ * @vtermno: The vtermno or unit_address of the adapter from which to fetch the
+ *	data.
+ * @buf: The character buffer into which to put the character data fetched from
+ *	firmware.
+ * @count: not used?
+ */
+int hvc_get_chars(uint32_t vtermno, char *buf, int count)
+{
+	unsigned long got;
+
+	if (plpar_get_term_char(vtermno, &got, buf) == H_SUCCESS)
+		return got;
+
+	return 0;
+}
+
+EXPORT_SYMBOL(hvc_get_chars);
+
+
+/**
+ * hvc_put_chars: send characters to firmware for denoted vterm adapter
+ * @vtermno: The vtermno or unit_address of the adapter from which the data
+ *	originated.
+ * @buf: The character buffer that contains the character data to send to
+ *	firmware.
+ * @count: Send this number of characters.
+ */
+int hvc_put_chars(uint32_t vtermno, const char *buf, int count)
+{
+	unsigned long *lbuf = (unsigned long *) buf;
+	long ret;
+
+
+	/* hcall will ret H_PARAMETER if 'count' exceeds firmware max.*/
+	if (count > MAX_VIO_PUT_CHARS)
+		count = MAX_VIO_PUT_CHARS;
+
+	ret = plpar_hcall_norets(H_PUT_TERM_CHAR, vtermno, count, lbuf[0],
+				 lbuf[1]);
+	if (ret == H_SUCCESS)
+		return count;
+	if (ret == H_BUSY)
+		return -EAGAIN;
+	return -EIO;
+}
+
+EXPORT_SYMBOL(hvc_put_chars);
@@ -0,0 +1,251 @@
+/*
+ * hvcserver.c
+ * Copyright (C) 2004 Ryan S Arnold, IBM Corporation
+ *
+ * PPC64 virtual I/O console server support.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <asm/hvcall.h>
+#include <asm/hvcserver.h>
+#include <asm/io.h>
+
+#define HVCS_ARCH_VERSION "1.0.0"
+
+MODULE_AUTHOR("Ryan S. Arnold <rsa@us.ibm.com>");
+MODULE_DESCRIPTION("IBM hvcs ppc64 API");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(HVCS_ARCH_VERSION);
+
+/*
+ * Convert arch specific return codes into relevant errnos.  The hvcs
+ * functions aren't performance sensitive, so this conversion isn't an
+ * issue.
+ */
+static int hvcs_convert(long to_convert)
+{
+	switch (to_convert) {
+		case H_SUCCESS:
+			return 0;
+		case H_PARAMETER:
+			return -EINVAL;
+		case H_HARDWARE:
+			return -EIO;
+		case H_BUSY:
+		case H_LONG_BUSY_ORDER_1_MSEC:
+		case H_LONG_BUSY_ORDER_10_MSEC:
+		case H_LONG_BUSY_ORDER_100_MSEC:
+		case H_LONG_BUSY_ORDER_1_SEC:
+		case H_LONG_BUSY_ORDER_10_SEC:
+		case H_LONG_BUSY_ORDER_100_SEC:
+			return -EBUSY;
+		case H_FUNCTION: /* fall through */
+		default:
+			return -EPERM;
+	}
+}
+
+/**
+ * hvcs_free_partner_info - free pi allocated by hvcs_get_partner_info
+ * @head: list_head pointer for an allocated list of partner info structs to
+ *	free.
+ *
+ * This function is used to free the partner info list that was returned by
+ * calling hvcs_get_partner_info().
+ */
+int hvcs_free_partner_info(struct list_head *head)
+{
+	struct hvcs_partner_info *pi;
+	struct list_head *element;
+
+	if (!head)
+		return -EINVAL;
+
+	while (!list_empty(head)) {
+		element = head->next;
+		pi = list_entry(element, struct hvcs_partner_info, node);
+		list_del(element);
+		kfree(pi);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(hvcs_free_partner_info);
+
+/* Helper function for hvcs_get_partner_info */
+static int hvcs_next_partner(uint32_t unit_address,
+		unsigned long last_p_partition_ID,
+		unsigned long last_p_unit_address, unsigned long *pi_buff)
+
+{
+	long retval;
+	retval = plpar_hcall_norets(H_VTERM_PARTNER_INFO, unit_address,
+			last_p_partition_ID,
+				last_p_unit_address, virt_to_phys(pi_buff));
+	return hvcs_convert(retval);
+}
+
+/**
+ * hvcs_get_partner_info - Get all of the partner info for a vty-server adapter
+ * @unit_address: The unit_address of the vty-server adapter for which this
+ *	function is fetching partner info.
+ * @head: An initialized list_head pointer to an empty list to use to return the
+ *	list of partner info fetched from the hypervisor to the caller.
+ * @pi_buff: A page sized buffer pre-allocated prior to calling this function
+ *	that is to be used to be used by firmware as an iterator to keep track
+ *	of the partner info retrieval.
+ *
+ * This function returns non-zero on success, or if there is no partner info.
+ *
+ * The pi_buff is pre-allocated prior to calling this function because this
+ * function may be called with a spin_lock held and kmalloc of a page is not
+ * recommended as GFP_ATOMIC.
+ *
+ * The first long of this buffer is used to store a partner unit address.  The
+ * second long is used to store a partner partition ID and starting at
+ * pi_buff[2] is the 79 character Converged Location Code (diff size than the
+ * unsigned longs, hence the casting mumbo jumbo you see later).
+ *
+ * Invocation of this function should always be followed by an invocation of
+ * hvcs_free_partner_info() using a pointer to the SAME list head instance
+ * that was passed as a parameter to this function.
+ */
+int hvcs_get_partner_info(uint32_t unit_address, struct list_head *head,
+		unsigned long *pi_buff)
+{
+	/*
+	 * Dealt with as longs because of the hcall interface even though the
+	 * values are uint32_t.
+	 */
+	unsigned long	last_p_partition_ID;
+	unsigned long	last_p_unit_address;
+	struct hvcs_partner_info *next_partner_info = NULL;
+	int more = 1;
+	int retval;
+
+	memset(pi_buff, 0x00, PAGE_SIZE);
+	/* invalid parameters */
+	if (!head || !pi_buff)
+		return -EINVAL;
+
+	last_p_partition_ID = last_p_unit_address = ~0UL;
+	INIT_LIST_HEAD(head);
+
+	do {
+		retval = hvcs_next_partner(unit_address, last_p_partition_ID,
+				last_p_unit_address, pi_buff);
+		if (retval) {
+			/*
+			 * Don't indicate that we've failed if we have
+			 * any list elements.
+			 */
+			if (!list_empty(head))
+				return 0;
+			return retval;
+		}
+
+		last_p_partition_ID = pi_buff[0];
+		last_p_unit_address = pi_buff[1];
+
+		/* This indicates that there are no further partners */
+		if (last_p_partition_ID == ~0UL
+				&& last_p_unit_address == ~0UL)
+			break;
+
+		/* This is a very small struct and will be freed soon in
+		 * hvcs_free_partner_info(). */
+		next_partner_info = kmalloc(sizeof(struct hvcs_partner_info),
+				GFP_ATOMIC);
+
+		if (!next_partner_info) {
+			printk(KERN_WARNING "HVCONSOLE: kmalloc() failed to"
+				" allocate partner info struct.\n");
+			hvcs_free_partner_info(head);
+			return -ENOMEM;
+		}
+
+		next_partner_info->unit_address
+			= (unsigned int)last_p_unit_address;
+		next_partner_info->partition_ID
+			= (unsigned int)last_p_partition_ID;
+
+		/* copy the Null-term char too */
+		strncpy(&next_partner_info->location_code[0],
+			(char *)&pi_buff[2],
+			strlen((char *)&pi_buff[2]) + 1);
+
+		list_add_tail(&(next_partner_info->node), head);
+		next_partner_info = NULL;
+
+	} while (more);
+
+	return 0;
+}
+EXPORT_SYMBOL(hvcs_get_partner_info);
+
+/**
+ * hvcs_register_connection - establish a connection between this vty-server and
+ *	a vty.
+ * @unit_address: The unit address of the vty-server adapter that is to be
+ *	establish a connection.
+ * @p_partition_ID: The partition ID of the vty adapter that is to be connected.
+ * @p_unit_address: The unit address of the vty adapter to which the vty-server
+ *	is to be connected.
+ *
+ * If this function is called once and -EINVAL is returned it may
+ * indicate that the partner info needs to be refreshed for the
+ * target unit address at which point the caller must invoke
+ * hvcs_get_partner_info() and then call this function again.  If,
+ * for a second time, -EINVAL is returned then it indicates that
+ * there is probably already a partner connection registered to a
+ * different vty-server adapter.  It is also possible that a second
+ * -EINVAL may indicate that one of the parms is not valid, for
+ * instance if the link was removed between the vty-server adapter
+ * and the vty adapter that you are trying to open.  Don't shoot the
+ * messenger.  Firmware implemented it this way.
+ */
+int hvcs_register_connection( uint32_t unit_address,
+		uint32_t p_partition_ID, uint32_t p_unit_address)
+{
+	long retval;
+	retval = plpar_hcall_norets(H_REGISTER_VTERM, unit_address,
+				p_partition_ID, p_unit_address);
+	return hvcs_convert(retval);
+}
+EXPORT_SYMBOL(hvcs_register_connection);
+
+/**
+ * hvcs_free_connection - free the connection between a vty-server and vty
+ * @unit_address: The unit address of the vty-server that is to have its
+ *	connection severed.
+ *
+ * This function is used to free the partner connection between a vty-server
+ * adapter and a vty adapter.
+ *
+ * If -EBUSY is returned continue to call this function until 0 is returned.
+ */
+int hvcs_free_connection(uint32_t unit_address)
+{
+	long retval;
+	retval = plpar_hcall_norets(H_FREE_VTERM, unit_address);
+	return hvcs_convert(retval);
+}
+EXPORT_SYMBOL(hvcs_free_connection);
@@ -0,0 +1,165 @@
+/*
+ * Copyright 2010 2011 Mark Nelson and Tseng-Hui (Frank) Lin, IBM Corporation
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/of.h>
+#include <linux/list.h>
+#include <linux/notifier.h>
+
+#include <asm/machdep.h>
+#include <asm/rtas.h>
+#include <asm/irq.h>
+#include <asm/io_event_irq.h>
+
+#include "pseries.h"
+
+/*
+ * IO event interrupt is a mechanism provided by RTAS to return
+ * information about hardware error and non-error events. Device
+ * drivers can register their event handlers to receive events.
+ * Device drivers are expected to use atomic_notifier_chain_register()
+ * and atomic_notifier_chain_unregister() to register and unregister
+ * their event handlers. Since multiple IO event types and scopes
+ * share an IO event interrupt, the event handlers are called one
+ * by one until the IO event is claimed by one of the handlers.
+ * The event handlers are expected to return NOTIFY_OK if the
+ * event is handled by the event handler or NOTIFY_DONE if the
+ * event does not belong to the handler.
+ *
+ * Usage:
+ *
+ * Notifier function:
+ * #include <asm/io_event_irq.h>
+ * int event_handler(struct notifier_block *nb, unsigned long val, void *data) {
+ * 	p = (struct pseries_io_event_sect_data *) data;
+ * 	if (! is_my_event(p->scope, p->event_type)) return NOTIFY_DONE;
+ * 		:
+ * 		:
+ * 	return NOTIFY_OK;
+ * }
+ * struct notifier_block event_nb = {
+ * 	.notifier_call = event_handler,
+ * }
+ *
+ * Registration:
+ * atomic_notifier_chain_register(&pseries_ioei_notifier_list, &event_nb);
+ *
+ * Unregistration:
+ * atomic_notifier_chain_unregister(&pseries_ioei_notifier_list, &event_nb);
+ */
+
+ATOMIC_NOTIFIER_HEAD(pseries_ioei_notifier_list);
+EXPORT_SYMBOL_GPL(pseries_ioei_notifier_list);
+
+static int ioei_check_exception_token;
+
+static char ioei_rtas_buf[RTAS_DATA_BUF_SIZE] __cacheline_aligned;
+
+/**
+ * Find the data portion of an IO Event section from event log.
+ * @elog: RTAS error/event log.
+ *
+ * Return:
+ * 	pointer to a valid IO event section data. NULL if not found.
+ */
+static struct pseries_io_event * ioei_find_event(struct rtas_error_log *elog)
+{
+	struct pseries_errorlog *sect;
+
+	/* We should only ever get called for io-event interrupts, but if
+	 * we do get called for another type then something went wrong so
+	 * make some noise about it.
+	 * RTAS_TYPE_IO only exists in extended event log version 6 or later.
+	 * No need to check event log version.
+	 */
+	if (unlikely(elog->type != RTAS_TYPE_IO)) {
+		printk_once(KERN_WARNING "io_event_irq: Unexpected event type %d",
+			    elog->type);
+		return NULL;
+	}
+
+	sect = get_pseries_errorlog(elog, PSERIES_ELOG_SECT_ID_IO_EVENT);
+	if (unlikely(!sect)) {
+		printk_once(KERN_WARNING "io_event_irq: RTAS extended event "
+			    "log does not contain an IO Event section. "
+			    "Could be a bug in system firmware!\n");
+		return NULL;
+	}
+	return (struct pseries_io_event *) &sect->data;
+}
+
+/*
+ * PAPR:
+ * - check-exception returns the first found error or event and clear that
+ *   error or event so it is reported once.
+ * - Each interrupt returns one event. If a plateform chooses to report
+ *   multiple events through a single interrupt, it must ensure that the
+ *   interrupt remains asserted until check-exception has been used to
+ *   process all out-standing events for that interrupt.
+ *
+ * Implementation notes:
+ * - Events must be processed in the order they are returned. Hence,
+ *   sequential in nature.
+ * - The owner of an event is determined by combinations of scope,
+ *   event type, and sub-type. There is no easy way to pre-sort clients
+ *   by scope or event type alone. For example, Torrent ISR route change
+ *   event is reported with scope 0x00 (Not Applicatable) rather than
+ *   0x3B (Torrent-hub). It is better to let the clients to identify
+ *   who owns the the event.
+ */
+
+static irqreturn_t ioei_interrupt(int irq, void *dev_id)
+{
+	struct pseries_io_event *event;
+	int rtas_rc;
+
+	for (;;) {
+		rtas_rc = rtas_call(ioei_check_exception_token, 6, 1, NULL,
+				    RTAS_VECTOR_EXTERNAL_INTERRUPT,
+				    virq_to_hw(irq),
+				    RTAS_IO_EVENTS, 1 /* Time Critical */,
+				    __pa(ioei_rtas_buf),
+				    RTAS_DATA_BUF_SIZE);
+		if (rtas_rc != 0)
+			break;
+
+		event = ioei_find_event((struct rtas_error_log *)ioei_rtas_buf);
+		if (!event)
+			continue;
+
+		atomic_notifier_call_chain(&pseries_ioei_notifier_list,
+					   0, event);
+	}
+	return IRQ_HANDLED;
+}
+
+static int __init ioei_init(void)
+{
+	struct device_node *np;
+
+	ioei_check_exception_token = rtas_token("check-exception");
+	if (ioei_check_exception_token == RTAS_UNKNOWN_SERVICE)
+		return -ENODEV;
+
+	np = of_find_node_by_path("/event-sources/ibm,io-events");
+	if (np) {
+		request_event_sources_irqs(np, ioei_interrupt, "IO_EVENT");
+		pr_info("IBM I/O event interrupts enabled\n");
+		of_node_put(np);
+	} else {
+		return -ENODEV;
+	}
+	return 0;
+}
+machine_subsys_initcall(pseries, ioei_init);
+
@@ -0,0 +1,76 @@
+/*
+ *  Copyright 2006 Michael Ellerman, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+
+#include <asm/machdep.h>
+#include <asm/page.h>
+#include <asm/firmware.h>
+#include <asm/kexec.h>
+#include <asm/mpic.h>
+#include <asm/xics.h>
+#include <asm/smp.h>
+
+#include "pseries.h"
+#include "plpar_wrappers.h"
+
+static void pseries_kexec_cpu_down(int crash_shutdown, int secondary)
+{
+	/* Don't risk a hypervisor call if we're crashing */
+	if (firmware_has_feature(FW_FEATURE_SPLPAR) && !crash_shutdown) {
+		int ret;
+		int cpu = smp_processor_id();
+		int hwcpu = hard_smp_processor_id();
+
+		if (get_lppaca()->dtl_enable_mask) {
+			ret = unregister_dtl(hwcpu);
+			if (ret) {
+				pr_err("WARNING: DTL deregistration for cpu "
+				       "%d (hw %d) failed with %d\n",
+				       cpu, hwcpu, ret);
+			}
+		}
+
+		ret = unregister_slb_shadow(hwcpu);
+		if (ret) {
+			pr_err("WARNING: SLB shadow buffer deregistration "
+			       "for cpu %d (hw %d) failed with %d\n",
+			       cpu, hwcpu, ret);
+		}
+
+		ret = unregister_vpa(hwcpu);
+		if (ret) {
+			pr_err("WARNING: VPA deregistration for cpu %d "
+			       "(hw %d) failed with %d\n", cpu, hwcpu, ret);
+		}
+	}
+}
+
+static void pseries_kexec_cpu_down_mpic(int crash_shutdown, int secondary)
+{
+	pseries_kexec_cpu_down(crash_shutdown, secondary);
+	mpic_teardown_this_cpu(secondary);
+}
+
+void __init setup_kexec_cpu_down_mpic(void)
+{
+	ppc_md.kexec_cpu_down = pseries_kexec_cpu_down_mpic;
+}
+
+static void pseries_kexec_cpu_down_xics(int crash_shutdown, int secondary)
+{
+	pseries_kexec_cpu_down(crash_shutdown, secondary);
+	xics_kexec_teardown_cpu(secondary);
+}
+
+void __init setup_kexec_cpu_down_xics(void)
+{
+	ppc_md.kexec_cpu_down = pseries_kexec_cpu_down_xics;
+}
@@ -0,0 +1,641 @@
+/*
+ * pSeries_lpar.c
+ * Copyright (C) 2001 Todd Inglett, IBM Corporation
+ *
+ * pSeries LPAR support.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+/* Enables debugging of low-level hash table routines - careful! */
+#undef DEBUG
+
+#include <linux/kernel.h>
+#include <linux/dma-mapping.h>
+#include <linux/console.h>
+#include <linux/export.h>
+#include <asm/processor.h>
+#include <asm/mmu.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/machdep.h>
+#include <asm/abs_addr.h>
+#include <asm/mmu_context.h>
+#include <asm/iommu.h>
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+#include <asm/prom.h>
+#include <asm/cputable.h>
+#include <asm/udbg.h>
+#include <asm/smp.h>
+#include <asm/trace.h>
+#include <asm/firmware.h>
+
+#include "plpar_wrappers.h"
+#include "pseries.h"
+
+
+/* in hvCall.S */
+EXPORT_SYMBOL(plpar_hcall);
+EXPORT_SYMBOL(plpar_hcall9);
+EXPORT_SYMBOL(plpar_hcall_norets);
+
+extern void pSeries_find_serial_port(void);
+
+void vpa_init(int cpu)
+{
+	int hwcpu = get_hard_smp_processor_id(cpu);
+	unsigned long addr;
+	long ret;
+	struct paca_struct *pp;
+	struct dtl_entry *dtl;
+
+	if (cpu_has_feature(CPU_FTR_ALTIVEC))
+		lppaca_of(cpu).vmxregs_in_use = 1;
+
+	addr = __pa(&lppaca_of(cpu));
+	ret = register_vpa(hwcpu, addr);
+
+	if (ret) {
+		pr_err("WARNING: VPA registration for cpu %d (hw %d) of area "
+		       "%lx failed with %ld\n", cpu, hwcpu, addr, ret);
+		return;
+	}
+	/*
+	 * PAPR says this feature is SLB-Buffer but firmware never
+	 * reports that.  All SPLPAR support SLB shadow buffer.
+	 */
+	addr = __pa(&slb_shadow[cpu]);
+	if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
+		ret = register_slb_shadow(hwcpu, addr);
+		if (ret)
+			pr_err("WARNING: SLB shadow buffer registration for "
+			       "cpu %d (hw %d) of area %lx failed with %ld\n",
+			       cpu, hwcpu, addr, ret);
+	}
+
+	/*
+	 * Register dispatch trace log, if one has been allocated.
+	 */
+	pp = &paca[cpu];
+	dtl = pp->dispatch_log;
+	if (dtl) {
+		pp->dtl_ridx = 0;
+		pp->dtl_curr = dtl;
+		lppaca_of(cpu).dtl_idx = 0;
+
+		/* hypervisor reads buffer length from this field */
+		dtl->enqueue_to_dispatch_time = DISPATCH_LOG_BYTES;
+		ret = register_dtl(hwcpu, __pa(dtl));
+		if (ret)
+			pr_err("WARNING: DTL registration of cpu %d (hw %d) "
+			       "failed with %ld\n", smp_processor_id(),
+			       hwcpu, ret);
+		lppaca_of(cpu).dtl_enable_mask = 2;
+	}
+}
+
+static long pSeries_lpar_hpte_insert(unsigned long hpte_group,
+ 			      unsigned long va, unsigned long pa,
+ 			      unsigned long rflags, unsigned long vflags,
+			      int psize, int ssize)
+{
+	unsigned long lpar_rc;
+	unsigned long flags;
+	unsigned long slot;
+	unsigned long hpte_v, hpte_r;
+
+	if (!(vflags & HPTE_V_BOLTED))
+		pr_devel("hpte_insert(group=%lx, va=%016lx, pa=%016lx, "
+			 "rflags=%lx, vflags=%lx, psize=%d)\n",
+			 hpte_group, va, pa, rflags, vflags, psize);
+
+	hpte_v = hpte_encode_v(va, psize, ssize) | vflags | HPTE_V_VALID;
+	hpte_r = hpte_encode_r(pa, psize) | rflags;
+
+	if (!(vflags & HPTE_V_BOLTED))
+		pr_devel(" hpte_v=%016lx, hpte_r=%016lx\n", hpte_v, hpte_r);
+
+	/* Now fill in the actual HPTE */
+	/* Set CEC cookie to 0         */
+	/* Zero page = 0               */
+	/* I-cache Invalidate = 0      */
+	/* I-cache synchronize = 0     */
+	/* Exact = 0                   */
+	flags = 0;
+
+	/* Make pHyp happy */
+	if ((rflags & _PAGE_NO_CACHE) & !(rflags & _PAGE_WRITETHRU))
+		hpte_r &= ~_PAGE_COHERENT;
+	if (firmware_has_feature(FW_FEATURE_XCMO) && !(hpte_r & HPTE_R_N))
+		flags |= H_COALESCE_CAND;
+
+	lpar_rc = plpar_pte_enter(flags, hpte_group, hpte_v, hpte_r, &slot);
+	if (unlikely(lpar_rc == H_PTEG_FULL)) {
+		if (!(vflags & HPTE_V_BOLTED))
+			pr_devel(" full\n");
+		return -1;
+	}
+
+	/*
+	 * Since we try and ioremap PHBs we don't own, the pte insert
+	 * will fail. However we must catch the failure in hash_page
+	 * or we will loop forever, so return -2 in this case.
+	 */
+	if (unlikely(lpar_rc != H_SUCCESS)) {
+		if (!(vflags & HPTE_V_BOLTED))
+			pr_devel(" lpar err %lu\n", lpar_rc);
+		return -2;
+	}
+	if (!(vflags & HPTE_V_BOLTED))
+		pr_devel(" -> slot: %lu\n", slot & 7);
+
+	/* Because of iSeries, we have to pass down the secondary
+	 * bucket bit here as well
+	 */
+	return (slot & 7) | (!!(vflags & HPTE_V_SECONDARY) << 3);
+}
+
+static DEFINE_SPINLOCK(pSeries_lpar_tlbie_lock);
+
+static long pSeries_lpar_hpte_remove(unsigned long hpte_group)
+{
+	unsigned long slot_offset;
+	unsigned long lpar_rc;
+	int i;
+	unsigned long dummy1, dummy2;
+
+	/* pick a random slot to start at */
+	slot_offset = mftb() & 0x7;
+
+	for (i = 0; i < HPTES_PER_GROUP; i++) {
+
+		/* don't remove a bolted entry */
+		lpar_rc = plpar_pte_remove(H_ANDCOND, hpte_group + slot_offset,
+					   (0x1UL << 4), &dummy1, &dummy2);
+		if (lpar_rc == H_SUCCESS)
+			return i;
+		BUG_ON(lpar_rc != H_NOT_FOUND);
+
+		slot_offset++;
+		slot_offset &= 0x7;
+	}
+
+	return -1;
+}
+
+static void pSeries_lpar_hptab_clear(void)
+{
+	unsigned long size_bytes = 1UL << ppc64_pft_size;
+	unsigned long hpte_count = size_bytes >> 4;
+	struct {
+		unsigned long pteh;
+		unsigned long ptel;
+	} ptes[4];
+	long lpar_rc;
+	unsigned long i, j;
+
+	/* Read in batches of 4,
+	 * invalidate only valid entries not in the VRMA
+	 * hpte_count will be a multiple of 4
+         */
+	for (i = 0; i < hpte_count; i += 4) {
+		lpar_rc = plpar_pte_read_4_raw(0, i, (void *)ptes);
+		if (lpar_rc != H_SUCCESS)
+			continue;
+		for (j = 0; j < 4; j++){
+			if ((ptes[j].pteh & HPTE_V_VRMA_MASK) ==
+				HPTE_V_VRMA_MASK)
+				continue;
+			if (ptes[j].pteh & HPTE_V_VALID)
+				plpar_pte_remove_raw(0, i + j, 0,
+					&(ptes[j].pteh), &(ptes[j].ptel));
+		}
+	}
+}
+
+/*
+ * This computes the AVPN and B fields of the first dword of a HPTE,
+ * for use when we want to match an existing PTE.  The bottom 7 bits
+ * of the returned value are zero.
+ */
+static inline unsigned long hpte_encode_avpn(unsigned long va, int psize,
+					     int ssize)
+{
+	unsigned long v;
+
+	v = (va >> 23) & ~(mmu_psize_defs[psize].avpnm);
+	v <<= HPTE_V_AVPN_SHIFT;
+	v |= ((unsigned long) ssize) << HPTE_V_SSIZE_SHIFT;
+	return v;
+}
+
+/*
+ * NOTE: for updatepp ops we are fortunate that the linux "newpp" bits and
+ * the low 3 bits of flags happen to line up.  So no transform is needed.
+ * We can probably optimize here and assume the high bits of newpp are
+ * already zero.  For now I am paranoid.
+ */
+static long pSeries_lpar_hpte_updatepp(unsigned long slot,
+				       unsigned long newpp,
+				       unsigned long va,
+				       int psize, int ssize, int local)
+{
+	unsigned long lpar_rc;
+	unsigned long flags = (newpp & 7) | H_AVPN;
+	unsigned long want_v;
+
+	want_v = hpte_encode_avpn(va, psize, ssize);
+
+	pr_devel("    update: avpnv=%016lx, hash=%016lx, f=%lx, psize: %d ...",
+		 want_v, slot, flags, psize);
+
+	lpar_rc = plpar_pte_protect(flags, slot, want_v);
+
+	if (lpar_rc == H_NOT_FOUND) {
+		pr_devel("not found !\n");
+		return -1;
+	}
+
+	pr_devel("ok\n");
+
+	BUG_ON(lpar_rc != H_SUCCESS);
+
+	return 0;
+}
+
+static unsigned long pSeries_lpar_hpte_getword0(unsigned long slot)
+{
+	unsigned long dword0;
+	unsigned long lpar_rc;
+	unsigned long dummy_word1;
+	unsigned long flags;
+
+	/* Read 1 pte at a time                        */
+	/* Do not need RPN to logical page translation */
+	/* No cross CEC PFT access                     */
+	flags = 0;
+
+	lpar_rc = plpar_pte_read(flags, slot, &dword0, &dummy_word1);
+
+	BUG_ON(lpar_rc != H_SUCCESS);
+
+	return dword0;
+}
+
+static long pSeries_lpar_hpte_find(unsigned long va, int psize, int ssize)
+{
+	unsigned long hash;
+	unsigned long i;
+	long slot;
+	unsigned long want_v, hpte_v;
+
+	hash = hpt_hash(va, mmu_psize_defs[psize].shift, ssize);
+	want_v = hpte_encode_avpn(va, psize, ssize);
+
+	/* Bolted entries are always in the primary group */
+	slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+	for (i = 0; i < HPTES_PER_GROUP; i++) {
+		hpte_v = pSeries_lpar_hpte_getword0(slot);
+
+		if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID))
+			/* HPTE matches */
+			return slot;
+		++slot;
+	}
+
+	return -1;
+} 
+
+static void pSeries_lpar_hpte_updateboltedpp(unsigned long newpp,
+					     unsigned long ea,
+					     int psize, int ssize)
+{
+	unsigned long lpar_rc, slot, vsid, va, flags;
+
+	vsid = get_kernel_vsid(ea, ssize);
+	va = hpt_va(ea, vsid, ssize);
+
+	slot = pSeries_lpar_hpte_find(va, psize, ssize);
+	BUG_ON(slot == -1);
+
+	flags = newpp & 7;
+	lpar_rc = plpar_pte_protect(flags, slot, 0);
+
+	BUG_ON(lpar_rc != H_SUCCESS);
+}
+
+static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long va,
+					 int psize, int ssize, int local)
+{
+	unsigned long want_v;
+	unsigned long lpar_rc;
+	unsigned long dummy1, dummy2;
+
+	pr_devel("    inval : slot=%lx, va=%016lx, psize: %d, local: %d\n",
+		 slot, va, psize, local);
+
+	want_v = hpte_encode_avpn(va, psize, ssize);
+	lpar_rc = plpar_pte_remove(H_AVPN, slot, want_v, &dummy1, &dummy2);
+	if (lpar_rc == H_NOT_FOUND)
+		return;
+
+	BUG_ON(lpar_rc != H_SUCCESS);
+}
+
+static void pSeries_lpar_hpte_removebolted(unsigned long ea,
+					   int psize, int ssize)
+{
+	unsigned long slot, vsid, va;
+
+	vsid = get_kernel_vsid(ea, ssize);
+	va = hpt_va(ea, vsid, ssize);
+
+	slot = pSeries_lpar_hpte_find(va, psize, ssize);
+	BUG_ON(slot == -1);
+
+	pSeries_lpar_hpte_invalidate(slot, va, psize, ssize, 0);
+}
+
+/* Flag bits for H_BULK_REMOVE */
+#define HBR_REQUEST	0x4000000000000000UL
+#define HBR_RESPONSE	0x8000000000000000UL
+#define HBR_END		0xc000000000000000UL
+#define HBR_AVPN	0x0200000000000000UL
+#define HBR_ANDCOND	0x0100000000000000UL
+
+/*
+ * Take a spinlock around flushes to avoid bouncing the hypervisor tlbie
+ * lock.
+ */
+static void pSeries_lpar_flush_hash_range(unsigned long number, int local)
+{
+	unsigned long i, pix, rc;
+	unsigned long flags = 0;
+	struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
+	int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
+	unsigned long param[9];
+	unsigned long va;
+	unsigned long hash, index, shift, hidx, slot;
+	real_pte_t pte;
+	int psize, ssize;
+
+	if (lock_tlbie)
+		spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags);
+
+	psize = batch->psize;
+	ssize = batch->ssize;
+	pix = 0;
+	for (i = 0; i < number; i++) {
+		va = batch->vaddr[i];
+		pte = batch->pte[i];
+		pte_iterate_hashed_subpages(pte, psize, va, index, shift) {
+			hash = hpt_hash(va, shift, ssize);
+			hidx = __rpte_to_hidx(pte, index);
+			if (hidx & _PTEIDX_SECONDARY)
+				hash = ~hash;
+			slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+			slot += hidx & _PTEIDX_GROUP_IX;
+			if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
+				pSeries_lpar_hpte_invalidate(slot, va, psize,
+							     ssize, local);
+			} else {
+				param[pix] = HBR_REQUEST | HBR_AVPN | slot;
+				param[pix+1] = hpte_encode_avpn(va, psize,
+								ssize);
+				pix += 2;
+				if (pix == 8) {
+					rc = plpar_hcall9(H_BULK_REMOVE, param,
+						param[0], param[1], param[2],
+						param[3], param[4], param[5],
+						param[6], param[7]);
+					BUG_ON(rc != H_SUCCESS);
+					pix = 0;
+				}
+			}
+		} pte_iterate_hashed_end();
+	}
+	if (pix) {
+		param[pix] = HBR_END;
+		rc = plpar_hcall9(H_BULK_REMOVE, param, param[0], param[1],
+				  param[2], param[3], param[4], param[5],
+				  param[6], param[7]);
+		BUG_ON(rc != H_SUCCESS);
+	}
+
+	if (lock_tlbie)
+		spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags);
+}
+
+static int __init disable_bulk_remove(char *str)
+{
+	if (strcmp(str, "off") == 0 &&
+	    firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
+			printk(KERN_INFO "Disabling BULK_REMOVE firmware feature");
+			powerpc_firmware_features &= ~FW_FEATURE_BULK_REMOVE;
+	}
+	return 1;
+}
+
+__setup("bulk_remove=", disable_bulk_remove);
+
+void __init hpte_init_lpar(void)
+{
+	ppc_md.hpte_invalidate	= pSeries_lpar_hpte_invalidate;
+	ppc_md.hpte_updatepp	= pSeries_lpar_hpte_updatepp;
+	ppc_md.hpte_updateboltedpp = pSeries_lpar_hpte_updateboltedpp;
+	ppc_md.hpte_insert	= pSeries_lpar_hpte_insert;
+	ppc_md.hpte_remove	= pSeries_lpar_hpte_remove;
+	ppc_md.hpte_removebolted = pSeries_lpar_hpte_removebolted;
+	ppc_md.flush_hash_range	= pSeries_lpar_flush_hash_range;
+	ppc_md.hpte_clear_all   = pSeries_lpar_hptab_clear;
+}
+
+#ifdef CONFIG_PPC_SMLPAR
+#define CMO_FREE_HINT_DEFAULT 1
+static int cmo_free_hint_flag = CMO_FREE_HINT_DEFAULT;
+
+static int __init cmo_free_hint(char *str)
+{
+	char *parm;
+	parm = strstrip(str);
+
+	if (strcasecmp(parm, "no") == 0 || strcasecmp(parm, "off") == 0) {
+		printk(KERN_INFO "cmo_free_hint: CMO free page hinting is not active.\n");
+		cmo_free_hint_flag = 0;
+		return 1;
+	}
+
+	cmo_free_hint_flag = 1;
+	printk(KERN_INFO "cmo_free_hint: CMO free page hinting is active.\n");
+
+	if (strcasecmp(parm, "yes") == 0 || strcasecmp(parm, "on") == 0)
+		return 1;
+
+	return 0;
+}
+
+__setup("cmo_free_hint=", cmo_free_hint);
+
+static void pSeries_set_page_state(struct page *page, int order,
+				   unsigned long state)
+{
+	int i, j;
+	unsigned long cmo_page_sz, addr;
+
+	cmo_page_sz = cmo_get_page_size();
+	addr = __pa((unsigned long)page_address(page));
+
+	for (i = 0; i < (1 << order); i++, addr += PAGE_SIZE) {
+		for (j = 0; j < PAGE_SIZE; j += cmo_page_sz)
+			plpar_hcall_norets(H_PAGE_INIT, state, addr + j, 0);
+	}
+}
+
+void arch_free_page(struct page *page, int order)
+{
+	if (!cmo_free_hint_flag || !firmware_has_feature(FW_FEATURE_CMO))
+		return;
+
+	pSeries_set_page_state(page, order, H_PAGE_SET_UNUSED);
+}
+EXPORT_SYMBOL(arch_free_page);
+
+#endif
+
+#ifdef CONFIG_TRACEPOINTS
+/*
+ * We optimise our hcall path by placing hcall_tracepoint_refcount
+ * directly in the TOC so we can check if the hcall tracepoints are
+ * enabled via a single load.
+ */
+
+/* NB: reg/unreg are called while guarded with the tracepoints_mutex */
+extern long hcall_tracepoint_refcount;
+
+/* 
+ * Since the tracing code might execute hcalls we need to guard against
+ * recursion. One example of this are spinlocks calling H_YIELD on
+ * shared processor partitions.
+ */
+static DEFINE_PER_CPU(unsigned int, hcall_trace_depth);
+
+void hcall_tracepoint_regfunc(void)
+{
+	hcall_tracepoint_refcount++;
+}
+
+void hcall_tracepoint_unregfunc(void)
+{
+	hcall_tracepoint_refcount--;
+}
+
+void __trace_hcall_entry(unsigned long opcode, unsigned long *args)
+{
+	unsigned long flags;
+	unsigned int *depth;
+
+	/*
+	 * We cannot call tracepoints inside RCU idle regions which
+	 * means we must not trace H_CEDE.
+	 */
+	if (opcode == H_CEDE)
+		return;
+
+	local_irq_save(flags);
+
+	depth = &__get_cpu_var(hcall_trace_depth);
+
+	if (*depth)
+		goto out;
+
+	(*depth)++;
+	preempt_disable();
+	trace_hcall_entry(opcode, args);
+	(*depth)--;
+
+out:
+	local_irq_restore(flags);
+}
+
+void __trace_hcall_exit(long opcode, unsigned long retval,
+			unsigned long *retbuf)
+{
+	unsigned long flags;
+	unsigned int *depth;
+
+	if (opcode == H_CEDE)
+		return;
+
+	local_irq_save(flags);
+
+	depth = &__get_cpu_var(hcall_trace_depth);
+
+	if (*depth)
+		goto out;
+
+	(*depth)++;
+	trace_hcall_exit(opcode, retval, retbuf);
+	preempt_enable();
+	(*depth)--;
+
+out:
+	local_irq_restore(flags);
+}
+#endif
+
+/**
+ * h_get_mpp
+ * H_GET_MPP hcall returns info in 7 parms
+ */
+int h_get_mpp(struct hvcall_mpp_data *mpp_data)
+{
+	int rc;
+	unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
+
+	rc = plpar_hcall9(H_GET_MPP, retbuf);
+
+	mpp_data->entitled_mem = retbuf[0];
+	mpp_data->mapped_mem = retbuf[1];
+
+	mpp_data->group_num = (retbuf[2] >> 2 * 8) & 0xffff;
+	mpp_data->pool_num = retbuf[2] & 0xffff;
+
+	mpp_data->mem_weight = (retbuf[3] >> 7 * 8) & 0xff;
+	mpp_data->unallocated_mem_weight = (retbuf[3] >> 6 * 8) & 0xff;
+	mpp_data->unallocated_entitlement = retbuf[3] & 0xffffffffffff;
+
+	mpp_data->pool_size = retbuf[4];
+	mpp_data->loan_request = retbuf[5];
+	mpp_data->backing_mem = retbuf[6];
+
+	return rc;
+}
+EXPORT_SYMBOL(h_get_mpp);
+
+int h_get_mpp_x(struct hvcall_mpp_x_data *mpp_x_data)
+{
+	int rc;
+	unsigned long retbuf[PLPAR_HCALL9_BUFSIZE] = { 0 };
+
+	rc = plpar_hcall9(H_GET_MPP_X, retbuf);
+
+	mpp_x_data->coalesced_bytes = retbuf[0];
+	mpp_x_data->pool_coalesced_bytes = retbuf[1];
+	mpp_x_data->pool_purr_cycles = retbuf[2];
+	mpp_x_data->pool_spurr_cycles = retbuf[3];
+
+	return rc;
+}
@@ -0,0 +1,363 @@
+/*
+ * Support for Partition Mobility/Migration
+ *
+ * Copyright (C) 2010 Nathan Fontenot
+ * Copyright (C) 2010 IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kobject.h>
+#include <linux/smp.h>
+#include <linux/stat.h>
+#include <linux/completion.h>
+#include <linux/device.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+
+#include <asm/rtas.h>
+#include "pseries.h"
+
+static struct kobject *mobility_kobj;
+
+struct update_props_workarea {
+	u32 phandle;
+	u32 state;
+	u64 reserved;
+	u32 nprops;
+};
+
+#define NODE_ACTION_MASK	0xff000000
+#define NODE_COUNT_MASK		0x00ffffff
+
+#define DELETE_DT_NODE	0x01000000
+#define UPDATE_DT_NODE	0x02000000
+#define ADD_DT_NODE	0x03000000
+
+static int mobility_rtas_call(int token, char *buf)
+{
+	int rc;
+
+	spin_lock(&rtas_data_buf_lock);
+
+	memcpy(rtas_data_buf, buf, RTAS_DATA_BUF_SIZE);
+	rc = rtas_call(token, 2, 1, NULL, rtas_data_buf, 1);
+	memcpy(buf, rtas_data_buf, RTAS_DATA_BUF_SIZE);
+
+	spin_unlock(&rtas_data_buf_lock);
+	return rc;
+}
+
+static int delete_dt_node(u32 phandle)
+{
+	struct device_node *dn;
+
+	dn = of_find_node_by_phandle(phandle);
+	if (!dn)
+		return -ENOENT;
+
+	dlpar_detach_node(dn);
+	return 0;
+}
+
+static int update_dt_property(struct device_node *dn, struct property **prop,
+			      const char *name, u32 vd, char *value)
+{
+	struct property *new_prop = *prop;
+	struct property *old_prop;
+	int more = 0;
+
+	/* A negative 'vd' value indicates that only part of the new property
+	 * value is contained in the buffer and we need to call
+	 * ibm,update-properties again to get the rest of the value.
+	 *
+	 * A negative value is also the two's compliment of the actual value.
+	 */
+	if (vd & 0x80000000) {
+		vd = ~vd + 1;
+		more = 1;
+	}
+
+	if (new_prop) {
+		/* partial property fixup */
+		char *new_data = kzalloc(new_prop->length + vd, GFP_KERNEL);
+		if (!new_data)
+			return -ENOMEM;
+
+		memcpy(new_data, new_prop->value, new_prop->length);
+		memcpy(new_data + new_prop->length, value, vd);
+
+		kfree(new_prop->value);
+		new_prop->value = new_data;
+		new_prop->length += vd;
+	} else {
+		new_prop = kzalloc(sizeof(*new_prop), GFP_KERNEL);
+		if (!new_prop)
+			return -ENOMEM;
+
+		new_prop->name = kstrdup(name, GFP_KERNEL);
+		if (!new_prop->name) {
+			kfree(new_prop);
+			return -ENOMEM;
+		}
+
+		new_prop->length = vd;
+		new_prop->value = kzalloc(new_prop->length, GFP_KERNEL);
+		if (!new_prop->value) {
+			kfree(new_prop->name);
+			kfree(new_prop);
+			return -ENOMEM;
+		}
+
+		memcpy(new_prop->value, value, vd);
+		*prop = new_prop;
+	}
+
+	if (!more) {
+		old_prop = of_find_property(dn, new_prop->name, NULL);
+		if (old_prop)
+			prom_update_property(dn, new_prop, old_prop);
+		else
+			prom_add_property(dn, new_prop);
+
+		new_prop = NULL;
+	}
+
+	return 0;
+}
+
+static int update_dt_node(u32 phandle)
+{
+	struct update_props_workarea *upwa;
+	struct device_node *dn;
+	struct property *prop = NULL;
+	int i, rc;
+	char *prop_data;
+	char *rtas_buf;
+	int update_properties_token;
+
+	update_properties_token = rtas_token("ibm,update-properties");
+	if (update_properties_token == RTAS_UNKNOWN_SERVICE)
+		return -EINVAL;
+
+	rtas_buf = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
+	if (!rtas_buf)
+		return -ENOMEM;
+
+	dn = of_find_node_by_phandle(phandle);
+	if (!dn) {
+		kfree(rtas_buf);
+		return -ENOENT;
+	}
+
+	upwa = (struct update_props_workarea *)&rtas_buf[0];
+	upwa->phandle = phandle;
+
+	do {
+		rc = mobility_rtas_call(update_properties_token, rtas_buf);
+		if (rc < 0)
+			break;
+
+		prop_data = rtas_buf + sizeof(*upwa);
+
+		for (i = 0; i < upwa->nprops; i++) {
+			char *prop_name;
+			u32 vd;
+
+			prop_name = prop_data + 1;
+			prop_data += strlen(prop_name) + 1;
+			vd = *prop_data++;
+
+			switch (vd) {
+			case 0x00000000:
+				/* name only property, nothing to do */
+				break;
+
+			case 0x80000000:
+				prop = of_find_property(dn, prop_name, NULL);
+				prom_remove_property(dn, prop);
+				prop = NULL;
+				break;
+
+			default:
+				rc = update_dt_property(dn, &prop, prop_name,
+							vd, prop_data);
+				if (rc) {
+					printk(KERN_ERR "Could not update %s"
+					       " property\n", prop_name);
+				}
+
+				prop_data += vd;
+			}
+		}
+	} while (rc == 1);
+
+	of_node_put(dn);
+	kfree(rtas_buf);
+	return 0;
+}
+
+static int add_dt_node(u32 parent_phandle, u32 drc_index)
+{
+	struct device_node *dn;
+	struct device_node *parent_dn;
+	int rc;
+
+	dn = dlpar_configure_connector(drc_index);
+	if (!dn)
+		return -ENOENT;
+
+	parent_dn = of_find_node_by_phandle(parent_phandle);
+	if (!parent_dn) {
+		dlpar_free_cc_nodes(dn);
+		return -ENOENT;
+	}
+
+	dn->parent = parent_dn;
+	rc = dlpar_attach_node(dn);
+	if (rc)
+		dlpar_free_cc_nodes(dn);
+
+	of_node_put(parent_dn);
+	return rc;
+}
+
+static int pseries_devicetree_update(void)
+{
+	char *rtas_buf;
+	u32 *data;
+	int update_nodes_token;
+	int rc;
+
+	update_nodes_token = rtas_token("ibm,update-nodes");
+	if (update_nodes_token == RTAS_UNKNOWN_SERVICE)
+		return -EINVAL;
+
+	rtas_buf = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
+	if (!rtas_buf)
+		return -ENOMEM;
+
+	do {
+		rc = mobility_rtas_call(update_nodes_token, rtas_buf);
+		if (rc && rc != 1)
+			break;
+
+		data = (u32 *)rtas_buf + 4;
+		while (*data & NODE_ACTION_MASK) {
+			int i;
+			u32 action = *data & NODE_ACTION_MASK;
+			int node_count = *data & NODE_COUNT_MASK;
+
+			data++;
+
+			for (i = 0; i < node_count; i++) {
+				u32 phandle = *data++;
+				u32 drc_index;
+
+				switch (action) {
+				case DELETE_DT_NODE:
+					delete_dt_node(phandle);
+					break;
+				case UPDATE_DT_NODE:
+					update_dt_node(phandle);
+					break;
+				case ADD_DT_NODE:
+					drc_index = *data++;
+					add_dt_node(phandle, drc_index);
+					break;
+				}
+			}
+		}
+	} while (rc == 1);
+
+	kfree(rtas_buf);
+	return rc;
+}
+
+void post_mobility_fixup(void)
+{
+	int rc;
+	int activate_fw_token;
+
+	rc = pseries_devicetree_update();
+	if (rc) {
+		printk(KERN_ERR "Initial post-mobility device tree update "
+		       "failed: %d\n", rc);
+		return;
+	}
+
+	activate_fw_token = rtas_token("ibm,activate-firmware");
+	if (activate_fw_token == RTAS_UNKNOWN_SERVICE) {
+		printk(KERN_ERR "Could not make post-mobility "
+		       "activate-fw call.\n");
+		return;
+	}
+
+	rc = rtas_call(activate_fw_token, 0, 1, NULL);
+	if (!rc) {
+		rc = pseries_devicetree_update();
+		if (rc)
+			printk(KERN_ERR "Secondary post-mobility device tree "
+			       "update failed: %d\n", rc);
+	} else {
+		printk(KERN_ERR "Post-mobility activate-fw failed: %d\n", rc);
+		return;
+	}
+
+	return;
+}
+
+static ssize_t migrate_store(struct class *class, struct class_attribute *attr,
+			     const char *buf, size_t count)
+{
+	struct rtas_args args;
+	u64 streamid;
+	int rc;
+
+	rc = strict_strtoull(buf, 0, &streamid);
+	if (rc)
+		return rc;
+
+	memset(&args, 0, sizeof(args));
+	args.token = rtas_token("ibm,suspend-me");
+	args.nargs = 2;
+	args.nret = 1;
+
+	args.args[0] = streamid >> 32 ;
+	args.args[1] = streamid & 0xffffffff;
+	args.rets = &args.args[args.nargs];
+
+	do {
+		args.rets[0] = 0;
+		rc = rtas_ibm_suspend_me(&args);
+		if (!rc && args.rets[0] == RTAS_NOT_SUSPENDABLE)
+			ssleep(1);
+	} while (!rc && args.rets[0] == RTAS_NOT_SUSPENDABLE);
+
+	if (rc)
+		return rc;
+	else if (args.rets[0])
+		return args.rets[0];
+
+	post_mobility_fixup();
+	return count;
+}
+
+static CLASS_ATTR(migration, S_IWUSR, NULL, migrate_store);
+
+static int __init mobility_sysfs_init(void)
+{
+	int rc;
+
+	mobility_kobj = kobject_create_and_add("mobility", kernel_kobj);
+	if (!mobility_kobj)
+		return -ENOMEM;
+
+	rc = sysfs_create_file(mobility_kobj, &class_attr_migration.attr);
+
+	return rc;
+}
+device_initcall(mobility_sysfs_init);
@@ -0,0 +1,491 @@
+/*
+ * Copyright 2006 Jake Moilanen <moilanen@austin.ibm.com>, IBM Corp.
+ * Copyright 2006-2007 Michael Ellerman, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2 of the
+ * License.
+ *
+ */
+
+#include <linux/device.h>
+#include <linux/irq.h>
+#include <linux/msi.h>
+
+#include <asm/rtas.h>
+#include <asm/hw_irq.h>
+#include <asm/ppc-pci.h>
+
+static int query_token, change_token;
+
+#define RTAS_QUERY_FN		0
+#define RTAS_CHANGE_FN		1
+#define RTAS_RESET_FN		2
+#define RTAS_CHANGE_MSI_FN	3
+#define RTAS_CHANGE_MSIX_FN	4
+
+static struct pci_dn *get_pdn(struct pci_dev *pdev)
+{
+	struct device_node *dn;
+	struct pci_dn *pdn;
+
+	dn = pci_device_to_OF_node(pdev);
+	if (!dn) {
+		dev_dbg(&pdev->dev, "rtas_msi: No OF device node\n");
+		return NULL;
+	}
+
+	pdn = PCI_DN(dn);
+	if (!pdn) {
+		dev_dbg(&pdev->dev, "rtas_msi: No PCI DN\n");
+		return NULL;
+	}
+
+	return pdn;
+}
+
+/* RTAS Helpers */
+
+static int rtas_change_msi(struct pci_dn *pdn, u32 func, u32 num_irqs)
+{
+	u32 addr, seq_num, rtas_ret[3];
+	unsigned long buid;
+	int rc;
+
+	addr = rtas_config_addr(pdn->busno, pdn->devfn, 0);
+	buid = pdn->phb->buid;
+
+	seq_num = 1;
+	do {
+		if (func == RTAS_CHANGE_MSI_FN || func == RTAS_CHANGE_MSIX_FN)
+			rc = rtas_call(change_token, 6, 4, rtas_ret, addr,
+					BUID_HI(buid), BUID_LO(buid),
+					func, num_irqs, seq_num);
+		else
+			rc = rtas_call(change_token, 6, 3, rtas_ret, addr,
+					BUID_HI(buid), BUID_LO(buid),
+					func, num_irqs, seq_num);
+
+		seq_num = rtas_ret[1];
+	} while (rtas_busy_delay(rc));
+
+	/*
+	 * If the RTAS call succeeded, return the number of irqs allocated.
+	 * If not, make sure we return a negative error code.
+	 */
+	if (rc == 0)
+		rc = rtas_ret[0];
+	else if (rc > 0)
+		rc = -rc;
+
+	pr_debug("rtas_msi: ibm,change_msi(func=%d,num=%d), got %d rc = %d\n",
+		 func, num_irqs, rtas_ret[0], rc);
+
+	return rc;
+}
+
+static void rtas_disable_msi(struct pci_dev *pdev)
+{
+	struct pci_dn *pdn;
+
+	pdn = get_pdn(pdev);
+	if (!pdn)
+		return;
+
+	/*
+	 * disabling MSI with the explicit interface also disables MSI-X
+	 */
+	if (rtas_change_msi(pdn, RTAS_CHANGE_MSI_FN, 0) != 0) {
+		/* 
+		 * may have failed because explicit interface is not
+		 * present
+		 */
+		if (rtas_change_msi(pdn, RTAS_CHANGE_FN, 0) != 0) {
+			pr_debug("rtas_msi: Setting MSIs to 0 failed!\n");
+		}
+	}
+}
+
+static int rtas_query_irq_number(struct pci_dn *pdn, int offset)
+{
+	u32 addr, rtas_ret[2];
+	unsigned long buid;
+	int rc;
+
+	addr = rtas_config_addr(pdn->busno, pdn->devfn, 0);
+	buid = pdn->phb->buid;
+
+	do {
+		rc = rtas_call(query_token, 4, 3, rtas_ret, addr,
+			       BUID_HI(buid), BUID_LO(buid), offset);
+	} while (rtas_busy_delay(rc));
+
+	if (rc) {
+		pr_debug("rtas_msi: error (%d) querying source number\n", rc);
+		return rc;
+	}
+
+	return rtas_ret[0];
+}
+
+static void rtas_teardown_msi_irqs(struct pci_dev *pdev)
+{
+	struct msi_desc *entry;
+
+	list_for_each_entry(entry, &pdev->msi_list, list) {
+		if (entry->irq == NO_IRQ)
+			continue;
+
+		irq_set_msi_desc(entry->irq, NULL);
+		irq_dispose_mapping(entry->irq);
+	}
+
+	rtas_disable_msi(pdev);
+}
+
+static int check_req(struct pci_dev *pdev, int nvec, char *prop_name)
+{
+	struct device_node *dn;
+	struct pci_dn *pdn;
+	const u32 *req_msi;
+
+	pdn = get_pdn(pdev);
+	if (!pdn)
+		return -ENODEV;
+
+	dn = pdn->node;
+
+	req_msi = of_get_property(dn, prop_name, NULL);
+	if (!req_msi) {
+		pr_debug("rtas_msi: No %s on %s\n", prop_name, dn->full_name);
+		return -ENOENT;
+	}
+
+	if (*req_msi < nvec) {
+		pr_debug("rtas_msi: %s requests < %d MSIs\n", prop_name, nvec);
+
+		if (*req_msi == 0) /* Be paranoid */
+			return -ENOSPC;
+
+		return *req_msi;
+	}
+
+	return 0;
+}
+
+static int check_req_msi(struct pci_dev *pdev, int nvec)
+{
+	return check_req(pdev, nvec, "ibm,req#msi");
+}
+
+static int check_req_msix(struct pci_dev *pdev, int nvec)
+{
+	return check_req(pdev, nvec, "ibm,req#msi-x");
+}
+
+/* Quota calculation */
+
+static struct device_node *find_pe_total_msi(struct pci_dev *dev, int *total)
+{
+	struct device_node *dn;
+	const u32 *p;
+
+	dn = of_node_get(pci_device_to_OF_node(dev));
+	while (dn) {
+		p = of_get_property(dn, "ibm,pe-total-#msi", NULL);
+		if (p) {
+			pr_debug("rtas_msi: found prop on dn %s\n",
+				dn->full_name);
+			*total = *p;
+			return dn;
+		}
+
+		dn = of_get_next_parent(dn);
+	}
+
+	return NULL;
+}
+
+static struct device_node *find_pe_dn(struct pci_dev *dev, int *total)
+{
+	struct device_node *dn;
+
+	/* Found our PE and assume 8 at that point. */
+
+	dn = pci_device_to_OF_node(dev);
+	if (!dn)
+		return NULL;
+
+	dn = eeh_find_device_pe(dn);
+	if (!dn)
+		return NULL;
+
+	/* We actually want the parent */
+	dn = of_get_parent(dn);
+	if (!dn)
+		return NULL;
+
+	/* Hardcode of 8 for old firmwares */
+	*total = 8;
+	pr_debug("rtas_msi: using PE dn %s\n", dn->full_name);
+
+	return dn;
+}
+
+struct msi_counts {
+	struct device_node *requestor;
+	int num_devices;
+	int request;
+	int quota;
+	int spare;
+	int over_quota;
+};
+
+static void *count_non_bridge_devices(struct device_node *dn, void *data)
+{
+	struct msi_counts *counts = data;
+	const u32 *p;
+	u32 class;
+
+	pr_debug("rtas_msi: counting %s\n", dn->full_name);
+
+	p = of_get_property(dn, "class-code", NULL);
+	class = p ? *p : 0;
+
+	if ((class >> 8) != PCI_CLASS_BRIDGE_PCI)
+		counts->num_devices++;
+
+	return NULL;
+}
+
+static void *count_spare_msis(struct device_node *dn, void *data)
+{
+	struct msi_counts *counts = data;
+	const u32 *p;
+	int req;
+
+	if (dn == counts->requestor)
+		req = counts->request;
+	else {
+		/* We don't know if a driver will try to use MSI or MSI-X,
+		 * so we just have to punt and use the larger of the two. */
+		req = 0;
+		p = of_get_property(dn, "ibm,req#msi", NULL);
+		if (p)
+			req = *p;
+
+		p = of_get_property(dn, "ibm,req#msi-x", NULL);
+		if (p)
+			req = max(req, (int)*p);
+	}
+
+	if (req < counts->quota)
+		counts->spare += counts->quota - req;
+	else if (req > counts->quota)
+		counts->over_quota++;
+
+	return NULL;
+}
+
+static int msi_quota_for_device(struct pci_dev *dev, int request)
+{
+	struct device_node *pe_dn;
+	struct msi_counts counts;
+	int total;
+
+	pr_debug("rtas_msi: calc quota for %s, request %d\n", pci_name(dev),
+		  request);
+
+	pe_dn = find_pe_total_msi(dev, &total);
+	if (!pe_dn)
+		pe_dn = find_pe_dn(dev, &total);
+
+	if (!pe_dn) {
+		pr_err("rtas_msi: couldn't find PE for %s\n", pci_name(dev));
+		goto out;
+	}
+
+	pr_debug("rtas_msi: found PE %s\n", pe_dn->full_name);
+
+	memset(&counts, 0, sizeof(struct msi_counts));
+
+	/* Work out how many devices we have below this PE */
+	traverse_pci_devices(pe_dn, count_non_bridge_devices, &counts);
+
+	if (counts.num_devices == 0) {
+		pr_err("rtas_msi: found 0 devices under PE for %s\n",
+			pci_name(dev));
+		goto out;
+	}
+
+	counts.quota = total / counts.num_devices;
+	if (request <= counts.quota)
+		goto out;
+
+	/* else, we have some more calculating to do */
+	counts.requestor = pci_device_to_OF_node(dev);
+	counts.request = request;
+	traverse_pci_devices(pe_dn, count_spare_msis, &counts);
+
+	/* If the quota isn't an integer multiple of the total, we can
+	 * use the remainder as spare MSIs for anyone that wants them. */
+	counts.spare += total % counts.num_devices;
+
+	/* Divide any spare by the number of over-quota requestors */
+	if (counts.over_quota)
+		counts.quota += counts.spare / counts.over_quota;
+
+	/* And finally clamp the request to the possibly adjusted quota */
+	request = min(counts.quota, request);
+
+	pr_debug("rtas_msi: request clamped to quota %d\n", request);
+out:
+	of_node_put(pe_dn);
+
+	return request;
+}
+
+static int rtas_msi_check_device(struct pci_dev *pdev, int nvec, int type)
+{
+	int quota, rc;
+
+	if (type == PCI_CAP_ID_MSIX)
+		rc = check_req_msix(pdev, nvec);
+	else
+		rc = check_req_msi(pdev, nvec);
+
+	if (rc)
+		return rc;
+
+	quota = msi_quota_for_device(pdev, nvec);
+
+	if (quota && quota < nvec)
+		return quota;
+
+	return 0;
+}
+
+static int check_msix_entries(struct pci_dev *pdev)
+{
+	struct msi_desc *entry;
+	int expected;
+
+	/* There's no way for us to express to firmware that we want
+	 * a discontiguous, or non-zero based, range of MSI-X entries.
+	 * So we must reject such requests. */
+
+	expected = 0;
+	list_for_each_entry(entry, &pdev->msi_list, list) {
+		if (entry->msi_attrib.entry_nr != expected) {
+			pr_debug("rtas_msi: bad MSI-X entries.\n");
+			return -EINVAL;
+		}
+		expected++;
+	}
+
+	return 0;
+}
+
+static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
+{
+	struct pci_dn *pdn;
+	int hwirq, virq, i, rc;
+	struct msi_desc *entry;
+	struct msi_msg msg;
+
+	pdn = get_pdn(pdev);
+	if (!pdn)
+		return -ENODEV;
+
+	if (type == PCI_CAP_ID_MSIX && check_msix_entries(pdev))
+		return -EINVAL;
+
+	/*
+	 * Try the new more explicit firmware interface, if that fails fall
+	 * back to the old interface. The old interface is known to never
+	 * return MSI-Xs.
+	 */
+	if (type == PCI_CAP_ID_MSI) {
+		rc = rtas_change_msi(pdn, RTAS_CHANGE_MSI_FN, nvec);
+
+		if (rc < 0) {
+			pr_debug("rtas_msi: trying the old firmware call.\n");
+			rc = rtas_change_msi(pdn, RTAS_CHANGE_FN, nvec);
+		}
+	} else
+		rc = rtas_change_msi(pdn, RTAS_CHANGE_MSIX_FN, nvec);
+
+	if (rc != nvec) {
+		pr_debug("rtas_msi: rtas_change_msi() failed\n");
+		return rc;
+	}
+
+	i = 0;
+	list_for_each_entry(entry, &pdev->msi_list, list) {
+		hwirq = rtas_query_irq_number(pdn, i++);
+		if (hwirq < 0) {
+			pr_debug("rtas_msi: error (%d) getting hwirq\n", rc);
+			return hwirq;
+		}
+
+		virq = irq_create_mapping(NULL, hwirq);
+
+		if (virq == NO_IRQ) {
+			pr_debug("rtas_msi: Failed mapping hwirq %d\n", hwirq);
+			return -ENOSPC;
+		}
+
+		dev_dbg(&pdev->dev, "rtas_msi: allocated virq %d\n", virq);
+		irq_set_msi_desc(virq, entry);
+
+		/* Read config space back so we can restore after reset */
+		read_msi_msg(virq, &msg);
+		entry->msg = msg;
+	}
+
+	return 0;
+}
+
+static void rtas_msi_pci_irq_fixup(struct pci_dev *pdev)
+{
+	/* No LSI -> leave MSIs (if any) configured */
+	if (pdev->irq == NO_IRQ) {
+		dev_dbg(&pdev->dev, "rtas_msi: no LSI, nothing to do.\n");
+		return;
+	}
+
+	/* No MSI -> MSIs can't have been assigned by fw, leave LSI */
+	if (check_req_msi(pdev, 1) && check_req_msix(pdev, 1)) {
+		dev_dbg(&pdev->dev, "rtas_msi: no req#msi/x, nothing to do.\n");
+		return;
+	}
+
+	dev_dbg(&pdev->dev, "rtas_msi: disabling existing MSI.\n");
+	rtas_disable_msi(pdev);
+}
+
+static int rtas_msi_init(void)
+{
+	query_token  = rtas_token("ibm,query-interrupt-source-number");
+	change_token = rtas_token("ibm,change-msi");
+
+	if ((query_token == RTAS_UNKNOWN_SERVICE) ||
+			(change_token == RTAS_UNKNOWN_SERVICE)) {
+		pr_debug("rtas_msi: no RTAS tokens, no MSI support.\n");
+		return -1;
+	}
+
+	pr_debug("rtas_msi: Registering RTAS MSI callbacks.\n");
+
+	WARN_ON(ppc_md.setup_msi_irqs);
+	ppc_md.setup_msi_irqs = rtas_setup_msi_irqs;
+	ppc_md.teardown_msi_irqs = rtas_teardown_msi_irqs;
+	ppc_md.msi_check_device = rtas_msi_check_device;
+
+	WARN_ON(ppc_md.pci_irq_fixup);
+	ppc_md.pci_irq_fixup = rtas_msi_pci_irq_fixup;
+
+	return 0;
+}
+arch_initcall(rtas_msi_init);
@@ -0,0 +1,679 @@
+/*
+ *  c 2001 PPC 64 Team, IBM Corp
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ *
+ * /dev/nvram driver for PPC64
+ *
+ * This perhaps should live in drivers/char
+ */
+
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/kmsg_dump.h>
+#include <linux/ctype.h>
+#include <linux/zlib.h>
+#include <asm/uaccess.h>
+#include <asm/nvram.h>
+#include <asm/rtas.h>
+#include <asm/prom.h>
+#include <asm/machdep.h>
+
+/* Max bytes to read/write in one go */
+#define NVRW_CNT 0x20
+
+static unsigned int nvram_size;
+static int nvram_fetch, nvram_store;
+static char nvram_buf[NVRW_CNT];	/* assume this is in the first 4GB */
+static DEFINE_SPINLOCK(nvram_lock);
+
+struct err_log_info {
+	int error_type;
+	unsigned int seq_num;
+};
+
+struct nvram_os_partition {
+	const char *name;
+	int req_size;	/* desired size, in bytes */
+	int min_size;	/* minimum acceptable size (0 means req_size) */
+	long size;	/* size of data portion (excluding err_log_info) */
+	long index;	/* offset of data portion of partition */
+};
+
+static struct nvram_os_partition rtas_log_partition = {
+	.name = "ibm,rtas-log",
+	.req_size = 2079,
+	.min_size = 1055,
+	.index = -1
+};
+
+static struct nvram_os_partition oops_log_partition = {
+	.name = "lnx,oops-log",
+	.req_size = 4000,
+	.min_size = 2000,
+	.index = -1
+};
+
+static const char *pseries_nvram_os_partitions[] = {
+	"ibm,rtas-log",
+	"lnx,oops-log",
+	NULL
+};
+
+static void oops_to_nvram(struct kmsg_dumper *dumper,
+		enum kmsg_dump_reason reason,
+		const char *old_msgs, unsigned long old_len,
+		const char *new_msgs, unsigned long new_len);
+
+static struct kmsg_dumper nvram_kmsg_dumper = {
+	.dump = oops_to_nvram
+};
+
+/* See clobbering_unread_rtas_event() */
+#define NVRAM_RTAS_READ_TIMEOUT 5		/* seconds */
+static unsigned long last_unread_rtas_event;	/* timestamp */
+
+/*
+ * For capturing and compressing an oops or panic report...
+
+ * big_oops_buf[] holds the uncompressed text we're capturing.
+ *
+ * oops_buf[] holds the compressed text, preceded by a prefix.
+ * The prefix is just a u16 holding the length of the compressed* text.
+ * (*Or uncompressed, if compression fails.)  oops_buf[] gets written
+ * to NVRAM.
+ *
+ * oops_len points to the prefix.  oops_data points to the compressed text.
+ *
+ * +- oops_buf
+ * |		+- oops_data
+ * v		v
+ * +------------+-----------------------------------------------+
+ * | length	| text                                          |
+ * | (2 bytes)	| (oops_data_sz bytes)                          |
+ * +------------+-----------------------------------------------+
+ * ^
+ * +- oops_len
+ *
+ * We preallocate these buffers during init to avoid kmalloc during oops/panic.
+ */
+static size_t big_oops_buf_sz;
+static char *big_oops_buf, *oops_buf;
+static u16 *oops_len;
+static char *oops_data;
+static size_t oops_data_sz;
+
+/* Compression parameters */
+#define COMPR_LEVEL 6
+#define WINDOW_BITS 12
+#define MEM_LEVEL 4
+static struct z_stream_s stream;
+
+static ssize_t pSeries_nvram_read(char *buf, size_t count, loff_t *index)
+{
+	unsigned int i;
+	unsigned long len;
+	int done;
+	unsigned long flags;
+	char *p = buf;
+
+
+	if (nvram_size == 0 || nvram_fetch == RTAS_UNKNOWN_SERVICE)
+		return -ENODEV;
+
+	if (*index >= nvram_size)
+		return 0;
+
+	i = *index;
+	if (i + count > nvram_size)
+		count = nvram_size - i;
+
+	spin_lock_irqsave(&nvram_lock, flags);
+
+	for (; count != 0; count -= len) {
+		len = count;
+		if (len > NVRW_CNT)
+			len = NVRW_CNT;
+		
+		if ((rtas_call(nvram_fetch, 3, 2, &done, i, __pa(nvram_buf),
+			       len) != 0) || len != done) {
+			spin_unlock_irqrestore(&nvram_lock, flags);
+			return -EIO;
+		}
+		
+		memcpy(p, nvram_buf, len);
+
+		p += len;
+		i += len;
+	}
+
+	spin_unlock_irqrestore(&nvram_lock, flags);
+	
+	*index = i;
+	return p - buf;
+}
+
+static ssize_t pSeries_nvram_write(char *buf, size_t count, loff_t *index)
+{
+	unsigned int i;
+	unsigned long len;
+	int done;
+	unsigned long flags;
+	const char *p = buf;
+
+	if (nvram_size == 0 || nvram_store == RTAS_UNKNOWN_SERVICE)
+		return -ENODEV;
+
+	if (*index >= nvram_size)
+		return 0;
+
+	i = *index;
+	if (i + count > nvram_size)
+		count = nvram_size - i;
+
+	spin_lock_irqsave(&nvram_lock, flags);
+
+	for (; count != 0; count -= len) {
+		len = count;
+		if (len > NVRW_CNT)
+			len = NVRW_CNT;
+
+		memcpy(nvram_buf, p, len);
+
+		if ((rtas_call(nvram_store, 3, 2, &done, i, __pa(nvram_buf),
+			       len) != 0) || len != done) {
+			spin_unlock_irqrestore(&nvram_lock, flags);
+			return -EIO;
+		}
+		
+		p += len;
+		i += len;
+	}
+	spin_unlock_irqrestore(&nvram_lock, flags);
+	
+	*index = i;
+	return p - buf;
+}
+
+static ssize_t pSeries_nvram_get_size(void)
+{
+	return nvram_size ? nvram_size : -ENODEV;
+}
+
+
+/* nvram_write_os_partition, nvram_write_error_log
+ *
+ * We need to buffer the error logs into nvram to ensure that we have
+ * the failure information to decode.  If we have a severe error there
+ * is no way to guarantee that the OS or the machine is in a state to
+ * get back to user land and write the error to disk.  For example if
+ * the SCSI device driver causes a Machine Check by writing to a bad
+ * IO address, there is no way of guaranteeing that the device driver
+ * is in any state that is would also be able to write the error data
+ * captured to disk, thus we buffer it in NVRAM for analysis on the
+ * next boot.
+ *
+ * In NVRAM the partition containing the error log buffer will looks like:
+ * Header (in bytes):
+ * +-----------+----------+--------+------------+------------------+
+ * | signature | checksum | length | name       | data             |
+ * |0          |1         |2      3|4         15|16        length-1|
+ * +-----------+----------+--------+------------+------------------+
+ *
+ * The 'data' section would look like (in bytes):
+ * +--------------+------------+-----------------------------------+
+ * | event_logged | sequence # | error log                         |
+ * |0            3|4          7|8                  error_log_size-1|
+ * +--------------+------------+-----------------------------------+
+ *
+ * event_logged: 0 if event has not been logged to syslog, 1 if it has
+ * sequence #: The unique sequence # for each event. (until it wraps)
+ * error log: The error log from event_scan
+ */
+int nvram_write_os_partition(struct nvram_os_partition *part, char * buff,
+		int length, unsigned int err_type, unsigned int error_log_cnt)
+{
+	int rc;
+	loff_t tmp_index;
+	struct err_log_info info;
+	
+	if (part->index == -1) {
+		return -ESPIPE;
+	}
+
+	if (length > part->size) {
+		length = part->size;
+	}
+
+	info.error_type = err_type;
+	info.seq_num = error_log_cnt;
+
+	tmp_index = part->index;
+
+	rc = ppc_md.nvram_write((char *)&info, sizeof(struct err_log_info), &tmp_index);
+	if (rc <= 0) {
+		pr_err("%s: Failed nvram_write (%d)\n", __FUNCTION__, rc);
+		return rc;
+	}
+
+	rc = ppc_md.nvram_write(buff, length, &tmp_index);
+	if (rc <= 0) {
+		pr_err("%s: Failed nvram_write (%d)\n", __FUNCTION__, rc);
+		return rc;
+	}
+	
+	return 0;
+}
+
+int nvram_write_error_log(char * buff, int length,
+                          unsigned int err_type, unsigned int error_log_cnt)
+{
+	int rc = nvram_write_os_partition(&rtas_log_partition, buff, length,
+						err_type, error_log_cnt);
+	if (!rc)
+		last_unread_rtas_event = get_seconds();
+	return rc;
+}
+
+/* nvram_read_error_log
+ *
+ * Reads nvram for error log for at most 'length'
+ */
+int nvram_read_error_log(char * buff, int length,
+                         unsigned int * err_type, unsigned int * error_log_cnt)
+{
+	int rc;
+	loff_t tmp_index;
+	struct err_log_info info;
+	
+	if (rtas_log_partition.index == -1)
+		return -1;
+
+	if (length > rtas_log_partition.size)
+		length = rtas_log_partition.size;
+
+	tmp_index = rtas_log_partition.index;
+
+	rc = ppc_md.nvram_read((char *)&info, sizeof(struct err_log_info), &tmp_index);
+	if (rc <= 0) {
+		printk(KERN_ERR "nvram_read_error_log: Failed nvram_read (%d)\n", rc);
+		return rc;
+	}
+
+	rc = ppc_md.nvram_read(buff, length, &tmp_index);
+	if (rc <= 0) {
+		printk(KERN_ERR "nvram_read_error_log: Failed nvram_read (%d)\n", rc);
+		return rc;
+	}
+
+	*error_log_cnt = info.seq_num;
+	*err_type = info.error_type;
+
+	return 0;
+}
+
+/* This doesn't actually zero anything, but it sets the event_logged
+ * word to tell that this event is safely in syslog.
+ */
+int nvram_clear_error_log(void)
+{
+	loff_t tmp_index;
+	int clear_word = ERR_FLAG_ALREADY_LOGGED;
+	int rc;
+
+	if (rtas_log_partition.index == -1)
+		return -1;
+
+	tmp_index = rtas_log_partition.index;
+	
+	rc = ppc_md.nvram_write((char *)&clear_word, sizeof(int), &tmp_index);
+	if (rc <= 0) {
+		printk(KERN_ERR "nvram_clear_error_log: Failed nvram_write (%d)\n", rc);
+		return rc;
+	}
+	last_unread_rtas_event = 0;
+
+	return 0;
+}
+
+/* pseries_nvram_init_os_partition
+ *
+ * This sets up a partition with an "OS" signature.
+ *
+ * The general strategy is the following:
+ * 1.) If a partition with the indicated name already exists...
+ *	- If it's large enough, use it.
+ *	- Otherwise, recycle it and keep going.
+ * 2.) Search for a free partition that is large enough.
+ * 3.) If there's not a free partition large enough, recycle any obsolete
+ * OS partitions and try again.
+ * 4.) Will first try getting a chunk that will satisfy the requested size.
+ * 5.) If a chunk of the requested size cannot be allocated, then try finding
+ * a chunk that will satisfy the minum needed.
+ *
+ * Returns 0 on success, else -1.
+ */
+static int __init pseries_nvram_init_os_partition(struct nvram_os_partition
+									*part)
+{
+	loff_t p;
+	int size;
+
+	/* Scan nvram for partitions */
+	nvram_scan_partitions();
+
+	/* Look for ours */
+	p = nvram_find_partition(part->name, NVRAM_SIG_OS, &size);
+
+	/* Found one but too small, remove it */
+	if (p && size < part->min_size) {
+		pr_info("nvram: Found too small %s partition,"
+					" removing it...\n", part->name);
+		nvram_remove_partition(part->name, NVRAM_SIG_OS, NULL);
+		p = 0;
+	}
+
+	/* Create one if we didn't find */
+	if (!p) {
+		p = nvram_create_partition(part->name, NVRAM_SIG_OS,
+					part->req_size, part->min_size);
+		if (p == -ENOSPC) {
+			pr_info("nvram: No room to create %s partition, "
+				"deleting any obsolete OS partitions...\n",
+				part->name);
+			nvram_remove_partition(NULL, NVRAM_SIG_OS,
+						pseries_nvram_os_partitions);
+			p = nvram_create_partition(part->name, NVRAM_SIG_OS,
+					part->req_size, part->min_size);
+		}
+	}
+
+	if (p <= 0) {
+		pr_err("nvram: Failed to find or create %s"
+		       " partition, err %d\n", part->name, (int)p);
+		return -1;
+	}
+
+	part->index = p;
+	part->size = nvram_get_partition_size(p) - sizeof(struct err_log_info);
+	
+	return 0;
+}
+
+static void __init nvram_init_oops_partition(int rtas_partition_exists)
+{
+	int rc;
+
+	rc = pseries_nvram_init_os_partition(&oops_log_partition);
+	if (rc != 0) {
+		if (!rtas_partition_exists)
+			return;
+		pr_notice("nvram: Using %s partition to log both"
+			" RTAS errors and oops/panic reports\n",
+			rtas_log_partition.name);
+		memcpy(&oops_log_partition, &rtas_log_partition,
+						sizeof(rtas_log_partition));
+	}
+	oops_buf = kmalloc(oops_log_partition.size, GFP_KERNEL);
+	if (!oops_buf) {
+		pr_err("nvram: No memory for %s partition\n",
+						oops_log_partition.name);
+		return;
+	}
+	oops_len = (u16*) oops_buf;
+	oops_data = oops_buf + sizeof(u16);
+	oops_data_sz = oops_log_partition.size - sizeof(u16);
+
+	/*
+	 * Figure compression (preceded by elimination of each line's <n>
+	 * severity prefix) will reduce the oops/panic report to at most
+	 * 45% of its original size.
+	 */
+	big_oops_buf_sz = (oops_data_sz * 100) / 45;
+	big_oops_buf = kmalloc(big_oops_buf_sz, GFP_KERNEL);
+	if (big_oops_buf) {
+		stream.workspace = kmalloc(zlib_deflate_workspacesize(
+				WINDOW_BITS, MEM_LEVEL), GFP_KERNEL);
+		if (!stream.workspace) {
+			pr_err("nvram: No memory for compression workspace; "
+				"skipping compression of %s partition data\n",
+				oops_log_partition.name);
+			kfree(big_oops_buf);
+			big_oops_buf = NULL;
+		}
+	} else {
+		pr_err("No memory for uncompressed %s data; "
+			"skipping compression\n", oops_log_partition.name);
+		stream.workspace = NULL;
+	}
+
+	rc = kmsg_dump_register(&nvram_kmsg_dumper);
+	if (rc != 0) {
+		pr_err("nvram: kmsg_dump_register() failed; returned %d\n", rc);
+		kfree(oops_buf);
+		kfree(big_oops_buf);
+		kfree(stream.workspace);
+	}
+}
+
+static int __init pseries_nvram_init_log_partitions(void)
+{
+	int rc;
+
+	rc = pseries_nvram_init_os_partition(&rtas_log_partition);
+	nvram_init_oops_partition(rc == 0);
+	return 0;
+}
+machine_arch_initcall(pseries, pseries_nvram_init_log_partitions);
+
+int __init pSeries_nvram_init(void)
+{
+	struct device_node *nvram;
+	const unsigned int *nbytes_p;
+	unsigned int proplen;
+
+	nvram = of_find_node_by_type(NULL, "nvram");
+	if (nvram == NULL)
+		return -ENODEV;
+
+	nbytes_p = of_get_property(nvram, "#bytes", &proplen);
+	if (nbytes_p == NULL || proplen != sizeof(unsigned int)) {
+		of_node_put(nvram);
+		return -EIO;
+	}
+
+	nvram_size = *nbytes_p;
+
+	nvram_fetch = rtas_token("nvram-fetch");
+	nvram_store = rtas_token("nvram-store");
+	printk(KERN_INFO "PPC64 nvram contains %d bytes\n", nvram_size);
+	of_node_put(nvram);
+
+	ppc_md.nvram_read	= pSeries_nvram_read;
+	ppc_md.nvram_write	= pSeries_nvram_write;
+	ppc_md.nvram_size	= pSeries_nvram_get_size;
+
+	return 0;
+}
+
+/*
+ * Try to capture the last capture_len bytes of the printk buffer.  Return
+ * the amount actually captured.
+ */
+static size_t capture_last_msgs(const char *old_msgs, size_t old_len,
+				const char *new_msgs, size_t new_len,
+				char *captured, size_t capture_len)
+{
+	if (new_len >= capture_len) {
+		memcpy(captured, new_msgs + (new_len - capture_len),
+								capture_len);
+		return capture_len;
+	} else {
+		/* Grab the end of old_msgs. */
+		size_t old_tail_len = min(old_len, capture_len - new_len);
+		memcpy(captured, old_msgs + (old_len - old_tail_len),
+								old_tail_len);
+		memcpy(captured + old_tail_len, new_msgs, new_len);
+		return old_tail_len + new_len;
+	}
+}
+
+/*
+ * Are we using the ibm,rtas-log for oops/panic reports?  And if so,
+ * would logging this oops/panic overwrite an RTAS event that rtas_errd
+ * hasn't had a chance to read and process?  Return 1 if so, else 0.
+ *
+ * We assume that if rtas_errd hasn't read the RTAS event in
+ * NVRAM_RTAS_READ_TIMEOUT seconds, it's probably not going to.
+ */
+static int clobbering_unread_rtas_event(void)
+{
+	return (oops_log_partition.index == rtas_log_partition.index
+		&& last_unread_rtas_event
+		&& get_seconds() - last_unread_rtas_event <=
+						NVRAM_RTAS_READ_TIMEOUT);
+}
+
+/* Squeeze out each line's <n> severity prefix. */
+static size_t elide_severities(char *buf, size_t len)
+{
+	char *in, *out, *buf_end = buf + len;
+	/* Assume a <n> at the very beginning marks the start of a line. */
+	int newline = 1;
+
+	in = out = buf;
+	while (in < buf_end) {
+		if (newline && in+3 <= buf_end &&
+				*in == '<' && isdigit(in[1]) && in[2] == '>') {
+			in += 3;
+			newline = 0;
+		} else {
+			newline = (*in == '\n');
+			*out++ = *in++;
+		}
+	}
+	return out - buf;
+}
+
+/* Derived from logfs_compress() */
+static int nvram_compress(const void *in, void *out, size_t inlen,
+							size_t outlen)
+{
+	int err, ret;
+
+	ret = -EIO;
+	err = zlib_deflateInit2(&stream, COMPR_LEVEL, Z_DEFLATED, WINDOW_BITS,
+						MEM_LEVEL, Z_DEFAULT_STRATEGY);
+	if (err != Z_OK)
+		goto error;
+
+	stream.next_in = in;
+	stream.avail_in = inlen;
+	stream.total_in = 0;
+	stream.next_out = out;
+	stream.avail_out = outlen;
+	stream.total_out = 0;
+
+	err = zlib_deflate(&stream, Z_FINISH);
+	if (err != Z_STREAM_END)
+		goto error;
+
+	err = zlib_deflateEnd(&stream);
+	if (err != Z_OK)
+		goto error;
+
+	if (stream.total_out >= stream.total_in)
+		goto error;
+
+	ret = stream.total_out;
+error:
+	return ret;
+}
+
+/* Compress the text from big_oops_buf into oops_buf. */
+static int zip_oops(size_t text_len)
+{
+	int zipped_len = nvram_compress(big_oops_buf, oops_data, text_len,
+								oops_data_sz);
+	if (zipped_len < 0) {
+		pr_err("nvram: compression failed; returned %d\n", zipped_len);
+		pr_err("nvram: logging uncompressed oops/panic report\n");
+		return -1;
+	}
+	*oops_len = (u16) zipped_len;
+	return 0;
+}
+
+/*
+ * This is our kmsg_dump callback, called after an oops or panic report
+ * has been written to the printk buffer.  We want to capture as much
+ * of the printk buffer as possible.  First, capture as much as we can
+ * that we think will compress sufficiently to fit in the lnx,oops-log
+ * partition.  If that's too much, go back and capture uncompressed text.
+ */
+static void oops_to_nvram(struct kmsg_dumper *dumper,
+		enum kmsg_dump_reason reason,
+		const char *old_msgs, unsigned long old_len,
+		const char *new_msgs, unsigned long new_len)
+{
+	static unsigned int oops_count = 0;
+	static bool panicking = false;
+	static DEFINE_SPINLOCK(lock);
+	unsigned long flags;
+	size_t text_len;
+	unsigned int err_type = ERR_TYPE_KERNEL_PANIC_GZ;
+	int rc = -1;
+
+	switch (reason) {
+	case KMSG_DUMP_RESTART:
+	case KMSG_DUMP_HALT:
+	case KMSG_DUMP_POWEROFF:
+		/* These are almost always orderly shutdowns. */
+		return;
+	case KMSG_DUMP_OOPS:
+		break;
+	case KMSG_DUMP_PANIC:
+		panicking = true;
+		break;
+	case KMSG_DUMP_EMERG:
+		if (panicking)
+			/* Panic report already captured. */
+			return;
+		break;
+	default:
+		pr_err("%s: ignoring unrecognized KMSG_DUMP_* reason %d\n",
+						__FUNCTION__, (int) reason);
+		return;
+	}
+
+	if (clobbering_unread_rtas_event())
+		return;
+
+	if (!spin_trylock_irqsave(&lock, flags))
+		return;
+
+	if (big_oops_buf) {
+		text_len = capture_last_msgs(old_msgs, old_len,
+			new_msgs, new_len, big_oops_buf, big_oops_buf_sz);
+		text_len = elide_severities(big_oops_buf, text_len);
+		rc = zip_oops(text_len);
+	}
+	if (rc != 0) {
+		text_len = capture_last_msgs(old_msgs, old_len,
+				new_msgs, new_len, oops_data, oops_data_sz);
+		err_type = ERR_TYPE_KERNEL_PANIC;
+		*oops_len = (u16) text_len;
+	}
+
+	(void) nvram_write_os_partition(&oops_log_partition, oops_buf,
+		(int) (sizeof(*oops_len) + *oops_len), err_type, ++oops_count);
+
+	spin_unlock_irqrestore(&lock, flags);
+}
@@ -0,0 +1,37 @@
+#ifndef _OFFLINE_STATES_H_
+#define _OFFLINE_STATES_H_
+
+/* Cpu offline states go here */
+enum cpu_state_vals {
+	CPU_STATE_OFFLINE,
+	CPU_STATE_INACTIVE,
+	CPU_STATE_ONLINE,
+	CPU_MAX_OFFLINE_STATES
+};
+
+#ifdef CONFIG_HOTPLUG_CPU
+extern enum cpu_state_vals get_cpu_current_state(int cpu);
+extern void set_cpu_current_state(int cpu, enum cpu_state_vals state);
+extern void set_preferred_offline_state(int cpu, enum cpu_state_vals state);
+extern void set_default_offline_state(int cpu);
+#else
+static inline enum cpu_state_vals get_cpu_current_state(int cpu)
+{
+	return CPU_STATE_ONLINE;
+}
+
+static inline void set_cpu_current_state(int cpu, enum cpu_state_vals state)
+{
+}
+
+static inline void set_preferred_offline_state(int cpu, enum cpu_state_vals state)
+{
+}
+
+static inline void set_default_offline_state(int cpu)
+{
+}
+#endif
+
+extern enum cpu_state_vals get_preferred_offline_state(int cpu);
+#endif
@@ -0,0 +1,109 @@
+/*
+ * Copyright (C) 2001 Dave Engebretsen, IBM Corporation
+ * Copyright (C) 2003 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ * pSeries specific routines for PCI.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *    
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/init.h>
+#include <linux/ioport.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/string.h>
+
+#include <asm/eeh.h>
+#include <asm/pci-bridge.h>
+#include <asm/prom.h>
+#include <asm/ppc-pci.h>
+
+#if 0
+void pcibios_name_device(struct pci_dev *dev)
+{
+	struct device_node *dn;
+
+	/*
+	 * Add IBM loc code (slot) as a prefix to the device names for service
+	 */
+	dn = pci_device_to_OF_node(dev);
+	if (dn) {
+		const char *loc_code = of_get_property(dn, "ibm,loc-code", 0);
+		if (loc_code) {
+			int loc_len = strlen(loc_code);
+			if (loc_len < sizeof(dev->dev.name)) {
+				memmove(dev->dev.name+loc_len+1, dev->dev.name,
+					sizeof(dev->dev.name)-loc_len-1);
+				memcpy(dev->dev.name, loc_code, loc_len);
+				dev->dev.name[loc_len] = ' ';
+				dev->dev.name[sizeof(dev->dev.name)-1] = '\0';
+			}
+		}
+	}
+}   
+DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pcibios_name_device);
+#endif
+
+static void __init pSeries_request_regions(void)
+{
+	if (!isa_io_base)
+		return;
+
+	request_region(0x20,0x20,"pic1");
+	request_region(0xa0,0x20,"pic2");
+	request_region(0x00,0x20,"dma1");
+	request_region(0x40,0x20,"timer");
+	request_region(0x80,0x10,"dma page reg");
+	request_region(0xc0,0x20,"dma2");
+}
+
+void __init pSeries_final_fixup(void)
+{
+	pSeries_request_regions();
+
+	pci_addr_cache_build();
+}
+
+/*
+ * Assume the winbond 82c105 is the IDE controller on a
+ * p610/p615/p630. We should probably be more careful in case
+ * someone tries to plug in a similar adapter.
+ */
+static void fixup_winbond_82c105(struct pci_dev* dev)
+{
+	int i;
+	unsigned int reg;
+
+	if (!machine_is(pseries))
+		return;
+
+	printk("Using INTC for W82c105 IDE controller.\n");
+	pci_read_config_dword(dev, 0x40, &reg);
+	/* Enable LEGIRQ to use INTC instead of ISA interrupts */
+	pci_write_config_dword(dev, 0x40, reg | (1<<11));
+
+	for (i = 0; i < DEVICE_COUNT_RESOURCE; ++i) {
+		/* zap the 2nd function of the winbond chip */
+		if (dev->resource[i].flags & IORESOURCE_IO
+		    && dev->bus->number == 0 && dev->devfn == 0x81)
+			dev->resource[i].flags &= ~IORESOURCE_IO;
+		if (dev->resource[i].start == 0 && dev->resource[i].end) {
+			dev->resource[i].flags = 0;
+			dev->resource[i].end = 0;
+		}
+	}
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_WINBOND, PCI_DEVICE_ID_WINBOND_82C105,
+			 fixup_winbond_82c105);
@@ -0,0 +1,212 @@
+/*
+ * PCI Dynamic LPAR, PCI Hot Plug and PCI EEH recovery code
+ * for RPA-compliant PPC64 platform.
+ * Copyright (C) 2003 Linda Xie <lxie@us.ibm.com>
+ * Copyright (C) 2005 International Business Machines
+ *
+ * Updates, 2005, John Rose <johnrose@austin.ibm.com>
+ * Updates, 2005, Linas Vepstas <linas@austin.ibm.com>
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/pci.h>
+#include <linux/export.h>
+#include <asm/pci-bridge.h>
+#include <asm/ppc-pci.h>
+#include <asm/firmware.h>
+#include <asm/eeh.h>
+
+static struct pci_bus *
+find_bus_among_children(struct pci_bus *bus,
+                        struct device_node *dn)
+{
+	struct pci_bus *child = NULL;
+	struct list_head *tmp;
+	struct device_node *busdn;
+
+	busdn = pci_bus_to_OF_node(bus);
+	if (busdn == dn)
+		return bus;
+
+	list_for_each(tmp, &bus->children) {
+		child = find_bus_among_children(pci_bus_b(tmp), dn);
+		if (child)
+			break;
+	};
+	return child;
+}
+
+struct pci_bus *
+pcibios_find_pci_bus(struct device_node *dn)
+{
+	struct pci_dn *pdn = dn->data;
+
+	if (!pdn  || !pdn->phb || !pdn->phb->bus)
+		return NULL;
+
+	return find_bus_among_children(pdn->phb->bus, dn);
+}
+EXPORT_SYMBOL_GPL(pcibios_find_pci_bus);
+
+/**
+ * pcibios_remove_pci_devices - remove all devices under this bus
+ *
+ * Remove all of the PCI devices under this bus both from the
+ * linux pci device tree, and from the powerpc EEH address cache.
+ */
+void pcibios_remove_pci_devices(struct pci_bus *bus)
+{
+ 	struct pci_dev *dev, *tmp;
+	struct pci_bus *child_bus;
+
+	/* First go down child busses */
+	list_for_each_entry(child_bus, &bus->children, node)
+		pcibios_remove_pci_devices(child_bus);
+
+	pr_debug("PCI: Removing devices on bus %04x:%02x\n",
+		 pci_domain_nr(bus),  bus->number);
+	list_for_each_entry_safe(dev, tmp, &bus->devices, bus_list) {
+		pr_debug("     * Removing %s...\n", pci_name(dev));
+		eeh_remove_bus_device(dev);
+ 		pci_stop_and_remove_bus_device(dev);
+ 	}
+}
+EXPORT_SYMBOL_GPL(pcibios_remove_pci_devices);
+
+/**
+ * pcibios_add_pci_devices - adds new pci devices to bus
+ *
+ * This routine will find and fixup new pci devices under
+ * the indicated bus. This routine presumes that there
+ * might already be some devices under this bridge, so
+ * it carefully tries to add only new devices.  (And that
+ * is how this routine differs from other, similar pcibios
+ * routines.)
+ */
+void pcibios_add_pci_devices(struct pci_bus * bus)
+{
+	int slotno, num, mode, pass, max;
+	struct pci_dev *dev;
+	struct device_node *dn = pci_bus_to_OF_node(bus);
+
+	eeh_add_device_tree_early(dn);
+
+	mode = PCI_PROBE_NORMAL;
+	if (ppc_md.pci_probe_mode)
+		mode = ppc_md.pci_probe_mode(bus);
+
+	if (mode == PCI_PROBE_DEVTREE) {
+		/* use ofdt-based probe */
+		of_rescan_bus(dn, bus);
+	} else if (mode == PCI_PROBE_NORMAL) {
+		/* use legacy probe */
+		slotno = PCI_SLOT(PCI_DN(dn->child)->devfn);
+		num = pci_scan_slot(bus, PCI_DEVFN(slotno, 0));
+		if (!num)
+			return;
+		pcibios_setup_bus_devices(bus);
+		max = bus->secondary;
+		for (pass=0; pass < 2; pass++)
+			list_for_each_entry(dev, &bus->devices, bus_list) {
+			if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE ||
+			    dev->hdr_type == PCI_HEADER_TYPE_CARDBUS)
+				max = pci_scan_bridge(bus, dev, max, pass);
+		}
+	}
+	pcibios_finish_adding_to_bus(bus);
+}
+EXPORT_SYMBOL_GPL(pcibios_add_pci_devices);
+
+struct pci_controller * __devinit init_phb_dynamic(struct device_node *dn)
+{
+	struct pci_controller *phb;
+
+	pr_debug("PCI: Initializing new hotplug PHB %s\n", dn->full_name);
+
+	phb = pcibios_alloc_controller(dn);
+	if (!phb)
+		return NULL;
+	rtas_setup_phb(phb);
+	pci_process_bridge_OF_ranges(phb, dn, 0);
+
+	pci_devs_phb_init_dynamic(phb);
+
+	/* Create EEH devices for the PHB */
+	eeh_dev_phb_init_dynamic(phb);
+
+	if (dn->child)
+		eeh_add_device_tree_early(dn);
+
+	pcibios_scan_phb(phb);
+	pcibios_finish_adding_to_bus(phb->bus);
+
+	return phb;
+}
+EXPORT_SYMBOL_GPL(init_phb_dynamic);
+
+/* RPA-specific bits for removing PHBs */
+int remove_phb_dynamic(struct pci_controller *phb)
+{
+	struct pci_bus *b = phb->bus;
+	struct resource *res;
+	int rc, i;
+
+	pr_debug("PCI: Removing PHB %04x:%02x...\n",
+		 pci_domain_nr(b), b->number);
+
+	/* We cannot to remove a root bus that has children */
+	if (!(list_empty(&b->children) && list_empty(&b->devices)))
+		return -EBUSY;
+
+	/* We -know- there aren't any child devices anymore at this stage
+	 * and thus, we can safely unmap the IO space as it's not in use
+	 */
+	res = &phb->io_resource;
+	if (res->flags & IORESOURCE_IO) {
+		rc = pcibios_unmap_io_space(b);
+		if (rc) {
+			printk(KERN_ERR "%s: failed to unmap IO on bus %s\n",
+			       __func__, b->name);
+			return 1;
+		}
+	}
+
+	/* Unregister the bridge device from sysfs and remove the PCI bus */
+	device_unregister(b->bridge);
+	phb->bus = NULL;
+	pci_remove_bus(b);
+
+	/* Now release the IO resource */
+	if (res->flags & IORESOURCE_IO)
+		release_resource(res);
+
+	/* Release memory resources */
+	for (i = 0; i < 3; ++i) {
+		res = &phb->mem_resources[i];
+		if (!(res->flags & IORESOURCE_MEM))
+			continue;
+		release_resource(res);
+	}
+
+	/* Free pci_controller data structure */
+	pcibios_free_controller(phb);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(remove_phb_dynamic);
@@ -0,0 +1,276 @@
+#ifndef _PSERIES_PLPAR_WRAPPERS_H
+#define _PSERIES_PLPAR_WRAPPERS_H
+
+#include <linux/string.h>
+
+#include <asm/hvcall.h>
+#include <asm/paca.h>
+#include <asm/page.h>
+
+/* Get state of physical CPU from query_cpu_stopped */
+int smp_query_cpu_stopped(unsigned int pcpu);
+#define QCSS_STOPPED 0
+#define QCSS_STOPPING 1
+#define QCSS_NOT_STOPPED 2
+#define QCSS_HARDWARE_ERROR -1
+#define QCSS_HARDWARE_BUSY -2
+
+static inline long poll_pending(void)
+{
+	return plpar_hcall_norets(H_POLL_PENDING);
+}
+
+static inline u8 get_cede_latency_hint(void)
+{
+	return get_lppaca()->gpr5_dword.fields.cede_latency_hint;
+}
+
+static inline void set_cede_latency_hint(u8 latency_hint)
+{
+	get_lppaca()->gpr5_dword.fields.cede_latency_hint = latency_hint;
+}
+
+static inline long cede_processor(void)
+{
+	return plpar_hcall_norets(H_CEDE);
+}
+
+static inline long extended_cede_processor(unsigned long latency_hint)
+{
+	long rc;
+	u8 old_latency_hint = get_cede_latency_hint();
+
+	set_cede_latency_hint(latency_hint);
+	rc = cede_processor();
+	set_cede_latency_hint(old_latency_hint);
+
+	return rc;
+}
+
+static inline long vpa_call(unsigned long flags, unsigned long cpu,
+		unsigned long vpa)
+{
+	/* flags are in bits 16-18 (counting from most significant bit) */
+	flags = flags << (63 - 18);
+
+	return plpar_hcall_norets(H_REGISTER_VPA, flags, cpu, vpa);
+}
+
+static inline long unregister_vpa(unsigned long cpu)
+{
+	return vpa_call(0x5, cpu, 0);
+}
+
+static inline long register_vpa(unsigned long cpu, unsigned long vpa)
+{
+	return vpa_call(0x1, cpu, vpa);
+}
+
+static inline long unregister_slb_shadow(unsigned long cpu)
+{
+	return vpa_call(0x7, cpu, 0);
+}
+
+static inline long register_slb_shadow(unsigned long cpu, unsigned long vpa)
+{
+	return vpa_call(0x3, cpu, vpa);
+}
+
+static inline long unregister_dtl(unsigned long cpu)
+{
+	return vpa_call(0x6, cpu, 0);
+}
+
+static inline long register_dtl(unsigned long cpu, unsigned long vpa)
+{
+	return vpa_call(0x2, cpu, vpa);
+}
+
+static inline long plpar_page_set_loaned(unsigned long vpa)
+{
+	unsigned long cmo_page_sz = cmo_get_page_size();
+	long rc = 0;
+	int i;
+
+	for (i = 0; !rc && i < PAGE_SIZE; i += cmo_page_sz)
+		rc = plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_LOANED, vpa + i, 0);
+
+	for (i -= cmo_page_sz; rc && i != 0; i -= cmo_page_sz)
+		plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_ACTIVE,
+				   vpa + i - cmo_page_sz, 0);
+
+	return rc;
+}
+
+static inline long plpar_page_set_active(unsigned long vpa)
+{
+	unsigned long cmo_page_sz = cmo_get_page_size();
+	long rc = 0;
+	int i;
+
+	for (i = 0; !rc && i < PAGE_SIZE; i += cmo_page_sz)
+		rc = plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_ACTIVE, vpa + i, 0);
+
+	for (i -= cmo_page_sz; rc && i != 0; i -= cmo_page_sz)
+		plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_LOANED,
+				   vpa + i - cmo_page_sz, 0);
+
+	return rc;
+}
+
+extern void vpa_init(int cpu);
+
+static inline long plpar_pte_enter(unsigned long flags,
+		unsigned long hpte_group, unsigned long hpte_v,
+		unsigned long hpte_r, unsigned long *slot)
+{
+	long rc;
+	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+
+	rc = plpar_hcall(H_ENTER, retbuf, flags, hpte_group, hpte_v, hpte_r);
+
+	*slot = retbuf[0];
+
+	return rc;
+}
+
+static inline long plpar_pte_remove(unsigned long flags, unsigned long ptex,
+		unsigned long avpn, unsigned long *old_pteh_ret,
+		unsigned long *old_ptel_ret)
+{
+	long rc;
+	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+
+	rc = plpar_hcall(H_REMOVE, retbuf, flags, ptex, avpn);
+
+	*old_pteh_ret = retbuf[0];
+	*old_ptel_ret = retbuf[1];
+
+	return rc;
+}
+
+/* plpar_pte_remove_raw can be called in real mode. It calls plpar_hcall_raw */
+static inline long plpar_pte_remove_raw(unsigned long flags, unsigned long ptex,
+		unsigned long avpn, unsigned long *old_pteh_ret,
+		unsigned long *old_ptel_ret)
+{
+	long rc;
+	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+
+	rc = plpar_hcall_raw(H_REMOVE, retbuf, flags, ptex, avpn);
+
+	*old_pteh_ret = retbuf[0];
+	*old_ptel_ret = retbuf[1];
+
+	return rc;
+}
+
+static inline long plpar_pte_read(unsigned long flags, unsigned long ptex,
+		unsigned long *old_pteh_ret, unsigned long *old_ptel_ret)
+{
+	long rc;
+	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+
+	rc = plpar_hcall(H_READ, retbuf, flags, ptex);
+
+	*old_pteh_ret = retbuf[0];
+	*old_ptel_ret = retbuf[1];
+
+	return rc;
+}
+
+/* plpar_pte_read_raw can be called in real mode. It calls plpar_hcall_raw */
+static inline long plpar_pte_read_raw(unsigned long flags, unsigned long ptex,
+		unsigned long *old_pteh_ret, unsigned long *old_ptel_ret)
+{
+	long rc;
+	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+
+	rc = plpar_hcall_raw(H_READ, retbuf, flags, ptex);
+
+	*old_pteh_ret = retbuf[0];
+	*old_ptel_ret = retbuf[1];
+
+	return rc;
+}
+
+/*
+ * plpar_pte_read_4_raw can be called in real mode.
+ * ptes must be 8*sizeof(unsigned long)
+ */
+static inline long plpar_pte_read_4_raw(unsigned long flags, unsigned long ptex,
+					unsigned long *ptes)
+
+{
+	long rc;
+	unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
+
+	rc = plpar_hcall9_raw(H_READ, retbuf, flags | H_READ_4, ptex);
+
+	memcpy(ptes, retbuf, 8*sizeof(unsigned long));
+
+	return rc;
+}
+
+static inline long plpar_pte_protect(unsigned long flags, unsigned long ptex,
+		unsigned long avpn)
+{
+	return plpar_hcall_norets(H_PROTECT, flags, ptex, avpn);
+}
+
+static inline long plpar_tce_get(unsigned long liobn, unsigned long ioba,
+		unsigned long *tce_ret)
+{
+	long rc;
+	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+
+	rc = plpar_hcall(H_GET_TCE, retbuf, liobn, ioba);
+
+	*tce_ret = retbuf[0];
+
+	return rc;
+}
+
+static inline long plpar_tce_put(unsigned long liobn, unsigned long ioba,
+		unsigned long tceval)
+{
+	return plpar_hcall_norets(H_PUT_TCE, liobn, ioba, tceval);
+}
+
+static inline long plpar_tce_put_indirect(unsigned long liobn,
+		unsigned long ioba, unsigned long page, unsigned long count)
+{
+	return plpar_hcall_norets(H_PUT_TCE_INDIRECT, liobn, ioba, page, count);
+}
+
+static inline long plpar_tce_stuff(unsigned long liobn, unsigned long ioba,
+		unsigned long tceval, unsigned long count)
+{
+	return plpar_hcall_norets(H_STUFF_TCE, liobn, ioba, tceval, count);
+}
+
+static inline long plpar_get_term_char(unsigned long termno,
+		unsigned long *len_ret, char *buf_ret)
+{
+	long rc;
+	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+	unsigned long *lbuf = (unsigned long *)buf_ret;	/* TODO: alignment? */
+
+	rc = plpar_hcall(H_GET_TERM_CHAR, retbuf, termno);
+
+	*len_ret = retbuf[0];
+	lbuf[0] = retbuf[1];
+	lbuf[1] = retbuf[2];
+
+	return rc;
+}
+
+static inline long plpar_put_term_char(unsigned long termno, unsigned long len,
+		const char *buffer)
+{
+	unsigned long *lbuf = (unsigned long *)buffer;	/* TODO: alignment? */
+	return plpar_hcall_norets(H_PUT_TERM_CHAR, termno, len, lbuf[0],
+			lbuf[1]);
+}
+
+#endif /* _PSERIES_PLPAR_WRAPPERS_H */
@@ -0,0 +1,81 @@
+/*
+ *  Interface for power-management for ppc64 compliant platform
+ *
+ *  Manish Ahuja <mahuja@us.ibm.com>
+ *
+ *  Feb 2007
+ *
+ *  Copyright (C) 2007 IBM Corporation.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; version 2 of the License.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kobject.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+
+unsigned long rtas_poweron_auto; /* default and normal state is 0 */
+
+static ssize_t auto_poweron_show(struct kobject *kobj,
+				 struct kobj_attribute *attr, char *buf)
+{
+        return sprintf(buf, "%lu\n", rtas_poweron_auto);
+}
+
+static ssize_t auto_poweron_store(struct kobject *kobj,
+				  struct kobj_attribute *attr,
+				  const char *buf, size_t n)
+{
+	int ret;
+	unsigned long ups_restart;
+	ret = sscanf(buf, "%lu", &ups_restart);
+
+	if ((ret == 1) && ((ups_restart == 1) || (ups_restart == 0))){
+		rtas_poweron_auto = ups_restart;
+		return n;
+	}
+	return -EINVAL;
+}
+
+static struct kobj_attribute auto_poweron_attr =
+	__ATTR(auto_poweron, 0644, auto_poweron_show, auto_poweron_store);
+
+#ifndef CONFIG_PM
+struct kobject *power_kobj;
+
+static struct attribute *g[] = {
+        &auto_poweron_attr.attr,
+        NULL,
+};
+
+static struct attribute_group attr_group = {
+        .attrs = g,
+};
+
+static int __init pm_init(void)
+{
+	power_kobj = kobject_create_and_add("power", NULL);
+	if (!power_kobj)
+		return -ENOMEM;
+	return sysfs_create_group(power_kobj, &attr_group);
+}
+core_initcall(pm_init);
+#else
+static int __init apo_pm_init(void)
+{
+	return (sysfs_create_file(power_kobj, &auto_poweron_attr.attr));
+}
+__initcall(apo_pm_init);
+#endif
@@ -0,0 +1,343 @@
+/*
+ *  processor_idle - idle state cpuidle driver.
+ *  Adapted from drivers/idle/intel_idle.c and
+ *  drivers/acpi/processor_idle.c
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/moduleparam.h>
+#include <linux/cpuidle.h>
+#include <linux/cpu.h>
+
+#include <asm/paca.h>
+#include <asm/reg.h>
+#include <asm/machdep.h>
+#include <asm/firmware.h>
+#include <asm/runlatch.h>
+
+#include "plpar_wrappers.h"
+#include "pseries.h"
+
+struct cpuidle_driver pseries_idle_driver = {
+	.name =		"pseries_idle",
+	.owner =	THIS_MODULE,
+};
+
+#define MAX_IDLE_STATE_COUNT	2
+
+static int max_idle_state = MAX_IDLE_STATE_COUNT - 1;
+static struct cpuidle_device __percpu *pseries_cpuidle_devices;
+static struct cpuidle_state *cpuidle_state_table;
+
+void update_smt_snooze_delay(int snooze)
+{
+	struct cpuidle_driver *drv = cpuidle_get_driver();
+	if (drv)
+		drv->states[0].target_residency = snooze;
+}
+
+static inline void idle_loop_prolog(unsigned long *in_purr, ktime_t *kt_before)
+{
+
+	*kt_before = ktime_get_real();
+	*in_purr = mfspr(SPRN_PURR);
+	/*
+	 * Indicate to the HV that we are idle. Now would be
+	 * a good time to find other work to dispatch.
+	 */
+	get_lppaca()->idle = 1;
+}
+
+static inline  s64 idle_loop_epilog(unsigned long in_purr, ktime_t kt_before)
+{
+	get_lppaca()->wait_state_cycles += mfspr(SPRN_PURR) - in_purr;
+	get_lppaca()->idle = 0;
+
+	return ktime_to_us(ktime_sub(ktime_get_real(), kt_before));
+}
+
+static int snooze_loop(struct cpuidle_device *dev,
+			struct cpuidle_driver *drv,
+			int index)
+{
+	unsigned long in_purr;
+	ktime_t kt_before;
+	unsigned long start_snooze;
+	long snooze = drv->states[0].target_residency;
+
+	idle_loop_prolog(&in_purr, &kt_before);
+
+	if (snooze) {
+		start_snooze = get_tb() + snooze * tb_ticks_per_usec;
+		local_irq_enable();
+		set_thread_flag(TIF_POLLING_NRFLAG);
+
+		while ((snooze < 0) || (get_tb() < start_snooze)) {
+			if (need_resched() || cpu_is_offline(dev->cpu))
+				goto out;
+			ppc64_runlatch_off();
+			HMT_low();
+			HMT_very_low();
+		}
+
+		HMT_medium();
+		clear_thread_flag(TIF_POLLING_NRFLAG);
+		smp_mb();
+		local_irq_disable();
+	}
+
+out:
+	HMT_medium();
+	dev->last_residency =
+		(int)idle_loop_epilog(in_purr, kt_before);
+	return index;
+}
+
+static void check_and_cede_processor(void)
+{
+	/*
+	 * Interrupts are soft-disabled at this point,
+	 * but not hard disabled. So an interrupt might have
+	 * occurred before entering NAP, and would be potentially
+	 * lost (edge events, decrementer events, etc...) unless
+	 * we first hard disable then check.
+	 */
+	hard_irq_disable();
+	if (get_paca()->irq_happened == 0)
+		cede_processor();
+}
+
+static int dedicated_cede_loop(struct cpuidle_device *dev,
+				struct cpuidle_driver *drv,
+				int index)
+{
+	unsigned long in_purr;
+	ktime_t kt_before;
+
+	idle_loop_prolog(&in_purr, &kt_before);
+	get_lppaca()->donate_dedicated_cpu = 1;
+
+	ppc64_runlatch_off();
+	HMT_medium();
+	check_and_cede_processor();
+
+	get_lppaca()->donate_dedicated_cpu = 0;
+	dev->last_residency =
+		(int)idle_loop_epilog(in_purr, kt_before);
+	return index;
+}
+
+static int shared_cede_loop(struct cpuidle_device *dev,
+			struct cpuidle_driver *drv,
+			int index)
+{
+	unsigned long in_purr;
+	ktime_t kt_before;
+
+	idle_loop_prolog(&in_purr, &kt_before);
+
+	/*
+	 * Yield the processor to the hypervisor.  We return if
+	 * an external interrupt occurs (which are driven prior
+	 * to returning here) or if a prod occurs from another
+	 * processor. When returning here, external interrupts
+	 * are enabled.
+	 */
+	check_and_cede_processor();
+
+	dev->last_residency =
+		(int)idle_loop_epilog(in_purr, kt_before);
+	return index;
+}
+
+/*
+ * States for dedicated partition case.
+ */
+static struct cpuidle_state dedicated_states[MAX_IDLE_STATE_COUNT] = {
+	{ /* Snooze */
+		.name = "snooze",
+		.desc = "snooze",
+		.flags = CPUIDLE_FLAG_TIME_VALID,
+		.exit_latency = 0,
+		.target_residency = 0,
+		.enter = &snooze_loop },
+	{ /* CEDE */
+		.name = "CEDE",
+		.desc = "CEDE",
+		.flags = CPUIDLE_FLAG_TIME_VALID,
+		.exit_latency = 1,
+		.target_residency = 10,
+		.enter = &dedicated_cede_loop },
+};
+
+/*
+ * States for shared partition case.
+ */
+static struct cpuidle_state shared_states[MAX_IDLE_STATE_COUNT] = {
+	{ /* Shared Cede */
+		.name = "Shared Cede",
+		.desc = "Shared Cede",
+		.flags = CPUIDLE_FLAG_TIME_VALID,
+		.exit_latency = 0,
+		.target_residency = 0,
+		.enter = &shared_cede_loop },
+};
+
+int pseries_notify_cpuidle_add_cpu(int cpu)
+{
+	struct cpuidle_device *dev =
+			per_cpu_ptr(pseries_cpuidle_devices, cpu);
+	if (dev && cpuidle_get_driver()) {
+		cpuidle_disable_device(dev);
+		cpuidle_enable_device(dev);
+	}
+	return 0;
+}
+
+/*
+ * pseries_cpuidle_driver_init()
+ */
+static int pseries_cpuidle_driver_init(void)
+{
+	int idle_state;
+	struct cpuidle_driver *drv = &pseries_idle_driver;
+
+	drv->state_count = 0;
+
+	for (idle_state = 0; idle_state < MAX_IDLE_STATE_COUNT; ++idle_state) {
+
+		if (idle_state > max_idle_state)
+			break;
+
+		/* is the state not enabled? */
+		if (cpuidle_state_table[idle_state].enter == NULL)
+			continue;
+
+		drv->states[drv->state_count] =	/* structure copy */
+			cpuidle_state_table[idle_state];
+
+		if (cpuidle_state_table == dedicated_states)
+			drv->states[drv->state_count].target_residency =
+				__get_cpu_var(smt_snooze_delay);
+
+		drv->state_count += 1;
+	}
+
+	return 0;
+}
+
+/* pseries_idle_devices_uninit(void)
+ * unregister cpuidle devices and de-allocate memory
+ */
+static void pseries_idle_devices_uninit(void)
+{
+	int i;
+	struct cpuidle_device *dev;
+
+	for_each_possible_cpu(i) {
+		dev = per_cpu_ptr(pseries_cpuidle_devices, i);
+		cpuidle_unregister_device(dev);
+	}
+
+	free_percpu(pseries_cpuidle_devices);
+	return;
+}
+
+/* pseries_idle_devices_init()
+ * allocate, initialize and register cpuidle device
+ */
+static int pseries_idle_devices_init(void)
+{
+	int i;
+	struct cpuidle_driver *drv = &pseries_idle_driver;
+	struct cpuidle_device *dev;
+
+	pseries_cpuidle_devices = alloc_percpu(struct cpuidle_device);
+	if (pseries_cpuidle_devices == NULL)
+		return -ENOMEM;
+
+	for_each_possible_cpu(i) {
+		dev = per_cpu_ptr(pseries_cpuidle_devices, i);
+		dev->state_count = drv->state_count;
+		dev->cpu = i;
+		if (cpuidle_register_device(dev)) {
+			printk(KERN_DEBUG \
+				"cpuidle_register_device %d failed!\n", i);
+			return -EIO;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * pseries_idle_probe()
+ * Choose state table for shared versus dedicated partition
+ */
+static int pseries_idle_probe(void)
+{
+
+	if (!firmware_has_feature(FW_FEATURE_SPLPAR))
+		return -ENODEV;
+
+	if (cpuidle_disable != IDLE_NO_OVERRIDE)
+		return -ENODEV;
+
+	if (max_idle_state == 0) {
+		printk(KERN_DEBUG "pseries processor idle disabled.\n");
+		return -EPERM;
+	}
+
+	if (get_lppaca()->shared_proc)
+		cpuidle_state_table = shared_states;
+	else
+		cpuidle_state_table = dedicated_states;
+
+	return 0;
+}
+
+static int __init pseries_processor_idle_init(void)
+{
+	int retval;
+
+	retval = pseries_idle_probe();
+	if (retval)
+		return retval;
+
+	pseries_cpuidle_driver_init();
+	retval = cpuidle_register_driver(&pseries_idle_driver);
+	if (retval) {
+		printk(KERN_DEBUG "Registration of pseries driver failed.\n");
+		return retval;
+	}
+
+	retval = pseries_idle_devices_init();
+	if (retval) {
+		pseries_idle_devices_uninit();
+		cpuidle_unregister_driver(&pseries_idle_driver);
+		return retval;
+	}
+
+	printk(KERN_DEBUG "pseries_idle_driver registered\n");
+
+	return 0;
+}
+
+static void __exit pseries_processor_idle_exit(void)
+{
+
+	pseries_idle_devices_uninit();
+	cpuidle_unregister_driver(&pseries_idle_driver);
+
+	return;
+}
+
+module_init(pseries_processor_idle_init);
+module_exit(pseries_processor_idle_exit);
+
+MODULE_AUTHOR("Deepthi Dharwar <deepthi@linux.vnet.ibm.com>");
+MODULE_DESCRIPTION("Cpuidle driver for POWER");
+MODULE_LICENSE("GPL");
@@ -0,0 +1,63 @@
+/*
+ * Copyright 2006 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _PSERIES_PSERIES_H
+#define _PSERIES_PSERIES_H
+
+#include <linux/interrupt.h>
+
+struct device_node;
+
+extern void request_event_sources_irqs(struct device_node *np,
+				       irq_handler_t handler, const char *name);
+
+#include <linux/of.h>
+
+extern void __init fw_feature_init(const char *hypertas, unsigned long len);
+
+struct pt_regs;
+
+extern int pSeries_system_reset_exception(struct pt_regs *regs);
+extern int pSeries_machine_check_exception(struct pt_regs *regs);
+
+#ifdef CONFIG_SMP
+extern void smp_init_pseries_mpic(void);
+extern void smp_init_pseries_xics(void);
+#else
+static inline void smp_init_pseries_mpic(void) { };
+static inline void smp_init_pseries_xics(void) { };
+#endif
+
+#ifdef CONFIG_KEXEC
+extern void setup_kexec_cpu_down_xics(void);
+extern void setup_kexec_cpu_down_mpic(void);
+#else
+static inline void setup_kexec_cpu_down_xics(void) { }
+static inline void setup_kexec_cpu_down_mpic(void) { }
+#endif
+
+extern void pSeries_final_fixup(void);
+
+/* Poweron flag used for enabling auto ups restart */
+extern unsigned long rtas_poweron_auto;
+
+/* Provided by HVC VIO */
+extern void hvc_vio_init_early(void);
+
+/* Dynamic logical Partitioning/Mobility */
+extern void dlpar_free_cc_nodes(struct device_node *);
+extern void dlpar_free_cc_property(struct property *);
+extern struct device_node *dlpar_configure_connector(u32);
+extern int dlpar_attach_node(struct device_node *);
+extern int dlpar_detach_node(struct device_node *);
+
+/* Snooze Delay, pseries_idle */
+DECLARE_PER_CPU(long, smt_snooze_delay);
+
+#endif /* _PSERIES_PSERIES_H */
@@ -0,0 +1,323 @@
+/*
+ * POWER platform energy management driver
+ * Copyright (C) 2010 IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This pseries platform device driver provides access to
+ * platform energy management capabilities.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/seq_file.h>
+#include <linux/device.h>
+#include <linux/cpu.h>
+#include <linux/of.h>
+#include <asm/cputhreads.h>
+#include <asm/page.h>
+#include <asm/hvcall.h>
+
+
+#define MODULE_VERS "1.0"
+#define MODULE_NAME "pseries_energy"
+
+/* Driver flags */
+
+static int sysfs_entries;
+
+/* Helper routines */
+
+/*
+ * Routine to detect firmware support for hcall
+ * return 1 if H_BEST_ENERGY is supported
+ * else return 0
+ */
+
+static int check_for_h_best_energy(void)
+{
+	struct device_node *rtas = NULL;
+	const char *hypertas, *s;
+	int length;
+	int rc = 0;
+
+	rtas = of_find_node_by_path("/rtas");
+	if (!rtas)
+		return 0;
+
+	hypertas = of_get_property(rtas, "ibm,hypertas-functions", &length);
+	if (!hypertas) {
+		of_node_put(rtas);
+		return 0;
+	}
+
+	/* hypertas will have list of strings with hcall names */
+	for (s = hypertas; s < hypertas + length; s += strlen(s) + 1) {
+		if (!strncmp("hcall-best-energy-1", s, 19)) {
+			rc = 1; /* Found the string */
+			break;
+		}
+	}
+	of_node_put(rtas);
+	return rc;
+}
+
+/* Helper Routines to convert between drc_index to cpu numbers */
+
+static u32 cpu_to_drc_index(int cpu)
+{
+	struct device_node *dn = NULL;
+	const int *indexes;
+	int i;
+	int rc = 1;
+	u32 ret = 0;
+
+	dn = of_find_node_by_path("/cpus");
+	if (dn == NULL)
+		goto err;
+	indexes = of_get_property(dn, "ibm,drc-indexes", NULL);
+	if (indexes == NULL)
+		goto err_of_node_put;
+	/* Convert logical cpu number to core number */
+	i = cpu_core_index_of_thread(cpu);
+	/*
+	 * The first element indexes[0] is the number of drc_indexes
+	 * returned in the list.  Hence i+1 will get the drc_index
+	 * corresponding to core number i.
+	 */
+	WARN_ON(i > indexes[0]);
+	ret = indexes[i + 1];
+	rc = 0;
+
+err_of_node_put:
+	of_node_put(dn);
+err:
+	if (rc)
+		printk(KERN_WARNING "cpu_to_drc_index(%d) failed", cpu);
+	return ret;
+}
+
+static int drc_index_to_cpu(u32 drc_index)
+{
+	struct device_node *dn = NULL;
+	const int *indexes;
+	int i, cpu = 0;
+	int rc = 1;
+
+	dn = of_find_node_by_path("/cpus");
+	if (dn == NULL)
+		goto err;
+	indexes = of_get_property(dn, "ibm,drc-indexes", NULL);
+	if (indexes == NULL)
+		goto err_of_node_put;
+	/*
+	 * First element in the array is the number of drc_indexes
+	 * returned.  Search through the list to find the matching
+	 * drc_index and get the core number
+	 */
+	for (i = 0; i < indexes[0]; i++) {
+		if (indexes[i + 1] == drc_index)
+			break;
+	}
+	/* Convert core number to logical cpu number */
+	cpu = cpu_first_thread_of_core(i);
+	rc = 0;
+
+err_of_node_put:
+	of_node_put(dn);
+err:
+	if (rc)
+		printk(KERN_WARNING "drc_index_to_cpu(%d) failed", drc_index);
+	return cpu;
+}
+
+/*
+ * pseries hypervisor call H_BEST_ENERGY provides hints to OS on
+ * preferred logical cpus to activate or deactivate for optimized
+ * energy consumption.
+ */
+
+#define FLAGS_MODE1	0x004E200000080E01
+#define FLAGS_MODE2	0x004E200000080401
+#define FLAGS_ACTIVATE  0x100
+
+static ssize_t get_best_energy_list(char *page, int activate)
+{
+	int rc, cnt, i, cpu;
+	unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
+	unsigned long flags = 0;
+	u32 *buf_page;
+	char *s = page;
+
+	buf_page = (u32 *) get_zeroed_page(GFP_KERNEL);
+	if (!buf_page)
+		return -ENOMEM;
+
+	flags = FLAGS_MODE1;
+	if (activate)
+		flags |= FLAGS_ACTIVATE;
+
+	rc = plpar_hcall9(H_BEST_ENERGY, retbuf, flags, 0, __pa(buf_page),
+				0, 0, 0, 0, 0, 0);
+	if (rc != H_SUCCESS) {
+		free_page((unsigned long) buf_page);
+		return -EINVAL;
+	}
+
+	cnt = retbuf[0];
+	for (i = 0; i < cnt; i++) {
+		cpu = drc_index_to_cpu(buf_page[2*i+1]);
+		if ((cpu_online(cpu) && !activate) ||
+		    (!cpu_online(cpu) && activate))
+			s += sprintf(s, "%d,", cpu);
+	}
+	if (s > page) { /* Something to show */
+		s--; /* Suppress last comma */
+		s += sprintf(s, "\n");
+	}
+
+	free_page((unsigned long) buf_page);
+	return s-page;
+}
+
+static ssize_t get_best_energy_data(struct device *dev,
+					char *page, int activate)
+{
+	int rc;
+	unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
+	unsigned long flags = 0;
+
+	flags = FLAGS_MODE2;
+	if (activate)
+		flags |= FLAGS_ACTIVATE;
+
+	rc = plpar_hcall9(H_BEST_ENERGY, retbuf, flags,
+				cpu_to_drc_index(dev->id),
+				0, 0, 0, 0, 0, 0, 0);
+
+	if (rc != H_SUCCESS)
+		return -EINVAL;
+
+	return sprintf(page, "%lu\n", retbuf[1] >> 32);
+}
+
+/* Wrapper functions */
+
+static ssize_t cpu_activate_hint_list_show(struct device *dev,
+			struct device_attribute *attr, char *page)
+{
+	return get_best_energy_list(page, 1);
+}
+
+static ssize_t cpu_deactivate_hint_list_show(struct device *dev,
+			struct device_attribute *attr, char *page)
+{
+	return get_best_energy_list(page, 0);
+}
+
+static ssize_t percpu_activate_hint_show(struct device *dev,
+			struct device_attribute *attr, char *page)
+{
+	return get_best_energy_data(dev, page, 1);
+}
+
+static ssize_t percpu_deactivate_hint_show(struct device *dev,
+			struct device_attribute *attr, char *page)
+{
+	return get_best_energy_data(dev, page, 0);
+}
+
+/*
+ * Create sysfs interface:
+ * /sys/devices/system/cpu/pseries_activate_hint_list
+ * /sys/devices/system/cpu/pseries_deactivate_hint_list
+ *	Comma separated list of cpus to activate or deactivate
+ * /sys/devices/system/cpu/cpuN/pseries_activate_hint
+ * /sys/devices/system/cpu/cpuN/pseries_deactivate_hint
+ *	Per-cpu value of the hint
+ */
+
+struct device_attribute attr_cpu_activate_hint_list =
+		__ATTR(pseries_activate_hint_list, 0444,
+		cpu_activate_hint_list_show, NULL);
+
+struct device_attribute attr_cpu_deactivate_hint_list =
+		__ATTR(pseries_deactivate_hint_list, 0444,
+		cpu_deactivate_hint_list_show, NULL);
+
+struct device_attribute attr_percpu_activate_hint =
+		__ATTR(pseries_activate_hint, 0444,
+		percpu_activate_hint_show, NULL);
+
+struct device_attribute attr_percpu_deactivate_hint =
+		__ATTR(pseries_deactivate_hint, 0444,
+		percpu_deactivate_hint_show, NULL);
+
+static int __init pseries_energy_init(void)
+{
+	int cpu, err;
+	struct device *cpu_dev;
+
+	if (!check_for_h_best_energy()) {
+		printk(KERN_INFO "Hypercall H_BEST_ENERGY not supported\n");
+		return 0;
+	}
+	/* Create the sysfs files */
+	err = device_create_file(cpu_subsys.dev_root,
+				&attr_cpu_activate_hint_list);
+	if (!err)
+		err = device_create_file(cpu_subsys.dev_root,
+				&attr_cpu_deactivate_hint_list);
+
+	if (err)
+		return err;
+	for_each_possible_cpu(cpu) {
+		cpu_dev = get_cpu_device(cpu);
+		err = device_create_file(cpu_dev,
+				&attr_percpu_activate_hint);
+		if (err)
+			break;
+		err = device_create_file(cpu_dev,
+				&attr_percpu_deactivate_hint);
+		if (err)
+			break;
+	}
+
+	if (err)
+		return err;
+
+	sysfs_entries = 1; /* Removed entries on cleanup */
+	return 0;
+
+}
+
+static void __exit pseries_energy_cleanup(void)
+{
+	int cpu;
+	struct device *cpu_dev;
+
+	if (!sysfs_entries)
+		return;
+
+	/* Remove the sysfs files */
+	device_remove_file(cpu_subsys.dev_root, &attr_cpu_activate_hint_list);
+	device_remove_file(cpu_subsys.dev_root, &attr_cpu_deactivate_hint_list);
+
+	for_each_possible_cpu(cpu) {
+		cpu_dev = get_cpu_device(cpu);
+		sysfs_remove_file(&cpu_dev->kobj,
+				&attr_percpu_activate_hint.attr);
+		sysfs_remove_file(&cpu_dev->kobj,
+				&attr_percpu_deactivate_hint.attr);
+	}
+}
+
+module_init(pseries_energy_init);
+module_exit(pseries_energy_cleanup);
+MODULE_DESCRIPTION("Driver for pSeries platform energy management");
+MODULE_AUTHOR("Vaidyanathan Srinivasan");
+MODULE_LICENSE("GPL");
@@ -0,0 +1,408 @@
+/*
+ * Copyright (C) 2001 Dave Engebretsen IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/of.h>
+#include <linux/fs.h>
+#include <linux/reboot.h>
+
+#include <asm/machdep.h>
+#include <asm/rtas.h>
+#include <asm/firmware.h>
+
+#include "pseries.h"
+
+static unsigned char ras_log_buf[RTAS_ERROR_LOG_MAX];
+static DEFINE_SPINLOCK(ras_log_buf_lock);
+
+static char global_mce_data_buf[RTAS_ERROR_LOG_MAX];
+static DEFINE_PER_CPU(__u64, mce_data_buf);
+
+static int ras_check_exception_token;
+
+#define EPOW_SENSOR_TOKEN	9
+#define EPOW_SENSOR_INDEX	0
+
+static irqreturn_t ras_epow_interrupt(int irq, void *dev_id);
+static irqreturn_t ras_error_interrupt(int irq, void *dev_id);
+
+
+/*
+ * Initialize handlers for the set of interrupts caused by hardware errors
+ * and power system events.
+ */
+static int __init init_ras_IRQ(void)
+{
+	struct device_node *np;
+
+	ras_check_exception_token = rtas_token("check-exception");
+
+	/* Internal Errors */
+	np = of_find_node_by_path("/event-sources/internal-errors");
+	if (np != NULL) {
+		request_event_sources_irqs(np, ras_error_interrupt,
+					   "RAS_ERROR");
+		of_node_put(np);
+	}
+
+	/* EPOW Events */
+	np = of_find_node_by_path("/event-sources/epow-events");
+	if (np != NULL) {
+		request_event_sources_irqs(np, ras_epow_interrupt, "RAS_EPOW");
+		of_node_put(np);
+	}
+
+	return 0;
+}
+subsys_initcall(init_ras_IRQ);
+
+#define EPOW_SHUTDOWN_NORMAL				1
+#define EPOW_SHUTDOWN_ON_UPS				2
+#define EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS	3
+#define EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH	4
+
+static void handle_system_shutdown(char event_modifier)
+{
+	switch (event_modifier) {
+	case EPOW_SHUTDOWN_NORMAL:
+		pr_emerg("Firmware initiated power off");
+		orderly_poweroff(1);
+		break;
+
+	case EPOW_SHUTDOWN_ON_UPS:
+		pr_emerg("Loss of power reported by firmware, system is "
+			"running on UPS/battery");
+		break;
+
+	case EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS:
+		pr_emerg("Loss of system critical functions reported by "
+			"firmware");
+		pr_emerg("Check RTAS error log for details");
+		orderly_poweroff(1);
+		break;
+
+	case EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH:
+		pr_emerg("Ambient temperature too high reported by firmware");
+		pr_emerg("Check RTAS error log for details");
+		orderly_poweroff(1);
+		break;
+
+	default:
+		pr_err("Unknown power/cooling shutdown event (modifier %d)",
+			event_modifier);
+	}
+}
+
+struct epow_errorlog {
+	unsigned char sensor_value;
+	unsigned char event_modifier;
+	unsigned char extended_modifier;
+	unsigned char reserved;
+	unsigned char platform_reason;
+};
+
+#define EPOW_RESET			0
+#define EPOW_WARN_COOLING		1
+#define EPOW_WARN_POWER			2
+#define EPOW_SYSTEM_SHUTDOWN		3
+#define EPOW_SYSTEM_HALT		4
+#define EPOW_MAIN_ENCLOSURE		5
+#define EPOW_POWER_OFF			7
+
+void rtas_parse_epow_errlog(struct rtas_error_log *log)
+{
+	struct pseries_errorlog *pseries_log;
+	struct epow_errorlog *epow_log;
+	char action_code;
+	char modifier;
+
+	pseries_log = get_pseries_errorlog(log, PSERIES_ELOG_SECT_ID_EPOW);
+	if (pseries_log == NULL)
+		return;
+
+	epow_log = (struct epow_errorlog *)pseries_log->data;
+	action_code = epow_log->sensor_value & 0xF;	/* bottom 4 bits */
+	modifier = epow_log->event_modifier & 0xF;	/* bottom 4 bits */
+
+	switch (action_code) {
+	case EPOW_RESET:
+		pr_err("Non critical power or cooling issue cleared");
+		break;
+
+	case EPOW_WARN_COOLING:
+		pr_err("Non critical cooling issue reported by firmware");
+		pr_err("Check RTAS error log for details");
+		break;
+
+	case EPOW_WARN_POWER:
+		pr_err("Non critical power issue reported by firmware");
+		pr_err("Check RTAS error log for details");
+		break;
+
+	case EPOW_SYSTEM_SHUTDOWN:
+		handle_system_shutdown(epow_log->event_modifier);
+		break;
+
+	case EPOW_SYSTEM_HALT:
+		pr_emerg("Firmware initiated power off");
+		orderly_poweroff(1);
+		break;
+
+	case EPOW_MAIN_ENCLOSURE:
+	case EPOW_POWER_OFF:
+		pr_emerg("Critical power/cooling issue reported by firmware");
+		pr_emerg("Check RTAS error log for details");
+		pr_emerg("Immediate power off");
+		emergency_sync();
+		kernel_power_off();
+		break;
+
+	default:
+		pr_err("Unknown power/cooling event (action code %d)",
+			action_code);
+	}
+}
+
+/* Handle environmental and power warning (EPOW) interrupts. */
+static irqreturn_t ras_epow_interrupt(int irq, void *dev_id)
+{
+	int status;
+	int state;
+	int critical;
+
+	status = rtas_get_sensor(EPOW_SENSOR_TOKEN, EPOW_SENSOR_INDEX, &state);
+
+	if (state > 3)
+		critical = 1;		/* Time Critical */
+	else
+		critical = 0;
+
+	spin_lock(&ras_log_buf_lock);
+
+	status = rtas_call(ras_check_exception_token, 6, 1, NULL,
+			   RTAS_VECTOR_EXTERNAL_INTERRUPT,
+			   virq_to_hw(irq),
+			   RTAS_EPOW_WARNING,
+			   critical, __pa(&ras_log_buf),
+				rtas_get_error_log_max());
+
+	log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0);
+
+	rtas_parse_epow_errlog((struct rtas_error_log *)ras_log_buf);
+
+	spin_unlock(&ras_log_buf_lock);
+	return IRQ_HANDLED;
+}
+
+/*
+ * Handle hardware error interrupts.
+ *
+ * RTAS check-exception is called to collect data on the exception.  If
+ * the error is deemed recoverable, we log a warning and return.
+ * For nonrecoverable errors, an error is logged and we stop all processing
+ * as quickly as possible in order to prevent propagation of the failure.
+ */
+static irqreturn_t ras_error_interrupt(int irq, void *dev_id)
+{
+	struct rtas_error_log *rtas_elog;
+	int status;
+	int fatal;
+
+	spin_lock(&ras_log_buf_lock);
+
+	status = rtas_call(ras_check_exception_token, 6, 1, NULL,
+			   RTAS_VECTOR_EXTERNAL_INTERRUPT,
+			   virq_to_hw(irq),
+			   RTAS_INTERNAL_ERROR, 1 /* Time Critical */,
+			   __pa(&ras_log_buf),
+				rtas_get_error_log_max());
+
+	rtas_elog = (struct rtas_error_log *)ras_log_buf;
+
+	if ((status == 0) && (rtas_elog->severity >= RTAS_SEVERITY_ERROR_SYNC))
+		fatal = 1;
+	else
+		fatal = 0;
+
+	/* format and print the extended information */
+	log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, fatal);
+
+	if (fatal) {
+		pr_emerg("Fatal hardware error reported by firmware");
+		pr_emerg("Check RTAS error log for details");
+		pr_emerg("Immediate power off");
+		emergency_sync();
+		kernel_power_off();
+	} else {
+		pr_err("Recoverable hardware error reported by firmware");
+	}
+
+	spin_unlock(&ras_log_buf_lock);
+	return IRQ_HANDLED;
+}
+
+/*
+ * Some versions of FWNMI place the buffer inside the 4kB page starting at
+ * 0x7000. Other versions place it inside the rtas buffer. We check both.
+ */
+#define VALID_FWNMI_BUFFER(A) \
+	((((A) >= 0x7000) && ((A) < 0x7ff0)) || \
+	(((A) >= rtas.base) && ((A) < (rtas.base + rtas.size - 16))))
+
+/*
+ * Get the error information for errors coming through the
+ * FWNMI vectors.  The pt_regs' r3 will be updated to reflect
+ * the actual r3 if possible, and a ptr to the error log entry
+ * will be returned if found.
+ *
+ * If the RTAS error is not of the extended type, then we put it in a per
+ * cpu 64bit buffer. If it is the extended type we use global_mce_data_buf.
+ *
+ * The global_mce_data_buf does not have any locks or protection around it,
+ * if a second machine check comes in, or a system reset is done
+ * before we have logged the error, then we will get corruption in the
+ * error log.  This is preferable over holding off on calling
+ * ibm,nmi-interlock which would result in us checkstopping if a
+ * second machine check did come in.
+ */
+static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs)
+{
+	unsigned long *savep;
+	struct rtas_error_log *h, *errhdr = NULL;
+
+	if (!VALID_FWNMI_BUFFER(regs->gpr[3])) {
+		printk(KERN_ERR "FWNMI: corrupt r3 0x%016lx\n", regs->gpr[3]);
+		return NULL;
+	}
+
+	savep = __va(regs->gpr[3]);
+	regs->gpr[3] = savep[0];	/* restore original r3 */
+
+	/* If it isn't an extended log we can use the per cpu 64bit buffer */
+	h = (struct rtas_error_log *)&savep[1];
+	if (!h->extended) {
+		memcpy(&__get_cpu_var(mce_data_buf), h, sizeof(__u64));
+		errhdr = (struct rtas_error_log *)&__get_cpu_var(mce_data_buf);
+	} else {
+		int len;
+
+		len = max_t(int, 8+h->extended_log_length, RTAS_ERROR_LOG_MAX);
+		memset(global_mce_data_buf, 0, RTAS_ERROR_LOG_MAX);
+		memcpy(global_mce_data_buf, h, len);
+		errhdr = (struct rtas_error_log *)global_mce_data_buf;
+	}
+
+	return errhdr;
+}
+
+/* Call this when done with the data returned by FWNMI_get_errinfo.
+ * It will release the saved data area for other CPUs in the
+ * partition to receive FWNMI errors.
+ */
+static void fwnmi_release_errinfo(void)
+{
+	int ret = rtas_call(rtas_token("ibm,nmi-interlock"), 0, 1, NULL);
+	if (ret != 0)
+		printk(KERN_ERR "FWNMI: nmi-interlock failed: %d\n", ret);
+}
+
+int pSeries_system_reset_exception(struct pt_regs *regs)
+{
+	if (fwnmi_active) {
+		struct rtas_error_log *errhdr = fwnmi_get_errinfo(regs);
+		if (errhdr) {
+			/* XXX Should look at FWNMI information */
+		}
+		fwnmi_release_errinfo();
+	}
+	return 0; /* need to perform reset */
+}
+
+/*
+ * See if we can recover from a machine check exception.
+ * This is only called on power4 (or above) and only via
+ * the Firmware Non-Maskable Interrupts (fwnmi) handler
+ * which provides the error analysis for us.
+ *
+ * Return 1 if corrected (or delivered a signal).
+ * Return 0 if there is nothing we can do.
+ */
+static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err)
+{
+	int recovered = 0;
+
+	if (!(regs->msr & MSR_RI)) {
+		/* If MSR_RI isn't set, we cannot recover */
+		recovered = 0;
+
+	} else if (err->disposition == RTAS_DISP_FULLY_RECOVERED) {
+		/* Platform corrected itself */
+		recovered = 1;
+
+	} else if (err->disposition == RTAS_DISP_LIMITED_RECOVERY) {
+		/* Platform corrected itself but could be degraded */
+		printk(KERN_ERR "MCE: limited recovery, system may "
+		       "be degraded\n");
+		recovered = 1;
+
+	} else if (user_mode(regs) && !is_global_init(current) &&
+		   err->severity == RTAS_SEVERITY_ERROR_SYNC) {
+
+		/*
+		 * If we received a synchronous error when in userspace
+		 * kill the task. Firmware may report details of the fail
+		 * asynchronously, so we can't rely on the target and type
+		 * fields being valid here.
+		 */
+		printk(KERN_ERR "MCE: uncorrectable error, killing task "
+		       "%s:%d\n", current->comm, current->pid);
+
+		_exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
+		recovered = 1;
+	}
+
+	log_error((char *)err, ERR_TYPE_RTAS_LOG, 0);
+
+	return recovered;
+}
+
+/*
+ * Handle a machine check.
+ *
+ * Note that on Power 4 and beyond Firmware Non-Maskable Interrupts (fwnmi)
+ * should be present.  If so the handler which called us tells us if the
+ * error was recovered (never true if RI=0).
+ *
+ * On hardware prior to Power 4 these exceptions were asynchronous which
+ * means we can't tell exactly where it occurred and so we can't recover.
+ */
+int pSeries_machine_check_exception(struct pt_regs *regs)
+{
+	struct rtas_error_log *errp;
+
+	if (fwnmi_active) {
+		errp = fwnmi_get_errinfo(regs);
+		fwnmi_release_errinfo();
+		if (errp && recover_mce(regs, errp))
+			return 1;
+	}
+
+	return 0;
+}
@@ -0,0 +1,564 @@
+/*
+ * pSeries_reconfig.c - support for dynamic reconfiguration (including PCI
+ * Hotplug and Dynamic Logical Partitioning on RPA platforms).
+ *
+ * Copyright (C) 2005 Nathan Lynch
+ * Copyright (C) 2005 IBM Corporation
+ *
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License version
+ *	2 as published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kref.h>
+#include <linux/notifier.h>
+#include <linux/proc_fs.h>
+#include <linux/slab.h>
+
+#include <asm/prom.h>
+#include <asm/machdep.h>
+#include <asm/uaccess.h>
+#include <asm/pSeries_reconfig.h>
+#include <asm/mmu.h>
+
+
+
+/*
+ * Routines for "runtime" addition and removal of device tree nodes.
+ */
+#ifdef CONFIG_PROC_DEVICETREE
+/*
+ * Add a node to /proc/device-tree.
+ */
+static void add_node_proc_entries(struct device_node *np)
+{
+	struct proc_dir_entry *ent;
+
+	ent = proc_mkdir(strrchr(np->full_name, '/') + 1, np->parent->pde);
+	if (ent)
+		proc_device_tree_add_node(np, ent);
+}
+
+static void remove_node_proc_entries(struct device_node *np)
+{
+	struct property *pp = np->properties;
+	struct device_node *parent = np->parent;
+
+	while (pp) {
+		remove_proc_entry(pp->name, np->pde);
+		pp = pp->next;
+	}
+	if (np->pde)
+		remove_proc_entry(np->pde->name, parent->pde);
+}
+#else /* !CONFIG_PROC_DEVICETREE */
+static void add_node_proc_entries(struct device_node *np)
+{
+	return;
+}
+
+static void remove_node_proc_entries(struct device_node *np)
+{
+	return;
+}
+#endif /* CONFIG_PROC_DEVICETREE */
+
+/**
+ *	derive_parent - basically like dirname(1)
+ *	@path:  the full_name of a node to be added to the tree
+ *
+ *	Returns the node which should be the parent of the node
+ *	described by path.  E.g., for path = "/foo/bar", returns
+ *	the node with full_name = "/foo".
+ */
+static struct device_node *derive_parent(const char *path)
+{
+	struct device_node *parent = NULL;
+	char *parent_path = "/";
+	size_t parent_path_len = strrchr(path, '/') - path + 1;
+
+	/* reject if path is "/" */
+	if (!strcmp(path, "/"))
+		return ERR_PTR(-EINVAL);
+
+	if (strrchr(path, '/') != path) {
+		parent_path = kmalloc(parent_path_len, GFP_KERNEL);
+		if (!parent_path)
+			return ERR_PTR(-ENOMEM);
+		strlcpy(parent_path, path, parent_path_len);
+	}
+	parent = of_find_node_by_path(parent_path);
+	if (!parent)
+		return ERR_PTR(-EINVAL);
+	if (strcmp(parent_path, "/"))
+		kfree(parent_path);
+	return parent;
+}
+
+static BLOCKING_NOTIFIER_HEAD(pSeries_reconfig_chain);
+
+int pSeries_reconfig_notifier_register(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_register(&pSeries_reconfig_chain, nb);
+}
+
+void pSeries_reconfig_notifier_unregister(struct notifier_block *nb)
+{
+	blocking_notifier_chain_unregister(&pSeries_reconfig_chain, nb);
+}
+
+int pSeries_reconfig_notify(unsigned long action, void *p)
+{
+	int err = blocking_notifier_call_chain(&pSeries_reconfig_chain,
+						action, p);
+
+	return notifier_to_errno(err);
+}
+
+static int pSeries_reconfig_add_node(const char *path, struct property *proplist)
+{
+	struct device_node *np;
+	int err = -ENOMEM;
+
+	np = kzalloc(sizeof(*np), GFP_KERNEL);
+	if (!np)
+		goto out_err;
+
+	np->full_name = kstrdup(path, GFP_KERNEL);
+	if (!np->full_name)
+		goto out_err;
+
+	np->properties = proplist;
+	of_node_set_flag(np, OF_DYNAMIC);
+	kref_init(&np->kref);
+
+	np->parent = derive_parent(path);
+	if (IS_ERR(np->parent)) {
+		err = PTR_ERR(np->parent);
+		goto out_err;
+	}
+
+	err = pSeries_reconfig_notify(PSERIES_RECONFIG_ADD, np);
+	if (err) {
+		printk(KERN_ERR "Failed to add device node %s\n", path);
+		goto out_err;
+	}
+
+	of_attach_node(np);
+
+	add_node_proc_entries(np);
+
+	of_node_put(np->parent);
+
+	return 0;
+
+out_err:
+	if (np) {
+		of_node_put(np->parent);
+		kfree(np->full_name);
+		kfree(np);
+	}
+	return err;
+}
+
+static int pSeries_reconfig_remove_node(struct device_node *np)
+{
+	struct device_node *parent, *child;
+
+	parent = of_get_parent(np);
+	if (!parent)
+		return -EINVAL;
+
+	if ((child = of_get_next_child(np, NULL))) {
+		of_node_put(child);
+		of_node_put(parent);
+		return -EBUSY;
+	}
+
+	remove_node_proc_entries(np);
+
+	pSeries_reconfig_notify(PSERIES_RECONFIG_REMOVE, np);
+	of_detach_node(np);
+
+	of_node_put(parent);
+	of_node_put(np); /* Must decrement the refcount */
+	return 0;
+}
+
+/*
+ * /proc/powerpc/ofdt - yucky binary interface for adding and removing
+ * OF device nodes.  Should be deprecated as soon as we get an
+ * in-kernel wrapper for the RTAS ibm,configure-connector call.
+ */
+
+static void release_prop_list(const struct property *prop)
+{
+	struct property *next;
+	for (; prop; prop = next) {
+		next = prop->next;
+		kfree(prop->name);
+		kfree(prop->value);
+		kfree(prop);
+	}
+
+}
+
+/**
+ * parse_next_property - process the next property from raw input buffer
+ * @buf: input buffer, must be nul-terminated
+ * @end: end of the input buffer + 1, for validation
+ * @name: return value; set to property name in buf
+ * @length: return value; set to length of value
+ * @value: return value; set to the property value in buf
+ *
+ * Note that the caller must make copies of the name and value returned,
+ * this function does no allocation or copying of the data.  Return value
+ * is set to the next name in buf, or NULL on error.
+ */
+static char * parse_next_property(char *buf, char *end, char **name, int *length,
+				  unsigned char **value)
+{
+	char *tmp;
+
+	*name = buf;
+
+	tmp = strchr(buf, ' ');
+	if (!tmp) {
+		printk(KERN_ERR "property parse failed in %s at line %d\n",
+		       __func__, __LINE__);
+		return NULL;
+	}
+	*tmp = '\0';
+
+	if (++tmp >= end) {
+		printk(KERN_ERR "property parse failed in %s at line %d\n",
+		       __func__, __LINE__);
+		return NULL;
+	}
+
+	/* now we're on the length */
+	*length = -1;
+	*length = simple_strtoul(tmp, &tmp, 10);
+	if (*length == -1) {
+		printk(KERN_ERR "property parse failed in %s at line %d\n",
+		       __func__, __LINE__);
+		return NULL;
+	}
+	if (*tmp != ' ' || ++tmp >= end) {
+		printk(KERN_ERR "property parse failed in %s at line %d\n",
+		       __func__, __LINE__);
+		return NULL;
+	}
+
+	/* now we're on the value */
+	*value = tmp;
+	tmp += *length;
+	if (tmp > end) {
+		printk(KERN_ERR "property parse failed in %s at line %d\n",
+		       __func__, __LINE__);
+		return NULL;
+	}
+	else if (tmp < end && *tmp != ' ' && *tmp != '\0') {
+		printk(KERN_ERR "property parse failed in %s at line %d\n",
+		       __func__, __LINE__);
+		return NULL;
+	}
+	tmp++;
+
+	/* and now we should be on the next name, or the end */
+	return tmp;
+}
+
+static struct property *new_property(const char *name, const int length,
+				     const unsigned char *value, struct property *last)
+{
+	struct property *new = kzalloc(sizeof(*new), GFP_KERNEL);
+
+	if (!new)
+		return NULL;
+
+	if (!(new->name = kmalloc(strlen(name) + 1, GFP_KERNEL)))
+		goto cleanup;
+	if (!(new->value = kmalloc(length + 1, GFP_KERNEL)))
+		goto cleanup;
+
+	strcpy(new->name, name);
+	memcpy(new->value, value, length);
+	*(((char *)new->value) + length) = 0;
+	new->length = length;
+	new->next = last;
+	return new;
+
+cleanup:
+	kfree(new->name);
+	kfree(new->value);
+	kfree(new);
+	return NULL;
+}
+
+static int do_add_node(char *buf, size_t bufsize)
+{
+	char *path, *end, *name;
+	struct device_node *np;
+	struct property *prop = NULL;
+	unsigned char* value;
+	int length, rv = 0;
+
+	end = buf + bufsize;
+	path = buf;
+	buf = strchr(buf, ' ');
+	if (!buf)
+		return -EINVAL;
+	*buf = '\0';
+	buf++;
+
+	if ((np = of_find_node_by_path(path))) {
+		of_node_put(np);
+		return -EINVAL;
+	}
+
+	/* rv = build_prop_list(tmp, bufsize - (tmp - buf), &proplist); */
+	while (buf < end &&
+	       (buf = parse_next_property(buf, end, &name, &length, &value))) {
+		struct property *last = prop;
+
+		prop = new_property(name, length, value, last);
+		if (!prop) {
+			rv = -ENOMEM;
+			prop = last;
+			goto out;
+		}
+	}
+	if (!buf) {
+		rv = -EINVAL;
+		goto out;
+	}
+
+	rv = pSeries_reconfig_add_node(path, prop);
+
+out:
+	if (rv)
+		release_prop_list(prop);
+	return rv;
+}
+
+static int do_remove_node(char *buf)
+{
+	struct device_node *node;
+	int rv = -ENODEV;
+
+	if ((node = of_find_node_by_path(buf)))
+		rv = pSeries_reconfig_remove_node(node);
+
+	of_node_put(node);
+	return rv;
+}
+
+static char *parse_node(char *buf, size_t bufsize, struct device_node **npp)
+{
+	char *handle_str;
+	phandle handle;
+	*npp = NULL;
+
+	handle_str = buf;
+
+	buf = strchr(buf, ' ');
+	if (!buf)
+		return NULL;
+	*buf = '\0';
+	buf++;
+
+	handle = simple_strtoul(handle_str, NULL, 0);
+
+	*npp = of_find_node_by_phandle(handle);
+	return buf;
+}
+
+static int do_add_property(char *buf, size_t bufsize)
+{
+	struct property *prop = NULL;
+	struct device_node *np;
+	unsigned char *value;
+	char *name, *end;
+	int length;
+	end = buf + bufsize;
+	buf = parse_node(buf, bufsize, &np);
+
+	if (!np)
+		return -ENODEV;
+
+	if (parse_next_property(buf, end, &name, &length, &value) == NULL)
+		return -EINVAL;
+
+	prop = new_property(name, length, value, NULL);
+	if (!prop)
+		return -ENOMEM;
+
+	prom_add_property(np, prop);
+
+	return 0;
+}
+
+static int do_remove_property(char *buf, size_t bufsize)
+{
+	struct device_node *np;
+	char *tmp;
+	struct property *prop;
+	buf = parse_node(buf, bufsize, &np);
+
+	if (!np)
+		return -ENODEV;
+
+	tmp = strchr(buf,' ');
+	if (tmp)
+		*tmp = '\0';
+
+	if (strlen(buf) == 0)
+		return -EINVAL;
+
+	prop = of_find_property(np, buf, NULL);
+
+	return prom_remove_property(np, prop);
+}
+
+static int do_update_property(char *buf, size_t bufsize)
+{
+	struct device_node *np;
+	unsigned char *value;
+	char *name, *end, *next_prop;
+	int rc, length;
+	struct property *newprop, *oldprop;
+	buf = parse_node(buf, bufsize, &np);
+	end = buf + bufsize;
+
+	if (!np)
+		return -ENODEV;
+
+	next_prop = parse_next_property(buf, end, &name, &length, &value);
+	if (!next_prop)
+		return -EINVAL;
+
+	newprop = new_property(name, length, value, NULL);
+	if (!newprop)
+		return -ENOMEM;
+
+	if (!strcmp(name, "slb-size") || !strcmp(name, "ibm,slb-size"))
+		slb_set_size(*(int *)value);
+
+	oldprop = of_find_property(np, name,NULL);
+	if (!oldprop) {
+		if (strlen(name))
+			return prom_add_property(np, newprop);
+		return -ENODEV;
+	}
+
+	rc = prom_update_property(np, newprop, oldprop);
+	if (rc)
+		return rc;
+
+	/* For memory under the ibm,dynamic-reconfiguration-memory node
+	 * of the device tree, adding and removing memory is just an update
+	 * to the ibm,dynamic-memory property instead of adding/removing a
+	 * memory node in the device tree.  For these cases we still need to
+	 * involve the notifier chain.
+	 */
+	if (!strcmp(name, "ibm,dynamic-memory")) {
+		int action;
+
+		next_prop = parse_next_property(next_prop, end, &name,
+						&length, &value);
+		if (!next_prop)
+			return -EINVAL;
+
+		if (!strcmp(name, "add"))
+			action = PSERIES_DRCONF_MEM_ADD;
+		else
+			action = PSERIES_DRCONF_MEM_REMOVE;
+
+		rc = pSeries_reconfig_notify(action, value);
+		if (rc) {
+			prom_update_property(np, oldprop, newprop);
+			return rc;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * ofdt_write - perform operations on the Open Firmware device tree
+ *
+ * @file: not used
+ * @buf: command and arguments
+ * @count: size of the command buffer
+ * @off: not used
+ *
+ * Operations supported at this time are addition and removal of
+ * whole nodes along with their properties.  Operations on individual
+ * properties are not implemented (yet).
+ */
+static ssize_t ofdt_write(struct file *file, const char __user *buf, size_t count,
+			  loff_t *off)
+{
+	int rv = 0;
+	char *kbuf;
+	char *tmp;
+
+	if (!(kbuf = kmalloc(count + 1, GFP_KERNEL))) {
+		rv = -ENOMEM;
+		goto out;
+	}
+	if (copy_from_user(kbuf, buf, count)) {
+		rv = -EFAULT;
+		goto out;
+	}
+
+	kbuf[count] = '\0';
+
+	tmp = strchr(kbuf, ' ');
+	if (!tmp) {
+		rv = -EINVAL;
+		goto out;
+	}
+	*tmp = '\0';
+	tmp++;
+
+	if (!strcmp(kbuf, "add_node"))
+		rv = do_add_node(tmp, count - (tmp - kbuf));
+	else if (!strcmp(kbuf, "remove_node"))
+		rv = do_remove_node(tmp);
+	else if (!strcmp(kbuf, "add_property"))
+		rv = do_add_property(tmp, count - (tmp - kbuf));
+	else if (!strcmp(kbuf, "remove_property"))
+		rv = do_remove_property(tmp, count - (tmp - kbuf));
+	else if (!strcmp(kbuf, "update_property"))
+		rv = do_update_property(tmp, count - (tmp - kbuf));
+	else
+		rv = -EINVAL;
+out:
+	kfree(kbuf);
+	return rv ? rv : count;
+}
+
+static const struct file_operations ofdt_fops = {
+	.write = ofdt_write,
+	.llseek = noop_llseek,
+};
+
+/* create /proc/powerpc/ofdt write-only by root */
+static int proc_ppc64_create_ofdt(void)
+{
+	struct proc_dir_entry *ent;
+
+	if (!machine_is(pseries))
+		return 0;
+
+	ent = proc_create("powerpc/ofdt", S_IWUSR, NULL, &ofdt_fops);
+	if (ent)
+		ent->size = 0;
+
+	return 0;
+}
+__initcall(proc_ppc64_create_ofdt);
@@ -0,0 +1,214 @@
+/*
+ *  c 2001 PPC 64 Team, IBM Corp
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ *
+ * scan-log-data driver for PPC64  Todd Inglett <tinglett@vnet.ibm.com>
+ *
+ * When ppc64 hardware fails the service processor dumps internal state
+ * of the system.  After a reboot the operating system can access a dump
+ * of this data using this driver.  A dump exists if the device-tree
+ * /chosen/ibm,scan-log-data property exists.
+ *
+ * This driver exports /proc/powerpc/scan-log-dump which can be read.
+ * The driver supports only sequential reads.
+ *
+ * The driver looks at a write to the driver for the single word "reset".
+ * If given, the driver will reset the scanlog so the platform can free it.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+#include <asm/rtas.h>
+#include <asm/prom.h>
+
+#define MODULE_VERS "1.0"
+#define MODULE_NAME "scanlog"
+
+/* Status returns from ibm,scan-log-dump */
+#define SCANLOG_COMPLETE 0
+#define SCANLOG_HWERROR -1
+#define SCANLOG_CONTINUE 1
+
+
+static unsigned int ibm_scan_log_dump;			/* RTAS token */
+static struct proc_dir_entry *proc_ppc64_scan_log_dump;	/* The proc file */
+
+static ssize_t scanlog_read(struct file *file, char __user *buf,
+			    size_t count, loff_t *ppos)
+{
+        struct inode * inode = file->f_path.dentry->d_inode;
+	struct proc_dir_entry *dp;
+	unsigned int *data;
+	int status;
+	unsigned long len, off;
+	unsigned int wait_time;
+
+        dp = PDE(inode);
+ 	data = (unsigned int *)dp->data;
+
+	if (count > RTAS_DATA_BUF_SIZE)
+		count = RTAS_DATA_BUF_SIZE;
+
+	if (count < 1024) {
+		/* This is the min supported by this RTAS call.  Rather
+		 * than do all the buffering we insist the user code handle
+		 * larger reads.  As long as cp works... :)
+		 */
+		printk(KERN_ERR "scanlog: cannot perform a small read (%ld)\n", count);
+		return -EINVAL;
+	}
+
+	if (!access_ok(VERIFY_WRITE, buf, count))
+		return -EFAULT;
+
+	for (;;) {
+		wait_time = 500;	/* default wait if no data */
+		spin_lock(&rtas_data_buf_lock);
+		memcpy(rtas_data_buf, data, RTAS_DATA_BUF_SIZE);
+		status = rtas_call(ibm_scan_log_dump, 2, 1, NULL,
+				   (u32) __pa(rtas_data_buf), (u32) count);
+		memcpy(data, rtas_data_buf, RTAS_DATA_BUF_SIZE);
+		spin_unlock(&rtas_data_buf_lock);
+
+		pr_debug("scanlog: status=%d, data[0]=%x, data[1]=%x, " \
+			 "data[2]=%x\n", status, data[0], data[1], data[2]);
+		switch (status) {
+		    case SCANLOG_COMPLETE:
+			pr_debug("scanlog: hit eof\n");
+			return 0;
+		    case SCANLOG_HWERROR:
+			pr_debug("scanlog: hardware error reading data\n");
+			return -EIO;
+		    case SCANLOG_CONTINUE:
+			/* We may or may not have data yet */
+			len = data[1];
+			off = data[2];
+			if (len > 0) {
+				if (copy_to_user(buf, ((char *)data)+off, len))
+					return -EFAULT;
+				return len;
+			}
+			/* Break to sleep default time */
+			break;
+		    default:
+			/* Assume extended busy */
+			wait_time = rtas_busy_delay_time(status);
+			if (!wait_time) {
+				printk(KERN_ERR "scanlog: unknown error " \
+				       "from rtas: %d\n", status);
+				return -EIO;
+			}
+		}
+		/* Apparently no data yet.  Wait and try again. */
+		msleep_interruptible(wait_time);
+	}
+	/*NOTREACHED*/
+}
+
+static ssize_t scanlog_write(struct file * file, const char __user * buf,
+			     size_t count, loff_t *ppos)
+{
+	char stkbuf[20];
+	int status;
+
+	if (count > 19) count = 19;
+	if (copy_from_user (stkbuf, buf, count)) {
+		return -EFAULT;
+	}
+	stkbuf[count] = 0;
+
+	if (buf) {
+		if (strncmp(stkbuf, "reset", 5) == 0) {
+			pr_debug("scanlog: reset scanlog\n");
+			status = rtas_call(ibm_scan_log_dump, 2, 1, NULL, 0, 0);
+			pr_debug("scanlog: rtas returns %d\n", status);
+		}
+	}
+	return count;
+}
+
+static int scanlog_open(struct inode * inode, struct file * file)
+{
+	struct proc_dir_entry *dp = PDE(inode);
+	unsigned int *data = (unsigned int *)dp->data;
+
+	if (data[0] != 0) {
+		/* This imperfect test stops a second copy of the
+		 * data (or a reset while data is being copied)
+		 */
+		return -EBUSY;
+	}
+
+	data[0] = 0;	/* re-init so we restart the scan */
+
+	return 0;
+}
+
+static int scanlog_release(struct inode * inode, struct file * file)
+{
+	struct proc_dir_entry *dp = PDE(inode);
+	unsigned int *data = (unsigned int *)dp->data;
+
+	data[0] = 0;
+
+	return 0;
+}
+
+const struct file_operations scanlog_fops = {
+	.owner		= THIS_MODULE,
+	.read		= scanlog_read,
+	.write		= scanlog_write,
+	.open		= scanlog_open,
+	.release	= scanlog_release,
+	.llseek		= noop_llseek,
+};
+
+static int __init scanlog_init(void)
+{
+	struct proc_dir_entry *ent;
+	void *data;
+	int err = -ENOMEM;
+
+	ibm_scan_log_dump = rtas_token("ibm,scan-log-dump");
+	if (ibm_scan_log_dump == RTAS_UNKNOWN_SERVICE)
+		return -ENODEV;
+
+	/* Ideally we could allocate a buffer < 4G */
+	data = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
+	if (!data)
+		goto err;
+
+	ent = proc_create_data("powerpc/rtas/scan-log-dump", S_IRUSR, NULL,
+			       &scanlog_fops, data);
+	if (!ent)
+		goto err;
+
+	proc_ppc64_scan_log_dump = ent;
+
+	return 0;
+err:
+	kfree(data);
+	return err;
+}
+
+static void __exit scanlog_cleanup(void)
+{
+	if (proc_ppc64_scan_log_dump) {
+		kfree(proc_ppc64_scan_log_dump->data);
+		remove_proc_entry("scan-log-dump", proc_ppc64_scan_log_dump->parent);
+	}
+}
+
+module_init(scanlog_init);
+module_exit(scanlog_cleanup);
+MODULE_LICENSE("GPL");
@@ -0,0 +1,660 @@
+/*
+ *  64-bit pSeries and RS/6000 setup code.
+ *
+ *  Copyright (C) 1995  Linus Torvalds
+ *  Adapted from 'alpha' version by Gary Thomas
+ *  Modified by Cort Dougan (cort@cs.nmt.edu)
+ *  Modified by PPC64 Team, IBM Corp
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/*
+ * bootup setup stuff..
+ */
+
+#include <linux/cpu.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/stddef.h>
+#include <linux/unistd.h>
+#include <linux/user.h>
+#include <linux/tty.h>
+#include <linux/major.h>
+#include <linux/interrupt.h>
+#include <linux/reboot.h>
+#include <linux/init.h>
+#include <linux/ioport.h>
+#include <linux/console.h>
+#include <linux/pci.h>
+#include <linux/utsname.h>
+#include <linux/adb.h>
+#include <linux/export.h>
+#include <linux/delay.h>
+#include <linux/irq.h>
+#include <linux/seq_file.h>
+#include <linux/root_dev.h>
+#include <linux/cpuidle.h>
+
+#include <asm/mmu.h>
+#include <asm/processor.h>
+#include <asm/io.h>
+#include <asm/pgtable.h>
+#include <asm/prom.h>
+#include <asm/rtas.h>
+#include <asm/pci-bridge.h>
+#include <asm/iommu.h>
+#include <asm/dma.h>
+#include <asm/machdep.h>
+#include <asm/irq.h>
+#include <asm/time.h>
+#include <asm/nvram.h>
+#include <asm/pmc.h>
+#include <asm/mpic.h>
+#include <asm/xics.h>
+#include <asm/ppc-pci.h>
+#include <asm/i8259.h>
+#include <asm/udbg.h>
+#include <asm/smp.h>
+#include <asm/firmware.h>
+#include <asm/eeh.h>
+#include <asm/pSeries_reconfig.h>
+
+#include "plpar_wrappers.h"
+#include "pseries.h"
+
+int CMO_PrPSP = -1;
+int CMO_SecPSP = -1;
+unsigned long CMO_PageSize = (ASM_CONST(1) << IOMMU_PAGE_SHIFT);
+EXPORT_SYMBOL(CMO_PageSize);
+
+int fwnmi_active;  /* TRUE if an FWNMI handler is present */
+
+static struct device_node *pSeries_mpic_node;
+
+static void pSeries_show_cpuinfo(struct seq_file *m)
+{
+	struct device_node *root;
+	const char *model = "";
+
+	root = of_find_node_by_path("/");
+	if (root)
+		model = of_get_property(root, "model", NULL);
+	seq_printf(m, "machine\t\t: CHRP %s\n", model);
+	of_node_put(root);
+}
+
+/* Initialize firmware assisted non-maskable interrupts if
+ * the firmware supports this feature.
+ */
+static void __init fwnmi_init(void)
+{
+	unsigned long system_reset_addr, machine_check_addr;
+
+	int ibm_nmi_register = rtas_token("ibm,nmi-register");
+	if (ibm_nmi_register == RTAS_UNKNOWN_SERVICE)
+		return;
+
+	/* If the kernel's not linked at zero we point the firmware at low
+	 * addresses anyway, and use a trampoline to get to the real code. */
+	system_reset_addr  = __pa(system_reset_fwnmi) - PHYSICAL_START;
+	machine_check_addr = __pa(machine_check_fwnmi) - PHYSICAL_START;
+
+	if (0 == rtas_call(ibm_nmi_register, 2, 1, NULL, system_reset_addr,
+				machine_check_addr))
+		fwnmi_active = 1;
+}
+
+static void pseries_8259_cascade(unsigned int irq, struct irq_desc *desc)
+{
+	struct irq_chip *chip = irq_desc_get_chip(desc);
+	unsigned int cascade_irq = i8259_irq();
+
+	if (cascade_irq != NO_IRQ)
+		generic_handle_irq(cascade_irq);
+
+	chip->irq_eoi(&desc->irq_data);
+}
+
+static void __init pseries_setup_i8259_cascade(void)
+{
+	struct device_node *np, *old, *found = NULL;
+	unsigned int cascade;
+	const u32 *addrp;
+	unsigned long intack = 0;
+	int naddr;
+
+	for_each_node_by_type(np, "interrupt-controller") {
+		if (of_device_is_compatible(np, "chrp,iic")) {
+			found = np;
+			break;
+		}
+	}
+
+	if (found == NULL) {
+		printk(KERN_DEBUG "pic: no ISA interrupt controller\n");
+		return;
+	}
+
+	cascade = irq_of_parse_and_map(found, 0);
+	if (cascade == NO_IRQ) {
+		printk(KERN_ERR "pic: failed to map cascade interrupt");
+		return;
+	}
+	pr_debug("pic: cascade mapped to irq %d\n", cascade);
+
+	for (old = of_node_get(found); old != NULL ; old = np) {
+		np = of_get_parent(old);
+		of_node_put(old);
+		if (np == NULL)
+			break;
+		if (strcmp(np->name, "pci") != 0)
+			continue;
+		addrp = of_get_property(np, "8259-interrupt-acknowledge", NULL);
+		if (addrp == NULL)
+			continue;
+		naddr = of_n_addr_cells(np);
+		intack = addrp[naddr-1];
+		if (naddr > 1)
+			intack |= ((unsigned long)addrp[naddr-2]) << 32;
+	}
+	if (intack)
+		printk(KERN_DEBUG "pic: PCI 8259 intack at 0x%016lx\n", intack);
+	i8259_init(found, intack);
+	of_node_put(found);
+	irq_set_chained_handler(cascade, pseries_8259_cascade);
+}
+
+static void __init pseries_mpic_init_IRQ(void)
+{
+	struct device_node *np;
+	const unsigned int *opprop;
+	unsigned long openpic_addr = 0;
+	int naddr, n, i, opplen;
+	struct mpic *mpic;
+
+	np = of_find_node_by_path("/");
+	naddr = of_n_addr_cells(np);
+	opprop = of_get_property(np, "platform-open-pic", &opplen);
+	if (opprop != 0) {
+		openpic_addr = of_read_number(opprop, naddr);
+		printk(KERN_DEBUG "OpenPIC addr: %lx\n", openpic_addr);
+	}
+	of_node_put(np);
+
+	BUG_ON(openpic_addr == 0);
+
+	/* Setup the openpic driver */
+	mpic = mpic_alloc(pSeries_mpic_node, openpic_addr,
+			MPIC_NO_RESET, 16, 0, " MPIC     ");
+	BUG_ON(mpic == NULL);
+
+	/* Add ISUs */
+	opplen /= sizeof(u32);
+	for (n = 0, i = naddr; i < opplen; i += naddr, n++) {
+		unsigned long isuaddr = of_read_number(opprop + i, naddr);
+		mpic_assign_isu(mpic, n, isuaddr);
+	}
+
+	/* Setup top-level get_irq */
+	ppc_md.get_irq = mpic_get_irq;
+
+	/* All ISUs are setup, complete initialization */
+	mpic_init(mpic);
+
+	/* Look for cascade */
+	pseries_setup_i8259_cascade();
+}
+
+static void __init pseries_xics_init_IRQ(void)
+{
+	xics_init();
+	pseries_setup_i8259_cascade();
+}
+
+static void pseries_lpar_enable_pmcs(void)
+{
+	unsigned long set, reset;
+
+	set = 1UL << 63;
+	reset = 0;
+	plpar_hcall_norets(H_PERFMON, set, reset);
+}
+
+static void __init pseries_discover_pic(void)
+{
+	struct device_node *np;
+	const char *typep;
+
+	for (np = NULL; (np = of_find_node_by_name(np,
+						   "interrupt-controller"));) {
+		typep = of_get_property(np, "compatible", NULL);
+		if (strstr(typep, "open-pic")) {
+			pSeries_mpic_node = of_node_get(np);
+			ppc_md.init_IRQ       = pseries_mpic_init_IRQ;
+			setup_kexec_cpu_down_mpic();
+			smp_init_pseries_mpic();
+			return;
+		} else if (strstr(typep, "ppc-xicp")) {
+			ppc_md.init_IRQ       = pseries_xics_init_IRQ;
+			setup_kexec_cpu_down_xics();
+			smp_init_pseries_xics();
+			return;
+		}
+	}
+	printk(KERN_ERR "pSeries_discover_pic: failed to recognize"
+	       " interrupt-controller\n");
+}
+
+static int pci_dn_reconfig_notifier(struct notifier_block *nb, unsigned long action, void *node)
+{
+	struct device_node *np = node;
+	struct pci_dn *pci = NULL;
+	int err = NOTIFY_OK;
+
+	switch (action) {
+	case PSERIES_RECONFIG_ADD:
+		pci = np->parent->data;
+		if (pci) {
+			update_dn_pci_info(np, pci->phb);
+
+			/* Create EEH device for the OF node */
+			eeh_dev_init(np, pci->phb);
+		}
+		break;
+	default:
+		err = NOTIFY_DONE;
+		break;
+	}
+	return err;
+}
+
+static struct notifier_block pci_dn_reconfig_nb = {
+	.notifier_call = pci_dn_reconfig_notifier,
+};
+
+struct kmem_cache *dtl_cache;
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+/*
+ * Allocate space for the dispatch trace log for all possible cpus
+ * and register the buffers with the hypervisor.  This is used for
+ * computing time stolen by the hypervisor.
+ */
+static int alloc_dispatch_logs(void)
+{
+	int cpu, ret;
+	struct paca_struct *pp;
+	struct dtl_entry *dtl;
+
+	if (!firmware_has_feature(FW_FEATURE_SPLPAR))
+		return 0;
+
+	if (!dtl_cache)
+		return 0;
+
+	for_each_possible_cpu(cpu) {
+		pp = &paca[cpu];
+		dtl = kmem_cache_alloc(dtl_cache, GFP_KERNEL);
+		if (!dtl) {
+			pr_warn("Failed to allocate dispatch trace log for cpu %d\n",
+				cpu);
+			pr_warn("Stolen time statistics will be unreliable\n");
+			break;
+		}
+
+		pp->dtl_ridx = 0;
+		pp->dispatch_log = dtl;
+		pp->dispatch_log_end = dtl + N_DISPATCH_LOG;
+		pp->dtl_curr = dtl;
+	}
+
+	/* Register the DTL for the current (boot) cpu */
+	dtl = get_paca()->dispatch_log;
+	get_paca()->dtl_ridx = 0;
+	get_paca()->dtl_curr = dtl;
+	get_paca()->lppaca_ptr->dtl_idx = 0;
+
+	/* hypervisor reads buffer length from this field */
+	dtl->enqueue_to_dispatch_time = DISPATCH_LOG_BYTES;
+	ret = register_dtl(hard_smp_processor_id(), __pa(dtl));
+	if (ret)
+		pr_err("WARNING: DTL registration of cpu %d (hw %d) failed "
+		       "with %d\n", smp_processor_id(),
+		       hard_smp_processor_id(), ret);
+	get_paca()->lppaca_ptr->dtl_enable_mask = 2;
+
+	return 0;
+}
+#else /* !CONFIG_VIRT_CPU_ACCOUNTING */
+static inline int alloc_dispatch_logs(void)
+{
+	return 0;
+}
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
+
+static int alloc_dispatch_log_kmem_cache(void)
+{
+	dtl_cache = kmem_cache_create("dtl", DISPATCH_LOG_BYTES,
+						DISPATCH_LOG_BYTES, 0, NULL);
+	if (!dtl_cache) {
+		pr_warn("Failed to create dispatch trace log buffer cache\n");
+		pr_warn("Stolen time statistics will be unreliable\n");
+		return 0;
+	}
+
+	return alloc_dispatch_logs();
+}
+early_initcall(alloc_dispatch_log_kmem_cache);
+
+static void pSeries_idle(void)
+{
+	/* This would call on the cpuidle framework, and the back-end pseries
+	 * driver to  go to idle states
+	 */
+	if (cpuidle_idle_call()) {
+		/* On error, execute default handler
+		 * to go into low thread priority and possibly
+		 * low power mode.
+		 */
+		HMT_low();
+		HMT_very_low();
+	}
+}
+
+static void __init pSeries_setup_arch(void)
+{
+	panic_timeout = 10;
+
+	/* Discover PIC type and setup ppc_md accordingly */
+	pseries_discover_pic();
+
+	/* openpic global configuration register (64-bit format). */
+	/* openpic Interrupt Source Unit pointer (64-bit format). */
+	/* python0 facility area (mmio) (64-bit format) REAL address. */
+
+	/* init to some ~sane value until calibrate_delay() runs */
+	loops_per_jiffy = 50000000;
+
+	fwnmi_init();
+
+	/* By default, only probe PCI (can be overriden by rtas_pci) */
+	pci_add_flags(PCI_PROBE_ONLY);
+
+	/* Find and initialize PCI host bridges */
+	init_pci_config_tokens();
+	eeh_pseries_init();
+	find_and_init_phbs();
+	pSeries_reconfig_notifier_register(&pci_dn_reconfig_nb);
+	eeh_init();
+
+	pSeries_nvram_init();
+
+	if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
+		vpa_init(boot_cpuid);
+		ppc_md.power_save = pSeries_idle;
+	}
+
+	if (firmware_has_feature(FW_FEATURE_LPAR))
+		ppc_md.enable_pmcs = pseries_lpar_enable_pmcs;
+	else
+		ppc_md.enable_pmcs = power4_enable_pmcs;
+}
+
+static int __init pSeries_init_panel(void)
+{
+	/* Manually leave the kernel version on the panel. */
+	ppc_md.progress("Linux ppc64\n", 0);
+	ppc_md.progress(init_utsname()->version, 0);
+
+	return 0;
+}
+machine_arch_initcall(pseries, pSeries_init_panel);
+
+static int pseries_set_dabr(unsigned long dabr)
+{
+	return plpar_hcall_norets(H_SET_DABR, dabr);
+}
+
+static int pseries_set_xdabr(unsigned long dabr)
+{
+	/* We want to catch accesses from kernel and userspace */
+	return plpar_hcall_norets(H_SET_XDABR, dabr,
+			H_DABRX_KERNEL | H_DABRX_USER);
+}
+
+#define CMO_CHARACTERISTICS_TOKEN 44
+#define CMO_MAXLENGTH 1026
+
+void pSeries_coalesce_init(void)
+{
+	struct hvcall_mpp_x_data mpp_x_data;
+
+	if (firmware_has_feature(FW_FEATURE_CMO) && !h_get_mpp_x(&mpp_x_data))
+		powerpc_firmware_features |= FW_FEATURE_XCMO;
+	else
+		powerpc_firmware_features &= ~FW_FEATURE_XCMO;
+}
+
+/**
+ * fw_cmo_feature_init - FW_FEATURE_CMO is not stored in ibm,hypertas-functions,
+ * handle that here. (Stolen from parse_system_parameter_string)
+ */
+void pSeries_cmo_feature_init(void)
+{
+	char *ptr, *key, *value, *end;
+	int call_status;
+	int page_order = IOMMU_PAGE_SHIFT;
+
+	pr_debug(" -> fw_cmo_feature_init()\n");
+	spin_lock(&rtas_data_buf_lock);
+	memset(rtas_data_buf, 0, RTAS_DATA_BUF_SIZE);
+	call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
+				NULL,
+				CMO_CHARACTERISTICS_TOKEN,
+				__pa(rtas_data_buf),
+				RTAS_DATA_BUF_SIZE);
+
+	if (call_status != 0) {
+		spin_unlock(&rtas_data_buf_lock);
+		pr_debug("CMO not available\n");
+		pr_debug(" <- fw_cmo_feature_init()\n");
+		return;
+	}
+
+	end = rtas_data_buf + CMO_MAXLENGTH - 2;
+	ptr = rtas_data_buf + 2;	/* step over strlen value */
+	key = value = ptr;
+
+	while (*ptr && (ptr <= end)) {
+		/* Separate the key and value by replacing '=' with '\0' and
+		 * point the value at the string after the '='
+		 */
+		if (ptr[0] == '=') {
+			ptr[0] = '\0';
+			value = ptr + 1;
+		} else if (ptr[0] == '\0' || ptr[0] == ',') {
+			/* Terminate the string containing the key/value pair */
+			ptr[0] = '\0';
+
+			if (key == value) {
+				pr_debug("Malformed key/value pair\n");
+				/* Never found a '=', end processing */
+				break;
+			}
+
+			if (0 == strcmp(key, "CMOPageSize"))
+				page_order = simple_strtol(value, NULL, 10);
+			else if (0 == strcmp(key, "PrPSP"))
+				CMO_PrPSP = simple_strtol(value, NULL, 10);
+			else if (0 == strcmp(key, "SecPSP"))
+				CMO_SecPSP = simple_strtol(value, NULL, 10);
+			value = key = ptr + 1;
+		}
+		ptr++;
+	}
+
+	/* Page size is returned as the power of 2 of the page size,
+	 * convert to the page size in bytes before returning
+	 */
+	CMO_PageSize = 1 << page_order;
+	pr_debug("CMO_PageSize = %lu\n", CMO_PageSize);
+
+	if (CMO_PrPSP != -1 || CMO_SecPSP != -1) {
+		pr_info("CMO enabled\n");
+		pr_debug("CMO enabled, PrPSP=%d, SecPSP=%d\n", CMO_PrPSP,
+		         CMO_SecPSP);
+		powerpc_firmware_features |= FW_FEATURE_CMO;
+		pSeries_coalesce_init();
+	} else
+		pr_debug("CMO not enabled, PrPSP=%d, SecPSP=%d\n", CMO_PrPSP,
+		         CMO_SecPSP);
+	spin_unlock(&rtas_data_buf_lock);
+	pr_debug(" <- fw_cmo_feature_init()\n");
+}
+
+/*
+ * Early initialization.  Relocation is on but do not reference unbolted pages
+ */
+static void __init pSeries_init_early(void)
+{
+	pr_debug(" -> pSeries_init_early()\n");
+
+#ifdef CONFIG_HVC_CONSOLE
+	if (firmware_has_feature(FW_FEATURE_LPAR))
+		hvc_vio_init_early();
+#endif
+	if (firmware_has_feature(FW_FEATURE_DABR))
+		ppc_md.set_dabr = pseries_set_dabr;
+	else if (firmware_has_feature(FW_FEATURE_XDABR))
+		ppc_md.set_dabr = pseries_set_xdabr;
+
+	pSeries_cmo_feature_init();
+	iommu_init_early_pSeries();
+
+	pr_debug(" <- pSeries_init_early()\n");
+}
+
+/*
+ * Called very early, MMU is off, device-tree isn't unflattened
+ */
+
+static int __init pSeries_probe_hypertas(unsigned long node,
+					 const char *uname, int depth,
+					 void *data)
+{
+	const char *hypertas;
+	unsigned long len;
+
+	if (depth != 1 ||
+	    (strcmp(uname, "rtas") != 0 && strcmp(uname, "rtas@0") != 0))
+		return 0;
+
+	hypertas = of_get_flat_dt_prop(node, "ibm,hypertas-functions", &len);
+	if (!hypertas)
+		return 1;
+
+	powerpc_firmware_features |= FW_FEATURE_LPAR;
+	fw_feature_init(hypertas, len);
+
+	return 1;
+}
+
+static int __init pSeries_probe(void)
+{
+	unsigned long root = of_get_flat_dt_root();
+ 	char *dtype = of_get_flat_dt_prop(root, "device_type", NULL);
+
+ 	if (dtype == NULL)
+ 		return 0;
+ 	if (strcmp(dtype, "chrp"))
+		return 0;
+
+	/* Cell blades firmware claims to be chrp while it's not. Until this
+	 * is fixed, we need to avoid those here.
+	 */
+	if (of_flat_dt_is_compatible(root, "IBM,CPBW-1.0") ||
+	    of_flat_dt_is_compatible(root, "IBM,CBEA"))
+		return 0;
+
+	pr_debug("pSeries detected, looking for LPAR capability...\n");
+
+	/* Now try to figure out if we are running on LPAR */
+	of_scan_flat_dt(pSeries_probe_hypertas, NULL);
+
+	if (firmware_has_feature(FW_FEATURE_LPAR))
+		hpte_init_lpar();
+	else
+		hpte_init_native();
+
+	pr_debug("Machine is%s LPAR !\n",
+	         (powerpc_firmware_features & FW_FEATURE_LPAR) ? "" : " not");
+
+	return 1;
+}
+
+static int pSeries_pci_probe_mode(struct pci_bus *bus)
+{
+	if (firmware_has_feature(FW_FEATURE_LPAR))
+		return PCI_PROBE_DEVTREE;
+	return PCI_PROBE_NORMAL;
+}
+
+/**
+ * pSeries_power_off - tell firmware about how to power off the system.
+ *
+ * This function calls either the power-off rtas token in normal cases
+ * or the ibm,power-off-ups token (if present & requested) in case of
+ * a power failure. If power-off token is used, power on will only be
+ * possible with power button press. If ibm,power-off-ups token is used
+ * it will allow auto poweron after power is restored.
+ */
+static void pSeries_power_off(void)
+{
+	int rc;
+	int rtas_poweroff_ups_token = rtas_token("ibm,power-off-ups");
+
+	if (rtas_flash_term_hook)
+		rtas_flash_term_hook(SYS_POWER_OFF);
+
+	if (rtas_poweron_auto == 0 ||
+		rtas_poweroff_ups_token == RTAS_UNKNOWN_SERVICE) {
+		rc = rtas_call(rtas_token("power-off"), 2, 1, NULL, -1, -1);
+		printk(KERN_INFO "RTAS power-off returned %d\n", rc);
+	} else {
+		rc = rtas_call(rtas_poweroff_ups_token, 0, 1, NULL);
+		printk(KERN_INFO "RTAS ibm,power-off-ups returned %d\n", rc);
+	}
+	for (;;);
+}
+
+#ifndef CONFIG_PCI
+void pSeries_final_fixup(void) { }
+#endif
+
+define_machine(pseries) {
+	.name			= "pSeries",
+	.probe			= pSeries_probe,
+	.setup_arch		= pSeries_setup_arch,
+	.init_early		= pSeries_init_early,
+	.show_cpuinfo		= pSeries_show_cpuinfo,
+	.log_error		= pSeries_log_error,
+	.pcibios_fixup		= pSeries_final_fixup,
+	.pci_probe_mode		= pSeries_pci_probe_mode,
+	.restart		= rtas_restart,
+	.power_off		= pSeries_power_off,
+	.halt			= rtas_halt,
+	.panic			= rtas_os_term,
+	.get_boot_time		= rtas_get_boot_time,
+	.get_rtc_time		= rtas_get_rtc_time,
+	.set_rtc_time		= rtas_set_rtc_time,
+	.calibrate_decr		= generic_calibrate_decr,
+	.progress		= rtas_progress,
+	.system_reset_exception = pSeries_system_reset_exception,
+	.machine_check_exception = pSeries_machine_check_exception,
+};
@@ -0,0 +1,258 @@
+/*
+ * SMP support for pSeries machines.
+ *
+ * Dave Engebretsen, Peter Bergner, and
+ * Mike Corrigan {engebret|bergner|mikec}@us.ibm.com
+ *
+ * Plus various changes from other IBM teams...
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/cache.h>
+#include <linux/err.h>
+#include <linux/device.h>
+#include <linux/cpu.h>
+
+#include <asm/ptrace.h>
+#include <linux/atomic.h>
+#include <asm/irq.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/smp.h>
+#include <asm/paca.h>
+#include <asm/machdep.h>
+#include <asm/cputable.h>
+#include <asm/firmware.h>
+#include <asm/rtas.h>
+#include <asm/pSeries_reconfig.h>
+#include <asm/mpic.h>
+#include <asm/vdso_datapage.h>
+#include <asm/cputhreads.h>
+#include <asm/xics.h>
+
+#include "plpar_wrappers.h"
+#include "pseries.h"
+#include "offline_states.h"
+
+
+/*
+ * The Primary thread of each non-boot processor was started from the OF client
+ * interface by prom_hold_cpus and is spinning on secondary_hold_spinloop.
+ */
+static cpumask_var_t of_spin_mask;
+
+/* Query where a cpu is now.  Return codes #defined in plpar_wrappers.h */
+int smp_query_cpu_stopped(unsigned int pcpu)
+{
+	int cpu_status, status;
+	int qcss_tok = rtas_token("query-cpu-stopped-state");
+
+	if (qcss_tok == RTAS_UNKNOWN_SERVICE) {
+		printk_once(KERN_INFO
+			"Firmware doesn't support query-cpu-stopped-state\n");
+		return QCSS_HARDWARE_ERROR;
+	}
+
+	status = rtas_call(qcss_tok, 1, 2, &cpu_status, pcpu);
+	if (status != 0) {
+		printk(KERN_ERR
+		       "RTAS query-cpu-stopped-state failed: %i\n", status);
+		return status;
+	}
+
+	return cpu_status;
+}
+
+/**
+ * smp_startup_cpu() - start the given cpu
+ *
+ * At boot time, there is nothing to do for primary threads which were
+ * started from Open Firmware.  For anything else, call RTAS with the
+ * appropriate start location.
+ *
+ * Returns:
+ *	0	- failure
+ *	1	- success
+ */
+static inline int __devinit smp_startup_cpu(unsigned int lcpu)
+{
+	int status;
+	unsigned long start_here = __pa((u32)*((unsigned long *)
+					       generic_secondary_smp_init));
+	unsigned int pcpu;
+	int start_cpu;
+
+	if (cpumask_test_cpu(lcpu, of_spin_mask))
+		/* Already started by OF and sitting in spin loop */
+		return 1;
+
+	pcpu = get_hard_smp_processor_id(lcpu);
+
+	/* Check to see if the CPU out of FW already for kexec */
+	if (smp_query_cpu_stopped(pcpu) == QCSS_NOT_STOPPED){
+		cpumask_set_cpu(lcpu, of_spin_mask);
+		return 1;
+	}
+
+	/* Fixup atomic count: it exited inside IRQ handler. */
+	task_thread_info(paca[lcpu].__current)->preempt_count	= 0;
+#ifdef CONFIG_HOTPLUG_CPU
+	if (get_cpu_current_state(lcpu) == CPU_STATE_INACTIVE)
+		goto out;
+#endif
+	/* 
+	 * If the RTAS start-cpu token does not exist then presume the
+	 * cpu is already spinning.
+	 */
+	start_cpu = rtas_token("start-cpu");
+	if (start_cpu == RTAS_UNKNOWN_SERVICE)
+		return 1;
+
+	status = rtas_call(start_cpu, 3, 1, NULL, pcpu, start_here, pcpu);
+	if (status != 0) {
+		printk(KERN_ERR "start-cpu failed: %i\n", status);
+		return 0;
+	}
+
+#ifdef CONFIG_HOTPLUG_CPU
+out:
+#endif
+	return 1;
+}
+
+static void __devinit smp_xics_setup_cpu(int cpu)
+{
+	if (cpu != boot_cpuid)
+		xics_setup_cpu();
+
+	if (firmware_has_feature(FW_FEATURE_SPLPAR))
+		vpa_init(cpu);
+
+	cpumask_clear_cpu(cpu, of_spin_mask);
+#ifdef CONFIG_HOTPLUG_CPU
+	set_cpu_current_state(cpu, CPU_STATE_ONLINE);
+	set_default_offline_state(cpu);
+#endif
+	pseries_notify_cpuidle_add_cpu(cpu);
+}
+
+static int __devinit smp_pSeries_kick_cpu(int nr)
+{
+	BUG_ON(nr < 0 || nr >= NR_CPUS);
+
+	if (!smp_startup_cpu(nr))
+		return -ENOENT;
+
+	/*
+	 * The processor is currently spinning, waiting for the
+	 * cpu_start field to become non-zero After we set cpu_start,
+	 * the processor will continue on to secondary_start
+	 */
+	paca[nr].cpu_start = 1;
+#ifdef CONFIG_HOTPLUG_CPU
+	set_preferred_offline_state(nr, CPU_STATE_ONLINE);
+
+	if (get_cpu_current_state(nr) == CPU_STATE_INACTIVE) {
+		long rc;
+		unsigned long hcpuid;
+
+		hcpuid = get_hard_smp_processor_id(nr);
+		rc = plpar_hcall_norets(H_PROD, hcpuid);
+		if (rc != H_SUCCESS)
+			printk(KERN_ERR "Error: Prod to wake up processor %d "
+						"Ret= %ld\n", nr, rc);
+	}
+#endif
+
+	return 0;
+}
+
+static int smp_pSeries_cpu_bootable(unsigned int nr)
+{
+	/* Special case - we inhibit secondary thread startup
+	 * during boot if the user requests it.
+	 */
+	if (system_state < SYSTEM_RUNNING && cpu_has_feature(CPU_FTR_SMT)) {
+		if (!smt_enabled_at_boot && cpu_thread_in_core(nr) != 0)
+			return 0;
+		if (smt_enabled_at_boot
+		    && cpu_thread_in_core(nr) >= smt_enabled_at_boot)
+			return 0;
+	}
+
+	return 1;
+}
+
+static struct smp_ops_t pSeries_mpic_smp_ops = {
+	.message_pass	= smp_mpic_message_pass,
+	.probe		= smp_mpic_probe,
+	.kick_cpu	= smp_pSeries_kick_cpu,
+	.setup_cpu	= smp_mpic_setup_cpu,
+};
+
+static struct smp_ops_t pSeries_xics_smp_ops = {
+	.message_pass	= NULL,	/* Use smp_muxed_ipi_message_pass */
+	.cause_ipi	= NULL,	/* Filled at runtime by xics_smp_probe() */
+	.probe		= xics_smp_probe,
+	.kick_cpu	= smp_pSeries_kick_cpu,
+	.setup_cpu	= smp_xics_setup_cpu,
+	.cpu_bootable	= smp_pSeries_cpu_bootable,
+};
+
+/* This is called very early */
+static void __init smp_init_pseries(void)
+{
+	int i;
+
+	pr_debug(" -> smp_init_pSeries()\n");
+
+	alloc_bootmem_cpumask_var(&of_spin_mask);
+
+	/* Mark threads which are still spinning in hold loops. */
+	if (cpu_has_feature(CPU_FTR_SMT)) {
+		for_each_present_cpu(i) { 
+			if (cpu_thread_in_core(i) == 0)
+				cpumask_set_cpu(i, of_spin_mask);
+		}
+	} else {
+		cpumask_copy(of_spin_mask, cpu_present_mask);
+	}
+
+	cpumask_clear_cpu(boot_cpuid, of_spin_mask);
+
+	/* Non-lpar has additional take/give timebase */
+	if (rtas_token("freeze-time-base") != RTAS_UNKNOWN_SERVICE) {
+		smp_ops->give_timebase = rtas_give_timebase;
+		smp_ops->take_timebase = rtas_take_timebase;
+	}
+
+	pr_debug(" <- smp_init_pSeries()\n");
+}
+
+void __init smp_init_pseries_mpic(void)
+{
+	smp_ops = &pSeries_mpic_smp_ops;
+
+	smp_init_pseries();
+}
+
+void __init smp_init_pseries_xics(void)
+{
+	smp_ops = &pSeries_xics_smp_ops;
+
+	smp_init_pseries();
+}
@@ -0,0 +1,220 @@
+/*
+  * Copyright (C) 2010 Brian King IBM Corporation
+  *
+  * This program is free software; you can redistribute it and/or modify
+  * it under the terms of the GNU General Public License as published by
+  * the Free Software Foundation; either version 2 of the License, or
+  * (at your option) any later version.
+  *
+  * This program is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  * GNU General Public License for more details.
+  *
+  * You should have received a copy of the GNU General Public License
+  * along with this program; if not, write to the Free Software
+  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+  */
+
+#include <linux/delay.h>
+#include <linux/suspend.h>
+#include <linux/stat.h>
+#include <asm/firmware.h>
+#include <asm/hvcall.h>
+#include <asm/machdep.h>
+#include <asm/mmu.h>
+#include <asm/rtas.h>
+#include <asm/topology.h>
+
+static u64 stream_id;
+static struct device suspend_dev;
+static DECLARE_COMPLETION(suspend_work);
+static struct rtas_suspend_me_data suspend_data;
+static atomic_t suspending;
+
+/**
+ * pseries_suspend_begin - First phase of hibernation
+ *
+ * Check to ensure we are in a valid state to hibernate
+ *
+ * Return value:
+ * 	0 on success / other on failure
+ **/
+static int pseries_suspend_begin(suspend_state_t state)
+{
+	long vasi_state, rc;
+	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+
+	/* Make sure the state is valid */
+	rc = plpar_hcall(H_VASI_STATE, retbuf, stream_id);
+
+	vasi_state = retbuf[0];
+
+	if (rc) {
+		pr_err("pseries_suspend_begin: vasi_state returned %ld\n",rc);
+		return rc;
+	} else if (vasi_state == H_VASI_ENABLED) {
+		return -EAGAIN;
+	} else if (vasi_state != H_VASI_SUSPENDING) {
+		pr_err("pseries_suspend_begin: vasi_state returned state %ld\n",
+		       vasi_state);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+/**
+ * pseries_suspend_cpu - Suspend a single CPU
+ *
+ * Makes the H_JOIN call to suspend the CPU
+ *
+ **/
+static int pseries_suspend_cpu(void)
+{
+	if (atomic_read(&suspending))
+		return rtas_suspend_cpu(&suspend_data);
+	return 0;
+}
+
+/**
+ * pseries_suspend_enter - Final phase of hibernation
+ *
+ * Return value:
+ * 	0 on success / other on failure
+ **/
+static int pseries_suspend_enter(suspend_state_t state)
+{
+	int rc = rtas_suspend_last_cpu(&suspend_data);
+
+	atomic_set(&suspending, 0);
+	atomic_set(&suspend_data.done, 1);
+	return rc;
+}
+
+/**
+ * pseries_prepare_late - Prepare to suspend all other CPUs
+ *
+ * Return value:
+ * 	0 on success / other on failure
+ **/
+static int pseries_prepare_late(void)
+{
+	atomic_set(&suspending, 1);
+	atomic_set(&suspend_data.working, 0);
+	atomic_set(&suspend_data.done, 0);
+	atomic_set(&suspend_data.error, 0);
+	suspend_data.complete = &suspend_work;
+	INIT_COMPLETION(suspend_work);
+	return 0;
+}
+
+/**
+ * store_hibernate - Initiate partition hibernation
+ * @dev:		subsys root device
+ * @attr:		device attribute struct
+ * @buf:		buffer
+ * @count:		buffer size
+ *
+ * Write the stream ID received from the HMC to this file
+ * to trigger hibernating the partition
+ *
+ * Return value:
+ * 	number of bytes printed to buffer / other on failure
+ **/
+static ssize_t store_hibernate(struct device *dev,
+			       struct device_attribute *attr,
+			       const char *buf, size_t count)
+{
+	int rc;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	stream_id = simple_strtoul(buf, NULL, 16);
+
+	do {
+		rc = pseries_suspend_begin(PM_SUSPEND_MEM);
+		if (rc == -EAGAIN)
+			ssleep(1);
+	} while (rc == -EAGAIN);
+
+	if (!rc) {
+		stop_topology_update();
+		rc = pm_suspend(PM_SUSPEND_MEM);
+		start_topology_update();
+	}
+
+	stream_id = 0;
+
+	if (!rc)
+		rc = count;
+	return rc;
+}
+
+static DEVICE_ATTR(hibernate, S_IWUSR, NULL, store_hibernate);
+
+static struct bus_type suspend_subsys = {
+	.name = "power",
+	.dev_name = "power",
+};
+
+static const struct platform_suspend_ops pseries_suspend_ops = {
+	.valid		= suspend_valid_only_mem,
+	.begin		= pseries_suspend_begin,
+	.prepare_late	= pseries_prepare_late,
+	.enter		= pseries_suspend_enter,
+};
+
+/**
+ * pseries_suspend_sysfs_register - Register with sysfs
+ *
+ * Return value:
+ * 	0 on success / other on failure
+ **/
+static int pseries_suspend_sysfs_register(struct device *dev)
+{
+	int rc;
+
+	if ((rc = subsys_system_register(&suspend_subsys, NULL)))
+		return rc;
+
+	dev->id = 0;
+	dev->bus = &suspend_subsys;
+
+	if ((rc = device_create_file(suspend_subsys.dev_root, &dev_attr_hibernate)))
+		goto subsys_unregister;
+
+	return 0;
+
+subsys_unregister:
+	bus_unregister(&suspend_subsys);
+	return rc;
+}
+
+/**
+ * pseries_suspend_init - initcall for pSeries suspend
+ *
+ * Return value:
+ * 	0 on success / other on failure
+ **/
+static int __init pseries_suspend_init(void)
+{
+	int rc;
+
+	if (!machine_is(pseries) || !firmware_has_feature(FW_FEATURE_LPAR))
+		return 0;
+
+	suspend_data.token = rtas_token("ibm,suspend-me");
+	if (suspend_data.token == RTAS_UNKNOWN_SERVICE)
+		return 0;
+
+	if ((rc = pseries_suspend_sysfs_register(&suspend_dev)))
+		return rc;
+
+	ppc_md.suspend_disable_cpu = pseries_suspend_cpu;
+	suspend_set_ops(&pseries_suspend_ops);
+	return 0;
+}
+
+__initcall(pseries_suspend_init);